mirror of
https://github.com/blakeblackshear/frigate.git
synced 2026-03-21 15:48:22 +03:00
Improve process watchdog (#22557)
Some checks are pending
CI / AMD64 Build (push) Waiting to run
CI / ARM Build (push) Waiting to run
CI / Jetson Jetpack 6 (push) Waiting to run
CI / AMD64 Extra Build (push) Blocked by required conditions
CI / ARM Extra Build (push) Blocked by required conditions
CI / Synaptics Build (push) Blocked by required conditions
CI / Assemble and push default build (push) Blocked by required conditions
Some checks are pending
CI / AMD64 Build (push) Waiting to run
CI / ARM Build (push) Waiting to run
CI / Jetson Jetpack 6 (push) Waiting to run
CI / AMD64 Extra Build (push) Blocked by required conditions
CI / ARM Extra Build (push) Blocked by required conditions
CI / Synaptics Build (push) Blocked by required conditions
CI / Assemble and push default build (push) Blocked by required conditions
* monitor subprocesses and auto-restart with watchdog * fix typing * formatting
This commit is contained in:
parent
a8da4c4521
commit
6d2b84e202
@ -8,7 +8,7 @@ from multiprocessing import Queue
|
||||
from multiprocessing.managers import DictProxy, SyncManager
|
||||
from multiprocessing.synchronize import Event as MpEvent
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from typing import Callable, Optional
|
||||
|
||||
import psutil
|
||||
import uvicorn
|
||||
@ -81,6 +81,7 @@ from frigate.timeline import TimelineProcessor
|
||||
from frigate.track.object_processing import TrackedObjectProcessor
|
||||
from frigate.util.builtin import empty_and_close_queue
|
||||
from frigate.util.image import UntrackedSharedMemory
|
||||
from frigate.util.process import FrigateProcess
|
||||
from frigate.util.services import set_file_limit
|
||||
from frigate.version import VERSION
|
||||
from frigate.watchdog import FrigateWatchdog
|
||||
@ -497,6 +498,47 @@ class FrigateApp:
|
||||
|
||||
def start_watchdog(self) -> None:
|
||||
self.frigate_watchdog = FrigateWatchdog(self.detectors, self.stop_event)
|
||||
|
||||
# (attribute on self, key in self.processes, factory)
|
||||
specs: list[tuple[str, str, Callable[[], FrigateProcess]]] = [
|
||||
(
|
||||
"embedding_process",
|
||||
"embeddings",
|
||||
lambda: EmbeddingProcess(
|
||||
self.config, self.embeddings_metrics, self.stop_event
|
||||
),
|
||||
),
|
||||
(
|
||||
"recording_process",
|
||||
"recording",
|
||||
lambda: RecordProcess(self.config, self.stop_event),
|
||||
),
|
||||
(
|
||||
"review_segment_process",
|
||||
"review_segment",
|
||||
lambda: ReviewProcess(self.config, self.stop_event),
|
||||
),
|
||||
(
|
||||
"output_processor",
|
||||
"output",
|
||||
lambda: OutputProcess(self.config, self.stop_event),
|
||||
),
|
||||
]
|
||||
|
||||
for attr, key, factory in specs:
|
||||
if not hasattr(self, attr):
|
||||
continue
|
||||
|
||||
def on_restart(
|
||||
proc: FrigateProcess, _attr: str = attr, _key: str = key
|
||||
) -> None:
|
||||
setattr(self, _attr, proc)
|
||||
self.processes[_key] = proc.pid or 0
|
||||
|
||||
self.frigate_watchdog.register(
|
||||
key, getattr(self, attr), factory, on_restart
|
||||
)
|
||||
|
||||
self.frigate_watchdog.start()
|
||||
|
||||
def init_auth(self) -> None:
|
||||
|
||||
@ -2,19 +2,111 @@ import datetime
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
from collections import deque
|
||||
from dataclasses import dataclass, field
|
||||
from multiprocessing.synchronize import Event as MpEvent
|
||||
from typing import Callable
|
||||
|
||||
from frigate.object_detection.base import ObjectDetectProcess
|
||||
from frigate.util.process import FrigateProcess
|
||||
from frigate.util.services import restart_frigate
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
MAX_RESTARTS = 5
|
||||
RESTART_WINDOW_S = 60
|
||||
|
||||
|
||||
@dataclass
|
||||
class MonitoredProcess:
|
||||
"""A process monitored by the watchdog for automatic restart."""
|
||||
|
||||
name: str
|
||||
process: FrigateProcess
|
||||
factory: Callable[[], FrigateProcess]
|
||||
on_restart: Callable[[FrigateProcess], None] | None = None
|
||||
restart_timestamps: deque[float] = field(
|
||||
default_factory=lambda: deque(maxlen=MAX_RESTARTS)
|
||||
)
|
||||
|
||||
def is_restarting_too_fast(self, now: float) -> bool:
|
||||
while (
|
||||
self.restart_timestamps
|
||||
and now - self.restart_timestamps[0] > RESTART_WINDOW_S
|
||||
):
|
||||
self.restart_timestamps.popleft()
|
||||
return len(self.restart_timestamps) >= MAX_RESTARTS
|
||||
|
||||
|
||||
class FrigateWatchdog(threading.Thread):
|
||||
def __init__(self, detectors: dict[str, ObjectDetectProcess], stop_event: MpEvent):
|
||||
def __init__(
|
||||
self,
|
||||
detectors: dict[str, ObjectDetectProcess],
|
||||
stop_event: MpEvent,
|
||||
):
|
||||
super().__init__(name="frigate_watchdog")
|
||||
self.detectors = detectors
|
||||
self.stop_event = stop_event
|
||||
self._monitored: list[MonitoredProcess] = []
|
||||
|
||||
def register(
|
||||
self,
|
||||
name: str,
|
||||
process: FrigateProcess,
|
||||
factory: Callable[[], FrigateProcess],
|
||||
on_restart: Callable[[FrigateProcess], None] | None = None,
|
||||
) -> None:
|
||||
"""Register a FrigateProcess for monitoring and automatic restart."""
|
||||
self._monitored.append(
|
||||
MonitoredProcess(
|
||||
name=name,
|
||||
process=process,
|
||||
factory=factory,
|
||||
on_restart=on_restart,
|
||||
)
|
||||
)
|
||||
|
||||
def _check_process(self, entry: MonitoredProcess) -> None:
|
||||
if entry.process.is_alive():
|
||||
return
|
||||
|
||||
exitcode = entry.process.exitcode
|
||||
if exitcode == 0:
|
||||
logger.info("Process %s exited cleanly, not restarting", entry.name)
|
||||
return
|
||||
|
||||
logger.warning(
|
||||
"Process %s (PID %s) exited with code %s",
|
||||
entry.name,
|
||||
entry.process.pid,
|
||||
exitcode,
|
||||
)
|
||||
|
||||
now = datetime.datetime.now().timestamp()
|
||||
|
||||
if entry.is_restarting_too_fast(now):
|
||||
logger.error(
|
||||
"Process %s restarting too frequently (%d times in %ds), backing off",
|
||||
entry.name,
|
||||
MAX_RESTARTS,
|
||||
RESTART_WINDOW_S,
|
||||
)
|
||||
return
|
||||
|
||||
try:
|
||||
entry.process.close()
|
||||
new_process = entry.factory()
|
||||
new_process.start()
|
||||
|
||||
entry.process = new_process
|
||||
entry.restart_timestamps.append(now)
|
||||
|
||||
if entry.on_restart:
|
||||
entry.on_restart(new_process)
|
||||
|
||||
logger.info("Restarted %s (PID %s)", entry.name, new_process.pid)
|
||||
except Exception:
|
||||
logger.exception("Failed to restart %s", entry.name)
|
||||
|
||||
def run(self) -> None:
|
||||
time.sleep(10)
|
||||
@ -38,4 +130,7 @@ class FrigateWatchdog(threading.Thread):
|
||||
logger.info("Detection appears to have stopped. Exiting Frigate...")
|
||||
restart_frigate()
|
||||
|
||||
for entry in self._monitored:
|
||||
self._check_process(entry)
|
||||
|
||||
logger.info("Exiting watchdog...")
|
||||
|
||||
Loading…
Reference in New Issue
Block a user