mirror of
https://github.com/blakeblackshear/frigate.git
synced 2026-06-21 11:51:53 +03:00
Corroborate record watchdog staleness against the cache dir before restarting
The record watchdog treats a stale maintainer heartbeat as a dead recorder. But the heartbeat is published by the recording maintainer, so whenever the maintainer lags (e.g. "Unable to keep up with recording segments in cache", #9661) every camera looks stale at once and all record processes restart together — while recording was actually healthy. The restart churn then produces more, shorter segments, making the maintainer fall further behind.

Before restarting on staleness, check the camera's newest cache segment on disk: if a segment is fresher than the staleness threshold, the recorder is demonstrably writing — log a warning, adopt the disk mtime as the heartbeat, and skip the restart. The invalid-segment path is untouched.

Validated on a 26-camera production deployment (0.17.1 backport of this change): synchronized mass restarts went from 52/hour to zero, with heartbeat-stale events still occurring ~2/hour but now correctly identified as maintainer lag instead of recording failure.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
d7ad3ba699
commit
36c61a3607
@ -9,6 +9,7 @@ from collections import deque
|
|||||||
from datetime import datetime, timedelta, timezone
|
from datetime import datetime, timedelta, timezone
|
||||||
from multiprocessing import Queue, Value
|
from multiprocessing import Queue, Value
|
||||||
from multiprocessing.synchronize import Event as MpEvent
|
from multiprocessing.synchronize import Event as MpEvent
|
||||||
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from frigate.camera import CameraMetrics
|
from frigate.camera import CameraMetrics
|
||||||
@ -22,7 +23,7 @@ from frigate.config.camera.updater import (
|
|||||||
CameraConfigUpdateEnum,
|
CameraConfigUpdateEnum,
|
||||||
CameraConfigUpdateSubscriber,
|
CameraConfigUpdateSubscriber,
|
||||||
)
|
)
|
||||||
from frigate.const import PROCESS_PRIORITY_HIGH
|
from frigate.const import CACHE_DIR, PROCESS_PRIORITY_HIGH
|
||||||
from frigate.log import LogPipe
|
from frigate.log import LogPipe
|
||||||
from frigate.util.builtin import EventsPerSecond, get_record_segment_time
|
from frigate.util.builtin import EventsPerSecond, get_record_segment_time
|
||||||
from frigate.util.ffmpeg import start_or_restart_ffmpeg, stop_ffmpeg
|
from frigate.util.ffmpeg import start_or_restart_ffmpeg, stop_ffmpeg
|
||||||
@ -453,6 +454,38 @@ class CameraWatchdog(threading.Thread):
|
|||||||
invalid_stale = invalid_stale_condition
|
invalid_stale = invalid_stale_condition
|
||||||
|
|
||||||
if cache_stale or valid_stale or invalid_stale:
|
if cache_stale or valid_stale or invalid_stale:
|
||||||
|
# The staleness above is measured from the recording
|
||||||
|
# maintainer's IPC heartbeat, which lags whenever the
|
||||||
|
# maintainer falls behind (e.g. "Unable to keep up with
|
||||||
|
# recording segments in cache"). A late message is not
|
||||||
|
# a dead recorder: corroborate against the cache dir
|
||||||
|
# before restarting, otherwise every camera restarts
|
||||||
|
# together on maintainer lag and the resulting segment
|
||||||
|
# churn makes the overload worse.
|
||||||
|
if not invalid_stale:
|
||||||
|
newest_on_disk = max(
|
||||||
|
(
|
||||||
|
f.stat().st_mtime
|
||||||
|
for f in Path(CACHE_DIR).glob(
|
||||||
|
f"{self.config.name}@*"
|
||||||
|
)
|
||||||
|
),
|
||||||
|
default=0.0,
|
||||||
|
)
|
||||||
|
if (
|
||||||
|
newest_on_disk > 0
|
||||||
|
and now_utc.timestamp() - newest_on_disk
|
||||||
|
< self.record_stale_threshold
|
||||||
|
):
|
||||||
|
self.logger.warning(
|
||||||
|
f"Recording heartbeat for {self.config.name} is stale but a cache "
|
||||||
|
f"segment is only {now_utc.timestamp() - newest_on_disk:.0f}s old — "
|
||||||
|
"skipping the record process restart (maintainer heartbeat lag, "
|
||||||
|
"not a recording failure)."
|
||||||
|
)
|
||||||
|
self.latest_cache_segment_time = newest_on_disk
|
||||||
|
continue
|
||||||
|
|
||||||
if cache_stale:
|
if cache_stale:
|
||||||
reason = "No new recording segments were created"
|
reason = "No new recording segments were created"
|
||||||
elif valid_stale:
|
elif valid_stale:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user