Watchdog enhancements (#20237)

* refactor get_video_properties and use json output from ffprobe * add zmq topic * publish valid segment data in recording maintainer * check for valid video data - restart separate record ffmpeg process if no video data has been received in 120s - refactor datetime import * listen to correct topic in embeddings maintainer * refactor to move get_latest_segment_datetime logic to recordings maintainer * debug logging * cleanup
2026-03-10 02:29:19 +03:00 · 2025-09-28 11:52:14 -05:00 · 2025-09-28 11:52:14 -05:00 · 12f8c3feac
commit 12f8c3feac
parent b6552987b0
5 changed files with 267 additions and 145 deletions
--- a/frigate/comms/recordings_updater.py
+++ b/frigate/comms/recordings_updater.py
@ -2,6 +2,7 @@
 import logging
 from enum import Enum
 from typing import Any
 from .zmq_proxy import Publisher, Subscriber
@ -10,18 +11,21 @@ logger = logging.getLogger(__name__)
 class RecordingsDataTypeEnum(str, Enum):
    all = ""
-    recordings_available_through = "recordings_available_through"
+    saved = "saved"  # segment has been saved to db
    latest = "latest"  # segment is in cache
    valid = "valid"  # segment is valid
    invalid = "invalid"  # segment is invalid
-class RecordingsDataPublisher(Publisher[tuple[str, float]]):
+class RecordingsDataPublisher(Publisher[Any]):
    """Publishes latest recording data."""
    topic_base = "recordings/"
-    def __init__(self, topic: RecordingsDataTypeEnum) -> None:
+    def __init__(self) -> None:
-        super().__init__(topic.value)
+        super().__init__()
-    def publish(self, payload: tuple[str, float], sub_topic: str = "") -> None:
+    def publish(self, payload: Any, sub_topic: str = "") -> None:
        super().publish(payload, sub_topic)
@ -32,3 +36,11 @@ class RecordingsDataSubscriber(Subscriber):
    def __init__(self, topic: RecordingsDataTypeEnum) -> None:
        super().__init__(topic.value)
    def _return_object(
        self, topic: str, payload: tuple | None
    ) -> tuple[str, Any] | tuple[None, None]:
        if payload is None:
            return (None, None)
        return (topic, payload)
--- a/frigate/embeddings/maintainer.py
+++ b/frigate/embeddings/maintainer.py
@ -144,7 +144,7 @@ class EmbeddingMaintainer(threading.Thread):
            EventMetadataTypeEnum.regenerate_description
        )
        self.recordings_subscriber = RecordingsDataSubscriber(
-            RecordingsDataTypeEnum.recordings_available_through
+            RecordingsDataTypeEnum.saved
        )
        self.review_subscriber = ReviewDataSubscriber("")
        self.detection_subscriber = DetectionSubscriber(DetectionTypeEnum.video.value)
@ -525,20 +525,28 @@ class EmbeddingMaintainer(threading.Thread):
    def _process_recordings_updates(self) -> None:
        """Process recordings updates."""
        while True:
-            recordings_data = self.recordings_subscriber.check_for_update()
+            update = self.recordings_subscriber.check_for_update()
-            if recordings_data == None:
+            if not update:
                break
-            camera, recordings_available_through_timestamp = recordings_data
+            (raw_topic, payload) = update
-            self.recordings_available_through[camera] = (
+            if not raw_topic or not payload:
-                recordings_available_through_timestamp
+                break
            )
-            logger.debug(
+            topic = str(raw_topic)
-                f"{camera} now has recordings available through {recordings_available_through_timestamp}"
+
-            )
+            if topic.endswith(RecordingsDataTypeEnum.saved.value):
                camera, recordings_available_through_timestamp, _ = payload
                self.recordings_available_through[camera] = (
                    recordings_available_through_timestamp
                )
                logger.debug(
                    f"{camera} now has recordings available through {recordings_available_through_timestamp}"
                )
    def _process_review_updates(self) -> None:
        """Process review updates."""
--- a/frigate/record/maintainer.py
+++ b/frigate/record/maintainer.py
@ -80,9 +80,7 @@ class RecordingMaintainer(threading.Thread):
            [CameraConfigUpdateEnum.add, CameraConfigUpdateEnum.record],
        )
        self.detection_subscriber = DetectionSubscriber(DetectionTypeEnum.all.value)
-        self.recordings_publisher = RecordingsDataPublisher(
+        self.recordings_publisher = RecordingsDataPublisher()
            RecordingsDataTypeEnum.recordings_available_through
        )
        self.stop_event = stop_event
        self.object_recordings_info: dict[str, list] = defaultdict(list)
@ -98,6 +96,41 @@ class RecordingMaintainer(threading.Thread):
            and not d.startswith("preview_")
        ]
        # publish newest cached segment per camera (including in use files)
        newest_cache_segments: dict[str, dict[str, Any]] = {}
        for cache in cache_files:
            cache_path = os.path.join(CACHE_DIR, cache)
            basename = os.path.splitext(cache)[0]
            camera, date = basename.rsplit("@", maxsplit=1)
            start_time = datetime.datetime.strptime(
                date, CACHE_SEGMENT_FORMAT
            ).astimezone(datetime.timezone.utc)
            if (
                camera not in newest_cache_segments
                or start_time > newest_cache_segments[camera]["start_time"]
            ):
                newest_cache_segments[camera] = {
                    "start_time": start_time,
                    "cache_path": cache_path,
                }
        for camera, newest in newest_cache_segments.items():
            self.recordings_publisher.publish(
                (
                    camera,
                    newest["start_time"].timestamp(),
                    newest["cache_path"],
                ),
                RecordingsDataTypeEnum.latest.value,
            )
        # publish None for cameras with no cache files (but only if we know the camera exists)
        for camera_name in self.config.cameras:
            if camera_name not in newest_cache_segments:
                self.recordings_publisher.publish(
                    (camera_name, None, None),
                    RecordingsDataTypeEnum.latest.value,
                )
        files_in_use = []
        for process in psutil.process_iter():
            try:
@ -111,7 +144,7 @@ class RecordingMaintainer(threading.Thread):
            except psutil.Error:
                continue
-        # group recordings by camera
+        # group recordings by camera (skip in-use for validation/moving)
        grouped_recordings: defaultdict[str, list[dict[str, Any]]] = defaultdict(list)
        for cache in cache_files:
            # Skip files currently in use
@ -233,7 +266,9 @@ class RecordingMaintainer(threading.Thread):
                    recordings[0]["start_time"].timestamp()
                    if self.config.cameras[camera].record.enabled
                    else None,
-                )
+                    None,
                ),
                RecordingsDataTypeEnum.saved.value,
            )
        recordings_to_insert: list[Optional[Recordings]] = await asyncio.gather(*tasks)
@ -250,7 +285,7 @@ class RecordingMaintainer(threading.Thread):
    async def validate_and_move_segment(
        self, camera: str, reviews: list[ReviewSegment], recording: dict[str, Any]
-    ) -> None:
+    ) -> Optional[Recordings]:
        cache_path: str = recording["cache_path"]
        start_time: datetime.datetime = recording["start_time"]
        record_config = self.config.cameras[camera].record
@ -261,7 +296,7 @@ class RecordingMaintainer(threading.Thread):
            or not self.config.cameras[camera].record.enabled
        ):
            self.drop_segment(cache_path)
-            return
+            return None
        if cache_path in self.end_time_cache:
            end_time, duration = self.end_time_cache[cache_path]
@ -270,10 +305,18 @@ class RecordingMaintainer(threading.Thread):
                self.config.ffmpeg, cache_path, get_duration=True
            )
-            if segment_info["duration"]:
+            if not segment_info.get("has_valid_video", False):
-                duration = float(segment_info["duration"])
+                logger.warning(
-            else:
+                    f"Invalid or missing video stream in segment {cache_path}. Discarding."
-                duration = -1
+                )
                self.recordings_publisher.publish(
                    (camera, start_time.timestamp(), cache_path),
                    RecordingsDataTypeEnum.invalid.value,
                )
                self.drop_segment(cache_path)
                return None
            duration = float(segment_info.get("duration", -1))
            # ensure duration is within expected length
            if 0 < duration < MAX_SEGMENT_DURATION:
@ -284,8 +327,18 @@ class RecordingMaintainer(threading.Thread):
                    logger.warning(f"Failed to probe corrupt segment {cache_path}")
                logger.warning(f"Discarding a corrupt recording segment: {cache_path}")
-                Path(cache_path).unlink(missing_ok=True)
+                self.recordings_publisher.publish(
-                return
+                    (camera, start_time.timestamp(), cache_path),
                    RecordingsDataTypeEnum.invalid.value,
                )
                self.drop_segment(cache_path)
                return None
            # this segment has a valid duration and has video data, so publish an update
            self.recordings_publisher.publish(
                (camera, start_time.timestamp(), cache_path),
                RecordingsDataTypeEnum.valid.value,
            )
        record_config = self.config.cameras[camera].record
        highest = None
--- a/frigate/util/services.py
+++ b/frigate/util/services.py
@ -603,87 +603,87 @@ def auto_detect_hwaccel() -> str:
 async def get_video_properties(
    ffmpeg, url: str, get_duration: bool = False
 ) -> dict[str, Any]:
-    async def calculate_duration(video: Optional[Any]) -> float:
+    async def probe_with_ffprobe(
-        duration = None
+        url: str,
-
+    ) -> tuple[bool, int, int, Optional[str], float]:
-        if video is not None:
+        """Fallback using ffprobe: returns (valid, width, height, codec, duration)."""
-            # Get the frames per second (fps) of the video stream
+        cmd = [
-            fps = video.get(cv2.CAP_PROP_FPS)
+            ffmpeg.ffprobe_path,
-            total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+            "-v",
-
+            "quiet",
-            if fps and total_frames:
+            "-print_format",
-                duration = total_frames / fps
+            "json",
-
+            "-show_format",
-        # if cv2 failed need to use ffprobe
+            "-show_streams",
-        if duration is None:
+            url,
-            p = await asyncio.create_subprocess_exec(
+        ]
-                ffmpeg.ffprobe_path,
+        try:
-                "-v",
+            proc = await asyncio.create_subprocess_exec(
-                "error",
+                *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
                "-show_entries",
                "format=duration",
                "-of",
                "default=noprint_wrappers=1:nokey=1",
                f"{url}",
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
-            await p.wait()
+            stdout, _ = await proc.communicate()
            if proc.returncode != 0:
                return False, 0, 0, None, -1
-            if p.returncode == 0:
+            data = json.loads(stdout.decode())
-                result = (await p.stdout.read()).decode()
+            video_streams = [
-            else:
+                s for s in data.get("streams", []) if s.get("codec_type") == "video"
-                result = None
+            ]
            if not video_streams:
                return False, 0, 0, None, -1
-            if result:
+            v = video_streams[0]
-                try:
+            width = int(v.get("width", 0))
-                    duration = float(result.strip())
+            height = int(v.get("height", 0))
-                except ValueError:
+            codec = v.get("codec_name")
                    duration = -1
            else:
                duration = -1
-        return duration
+            duration_str = data.get("format", {}).get("duration")
            duration = float(duration_str) if duration_str else -1.0
-    width = height = 0
+            return True, width, height, codec, duration
        except (json.JSONDecodeError, ValueError, KeyError, asyncio.SubprocessError):
            return False, 0, 0, None, -1
-    try:
+    def probe_with_cv2(url: str) -> tuple[bool, int, int, Optional[str], float]:
-        # Open the video stream using OpenCV
+        """Primary attempt using cv2: returns (valid, width, height, fourcc, duration)."""
-        video = cv2.VideoCapture(url)
+        cap = cv2.VideoCapture(url)
        if not cap.isOpened():
            cap.release()
            return False, 0, 0, None, -1
-        # Check if the video stream was opened successfully
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-        if not video.isOpened():
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-            video = None
+        valid = width > 0 and height > 0
-    except Exception:
+        fourcc = None
-        video = None
+        duration = -1.0
-    result = {}
+        if valid:
            fourcc_int = int(cap.get(cv2.CAP_PROP_FOURCC))
            fourcc = fourcc_int.to_bytes(4, "little").decode("latin-1").strip()
            if get_duration:
                fps = cap.get(cv2.CAP_PROP_FPS)
                total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                if fps > 0 and total_frames > 0:
                    duration = total_frames / fps
        cap.release()
        return valid, width, height, fourcc, duration
    # try cv2 first
    has_video, width, height, fourcc, duration = probe_with_cv2(url)
    # fallback to ffprobe if needed
    if not has_video or (get_duration and duration < 0):
        has_video, width, height, fourcc, duration = await probe_with_ffprobe(url)
    result: dict[str, Any] = {"has_valid_video": has_video}
    if has_video:
        result.update({"width": width, "height": height})
        if fourcc:
            result["fourcc"] = fourcc
    if get_duration:
-        result["duration"] = await calculate_duration(video)
+        result["duration"] = duration
    if video is not None:
        # Get the width of frames in the video stream
        width = video.get(cv2.CAP_PROP_FRAME_WIDTH)
        # Get the height of frames in the video stream
        height = video.get(cv2.CAP_PROP_FRAME_HEIGHT)
        # Get the stream encoding
        fourcc_int = int(video.get(cv2.CAP_PROP_FOURCC))
        fourcc = (
            chr((fourcc_int >> 0) & 255)
            + chr((fourcc_int >> 8) & 255)
            + chr((fourcc_int >> 16) & 255)
            + chr((fourcc_int >> 24) & 255)
        )
        # Release the video stream
        video.release()
        result["width"] = round(width)
        result["height"] = round(height)
        result["fourcc"] = fourcc
    return result
--- a/frigate/video.py
+++ b/frigate/video.py
@ -1,10 +1,9 @@
 import datetime
 import logging
 import os
 import queue
 import subprocess as sp
 import threading
 import time
 from datetime import datetime, timedelta, timezone
 from multiprocessing import Queue, Value
 from multiprocessing.synchronize import Event as MpEvent
 from typing import Any
@ -13,6 +12,10 @@ import cv2
 from frigate.camera import CameraMetrics, PTZMetrics
 from frigate.comms.inter_process import InterProcessRequestor
 from frigate.comms.recordings_updater import (
    RecordingsDataSubscriber,
    RecordingsDataTypeEnum,
 )
 from frigate.config import CameraConfig, DetectConfig, ModelConfig
 from frigate.config.camera.camera import CameraTypeEnum
 from frigate.config.camera.updater import (
@ -20,8 +23,6 @@ from frigate.config.camera.updater import (
    CameraConfigUpdateSubscriber,
 )
 from frigate.const import (
    CACHE_DIR,
    CACHE_SEGMENT_FORMAT,
    PROCESS_PRIORITY_HIGH,
    REQUEST_REGION_GRID,
 )
@ -129,7 +130,7 @@ def capture_frames(
        fps.value = frame_rate.eps()
        skipped_fps.value = skipped_eps.eps()
-        current_frame.value = datetime.datetime.now().timestamp()
+        current_frame.value = datetime.now().timestamp()
        frame_name = f"{config.name}_frame{frame_index}"
        frame_buffer = frame_manager.write(frame_name)
        try:
@ -199,6 +200,11 @@ class CameraWatchdog(threading.Thread):
        self.requestor = InterProcessRequestor()
        self.was_enabled = self.config.enabled
        self.segment_subscriber = RecordingsDataSubscriber(RecordingsDataTypeEnum.all)
        self.latest_valid_segment_time: float = 0
        self.latest_invalid_segment_time: float = 0
        self.latest_cache_segment_time: float = 0
    def _update_enabled_state(self) -> bool:
        """Fetch the latest config and update enabled state."""
        self.config_subscriber.check_for_updates()
@ -243,6 +249,11 @@ class CameraWatchdog(threading.Thread):
                if enabled:
                    self.logger.debug(f"Enabling camera {self.config.name}")
                    self.start_all_ffmpeg()
                    # reset all timestamps
                    self.latest_valid_segment_time = 0
                    self.latest_invalid_segment_time = 0
                    self.latest_cache_segment_time = 0
                else:
                    self.logger.debug(f"Disabling camera {self.config.name}")
                    self.stop_all_ffmpeg()
@ -260,7 +271,37 @@ class CameraWatchdog(threading.Thread):
            if not enabled:
                continue
-            now = datetime.datetime.now().timestamp()
+            while True:
                update = self.segment_subscriber.check_for_update(timeout=0)
                if update == (None, None):
                    break
                raw_topic, payload = update
                if raw_topic and payload:
                    topic = str(raw_topic)
                    camera, segment_time, _ = payload
                    if camera != self.config.name:
                        continue
                    if topic.endswith(RecordingsDataTypeEnum.valid.value):
                        self.logger.debug(
                            f"Latest valid recording segment time on {camera}: {segment_time}"
                        )
                        self.latest_valid_segment_time = segment_time
                    elif topic.endswith(RecordingsDataTypeEnum.invalid.value):
                        self.logger.warning(
                            f"Invalid recording segment detected for {camera} at {segment_time}"
                        )
                        self.latest_invalid_segment_time = segment_time
                    elif topic.endswith(RecordingsDataTypeEnum.latest.value):
                        if segment_time is not None:
                            self.latest_cache_segment_time = segment_time
                        else:
                            self.latest_cache_segment_time = 0
            now = datetime.now().timestamp()
            if not self.capture_thread.is_alive():
                self.requestor.send_data(f"{self.config.name}/status/detect", "offline")
@ -298,18 +339,55 @@ class CameraWatchdog(threading.Thread):
                poll = p["process"].poll()
                if self.config.record.enabled and "record" in p["roles"]:
-                    latest_segment_time = self.get_latest_segment_datetime(
+                    now_utc = datetime.now().astimezone(timezone.utc)
-                        p.get(
+
-                            "latest_segment_time",
+                    latest_cache_dt = (
-                            datetime.datetime.now().astimezone(datetime.timezone.utc),
+                        datetime.fromtimestamp(
                            self.latest_cache_segment_time, tz=timezone.utc
                        )
                        if self.latest_cache_segment_time > 0
                        else now_utc - timedelta(seconds=1)
                    )
-                    if datetime.datetime.now().astimezone(datetime.timezone.utc) > (
+                    latest_valid_dt = (
-                        latest_segment_time + datetime.timedelta(seconds=120)
+                        datetime.fromtimestamp(
-                    ):
+                            self.latest_valid_segment_time, tz=timezone.utc
                        )
                        if self.latest_valid_segment_time > 0
                        else now_utc - timedelta(seconds=1)
                    )
                    latest_invalid_dt = (
                        datetime.fromtimestamp(
                            self.latest_invalid_segment_time, tz=timezone.utc
                        )
                        if self.latest_invalid_segment_time > 0
                        else now_utc - timedelta(seconds=1)
                    )
                    # ensure segments are still being created and that they have valid video data
                    cache_stale = now_utc > (latest_cache_dt + timedelta(seconds=120))
                    valid_stale = now_utc > (latest_valid_dt + timedelta(seconds=120))
                    invalid_stale_condition = (
                        self.latest_invalid_segment_time > 0
                        and now_utc > (latest_invalid_dt + timedelta(seconds=120))
                        and self.latest_valid_segment_time
                        <= self.latest_invalid_segment_time
                    )
                    invalid_stale = invalid_stale_condition
                    if cache_stale or valid_stale or invalid_stale:
                        if cache_stale:
                            reason = "No new recording segments were created"
                        elif valid_stale:
                            reason = "No new valid recording segments were created"
                        else:  # invalid_stale
                            reason = (
                                "No valid segments created since last invalid segment"
                            )
                        self.logger.error(
-                            f"No new recording segments were created for {self.config.name} in the last 120s. restarting the ffmpeg record process..."
+                            f"{reason} for {self.config.name} in the last 120s. Restarting the ffmpeg record process..."
                        )
                        p["process"] = start_or_restart_ffmpeg(
                            p["cmd"],
@ -328,7 +406,7 @@ class CameraWatchdog(threading.Thread):
                        self.requestor.send_data(
                            f"{self.config.name}/status/record", "online"
                        )
-                        p["latest_segment_time"] = latest_segment_time
+                        p["latest_segment_time"] = self.latest_cache_segment_time
                if poll is None:
                    continue
@ -346,6 +424,7 @@ class CameraWatchdog(threading.Thread):
        self.stop_all_ffmpeg()
        self.logpipe.close()
        self.config_subscriber.stop()
        self.segment_subscriber.stop()
    def start_ffmpeg_detect(self):
        ffmpeg_cmd = [
@ -405,33 +484,6 @@ class CameraWatchdog(threading.Thread):
            p["logpipe"].close()
        self.ffmpeg_other_processes.clear()
    def get_latest_segment_datetime(
        self, latest_segment: datetime.datetime
    ) -> datetime.datetime:
        """Checks if ffmpeg is still writing recording segments to cache."""
        cache_files = sorted(
            [
                d
                for d in os.listdir(CACHE_DIR)
                if os.path.isfile(os.path.join(CACHE_DIR, d))
                and d.endswith(".mp4")
                and not d.startswith("preview_")
            ]
        )
        newest_segment_time = latest_segment
        for file in cache_files:
            if self.config.name in file:
                basename = os.path.splitext(file)[0]
                _, date = basename.rsplit("@", maxsplit=1)
                segment_time = datetime.datetime.strptime(
                    date, CACHE_SEGMENT_FORMAT
                ).astimezone(datetime.timezone.utc)
                if segment_time > newest_segment_time:
                    newest_segment_time = segment_time
        return newest_segment_time
 class CameraCaptureRunner(threading.Thread):
    def __init__(
@ -727,10 +779,7 @@ def process_frames(
            time.sleep(0.1)
            continue
-        if (
+        if datetime.now().astimezone(timezone.utc) > next_region_update:
            datetime.datetime.now().astimezone(datetime.timezone.utc)
            > next_region_update
        ):
            region_grid = requestor.send_data(REQUEST_REGION_GRID, camera_config.name)
            next_region_update = get_tomorrow_at_time(2)