Watchdog enhancements (#20237)

* refactor get_video_properties and use json output from ffprobe * add zmq topic * publish valid segment data in recording maintainer * check for valid video data - restart separate record ffmpeg process if no video data has been received in 120s - refactor datetime import * listen to correct topic in embeddings maintainer * refactor to move get_latest_segment_datetime logic to recordings maintainer * debug logging * cleanup
2026-01-22 20:18:30 +03:00 · 2025-09-28 11:52:14 -05:00 · 2025-09-28 11:52:14 -05:00 · 12f8c3feac
commit 12f8c3feac
parent b6552987b0
5 changed files with 267 additions and 145 deletions
--- a/frigate/comms/recordings_updater.py
+++ b/frigate/comms/recordings_updater.py
@ -2,6 +2,7 @@

 import logging
 from enum import Enum
+from typing import Any

 from .zmq_proxy import Publisher, Subscriber

@ -10,18 +11,21 @@ logger = logging.getLogger(__name__)

 class RecordingsDataTypeEnum(str, Enum):
    all = ""
-    recordings_available_through = "recordings_available_through"
+    saved = "saved"  # segment has been saved to db
+    latest = "latest"  # segment is in cache
+    valid = "valid"  # segment is valid
+    invalid = "invalid"  # segment is invalid


-class RecordingsDataPublisher(Publisher[tuple[str, float]]):
+class RecordingsDataPublisher(Publisher[Any]):
    """Publishes latest recording data."""

    topic_base = "recordings/"

-    def __init__(self, topic: RecordingsDataTypeEnum) -> None:
-        super().__init__(topic.value)
+    def __init__(self) -> None:
+        super().__init__()

-    def publish(self, payload: tuple[str, float], sub_topic: str = "") -> None:
+    def publish(self, payload: Any, sub_topic: str = "") -> None:
        super().publish(payload, sub_topic)


@ -32,3 +36,11 @@ class RecordingsDataSubscriber(Subscriber):

    def __init__(self, topic: RecordingsDataTypeEnum) -> None:
        super().__init__(topic.value)
+
+    def _return_object(
+        self, topic: str, payload: tuple | None
+    ) -> tuple[str, Any] | tuple[None, None]:
+        if payload is None:
+            return (None, None)
+
+        return (topic, payload)
--- a/frigate/embeddings/maintainer.py
+++ b/frigate/embeddings/maintainer.py
@ -144,7 +144,7 @@ class EmbeddingMaintainer(threading.Thread):
            EventMetadataTypeEnum.regenerate_description
        )
        self.recordings_subscriber = RecordingsDataSubscriber(
-            RecordingsDataTypeEnum.recordings_available_through
+            RecordingsDataTypeEnum.saved
        )
        self.review_subscriber = ReviewDataSubscriber("")
        self.detection_subscriber = DetectionSubscriber(DetectionTypeEnum.video.value)
@ -525,20 +525,28 @@ class EmbeddingMaintainer(threading.Thread):
    def _process_recordings_updates(self) -> None:
        """Process recordings updates."""
        while True:
-            recordings_data = self.recordings_subscriber.check_for_update()
+            update = self.recordings_subscriber.check_for_update()

-            if recordings_data == None:
+            if not update:
                break

-            camera, recordings_available_through_timestamp = recordings_data
+            (raw_topic, payload) = update

-            self.recordings_available_through[camera] = (
-                recordings_available_through_timestamp
-            )
+            if not raw_topic or not payload:
+                break

-            logger.debug(
-                f"{camera} now has recordings available through {recordings_available_through_timestamp}"
-            )
+            topic = str(raw_topic)
+
+            if topic.endswith(RecordingsDataTypeEnum.saved.value):
+                camera, recordings_available_through_timestamp, _ = payload
+
+                self.recordings_available_through[camera] = (
+                    recordings_available_through_timestamp
+                )
+
+                logger.debug(
+                    f"{camera} now has recordings available through {recordings_available_through_timestamp}"
+                )

    def _process_review_updates(self) -> None:
        """Process review updates."""
--- a/frigate/record/maintainer.py
+++ b/frigate/record/maintainer.py
@ -80,9 +80,7 @@ class RecordingMaintainer(threading.Thread):
            [CameraConfigUpdateEnum.add, CameraConfigUpdateEnum.record],
        )
        self.detection_subscriber = DetectionSubscriber(DetectionTypeEnum.all.value)
-        self.recordings_publisher = RecordingsDataPublisher(
-            RecordingsDataTypeEnum.recordings_available_through
-        )
+        self.recordings_publisher = RecordingsDataPublisher()

        self.stop_event = stop_event
        self.object_recordings_info: dict[str, list] = defaultdict(list)
@ -98,6 +96,41 @@ class RecordingMaintainer(threading.Thread):
            and not d.startswith("preview_")
        ]

+        # publish newest cached segment per camera (including in use files)
+        newest_cache_segments: dict[str, dict[str, Any]] = {}
+        for cache in cache_files:
+            cache_path = os.path.join(CACHE_DIR, cache)
+            basename = os.path.splitext(cache)[0]
+            camera, date = basename.rsplit("@", maxsplit=1)
+            start_time = datetime.datetime.strptime(
+                date, CACHE_SEGMENT_FORMAT
+            ).astimezone(datetime.timezone.utc)
+            if (
+                camera not in newest_cache_segments
+                or start_time > newest_cache_segments[camera]["start_time"]
+            ):
+                newest_cache_segments[camera] = {
+                    "start_time": start_time,
+                    "cache_path": cache_path,
+                }
+
+        for camera, newest in newest_cache_segments.items():
+            self.recordings_publisher.publish(
+                (
+                    camera,
+                    newest["start_time"].timestamp(),
+                    newest["cache_path"],
+                ),
+                RecordingsDataTypeEnum.latest.value,
+            )
+        # publish None for cameras with no cache files (but only if we know the camera exists)
+        for camera_name in self.config.cameras:
+            if camera_name not in newest_cache_segments:
+                self.recordings_publisher.publish(
+                    (camera_name, None, None),
+                    RecordingsDataTypeEnum.latest.value,
+                )
+
        files_in_use = []
        for process in psutil.process_iter():
            try:
@ -111,7 +144,7 @@ class RecordingMaintainer(threading.Thread):
            except psutil.Error:
                continue

-        # group recordings by camera
+        # group recordings by camera (skip in-use for validation/moving)
        grouped_recordings: defaultdict[str, list[dict[str, Any]]] = defaultdict(list)
        for cache in cache_files:
            # Skip files currently in use
@ -233,7 +266,9 @@ class RecordingMaintainer(threading.Thread):
                    recordings[0]["start_time"].timestamp()
                    if self.config.cameras[camera].record.enabled
                    else None,
-                )
+                    None,
+                ),
+                RecordingsDataTypeEnum.saved.value,
            )

        recordings_to_insert: list[Optional[Recordings]] = await asyncio.gather(*tasks)
@ -250,7 +285,7 @@ class RecordingMaintainer(threading.Thread):

    async def validate_and_move_segment(
        self, camera: str, reviews: list[ReviewSegment], recording: dict[str, Any]
-    ) -> None:
+    ) -> Optional[Recordings]:
        cache_path: str = recording["cache_path"]
        start_time: datetime.datetime = recording["start_time"]
        record_config = self.config.cameras[camera].record
@ -261,7 +296,7 @@ class RecordingMaintainer(threading.Thread):
            or not self.config.cameras[camera].record.enabled
        ):
            self.drop_segment(cache_path)
-            return
+            return None

        if cache_path in self.end_time_cache:
            end_time, duration = self.end_time_cache[cache_path]
@ -270,10 +305,18 @@ class RecordingMaintainer(threading.Thread):
                self.config.ffmpeg, cache_path, get_duration=True
            )

-            if segment_info["duration"]:
-                duration = float(segment_info["duration"])
-            else:
-                duration = -1
+            if not segment_info.get("has_valid_video", False):
+                logger.warning(
+                    f"Invalid or missing video stream in segment {cache_path}. Discarding."
+                )
+                self.recordings_publisher.publish(
+                    (camera, start_time.timestamp(), cache_path),
+                    RecordingsDataTypeEnum.invalid.value,
+                )
+                self.drop_segment(cache_path)
+                return None
+
+            duration = float(segment_info.get("duration", -1))

            # ensure duration is within expected length
            if 0 < duration < MAX_SEGMENT_DURATION:
@ -284,8 +327,18 @@ class RecordingMaintainer(threading.Thread):
                    logger.warning(f"Failed to probe corrupt segment {cache_path}")

                logger.warning(f"Discarding a corrupt recording segment: {cache_path}")
-                Path(cache_path).unlink(missing_ok=True)
-                return
+                self.recordings_publisher.publish(
+                    (camera, start_time.timestamp(), cache_path),
+                    RecordingsDataTypeEnum.invalid.value,
+                )
+                self.drop_segment(cache_path)
+                return None
+
+            # this segment has a valid duration and has video data, so publish an update
+            self.recordings_publisher.publish(
+                (camera, start_time.timestamp(), cache_path),
+                RecordingsDataTypeEnum.valid.value,
+            )

        record_config = self.config.cameras[camera].record
        highest = None
--- a/frigate/util/services.py
+++ b/frigate/util/services.py
@ -603,87 +603,87 @@ def auto_detect_hwaccel() -> str:
 async def get_video_properties(
    ffmpeg, url: str, get_duration: bool = False
 ) -> dict[str, Any]:
-    async def calculate_duration(video: Optional[Any]) -> float:
-        duration = None
-
-        if video is not None:
-            # Get the frames per second (fps) of the video stream
-            fps = video.get(cv2.CAP_PROP_FPS)
-            total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
-
-            if fps and total_frames:
-                duration = total_frames / fps
-
-        # if cv2 failed need to use ffprobe
-        if duration is None:
-            p = await asyncio.create_subprocess_exec(
-                ffmpeg.ffprobe_path,
-                "-v",
-                "error",
-                "-show_entries",
-                "format=duration",
-                "-of",
-                "default=noprint_wrappers=1:nokey=1",
-                f"{url}",
-                stdout=asyncio.subprocess.PIPE,
-                stderr=asyncio.subprocess.PIPE,
+    async def probe_with_ffprobe(
+        url: str,
+    ) -> tuple[bool, int, int, Optional[str], float]:
+        """Fallback using ffprobe: returns (valid, width, height, codec, duration)."""
+        cmd = [
+            ffmpeg.ffprobe_path,
+            "-v",
+            "quiet",
+            "-print_format",
+            "json",
+            "-show_format",
+            "-show_streams",
+            url,
+        ]
+        try:
+            proc = await asyncio.create_subprocess_exec(
+                *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
            )
-            await p.wait()
+            stdout, _ = await proc.communicate()
+            if proc.returncode != 0:
+                return False, 0, 0, None, -1

-            if p.returncode == 0:
-                result = (await p.stdout.read()).decode()
-            else:
-                result = None
+            data = json.loads(stdout.decode())
+            video_streams = [
+                s for s in data.get("streams", []) if s.get("codec_type") == "video"
+            ]
+            if not video_streams:
+                return False, 0, 0, None, -1

-            if result:
-                try:
-                    duration = float(result.strip())
-                except ValueError:
-                    duration = -1
-            else:
-                duration = -1
+            v = video_streams[0]
+            width = int(v.get("width", 0))
+            height = int(v.get("height", 0))
+            codec = v.get("codec_name")

-        return duration
+            duration_str = data.get("format", {}).get("duration")
+            duration = float(duration_str) if duration_str else -1.0

-    width = height = 0
+            return True, width, height, codec, duration
+        except (json.JSONDecodeError, ValueError, KeyError, asyncio.SubprocessError):
+            return False, 0, 0, None, -1

-    try:
-        # Open the video stream using OpenCV
-        video = cv2.VideoCapture(url)
+    def probe_with_cv2(url: str) -> tuple[bool, int, int, Optional[str], float]:
+        """Primary attempt using cv2: returns (valid, width, height, fourcc, duration)."""
+        cap = cv2.VideoCapture(url)
+        if not cap.isOpened():
+            cap.release()
+            return False, 0, 0, None, -1

-        # Check if the video stream was opened successfully
-        if not video.isOpened():
-            video = None
-    except Exception:
-        video = None
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        valid = width > 0 and height > 0
+        fourcc = None
+        duration = -1.0

-    result = {}
+        if valid:
+            fourcc_int = int(cap.get(cv2.CAP_PROP_FOURCC))
+            fourcc = fourcc_int.to_bytes(4, "little").decode("latin-1").strip()

+            if get_duration:
+                fps = cap.get(cv2.CAP_PROP_FPS)
+                total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+                if fps > 0 and total_frames > 0:
+                    duration = total_frames / fps
+
+        cap.release()
+        return valid, width, height, fourcc, duration
+
+    # try cv2 first
+    has_video, width, height, fourcc, duration = probe_with_cv2(url)
+
+    # fallback to ffprobe if needed
+    if not has_video or (get_duration and duration < 0):
+        has_video, width, height, fourcc, duration = await probe_with_ffprobe(url)
+
+    result: dict[str, Any] = {"has_valid_video": has_video}
+    if has_video:
+        result.update({"width": width, "height": height})
+        if fourcc:
+            result["fourcc"] = fourcc
    if get_duration:
-        result["duration"] = await calculate_duration(video)
-
-    if video is not None:
-        # Get the width of frames in the video stream
-        width = video.get(cv2.CAP_PROP_FRAME_WIDTH)
-
-        # Get the height of frames in the video stream
-        height = video.get(cv2.CAP_PROP_FRAME_HEIGHT)
-
-        # Get the stream encoding
-        fourcc_int = int(video.get(cv2.CAP_PROP_FOURCC))
-        fourcc = (
-            chr((fourcc_int >> 0) & 255)
-            + chr((fourcc_int >> 8) & 255)
-            + chr((fourcc_int >> 16) & 255)
-            + chr((fourcc_int >> 24) & 255)
-        )
-
-        # Release the video stream
-        video.release()
-
-        result["width"] = round(width)
-        result["height"] = round(height)
-        result["fourcc"] = fourcc
+        result["duration"] = duration

    return result

--- a/frigate/video.py
+++ b/frigate/video.py
@ -1,10 +1,9 @@
-import datetime
 import logging
-import os
 import queue
 import subprocess as sp
 import threading
 import time
+from datetime import datetime, timedelta, timezone
 from multiprocessing import Queue, Value
 from multiprocessing.synchronize import Event as MpEvent
 from typing import Any
@ -13,6 +12,10 @@ import cv2

 from frigate.camera import CameraMetrics, PTZMetrics
 from frigate.comms.inter_process import InterProcessRequestor
+from frigate.comms.recordings_updater import (
+    RecordingsDataSubscriber,
+    RecordingsDataTypeEnum,
+)
 from frigate.config import CameraConfig, DetectConfig, ModelConfig
 from frigate.config.camera.camera import CameraTypeEnum
 from frigate.config.camera.updater import (
@ -20,8 +23,6 @@ from frigate.config.camera.updater import (
    CameraConfigUpdateSubscriber,
 )
 from frigate.const import (
-    CACHE_DIR,
-    CACHE_SEGMENT_FORMAT,
    PROCESS_PRIORITY_HIGH,
    REQUEST_REGION_GRID,
 )
@ -129,7 +130,7 @@ def capture_frames(

        fps.value = frame_rate.eps()
        skipped_fps.value = skipped_eps.eps()
-        current_frame.value = datetime.datetime.now().timestamp()
+        current_frame.value = datetime.now().timestamp()
        frame_name = f"{config.name}_frame{frame_index}"
        frame_buffer = frame_manager.write(frame_name)
        try:
@ -199,6 +200,11 @@ class CameraWatchdog(threading.Thread):
        self.requestor = InterProcessRequestor()
        self.was_enabled = self.config.enabled

+        self.segment_subscriber = RecordingsDataSubscriber(RecordingsDataTypeEnum.all)
+        self.latest_valid_segment_time: float = 0
+        self.latest_invalid_segment_time: float = 0
+        self.latest_cache_segment_time: float = 0
+
    def _update_enabled_state(self) -> bool:
        """Fetch the latest config and update enabled state."""
        self.config_subscriber.check_for_updates()
@ -243,6 +249,11 @@ class CameraWatchdog(threading.Thread):
                if enabled:
                    self.logger.debug(f"Enabling camera {self.config.name}")
                    self.start_all_ffmpeg()
+
+                    # reset all timestamps
+                    self.latest_valid_segment_time = 0
+                    self.latest_invalid_segment_time = 0
+                    self.latest_cache_segment_time = 0
                else:
                    self.logger.debug(f"Disabling camera {self.config.name}")
                    self.stop_all_ffmpeg()
@ -260,7 +271,37 @@ class CameraWatchdog(threading.Thread):
            if not enabled:
                continue

-            now = datetime.datetime.now().timestamp()
+            while True:
+                update = self.segment_subscriber.check_for_update(timeout=0)
+
+                if update == (None, None):
+                    break
+
+                raw_topic, payload = update
+                if raw_topic and payload:
+                    topic = str(raw_topic)
+                    camera, segment_time, _ = payload
+
+                    if camera != self.config.name:
+                        continue
+
+                    if topic.endswith(RecordingsDataTypeEnum.valid.value):
+                        self.logger.debug(
+                            f"Latest valid recording segment time on {camera}: {segment_time}"
+                        )
+                        self.latest_valid_segment_time = segment_time
+                    elif topic.endswith(RecordingsDataTypeEnum.invalid.value):
+                        self.logger.warning(
+                            f"Invalid recording segment detected for {camera} at {segment_time}"
+                        )
+                        self.latest_invalid_segment_time = segment_time
+                    elif topic.endswith(RecordingsDataTypeEnum.latest.value):
+                        if segment_time is not None:
+                            self.latest_cache_segment_time = segment_time
+                        else:
+                            self.latest_cache_segment_time = 0
+
+            now = datetime.now().timestamp()

            if not self.capture_thread.is_alive():
                self.requestor.send_data(f"{self.config.name}/status/detect", "offline")
@ -298,18 +339,55 @@ class CameraWatchdog(threading.Thread):
                poll = p["process"].poll()

                if self.config.record.enabled and "record" in p["roles"]:
-                    latest_segment_time = self.get_latest_segment_datetime(
-                        p.get(
-                            "latest_segment_time",
-                            datetime.datetime.now().astimezone(datetime.timezone.utc),
+                    now_utc = datetime.now().astimezone(timezone.utc)
+
+                    latest_cache_dt = (
+                        datetime.fromtimestamp(
+                            self.latest_cache_segment_time, tz=timezone.utc
                        )
+                        if self.latest_cache_segment_time > 0
+                        else now_utc - timedelta(seconds=1)
                    )

-                    if datetime.datetime.now().astimezone(datetime.timezone.utc) > (
-                        latest_segment_time + datetime.timedelta(seconds=120)
-                    ):
+                    latest_valid_dt = (
+                        datetime.fromtimestamp(
+                            self.latest_valid_segment_time, tz=timezone.utc
+                        )
+                        if self.latest_valid_segment_time > 0
+                        else now_utc - timedelta(seconds=1)
+                    )
+
+                    latest_invalid_dt = (
+                        datetime.fromtimestamp(
+                            self.latest_invalid_segment_time, tz=timezone.utc
+                        )
+                        if self.latest_invalid_segment_time > 0
+                        else now_utc - timedelta(seconds=1)
+                    )
+
+                    # ensure segments are still being created and that they have valid video data
+                    cache_stale = now_utc > (latest_cache_dt + timedelta(seconds=120))
+                    valid_stale = now_utc > (latest_valid_dt + timedelta(seconds=120))
+                    invalid_stale_condition = (
+                        self.latest_invalid_segment_time > 0
+                        and now_utc > (latest_invalid_dt + timedelta(seconds=120))
+                        and self.latest_valid_segment_time
+                        <= self.latest_invalid_segment_time
+                    )
+                    invalid_stale = invalid_stale_condition
+
+                    if cache_stale or valid_stale or invalid_stale:
+                        if cache_stale:
+                            reason = "No new recording segments were created"
+                        elif valid_stale:
+                            reason = "No new valid recording segments were created"
+                        else:  # invalid_stale
+                            reason = (
+                                "No valid segments created since last invalid segment"
+                            )
+
                        self.logger.error(
-                            f"No new recording segments were created for {self.config.name} in the last 120s. restarting the ffmpeg record process..."
+                            f"{reason} for {self.config.name} in the last 120s. Restarting the ffmpeg record process..."
                        )
                        p["process"] = start_or_restart_ffmpeg(
                            p["cmd"],
@ -328,7 +406,7 @@ class CameraWatchdog(threading.Thread):
                        self.requestor.send_data(
                            f"{self.config.name}/status/record", "online"
                        )
-                        p["latest_segment_time"] = latest_segment_time
+                        p["latest_segment_time"] = self.latest_cache_segment_time

                if poll is None:
                    continue
@ -346,6 +424,7 @@ class CameraWatchdog(threading.Thread):
        self.stop_all_ffmpeg()
        self.logpipe.close()
        self.config_subscriber.stop()
+        self.segment_subscriber.stop()

    def start_ffmpeg_detect(self):
        ffmpeg_cmd = [
@ -405,33 +484,6 @@ class CameraWatchdog(threading.Thread):
            p["logpipe"].close()
        self.ffmpeg_other_processes.clear()

-    def get_latest_segment_datetime(
-        self, latest_segment: datetime.datetime
-    ) -> datetime.datetime:
-        """Checks if ffmpeg is still writing recording segments to cache."""
-        cache_files = sorted(
-            [
-                d
-                for d in os.listdir(CACHE_DIR)
-                if os.path.isfile(os.path.join(CACHE_DIR, d))
-                and d.endswith(".mp4")
-                and not d.startswith("preview_")
-            ]
-        )
-        newest_segment_time = latest_segment
-
-        for file in cache_files:
-            if self.config.name in file:
-                basename = os.path.splitext(file)[0]
-                _, date = basename.rsplit("@", maxsplit=1)
-                segment_time = datetime.datetime.strptime(
-                    date, CACHE_SEGMENT_FORMAT
-                ).astimezone(datetime.timezone.utc)
-                if segment_time > newest_segment_time:
-                    newest_segment_time = segment_time
-
-        return newest_segment_time
-

 class CameraCaptureRunner(threading.Thread):
    def __init__(
@ -727,10 +779,7 @@ def process_frames(
            time.sleep(0.1)
            continue

-        if (
-            datetime.datetime.now().astimezone(datetime.timezone.utc)
-            > next_region_update
-        ):
+        if datetime.now().astimezone(timezone.utc) > next_region_update:
            region_grid = requestor.send_data(REQUEST_REGION_GRID, camera_config.name)
            next_region_update = get_tomorrow_at_time(2)