Watchdog enhancements (#20237)
Some checks are pending
CI / ARM Extra Build (push) Blocked by required conditions
CI / Synaptics Build (push) Blocked by required conditions
CI / Assemble and push default build (push) Blocked by required conditions
CI / AMD64 Build (push) Waiting to run
CI / ARM Build (push) Waiting to run
CI / Jetson Jetpack 6 (push) Waiting to run
CI / AMD64 Extra Build (push) Blocked by required conditions

* refactor get_video_properties and use json output from ffprobe

* add zmq topic

* publish valid segment data in recording maintainer

* check for valid video data

- restart separate record ffmpeg process if no video data has been received in 120s
- refactor datetime import

* listen to correct topic in embeddings maintainer

* refactor to move get_latest_segment_datetime logic to recordings maintainer

* debug logging

* cleanup
This commit is contained in:
Josh Hawkins 2025-09-28 11:52:14 -05:00 committed by GitHub
parent b6552987b0
commit 12f8c3feac
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 267 additions and 145 deletions

View File

@ -2,6 +2,7 @@
import logging
from enum import Enum
from typing import Any
from .zmq_proxy import Publisher, Subscriber
@ -10,18 +11,21 @@ logger = logging.getLogger(__name__)
class RecordingsDataTypeEnum(str, Enum):
all = ""
recordings_available_through = "recordings_available_through"
saved = "saved" # segment has been saved to db
latest = "latest" # segment is in cache
valid = "valid" # segment is valid
invalid = "invalid" # segment is invalid
class RecordingsDataPublisher(Publisher[tuple[str, float]]):
class RecordingsDataPublisher(Publisher[Any]):
"""Publishes latest recording data."""
topic_base = "recordings/"
def __init__(self, topic: RecordingsDataTypeEnum) -> None:
super().__init__(topic.value)
def __init__(self) -> None:
super().__init__()
def publish(self, payload: tuple[str, float], sub_topic: str = "") -> None:
def publish(self, payload: Any, sub_topic: str = "") -> None:
super().publish(payload, sub_topic)
@ -32,3 +36,11 @@ class RecordingsDataSubscriber(Subscriber):
def __init__(self, topic: RecordingsDataTypeEnum) -> None:
super().__init__(topic.value)
def _return_object(
self, topic: str, payload: tuple | None
) -> tuple[str, Any] | tuple[None, None]:
# Pair a received payload with the topic it arrived on.
# A missing payload yields the (None, None) sentinel so the result can
# always be unpacked into two values.
if payload is None:
return (None, None)
# Otherwise hand back the topic and the payload unchanged.
return (topic, payload)

View File

@ -144,7 +144,7 @@ class EmbeddingMaintainer(threading.Thread):
EventMetadataTypeEnum.regenerate_description
)
self.recordings_subscriber = RecordingsDataSubscriber(
RecordingsDataTypeEnum.recordings_available_through
RecordingsDataTypeEnum.saved
)
self.review_subscriber = ReviewDataSubscriber("")
self.detection_subscriber = DetectionSubscriber(DetectionTypeEnum.video.value)
@ -525,20 +525,28 @@ class EmbeddingMaintainer(threading.Thread):
def _process_recordings_updates(self) -> None:
"""Process recordings updates."""
while True:
recordings_data = self.recordings_subscriber.check_for_update()
update = self.recordings_subscriber.check_for_update()
if recordings_data == None:
if not update:
break
camera, recordings_available_through_timestamp = recordings_data
(raw_topic, payload) = update
self.recordings_available_through[camera] = (
recordings_available_through_timestamp
)
if not raw_topic or not payload:
break
logger.debug(
f"{camera} now has recordings available through {recordings_available_through_timestamp}"
)
topic = str(raw_topic)
if topic.endswith(RecordingsDataTypeEnum.saved.value):
camera, recordings_available_through_timestamp, _ = payload
self.recordings_available_through[camera] = (
recordings_available_through_timestamp
)
logger.debug(
f"{camera} now has recordings available through {recordings_available_through_timestamp}"
)
def _process_review_updates(self) -> None:
"""Process review updates."""

View File

@ -80,9 +80,7 @@ class RecordingMaintainer(threading.Thread):
[CameraConfigUpdateEnum.add, CameraConfigUpdateEnum.record],
)
self.detection_subscriber = DetectionSubscriber(DetectionTypeEnum.all.value)
self.recordings_publisher = RecordingsDataPublisher(
RecordingsDataTypeEnum.recordings_available_through
)
self.recordings_publisher = RecordingsDataPublisher()
self.stop_event = stop_event
self.object_recordings_info: dict[str, list] = defaultdict(list)
@ -98,6 +96,41 @@ class RecordingMaintainer(threading.Thread):
and not d.startswith("preview_")
]
# publish newest cached segment per camera (including in use files)
newest_cache_segments: dict[str, dict[str, Any]] = {}
for cache in cache_files:
cache_path = os.path.join(CACHE_DIR, cache)
basename = os.path.splitext(cache)[0]
camera, date = basename.rsplit("@", maxsplit=1)
start_time = datetime.datetime.strptime(
date, CACHE_SEGMENT_FORMAT
).astimezone(datetime.timezone.utc)
if (
camera not in newest_cache_segments
or start_time > newest_cache_segments[camera]["start_time"]
):
newest_cache_segments[camera] = {
"start_time": start_time,
"cache_path": cache_path,
}
for camera, newest in newest_cache_segments.items():
self.recordings_publisher.publish(
(
camera,
newest["start_time"].timestamp(),
newest["cache_path"],
),
RecordingsDataTypeEnum.latest.value,
)
# publish None for cameras with no cache files (but only if we know the camera exists)
for camera_name in self.config.cameras:
if camera_name not in newest_cache_segments:
self.recordings_publisher.publish(
(camera_name, None, None),
RecordingsDataTypeEnum.latest.value,
)
files_in_use = []
for process in psutil.process_iter():
try:
@ -111,7 +144,7 @@ class RecordingMaintainer(threading.Thread):
except psutil.Error:
continue
# group recordings by camera
# group recordings by camera (skip in-use for validation/moving)
grouped_recordings: defaultdict[str, list[dict[str, Any]]] = defaultdict(list)
for cache in cache_files:
# Skip files currently in use
@ -233,7 +266,9 @@ class RecordingMaintainer(threading.Thread):
recordings[0]["start_time"].timestamp()
if self.config.cameras[camera].record.enabled
else None,
)
None,
),
RecordingsDataTypeEnum.saved.value,
)
recordings_to_insert: list[Optional[Recordings]] = await asyncio.gather(*tasks)
@ -250,7 +285,7 @@ class RecordingMaintainer(threading.Thread):
async def validate_and_move_segment(
self, camera: str, reviews: list[ReviewSegment], recording: dict[str, Any]
) -> None:
) -> Optional[Recordings]:
cache_path: str = recording["cache_path"]
start_time: datetime.datetime = recording["start_time"]
record_config = self.config.cameras[camera].record
@ -261,7 +296,7 @@ class RecordingMaintainer(threading.Thread):
or not self.config.cameras[camera].record.enabled
):
self.drop_segment(cache_path)
return
return None
if cache_path in self.end_time_cache:
end_time, duration = self.end_time_cache[cache_path]
@ -270,10 +305,18 @@ class RecordingMaintainer(threading.Thread):
self.config.ffmpeg, cache_path, get_duration=True
)
if segment_info["duration"]:
duration = float(segment_info["duration"])
else:
duration = -1
if not segment_info.get("has_valid_video", False):
logger.warning(
f"Invalid or missing video stream in segment {cache_path}. Discarding."
)
self.recordings_publisher.publish(
(camera, start_time.timestamp(), cache_path),
RecordingsDataTypeEnum.invalid.value,
)
self.drop_segment(cache_path)
return None
duration = float(segment_info.get("duration", -1))
# ensure duration is within expected length
if 0 < duration < MAX_SEGMENT_DURATION:
@ -284,8 +327,18 @@ class RecordingMaintainer(threading.Thread):
logger.warning(f"Failed to probe corrupt segment {cache_path}")
logger.warning(f"Discarding a corrupt recording segment: {cache_path}")
Path(cache_path).unlink(missing_ok=True)
return
self.recordings_publisher.publish(
(camera, start_time.timestamp(), cache_path),
RecordingsDataTypeEnum.invalid.value,
)
self.drop_segment(cache_path)
return None
# this segment has a valid duration and has video data, so publish an update
self.recordings_publisher.publish(
(camera, start_time.timestamp(), cache_path),
RecordingsDataTypeEnum.valid.value,
)
record_config = self.config.cameras[camera].record
highest = None

View File

@ -603,87 +603,87 @@ def auto_detect_hwaccel() -> str:
async def get_video_properties(
ffmpeg, url: str, get_duration: bool = False
) -> dict[str, Any]:
async def calculate_duration(video: Optional[Any]) -> float:
duration = None
if video is not None:
# Get the frames per second (fps) of the video stream
fps = video.get(cv2.CAP_PROP_FPS)
total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
if fps and total_frames:
duration = total_frames / fps
# if cv2 failed need to use ffprobe
if duration is None:
p = await asyncio.create_subprocess_exec(
ffmpeg.ffprobe_path,
"-v",
"error",
"-show_entries",
"format=duration",
"-of",
"default=noprint_wrappers=1:nokey=1",
f"{url}",
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
async def probe_with_ffprobe(
url: str,
) -> tuple[bool, int, int, Optional[str], float]:
"""Fallback using ffprobe: returns (valid, width, height, codec, duration)."""
cmd = [
ffmpeg.ffprobe_path,
"-v",
"quiet",
"-print_format",
"json",
"-show_format",
"-show_streams",
url,
]
try:
proc = await asyncio.create_subprocess_exec(
*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
)
await p.wait()
stdout, _ = await proc.communicate()
if proc.returncode != 0:
return False, 0, 0, None, -1
if p.returncode == 0:
result = (await p.stdout.read()).decode()
else:
result = None
data = json.loads(stdout.decode())
video_streams = [
s for s in data.get("streams", []) if s.get("codec_type") == "video"
]
if not video_streams:
return False, 0, 0, None, -1
if result:
try:
duration = float(result.strip())
except ValueError:
duration = -1
else:
duration = -1
v = video_streams[0]
width = int(v.get("width", 0))
height = int(v.get("height", 0))
codec = v.get("codec_name")
return duration
duration_str = data.get("format", {}).get("duration")
duration = float(duration_str) if duration_str else -1.0
width = height = 0
return True, width, height, codec, duration
except (json.JSONDecodeError, ValueError, KeyError, asyncio.SubprocessError):
return False, 0, 0, None, -1
try:
# Open the video stream using OpenCV
video = cv2.VideoCapture(url)
def probe_with_cv2(url: str) -> tuple[bool, int, int, Optional[str], float]:
"""Primary attempt using cv2: returns (valid, width, height, fourcc, duration)."""
cap = cv2.VideoCapture(url)
if not cap.isOpened():
cap.release()
return False, 0, 0, None, -1
# Check if the video stream was opened successfully
if not video.isOpened():
video = None
except Exception:
video = None
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
valid = width > 0 and height > 0
fourcc = None
duration = -1.0
result = {}
if valid:
fourcc_int = int(cap.get(cv2.CAP_PROP_FOURCC))
fourcc = fourcc_int.to_bytes(4, "little").decode("latin-1").strip()
if get_duration:
fps = cap.get(cv2.CAP_PROP_FPS)
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
if fps > 0 and total_frames > 0:
duration = total_frames / fps
cap.release()
return valid, width, height, fourcc, duration
# try cv2 first
has_video, width, height, fourcc, duration = probe_with_cv2(url)
# fallback to ffprobe if needed
if not has_video or (get_duration and duration < 0):
has_video, width, height, fourcc, duration = await probe_with_ffprobe(url)
result: dict[str, Any] = {"has_valid_video": has_video}
if has_video:
result.update({"width": width, "height": height})
if fourcc:
result["fourcc"] = fourcc
if get_duration:
result["duration"] = await calculate_duration(video)
if video is not None:
# Get the width of frames in the video stream
width = video.get(cv2.CAP_PROP_FRAME_WIDTH)
# Get the height of frames in the video stream
height = video.get(cv2.CAP_PROP_FRAME_HEIGHT)
# Get the stream encoding
fourcc_int = int(video.get(cv2.CAP_PROP_FOURCC))
fourcc = (
chr((fourcc_int >> 0) & 255)
+ chr((fourcc_int >> 8) & 255)
+ chr((fourcc_int >> 16) & 255)
+ chr((fourcc_int >> 24) & 255)
)
# Release the video stream
video.release()
result["width"] = round(width)
result["height"] = round(height)
result["fourcc"] = fourcc
result["duration"] = duration
return result

View File

@ -1,10 +1,9 @@
import datetime
import logging
import os
import queue
import subprocess as sp
import threading
import time
from datetime import datetime, timedelta, timezone
from multiprocessing import Queue, Value
from multiprocessing.synchronize import Event as MpEvent
from typing import Any
@ -13,6 +12,10 @@ import cv2
from frigate.camera import CameraMetrics, PTZMetrics
from frigate.comms.inter_process import InterProcessRequestor
from frigate.comms.recordings_updater import (
RecordingsDataSubscriber,
RecordingsDataTypeEnum,
)
from frigate.config import CameraConfig, DetectConfig, ModelConfig
from frigate.config.camera.camera import CameraTypeEnum
from frigate.config.camera.updater import (
@ -20,8 +23,6 @@ from frigate.config.camera.updater import (
CameraConfigUpdateSubscriber,
)
from frigate.const import (
CACHE_DIR,
CACHE_SEGMENT_FORMAT,
PROCESS_PRIORITY_HIGH,
REQUEST_REGION_GRID,
)
@ -129,7 +130,7 @@ def capture_frames(
fps.value = frame_rate.eps()
skipped_fps.value = skipped_eps.eps()
current_frame.value = datetime.datetime.now().timestamp()
current_frame.value = datetime.now().timestamp()
frame_name = f"{config.name}_frame{frame_index}"
frame_buffer = frame_manager.write(frame_name)
try:
@ -199,6 +200,11 @@ class CameraWatchdog(threading.Thread):
self.requestor = InterProcessRequestor()
self.was_enabled = self.config.enabled
self.segment_subscriber = RecordingsDataSubscriber(RecordingsDataTypeEnum.all)
self.latest_valid_segment_time: float = 0
self.latest_invalid_segment_time: float = 0
self.latest_cache_segment_time: float = 0
def _update_enabled_state(self) -> bool:
"""Fetch the latest config and update enabled state."""
self.config_subscriber.check_for_updates()
@ -243,6 +249,11 @@ class CameraWatchdog(threading.Thread):
if enabled:
self.logger.debug(f"Enabling camera {self.config.name}")
self.start_all_ffmpeg()
# reset all timestamps
self.latest_valid_segment_time = 0
self.latest_invalid_segment_time = 0
self.latest_cache_segment_time = 0
else:
self.logger.debug(f"Disabling camera {self.config.name}")
self.stop_all_ffmpeg()
@ -260,7 +271,37 @@ class CameraWatchdog(threading.Thread):
if not enabled:
continue
now = datetime.datetime.now().timestamp()
while True:
update = self.segment_subscriber.check_for_update(timeout=0)
if update == (None, None):
break
raw_topic, payload = update
if raw_topic and payload:
topic = str(raw_topic)
camera, segment_time, _ = payload
if camera != self.config.name:
continue
if topic.endswith(RecordingsDataTypeEnum.valid.value):
self.logger.debug(
f"Latest valid recording segment time on {camera}: {segment_time}"
)
self.latest_valid_segment_time = segment_time
elif topic.endswith(RecordingsDataTypeEnum.invalid.value):
self.logger.warning(
f"Invalid recording segment detected for {camera} at {segment_time}"
)
self.latest_invalid_segment_time = segment_time
elif topic.endswith(RecordingsDataTypeEnum.latest.value):
if segment_time is not None:
self.latest_cache_segment_time = segment_time
else:
self.latest_cache_segment_time = 0
now = datetime.now().timestamp()
if not self.capture_thread.is_alive():
self.requestor.send_data(f"{self.config.name}/status/detect", "offline")
@ -298,18 +339,55 @@ class CameraWatchdog(threading.Thread):
poll = p["process"].poll()
if self.config.record.enabled and "record" in p["roles"]:
latest_segment_time = self.get_latest_segment_datetime(
p.get(
"latest_segment_time",
datetime.datetime.now().astimezone(datetime.timezone.utc),
now_utc = datetime.now().astimezone(timezone.utc)
latest_cache_dt = (
datetime.fromtimestamp(
self.latest_cache_segment_time, tz=timezone.utc
)
if self.latest_cache_segment_time > 0
else now_utc - timedelta(seconds=1)
)
if datetime.datetime.now().astimezone(datetime.timezone.utc) > (
latest_segment_time + datetime.timedelta(seconds=120)
):
latest_valid_dt = (
datetime.fromtimestamp(
self.latest_valid_segment_time, tz=timezone.utc
)
if self.latest_valid_segment_time > 0
else now_utc - timedelta(seconds=1)
)
latest_invalid_dt = (
datetime.fromtimestamp(
self.latest_invalid_segment_time, tz=timezone.utc
)
if self.latest_invalid_segment_time > 0
else now_utc - timedelta(seconds=1)
)
# ensure segments are still being created and that they have valid video data
cache_stale = now_utc > (latest_cache_dt + timedelta(seconds=120))
valid_stale = now_utc > (latest_valid_dt + timedelta(seconds=120))
invalid_stale_condition = (
self.latest_invalid_segment_time > 0
and now_utc > (latest_invalid_dt + timedelta(seconds=120))
and self.latest_valid_segment_time
<= self.latest_invalid_segment_time
)
invalid_stale = invalid_stale_condition
if cache_stale or valid_stale or invalid_stale:
if cache_stale:
reason = "No new recording segments were created"
elif valid_stale:
reason = "No new valid recording segments were created"
else: # invalid_stale
reason = (
"No valid segments created since last invalid segment"
)
self.logger.error(
f"No new recording segments were created for {self.config.name} in the last 120s. restarting the ffmpeg record process..."
f"{reason} for {self.config.name} in the last 120s. Restarting the ffmpeg record process..."
)
p["process"] = start_or_restart_ffmpeg(
p["cmd"],
@ -328,7 +406,7 @@ class CameraWatchdog(threading.Thread):
self.requestor.send_data(
f"{self.config.name}/status/record", "online"
)
p["latest_segment_time"] = latest_segment_time
p["latest_segment_time"] = self.latest_cache_segment_time
if poll is None:
continue
@ -346,6 +424,7 @@ class CameraWatchdog(threading.Thread):
self.stop_all_ffmpeg()
self.logpipe.close()
self.config_subscriber.stop()
self.segment_subscriber.stop()
def start_ffmpeg_detect(self):
ffmpeg_cmd = [
@ -405,33 +484,6 @@ class CameraWatchdog(threading.Thread):
p["logpipe"].close()
self.ffmpeg_other_processes.clear()
def get_latest_segment_datetime(
self, latest_segment: datetime.datetime
) -> datetime.datetime:
"""Checks if ffmpeg is still writing recording segments to cache."""
cache_files = sorted(
[
d
for d in os.listdir(CACHE_DIR)
if os.path.isfile(os.path.join(CACHE_DIR, d))
and d.endswith(".mp4")
and not d.startswith("preview_")
]
)
newest_segment_time = latest_segment
for file in cache_files:
if self.config.name in file:
basename = os.path.splitext(file)[0]
_, date = basename.rsplit("@", maxsplit=1)
segment_time = datetime.datetime.strptime(
date, CACHE_SEGMENT_FORMAT
).astimezone(datetime.timezone.utc)
if segment_time > newest_segment_time:
newest_segment_time = segment_time
return newest_segment_time
class CameraCaptureRunner(threading.Thread):
def __init__(
@ -727,10 +779,7 @@ def process_frames(
time.sleep(0.1)
continue
if (
datetime.datetime.now().astimezone(datetime.timezone.utc)
> next_region_update
):
if datetime.now().astimezone(timezone.utc) > next_region_update:
region_grid = requestor.send_data(REQUEST_REGION_GRID, camera_config.name)
next_region_update = get_tomorrow_at_time(2)