diff --git a/frigate/events/audio.py b/frigate/events/audio.py index 8a929c8ff..5bf8c63b0 100644 --- a/frigate/events/audio.py +++ b/frigate/events/audio.py @@ -1,6 +1,7 @@ """Handle creating audio events.""" import datetime +import json import logging import random import string @@ -18,7 +19,7 @@ from frigate.comms.event_metadata_updater import ( EventMetadataTypeEnum, ) from frigate.comms.inter_process import InterProcessRequestor -from frigate.config import CameraConfig, CameraInput, FfmpegConfig +from frigate.config import CameraConfig, CameraInput, FfmpegConfig, FrigateConfig from frigate.config.camera.updater import ( CameraConfigUpdateEnum, CameraConfigUpdateSubscriber, @@ -30,9 +31,13 @@ from frigate.const import ( AUDIO_MIN_CONFIDENCE, AUDIO_SAMPLE_RATE, ) +from frigate.data_processing.real_time.audio_transcription import ( + AudioTranscriptionRealTimeProcessor, +) from frigate.ffmpeg_presets import parse_preset_input from frigate.log import LogPipe from frigate.object_detection.base import load_labels +from frigate.types import TrackedObjectUpdateTypesEnum from frigate.util.builtin import get_ffmpeg_arg_list from frigate.video import start_or_restart_ffmpeg, stop_ffmpeg @@ -75,6 +80,7 @@ class AudioProcessor(util.Process): def __init__( self, + config: FrigateConfig, cameras: list[CameraConfig], camera_metrics: dict[str, CameraMetrics], ): @@ -82,6 +88,7 @@ class AudioProcessor(util.Process): self.camera_metrics = camera_metrics self.cameras = cameras + self.config = config def run(self) -> None: audio_threads: list[AudioEventMaintainer] = [] @@ -94,6 +101,7 @@ class AudioProcessor(util.Process): for camera in self.cameras: audio_thread = AudioEventMaintainer( camera, + self.config, self.camera_metrics, self.stop_event, ) @@ -122,46 +130,62 @@ class AudioEventMaintainer(threading.Thread): def __init__( self, camera: CameraConfig, + config: FrigateConfig, camera_metrics: dict[str, CameraMetrics], stop_event: threading.Event, ) -> None: super().__init__(name=f"{camera.name}_audio_event_processor") - self.config = camera + self.config = config + self.camera_config = camera self.camera_metrics = camera_metrics self.detections: dict[dict[str, Any]] = {} self.stop_event = stop_event - self.detector = AudioTfl(stop_event, self.config.audio.num_threads) + self.detector = AudioTfl(stop_event, self.camera_config.audio.num_threads) self.shape = (int(round(AUDIO_DURATION * AUDIO_SAMPLE_RATE)),) self.chunk_size = int(round(AUDIO_DURATION * AUDIO_SAMPLE_RATE * 2)) - self.logger = logging.getLogger(f"audio.{self.config.name}") - self.ffmpeg_cmd = get_ffmpeg_command(self.config.ffmpeg) - self.logpipe = LogPipe(f"ffmpeg.{self.config.name}.audio") + self.logger = logging.getLogger(f"audio.{self.camera_config.name}") + self.ffmpeg_cmd = get_ffmpeg_command(self.camera_config.ffmpeg) + self.logpipe = LogPipe(f"ffmpeg.{self.camera_config.name}.audio") self.audio_listener = None + self.transcription_processor = None # create communication for audio detections self.requestor = InterProcessRequestor() self.config_subscriber = CameraConfigUpdateSubscriber( - {self.config.name: self.config}, - [CameraConfigUpdateEnum.audio, CameraConfigUpdateEnum.enabled], + {self.camera_config.name: self.camera_config}, + [ + CameraConfigUpdateEnum.audio, + CameraConfigUpdateEnum.enabled, + CameraConfigUpdateEnum.audio_transcription, + ], ) self.detection_publisher = DetectionPublisher(DetectionTypeEnum.audio) self.event_metadata_publisher = EventMetadataPublisher() + if self.camera_config.audio_transcription.enabled_in_config: + # init the transcription processor for this camera + self.transcription_processor = AudioTranscriptionRealTimeProcessor( + config=self.config, + camera_config=self.camera_config, + requestor=self.requestor, + metrics=self.camera_metrics[self.camera_config.name], + ) + self.was_enabled = camera.enabled def detect_audio(self, audio) -> None: - if not self.config.audio.enabled or self.stop_event.is_set(): + if not self.camera_config.audio.enabled or self.stop_event.is_set(): return audio_as_float = audio.astype(np.float32) rms, dBFS = self.calculate_audio_levels(audio_as_float) - self.camera_metrics[self.config.name].audio_rms.value = rms - self.camera_metrics[self.config.name].audio_dBFS.value = dBFS + self.camera_metrics[self.camera_config.name].audio_rms.value = rms + self.camera_metrics[self.camera_config.name].audio_dBFS.value = dBFS # only run audio detection when volume is above min_volume - if rms >= self.config.audio.min_volume: + if rms >= self.camera_config.audio.min_volume: # create waveform relative to max range and look for detections waveform = (audio / AUDIO_MAX_BIT_RANGE).astype(np.float32) model_detections = self.detector.detect(waveform) @@ -169,28 +193,43 @@ class AudioEventMaintainer(threading.Thread): for label, score, _ in model_detections: self.logger.debug( - f"{self.config.name} heard {label} with a score of {score}" + f"{self.camera_config.name} heard {label} with a score of {score}" ) - if label not in self.config.audio.listen: + if label not in self.camera_config.audio.listen: continue - if score > dict((self.config.audio.filters or {}).get(label, {})).get( - "threshold", 0.8 - ): + if score > dict( + (self.camera_config.audio.filters or {}).get(label, {}) + ).get("threshold", 0.8): self.handle_detection(label, score) audio_detections.append(label) # send audio detection data self.detection_publisher.publish( ( - self.config.name, + self.camera_config.name, datetime.datetime.now().timestamp(), dBFS, audio_detections, ) ) + # run audio transcription + if self.transcription_processor is not None and ( + # rms >= self.camera_config.audio.min_volume or self.is_endpoint is False + self.camera_config.audio_transcription.live_enabled + ): + self.transcribing = True + # process audio until we've reached the endpoint + self.transcription_processor.process_audio( + { + "id": f"{self.camera_config.name}_audio", + "camera": self.camera_config.name, + }, + audio, + ) + self.expire_detections() def calculate_audio_levels(self, audio_as_float: np.float32) -> Tuple[float, float]: @@ -204,8 +243,8 @@ class AudioEventMaintainer(threading.Thread): else: dBFS = 0 - self.requestor.send_data(f"{self.config.name}/audio/dBFS", float(dBFS)) - self.requestor.send_data(f"{self.config.name}/audio/rms", float(rms)) + self.requestor.send_data(f"{self.camera_config.name}/audio/dBFS", float(dBFS)) + self.requestor.send_data(f"{self.camera_config.name}/audio/rms", float(rms)) return float(rms), float(dBFS) @@ -220,13 +259,13 @@ class AudioEventMaintainer(threading.Thread): random.choices(string.ascii_lowercase + string.digits, k=6) ) event_id = f"{now}-{rand_id}" - self.requestor.send_data(f"{self.config.name}/audio/{label}", "ON") + self.requestor.send_data(f"{self.camera_config.name}/audio/{label}", "ON") self.event_metadata_publisher.publish( EventMetadataTypeEnum.manual_event_create, ( now, - self.config.name, + self.camera_config.name, label, event_id, True, @@ -252,10 +291,10 @@ class AudioEventMaintainer(threading.Thread): if ( now - detection.get("last_detection", now) - > self.config.audio.max_not_heard + > self.camera_config.audio.max_not_heard ): self.requestor.send_data( - f"{self.config.name}/audio/{detection['label']}", "OFF" + f"{self.camera_config.name}/audio/{detection['label']}", "OFF" ) self.event_metadata_publisher.publish( @@ -264,12 +303,28 @@ class AudioEventMaintainer(threading.Thread): ) self.detections[detection["label"]] = None + # clear real-time transcription + if self.transcription_processor is not None: + self.transcription_processor.reset(self.camera_config.name) + self.requestor.send_data( + "tracked_object_update", + json.dumps( + { + "type": TrackedObjectUpdateTypesEnum.transcription, + "text": "", + "camera": self.camera_config.name, + } + ), + ) + def expire_all_detections(self) -> None: """Immediately end all current detections""" now = datetime.datetime.now().timestamp() for label, detection in list(self.detections.items()): if detection: - self.requestor.send_data(f"{self.config.name}/audio/{label}", "OFF") + self.requestor.send_data( + f"{self.camera_config.name}/audio/{label}", "OFF" + ) self.event_metadata_publisher.publish( EventMetadataTypeEnum.manual_event_end, (detection["id"], now), @@ -290,7 +345,7 @@ class AudioEventMaintainer(threading.Thread): if self.stop_event.is_set(): return - time.sleep(self.config.ffmpeg.retry_interval) + time.sleep(self.camera_config.ffmpeg.retry_interval) self.logpipe.dump() self.start_or_restart_ffmpeg() @@ -312,20 +367,20 @@ class AudioEventMaintainer(threading.Thread): log_and_restart() def run(self) -> None: - if self.config.enabled: + if self.camera_config.enabled: self.start_or_restart_ffmpeg() while not self.stop_event.is_set(): - enabled = self.config.enabled + enabled = self.camera_config.enabled if enabled != self.was_enabled: if enabled: self.logger.debug( - f"Enabling audio detections for {self.config.name}" + f"Enabling audio detections for {self.camera_config.name}" ) self.start_or_restart_ffmpeg() else: self.logger.debug( - f"Disabling audio detections for {self.config.name}, ending events" + f"Disabling audio detections for {self.camera_config.name}, ending events" ) self.expire_all_detections() stop_ffmpeg(self.audio_listener, self.logger) diff --git a/frigate/util/audio.py b/frigate/util/audio.py new file mode 100644 index 000000000..eede9c0ea --- /dev/null +++ b/frigate/util/audio.py @@ -0,0 +1,116 @@ +"""Utilities for creating and manipulating audio.""" + +import logging +import os +import subprocess as sp +from typing import Optional + +from pathvalidate import sanitize_filename + +from frigate.const import CACHE_DIR +from frigate.models import Recordings + +logger = logging.getLogger(__name__) + + +def get_audio_from_recording( + ffmpeg, + camera_name: str, + start_ts: float, + end_ts: float, + sample_rate: int = 16000, +) -> Optional[bytes]: + """Extract audio from recording files between start_ts and end_ts in WAV format suitable for sherpa-onnx. + + Args: + ffmpeg: FFmpeg configuration object + camera_name: Name of the camera + start_ts: Start timestamp in seconds + end_ts: End timestamp in seconds + sample_rate: Sample rate for output audio (default 16kHz for sherpa-onnx) + + Returns: + Bytes of WAV audio data or None if extraction failed + """ + # Fetch all relevant recording segments + recordings = ( + Recordings.select( + Recordings.path, + Recordings.start_time, + Recordings.end_time, + ) + .where( + (Recordings.start_time.between(start_ts, end_ts)) + | (Recordings.end_time.between(start_ts, end_ts)) + | ((start_ts > Recordings.start_time) & (end_ts < Recordings.end_time)) + ) + .where(Recordings.camera == camera_name) + .order_by(Recordings.start_time.asc()) + ) + + if not recordings: + logger.debug( + f"No recordings found for {camera_name} between {start_ts} and {end_ts}" + ) + return None + + # Generate concat playlist file + file_name = sanitize_filename( + f"audio_playlist_{camera_name}_{start_ts}-{end_ts}.txt" + ) + file_path = os.path.join(CACHE_DIR, file_name) + try: + with open(file_path, "w") as file: + for clip in recordings: + file.write(f"file '{clip.path}'\n") + if clip.start_time < start_ts: + file.write(f"inpoint {int(start_ts - clip.start_time)}\n") + if clip.end_time > end_ts: + file.write(f"outpoint {int(end_ts - clip.start_time)}\n") + + ffmpeg_cmd = [ + ffmpeg.ffmpeg_path, + "-hide_banner", + "-loglevel", + "warning", + "-protocol_whitelist", + "pipe,file", + "-f", + "concat", + "-safe", + "0", + "-i", + file_path, + "-vn", # No video + "-acodec", + "pcm_s16le", # 16-bit PCM encoding + "-ar", + str(sample_rate), + "-ac", + "1", # Mono audio + "-f", + "wav", + "-", + ] + + process = sp.run( + ffmpeg_cmd, + capture_output=True, + ) + + if process.returncode == 0: + logger.debug( + f"Successfully extracted audio for {camera_name} from {start_ts} to {end_ts}" + ) + return process.stdout + else: + logger.error(f"Failed to extract audio: {process.stderr.decode()}") + return None + except Exception as e: + logger.error(f"Error extracting audio from recordings: {e}") + return None + finally: + try: + os.unlink(file_path) + except OSError: + pass