audio maintainer modifications to support transcription

This commit is contained in:
Josh Hawkins 2025-05-26 07:20:22 -05:00
parent 2c29fb4e95
commit 126fd5b61c
2 changed files with 201 additions and 30 deletions

View File

@ -1,6 +1,7 @@
"""Handle creating audio events.""" """Handle creating audio events."""
import datetime import datetime
import json
import logging import logging
import random import random
import string import string
@ -18,7 +19,7 @@ from frigate.comms.event_metadata_updater import (
EventMetadataTypeEnum, EventMetadataTypeEnum,
) )
from frigate.comms.inter_process import InterProcessRequestor from frigate.comms.inter_process import InterProcessRequestor
from frigate.config import CameraConfig, CameraInput, FfmpegConfig from frigate.config import CameraConfig, CameraInput, FfmpegConfig, FrigateConfig
from frigate.config.camera.updater import ( from frigate.config.camera.updater import (
CameraConfigUpdateEnum, CameraConfigUpdateEnum,
CameraConfigUpdateSubscriber, CameraConfigUpdateSubscriber,
@ -30,9 +31,13 @@ from frigate.const import (
AUDIO_MIN_CONFIDENCE, AUDIO_MIN_CONFIDENCE,
AUDIO_SAMPLE_RATE, AUDIO_SAMPLE_RATE,
) )
from frigate.data_processing.real_time.audio_transcription import (
AudioTranscriptionRealTimeProcessor,
)
from frigate.ffmpeg_presets import parse_preset_input from frigate.ffmpeg_presets import parse_preset_input
from frigate.log import LogPipe from frigate.log import LogPipe
from frigate.object_detection.base import load_labels from frigate.object_detection.base import load_labels
from frigate.types import TrackedObjectUpdateTypesEnum
from frigate.util.builtin import get_ffmpeg_arg_list from frigate.util.builtin import get_ffmpeg_arg_list
from frigate.video import start_or_restart_ffmpeg, stop_ffmpeg from frigate.video import start_or_restart_ffmpeg, stop_ffmpeg
@ -75,6 +80,7 @@ class AudioProcessor(util.Process):
def __init__( def __init__(
self, self,
config: FrigateConfig,
cameras: list[CameraConfig], cameras: list[CameraConfig],
camera_metrics: dict[str, CameraMetrics], camera_metrics: dict[str, CameraMetrics],
): ):
@ -82,6 +88,7 @@ class AudioProcessor(util.Process):
self.camera_metrics = camera_metrics self.camera_metrics = camera_metrics
self.cameras = cameras self.cameras = cameras
self.config = config
def run(self) -> None: def run(self) -> None:
audio_threads: list[AudioEventMaintainer] = [] audio_threads: list[AudioEventMaintainer] = []
@ -94,6 +101,7 @@ class AudioProcessor(util.Process):
for camera in self.cameras: for camera in self.cameras:
audio_thread = AudioEventMaintainer( audio_thread = AudioEventMaintainer(
camera, camera,
self.config,
self.camera_metrics, self.camera_metrics,
self.stop_event, self.stop_event,
) )
@ -122,46 +130,62 @@ class AudioEventMaintainer(threading.Thread):
def __init__( def __init__(
self, self,
camera: CameraConfig, camera: CameraConfig,
config: FrigateConfig,
camera_metrics: dict[str, CameraMetrics], camera_metrics: dict[str, CameraMetrics],
stop_event: threading.Event, stop_event: threading.Event,
) -> None: ) -> None:
super().__init__(name=f"{camera.name}_audio_event_processor") super().__init__(name=f"{camera.name}_audio_event_processor")
self.config = camera self.config = config
self.camera_config = camera
self.camera_metrics = camera_metrics self.camera_metrics = camera_metrics
self.detections: dict[dict[str, Any]] = {} self.detections: dict[dict[str, Any]] = {}
self.stop_event = stop_event self.stop_event = stop_event
self.detector = AudioTfl(stop_event, self.config.audio.num_threads) self.detector = AudioTfl(stop_event, self.camera_config.audio.num_threads)
self.shape = (int(round(AUDIO_DURATION * AUDIO_SAMPLE_RATE)),) self.shape = (int(round(AUDIO_DURATION * AUDIO_SAMPLE_RATE)),)
self.chunk_size = int(round(AUDIO_DURATION * AUDIO_SAMPLE_RATE * 2)) self.chunk_size = int(round(AUDIO_DURATION * AUDIO_SAMPLE_RATE * 2))
self.logger = logging.getLogger(f"audio.{self.config.name}") self.logger = logging.getLogger(f"audio.{self.camera_config.name}")
self.ffmpeg_cmd = get_ffmpeg_command(self.config.ffmpeg) self.ffmpeg_cmd = get_ffmpeg_command(self.camera_config.ffmpeg)
self.logpipe = LogPipe(f"ffmpeg.{self.config.name}.audio") self.logpipe = LogPipe(f"ffmpeg.{self.camera_config.name}.audio")
self.audio_listener = None self.audio_listener = None
self.transcription_processor = None
# create communication for audio detections # create communication for audio detections
self.requestor = InterProcessRequestor() self.requestor = InterProcessRequestor()
self.config_subscriber = CameraConfigUpdateSubscriber( self.config_subscriber = CameraConfigUpdateSubscriber(
{self.config.name: self.config}, {self.camera_config.name: self.camera_config},
[CameraConfigUpdateEnum.audio, CameraConfigUpdateEnum.enabled], [
CameraConfigUpdateEnum.audio,
CameraConfigUpdateEnum.enabled,
CameraConfigUpdateEnum.audio_transcription,
],
) )
self.detection_publisher = DetectionPublisher(DetectionTypeEnum.audio) self.detection_publisher = DetectionPublisher(DetectionTypeEnum.audio)
self.event_metadata_publisher = EventMetadataPublisher() self.event_metadata_publisher = EventMetadataPublisher()
if self.camera_config.audio_transcription.enabled_in_config:
# init the transcription processor for this camera
self.transcription_processor = AudioTranscriptionRealTimeProcessor(
config=self.config,
camera_config=self.camera_config,
requestor=self.requestor,
metrics=self.camera_metrics[self.camera_config.name],
)
self.was_enabled = camera.enabled self.was_enabled = camera.enabled
def detect_audio(self, audio) -> None: def detect_audio(self, audio) -> None:
if not self.config.audio.enabled or self.stop_event.is_set(): if not self.camera_config.audio.enabled or self.stop_event.is_set():
return return
audio_as_float = audio.astype(np.float32) audio_as_float = audio.astype(np.float32)
rms, dBFS = self.calculate_audio_levels(audio_as_float) rms, dBFS = self.calculate_audio_levels(audio_as_float)
self.camera_metrics[self.config.name].audio_rms.value = rms self.camera_metrics[self.camera_config.name].audio_rms.value = rms
self.camera_metrics[self.config.name].audio_dBFS.value = dBFS self.camera_metrics[self.camera_config.name].audio_dBFS.value = dBFS
# only run audio detection when volume is above min_volume # only run audio detection when volume is above min_volume
if rms >= self.config.audio.min_volume: if rms >= self.camera_config.audio.min_volume:
# create waveform relative to max range and look for detections # create waveform relative to max range and look for detections
waveform = (audio / AUDIO_MAX_BIT_RANGE).astype(np.float32) waveform = (audio / AUDIO_MAX_BIT_RANGE).astype(np.float32)
model_detections = self.detector.detect(waveform) model_detections = self.detector.detect(waveform)
@ -169,28 +193,43 @@ class AudioEventMaintainer(threading.Thread):
for label, score, _ in model_detections: for label, score, _ in model_detections:
self.logger.debug( self.logger.debug(
f"{self.config.name} heard {label} with a score of {score}" f"{self.camera_config.name} heard {label} with a score of {score}"
) )
if label not in self.config.audio.listen: if label not in self.camera_config.audio.listen:
continue continue
if score > dict((self.config.audio.filters or {}).get(label, {})).get( if score > dict(
"threshold", 0.8 (self.camera_config.audio.filters or {}).get(label, {})
): ).get("threshold", 0.8):
self.handle_detection(label, score) self.handle_detection(label, score)
audio_detections.append(label) audio_detections.append(label)
# send audio detection data # send audio detection data
self.detection_publisher.publish( self.detection_publisher.publish(
( (
self.config.name, self.camera_config.name,
datetime.datetime.now().timestamp(), datetime.datetime.now().timestamp(),
dBFS, dBFS,
audio_detections, audio_detections,
) )
) )
# run audio transcription
if self.transcription_processor is not None and (
# rms >= self.camera_config.audio.min_volume or self.is_endpoint is False
self.camera_config.audio_transcription.live_enabled
):
self.transcribing = True
# process audio until we've reached the endpoint
self.transcription_processor.process_audio(
{
"id": f"{self.camera_config.name}_audio",
"camera": self.camera_config.name,
},
audio,
)
self.expire_detections() self.expire_detections()
def calculate_audio_levels(self, audio_as_float: np.float32) -> Tuple[float, float]: def calculate_audio_levels(self, audio_as_float: np.float32) -> Tuple[float, float]:
@ -204,8 +243,8 @@ class AudioEventMaintainer(threading.Thread):
else: else:
dBFS = 0 dBFS = 0
self.requestor.send_data(f"{self.config.name}/audio/dBFS", float(dBFS)) self.requestor.send_data(f"{self.camera_config.name}/audio/dBFS", float(dBFS))
self.requestor.send_data(f"{self.config.name}/audio/rms", float(rms)) self.requestor.send_data(f"{self.camera_config.name}/audio/rms", float(rms))
return float(rms), float(dBFS) return float(rms), float(dBFS)
@ -220,13 +259,13 @@ class AudioEventMaintainer(threading.Thread):
random.choices(string.ascii_lowercase + string.digits, k=6) random.choices(string.ascii_lowercase + string.digits, k=6)
) )
event_id = f"{now}-{rand_id}" event_id = f"{now}-{rand_id}"
self.requestor.send_data(f"{self.config.name}/audio/{label}", "ON") self.requestor.send_data(f"{self.camera_config.name}/audio/{label}", "ON")
self.event_metadata_publisher.publish( self.event_metadata_publisher.publish(
EventMetadataTypeEnum.manual_event_create, EventMetadataTypeEnum.manual_event_create,
( (
now, now,
self.config.name, self.camera_config.name,
label, label,
event_id, event_id,
True, True,
@ -252,10 +291,10 @@ class AudioEventMaintainer(threading.Thread):
if ( if (
now - detection.get("last_detection", now) now - detection.get("last_detection", now)
> self.config.audio.max_not_heard > self.camera_config.audio.max_not_heard
): ):
self.requestor.send_data( self.requestor.send_data(
f"{self.config.name}/audio/{detection['label']}", "OFF" f"{self.camera_config.name}/audio/{detection['label']}", "OFF"
) )
self.event_metadata_publisher.publish( self.event_metadata_publisher.publish(
@ -264,12 +303,28 @@ class AudioEventMaintainer(threading.Thread):
) )
self.detections[detection["label"]] = None self.detections[detection["label"]] = None
# clear real-time transcription
if self.transcription_processor is not None:
self.transcription_processor.reset(self.camera_config.name)
self.requestor.send_data(
"tracked_object_update",
json.dumps(
{
"type": TrackedObjectUpdateTypesEnum.transcription,
"text": "",
"camera": self.camera_config.name,
}
),
)
def expire_all_detections(self) -> None: def expire_all_detections(self) -> None:
"""Immediately end all current detections""" """Immediately end all current detections"""
now = datetime.datetime.now().timestamp() now = datetime.datetime.now().timestamp()
for label, detection in list(self.detections.items()): for label, detection in list(self.detections.items()):
if detection: if detection:
self.requestor.send_data(f"{self.config.name}/audio/{label}", "OFF") self.requestor.send_data(
f"{self.camera_config.name}/audio/{label}", "OFF"
)
self.event_metadata_publisher.publish( self.event_metadata_publisher.publish(
EventMetadataTypeEnum.manual_event_end, EventMetadataTypeEnum.manual_event_end,
(detection["id"], now), (detection["id"], now),
@ -290,7 +345,7 @@ class AudioEventMaintainer(threading.Thread):
if self.stop_event.is_set(): if self.stop_event.is_set():
return return
time.sleep(self.config.ffmpeg.retry_interval) time.sleep(self.camera_config.ffmpeg.retry_interval)
self.logpipe.dump() self.logpipe.dump()
self.start_or_restart_ffmpeg() self.start_or_restart_ffmpeg()
@ -312,20 +367,20 @@ class AudioEventMaintainer(threading.Thread):
log_and_restart() log_and_restart()
def run(self) -> None: def run(self) -> None:
if self.config.enabled: if self.camera_config.enabled:
self.start_or_restart_ffmpeg() self.start_or_restart_ffmpeg()
while not self.stop_event.is_set(): while not self.stop_event.is_set():
enabled = self.config.enabled enabled = self.camera_config.enabled
if enabled != self.was_enabled: if enabled != self.was_enabled:
if enabled: if enabled:
self.logger.debug( self.logger.debug(
f"Enabling audio detections for {self.config.name}" f"Enabling audio detections for {self.camera_config.name}"
) )
self.start_or_restart_ffmpeg() self.start_or_restart_ffmpeg()
else: else:
self.logger.debug( self.logger.debug(
f"Disabling audio detections for {self.config.name}, ending events" f"Disabling audio detections for {self.camera_config.name}, ending events"
) )
self.expire_all_detections() self.expire_all_detections()
stop_ffmpeg(self.audio_listener, self.logger) stop_ffmpeg(self.audio_listener, self.logger)

116
frigate/util/audio.py Normal file
View File

@ -0,0 +1,116 @@
"""Utilities for creating and manipulating audio."""
import logging
import os
import subprocess as sp
from typing import Optional
from pathvalidate import sanitize_filename
from frigate.const import CACHE_DIR
from frigate.models import Recordings
logger = logging.getLogger(__name__)
def get_audio_from_recording(
    ffmpeg,
    camera_name: str,
    start_ts: float,
    end_ts: float,
    sample_rate: int = 16000,
) -> Optional[bytes]:
    """Extract audio from recording files between start_ts and end_ts in WAV format suitable for sherpa-onnx.

    Args:
        ffmpeg: FFmpeg configuration object (must provide ``ffmpeg_path``)
        camera_name: Name of the camera
        start_ts: Start timestamp in seconds
        end_ts: End timestamp in seconds
        sample_rate: Sample rate for output audio (default 16kHz for sherpa-onnx)

    Returns:
        Bytes of WAV audio data or None if extraction failed
    """
    # Fetch all recording segments overlapping [start_ts, end_ts]: segments
    # that start inside the window, end inside it, or fully contain it.
    recordings = (
        Recordings.select(
            Recordings.path,
            Recordings.start_time,
            Recordings.end_time,
        )
        .where(
            (Recordings.start_time.between(start_ts, end_ts))
            | (Recordings.end_time.between(start_ts, end_ts))
            | ((start_ts > Recordings.start_time) & (end_ts < Recordings.end_time))
        )
        .where(Recordings.camera == camera_name)
        .order_by(Recordings.start_time.asc())
    )

    if not recordings:
        logger.debug(
            f"No recordings found for {camera_name} between {start_ts} and {end_ts}"
        )
        return None

    # Generate a concat-demuxer playlist; inpoint/outpoint trim the first and
    # last segments so output starts/ends exactly at the requested window.
    file_name = sanitize_filename(
        f"audio_playlist_{camera_name}_{start_ts}-{end_ts}.txt"
    )
    file_path = os.path.join(CACHE_DIR, file_name)

    try:
        with open(file_path, "w") as file:
            for clip in recordings:
                file.write(f"file '{clip.path}'\n")
                if clip.start_time < start_ts:
                    # write fractional seconds — int() truncation could shift
                    # the extracted window by up to a second at each edge
                    file.write(f"inpoint {start_ts - clip.start_time:.3f}\n")
                if clip.end_time > end_ts:
                    # outpoint is relative to the clip's own start time
                    file.write(f"outpoint {end_ts - clip.start_time:.3f}\n")

        ffmpeg_cmd = [
            ffmpeg.ffmpeg_path,
            "-hide_banner",
            "-loglevel",
            "warning",
            "-protocol_whitelist",
            "pipe,file",
            "-f",
            "concat",
            "-safe",
            "0",
            "-i",
            file_path,
            "-vn",  # No video
            "-acodec",
            "pcm_s16le",  # 16-bit PCM encoding
            "-ar",
            str(sample_rate),
            "-ac",
            "1",  # Mono audio
            "-f",
            "wav",
            "-",  # write WAV to stdout so no temp output file is needed
        ]

        process = sp.run(
            ffmpeg_cmd,
            capture_output=True,
        )

        if process.returncode == 0:
            logger.debug(
                f"Successfully extracted audio for {camera_name} from {start_ts} to {end_ts}"
            )
            return process.stdout
        else:
            logger.error(f"Failed to extract audio: {process.stderr.decode()}")
            return None
    except Exception as e:
        # best-effort helper: log and return None rather than propagate
        logger.error(f"Error extracting audio from recordings: {e}")
        return None
    finally:
        # always clean up the playlist file, even on failure
        try:
            os.unlink(file_path)
        except OSError:
            pass