Mirror of https://github.com/blakeblackshear/frigate.git, synced 2026-05-01 19:17:41 +03:00.
Commit: audio maintainer modifications to support transcription.
This commit is contained in:
parent
2c29fb4e95
commit
126fd5b61c
@ -1,6 +1,7 @@
|
||||
"""Handle creating audio events."""
|
||||
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import random
|
||||
import string
|
||||
@ -18,7 +19,7 @@ from frigate.comms.event_metadata_updater import (
|
||||
EventMetadataTypeEnum,
|
||||
)
|
||||
from frigate.comms.inter_process import InterProcessRequestor
|
||||
from frigate.config import CameraConfig, CameraInput, FfmpegConfig
|
||||
from frigate.config import CameraConfig, CameraInput, FfmpegConfig, FrigateConfig
|
||||
from frigate.config.camera.updater import (
|
||||
CameraConfigUpdateEnum,
|
||||
CameraConfigUpdateSubscriber,
|
||||
@ -30,9 +31,13 @@ from frigate.const import (
|
||||
AUDIO_MIN_CONFIDENCE,
|
||||
AUDIO_SAMPLE_RATE,
|
||||
)
|
||||
from frigate.data_processing.real_time.audio_transcription import (
|
||||
AudioTranscriptionRealTimeProcessor,
|
||||
)
|
||||
from frigate.ffmpeg_presets import parse_preset_input
|
||||
from frigate.log import LogPipe
|
||||
from frigate.object_detection.base import load_labels
|
||||
from frigate.types import TrackedObjectUpdateTypesEnum
|
||||
from frigate.util.builtin import get_ffmpeg_arg_list
|
||||
from frigate.video import start_or_restart_ffmpeg, stop_ffmpeg
|
||||
|
||||
@ -75,6 +80,7 @@ class AudioProcessor(util.Process):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: FrigateConfig,
|
||||
cameras: list[CameraConfig],
|
||||
camera_metrics: dict[str, CameraMetrics],
|
||||
):
|
||||
@ -82,6 +88,7 @@ class AudioProcessor(util.Process):
|
||||
|
||||
self.camera_metrics = camera_metrics
|
||||
self.cameras = cameras
|
||||
self.config = config
|
||||
|
||||
def run(self) -> None:
|
||||
audio_threads: list[AudioEventMaintainer] = []
|
||||
@ -94,6 +101,7 @@ class AudioProcessor(util.Process):
|
||||
for camera in self.cameras:
|
||||
audio_thread = AudioEventMaintainer(
|
||||
camera,
|
||||
self.config,
|
||||
self.camera_metrics,
|
||||
self.stop_event,
|
||||
)
|
||||
@ -122,46 +130,62 @@ class AudioEventMaintainer(threading.Thread):
|
||||
def __init__(
|
||||
self,
|
||||
camera: CameraConfig,
|
||||
config: FrigateConfig,
|
||||
camera_metrics: dict[str, CameraMetrics],
|
||||
stop_event: threading.Event,
|
||||
) -> None:
|
||||
super().__init__(name=f"{camera.name}_audio_event_processor")
|
||||
|
||||
self.config = camera
|
||||
self.config = config
|
||||
self.camera_config = camera
|
||||
self.camera_metrics = camera_metrics
|
||||
self.detections: dict[dict[str, Any]] = {}
|
||||
self.stop_event = stop_event
|
||||
self.detector = AudioTfl(stop_event, self.config.audio.num_threads)
|
||||
self.detector = AudioTfl(stop_event, self.camera_config.audio.num_threads)
|
||||
self.shape = (int(round(AUDIO_DURATION * AUDIO_SAMPLE_RATE)),)
|
||||
self.chunk_size = int(round(AUDIO_DURATION * AUDIO_SAMPLE_RATE * 2))
|
||||
self.logger = logging.getLogger(f"audio.{self.config.name}")
|
||||
self.ffmpeg_cmd = get_ffmpeg_command(self.config.ffmpeg)
|
||||
self.logpipe = LogPipe(f"ffmpeg.{self.config.name}.audio")
|
||||
self.logger = logging.getLogger(f"audio.{self.camera_config.name}")
|
||||
self.ffmpeg_cmd = get_ffmpeg_command(self.camera_config.ffmpeg)
|
||||
self.logpipe = LogPipe(f"ffmpeg.{self.camera_config.name}.audio")
|
||||
self.audio_listener = None
|
||||
self.transcription_processor = None
|
||||
|
||||
# create communication for audio detections
|
||||
self.requestor = InterProcessRequestor()
|
||||
self.config_subscriber = CameraConfigUpdateSubscriber(
|
||||
{self.config.name: self.config},
|
||||
[CameraConfigUpdateEnum.audio, CameraConfigUpdateEnum.enabled],
|
||||
{self.camera_config.name: self.camera_config},
|
||||
[
|
||||
CameraConfigUpdateEnum.audio,
|
||||
CameraConfigUpdateEnum.enabled,
|
||||
CameraConfigUpdateEnum.audio_transcription,
|
||||
],
|
||||
)
|
||||
self.detection_publisher = DetectionPublisher(DetectionTypeEnum.audio)
|
||||
self.event_metadata_publisher = EventMetadataPublisher()
|
||||
|
||||
if self.camera_config.audio_transcription.enabled_in_config:
|
||||
# init the transcription processor for this camera
|
||||
self.transcription_processor = AudioTranscriptionRealTimeProcessor(
|
||||
config=self.config,
|
||||
camera_config=self.camera_config,
|
||||
requestor=self.requestor,
|
||||
metrics=self.camera_metrics[self.camera_config.name],
|
||||
)
|
||||
|
||||
self.was_enabled = camera.enabled
|
||||
|
||||
def detect_audio(self, audio) -> None:
|
||||
if not self.config.audio.enabled or self.stop_event.is_set():
|
||||
if not self.camera_config.audio.enabled or self.stop_event.is_set():
|
||||
return
|
||||
|
||||
audio_as_float = audio.astype(np.float32)
|
||||
rms, dBFS = self.calculate_audio_levels(audio_as_float)
|
||||
|
||||
self.camera_metrics[self.config.name].audio_rms.value = rms
|
||||
self.camera_metrics[self.config.name].audio_dBFS.value = dBFS
|
||||
self.camera_metrics[self.camera_config.name].audio_rms.value = rms
|
||||
self.camera_metrics[self.camera_config.name].audio_dBFS.value = dBFS
|
||||
|
||||
# only run audio detection when volume is above min_volume
|
||||
if rms >= self.config.audio.min_volume:
|
||||
if rms >= self.camera_config.audio.min_volume:
|
||||
# create waveform relative to max range and look for detections
|
||||
waveform = (audio / AUDIO_MAX_BIT_RANGE).astype(np.float32)
|
||||
model_detections = self.detector.detect(waveform)
|
||||
@ -169,28 +193,43 @@ class AudioEventMaintainer(threading.Thread):
|
||||
|
||||
for label, score, _ in model_detections:
|
||||
self.logger.debug(
|
||||
f"{self.config.name} heard {label} with a score of {score}"
|
||||
f"{self.camera_config.name} heard {label} with a score of {score}"
|
||||
)
|
||||
|
||||
if label not in self.config.audio.listen:
|
||||
if label not in self.camera_config.audio.listen:
|
||||
continue
|
||||
|
||||
if score > dict((self.config.audio.filters or {}).get(label, {})).get(
|
||||
"threshold", 0.8
|
||||
):
|
||||
if score > dict(
|
||||
(self.camera_config.audio.filters or {}).get(label, {})
|
||||
).get("threshold", 0.8):
|
||||
self.handle_detection(label, score)
|
||||
audio_detections.append(label)
|
||||
|
||||
# send audio detection data
|
||||
self.detection_publisher.publish(
|
||||
(
|
||||
self.config.name,
|
||||
self.camera_config.name,
|
||||
datetime.datetime.now().timestamp(),
|
||||
dBFS,
|
||||
audio_detections,
|
||||
)
|
||||
)
|
||||
|
||||
# run audio transcription
|
||||
if self.transcription_processor is not None and (
|
||||
# rms >= self.camera_config.audio.min_volume or self.is_endpoint is False
|
||||
self.camera_config.audio_transcription.live_enabled
|
||||
):
|
||||
self.transcribing = True
|
||||
# process audio until we've reached the endpoint
|
||||
self.transcription_processor.process_audio(
|
||||
{
|
||||
"id": f"{self.camera_config.name}_audio",
|
||||
"camera": self.camera_config.name,
|
||||
},
|
||||
audio,
|
||||
)
|
||||
|
||||
self.expire_detections()
|
||||
|
||||
def calculate_audio_levels(self, audio_as_float: np.float32) -> Tuple[float, float]:
|
||||
@ -204,8 +243,8 @@ class AudioEventMaintainer(threading.Thread):
|
||||
else:
|
||||
dBFS = 0
|
||||
|
||||
self.requestor.send_data(f"{self.config.name}/audio/dBFS", float(dBFS))
|
||||
self.requestor.send_data(f"{self.config.name}/audio/rms", float(rms))
|
||||
self.requestor.send_data(f"{self.camera_config.name}/audio/dBFS", float(dBFS))
|
||||
self.requestor.send_data(f"{self.camera_config.name}/audio/rms", float(rms))
|
||||
|
||||
return float(rms), float(dBFS)
|
||||
|
||||
@ -220,13 +259,13 @@ class AudioEventMaintainer(threading.Thread):
|
||||
random.choices(string.ascii_lowercase + string.digits, k=6)
|
||||
)
|
||||
event_id = f"{now}-{rand_id}"
|
||||
self.requestor.send_data(f"{self.config.name}/audio/{label}", "ON")
|
||||
self.requestor.send_data(f"{self.camera_config.name}/audio/{label}", "ON")
|
||||
|
||||
self.event_metadata_publisher.publish(
|
||||
EventMetadataTypeEnum.manual_event_create,
|
||||
(
|
||||
now,
|
||||
self.config.name,
|
||||
self.camera_config.name,
|
||||
label,
|
||||
event_id,
|
||||
True,
|
||||
@ -252,10 +291,10 @@ class AudioEventMaintainer(threading.Thread):
|
||||
|
||||
if (
|
||||
now - detection.get("last_detection", now)
|
||||
> self.config.audio.max_not_heard
|
||||
> self.camera_config.audio.max_not_heard
|
||||
):
|
||||
self.requestor.send_data(
|
||||
f"{self.config.name}/audio/{detection['label']}", "OFF"
|
||||
f"{self.camera_config.name}/audio/{detection['label']}", "OFF"
|
||||
)
|
||||
|
||||
self.event_metadata_publisher.publish(
|
||||
@ -264,12 +303,28 @@ class AudioEventMaintainer(threading.Thread):
|
||||
)
|
||||
self.detections[detection["label"]] = None
|
||||
|
||||
# clear real-time transcription
|
||||
if self.transcription_processor is not None:
|
||||
self.transcription_processor.reset(self.camera_config.name)
|
||||
self.requestor.send_data(
|
||||
"tracked_object_update",
|
||||
json.dumps(
|
||||
{
|
||||
"type": TrackedObjectUpdateTypesEnum.transcription,
|
||||
"text": "",
|
||||
"camera": self.camera_config.name,
|
||||
}
|
||||
),
|
||||
)
|
||||
|
||||
def expire_all_detections(self) -> None:
|
||||
"""Immediately end all current detections"""
|
||||
now = datetime.datetime.now().timestamp()
|
||||
for label, detection in list(self.detections.items()):
|
||||
if detection:
|
||||
self.requestor.send_data(f"{self.config.name}/audio/{label}", "OFF")
|
||||
self.requestor.send_data(
|
||||
f"{self.camera_config.name}/audio/{label}", "OFF"
|
||||
)
|
||||
self.event_metadata_publisher.publish(
|
||||
EventMetadataTypeEnum.manual_event_end,
|
||||
(detection["id"], now),
|
||||
@ -290,7 +345,7 @@ class AudioEventMaintainer(threading.Thread):
|
||||
if self.stop_event.is_set():
|
||||
return
|
||||
|
||||
time.sleep(self.config.ffmpeg.retry_interval)
|
||||
time.sleep(self.camera_config.ffmpeg.retry_interval)
|
||||
self.logpipe.dump()
|
||||
self.start_or_restart_ffmpeg()
|
||||
|
||||
@ -312,20 +367,20 @@ class AudioEventMaintainer(threading.Thread):
|
||||
log_and_restart()
|
||||
|
||||
def run(self) -> None:
|
||||
if self.config.enabled:
|
||||
if self.camera_config.enabled:
|
||||
self.start_or_restart_ffmpeg()
|
||||
|
||||
while not self.stop_event.is_set():
|
||||
enabled = self.config.enabled
|
||||
enabled = self.camera_config.enabled
|
||||
if enabled != self.was_enabled:
|
||||
if enabled:
|
||||
self.logger.debug(
|
||||
f"Enabling audio detections for {self.config.name}"
|
||||
f"Enabling audio detections for {self.camera_config.name}"
|
||||
)
|
||||
self.start_or_restart_ffmpeg()
|
||||
else:
|
||||
self.logger.debug(
|
||||
f"Disabling audio detections for {self.config.name}, ending events"
|
||||
f"Disabling audio detections for {self.camera_config.name}, ending events"
|
||||
)
|
||||
self.expire_all_detections()
|
||||
stop_ffmpeg(self.audio_listener, self.logger)
|
||||
|
||||
116
frigate/util/audio.py
Normal file
116
frigate/util/audio.py
Normal file
@ -0,0 +1,116 @@
|
||||
"""Utilities for creating and manipulating audio."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import subprocess as sp
|
||||
from typing import Optional
|
||||
|
||||
from pathvalidate import sanitize_filename
|
||||
|
||||
from frigate.const import CACHE_DIR
|
||||
from frigate.models import Recordings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_audio_from_recording(
    ffmpeg,
    camera_name: str,
    start_ts: float,
    end_ts: float,
    sample_rate: int = 16000,
) -> Optional[bytes]:
    """Pull mono WAV audio for a camera out of its stored recordings.

    Builds an ffmpeg concat playlist from every recording segment that
    overlaps [start_ts, end_ts], then transcodes that window to 16-bit
    PCM WAV written to stdout.

    Args:
        ffmpeg: FFmpeg configuration object (supplies ``ffmpeg_path``).
        camera_name: Name of the camera whose recordings are queried.
        start_ts: Window start timestamp in seconds.
        end_ts: Window end timestamp in seconds.
        sample_rate: Output sample rate (default 16kHz for sherpa-onnx).

    Returns:
        WAV file contents as bytes, or None when no recordings overlap
        the window or extraction fails.
    """
    # Any segment that starts in, ends in, or fully spans the window.
    overlaps_window = (
        (Recordings.start_time.between(start_ts, end_ts))
        | (Recordings.end_time.between(start_ts, end_ts))
        | ((start_ts > Recordings.start_time) & (end_ts < Recordings.end_time))
    )
    segments = (
        Recordings.select(
            Recordings.path,
            Recordings.start_time,
            Recordings.end_time,
        )
        .where(overlaps_window)
        .where(Recordings.camera == camera_name)
        .order_by(Recordings.start_time.asc())
    )

    if not segments:
        logger.debug(
            f"No recordings found for {camera_name} between {start_ts} and {end_ts}"
        )
        return None

    # Write a concat-demuxer playlist; inpoint/outpoint trim the first
    # and last segments down to the requested window.
    playlist_path = os.path.join(
        CACHE_DIR,
        sanitize_filename(f"audio_playlist_{camera_name}_{start_ts}-{end_ts}.txt"),
    )
    try:
        with open(playlist_path, "w") as playlist:
            for segment in segments:
                playlist.write(f"file '{segment.path}'\n")
                if segment.start_time < start_ts:
                    playlist.write(f"inpoint {int(start_ts - segment.start_time)}\n")
                if segment.end_time > end_ts:
                    playlist.write(f"outpoint {int(end_ts - segment.start_time)}\n")

        proc = sp.run(
            [
                ffmpeg.ffmpeg_path,
                "-hide_banner",
                "-loglevel",
                "warning",
                "-protocol_whitelist",
                "pipe,file",
                "-f",
                "concat",
                "-safe",
                "0",
                "-i",
                playlist_path,
                "-vn",  # No video
                "-acodec",
                "pcm_s16le",  # 16-bit PCM encoding
                "-ar",
                str(sample_rate),
                "-ac",
                "1",  # Mono audio
                "-f",
                "wav",
                "-",
            ],
            capture_output=True,
        )

        if proc.returncode != 0:
            logger.error(f"Failed to extract audio: {proc.stderr.decode()}")
            return None

        logger.debug(
            f"Successfully extracted audio for {camera_name} from {start_ts} to {end_ts}"
        )
        return proc.stdout
    except Exception as e:
        logger.error(f"Error extracting audio from recordings: {e}")
        return None
    finally:
        # Best-effort cleanup of the temporary playlist file.
        try:
            os.unlink(playlist_path)
        except OSError:
            pass
|
||||
Loading…
Reference in New Issue
Block a user