mirror of
https://github.com/blakeblackshear/frigate.git
synced 2026-05-02 03:27:41 +03:00
audio maintainer modifications to support transcription
This commit is contained in:
parent
2c29fb4e95
commit
126fd5b61c
@ -1,6 +1,7 @@
|
|||||||
"""Handle creating audio events."""
|
"""Handle creating audio events."""
|
||||||
|
|
||||||
import datetime
|
import datetime
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import random
|
import random
|
||||||
import string
|
import string
|
||||||
@ -18,7 +19,7 @@ from frigate.comms.event_metadata_updater import (
|
|||||||
EventMetadataTypeEnum,
|
EventMetadataTypeEnum,
|
||||||
)
|
)
|
||||||
from frigate.comms.inter_process import InterProcessRequestor
|
from frigate.comms.inter_process import InterProcessRequestor
|
||||||
from frigate.config import CameraConfig, CameraInput, FfmpegConfig
|
from frigate.config import CameraConfig, CameraInput, FfmpegConfig, FrigateConfig
|
||||||
from frigate.config.camera.updater import (
|
from frigate.config.camera.updater import (
|
||||||
CameraConfigUpdateEnum,
|
CameraConfigUpdateEnum,
|
||||||
CameraConfigUpdateSubscriber,
|
CameraConfigUpdateSubscriber,
|
||||||
@ -30,9 +31,13 @@ from frigate.const import (
|
|||||||
AUDIO_MIN_CONFIDENCE,
|
AUDIO_MIN_CONFIDENCE,
|
||||||
AUDIO_SAMPLE_RATE,
|
AUDIO_SAMPLE_RATE,
|
||||||
)
|
)
|
||||||
|
from frigate.data_processing.real_time.audio_transcription import (
|
||||||
|
AudioTranscriptionRealTimeProcessor,
|
||||||
|
)
|
||||||
from frigate.ffmpeg_presets import parse_preset_input
|
from frigate.ffmpeg_presets import parse_preset_input
|
||||||
from frigate.log import LogPipe
|
from frigate.log import LogPipe
|
||||||
from frigate.object_detection.base import load_labels
|
from frigate.object_detection.base import load_labels
|
||||||
|
from frigate.types import TrackedObjectUpdateTypesEnum
|
||||||
from frigate.util.builtin import get_ffmpeg_arg_list
|
from frigate.util.builtin import get_ffmpeg_arg_list
|
||||||
from frigate.video import start_or_restart_ffmpeg, stop_ffmpeg
|
from frigate.video import start_or_restart_ffmpeg, stop_ffmpeg
|
||||||
|
|
||||||
@ -75,6 +80,7 @@ class AudioProcessor(util.Process):
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
config: FrigateConfig,
|
||||||
cameras: list[CameraConfig],
|
cameras: list[CameraConfig],
|
||||||
camera_metrics: dict[str, CameraMetrics],
|
camera_metrics: dict[str, CameraMetrics],
|
||||||
):
|
):
|
||||||
@ -82,6 +88,7 @@ class AudioProcessor(util.Process):
|
|||||||
|
|
||||||
self.camera_metrics = camera_metrics
|
self.camera_metrics = camera_metrics
|
||||||
self.cameras = cameras
|
self.cameras = cameras
|
||||||
|
self.config = config
|
||||||
|
|
||||||
def run(self) -> None:
|
def run(self) -> None:
|
||||||
audio_threads: list[AudioEventMaintainer] = []
|
audio_threads: list[AudioEventMaintainer] = []
|
||||||
@ -94,6 +101,7 @@ class AudioProcessor(util.Process):
|
|||||||
for camera in self.cameras:
|
for camera in self.cameras:
|
||||||
audio_thread = AudioEventMaintainer(
|
audio_thread = AudioEventMaintainer(
|
||||||
camera,
|
camera,
|
||||||
|
self.config,
|
||||||
self.camera_metrics,
|
self.camera_metrics,
|
||||||
self.stop_event,
|
self.stop_event,
|
||||||
)
|
)
|
||||||
@ -122,46 +130,62 @@ class AudioEventMaintainer(threading.Thread):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
camera: CameraConfig,
|
camera: CameraConfig,
|
||||||
|
config: FrigateConfig,
|
||||||
camera_metrics: dict[str, CameraMetrics],
|
camera_metrics: dict[str, CameraMetrics],
|
||||||
stop_event: threading.Event,
|
stop_event: threading.Event,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__(name=f"{camera.name}_audio_event_processor")
|
super().__init__(name=f"{camera.name}_audio_event_processor")
|
||||||
|
|
||||||
self.config = camera
|
self.config = config
|
||||||
|
self.camera_config = camera
|
||||||
self.camera_metrics = camera_metrics
|
self.camera_metrics = camera_metrics
|
||||||
self.detections: dict[dict[str, Any]] = {}
|
self.detections: dict[dict[str, Any]] = {}
|
||||||
self.stop_event = stop_event
|
self.stop_event = stop_event
|
||||||
self.detector = AudioTfl(stop_event, self.config.audio.num_threads)
|
self.detector = AudioTfl(stop_event, self.camera_config.audio.num_threads)
|
||||||
self.shape = (int(round(AUDIO_DURATION * AUDIO_SAMPLE_RATE)),)
|
self.shape = (int(round(AUDIO_DURATION * AUDIO_SAMPLE_RATE)),)
|
||||||
self.chunk_size = int(round(AUDIO_DURATION * AUDIO_SAMPLE_RATE * 2))
|
self.chunk_size = int(round(AUDIO_DURATION * AUDIO_SAMPLE_RATE * 2))
|
||||||
self.logger = logging.getLogger(f"audio.{self.config.name}")
|
self.logger = logging.getLogger(f"audio.{self.camera_config.name}")
|
||||||
self.ffmpeg_cmd = get_ffmpeg_command(self.config.ffmpeg)
|
self.ffmpeg_cmd = get_ffmpeg_command(self.camera_config.ffmpeg)
|
||||||
self.logpipe = LogPipe(f"ffmpeg.{self.config.name}.audio")
|
self.logpipe = LogPipe(f"ffmpeg.{self.camera_config.name}.audio")
|
||||||
self.audio_listener = None
|
self.audio_listener = None
|
||||||
|
self.transcription_processor = None
|
||||||
|
|
||||||
# create communication for audio detections
|
# create communication for audio detections
|
||||||
self.requestor = InterProcessRequestor()
|
self.requestor = InterProcessRequestor()
|
||||||
self.config_subscriber = CameraConfigUpdateSubscriber(
|
self.config_subscriber = CameraConfigUpdateSubscriber(
|
||||||
{self.config.name: self.config},
|
{self.camera_config.name: self.camera_config},
|
||||||
[CameraConfigUpdateEnum.audio, CameraConfigUpdateEnum.enabled],
|
[
|
||||||
|
CameraConfigUpdateEnum.audio,
|
||||||
|
CameraConfigUpdateEnum.enabled,
|
||||||
|
CameraConfigUpdateEnum.audio_transcription,
|
||||||
|
],
|
||||||
)
|
)
|
||||||
self.detection_publisher = DetectionPublisher(DetectionTypeEnum.audio)
|
self.detection_publisher = DetectionPublisher(DetectionTypeEnum.audio)
|
||||||
self.event_metadata_publisher = EventMetadataPublisher()
|
self.event_metadata_publisher = EventMetadataPublisher()
|
||||||
|
|
||||||
|
if self.camera_config.audio_transcription.enabled_in_config:
|
||||||
|
# init the transcription processor for this camera
|
||||||
|
self.transcription_processor = AudioTranscriptionRealTimeProcessor(
|
||||||
|
config=self.config,
|
||||||
|
camera_config=self.camera_config,
|
||||||
|
requestor=self.requestor,
|
||||||
|
metrics=self.camera_metrics[self.camera_config.name],
|
||||||
|
)
|
||||||
|
|
||||||
self.was_enabled = camera.enabled
|
self.was_enabled = camera.enabled
|
||||||
|
|
||||||
def detect_audio(self, audio) -> None:
|
def detect_audio(self, audio) -> None:
|
||||||
if not self.config.audio.enabled or self.stop_event.is_set():
|
if not self.camera_config.audio.enabled or self.stop_event.is_set():
|
||||||
return
|
return
|
||||||
|
|
||||||
audio_as_float = audio.astype(np.float32)
|
audio_as_float = audio.astype(np.float32)
|
||||||
rms, dBFS = self.calculate_audio_levels(audio_as_float)
|
rms, dBFS = self.calculate_audio_levels(audio_as_float)
|
||||||
|
|
||||||
self.camera_metrics[self.config.name].audio_rms.value = rms
|
self.camera_metrics[self.camera_config.name].audio_rms.value = rms
|
||||||
self.camera_metrics[self.config.name].audio_dBFS.value = dBFS
|
self.camera_metrics[self.camera_config.name].audio_dBFS.value = dBFS
|
||||||
|
|
||||||
# only run audio detection when volume is above min_volume
|
# only run audio detection when volume is above min_volume
|
||||||
if rms >= self.config.audio.min_volume:
|
if rms >= self.camera_config.audio.min_volume:
|
||||||
# create waveform relative to max range and look for detections
|
# create waveform relative to max range and look for detections
|
||||||
waveform = (audio / AUDIO_MAX_BIT_RANGE).astype(np.float32)
|
waveform = (audio / AUDIO_MAX_BIT_RANGE).astype(np.float32)
|
||||||
model_detections = self.detector.detect(waveform)
|
model_detections = self.detector.detect(waveform)
|
||||||
@ -169,28 +193,43 @@ class AudioEventMaintainer(threading.Thread):
|
|||||||
|
|
||||||
for label, score, _ in model_detections:
|
for label, score, _ in model_detections:
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
f"{self.config.name} heard {label} with a score of {score}"
|
f"{self.camera_config.name} heard {label} with a score of {score}"
|
||||||
)
|
)
|
||||||
|
|
||||||
if label not in self.config.audio.listen:
|
if label not in self.camera_config.audio.listen:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if score > dict((self.config.audio.filters or {}).get(label, {})).get(
|
if score > dict(
|
||||||
"threshold", 0.8
|
(self.camera_config.audio.filters or {}).get(label, {})
|
||||||
):
|
).get("threshold", 0.8):
|
||||||
self.handle_detection(label, score)
|
self.handle_detection(label, score)
|
||||||
audio_detections.append(label)
|
audio_detections.append(label)
|
||||||
|
|
||||||
# send audio detection data
|
# send audio detection data
|
||||||
self.detection_publisher.publish(
|
self.detection_publisher.publish(
|
||||||
(
|
(
|
||||||
self.config.name,
|
self.camera_config.name,
|
||||||
datetime.datetime.now().timestamp(),
|
datetime.datetime.now().timestamp(),
|
||||||
dBFS,
|
dBFS,
|
||||||
audio_detections,
|
audio_detections,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# run audio transcription
|
||||||
|
if self.transcription_processor is not None and (
|
||||||
|
# rms >= self.camera_config.audio.min_volume or self.is_endpoint is False
|
||||||
|
self.camera_config.audio_transcription.live_enabled
|
||||||
|
):
|
||||||
|
self.transcribing = True
|
||||||
|
# process audio until we've reached the endpoint
|
||||||
|
self.transcription_processor.process_audio(
|
||||||
|
{
|
||||||
|
"id": f"{self.camera_config.name}_audio",
|
||||||
|
"camera": self.camera_config.name,
|
||||||
|
},
|
||||||
|
audio,
|
||||||
|
)
|
||||||
|
|
||||||
self.expire_detections()
|
self.expire_detections()
|
||||||
|
|
||||||
def calculate_audio_levels(self, audio_as_float: np.float32) -> Tuple[float, float]:
|
def calculate_audio_levels(self, audio_as_float: np.float32) -> Tuple[float, float]:
|
||||||
@ -204,8 +243,8 @@ class AudioEventMaintainer(threading.Thread):
|
|||||||
else:
|
else:
|
||||||
dBFS = 0
|
dBFS = 0
|
||||||
|
|
||||||
self.requestor.send_data(f"{self.config.name}/audio/dBFS", float(dBFS))
|
self.requestor.send_data(f"{self.camera_config.name}/audio/dBFS", float(dBFS))
|
||||||
self.requestor.send_data(f"{self.config.name}/audio/rms", float(rms))
|
self.requestor.send_data(f"{self.camera_config.name}/audio/rms", float(rms))
|
||||||
|
|
||||||
return float(rms), float(dBFS)
|
return float(rms), float(dBFS)
|
||||||
|
|
||||||
@ -220,13 +259,13 @@ class AudioEventMaintainer(threading.Thread):
|
|||||||
random.choices(string.ascii_lowercase + string.digits, k=6)
|
random.choices(string.ascii_lowercase + string.digits, k=6)
|
||||||
)
|
)
|
||||||
event_id = f"{now}-{rand_id}"
|
event_id = f"{now}-{rand_id}"
|
||||||
self.requestor.send_data(f"{self.config.name}/audio/{label}", "ON")
|
self.requestor.send_data(f"{self.camera_config.name}/audio/{label}", "ON")
|
||||||
|
|
||||||
self.event_metadata_publisher.publish(
|
self.event_metadata_publisher.publish(
|
||||||
EventMetadataTypeEnum.manual_event_create,
|
EventMetadataTypeEnum.manual_event_create,
|
||||||
(
|
(
|
||||||
now,
|
now,
|
||||||
self.config.name,
|
self.camera_config.name,
|
||||||
label,
|
label,
|
||||||
event_id,
|
event_id,
|
||||||
True,
|
True,
|
||||||
@ -252,10 +291,10 @@ class AudioEventMaintainer(threading.Thread):
|
|||||||
|
|
||||||
if (
|
if (
|
||||||
now - detection.get("last_detection", now)
|
now - detection.get("last_detection", now)
|
||||||
> self.config.audio.max_not_heard
|
> self.camera_config.audio.max_not_heard
|
||||||
):
|
):
|
||||||
self.requestor.send_data(
|
self.requestor.send_data(
|
||||||
f"{self.config.name}/audio/{detection['label']}", "OFF"
|
f"{self.camera_config.name}/audio/{detection['label']}", "OFF"
|
||||||
)
|
)
|
||||||
|
|
||||||
self.event_metadata_publisher.publish(
|
self.event_metadata_publisher.publish(
|
||||||
@ -264,12 +303,28 @@ class AudioEventMaintainer(threading.Thread):
|
|||||||
)
|
)
|
||||||
self.detections[detection["label"]] = None
|
self.detections[detection["label"]] = None
|
||||||
|
|
||||||
|
# clear real-time transcription
|
||||||
|
if self.transcription_processor is not None:
|
||||||
|
self.transcription_processor.reset(self.camera_config.name)
|
||||||
|
self.requestor.send_data(
|
||||||
|
"tracked_object_update",
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"type": TrackedObjectUpdateTypesEnum.transcription,
|
||||||
|
"text": "",
|
||||||
|
"camera": self.camera_config.name,
|
||||||
|
}
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
def expire_all_detections(self) -> None:
|
def expire_all_detections(self) -> None:
|
||||||
"""Immediately end all current detections"""
|
"""Immediately end all current detections"""
|
||||||
now = datetime.datetime.now().timestamp()
|
now = datetime.datetime.now().timestamp()
|
||||||
for label, detection in list(self.detections.items()):
|
for label, detection in list(self.detections.items()):
|
||||||
if detection:
|
if detection:
|
||||||
self.requestor.send_data(f"{self.config.name}/audio/{label}", "OFF")
|
self.requestor.send_data(
|
||||||
|
f"{self.camera_config.name}/audio/{label}", "OFF"
|
||||||
|
)
|
||||||
self.event_metadata_publisher.publish(
|
self.event_metadata_publisher.publish(
|
||||||
EventMetadataTypeEnum.manual_event_end,
|
EventMetadataTypeEnum.manual_event_end,
|
||||||
(detection["id"], now),
|
(detection["id"], now),
|
||||||
@ -290,7 +345,7 @@ class AudioEventMaintainer(threading.Thread):
|
|||||||
if self.stop_event.is_set():
|
if self.stop_event.is_set():
|
||||||
return
|
return
|
||||||
|
|
||||||
time.sleep(self.config.ffmpeg.retry_interval)
|
time.sleep(self.camera_config.ffmpeg.retry_interval)
|
||||||
self.logpipe.dump()
|
self.logpipe.dump()
|
||||||
self.start_or_restart_ffmpeg()
|
self.start_or_restart_ffmpeg()
|
||||||
|
|
||||||
@ -312,20 +367,20 @@ class AudioEventMaintainer(threading.Thread):
|
|||||||
log_and_restart()
|
log_and_restart()
|
||||||
|
|
||||||
def run(self) -> None:
|
def run(self) -> None:
|
||||||
if self.config.enabled:
|
if self.camera_config.enabled:
|
||||||
self.start_or_restart_ffmpeg()
|
self.start_or_restart_ffmpeg()
|
||||||
|
|
||||||
while not self.stop_event.is_set():
|
while not self.stop_event.is_set():
|
||||||
enabled = self.config.enabled
|
enabled = self.camera_config.enabled
|
||||||
if enabled != self.was_enabled:
|
if enabled != self.was_enabled:
|
||||||
if enabled:
|
if enabled:
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
f"Enabling audio detections for {self.config.name}"
|
f"Enabling audio detections for {self.camera_config.name}"
|
||||||
)
|
)
|
||||||
self.start_or_restart_ffmpeg()
|
self.start_or_restart_ffmpeg()
|
||||||
else:
|
else:
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
f"Disabling audio detections for {self.config.name}, ending events"
|
f"Disabling audio detections for {self.camera_config.name}, ending events"
|
||||||
)
|
)
|
||||||
self.expire_all_detections()
|
self.expire_all_detections()
|
||||||
stop_ffmpeg(self.audio_listener, self.logger)
|
stop_ffmpeg(self.audio_listener, self.logger)
|
||||||
|
|||||||
116
frigate/util/audio.py
Normal file
116
frigate/util/audio.py
Normal file
@ -0,0 +1,116 @@
|
|||||||
|
"""Utilities for creating and manipulating audio."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import subprocess as sp
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from pathvalidate import sanitize_filename
|
||||||
|
|
||||||
|
from frigate.const import CACHE_DIR
|
||||||
|
from frigate.models import Recordings
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def get_audio_from_recording(
|
||||||
|
ffmpeg,
|
||||||
|
camera_name: str,
|
||||||
|
start_ts: float,
|
||||||
|
end_ts: float,
|
||||||
|
sample_rate: int = 16000,
|
||||||
|
) -> Optional[bytes]:
|
||||||
|
"""Extract audio from recording files between start_ts and end_ts in WAV format suitable for sherpa-onnx.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
ffmpeg: FFmpeg configuration object
|
||||||
|
camera_name: Name of the camera
|
||||||
|
start_ts: Start timestamp in seconds
|
||||||
|
end_ts: End timestamp in seconds
|
||||||
|
sample_rate: Sample rate for output audio (default 16kHz for sherpa-onnx)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Bytes of WAV audio data or None if extraction failed
|
||||||
|
"""
|
||||||
|
# Fetch all relevant recording segments
|
||||||
|
recordings = (
|
||||||
|
Recordings.select(
|
||||||
|
Recordings.path,
|
||||||
|
Recordings.start_time,
|
||||||
|
Recordings.end_time,
|
||||||
|
)
|
||||||
|
.where(
|
||||||
|
(Recordings.start_time.between(start_ts, end_ts))
|
||||||
|
| (Recordings.end_time.between(start_ts, end_ts))
|
||||||
|
| ((start_ts > Recordings.start_time) & (end_ts < Recordings.end_time))
|
||||||
|
)
|
||||||
|
.where(Recordings.camera == camera_name)
|
||||||
|
.order_by(Recordings.start_time.asc())
|
||||||
|
)
|
||||||
|
|
||||||
|
if not recordings:
|
||||||
|
logger.debug(
|
||||||
|
f"No recordings found for {camera_name} between {start_ts} and {end_ts}"
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Generate concat playlist file
|
||||||
|
file_name = sanitize_filename(
|
||||||
|
f"audio_playlist_{camera_name}_{start_ts}-{end_ts}.txt"
|
||||||
|
)
|
||||||
|
file_path = os.path.join(CACHE_DIR, file_name)
|
||||||
|
try:
|
||||||
|
with open(file_path, "w") as file:
|
||||||
|
for clip in recordings:
|
||||||
|
file.write(f"file '{clip.path}'\n")
|
||||||
|
if clip.start_time < start_ts:
|
||||||
|
file.write(f"inpoint {int(start_ts - clip.start_time)}\n")
|
||||||
|
if clip.end_time > end_ts:
|
||||||
|
file.write(f"outpoint {int(end_ts - clip.start_time)}\n")
|
||||||
|
|
||||||
|
ffmpeg_cmd = [
|
||||||
|
ffmpeg.ffmpeg_path,
|
||||||
|
"-hide_banner",
|
||||||
|
"-loglevel",
|
||||||
|
"warning",
|
||||||
|
"-protocol_whitelist",
|
||||||
|
"pipe,file",
|
||||||
|
"-f",
|
||||||
|
"concat",
|
||||||
|
"-safe",
|
||||||
|
"0",
|
||||||
|
"-i",
|
||||||
|
file_path,
|
||||||
|
"-vn", # No video
|
||||||
|
"-acodec",
|
||||||
|
"pcm_s16le", # 16-bit PCM encoding
|
||||||
|
"-ar",
|
||||||
|
str(sample_rate),
|
||||||
|
"-ac",
|
||||||
|
"1", # Mono audio
|
||||||
|
"-f",
|
||||||
|
"wav",
|
||||||
|
"-",
|
||||||
|
]
|
||||||
|
|
||||||
|
process = sp.run(
|
||||||
|
ffmpeg_cmd,
|
||||||
|
capture_output=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
if process.returncode == 0:
|
||||||
|
logger.debug(
|
||||||
|
f"Successfully extracted audio for {camera_name} from {start_ts} to {end_ts}"
|
||||||
|
)
|
||||||
|
return process.stdout
|
||||||
|
else:
|
||||||
|
logger.error(f"Failed to extract audio: {process.stderr.decode()}")
|
||||||
|
return None
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error extracting audio from recordings: {e}")
|
||||||
|
return None
|
||||||
|
finally:
|
||||||
|
try:
|
||||||
|
os.unlink(file_path)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
Loading…
Reference in New Issue
Block a user