From 8e56b132f1f46c43922513f2bf457fba8e9ff88d Mon Sep 17 00:00:00 2001 From: Josh Hawkins <32435876+hawkeye217@users.noreply.github.com> Date: Tue, 27 May 2025 09:50:17 -0500 Subject: [PATCH] publish live transcriptions on their own topic instead of tracked_object_update --- docs/docs/configuration/audio_detectors.md | 4 ++-- docs/docs/integrations/mqtt.md | 16 ++++++-------- .../real_time/audio_transcription.py | 21 +++---------------- frigate/events/audio.py | 11 +--------- frigate/types.py | 1 - web/src/api/ws.tsx | 9 ++++++++ web/src/views/live/LiveCameraView.tsx | 19 ++++++----------- 7 files changed, 27 insertions(+), 54 deletions(-) diff --git a/docs/docs/configuration/audio_detectors.md b/docs/docs/configuration/audio_detectors.md index 0c536ced0..5031e4184 100644 --- a/docs/docs/configuration/audio_detectors.md +++ b/docs/docs/configuration/audio_detectors.md @@ -116,7 +116,7 @@ Optional config parameters that can be set at the global level include: #### Live transcription -The single camera Live view in the Frigate UI supports live transcription of audio for streams defined with the `audio` role. +The single camera Live view in the Frigate UI supports live transcription of audio for streams defined with the `audio` role. Use the Enable/Disable Live Audio Transcription button/switch to toggle transcription processing. When speech is heard, the UI will display a black box over the top of the camera stream with text. The MQTT topic `frigate/<camera_name>/audio/transcription` will also be updated in real-time with transcribed text. Results can be error-prone due to a number of factors, including: @@ -128,7 +128,7 @@ Results can be error-prone due to a number of factors, including: For speech sources close to the camera with minimal background noise, use the `small` model. -If you have CUDA hardware, you can experiment with the `large` `whisper` model on GPU. Performance is not quite as fast as the `sherpa-onnx` `small` model, but live transcription is far more accurate. 
+If you have CUDA hardware, you can experiment with the `large` `whisper` model on GPU. Performance is not quite as fast as the `sherpa-onnx` `small` model, but live transcription is far more accurate. Using the `large` model with CPU will likely be too slow for real-time transcription. #### Transcription and translation of `speech` audio events diff --git a/docs/docs/integrations/mqtt.md b/docs/docs/integrations/mqtt.md index f09aceb9d..56f31a021 100644 --- a/docs/docs/integrations/mqtt.md +++ b/docs/docs/integrations/mqtt.md @@ -143,16 +143,6 @@ Message published for updates to tracked object metadata, for example: } ``` -#### Live Audio Transcription Update - -```json -{ - "type": "transcription", - "text": "Hello Johnny, are you home?", - "camera": "doorbell" -} -``` - ### `frigate/reviews` Message published for each changed review item. The first message is published when the `detection` or `alert` is initiated. When additional objects are detected or when a zone change occurs, it will publish a, `update` message with the same id. When the review activity has ended a final `end` message is published. @@ -265,6 +255,12 @@ Publishes the rms value for audio detected on this camera. **NOTE:** Requires audio detection to be enabled +### `frigate/<camera_name>/audio/transcription` + +Publishes transcribed text for audio detected on this camera. + +**NOTE:** Requires audio detection and transcription to be enabled + ### `frigate/<camera_name>/enabled/set` Topic to turn Frigate's processing of a camera on and off. Expected values are `ON` and `OFF`. 
diff --git a/frigate/data_processing/real_time/audio_transcription.py b/frigate/data_processing/real_time/audio_transcription.py index c0c8b86ae..7ed644498 100644 --- a/frigate/data_processing/real_time/audio_transcription.py +++ b/frigate/data_processing/real_time/audio_transcription.py @@ -1,6 +1,5 @@ """Handle processing audio for speech transcription using sherpa-onnx with FFmpeg pipe.""" -import json import logging import os import queue @@ -13,7 +12,6 @@ import sherpa_onnx from frigate.comms.inter_process import InterProcessRequestor from frigate.config import CameraConfig, FrigateConfig from frigate.const import MODEL_CACHE_DIR -from frigate.types import TrackedObjectUpdateTypesEnum from frigate.util.downloader import ModelDownloader from ..types import DataProcessorMetrics @@ -205,14 +203,7 @@ class AudioTranscriptionRealTimeProcessor(RealTimeProcessorApi): logger.debug(f"Transcribed audio: '{text}', Endpoint: {is_endpoint}") self.requestor.send_data( - "tracked_object_update", - json.dumps( - { - "type": TrackedObjectUpdateTypesEnum.transcription, - "text": text, - "camera": obj_data["camera"], - } - ), + f"{self.camera_config.name}/audio/transcription", text ) self.audio_queue.task_done() @@ -237,14 +228,8 @@ class AudioTranscriptionRealTimeProcessor(RealTimeProcessorApi): self.transcription_segments = [] self.requestor.send_data( - "tracked_object_update", - json.dumps( - { - "type": TrackedObjectUpdateTypesEnum.transcription, - "text": (output[2].strip()), - "camera": camera, - } - ), + f"{self.camera_config.name}/audio/transcription", + (output[2].strip() + " "), ) # reset whisper diff --git a/frigate/events/audio.py b/frigate/events/audio.py index 11e8a2ae8..dc6ee7128 100644 --- a/frigate/events/audio.py +++ b/frigate/events/audio.py @@ -1,7 +1,6 @@ """Handle creating audio events.""" import datetime -import json import logging import random import string @@ -37,7 +36,6 @@ from frigate.data_processing.real_time.audio_transcription import ( from 
frigate.ffmpeg_presets import parse_preset_input from frigate.log import LogPipe from frigate.object_detection.base import load_labels -from frigate.types import TrackedObjectUpdateTypesEnum from frigate.util.builtin import get_ffmpeg_arg_list from frigate.video import start_or_restart_ffmpeg, stop_ffmpeg @@ -315,14 +313,7 @@ class AudioEventMaintainer(threading.Thread): if self.transcription_processor is not None: self.transcription_processor.reset(self.camera_config.name) self.requestor.send_data( - "tracked_object_update", - json.dumps( - { - "type": TrackedObjectUpdateTypesEnum.transcription, - "text": "", - "camera": self.camera_config.name, - } - ), + f"{self.camera_config.name}/audio/transcription", "" ) def expire_all_detections(self) -> None: diff --git a/frigate/types.py b/frigate/types.py index 13d51390f..ee48cc02b 100644 --- a/frigate/types.py +++ b/frigate/types.py @@ -27,4 +27,3 @@ class TrackedObjectUpdateTypesEnum(str, Enum): description = "description" face = "face" lpr = "lpr" - transcription = "transcription" diff --git a/web/src/api/ws.tsx b/web/src/api/ws.tsx index d0f2364ef..79bf9e79d 100644 --- a/web/src/api/ws.tsx +++ b/web/src/api/ws.tsx @@ -440,6 +440,15 @@ export function useAudioActivity(camera: string): { payload: number } { return { payload: payload as number }; } +export function useAudioLiveTranscription(camera: string): { + payload: string; +} { + const { + value: { payload }, + } = useWs(`${camera}/audio/transcription`, ""); + return { payload: payload as string }; +} + export function useMotionThreshold(camera: string): { payload: string; send: (payload: number, retain?: boolean) => void; diff --git a/web/src/views/live/LiveCameraView.tsx b/web/src/views/live/LiveCameraView.tsx index 0ab908fce..039265f65 100644 --- a/web/src/views/live/LiveCameraView.tsx +++ b/web/src/views/live/LiveCameraView.tsx @@ -1,4 +1,5 @@ import { + useAudioLiveTranscription, useAudioState, useAudioTranscriptionState, useAutotrackingState, @@ -7,7 +8,6 @@ 
import { usePtzCommand, useRecordingsState, useSnapshotsState, - useTrackedObjectUpdate, } from "@/api/ws"; import CameraFeatureToggle from "@/components/dynamic/CameraFeatureToggle"; import FilterSwitch from "@/components/filter/FilterSwitch"; @@ -204,21 +204,17 @@ export default function LiveCameraView({ const { payload: audioTranscriptionState, send: sendTranscription } = useAudioTranscriptionState(camera.name); - const { payload: wsUpdate } = useTrackedObjectUpdate(); + const { payload: transcription } = useAudioLiveTranscription(camera.name); const transcriptionRef = useRef(null); useEffect(() => { - if ( - wsUpdate && - wsUpdate.type == "transcription" && - wsUpdate.camera == camera.name - ) { + if (transcription) { if (transcriptionRef.current) { transcriptionRef.current.scrollTop = transcriptionRef.current.scrollHeight; } } - }, [wsUpdate, camera.name]); + }, [transcription]); useEffect(() => { return () => { @@ -661,15 +657,12 @@ export default function LiveCameraView({ {camera?.audio?.enabled_in_config && audioTranscriptionState == "ON" && - wsUpdate && - wsUpdate.type === "transcription" && - wsUpdate.camera === camera.name && - wsUpdate.text !== "" && ( + transcription != null && (
- {wsUpdate.text} + {transcription}
)}