From 8e56b132f1f46c43922513f2bf457fba8e9ff88d Mon Sep 17 00:00:00 2001
From: Josh Hawkins <32435876+hawkeye217@users.noreply.github.com>
Date: Tue, 27 May 2025 09:50:17 -0500
Subject: [PATCH] publish live transcriptions on their own topic instead of
 tracked_object_update

---
 docs/docs/configuration/audio_detectors.md    |  4 ++--
 docs/docs/integrations/mqtt.md                | 16 ++++++--------
 .../real_time/audio_transcription.py          | 21 +++----------------
 frigate/events/audio.py                       | 11 +---------
 frigate/types.py                              |  1 -
 web/src/api/ws.tsx                            |  9 ++++++++
 web/src/views/live/LiveCameraView.tsx         | 19 ++++++-----------
 7 files changed, 27 insertions(+), 54 deletions(-)

diff --git a/docs/docs/configuration/audio_detectors.md b/docs/docs/configuration/audio_detectors.md
index 0c536ced0..5031e4184 100644
--- a/docs/docs/configuration/audio_detectors.md
+++ b/docs/docs/configuration/audio_detectors.md
@@ -116,7 +116,7 @@ Optional config parameters that can be set at the global level include:
 
 #### Live transcription
 
-The single camera Live view in the Frigate UI supports live transcription of audio for streams defined with the `audio` role.
+The single camera Live view in the Frigate UI supports live transcription of audio for streams defined with the `audio` role. Use the Enable/Disable Live Audio Transcription button/switch to toggle transcription processing. When speech is heard, the UI will display the transcribed text in a black box overlaid on the camera stream. The MQTT topic `frigate/<camera_name>/audio/transcription` will also be updated in real time with the transcribed text.
 
 Results can be error-prone due to a number of factors, including:
 
@@ -128,7 +128,7 @@ Results can be error-prone due to a number of factors, including:
 
 For speech sources close to the camera with minimal background noise, use the `small` model.
 
-If you have CUDA hardware, you can experiment with the `large` `whisper` model on GPU. Performance is not quite as fast as the `sherpa-onnx` `small` model, but live transcription is far more accurate.
+If you have CUDA hardware, you can experiment with the `large` `whisper` model on GPU. Performance is not quite as fast as the `sherpa-onnx` `small` model, but live transcription is far more accurate. Using the `large` model with CPU will likely be too slow for real-time transcription.
 
 #### Transcription and translation of `speech` audio events
 
diff --git a/docs/docs/integrations/mqtt.md b/docs/docs/integrations/mqtt.md
index f09aceb9d..56f31a021 100644
--- a/docs/docs/integrations/mqtt.md
+++ b/docs/docs/integrations/mqtt.md
@@ -143,16 +143,6 @@ Message published for updates to tracked object metadata, for example:
 }
 ```
 
-#### Live Audio Transcription Update
-
-```json
-{
-  "type": "transcription",
-  "text": "Hello Johnny, are you home?",
-  "camera": "doorbell"
-}
-```
-
 ### `frigate/reviews`
 
 Message published for each changed review item. The first message is published when the `detection` or `alert` is initiated. When additional objects are detected or when a zone change occurs, it will publish a, `update` message with the same id. When the review activity has ended a final `end` message is published.
@@ -265,6 +255,12 @@ Publishes the rms value for audio detected on this camera.
 
 **NOTE:** Requires audio detection to be enabled
 
+### `frigate/<camera_name>/audio/transcription`
+
+Publishes transcribed text for audio detected on this camera. An empty string is published when the transcription is reset.
+
+**NOTE:** Requires audio detection and transcription to be enabled
+
 ### `frigate/<camera_name>/enabled/set`
 
 Topic to turn Frigate's processing of a camera on and off. Expected values are `ON` and `OFF`.
diff --git a/frigate/data_processing/real_time/audio_transcription.py b/frigate/data_processing/real_time/audio_transcription.py
index c0c8b86ae..7ed644498 100644
--- a/frigate/data_processing/real_time/audio_transcription.py
+++ b/frigate/data_processing/real_time/audio_transcription.py
@@ -1,6 +1,5 @@
 """Handle processing audio for speech transcription using sherpa-onnx with FFmpeg pipe."""
 
-import json
 import logging
 import os
 import queue
@@ -13,7 +12,6 @@ import sherpa_onnx
 from frigate.comms.inter_process import InterProcessRequestor
 from frigate.config import CameraConfig, FrigateConfig
 from frigate.const import MODEL_CACHE_DIR
-from frigate.types import TrackedObjectUpdateTypesEnum
 from frigate.util.downloader import ModelDownloader
 
 from ..types import DataProcessorMetrics
@@ -205,14 +203,7 @@ class AudioTranscriptionRealTimeProcessor(RealTimeProcessorApi):
                 logger.debug(f"Transcribed audio: '{text}', Endpoint: {is_endpoint}")
 
                 self.requestor.send_data(
-                    "tracked_object_update",
-                    json.dumps(
-                        {
-                            "type": TrackedObjectUpdateTypesEnum.transcription,
-                            "text": text,
-                            "camera": obj_data["camera"],
-                        }
-                    ),
+                    f"{self.camera_config.name}/audio/transcription", text
                 )
 
                 self.audio_queue.task_done()
@@ -237,14 +228,8 @@ class AudioTranscriptionRealTimeProcessor(RealTimeProcessorApi):
             self.transcription_segments = []
 
             self.requestor.send_data(
-                "tracked_object_update",
-                json.dumps(
-                    {
-                        "type": TrackedObjectUpdateTypesEnum.transcription,
-                        "text": (output[2].strip()),
-                        "camera": camera,
-                    }
-                ),
+                f"{self.camera_config.name}/audio/transcription",
+                (output[2].strip() + " "),
             )
 
             # reset whisper
diff --git a/frigate/events/audio.py b/frigate/events/audio.py
index 11e8a2ae8..dc6ee7128 100644
--- a/frigate/events/audio.py
+++ b/frigate/events/audio.py
@@ -1,7 +1,6 @@
 """Handle creating audio events."""
 
 import datetime
-import json
 import logging
 import random
 import string
@@ -37,7 +36,6 @@ from frigate.data_processing.real_time.audio_transcription import (
 from frigate.ffmpeg_presets import parse_preset_input
 from frigate.log import LogPipe
 from frigate.object_detection.base import load_labels
-from frigate.types import TrackedObjectUpdateTypesEnum
 from frigate.util.builtin import get_ffmpeg_arg_list
 from frigate.video import start_or_restart_ffmpeg, stop_ffmpeg
 
@@ -315,14 +313,7 @@ class AudioEventMaintainer(threading.Thread):
                 if self.transcription_processor is not None:
                     self.transcription_processor.reset(self.camera_config.name)
                     self.requestor.send_data(
-                        "tracked_object_update",
-                        json.dumps(
-                            {
-                                "type": TrackedObjectUpdateTypesEnum.transcription,
-                                "text": "",
-                                "camera": self.camera_config.name,
-                            }
-                        ),
+                        f"{self.camera_config.name}/audio/transcription", ""
                     )
 
     def expire_all_detections(self) -> None:
diff --git a/frigate/types.py b/frigate/types.py
index 13d51390f..ee48cc02b 100644
--- a/frigate/types.py
+++ b/frigate/types.py
@@ -27,4 +27,3 @@ class TrackedObjectUpdateTypesEnum(str, Enum):
     description = "description"
     face = "face"
     lpr = "lpr"
-    transcription = "transcription"
diff --git a/web/src/api/ws.tsx b/web/src/api/ws.tsx
index d0f2364ef..79bf9e79d 100644
--- a/web/src/api/ws.tsx
+++ b/web/src/api/ws.tsx
@@ -440,6 +440,15 @@ export function useAudioActivity(camera: string): { payload: number } {
   return { payload: payload as number };
 }
 
+export function useAudioLiveTranscription(camera: string): {
+  payload: string;
+} {
+  const {
+    value: { payload },
+  } = useWs(`${camera}/audio/transcription`, "");
+  return { payload: payload as string };
+}
+
 export function useMotionThreshold(camera: string): {
   payload: string;
   send: (payload: number, retain?: boolean) => void;
diff --git a/web/src/views/live/LiveCameraView.tsx b/web/src/views/live/LiveCameraView.tsx
index 0ab908fce..039265f65 100644
--- a/web/src/views/live/LiveCameraView.tsx
+++ b/web/src/views/live/LiveCameraView.tsx
@@ -1,4 +1,5 @@
 import {
+  useAudioLiveTranscription,
   useAudioState,
   useAudioTranscriptionState,
   useAutotrackingState,
@@ -7,7 +8,6 @@ import {
   usePtzCommand,
   useRecordingsState,
   useSnapshotsState,
-  useTrackedObjectUpdate,
 } from "@/api/ws";
 import CameraFeatureToggle from "@/components/dynamic/CameraFeatureToggle";
 import FilterSwitch from "@/components/filter/FilterSwitch";
@@ -204,21 +204,17 @@ export default function LiveCameraView({
 
   const { payload: audioTranscriptionState, send: sendTranscription } =
     useAudioTranscriptionState(camera.name);
-  const { payload: wsUpdate } = useTrackedObjectUpdate();
+  const { payload: transcription } = useAudioLiveTranscription(camera.name);
   const transcriptionRef = useRef<HTMLDivElement>(null);
 
   useEffect(() => {
-    if (
-      wsUpdate &&
-      wsUpdate.type == "transcription" &&
-      wsUpdate.camera == camera.name
-    ) {
+    if (transcription) {
       if (transcriptionRef.current) {
         transcriptionRef.current.scrollTop =
           transcriptionRef.current.scrollHeight;
       }
     }
-  }, [wsUpdate, camera.name]);
+  }, [transcription]);
 
   useEffect(() => {
     return () => {
@@ -661,15 +657,12 @@ export default function LiveCameraView({
           </TransformComponent>
           {camera?.audio?.enabled_in_config &&
             audioTranscriptionState == "ON" &&
-            wsUpdate &&
-            wsUpdate.type === "transcription" &&
-            wsUpdate.camera === camera.name &&
-            wsUpdate.text !== "" && (
+            transcription != null && (
               <div
                 ref={transcriptionRef}
                 className="text-md scrollbar-container absolute bottom-4 left-1/2 max-h-[15vh] w-[75%] -translate-x-1/2 overflow-y-auto rounded-lg bg-black/70 p-2 text-white md:w-[50%]"
               >
-                {wsUpdate.text}
+                {transcription}
               </div>
             )}
         </div>