publish live transcriptions on their own topic instead of tracked_object_update
This commit is contained in:
parent 772190869f
commit 8e56b132f1
@@ -116,7 +116,7 @@ Optional config parameters that can be set at the global level include:

 #### Live transcription

-The single camera Live view in the Frigate UI supports live transcription of audio for streams defined with the `audio` role.
+The single camera Live view in the Frigate UI supports live transcription of audio for streams defined with the `audio` role. Use the Enable/Disable Live Audio Transcription button/switch to toggle transcription processing. When speech is heard, the UI displays the transcribed text in a black overlay box on top of the camera stream. The MQTT topic `frigate/<camera_name>/audio/transcription` is also updated in real time with the transcribed text.

 Results can be error-prone due to a number of factors, including:
@@ -128,7 +128,7 @@ Results can be error-prone due to a number of factors, including:

 For speech sources close to the camera with minimal background noise, use the `small` model.

-If you have CUDA hardware, you can experiment with the `large` `whisper` model on GPU. Performance is not quite as fast as the `sherpa-onnx` `small` model, but live transcription is far more accurate.
+If you have CUDA hardware, you can experiment with the `large` `whisper` model on GPU. Performance is not quite as fast as the `sherpa-onnx` `small` model, but live transcription is far more accurate. Using the `large` model on CPU will likely be too slow for real-time transcription.

 #### Transcription and translation of `speech` audio events
@@ -143,16 +143,6 @@ Message published for updates to tracked object metadata, for example:

 }
 ```

-#### Live Audio Transcription Update
-
-```json
-{
-  "type": "transcription",
-  "text": "Hello Johnny, are you home?",
-  "camera": "doorbell"
-}
-```
-
 ### `frigate/reviews`

 Message published for each changed review item. The first message is published when the `detection` or `alert` is initiated. When additional objects are detected or when a zone change occurs, an `update` message with the same id is published. When the review activity has ended, a final `end` message is published.
@@ -265,6 +255,12 @@ Publishes the rms value for audio detected on this camera.

 **NOTE:** Requires audio detection to be enabled

+### `frigate/<camera_name>/audio/transcription`
+
+Publishes transcribed text for audio detected on this camera.
+
+**NOTE:** Requires audio detection and transcription to be enabled
+
 ### `frigate/<camera_name>/enabled/set`

 Topic to turn Frigate's processing of a camera on and off. Expected values are `ON` and `OFF`.
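For reference, a minimal external consumer of the new topic might look like the sketch below, using the `mqtt` npm package. The broker URL and the camera name `doorbell` are assumptions for illustration, not part of this commit:

```ts
// Hedged sketch: subscribe to Frigate's per-camera live transcription topic.
// Broker URL and camera name are assumed for illustration.
import mqtt from "mqtt";

const client = mqtt.connect("mqtt://localhost:1883");

client.on("connect", () => {
  client.subscribe("frigate/doorbell/audio/transcription");
});

client.on("message", (topic, message) => {
  // The payload is the plain transcribed text, not a JSON envelope.
  console.log(`${topic}: ${message.toString()}`);
});
```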
@@ -1,6 +1,5 @@
 """Handle processing audio for speech transcription using sherpa-onnx with FFmpeg pipe."""

-import json
 import logging
 import os
 import queue
@@ -13,7 +12,6 @@ import sherpa_onnx
 from frigate.comms.inter_process import InterProcessRequestor
 from frigate.config import CameraConfig, FrigateConfig
 from frigate.const import MODEL_CACHE_DIR
-from frigate.types import TrackedObjectUpdateTypesEnum
 from frigate.util.downloader import ModelDownloader

 from ..types import DataProcessorMetrics
@@ -205,14 +203,7 @@ class AudioTranscriptionRealTimeProcessor(RealTimeProcessorApi):
             logger.debug(f"Transcribed audio: '{text}', Endpoint: {is_endpoint}")

             self.requestor.send_data(
-                "tracked_object_update",
-                json.dumps(
-                    {
-                        "type": TrackedObjectUpdateTypesEnum.transcription,
-                        "text": text,
-                        "camera": obj_data["camera"],
-                    }
-                ),
+                f"{self.camera_config.name}/audio/transcription", text
             )

             self.audio_queue.task_done()
@@ -237,14 +228,8 @@ class AudioTranscriptionRealTimeProcessor(RealTimeProcessorApi):
             self.transcription_segments = []

             self.requestor.send_data(
-                "tracked_object_update",
-                json.dumps(
-                    {
-                        "type": TrackedObjectUpdateTypesEnum.transcription,
-                        "text": (output[2].strip()),
-                        "camera": camera,
-                    }
-                ),
+                f"{self.camera_config.name}/audio/transcription",
+                (output[2].strip() + " "),
             )

             # reset whisper
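To make the contract change concrete, here is a hedged before/after sketch of what a subscriber receives. The sample text is taken from the docs example removed above; the `frigate/` prefix on the external topics is assumed to be added by the MQTT layer from the configured topic prefix:

```ts
// Before this commit: a JSON envelope on the shared tracked_object_update topic.
const oldTopic = "frigate/tracked_object_update";
const oldPayload = JSON.stringify({
  type: "transcription",
  text: "Hello Johnny, are you home?",
  camera: "doorbell",
});

// After this commit: the raw text on a camera-scoped topic; consumers no
// longer need to parse JSON or filter by type and camera.
const newTopic = "frigate/doorbell/audio/transcription";
const newPayload = "Hello Johnny, are you home?";
```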
@@ -1,7 +1,6 @@
 """Handle creating audio events."""

 import datetime
-import json
 import logging
 import random
 import string
@@ -37,7 +36,6 @@ from frigate.data_processing.real_time.audio_transcription import (
 from frigate.ffmpeg_presets import parse_preset_input
 from frigate.log import LogPipe
 from frigate.object_detection.base import load_labels
-from frigate.types import TrackedObjectUpdateTypesEnum
 from frigate.util.builtin import get_ffmpeg_arg_list
 from frigate.video import start_or_restart_ffmpeg, stop_ffmpeg
@@ -315,14 +313,7 @@ class AudioEventMaintainer(threading.Thread):
         if self.transcription_processor is not None:
             self.transcription_processor.reset(self.camera_config.name)
             self.requestor.send_data(
-                "tracked_object_update",
-                json.dumps(
-                    {
-                        "type": TrackedObjectUpdateTypesEnum.transcription,
-                        "text": "",
-                        "camera": self.camera_config.name,
-                    }
-                ),
+                f"{self.camera_config.name}/audio/transcription", ""
             )

     def expire_all_detections(self) -> None:
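Note that this reset path now publishes an empty string on the same per-camera topic. A hedged sketch of one way a consumer could treat that as a "clear" signal (the helper name is illustrative, not from this commit):

```ts
// Hypothetical helper: an empty payload from the reset path means
// "clear the current transcription" rather than text to display.
function shouldShowTranscription(payload: string | null): boolean {
  return payload != null && payload !== "";
}
```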
@@ -27,4 +27,3 @@ class TrackedObjectUpdateTypesEnum(str, Enum):
     description = "description"
     face = "face"
     lpr = "lpr"
-    transcription = "transcription"
@@ -440,6 +440,15 @@ export function useAudioActivity(camera: string): { payload: number } {
   return { payload: payload as number };
 }

+export function useAudioLiveTranscription(camera: string): {
+  payload: string;
+} {
+  const {
+    value: { payload },
+  } = useWs(`${camera}/audio/transcription`, "");
+  return { payload: payload as string };
+}
+
 export function useMotionThreshold(camera: string): {
   payload: string;
   send: (payload: number, retain?: boolean) => void;
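A hedged usage sketch of the new hook; the component and its markup are illustrative and not part of this commit:

```tsx
import { useAudioLiveTranscription } from "@/api/ws";

// Hypothetical component: renders the latest live transcription for a
// camera, and nothing while the payload is empty.
export function TranscriptionBadge({ cameraName }: { cameraName: string }) {
  const { payload: transcription } = useAudioLiveTranscription(cameraName);

  if (!transcription) {
    return null;
  }

  return <div className="transcription-badge">{transcription}</div>;
}
```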
@@ -1,4 +1,5 @@
 import {
+  useAudioLiveTranscription,
   useAudioState,
   useAudioTranscriptionState,
   useAutotrackingState,
@@ -7,7 +8,6 @@ import {
   usePtzCommand,
   useRecordingsState,
   useSnapshotsState,
-  useTrackedObjectUpdate,
 } from "@/api/ws";
 import CameraFeatureToggle from "@/components/dynamic/CameraFeatureToggle";
 import FilterSwitch from "@/components/filter/FilterSwitch";
@@ -204,21 +204,17 @@ export default function LiveCameraView({

   const { payload: audioTranscriptionState, send: sendTranscription } =
     useAudioTranscriptionState(camera.name);
-  const { payload: wsUpdate } = useTrackedObjectUpdate();
+  const { payload: transcription } = useAudioLiveTranscription(camera.name);
   const transcriptionRef = useRef<HTMLDivElement>(null);

   useEffect(() => {
-    if (
-      wsUpdate &&
-      wsUpdate.type == "transcription" &&
-      wsUpdate.camera == camera.name
-    ) {
+    if (transcription) {
       if (transcriptionRef.current) {
         transcriptionRef.current.scrollTop =
           transcriptionRef.current.scrollHeight;
       }
     }
-  }, [wsUpdate, camera.name]);
+  }, [transcription]);

   useEffect(() => {
     return () => {
@@ -661,15 +657,12 @@ export default function LiveCameraView({
           </TransformComponent>
           {camera?.audio?.enabled_in_config &&
             audioTranscriptionState == "ON" &&
-            wsUpdate &&
-            wsUpdate.type === "transcription" &&
-            wsUpdate.camera === camera.name &&
-            wsUpdate.text !== "" && (
+            transcription != null && (
               <div
                 ref={transcriptionRef}
                 className="text-md scrollbar-container absolute bottom-4 left-1/2 max-h-[15vh] w-[75%] -translate-x-1/2 overflow-y-auto rounded-lg bg-black/70 p-2 text-white md:w-[50%]"
               >
-                {wsUpdate.text}
+                {transcription}
               </div>
             )}
         </div>