Mirror of https://github.com/blakeblackshear/frigate.git (synced 2025-12-06 13:34:13 +03:00)
Fix audio transcription (#20395)
* camera level config
* set up model runner on thread start to avoid unpickling error
* ensure feature is enabled globally
* suppress info logs from faster_whisper
* fix incorrect event_type for api and audio timeline entries
* docs
* fix
* clean up
This commit is contained in:
Parent: c71e235b38
Commit: c61bb8f8ae
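One item above, "set up model runner on thread start to avoid unpickling error", reflects a general multiprocessing constraint: objects that hold non-picklable state (loaded models, locks, open handles) should be created after the worker starts rather than in `__init__`, because the spawning machinery may need to pickle the process object. The sketch below illustrates only that pattern; `DummyModelRunner` and `AudioProcessSketch` are hypothetical stand-ins, not Frigate's classes.

```python
import multiprocessing as mp
import threading


class DummyModelRunner:
    """Hypothetical stand-in for a heavy, non-picklable model wrapper."""

    def __init__(self, device: str) -> None:
        self.device = device
        self.lock = threading.Lock()  # locks are not picklable


class AudioProcessSketch(mp.Process):
    def __init__(self, device: str) -> None:
        super().__init__(name="audio_process_sketch")
        # Keep only plain, picklable configuration on the instance here.
        self.device = device
        self.model_runner = None

    def run(self) -> None:
        # Construct the heavy object inside the child process, after start(),
        # so it never has to travel through pickle when the process is spawned.
        self.model_runner = DummyModelRunner(self.device)
        print(f"model runner ready on {self.model_runner.device}")


if __name__ == "__main__":
    p = AudioProcessSketch("CPU")
    p.start()
    p.join()
```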
@@ -75,23 +75,23 @@ audio:
 
 ### Audio Transcription
 
-Frigate supports fully local audio transcription using either `sherpa-onnx` or OpenAI’s open-source Whisper models via `faster-whisper`. To enable transcription, it is recommended to only configure the features at the global level, and enable it at the individual camera level.
+Frigate supports fully local audio transcription using either `sherpa-onnx` or OpenAI’s open-source Whisper models via `faster-whisper`. To enable transcription, enable it in your config. Note that audio detection must also be enabled as described above in order to use audio transcription features.
 
 ```yaml
 audio_transcription:
-  enabled: False
+  enabled: True
   device: ...
   model_size: ...
 ```
 
-Enable audio transcription for select cameras at the camera level:
+Disable audio transcription for select cameras at the camera level:
 
 ```yaml
 cameras:
   back_yard:
     ...
     audio_transcription:
-      enabled: True
+      enabled: False
 ```
 
 :::note
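Taken together, the updated documentation describes a setup where transcription is switched on once at the global level and switched off per camera where it is not wanted. A combined sketch of that layout (camera names are placeholders) could look like:

```yaml
audio_transcription:
  enabled: True

cameras:
  back_yard:
    audio_transcription:
      enabled: False  # opt this camera out
  front_door:
    ...  # inherits the global setting
```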
@@ -111,7 +111,6 @@ The optional config parameters that can be set at the global level include:
 - **`model_size`**: The size of the model used for live transcription.
   - Default: `small`
   - This can be `small` or `large`. The `small` setting uses `sherpa-onnx` models that are fast, lightweight, and always run on the CPU but are not as accurate as the `whisper` model.
-  - The
   - This config option applies to **live transcription only**. Recorded `speech` events will always use a different `whisper` model (and can be accelerated for CUDA hardware if available with `device: GPU`).
 - **`language`**: Defines the language used by `whisper` to translate `speech` audio events (and live audio only if using the `large` model).
   - Default: `en`
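For context, the optional global parameters covered by this hunk can be combined in one block; the values below are illustrative choices, not defaults:

```yaml
audio_transcription:
  enabled: True
  model_size: large  # `small` uses sherpa-onnx on CPU; `large` uses whisper
  device: GPU        # recorded `speech` events can use CUDA acceleration
  language: en       # language whisper uses for `speech` events
```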
@@ -19,7 +19,7 @@ from frigate.util.builtin import (
 
 from ..base import FrigateBaseModel
 from ..classification import (
-    AudioTranscriptionConfig,
+    CameraAudioTranscriptionConfig,
    CameraFaceRecognitionConfig,
    CameraLicensePlateRecognitionConfig,
    CameraSemanticSearchConfig,
@@ -69,8 +69,9 @@ class CameraConfig(FrigateBaseModel):
     audio: AudioConfig = Field(
         default_factory=AudioConfig, title="Audio events configuration."
     )
-    audio_transcription: AudioTranscriptionConfig = Field(
-        default_factory=AudioTranscriptionConfig, title="Audio transcription config."
+    audio_transcription: CameraAudioTranscriptionConfig = Field(
+        default_factory=CameraAudioTranscriptionConfig,
+        title="Audio transcription config.",
     )
     birdseye: BirdseyeCameraConfig = Field(
         default_factory=BirdseyeCameraConfig, title="Birdseye camera configuration."
@@ -8,6 +8,7 @@ from .base import FrigateBaseModel
 __all__ = [
     "CameraFaceRecognitionConfig",
     "CameraLicensePlateRecognitionConfig",
+    "CameraAudioTranscriptionConfig",
     "FaceRecognitionConfig",
     "SemanticSearchConfig",
     "CameraSemanticSearchConfig",
@@ -47,14 +48,11 @@ class AudioTranscriptionConfig(FrigateBaseModel):
     )
     device: Optional[EnrichmentsDeviceEnum] = Field(
         default=EnrichmentsDeviceEnum.CPU,
-        title="The device used for license plate recognition.",
+        title="The device used for audio transcription.",
     )
     model_size: str = Field(
         default="small", title="The size of the embeddings model used."
     )
-    enabled_in_config: Optional[bool] = Field(
-        default=None, title="Keep track of original state of camera."
-    )
     live_enabled: Optional[bool] = Field(
         default=False, title="Enable live transcriptions."
     )
@@ -304,3 +302,15 @@ class CameraLicensePlateRecognitionConfig(FrigateBaseModel):
     )
 
     model_config = ConfigDict(extra="forbid", protected_namespaces=())
+
+
+class CameraAudioTranscriptionConfig(FrigateBaseModel):
+    enabled: bool = Field(default=False, title="Enable audio transcription.")
+    enabled_in_config: Optional[bool] = Field(
+        default=None, title="Keep track of original state of audio transcription."
+    )
+    live_enabled: Optional[bool] = Field(
+        default=False, title="Enable live transcriptions."
+    )
+
+    model_config = ConfigDict(extra="forbid", protected_namespaces=())
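The hunk above splits transcription settings into a global `AudioTranscriptionConfig` (device, model size, and so on) and a slimmer per-camera `CameraAudioTranscriptionConfig` that only tracks enablement. A simplified, self-contained sketch of that split, using plain Pydantic `BaseModel` as a stand-in for `FrigateBaseModel` (which is not shown here), might look like this:

```python
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field


class AudioTranscriptionConfig(BaseModel):
    """Global transcription settings (simplified sketch)."""

    enabled: bool = Field(default=False, title="Enable audio transcription.")
    # Frigate uses an EnrichmentsDeviceEnum here; a plain string keeps this self-contained.
    device: Optional[str] = Field(
        default="CPU", title="The device used for audio transcription."
    )
    model_size: str = Field(default="small", title="The size of the model used.")
    live_enabled: Optional[bool] = Field(
        default=False, title="Enable live transcriptions."
    )

    model_config = ConfigDict(extra="forbid", protected_namespaces=())


class CameraAudioTranscriptionConfig(BaseModel):
    """Per-camera override: only enablement, no device or model settings."""

    enabled: bool = Field(default=False, title="Enable audio transcription.")
    enabled_in_config: Optional[bool] = Field(
        default=None, title="Keep track of original state of audio transcription."
    )
    live_enabled: Optional[bool] = Field(
        default=False, title="Enable live transcriptions."
    )

    model_config = ConfigDict(extra="forbid", protected_namespaces=())


# The camera-level model rejects global-only keys such as `device`:
print(CameraAudioTranscriptionConfig(enabled=True).model_dump())
```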
@@ -141,6 +141,8 @@ class FasterWhisperASR(ASRBase):
     def transcribe(self, audio, init_prompt=""):
         from faster_whisper import BatchedInferencePipeline
 
+        logging.getLogger("faster_whisper").setLevel(logging.WARNING)
+
         # tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01)
         batched_model = BatchedInferencePipeline(model=self.model)
         segments, info = batched_model.transcribe(
@@ -215,7 +215,7 @@ class EmbeddingMaintainer(threading.Thread):
             )
         )
 
-        if any(
+        if self.config.audio_transcription.enabled and any(
             c.enabled_in_config and c.audio_transcription.enabled
             for c in self.config.cameras.values()
         ):
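This change adds a global-enable guard in front of the per-camera check, matching the "ensure feature is enabled globally" item in the commit message. The standalone sketch below shows the shape of that condition with made-up stand-in objects, not Frigate's real config classes:

```python
from dataclasses import dataclass


@dataclass
class _Transcription:
    enabled: bool


@dataclass
class _Camera:
    enabled_in_config: bool
    audio_transcription: _Transcription


def transcription_needed(global_enabled: bool, cameras: dict[str, _Camera]) -> bool:
    # The feature must be switched on globally AND at least one enabled camera
    # must have transcription turned on.
    return global_enabled and any(
        c.enabled_in_config and c.audio_transcription.enabled
        for c in cameras.values()
    )


cameras = {
    "back_yard": _Camera(True, _Transcription(True)),
    "garage": _Camera(True, _Transcription(False)),
}
print(transcription_needed(True, cameras))   # True
print(transcription_needed(False, cameras))  # False: disabled globally
```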
@@ -95,6 +95,12 @@ class AudioProcessor(FrigateProcess):
         self.cameras = cameras
         self.config = config
 
+    def run(self) -> None:
+        self.pre_run_setup(self.config.logger)
+        audio_threads: list[AudioEventMaintainer] = []
+
+        threading.current_thread().name = "process:audio_manager"
+
         if self.config.audio_transcription.enabled:
             self.transcription_model_runner = AudioTranscriptionModelRunner(
                 self.config.audio_transcription.device,
@@ -103,12 +109,6 @@ class AudioProcessor(FrigateProcess):
         else:
             self.transcription_model_runner = None
 
-    def run(self) -> None:
-        self.pre_run_setup(self.config.logger)
-        audio_threads: list[AudioEventMaintainer] = []
-
-        threading.current_thread().name = "process:audio_manager"
-
         if len(self.cameras) == 0:
             return
 
@@ -180,7 +180,7 @@ class AudioEventMaintainer(threading.Thread):
         )
         self.detection_publisher = DetectionPublisher(DetectionTypeEnum.audio.value)
 
-        if self.camera_config.audio_transcription.enabled_in_config:
+        if self.config.audio_transcription.enabled:
             # init the transcription processor for this camera
             self.transcription_processor = AudioTranscriptionRealTimeProcessor(
                 config=self.config,
@@ -156,7 +156,7 @@ class TimelineProcessor(threading.Thread):
         event_type: str,
         event_data: dict[Any, Any],
     ) -> bool:
-        if event_type != "new":
+        if event_type != "start":
             return False
 
         if event_data.get("type", "api") == "audio":