Fix audio transcription (#20395)

* camera level config

* set up model runner on thread start to avoid unpickling error

* ensure feature is enabled globally

* suppress info logs from faster_whisper

* fix incorrect event_type for api and audio timeline entries

* docs

* fix

* clean up
Josh Hawkins 2025-10-08 18:06:03 -05:00 committed by GitHub
parent c71e235b38
commit c61bb8f8ae
7 changed files with 33 additions and 21 deletions


@@ -75,23 +75,23 @@ audio:
### Audio Transcription
Frigate supports fully local audio transcription using either `sherpa-onnx` or OpenAI's open-source Whisper models via `faster-whisper`. To enable transcription, it is recommended to only configure the features at the global level, and enable it at the individual camera level.
Frigate supports fully local audio transcription using either `sherpa-onnx` or OpenAI's open-source Whisper models via `faster-whisper`. To enable transcription, enable it in your config. Note that audio detection must also be enabled, as described above, in order to use audio transcription features.
```yaml
audio_transcription:
enabled: False
enabled: True
device: ...
model_size: ...
```
Enable audio transcription for select cameras at the camera level:
Disable audio transcription for select cameras at the camera level:
```yaml
cameras:
back_yard:
...
audio_transcription:
enabled: True
enabled: False
```
:::note
@@ -111,7 +111,6 @@ The optional config parameters that can be set at the global level include:
- **`model_size`**: The size of the model used for live transcription.
- Default: `small`
- This can be `small` or `large`. The `small` setting uses `sherpa-onnx` models that are fast, lightweight, and always run on the CPU but are not as accurate as the `whisper` model.
- The
- This config option applies to **live transcription only**. Recorded `speech` events will always use a different `whisper` model (and can be accelerated for CUDA hardware if available with `device: GPU`).
- **`language`**: Defines the language used by `whisper` to translate `speech` audio events (and live audio only if using the `large` model).
- Default: `en`
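Putting the documented options together, a minimal sketch of a combined configuration could look like the following (the camera name and the specific values chosen are illustrative, not defaults):
```yaml
audio_transcription:
  enabled: True
  device: GPU        # optional; accelerates the whisper model used for recorded `speech` events on CUDA hardware
  model_size: small  # applies to live transcription only; `small` runs the sherpa-onnx models on the CPU
  language: en       # language whisper uses when transcribing `speech` events

cameras:
  back_yard:
    audio_transcription:
      enabled: False  # opt this camera out while leaving the feature on globally
```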


@@ -19,7 +19,7 @@ from frigate.util.builtin import (
from ..base import FrigateBaseModel
from ..classification import (
AudioTranscriptionConfig,
CameraAudioTranscriptionConfig,
CameraFaceRecognitionConfig,
CameraLicensePlateRecognitionConfig,
CameraSemanticSearchConfig,
@@ -69,8 +69,9 @@ class CameraConfig(FrigateBaseModel):
audio: AudioConfig = Field(
default_factory=AudioConfig, title="Audio events configuration."
)
audio_transcription: AudioTranscriptionConfig = Field(
default_factory=AudioTranscriptionConfig, title="Audio transcription config."
audio_transcription: CameraAudioTranscriptionConfig = Field(
default_factory=CameraAudioTranscriptionConfig,
title="Audio transcription config.",
)
birdseye: BirdseyeCameraConfig = Field(
default_factory=BirdseyeCameraConfig, title="Birdseye camera configuration."


@@ -8,6 +8,7 @@ from .base import FrigateBaseModel
__all__ = [
"CameraFaceRecognitionConfig",
"CameraLicensePlateRecognitionConfig",
"CameraAudioTranscriptionConfig",
"FaceRecognitionConfig",
"SemanticSearchConfig",
"CameraSemanticSearchConfig",
@@ -47,14 +48,11 @@ class AudioTranscriptionConfig(FrigateBaseModel):
)
device: Optional[EnrichmentsDeviceEnum] = Field(
default=EnrichmentsDeviceEnum.CPU,
title="The device used for license plate recognition.",
title="The device used for audio transcription.",
)
model_size: str = Field(
default="small", title="The size of the embeddings model used."
)
enabled_in_config: Optional[bool] = Field(
default=None, title="Keep track of original state of camera."
)
live_enabled: Optional[bool] = Field(
default=False, title="Enable live transcriptions."
)
@@ -304,3 +302,15 @@ class CameraLicensePlateRecognitionConfig(FrigateBaseModel):
)
model_config = ConfigDict(extra="forbid", protected_namespaces=())
class CameraAudioTranscriptionConfig(FrigateBaseModel):
enabled: bool = Field(default=False, title="Enable audio transcription.")
enabled_in_config: Optional[bool] = Field(
default=None, title="Keep track of original state of audio transcription."
)
live_enabled: Optional[bool] = Field(
default=False, title="Enable live transcriptions."
)
model_config = ConfigDict(extra="forbid", protected_namespaces=())
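The practical effect of splitting out a camera-level model is that per-camera config only accepts the enable flags, while `device` and `model_size` stay global. A rough sketch of that behavior, assuming pydantic v2 and only the fields shown above (not the full Frigate source):
```python
# Sketch only: with extra="forbid", global-only options such as `model_size`
# are rejected if set under a camera.
from typing import Optional
from pydantic import BaseModel, ConfigDict, ValidationError

class CameraAudioTranscriptionConfig(BaseModel):
    model_config = ConfigDict(extra="forbid", protected_namespaces=())
    enabled: bool = False
    enabled_in_config: Optional[bool] = None
    live_enabled: Optional[bool] = False

print(CameraAudioTranscriptionConfig(enabled=True))  # accepted

try:
    CameraAudioTranscriptionConfig(model_size="large")
except ValidationError as e:
    print("rejected:", e.errors()[0]["type"])  # extra_forbidden
```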


@@ -141,6 +141,8 @@ class FasterWhisperASR(ASRBase):
def transcribe(self, audio, init_prompt=""):
from faster_whisper import BatchedInferencePipeline
logging.getLogger("faster_whisper").setLevel(logging.WARNING)
# tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01)
batched_model = BatchedInferencePipeline(model=self.model)
segments, info = batched_model.transcribe(


@@ -215,7 +215,7 @@ class EmbeddingMaintainer(threading.Thread):
)
)
if any(
if self.config.audio_transcription.enabled and any(
c.enabled_in_config and c.audio_transcription.enabled
for c in self.config.cameras.values()
):
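The change above makes the global flag a hard gate: the transcription post-processor is only created when the feature is enabled globally and at least one camera opts in. A standalone sketch of that pattern (the helper name is hypothetical, not part of Frigate):
```python
# Hypothetical helper illustrating the gating added above; not Frigate's API.
def transcription_needed(global_enabled: bool, camera_enabled: dict[str, bool]) -> bool:
    # The global switch short-circuits the per-camera check.
    return global_enabled and any(camera_enabled.values())

assert transcription_needed(True, {"back_yard": True, "porch": False})
assert not transcription_needed(False, {"back_yard": True})   # globally off
assert not transcription_needed(True, {"back_yard": False})   # no camera opted in
```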


@@ -95,6 +95,12 @@ class AudioProcessor(FrigateProcess):
self.cameras = cameras
self.config = config
def run(self) -> None:
self.pre_run_setup(self.config.logger)
audio_threads: list[AudioEventMaintainer] = []
threading.current_thread().name = "process:audio_manager"
if self.config.audio_transcription.enabled:
self.transcription_model_runner = AudioTranscriptionModelRunner(
self.config.audio_transcription.device,
@@ -103,12 +109,6 @@
else:
self.transcription_model_runner = None
def run(self) -> None:
self.pre_run_setup(self.config.logger)
audio_threads: list[AudioEventMaintainer] = []
threading.current_thread().name = "process:audio_manager"
if len(self.cameras) == 0:
return
@@ -180,7 +180,7 @@ class AudioEventMaintainer(threading.Thread):
)
self.detection_publisher = DetectionPublisher(DetectionTypeEnum.audio.value)
if self.camera_config.audio_transcription.enabled_in_config:
if self.config.audio_transcription.enabled:
# init the transcription processor for this camera
self.transcription_processor = AudioTranscriptionRealTimeProcessor(
config=self.config,
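The reshuffle above, moving the `AudioTranscriptionModelRunner` construction from `__init__` into `run()`, is a common way to avoid the unpickling error mentioned in the commit message: with start methods that serialize the parent object (such as `spawn`), attributes set before `start()` are pickled and sent to the child, so heavyweight or unpicklable objects are better created inside `run()`, which executes in the child. A minimal sketch of the pattern, not Frigate code:
```python
# Minimal sketch (not Frigate code) of deferring unpicklable resources to run().
# With the "spawn" start method, the Process instance is pickled and sent to the
# child when start() is called, so anything heavyweight or unpicklable (a loaded
# ASR model, for example) is better created inside run(), which runs in the child.
import multiprocessing as mp


class Worker(mp.Process):
    def __init__(self, transcription_enabled: bool) -> None:
        super().__init__()
        self.transcription_enabled = transcription_enabled  # simple, picklable state
        self.model_runner = None  # heavyweight object deferred until run()

    def run(self) -> None:
        if self.transcription_enabled:
            # Stand-in for constructing AudioTranscriptionModelRunner in the child.
            self.model_runner = object()
        # ... the worker's main loop would use self.model_runner here ...


if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)
    worker = Worker(transcription_enabled=True)
    worker.start()
    worker.join()
```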


@@ -156,7 +156,7 @@ class TimelineProcessor(threading.Thread):
event_type: str,
event_data: dict[Any, Any],
) -> bool:
if event_type != "new":
if event_type != "start":
return False
if event_data.get("type", "api") == "audio":