Mirror of https://github.com/blakeblackshear/frigate.git (synced 2025-12-06 13:34:13 +03:00)
Fix audio transcription (#20395)
Some checks are pending
CI / AMD64 Build (push) Waiting to run
CI / ARM Build (push) Waiting to run
CI / Jetson Jetpack 6 (push) Waiting to run
CI / AMD64 Extra Build (push) Blocked by required conditions
CI / ARM Extra Build (push) Blocked by required conditions
CI / Synaptics Build (push) Blocked by required conditions
CI / Assemble and push default build (push) Blocked by required conditions
* camera level config
* set up model runner on thread start to avoid unpickling error
* ensure feature is enabled globally
* suppress info logs from faster_whisper
* fix incorrect event_type for api and audio timeline entries
* docs
* fix
* clean up
This commit is contained in:
parent: c71e235b38
commit: c61bb8f8ae
@@ -75,23 +75,23 @@ audio:
 
 ### Audio Transcription
 
-Frigate supports fully local audio transcription using either `sherpa-onnx` or OpenAI’s open-source Whisper models via `faster-whisper`. To enable transcription, it is recommended to only configure the features at the global level, and enable it at the individual camera level.
+Frigate supports fully local audio transcription using either `sherpa-onnx` or OpenAI’s open-source Whisper models via `faster-whisper`. To enable transcription, enable it in your config. Note that audio detection must also be enabled as described above in order to use audio transcription features.
 
 ```yaml
 audio_transcription:
-  enabled: False
+  enabled: True
   device: ...
   model_size: ...
 ```
 
-Enable audio transcription for select cameras at the camera level:
+Disable audio transcription for select cameras at the camera level:
 
 ```yaml
 cameras:
   back_yard:
     ...
     audio_transcription:
-      enabled: True
+      enabled: False
 ```
 
 :::note
@@ -111,7 +111,6 @@ The optional config parameters that can be set at the global level include:
 - **`model_size`**: The size of the model used for live transcription.
   - Default: `small`
   - This can be `small` or `large`. The `small` setting uses `sherpa-onnx` models that are fast, lightweight, and always run on the CPU but are not as accurate as the `whisper` model.
-  - The
   - This config option applies to **live transcription only**. Recorded `speech` events will always use a different `whisper` model (and can be accelerated for CUDA hardware if available with `device: GPU`).
 - **`language`**: Defines the language used by `whisper` to translate `speech` audio events (and live audio only if using the `large` model).
   - Default: `en`
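
Taken together, a global configuration using these options might look like the sketch below. The keys come from the parameter list above; the values are only illustrative.

```yaml
audio_transcription:
  enabled: True
  model_size: small  # "small" = sherpa-onnx (CPU, fast); "large" = whisper
  device: GPU        # used by whisper for recorded `speech` events on CUDA hardware
  language: en       # language whisper uses for `speech` events
```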
@@ -19,7 +19,7 @@ from frigate.util.builtin import (
 
 from ..base import FrigateBaseModel
 from ..classification import (
-    AudioTranscriptionConfig,
+    CameraAudioTranscriptionConfig,
     CameraFaceRecognitionConfig,
     CameraLicensePlateRecognitionConfig,
     CameraSemanticSearchConfig,
@@ -69,8 +69,9 @@ class CameraConfig(FrigateBaseModel):
     audio: AudioConfig = Field(
         default_factory=AudioConfig, title="Audio events configuration."
     )
-    audio_transcription: AudioTranscriptionConfig = Field(
-        default_factory=AudioTranscriptionConfig, title="Audio transcription config."
+    audio_transcription: CameraAudioTranscriptionConfig = Field(
+        default_factory=CameraAudioTranscriptionConfig,
+        title="Audio transcription config.",
     )
     birdseye: BirdseyeCameraConfig = Field(
         default_factory=BirdseyeCameraConfig, title="Birdseye camera configuration."
@@ -8,6 +8,7 @@ from .base import FrigateBaseModel
 __all__ = [
     "CameraFaceRecognitionConfig",
     "CameraLicensePlateRecognitionConfig",
+    "CameraAudioTranscriptionConfig",
     "FaceRecognitionConfig",
     "SemanticSearchConfig",
     "CameraSemanticSearchConfig",
@@ -47,14 +48,11 @@ class AudioTranscriptionConfig(FrigateBaseModel):
     )
     device: Optional[EnrichmentsDeviceEnum] = Field(
         default=EnrichmentsDeviceEnum.CPU,
-        title="The device used for license plate recognition.",
+        title="The device used for audio transcription.",
     )
     model_size: str = Field(
         default="small", title="The size of the embeddings model used."
     )
-    enabled_in_config: Optional[bool] = Field(
-        default=None, title="Keep track of original state of camera."
-    )
     live_enabled: Optional[bool] = Field(
         default=False, title="Enable live transcriptions."
     )
@@ -304,3 +302,15 @@ class CameraLicensePlateRecognitionConfig(FrigateBaseModel):
     )
 
     model_config = ConfigDict(extra="forbid", protected_namespaces=())
+
+
+class CameraAudioTranscriptionConfig(FrigateBaseModel):
+    enabled: bool = Field(default=False, title="Enable audio transcription.")
+    enabled_in_config: Optional[bool] = Field(
+        default=None, title="Keep track of original state of audio transcription."
+    )
+    live_enabled: Optional[bool] = Field(
+        default=False, title="Enable live transcriptions."
+    )
+
+    model_config = ConfigDict(extra="forbid", protected_namespaces=())
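
One practical effect of splitting out a camera-level model: with `extra="forbid"`, global-only options such as `model_size` or `device` are rejected if a user sets them under a camera. A minimal standalone sketch of that behavior using plain pydantic (not Frigate's `FrigateBaseModel`):

```python
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field, ValidationError


class CameraAudioTranscriptionConfig(BaseModel):
    enabled: bool = Field(default=False)
    enabled_in_config: Optional[bool] = Field(default=None)
    live_enabled: Optional[bool] = Field(default=False)

    model_config = ConfigDict(extra="forbid", protected_namespaces=())


# Camera-level switches are accepted...
print(CameraAudioTranscriptionConfig(enabled=True))

# ...but a global-only key like model_size now fails validation per camera:
try:
    CameraAudioTranscriptionConfig(enabled=True, model_size="large")
except ValidationError as err:
    print(err.errors()[0]["type"])  # "extra_forbidden"
```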
@@ -141,6 +141,8 @@ class FasterWhisperASR(ASRBase):
     def transcribe(self, audio, init_prompt=""):
         from faster_whisper import BatchedInferencePipeline
 
+        logging.getLogger("faster_whisper").setLevel(logging.WARNING)
+
         # tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01)
         batched_model = BatchedInferencePipeline(model=self.model)
         segments, info = batched_model.transcribe(
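
The added logging line covers the "suppress info logs from faster_whisper" item in the commit message. A standalone sketch of the same pattern, assuming faster-whisper >= 1.0 is installed and a local audio.wav file exists:

```python
import logging

from faster_whisper import BatchedInferencePipeline, WhisperModel

# Silence faster_whisper's INFO chatter before transcribing.
logging.getLogger("faster_whisper").setLevel(logging.WARNING)

model = WhisperModel("small", device="cpu", compute_type="int8")
batched = BatchedInferencePipeline(model=model)

# beam_size=5 mirrors the tuning note in the diff above.
segments, info = batched.transcribe("audio.wav", beam_size=5)
for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
```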
@@ -215,7 +215,7 @@ class EmbeddingMaintainer(threading.Thread):
             )
         )
 
-        if any(
+        if self.config.audio_transcription.enabled and any(
            c.enabled_in_config and c.audio_transcription.enabled
            for c in self.config.cameras.values()
        ):
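
The reworked condition makes the maintainer spin up transcription support only when the feature is enabled globally and at least one camera opts in ("ensure feature is enabled globally" in the commit message). A minimal sketch of that short-circuit, with illustrative stand-in types:

```python
from dataclasses import dataclass


@dataclass
class AudioTranscription:
    enabled: bool


@dataclass
class Camera:
    enabled_in_config: bool
    audio_transcription: AudioTranscription


def needs_transcription(global_enabled: bool, cameras: dict[str, Camera]) -> bool:
    # Without the global flag, a camera-level enabled setting alone
    # would have started the transcription machinery.
    return global_enabled and any(
        cam.enabled_in_config and cam.audio_transcription.enabled
        for cam in cameras.values()
    )


cams = {"back_yard": Camera(True, AudioTranscription(True))}
print(needs_transcription(False, cams))  # False: globally disabled
print(needs_transcription(True, cams))   # True: global flag plus one opted-in camera
```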
@@ -95,6 +95,12 @@ class AudioProcessor(FrigateProcess):
         self.cameras = cameras
         self.config = config
 
+    def run(self) -> None:
+        self.pre_run_setup(self.config.logger)
+        audio_threads: list[AudioEventMaintainer] = []
+
+        threading.current_thread().name = "process:audio_manager"
+
         if self.config.audio_transcription.enabled:
             self.transcription_model_runner = AudioTranscriptionModelRunner(
                 self.config.audio_transcription.device,
@@ -103,12 +109,6 @@ class AudioProcessor(FrigateProcess):
         else:
             self.transcription_model_runner = None
 
-    def run(self) -> None:
-        self.pre_run_setup(self.config.logger)
-        audio_threads: list[AudioEventMaintainer] = []
-
-        threading.current_thread().name = "process:audio_manager"
-
         if len(self.cameras) == 0:
             return
 
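
This move implements the "set up model runner on thread start to avoid unpickling error" item: members assigned in a process's `__init__` are pickled and sent to the child when it starts (under the spawn start method), so unpicklable objects must be created inside `run()`. A standalone sketch, with a `threading.Lock` standing in for the model runner:

```python
import multiprocessing as mp
import threading


class AudioProcess(mp.Process):
    def __init__(self, enabled: bool) -> None:
        super().__init__()
        self.enabled = enabled
        # Not here: a model handle (or anything holding a lock) assigned in
        # __init__ must cross the pickle boundary and raises
        # "cannot pickle '_thread.lock' object" under spawn.

    def run(self) -> None:
        # Created inside the child process, after the pickle boundary.
        self.model_runner = threading.Lock() if self.enabled else None
        print("model runner ready:", self.model_runner is not None)


if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)
    p = AudioProcess(enabled=True)
    p.start()
    p.join()
```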
@@ -180,7 +180,7 @@ class AudioEventMaintainer(threading.Thread):
         )
         self.detection_publisher = DetectionPublisher(DetectionTypeEnum.audio.value)
 
-        if self.camera_config.audio_transcription.enabled_in_config:
+        if self.config.audio_transcription.enabled:
             # init the transcription processor for this camera
             self.transcription_processor = AudioTranscriptionRealTimeProcessor(
                 config=self.config,
@@ -156,7 +156,7 @@ class TimelineProcessor(threading.Thread):
         event_type: str,
         event_data: dict[Any, Any],
     ) -> bool:
-        if event_type != "new":
+        if event_type != "start":
             return False
 
         if event_data.get("type", "api") == "audio":
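
For context on this one-word fix ("fix incorrect event_type for api and audio timeline entries" in the commit message): the publisher evidently emits lifecycle types such as "start", so a guard on "new" never matched and those timeline entries were skipped. A distilled, hypothetical version of the guard:

```python
# Hypothetical distillation: only the initial "start" message for an
# api/audio event should create a timeline entry.
def should_handle(event_type: str, event_data: dict) -> bool:
    if event_type != "start":  # comparing against "new" never matched
        return False
    return event_data.get("type", "api") in ("api", "audio")


assert should_handle("start", {"type": "audio"})
assert not should_handle("update", {"type": "audio"})
```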