diff --git a/docs/docs/configuration/audio_detectors.md b/docs/docs/configuration/audio_detectors.md
index 71ad5c335..bf71f8d81 100644
--- a/docs/docs/configuration/audio_detectors.md
+++ b/docs/docs/configuration/audio_detectors.md
@@ -75,23 +75,23 @@ audio:
 
 ### Audio Transcription
 
-Frigate supports fully local audio transcription using either `sherpa-onnx` or OpenAI’s open-source Whisper models via `faster-whisper`. To enable transcription, it is recommended to only configure the features at the global level, and enable it at the individual camera level.
+Frigate supports fully local audio transcription using either `sherpa-onnx` or OpenAI’s open-source Whisper models via `faster-whisper`. To enable transcription, enable it in your config. Note that audio detection must also be enabled as described above in order to use audio transcription features.
 
 ```yaml
 audio_transcription:
-  enabled: False
+  enabled: True
   device: ...
   model_size: ...
 ```
 
-Enable audio transcription for select cameras at the camera level:
+Disable audio transcription for select cameras at the camera level:
 
 ```yaml
 cameras:
   back_yard:
     ...
     audio_transcription:
-      enabled: True
+      enabled: False
 ```
 
 :::note
@@ -111,7 +111,6 @@ The optional config parameters that can be set at the global level include:
 - **`model_size`**: The size of the model used for live transcription.
   - Default: `small`
   - This can be `small` or `large`. The `small` setting uses `sherpa-onnx` models that are fast, lightweight, and always run on the CPU but are not as accurate as the `whisper` model.
-  - The
   - This config option applies to **live transcription only**. Recorded `speech` events will always use a different `whisper` model (and can be accelerated for CUDA hardware if available with `device: GPU`).
 - **`language`**: Defines the language used by `whisper` to translate `speech` audio events (and live audio only if using the `large` model).
   - Default: `en`
diff --git a/frigate/config/camera/camera.py b/frigate/config/camera/camera.py
index 68f874138..821b2a16d 100644
--- a/frigate/config/camera/camera.py
+++ b/frigate/config/camera/camera.py
@@ -19,7 +19,7 @@ from frigate.util.builtin import (
 
 from ..base import FrigateBaseModel
 from ..classification import (
-    AudioTranscriptionConfig,
+    CameraAudioTranscriptionConfig,
     CameraFaceRecognitionConfig,
     CameraLicensePlateRecognitionConfig,
     CameraSemanticSearchConfig,
@@ -69,8 +69,9 @@ class CameraConfig(FrigateBaseModel):
     audio: AudioConfig = Field(
         default_factory=AudioConfig, title="Audio events configuration."
     )
-    audio_transcription: AudioTranscriptionConfig = Field(
-        default_factory=AudioTranscriptionConfig, title="Audio transcription config."
+    audio_transcription: CameraAudioTranscriptionConfig = Field(
+        default_factory=CameraAudioTranscriptionConfig,
+        title="Audio transcription config.",
     )
     birdseye: BirdseyeCameraConfig = Field(
         default_factory=BirdseyeCameraConfig, title="Birdseye camera configuration."
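For quick reference, here is a sketch of how the reworked options from the documentation hunks above might look in a full config: transcription is switched on once at the global level and opted out per camera. The values and the `back_yard` camera name are illustrative only, taken from the docs examples; `device`, `model_size`, and `language` are the global-level options described there.

```yaml
# Illustrative sketch only — not an addition to the docs file.
audio_transcription:
  enabled: True
  device: GPU        # recorded `speech` events can use CUDA acceleration if available
  model_size: small  # live transcription: `small` (sherpa-onnx, CPU) or `large` (whisper)
  language: en       # language used by `whisper` for `speech` events

cameras:
  back_yard:
    # ... other camera settings ...
    audio_transcription:
      enabled: False # opt this camera out while the feature stays enabled globally
```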
diff --git a/frigate/config/classification.py b/frigate/config/classification.py
index 5cc07d28a..56126e4d4 100644
--- a/frigate/config/classification.py
+++ b/frigate/config/classification.py
@@ -8,6 +8,7 @@ from .base import FrigateBaseModel
 __all__ = [
     "CameraFaceRecognitionConfig",
     "CameraLicensePlateRecognitionConfig",
+    "CameraAudioTranscriptionConfig",
     "FaceRecognitionConfig",
     "SemanticSearchConfig",
     "CameraSemanticSearchConfig",
@@ -47,14 +48,11 @@ class AudioTranscriptionConfig(FrigateBaseModel):
     )
     device: Optional[EnrichmentsDeviceEnum] = Field(
         default=EnrichmentsDeviceEnum.CPU,
-        title="The device used for license plate recognition.",
+        title="The device used for audio transcription.",
     )
     model_size: str = Field(
         default="small", title="The size of the embeddings model used."
     )
-    enabled_in_config: Optional[bool] = Field(
-        default=None, title="Keep track of original state of camera."
-    )
     live_enabled: Optional[bool] = Field(
         default=False, title="Enable live transcriptions."
     )
@@ -304,3 +302,15 @@ class CameraLicensePlateRecognitionConfig(FrigateBaseModel):
     )
 
     model_config = ConfigDict(extra="forbid", protected_namespaces=())
+
+
+class CameraAudioTranscriptionConfig(FrigateBaseModel):
+    enabled: bool = Field(default=False, title="Enable audio transcription.")
+    enabled_in_config: Optional[bool] = Field(
+        default=None, title="Keep track of original state of audio transcription."
+    )
+    live_enabled: Optional[bool] = Field(
+        default=False, title="Enable live transcriptions."
+    )
+
+    model_config = ConfigDict(extra="forbid", protected_namespaces=())
diff --git a/frigate/data_processing/real_time/whisper_online.py b/frigate/data_processing/real_time/whisper_online.py
index 9b81d7fbe..024b19fba 100644
--- a/frigate/data_processing/real_time/whisper_online.py
+++ b/frigate/data_processing/real_time/whisper_online.py
@@ -141,6 +141,8 @@ class FasterWhisperASR(ASRBase):
     def transcribe(self, audio, init_prompt=""):
         from faster_whisper import BatchedInferencePipeline
 
+        logging.getLogger("faster_whisper").setLevel(logging.WARNING)
+
         # tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01)
         batched_model = BatchedInferencePipeline(model=self.model)
         segments, info = batched_model.transcribe(
diff --git a/frigate/embeddings/maintainer.py b/frigate/embeddings/maintainer.py
index ca2fa9534..55e3d57ba 100644
--- a/frigate/embeddings/maintainer.py
+++ b/frigate/embeddings/maintainer.py
@@ -215,7 +215,7 @@ class EmbeddingMaintainer(threading.Thread):
             )
         )
 
-        if any(
+        if self.config.audio_transcription.enabled and any(
             c.enabled_in_config and c.audio_transcription.enabled
             for c in self.config.cameras.values()
         ):
diff --git a/frigate/events/audio.py b/frigate/events/audio.py
index 31b9a7f3c..1aa227719 100644
--- a/frigate/events/audio.py
+++ b/frigate/events/audio.py
@@ -95,6 +95,12 @@ class AudioProcessor(FrigateProcess):
         self.cameras = cameras
         self.config = config
 
+    def run(self) -> None:
+        self.pre_run_setup(self.config.logger)
+        audio_threads: list[AudioEventMaintainer] = []
+
+        threading.current_thread().name = "process:audio_manager"
+
         if self.config.audio_transcription.enabled:
             self.transcription_model_runner = AudioTranscriptionModelRunner(
                 self.config.audio_transcription.device,
@@ -103,12 +109,6 @@
             )
         else:
             self.transcription_model_runner = None
 
-    def run(self) -> None:
-        self.pre_run_setup(self.config.logger)
-        audio_threads: list[AudioEventMaintainer] = []
-
-        threading.current_thread().name = "process:audio_manager"
-
         if len(self.cameras) == 0:
             return
@@ -180,7 +180,7 @@
         )
         self.detection_publisher = DetectionPublisher(DetectionTypeEnum.audio.value)
 
-        if self.camera_config.audio_transcription.enabled_in_config:
+        if self.config.audio_transcription.enabled:
             # init the transcription processor for this camera
             self.transcription_processor = AudioTranscriptionRealTimeProcessor(
                 config=self.config,
diff --git a/frigate/timeline.py b/frigate/timeline.py
index 4e3c8e293..4c3d0d457 100644
--- a/frigate/timeline.py
+++ b/frigate/timeline.py
@@ -156,7 +156,7 @@ class TimelineProcessor(threading.Thread):
         event_type: str,
         event_data: dict[Any, Any],
     ) -> bool:
-        if event_type != "new":
+        if event_type != "start":
             return False
 
         if event_data.get("type", "api") == "audio":