Mirror of https://github.com/blakeblackshear/frigate.git (synced 2025-12-06 13:34:13 +03:00)
Fix audio transcription (#20395)
* camera level config
* set up model runner on thread start to avoid unpickling error
* ensure feature is enabled globally
* suppress info logs from faster_whisper
* fix incorrect event_type for api and audio timeline entries
* docs
* fix
* clean up
This commit is contained in:
Parent: c71e235b38
Commit: c61bb8f8ae
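One item above, "set up model runner on thread start to avoid unpickling error", reflects a general multiprocessing constraint: objects that hold non-picklable state (loaded models, locks, open handles) should be created after the worker starts rather than in `__init__`, because the spawning machinery may need to pickle the process object. The sketch below illustrates only that pattern; `DummyModelRunner` and `AudioProcessSketch` are hypothetical stand-ins, not Frigate's classes.

```python
import multiprocessing as mp
import threading


class DummyModelRunner:
    """Hypothetical stand-in for a heavy, non-picklable model wrapper."""

    def __init__(self, device: str) -> None:
        self.device = device
        self.lock = threading.Lock()  # locks are not picklable


class AudioProcessSketch(mp.Process):
    def __init__(self, device: str) -> None:
        super().__init__(name="audio_process_sketch")
        # Keep only plain, picklable configuration on the instance here.
        self.device = device
        self.model_runner = None

    def run(self) -> None:
        # Construct the heavy object inside the child process, after start(),
        # so it never has to travel through pickle when the process is spawned.
        self.model_runner = DummyModelRunner(self.device)
        print(f"model runner ready on {self.model_runner.device}")


if __name__ == "__main__":
    p = AudioProcessSketch("CPU")
    p.start()
    p.join()
```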
@@ -75,23 +75,23 @@ audio:
 
 ### Audio Transcription
 
-Frigate supports fully local audio transcription using either `sherpa-onnx` or OpenAI’s open-source Whisper models via `faster-whisper`. To enable transcription, it is recommended to only configure the features at the global level, and enable it at the individual camera level.
+Frigate supports fully local audio transcription using either `sherpa-onnx` or OpenAI’s open-source Whisper models via `faster-whisper`. To enable transcription, enable it in your config. Note that audio detection must also be enabled as described above in order to use audio transcription features.
 
 ```yaml
 audio_transcription:
-  enabled: False
+  enabled: True
   device: ...
   model_size: ...
 ```
 
-Enable audio transcription for select cameras at the camera level:
+Disable audio transcription for select cameras at the camera level:
 
 ```yaml
 cameras:
   back_yard:
     ...
     audio_transcription:
-      enabled: True
+      enabled: False
 ```
 
 :::note
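Taken together, the updated documentation describes a setup where transcription is switched on once at the global level and switched off per camera where it is not wanted. A combined sketch of that layout (camera names are placeholders) could look like:

```yaml
audio_transcription:
  enabled: True

cameras:
  back_yard:
    audio_transcription:
      enabled: False  # opt this camera out
  front_door:
    ...  # inherits the global setting
```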
@@ -111,7 +111,6 @@ The optional config parameters that can be set at the global level include:
 - **`model_size`**: The size of the model used for live transcription.
   - Default: `small`
   - This can be `small` or `large`. The `small` setting uses `sherpa-onnx` models that are fast, lightweight, and always run on the CPU but are not as accurate as the `whisper` model.
-  - The
   - This config option applies to **live transcription only**. Recorded `speech` events will always use a different `whisper` model (and can be accelerated for CUDA hardware if available with `device: GPU`).
 - **`language`**: Defines the language used by `whisper` to translate `speech` audio events (and live audio only if using the `large` model).
   - Default: `en`
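For context, the optional global parameters covered by this hunk can be combined in one block; the values below are illustrative choices, not defaults:

```yaml
audio_transcription:
  enabled: True
  model_size: large  # `small` uses sherpa-onnx on CPU; `large` uses whisper
  device: GPU        # recorded `speech` events can use CUDA acceleration
  language: en       # language whisper uses for `speech` events
```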
@@ -19,7 +19,7 @@ from frigate.util.builtin import (
 
 from ..base import FrigateBaseModel
 from ..classification import (
-    AudioTranscriptionConfig,
+    CameraAudioTranscriptionConfig,
    CameraFaceRecognitionConfig,
    CameraLicensePlateRecognitionConfig,
    CameraSemanticSearchConfig,
@@ -69,8 +69,9 @@ class CameraConfig(FrigateBaseModel):
     audio: AudioConfig = Field(
         default_factory=AudioConfig, title="Audio events configuration."
     )
-    audio_transcription: AudioTranscriptionConfig = Field(
-        default_factory=AudioTranscriptionConfig, title="Audio transcription config."
+    audio_transcription: CameraAudioTranscriptionConfig = Field(
+        default_factory=CameraAudioTranscriptionConfig,
+        title="Audio transcription config.",
     )
     birdseye: BirdseyeCameraConfig = Field(
         default_factory=BirdseyeCameraConfig, title="Birdseye camera configuration."
@@ -8,6 +8,7 @@ from .base import FrigateBaseModel
 __all__ = [
     "CameraFaceRecognitionConfig",
     "CameraLicensePlateRecognitionConfig",
+    "CameraAudioTranscriptionConfig",
     "FaceRecognitionConfig",
     "SemanticSearchConfig",
     "CameraSemanticSearchConfig",
@@ -47,14 +48,11 @@ class AudioTranscriptionConfig(FrigateBaseModel):
     )
     device: Optional[EnrichmentsDeviceEnum] = Field(
         default=EnrichmentsDeviceEnum.CPU,
-        title="The device used for license plate recognition.",
+        title="The device used for audio transcription.",
     )
     model_size: str = Field(
         default="small", title="The size of the embeddings model used."
     )
-    enabled_in_config: Optional[bool] = Field(
-        default=None, title="Keep track of original state of camera."
-    )
     live_enabled: Optional[bool] = Field(
         default=False, title="Enable live transcriptions."
     )
@@ -304,3 +302,15 @@ class CameraLicensePlateRecognitionConfig(FrigateBaseModel):
     )
 
     model_config = ConfigDict(extra="forbid", protected_namespaces=())
+
+
+class CameraAudioTranscriptionConfig(FrigateBaseModel):
+    enabled: bool = Field(default=False, title="Enable audio transcription.")
+    enabled_in_config: Optional[bool] = Field(
+        default=None, title="Keep track of original state of audio transcription."
+    )
+    live_enabled: Optional[bool] = Field(
+        default=False, title="Enable live transcriptions."
+    )
+
+    model_config = ConfigDict(extra="forbid", protected_namespaces=())
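The hunk above splits transcription settings into a global `AudioTranscriptionConfig` (device, model size, and so on) and a slimmer per-camera `CameraAudioTranscriptionConfig` that only tracks enablement. A simplified, self-contained sketch of that split, using plain Pydantic `BaseModel` as a stand-in for `FrigateBaseModel` (which is not shown here), might look like this:

```python
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field


class AudioTranscriptionConfig(BaseModel):
    """Global transcription settings (simplified sketch)."""

    enabled: bool = Field(default=False, title="Enable audio transcription.")
    # Frigate uses an EnrichmentsDeviceEnum here; a plain string keeps this self-contained.
    device: Optional[str] = Field(
        default="CPU", title="The device used for audio transcription."
    )
    model_size: str = Field(default="small", title="The size of the model used.")
    live_enabled: Optional[bool] = Field(
        default=False, title="Enable live transcriptions."
    )

    model_config = ConfigDict(extra="forbid", protected_namespaces=())


class CameraAudioTranscriptionConfig(BaseModel):
    """Per-camera override: only enablement, no device or model settings."""

    enabled: bool = Field(default=False, title="Enable audio transcription.")
    enabled_in_config: Optional[bool] = Field(
        default=None, title="Keep track of original state of audio transcription."
    )
    live_enabled: Optional[bool] = Field(
        default=False, title="Enable live transcriptions."
    )

    model_config = ConfigDict(extra="forbid", protected_namespaces=())


# The camera-level model rejects global-only keys such as `device`:
print(CameraAudioTranscriptionConfig(enabled=True).model_dump())
```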
@@ -141,6 +141,8 @@ class FasterWhisperASR(ASRBase):
     def transcribe(self, audio, init_prompt=""):
         from faster_whisper import BatchedInferencePipeline
 
+        logging.getLogger("faster_whisper").setLevel(logging.WARNING)
+
         # tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01)
         batched_model = BatchedInferencePipeline(model=self.model)
         segments, info = batched_model.transcribe(
@@ -215,7 +215,7 @@ class EmbeddingMaintainer(threading.Thread):
             )
         )
 
-        if any(
+        if self.config.audio_transcription.enabled and any(
             c.enabled_in_config and c.audio_transcription.enabled
             for c in self.config.cameras.values()
         ):
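This change adds a global-enable guard in front of the per-camera check, matching the "ensure feature is enabled globally" item in the commit message. The standalone sketch below shows the shape of that condition with made-up stand-in objects, not Frigate's real config classes:

```python
from dataclasses import dataclass


@dataclass
class _Transcription:
    enabled: bool


@dataclass
class _Camera:
    enabled_in_config: bool
    audio_transcription: _Transcription


def transcription_needed(global_enabled: bool, cameras: dict[str, _Camera]) -> bool:
    # The feature must be switched on globally AND at least one enabled camera
    # must have transcription turned on.
    return global_enabled and any(
        c.enabled_in_config and c.audio_transcription.enabled
        for c in cameras.values()
    )


cameras = {
    "back_yard": _Camera(True, _Transcription(True)),
    "garage": _Camera(True, _Transcription(False)),
}
print(transcription_needed(True, cameras))   # True
print(transcription_needed(False, cameras))  # False: disabled globally
```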
@@ -95,6 +95,12 @@ class AudioProcessor(FrigateProcess):
         self.cameras = cameras
         self.config = config
 
+    def run(self) -> None:
+        self.pre_run_setup(self.config.logger)
+        audio_threads: list[AudioEventMaintainer] = []
+
+        threading.current_thread().name = "process:audio_manager"
+
         if self.config.audio_transcription.enabled:
             self.transcription_model_runner = AudioTranscriptionModelRunner(
                 self.config.audio_transcription.device,
@@ -103,12 +109,6 @@ class AudioProcessor(FrigateProcess):
         else:
             self.transcription_model_runner = None
 
-    def run(self) -> None:
-        self.pre_run_setup(self.config.logger)
-        audio_threads: list[AudioEventMaintainer] = []
-
-        threading.current_thread().name = "process:audio_manager"
-
         if len(self.cameras) == 0:
             return
 
@@ -180,7 +180,7 @@ class AudioEventMaintainer(threading.Thread):
         )
         self.detection_publisher = DetectionPublisher(DetectionTypeEnum.audio.value)
 
-        if self.camera_config.audio_transcription.enabled_in_config:
+        if self.config.audio_transcription.enabled:
             # init the transcription processor for this camera
             self.transcription_processor = AudioTranscriptionRealTimeProcessor(
                 config=self.config,
@@ -156,7 +156,7 @@ class TimelineProcessor(threading.Thread):
         event_type: str,
         event_data: dict[Any, Any],
     ) -> bool:
-        if event_type != "new":
+        if event_type != "start":
             return False
 
         if event_data.get("type", "api") == "audio":