diff --git a/docs/docs/configuration/audio_detectors.md b/docs/docs/configuration/audio_detectors.md
index 71ad5c335..bf71f8d81 100644
--- a/docs/docs/configuration/audio_detectors.md
+++ b/docs/docs/configuration/audio_detectors.md
@@ -75,23 +75,23 @@ audio:
 
 ### Audio Transcription
 
-Frigate supports fully local audio transcription using either `sherpa-onnx` or OpenAI’s open-source Whisper models via `faster-whisper`. To enable transcription, it is recommended to only configure the features at the global level, and enable it at the individual camera level.
+Frigate supports fully local audio transcription using either `sherpa-onnx` or OpenAI’s open-source Whisper models via `faster-whisper`. To enable transcription, enable it in your config. Note that audio detection must also be enabled as described above in order to use audio transcription features.
 
 ```yaml
 audio_transcription:
-  enabled: False
+  enabled: True
   device: ...
   model_size: ...
 ```
 
-Enable audio transcription for select cameras at the camera level:
+Disable audio transcription for select cameras at the camera level:
 
 ```yaml
 cameras:
   back_yard:
     ...
     audio_transcription:
-      enabled: True
+      enabled: False
 ```
 
 :::note
@@ -111,7 +111,6 @@ The optional config parameters that can be set at the global level include:
 - **`model_size`**: The size of the model used for live transcription.
   - Default: `small`
   - This can be `small` or `large`. The `small` setting uses `sherpa-onnx` models that are fast, lightweight, and always run on the CPU but are not as accurate as the `whisper` model.
-  - The
   - This config option applies to **live transcription only**. Recorded `speech` events will always use a different `whisper` model (and can be accelerated for CUDA hardware if available with `device: GPU`).
 - **`language`**: Defines the language used by `whisper` to translate `speech` audio events (and live audio only if using the `large` model).
   - Default: `en`
diff --git a/frigate/config/camera/camera.py b/frigate/config/camera/camera.py
index 68f874138..821b2a16d 100644
--- a/frigate/config/camera/camera.py
+++ b/frigate/config/camera/camera.py
@@ -19,7 +19,7 @@ from frigate.util.builtin import (
 
 from ..base import FrigateBaseModel
 from ..classification import (
-    AudioTranscriptionConfig,
+    CameraAudioTranscriptionConfig,
     CameraFaceRecognitionConfig,
     CameraLicensePlateRecognitionConfig,
     CameraSemanticSearchConfig,
@@ -69,8 +69,9 @@ class CameraConfig(FrigateBaseModel):
     audio: AudioConfig = Field(
         default_factory=AudioConfig, title="Audio events configuration."
     )
-    audio_transcription: AudioTranscriptionConfig = Field(
-        default_factory=AudioTranscriptionConfig, title="Audio transcription config."
+    audio_transcription: CameraAudioTranscriptionConfig = Field(
+        default_factory=CameraAudioTranscriptionConfig,
+        title="Audio transcription config.",
     )
     birdseye: BirdseyeCameraConfig = Field(
         default_factory=BirdseyeCameraConfig, title="Birdseye camera configuration."
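For quick reference, here is a sketch of how the reworked options from the documentation hunks above might look in a full config: transcription is switched on once at the global level and opted out per camera. The values and the `back_yard` camera name are illustrative only, taken from the docs examples; `device`, `model_size`, and `language` are the global-level options described there.

```yaml
# Illustrative sketch only — not an addition to the docs file.
audio_transcription:
  enabled: True
  device: GPU        # recorded `speech` events can use CUDA acceleration if available
  model_size: small  # live transcription: `small` (sherpa-onnx, CPU) or `large` (whisper)
  language: en       # language used by `whisper` for `speech` events

cameras:
  back_yard:
    # ... other camera settings ...
    audio_transcription:
      enabled: False # opt this camera out while the feature stays enabled globally
```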
diff --git a/frigate/config/classification.py b/frigate/config/classification.py
index 5cc07d28a..56126e4d4 100644
--- a/frigate/config/classification.py
+++ b/frigate/config/classification.py
@@ -8,6 +8,7 @@ from .base import FrigateBaseModel
 __all__ = [
     "CameraFaceRecognitionConfig",
     "CameraLicensePlateRecognitionConfig",
+    "CameraAudioTranscriptionConfig",
     "FaceRecognitionConfig",
     "SemanticSearchConfig",
     "CameraSemanticSearchConfig",
@@ -47,14 +48,11 @@ class AudioTranscriptionConfig(FrigateBaseModel):
     )
     device: Optional[EnrichmentsDeviceEnum] = Field(
         default=EnrichmentsDeviceEnum.CPU,
-        title="The device used for license plate recognition.",
+        title="The device used for audio transcription.",
     )
     model_size: str = Field(
         default="small", title="The size of the embeddings model used."
     )
-    enabled_in_config: Optional[bool] = Field(
-        default=None, title="Keep track of original state of camera."
-    )
     live_enabled: Optional[bool] = Field(
         default=False, title="Enable live transcriptions."
     )
@@ -304,3 +302,15 @@ class CameraLicensePlateRecognitionConfig(FrigateBaseModel):
     )
 
     model_config = ConfigDict(extra="forbid", protected_namespaces=())
+
+
+class CameraAudioTranscriptionConfig(FrigateBaseModel):
+    enabled: bool = Field(default=False, title="Enable audio transcription.")
+    enabled_in_config: Optional[bool] = Field(
+        default=None, title="Keep track of original state of audio transcription."
+    )
+    live_enabled: Optional[bool] = Field(
+        default=False, title="Enable live transcriptions."
+    )
+
+    model_config = ConfigDict(extra="forbid", protected_namespaces=())
diff --git a/frigate/data_processing/real_time/whisper_online.py b/frigate/data_processing/real_time/whisper_online.py
index 9b81d7fbe..024b19fba 100644
--- a/frigate/data_processing/real_time/whisper_online.py
+++ b/frigate/data_processing/real_time/whisper_online.py
@@ -141,6 +141,8 @@ class FasterWhisperASR(ASRBase):
     def transcribe(self, audio, init_prompt=""):
         from faster_whisper import BatchedInferencePipeline
 
+        logging.getLogger("faster_whisper").setLevel(logging.WARNING)
+
         # tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01)
         batched_model = BatchedInferencePipeline(model=self.model)
         segments, info = batched_model.transcribe(
diff --git a/frigate/embeddings/maintainer.py b/frigate/embeddings/maintainer.py
index ca2fa9534..55e3d57ba 100644
--- a/frigate/embeddings/maintainer.py
+++ b/frigate/embeddings/maintainer.py
@@ -215,7 +215,7 @@ class EmbeddingMaintainer(threading.Thread):
             )
         )
 
-        if any(
+        if self.config.audio_transcription.enabled and any(
             c.enabled_in_config and c.audio_transcription.enabled
             for c in self.config.cameras.values()
         ):
diff --git a/frigate/events/audio.py b/frigate/events/audio.py
index 31b9a7f3c..1aa227719 100644
--- a/frigate/events/audio.py
+++ b/frigate/events/audio.py
@@ -95,6 +95,12 @@ class AudioProcessor(FrigateProcess):
         self.cameras = cameras
         self.config = config
 
+    def run(self) -> None:
+        self.pre_run_setup(self.config.logger)
+        audio_threads: list[AudioEventMaintainer] = []
+
+        threading.current_thread().name = "process:audio_manager"
+
         if self.config.audio_transcription.enabled:
             self.transcription_model_runner = AudioTranscriptionModelRunner(
                 self.config.audio_transcription.device,
@@ -103,12 +109,6 @@
             )
         else:
             self.transcription_model_runner = None
 
-    def run(self) -> None:
-        self.pre_run_setup(self.config.logger)
-        audio_threads: list[AudioEventMaintainer] = []
-
-        threading.current_thread().name = "process:audio_manager"
-
         if len(self.cameras) == 0:
             return
@@ -180,7 +180,7 @@
         )
         self.detection_publisher = DetectionPublisher(DetectionTypeEnum.audio.value)
 
-        if self.camera_config.audio_transcription.enabled_in_config:
+        if self.config.audio_transcription.enabled:
             # init the transcription processor for this camera
             self.transcription_processor = AudioTranscriptionRealTimeProcessor(
                 config=self.config,
diff --git a/frigate/timeline.py b/frigate/timeline.py
index 4e3c8e293..4c3d0d457 100644
--- a/frigate/timeline.py
+++ b/frigate/timeline.py
@@ -156,7 +156,7 @@ class TimelineProcessor(threading.Thread):
         event_type: str,
         event_data: dict[Any, Any],
     ) -> bool:
-        if event_type != "new":
+        if event_type != "start":
             return False
 
         if event_data.get("type", "api") == "audio":