diff --git a/frigate/data_processing/real_time/audio_transcription.py b/frigate/data_processing/real_time/audio_transcription.py
index 36ae4f2e9..2e6d599eb 100644
--- a/frigate/data_processing/real_time/audio_transcription.py
+++ b/frigate/data_processing/real_time/audio_transcription.py
@@ -40,7 +40,7 @@ class AudioTranscriptionRealTimeProcessor(RealTimeProcessorApi):
         self.camera_config = camera_config
         self.requestor = requestor
         self.stream = None
-        self.whispermodel = None
+        self.whisper_model = None
         self.model_runner = model_runner
         self.transcription_segments = []
         self.audio_queue = queue.Queue()
@@ -51,6 +51,7 @@ class AudioTranscriptionRealTimeProcessor(RealTimeProcessorApi):
             if self.config.audio_transcription.model_size == "large":
                 # Whisper models need to be per-process and can only run one stream at a time
                 # TODO: try parallel: https://github.com/SYSTRAN/faster-whisper/issues/100
+                logger.debug(f"Loading Whisper model for {self.camera_config.name}")
                 self.whisper_model = FasterWhisperASR(
                     modelsize="tiny",
                     device="cuda"
@@ -64,6 +65,7 @@ class AudioTranscriptionRealTimeProcessor(RealTimeProcessorApi):
                     asr=self.whisper_model,
                 )
             else:
+                logger.debug(f"Loading sherpa stream for {self.camera_config.name}")
                 self.stream = self.model_runner.model.create_stream()
             logger.debug(
                 f"Audio transcription (live) initialized for {self.camera_config.name}"
@@ -101,7 +103,11 @@ class AudioTranscriptionRealTimeProcessor(RealTimeProcessorApi):
                 self.stream.insert_audio_chunk(audio_data)
                 output = self.stream.process_iter()
                 text = output[2].strip()
-                is_endpoint = text.endswith((".", "!", "?"))
+                is_endpoint = (
+                    text.endswith((".", "!", "?"))
+                    and sum(len(str(lines)) for lines in self.transcription_segments)
+                    > 300
+                )
 
                 if text:
                     self.transcription_segments.append(text)
@@ -153,10 +159,17 @@ class AudioTranscriptionRealTimeProcessor(RealTimeProcessorApi):
         logger.debug(
             f"Starting audio transcription thread for {self.camera_config.name}"
         )
+
+        # start with an empty transcription
+        self.requestor.send_data(
+            f"{self.camera_config.name}/audio/transcription",
+            "",
+        )
+
         while not self.stop_event.is_set():
             try:
                 # Get audio data from queue with a timeout to check stop_event
-                obj_data, audio = self.audio_queue.get(timeout=0.1)
+                _, audio = self.audio_queue.get(timeout=0.1)
                 result = self.__process_audio_stream(audio)
 
                 if not result:
@@ -172,7 +185,7 @@ class AudioTranscriptionRealTimeProcessor(RealTimeProcessorApi):
                 self.audio_queue.task_done()
 
                 if is_endpoint:
-                    self.reset(obj_data["camera"])
+                    self.reset()
 
             except queue.Empty:
                 continue
@@ -184,7 +197,16 @@ class AudioTranscriptionRealTimeProcessor(RealTimeProcessorApi):
             f"Stopping audio transcription thread for {self.camera_config.name}"
         )
 
-    def reset(self, camera: str) -> None:
+    def clear_audio_queue(self) -> None:
+        # Clear the audio queue
+        while not self.audio_queue.empty():
+            try:
+                self.audio_queue.get_nowait()
+                self.audio_queue.task_done()
+            except queue.Empty:
+                break
+
+    def reset(self) -> None:
         if self.config.audio_transcription.model_size == "large":
             # get final output from whisper
             output = self.stream.finish()
@@ -197,20 +219,41 @@ class AudioTranscriptionRealTimeProcessor(RealTimeProcessorApi):
 
             # reset whisper
             self.stream.init()
+            self.transcription_segments = []
         else:
             # reset sherpa
             self.model_runner.model.reset(self.stream)
 
-        # Clear the audio queue
-        while not self.audio_queue.empty():
-            try:
-                self.audio_queue.get_nowait()
-                self.audio_queue.task_done()
-            except queue.Empty:
-                break
-
         logger.debug("Stream reset")
 
+    def check_unload_model(self) -> None:
+        # regularly called in the loop in audio maintainer
+        if (
+            self.config.audio_transcription.model_size == "large"
+            and self.whisper_model is not None
+        ):
+            logger.debug(f"Unloading Whisper model for {self.camera_config.name}")
+            self.clear_audio_queue()
+            self.transcription_segments = []
+            self.stream = None
+            self.whisper_model = None
+
+            self.requestor.send_data(
+                f"{self.camera_config.name}/audio/transcription",
+                "",
+            )
+        if (
+            self.config.audio_transcription.model_size == "small"
+            and self.stream is not None
+        ):
+            logger.debug(f"Clearing sherpa stream for {self.camera_config.name}")
+            self.stream = None
+
+            self.requestor.send_data(
+                f"{self.camera_config.name}/audio/transcription",
+                "",
+            )
+
     def stop(self) -> None:
         """Stop the transcription thread and clean up."""
         self.stop_event.set()
diff --git a/frigate/events/audio.py b/frigate/events/audio.py
index 6447c1e8d..aeeaf3b4f 100644
--- a/frigate/events/audio.py
+++ b/frigate/events/audio.py
@@ -234,18 +234,18 @@ class AudioEventMaintainer(threading.Thread):
             )
 
         # run audio transcription
-        if self.transcription_processor is not None and (
-            self.camera_config.audio_transcription.live_enabled
-        ):
-            self.transcribing = True
-            # process audio until we've reached the endpoint
-            self.transcription_processor.process_audio(
-                {
-                    "id": f"{self.camera_config.name}_audio",
-                    "camera": self.camera_config.name,
-                },
-                audio,
-            )
+        if self.transcription_processor is not None:
+            if self.camera_config.audio_transcription.live_enabled:
+                # process audio until we've reached the endpoint
+                self.transcription_processor.process_audio(
+                    {
+                        "id": f"{self.camera_config.name}_audio",
+                        "camera": self.camera_config.name,
+                    },
+                    audio,
+                )
+            else:
+                self.transcription_processor.check_unload_model()
 
         self.expire_detections()
 
@@ -320,13 +320,6 @@ class AudioEventMaintainer(threading.Thread):
                 )
                 self.detections[detection["label"]] = None
 
-                # clear real-time transcription
-                if self.transcription_processor is not None:
-                    self.transcription_processor.reset(self.camera_config.name)
-                    self.requestor.send_data(
-                        f"{self.camera_config.name}/audio/transcription", ""
-                    )
-
     def expire_all_detections(self) -> None:
         """Immediately end all current detections"""
         now = datetime.datetime.now().timestamp()