diff --git a/frigate/detectors/detection_runners.py b/frigate/detectors/detection_runners.py
index 277b1c542..e397d73fb 100644
--- a/frigate/detectors/detection_runners.py
+++ b/frigate/detectors/detection_runners.py
@@ -625,10 +625,11 @@ def get_optimized_runner(
                 ),
                 cuda_graph_options["device_id"],
             )
-        except Exception:
+        except Exception as e:
             logger.warning(
-                "CUDA graph capture failed for %s, falling back to standard ONNX runner",
+                "CUDA graph capture failed for %s, falling back to standard ONNX runner: %s",
                 model_path,
+                e,
             )

     if (
diff --git a/frigate/test/test_detection_runners.py b/frigate/test/test_detection_runners.py
index c914b9bdc..e8079f1c6 100644
--- a/frigate/test/test_detection_runners.py
+++ b/frigate/test/test_detection_runners.py
@@ -100,52 +100,82 @@ class TestHasVariableLengthInputs(unittest.TestCase):

 class TestComputeCudaMemLimit(unittest.TestCase):
+    @staticmethod
+    def _fake_mem_get_info(free_value: int, total_value: int):
+        def _impl(free_ptr, total_ptr):
+            free_ptr._obj.value = free_value
+            total_ptr._obj.value = total_value
+            return 0  # cudaSuccess
+
+        return _impl
+
     @patch("frigate.util.model.ctypes.CDLL")
-    @patch("os.path.getsize", return_value=200 * 1024 * 1024)  # 200 MB model
-    def test_respects_ceiling(self, mock_getsize, mock_cdll):
+    @patch("os.path.getsize", return_value=4 * 1024**3)  # 4 GB model: 28 GB floor
+    def test_respects_ceiling(self, _mock_getsize, mock_cdll):
         """gpu_mem_limit must not exceed 80% of total VRAM."""
         from frigate.util.model import compute_cuda_mem_limit

         total_vram = 24 * 1024**3  # 24 GB
         mock_lib = MagicMock()
         mock_cdll.return_value = mock_lib
-
-        def fake_mem_get_info(free_ptr, total_ptr):
-            total_ptr._obj.value = total_vram
-            free_ptr._obj.value = total_vram
-
-        mock_lib.cudaMemGetInfo.side_effect = fake_mem_get_info
+        mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
+            total_vram, total_vram
+        )

         limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
-        self.assertLessEqual(limit, int(total_vram * 0.80))
+        # With a 28 GB floor the 80% ceiling must bind exactly; a small model
+        # leaves the ceiling untested because the floor always wins the min().
+        self.assertEqual(limit, int(total_vram * 0.80))

     @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
     def test_fallback_on_cuda_unavailable(self, _mock_cdll):
         """Falls back to 4 GB when CUDA runtime is not available."""
         from frigate.util.model import compute_cuda_mem_limit

         limit = compute_cuda_mem_limit("/fake/model.onnx")
         self.assertEqual(limit, 4 * 1024**3)

     @patch("frigate.util.model.ctypes.CDLL")
     @patch("os.path.getsize", return_value=50 * 1024 * 1024)  # 50 MB model
-    def test_floor_is_at_least_2gb(self, mock_getsize, mock_cdll):
+    def test_floor_is_at_least_2gb(self, _mock_getsize, mock_cdll):
         """Floor must be at least 2 GB regardless of model size."""
         from frigate.util.model import compute_cuda_mem_limit

         total_vram = 24 * 1024**3
         mock_lib = MagicMock()
         mock_cdll.return_value = mock_lib
-
-        def fake_mem_get_info(free_ptr, total_ptr):
-            total_ptr._obj.value = total_vram
-            free_ptr._obj.value = total_vram
-
-        mock_lib.cudaMemGetInfo.side_effect = fake_mem_get_info
+        mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
+            total_vram, total_vram
+        )

         limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
         self.assertGreaterEqual(limit, 2 * 1024**3)

+    @patch("frigate.util.model.ctypes.CDLL")
+    @patch("os.path.getsize", return_value=200 * 1024 * 1024)
+    def test_fallback_when_cuda_returns_error_code(self, _mock_getsize, mock_cdll):
+        # Bug #1: cudaMemGetInfo returning non-zero left both ptrs at 0,
+        # producing gpu_mem_limit=0 and immediate session OOM.
+        from frigate.util.model import compute_cuda_mem_limit
+
+        mock_lib = MagicMock()
+        mock_cdll.return_value = mock_lib
+        mock_lib.cudaMemGetInfo.return_value = 2  # cudaErrorMemoryAllocation
+
+        limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
+        self.assertEqual(limit, 4 * 1024**3)
+
+    @patch("frigate.util.model.ctypes.CDLL")
+    @patch("os.path.getsize", return_value=500 * 1024 * 1024)  # 3.5 GB floor
+    def test_capped_by_free_vram_when_constrained(self, _mock_getsize, mock_cdll):
+        # Bug #2: with 3 GB free of 24 GB total, the limit must honor the
+        # 90%-of-free cap (2.7 GB), not just the 80%-of-total ceiling;
+        # co-resident embedding sessions would OOM otherwise. The 500 MB
+        # model yields a 3.5 GB floor, so the free-VRAM cap must bind (a
+        # smaller model would pass even without the fix).
+        from frigate.util.model import compute_cuda_mem_limit
+
+        mock_lib = MagicMock()
+        mock_cdll.return_value = mock_lib
+        mock_lib.cudaMemGetInfo.side_effect = self._fake_mem_get_info(
+            3 * 1024**3, 24 * 1024**3
+        )
+
+        limit = compute_cuda_mem_limit("/fake/model.onnx", cuda_graph=False)
+        self.assertEqual(limit, int(3 * 1024**3 * 0.90))
+

 class TestOrtLeakFixRegression(unittest.TestCase):
     """Regression guards for the embeddings_manager ORT memory leak fix.

@@ -311,5 +341,43 @@ class TestOrtLeakFixRegression(unittest.TestCase):
         )


+class TestCudaGraphFallbackLogsException(unittest.TestCase):
+    @patch("frigate.detectors.detection_runners.ort.InferenceSession")
+    @patch(
+        "frigate.detectors.detection_runners.get_ort_providers",
+        return_value=(["CUDAExecutionProvider"], [{"device_id": 0}]),
+    )
+    @patch(
+        "frigate.detectors.detection_runners.is_rknn_compatible",
+        return_value=False,
+    )
+    @patch("frigate.util.model.ctypes.CDLL", side_effect=OSError("no cuda"))
+    @patch("os.path.getsize", return_value=200 * 1024 * 1024)
+    def test_fallback_warning_includes_exception_text(
+        self, _mock_getsize, _mock_cdll, _mock_rknn, _mock_providers, mock_session
+    ):
+        # Concern #1: the bare `except Exception:` swallowed the underlying
+        # ORT error (cudaErrorStreamCaptureUnsupported, missing libnvrtc, etc.),
+        # turning a debuggable failure into an opaque "fell back to ONNX runner".
+        from frigate.detectors.detection_runners import get_optimized_runner
+        from frigate.detectors.detector_config import ModelTypeEnum
+
+        # First session construction (graph capture) fails; the retry succeeds.
+        mock_session.side_effect = [
+            RuntimeError("cudaErrorStreamCaptureUnsupported"),
+            MagicMock(get_inputs=lambda: [], get_outputs=lambda: []),
+        ]
+
+        with self.assertLogs(
+            "frigate.detectors.detection_runners", level="WARNING"
+        ) as captured:
+            get_optimized_runner(
+                "/m/yolo.onnx", "GPU", ModelTypeEnum.yologeneric.value
+            )
+
+        joined = "\n".join(captured.output)
+        self.assertIn("CUDA graph capture failed", joined)
+        self.assertIn("cudaErrorStreamCaptureUnsupported", joined)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/frigate/util/model.py b/frigate/util/model.py
index d0b8721cc..9867115a3 100644
--- a/frigate/util/model.py
+++ b/frigate/util/model.py
@@ -294,23 +294,29 @@ def compute_cuda_mem_limit(model_path: str, cuda_graph: bool = False) -> int:

     Returns a limit derived from:
     - Floor: model file size × peak_multiplier (≥ 2 GB)
-    - Ceiling: 80% of total GPU VRAM
+    - Ceiling: min(80% of total VRAM, 90% of currently free VRAM)

     Falls back to 4 GB if the CUDA runtime query fails.
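+
+    Worked example (illustrative numbers): a 1 GB model with cuda_graph=False
+    gives floor = max(1 GB × 7, 2 GB) = 7 GB; on a 24 GB card with 3 GB free,
+    the result is min(7 GB, 0.80 × 24 GB, 0.90 × 3 GB) = 2.7 GB.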
""" try: libcudart = ctypes.CDLL("libcudart.so") free_bytes = ctypes.c_size_t() total_bytes = ctypes.c_size_t() - libcudart.cudaMemGetInfo(ctypes.byref(free_bytes), ctypes.byref(total_bytes)) + rc = libcudart.cudaMemGetInfo( + ctypes.byref(free_bytes), ctypes.byref(total_bytes) + ) + if rc != 0 or total_bytes.value == 0: + raise RuntimeError(f"cudaMemGetInfo rc={rc} total={total_bytes.value}") total = total_bytes.value - except Exception: - logger.debug("cudaMemGetInfo unavailable; using 4 GB gpu_mem_limit fallback") + free = free_bytes.value + except Exception as e: + logger.debug("cudaMemGetInfo unavailable (%s); using 4 GB gpu_mem_limit fallback", e) return 4 * 1024**3 peak_multiplier = 14 if cuda_graph else 7 floor = max(os.path.getsize(model_path) * peak_multiplier, 2 * 1024**3) - ceiling = int(total * 0.80) - return min(floor, ceiling) + # Honor free VRAM so co-resident embedding sessions (jina text + vision, + # paddleocr det + rec, arcface) don't OOM each other on shared GPUs. + return min(floor, int(total * 0.80), int(free * 0.90)) def get_ort_providers(