initial implementation of Google Gemini captions

Jason Hunter 2023-12-14 03:11:14 -05:00
parent e390533760
commit af7cfee82d
6 changed files with 74 additions and 11 deletions

View File

@@ -15,13 +15,13 @@ services:
# Use target devcontainer-trt for TensorRT dev
target: devcontainer
## Uncomment this block for nvidia gpu support
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# count: 1
# capabilities: [gpu]
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
environment:
YOLO_MODELS: yolov7-320
devices:

View File

@@ -27,3 +27,5 @@ unidecode == 1.3.*
# Openvino Library - Custom built with MYRIAD support
openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.3.1/openvino-2022.3.1-1-cp39-cp39-manylinux_2_31_x86_64.whl; platform_machine == 'x86_64'
openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.3.1/openvino-2022.3.1-1-cp39-cp39-linux_aarch64.whl; platform_machine == 'aarch64'
# Google Generative AI
google-generativeai == 0.3.*
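
The wheel manifest pins google-generativeai at 0.3.x. The frigate/gemini.py module that consumes it is imported below but not shown in this commit view, so the call pattern here is only a sketch based on the 0.3 SDK and the config defaults added later in the commit; the thumbnail handling is an assumption.

import io

import google.generativeai as genai
from PIL import Image

# Configure the SDK once with the key from GeminiConfig.api_key.
genai.configure(api_key="<api key from GeminiConfig>")

# GeminiConfig.model defaults to the multimodal "gemini-pro-vision".
model = genai.GenerativeModel("gemini-pro-vision")

def caption_thumbnail(jpeg_bytes: bytes, prompt: str) -> str:
    # generate_content accepts a mixed list of text and PIL images.
    image = Image.open(io.BytesIO(jpeg_bytes))
    response = model.generate_content([prompt, image])
    return response.text.strip()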

View File

@@ -35,6 +35,7 @@ from frigate.events.audio import listen_to_audio
from frigate.events.cleanup import EventCleanup
from frigate.events.external import ExternalEventProcessor
from frigate.events.maintainer import EventProcessor
from frigate.gemini import GeminiProcessor
from frigate.http import create_app
from frigate.log import log_process, root_configurer
from frigate.models import Event, Recordings, RecordingsToDelete, Regions, Timeline
@@ -266,6 +267,9 @@ class FrigateApp:
# Queue for timeline events
self.timeline_queue: Queue = mp.Queue()
# Queue for Google Gemini events
self.gemini_queue: Queue = mp.Queue()
# Queue for inter process communication
self.inter_process_queue: Queue = mp.Queue()
@@ -576,6 +580,12 @@ class FrigateApp:
)
self.timeline_processor.start()
def start_gemini_processor(self) -> None:
self.gemini_processor = GeminiProcessor(
self.config, self.gemini_queue, self.stop_event
)
self.gemini_processor.start()
def start_event_processor(self) -> None:
self.event_processor = EventProcessor(
self.config,
@@ -583,6 +593,7 @@ class FrigateApp:
self.event_queue,
self.event_processed_queue,
self.timeline_queue,
self.gemini_queue,
self.stop_event,
)
self.event_processor.start()
@@ -692,6 +703,7 @@ class FrigateApp:
self.init_external_event_processor()
self.init_web_server()
self.start_timeline_processor()
self.start_gemini_processor()
self.start_event_processor()
self.start_event_cleanup()
self.start_record_cleanup()
@@ -734,6 +746,7 @@ class FrigateApp:
self.record_cleanup.join()
self.stats_emitter.join()
self.frigate_watchdog.join()
self.gemini_processor.join()
self.db.stop()
while len(self.detection_shms) > 0:
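
frigate/gemini.py itself is not part of this commit view, so the shape of GeminiProcessor has to be inferred from the wiring above: it is constructed with the config, the new gemini_queue, and the shared stop_event, started alongside the other processors, and joined on shutdown. A minimal sketch of such a consumer thread, with everything beyond the visible constructor arguments assumed:

import queue
import threading
from multiprocessing import Queue
from multiprocessing.synchronize import Event as MpEvent

class GeminiProcessor(threading.Thread):
    # Drains the (camera, event_data) tuples queued by EventProcessor.
    def __init__(self, config, gemini_queue: Queue, stop_event: MpEvent) -> None:
        super().__init__(name="gemini_processor")
        self.config = config
        self.gemini_queue = gemini_queue
        self.stop_event = stop_event

    def run(self) -> None:
        while not self.stop_event.is_set():
            try:
                camera, event_data = self.gemini_queue.get(timeout=1)
            except queue.Empty:
                continue
            # Hypothetical step: caption the event thumbnail with Gemini
            # and write the result back as the event's sub label.
            ...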

View File

@@ -382,6 +382,26 @@ class DetectConfig(FrigateBaseModel):
)
class GeminiConfig(FrigateBaseModel):
enabled: bool = Field(default=False, title="Enable Google Gemini captioning.")
override_existing: bool = Field(
default=False, title="Override existing sub labels."
)
api_key: str = Field(default="", title="Google AI Studio API Key.")
model: str = Field(default="gemini-pro-vision", title="Google AI Studio Model.")
prompt: str = Field(
default="Caption this image with as much detail as possible. Make sure the response is under 90 characters.",
title="Default caption prompt.",
)
object_prompts: Dict[str, str] = Field(
default={
"person": "Describe the main person in the image (gender, age, clothing, activity, etc). Do not include where the activity is occurring (sidewalk, concrete, driveway, etc). If delivering a package, include the company the package is from. Make sure the response is under 90 characters.",
"car": "Label the primary vehicle in the image with just the name of the company if it is a delivery vehicle, or the color make and model.",
},
title="Object specific prompts.",
)
class FilterConfig(FrigateBaseModel):
min_area: int = Field(
default=0, title="Minimum area of bounding box for object to be counted."
@@ -780,6 +800,9 @@ class CameraConfig(FrigateBaseModel):
onvif: OnvifConfig = Field(
default_factory=OnvifConfig, title="Camera Onvif Configuration."
)
gemini: GeminiConfig = Field(
default_factory=GeminiConfig, title="Google Gemini Configuration."
)
ui: CameraUiConfig = Field(
default_factory=CameraUiConfig, title="Camera UI Modifications."
)
@@ -1092,6 +1115,9 @@ class FrigateConfig(FrigateBaseModel):
detect: DetectConfig = Field(
default_factory=DetectConfig, title="Global object tracking configuration."
)
gemini: GeminiConfig = Field(
default_factory=GeminiConfig, title="Global Google Gemini Configuration."
)
cameras: Dict[str, CameraConfig] = Field(title="Camera configuration.")
timestamp_style: TimestampStyleConfig = Field(
default_factory=TimestampStyleConfig,
@@ -1107,6 +1133,10 @@ class FrigateConfig(FrigateBaseModel):
config.mqtt.user = config.mqtt.user.format(**FRIGATE_ENV_VARS)
config.mqtt.password = config.mqtt.password.format(**FRIGATE_ENV_VARS)
# Gemini API Key substitutions
if config.gemini.api_key:
config.gemini.api_key = config.gemini.api_key.format(**FRIGATE_ENV_VARS)
# set default min_score for object attributes
for attribute in ALL_ATTRIBUTE_LABELS:
if not config.objects.filters.get(attribute):
@@ -1128,6 +1158,7 @@ class FrigateConfig(FrigateBaseModel):
"detect": ...,
"ffmpeg": ...,
"timestamp_style": ...,
"gemini": ...,
},
exclude_unset=True,
)
@@ -1194,6 +1225,13 @@ class FrigateConfig(FrigateBaseModel):
camera_config.onvif.password = camera_config.onvif.password.format(
**FRIGATE_ENV_VARS
)
# Gemini substitution
if camera_config.gemini.api_key:
camera_config.gemini.api_key = camera_config.gemini.api_key.format(
**FRIGATE_ENV_VARS
)
# set config pre-value
camera_config.record.enabled_in_config = camera_config.record.enabled
camera_config.audio.enabled_in_config = camera_config.audio.enabled
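
Two behaviors fall out of the config changes. First, api_key participates in the same FRIGATE_ENV_VARS substitution as the MQTT and ONVIF credentials, at both the global and the per-camera level, so the key can stay out of the config file (for example api_key: "{FRIGATE_GEMINI_API_KEY}"; the variable name is illustrative). Second, object_prompts overrides the generic prompt per label. The lookup a consumer would plausibly perform, with a hypothetical helper name:

def get_prompt(gemini_config, label: str) -> str:
    # Prefer a label-specific prompt, fall back to the default caption prompt.
    return gemini_config.object_prompts.get(label, gemini_config.prompt)

# get_prompt(camera_config.gemini, "person") returns the person prompt above,
# while an unlisted label such as "dog" falls back to GeminiConfig.prompt.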

View File

@@ -32,6 +32,7 @@ def should_update_db(prev_event: Event, current_event: Event) -> bool:
or prev_event["entered_zones"] != current_event["entered_zones"]
or prev_event["thumbnail"] != current_event["thumbnail"]
or prev_event["end_time"] != current_event["end_time"]
or prev_event["sub_label"] != current_event["sub_label"]
):
return True
return False
@@ -56,6 +57,7 @@ class EventProcessor(threading.Thread):
event_queue: Queue,
event_processed_queue: Queue,
timeline_queue: Queue,
gemini_queue: Queue,
stop_event: MpEvent,
):
threading.Thread.__init__(self)
@@ -65,6 +67,7 @@ class EventProcessor(threading.Thread):
self.event_queue = event_queue
self.event_processed_queue = event_processed_queue
self.timeline_queue = timeline_queue
self.gemini_queue = gemini_queue
self.events_in_process: Dict[str, Event] = {}
self.stop_event = stop_event
@@ -102,6 +105,14 @@ class EventProcessor(threading.Thread):
continue
self.handle_object_detection(event_type, camera, event_data)
if event_type == "end" and self.config.cameras[camera].gemini.enabled:
self.gemini_queue.put(
(
camera,
event_data,
)
)
elif source_type == EventTypeEnum.api:
self.handle_external_detection(event_type, event_data)
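
The producer side stays thin: only "end" events on cameras with Gemini enabled are enqueued, as a (camera, event_data) tuple, and the new sub_label comparison in should_update_db ensures that a caption written back later actually reaches the database. How override_existing is honored is not visible in this diff; presumably the consumer gates on it, roughly:

def should_caption(camera_config, event_data: dict) -> bool:
    # Assumed consumer-side check for GeminiConfig.override_existing.
    if not camera_config.gemini.enabled:
        return False
    # Keep an existing sub label (e.g. one set through the API) unless the
    # config explicitly allows overriding it.
    if event_data.get("sub_label") and not camera_config.gemini.override_existing:
        return False
    return True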

View File

@@ -797,11 +797,10 @@ function Event({
</div>
<div className="m-2 flex grow">
<div className="flex flex-col grow">
<div className="capitalize text-lg font-bold">
{event.label.replaceAll('_', ' ')}
{event.sub_label ? `: ${event.sub_label.replaceAll('_', ' ')}` : null}
<div className="capitalize text-lg font-bold">{event.label.replaceAll('_', ' ')}</div>
<div className="text-sm flex pb-2">
{event.sub_label ? `${event.sub_label.replaceAll('_', ' ')}` : null}
</div>
<div className="text-sm flex">
<Clock className="h-5 w-5 mr-2 inline" />
{formatUnixTimestampToDateTime(event.start_time, { ...config.ui })}