initial implementation of Google Gemini captions

Jason Hunter 2023-12-14 03:11:14 -05:00
parent e390533760
commit af7cfee82d
6 changed files with 74 additions and 11 deletions

View File

@@ -15,13 +15,13 @@ services:
# Use target devcontainer-trt for TensorRT dev
target: devcontainer
## Uncomment this block for nvidia gpu support
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# count: 1
# capabilities: [gpu]
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
environment:
YOLO_MODELS: yolov7-320
devices:

View File

@@ -27,3 +27,5 @@ unidecode == 1.3.*
# Openvino Library - Custom built with MYRIAD support
openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.3.1/openvino-2022.3.1-1-cp39-cp39-manylinux_2_31_x86_64.whl; platform_machine == 'x86_64'
openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.3.1/openvino-2022.3.1-1-cp39-cp39-linux_aarch64.whl; platform_machine == 'aarch64'
# Google Generative AI
google-generativeai == 0.3.*
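
The wheel manifest pins google-generativeai at 0.3.x. The frigate/gemini.py module that consumes it is imported below but not shown in this commit view, so the call pattern here is only a sketch based on the 0.3 SDK and the config defaults added later in the commit; the thumbnail handling is an assumption.

import io

import google.generativeai as genai
from PIL import Image

# Configure the SDK once with the key from GeminiConfig.api_key.
genai.configure(api_key="<api key from GeminiConfig>")

# GeminiConfig.model defaults to the multimodal "gemini-pro-vision".
model = genai.GenerativeModel("gemini-pro-vision")

def caption_thumbnail(jpeg_bytes: bytes, prompt: str) -> str:
    # generate_content accepts a mixed list of text and PIL images.
    image = Image.open(io.BytesIO(jpeg_bytes))
    response = model.generate_content([prompt, image])
    return response.text.strip()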

View File

@@ -35,6 +35,7 @@ from frigate.events.audio import listen_to_audio
from frigate.events.cleanup import EventCleanup
from frigate.events.external import ExternalEventProcessor
from frigate.events.maintainer import EventProcessor
from frigate.gemini import GeminiProcessor
from frigate.http import create_app
from frigate.log import log_process, root_configurer
from frigate.models import Event, Recordings, RecordingsToDelete, Regions, Timeline
@@ -266,6 +267,9 @@ class FrigateApp:
# Queue for timeline events
self.timeline_queue: Queue = mp.Queue()
# Queue for Google Gemini events
self.gemini_queue: Queue = mp.Queue()
# Queue for inter process communication
self.inter_process_queue: Queue = mp.Queue()
@@ -576,6 +580,12 @@ class FrigateApp:
)
self.timeline_processor.start()
def start_gemini_processor(self) -> None:
self.gemini_processor = GeminiProcessor(
self.config, self.gemini_queue, self.stop_event
)
self.gemini_processor.start()
def start_event_processor(self) -> None:
self.event_processor = EventProcessor(
self.config,
@@ -583,6 +593,7 @@ class FrigateApp:
self.event_queue,
self.event_processed_queue,
self.timeline_queue,
self.gemini_queue,
self.stop_event,
)
self.event_processor.start()
@@ -692,6 +703,7 @@ class FrigateApp:
self.init_external_event_processor()
self.init_web_server()
self.start_timeline_processor()
self.start_gemini_processor()
self.start_event_processor()
self.start_event_cleanup()
self.start_record_cleanup()
@@ -734,6 +746,7 @@ class FrigateApp:
self.record_cleanup.join()
self.stats_emitter.join()
self.frigate_watchdog.join()
self.gemini_processor.join()
self.db.stop()
while len(self.detection_shms) > 0:
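
frigate/gemini.py itself is not part of this commit view, so the shape of GeminiProcessor has to be inferred from the wiring above: it is constructed with the config, the new gemini_queue, and the shared stop_event, started alongside the other processors, and joined on shutdown. A minimal sketch of such a consumer thread, with everything beyond the visible constructor arguments assumed:

import queue
import threading
from multiprocessing import Queue
from multiprocessing.synchronize import Event as MpEvent

class GeminiProcessor(threading.Thread):
    # Drains the (camera, event_data) tuples queued by EventProcessor.
    def __init__(self, config, gemini_queue: Queue, stop_event: MpEvent) -> None:
        super().__init__(name="gemini_processor")
        self.config = config
        self.gemini_queue = gemini_queue
        self.stop_event = stop_event

    def run(self) -> None:
        while not self.stop_event.is_set():
            try:
                camera, event_data = self.gemini_queue.get(timeout=1)
            except queue.Empty:
                continue
            # Hypothetical step: caption the event thumbnail with Gemini
            # and write the result back as the event's sub label.
            ...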

View File

@@ -382,6 +382,26 @@ class DetectConfig(FrigateBaseModel):
)
class GeminiConfig(FrigateBaseModel):
enabled: bool = Field(default=False, title="Enable Google Gemini captioning.")
override_existing: bool = Field(
default=False, title="Override existing sub labels."
)
api_key: str = Field(default="", title="Google AI Studio API Key.")
model: str = Field(default="gemini-pro-vision", title="Google AI Studio Model.")
prompt: str = Field(
default="Caption this image with as much detail as possible. Make sure the response is under 90 characters.",
title="Default caption prompt.",
)
object_prompts: Dict[str, str] = Field(
default={
"person": "Describe the main person in the image (gender, age, clothing, activity, etc). Do not include where the activity is occurring (sidewalk, concrete, driveway, etc). If delivering a package, include the company the package is from. Make sure the response is under 90 characters.",
"car": "Label the primary vehicle in the image with just the name of the company if it is a delivery vehicle, or the color make and model.",
},
title="Object specific prompts.",
)
class FilterConfig(FrigateBaseModel):
min_area: int = Field(
default=0, title="Minimum area of bounding box for object to be counted."
@@ -780,6 +800,9 @@ class CameraConfig(FrigateBaseModel):
onvif: OnvifConfig = Field(
default_factory=OnvifConfig, title="Camera Onvif Configuration."
)
gemini: GeminiConfig = Field(
default_factory=GeminiConfig, title="Google Gemini Configuration."
)
ui: CameraUiConfig = Field(
default_factory=CameraUiConfig, title="Camera UI Modifications."
)
@@ -1092,6 +1115,9 @@ class FrigateConfig(FrigateBaseModel):
detect: DetectConfig = Field(
default_factory=DetectConfig, title="Global object tracking configuration."
)
gemini: GeminiConfig = Field(
default_factory=GeminiConfig, title="Global Google Gemini Configuration."
)
cameras: Dict[str, CameraConfig] = Field(title="Camera configuration.")
timestamp_style: TimestampStyleConfig = Field(
default_factory=TimestampStyleConfig,
@@ -1107,6 +1133,10 @@ class FrigateConfig(FrigateBaseModel):
config.mqtt.user = config.mqtt.user.format(**FRIGATE_ENV_VARS)
config.mqtt.password = config.mqtt.password.format(**FRIGATE_ENV_VARS)
# Gemini API Key substitutions
if config.gemini.api_key:
config.gemini.api_key = config.gemini.api_key.format(**FRIGATE_ENV_VARS)
# set default min_score for object attributes
for attribute in ALL_ATTRIBUTE_LABELS:
if not config.objects.filters.get(attribute):
@@ -1128,6 +1158,7 @@ class FrigateConfig(FrigateBaseModel):
"detect": ...,
"ffmpeg": ...,
"timestamp_style": ...,
"gemini": ...,
},
exclude_unset=True,
)
@@ -1194,6 +1225,13 @@ class FrigateConfig(FrigateBaseModel):
camera_config.onvif.password = camera_config.onvif.password.format(
**FRIGATE_ENV_VARS
)
# Gemini substitution
if camera_config.gemini.api_key:
camera_config.gemini.api_key = camera_config.gemini.api_key.format(
**FRIGATE_ENV_VARS
)
# set config pre-value
camera_config.record.enabled_in_config = camera_config.record.enabled
camera_config.audio.enabled_in_config = camera_config.audio.enabled
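
Two behaviors fall out of the config changes. First, api_key participates in the same FRIGATE_ENV_VARS substitution as the MQTT and ONVIF credentials, at both the global and the per-camera level, so the key can stay out of the config file (for example api_key: "{FRIGATE_GEMINI_API_KEY}"; the variable name is illustrative). Second, object_prompts overrides the generic prompt per label. The lookup a consumer would plausibly perform, with a hypothetical helper name:

def get_prompt(gemini_config, label: str) -> str:
    # Prefer a label-specific prompt, fall back to the default caption prompt.
    return gemini_config.object_prompts.get(label, gemini_config.prompt)

# get_prompt(camera_config.gemini, "person") returns the person prompt above,
# while an unlisted label such as "dog" falls back to GeminiConfig.prompt.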

View File

@@ -32,6 +32,7 @@ def should_update_db(prev_event: Event, current_event: Event) -> bool:
or prev_event["entered_zones"] != current_event["entered_zones"]
or prev_event["thumbnail"] != current_event["thumbnail"]
or prev_event["end_time"] != current_event["end_time"]
or prev_event["sub_label"] != current_event["sub_label"]
):
return True
return False
@@ -56,6 +57,7 @@ class EventProcessor(threading.Thread):
event_queue: Queue,
event_processed_queue: Queue,
timeline_queue: Queue,
gemini_queue: Queue,
stop_event: MpEvent,
):
threading.Thread.__init__(self)
@@ -65,6 +67,7 @@ class EventProcessor(threading.Thread):
self.event_queue = event_queue
self.event_processed_queue = event_processed_queue
self.timeline_queue = timeline_queue
self.gemini_queue = gemini_queue
self.events_in_process: Dict[str, Event] = {}
self.stop_event = stop_event
@@ -102,6 +105,14 @@ class EventProcessor(threading.Thread):
continue
self.handle_object_detection(event_type, camera, event_data)
if event_type == "end" and self.config.cameras[camera].gemini.enabled:
self.gemini_queue.put(
(
camera,
event_data,
)
)
elif source_type == EventTypeEnum.api:
self.handle_external_detection(event_type, event_data)
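
The producer side stays thin: only "end" events on cameras with Gemini enabled are enqueued, as a (camera, event_data) tuple, and the new sub_label comparison in should_update_db ensures that a caption written back later actually reaches the database. How override_existing is honored is not visible in this diff; presumably the consumer gates on it, roughly:

def should_caption(camera_config, event_data: dict) -> bool:
    # Assumed consumer-side check for GeminiConfig.override_existing.
    if not camera_config.gemini.enabled:
        return False
    # Keep an existing sub label (e.g. one set through the API) unless the
    # config explicitly allows overriding it.
    if event_data.get("sub_label") and not camera_config.gemini.override_existing:
        return False
    return True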

View File

@@ -797,11 +797,10 @@ function Event({
</div>
<div className="m-2 flex grow">
<div className="flex flex-col grow">
<div className="capitalize text-lg font-bold">
{event.label.replaceAll('_', ' ')}
{event.sub_label ? `: ${event.sub_label.replaceAll('_', ' ')}` : null}
<div className="capitalize text-lg font-bold">{event.label.replaceAll('_', ' ')}</div>
<div className="text-sm flex pb-2">
{event.sub_label ? `${event.sub_label.replaceAll('_', ' ')}` : null}
</div>
<div className="text-sm flex">
<Clock className="h-5 w-5 mr-2 inline" />
{formatUnixTimestampToDateTime(event.start_time, { ...config.ui })}