Initial implementation of Google Gemini captions

This commit is contained in:
Jason Hunter 2023-12-14 03:11:14 -05:00
parent e390533760
commit af7cfee82d
6 changed files with 74 additions and 11 deletions

View File

@ -15,13 +15,13 @@ services:
# Use target devcontainer-trt for TensorRT dev # Use target devcontainer-trt for TensorRT dev
target: devcontainer target: devcontainer
## Uncomment this block for nvidia gpu support ## Uncomment this block for nvidia gpu support
# deploy: deploy:
# resources: resources:
# reservations: reservations:
# devices: devices:
# - driver: nvidia - driver: nvidia
# count: 1 count: 1
# capabilities: [gpu] capabilities: [gpu]
environment: environment:
YOLO_MODELS: yolov7-320 YOLO_MODELS: yolov7-320
devices: devices:

View File

@ -27,3 +27,5 @@ unidecode == 1.3.*
# Openvino Library - Custom built with MYRIAD support # Openvino Library - Custom built with MYRIAD support
openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.3.1/openvino-2022.3.1-1-cp39-cp39-manylinux_2_31_x86_64.whl; platform_machine == 'x86_64' openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.3.1/openvino-2022.3.1-1-cp39-cp39-manylinux_2_31_x86_64.whl; platform_machine == 'x86_64'
openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.3.1/openvino-2022.3.1-1-cp39-cp39-linux_aarch64.whl; platform_machine == 'aarch64' openvino @ https://github.com/NateMeyer/openvino-wheels/releases/download/multi-arch_2022.3.1/openvino-2022.3.1-1-cp39-cp39-linux_aarch64.whl; platform_machine == 'aarch64'
# Google Generative AI
google-generativeai == 0.3.*

View File

@ -35,6 +35,7 @@ from frigate.events.audio import listen_to_audio
from frigate.events.cleanup import EventCleanup from frigate.events.cleanup import EventCleanup
from frigate.events.external import ExternalEventProcessor from frigate.events.external import ExternalEventProcessor
from frigate.events.maintainer import EventProcessor from frigate.events.maintainer import EventProcessor
from frigate.gemini import GeminiProcessor
from frigate.http import create_app from frigate.http import create_app
from frigate.log import log_process, root_configurer from frigate.log import log_process, root_configurer
from frigate.models import Event, Recordings, RecordingsToDelete, Regions, Timeline from frigate.models import Event, Recordings, RecordingsToDelete, Regions, Timeline
@ -266,6 +267,9 @@ class FrigateApp:
# Queue for timeline events # Queue for timeline events
self.timeline_queue: Queue = mp.Queue() self.timeline_queue: Queue = mp.Queue()
# Queue for Google Gemini events
self.gemini_queue: Queue = mp.Queue()
# Queue for inter process communication # Queue for inter process communication
self.inter_process_queue: Queue = mp.Queue() self.inter_process_queue: Queue = mp.Queue()
@ -576,6 +580,12 @@ class FrigateApp:
) )
self.timeline_processor.start() self.timeline_processor.start()
def start_gemini_processor(self) -> None:
    """Create and start the Gemini captioning processor.

    The processor consumes (camera, event_data) tuples that the
    EventProcessor puts on ``self.gemini_queue`` when an event ends on a
    camera with ``gemini.enabled``, and stops when ``self.stop_event`` is set.

    NOTE(review): started unconditionally even when Gemini is disabled for
    every camera — the processor then just idles on an empty queue; confirm
    this is intentional (shutdown joins it unconditionally as well).
    """
    self.gemini_processor = GeminiProcessor(
        self.config, self.gemini_queue, self.stop_event
    )
    self.gemini_processor.start()
def start_event_processor(self) -> None: def start_event_processor(self) -> None:
self.event_processor = EventProcessor( self.event_processor = EventProcessor(
self.config, self.config,
@ -583,6 +593,7 @@ class FrigateApp:
self.event_queue, self.event_queue,
self.event_processed_queue, self.event_processed_queue,
self.timeline_queue, self.timeline_queue,
self.gemini_queue,
self.stop_event, self.stop_event,
) )
self.event_processor.start() self.event_processor.start()
@ -692,6 +703,7 @@ class FrigateApp:
self.init_external_event_processor() self.init_external_event_processor()
self.init_web_server() self.init_web_server()
self.start_timeline_processor() self.start_timeline_processor()
self.start_gemini_processor()
self.start_event_processor() self.start_event_processor()
self.start_event_cleanup() self.start_event_cleanup()
self.start_record_cleanup() self.start_record_cleanup()
@ -734,6 +746,7 @@ class FrigateApp:
self.record_cleanup.join() self.record_cleanup.join()
self.stats_emitter.join() self.stats_emitter.join()
self.frigate_watchdog.join() self.frigate_watchdog.join()
self.gemini_processor.join()
self.db.stop() self.db.stop()
while len(self.detection_shms) > 0: while len(self.detection_shms) > 0:

View File

@ -382,6 +382,26 @@ class DetectConfig(FrigateBaseModel):
) )
class GeminiConfig(FrigateBaseModel):
    """Configuration for Google Gemini image captioning.

    Used both globally (``FrigateConfig.gemini``) and per camera
    (``CameraConfig.gemini``); the global section is merged into each
    camera's config as a default.
    """

    # Master switch; when False, ended events are not queued for captioning.
    enabled: bool = Field(default=False, title="Enable Google Gemini captioning.")
    # presumably controls whether a generated caption replaces a sub label
    # that was already set — TODO confirm against the GeminiProcessor code.
    override_existing: bool = Field(
        default=False, title="Override existing sub labels."
    )
    # Supports {ENV_VAR} substitution from FRIGATE_ENV_VARS (applied in
    # FrigateConfig validation for both the global and per-camera values).
    api_key: str = Field(default="", title="Google AI Studio API Key.")
    model: str = Field(default="gemini-pro-vision", title="Google AI Studio Model.")
    # Fallback prompt used when the detected object's label has no entry
    # in object_prompts.
    prompt: str = Field(
        default="Caption this image with as much detail as possible. Make sure the response is under 90 characters.",
        title="Default caption prompt.",
    )
    # Per-label prompt overrides, keyed by the detection label (e.g. "person").
    object_prompts: Dict[str, str] = Field(
        default={
            "person": "Describe the main person in the image (gender, age, clothing, activity, etc). Do not include where the activity is occurring (sidewalk, concrete, driveway, etc). If delivering a package, include the company the package is from. Make sure the response is under 90 characters.",
            "car": "Label the primary vehicle in the image with just the name of the company if it is a delivery vehicle, or the color make and model.",
        },
        title="Object specific prompts.",
    )
class FilterConfig(FrigateBaseModel): class FilterConfig(FrigateBaseModel):
min_area: int = Field( min_area: int = Field(
default=0, title="Minimum area of bounding box for object to be counted." default=0, title="Minimum area of bounding box for object to be counted."
@ -780,6 +800,9 @@ class CameraConfig(FrigateBaseModel):
onvif: OnvifConfig = Field( onvif: OnvifConfig = Field(
default_factory=OnvifConfig, title="Camera Onvif Configuration." default_factory=OnvifConfig, title="Camera Onvif Configuration."
) )
gemini: GeminiConfig = Field(
default_factory=GeminiConfig, title="Google Gemini Configuration."
)
ui: CameraUiConfig = Field( ui: CameraUiConfig = Field(
default_factory=CameraUiConfig, title="Camera UI Modifications." default_factory=CameraUiConfig, title="Camera UI Modifications."
) )
@ -1092,6 +1115,9 @@ class FrigateConfig(FrigateBaseModel):
detect: DetectConfig = Field( detect: DetectConfig = Field(
default_factory=DetectConfig, title="Global object tracking configuration." default_factory=DetectConfig, title="Global object tracking configuration."
) )
gemini: GeminiConfig = Field(
default_factory=GeminiConfig, title="Global Google Gemini Configuration."
)
cameras: Dict[str, CameraConfig] = Field(title="Camera configuration.") cameras: Dict[str, CameraConfig] = Field(title="Camera configuration.")
timestamp_style: TimestampStyleConfig = Field( timestamp_style: TimestampStyleConfig = Field(
default_factory=TimestampStyleConfig, default_factory=TimestampStyleConfig,
@ -1107,6 +1133,10 @@ class FrigateConfig(FrigateBaseModel):
config.mqtt.user = config.mqtt.user.format(**FRIGATE_ENV_VARS) config.mqtt.user = config.mqtt.user.format(**FRIGATE_ENV_VARS)
config.mqtt.password = config.mqtt.password.format(**FRIGATE_ENV_VARS) config.mqtt.password = config.mqtt.password.format(**FRIGATE_ENV_VARS)
# Gemini API Key substitutions
if config.gemini.api_key:
config.gemini.api_key = config.gemini.api_key.format(**FRIGATE_ENV_VARS)
# set default min_score for object attributes # set default min_score for object attributes
for attribute in ALL_ATTRIBUTE_LABELS: for attribute in ALL_ATTRIBUTE_LABELS:
if not config.objects.filters.get(attribute): if not config.objects.filters.get(attribute):
@ -1128,6 +1158,7 @@ class FrigateConfig(FrigateBaseModel):
"detect": ..., "detect": ...,
"ffmpeg": ..., "ffmpeg": ...,
"timestamp_style": ..., "timestamp_style": ...,
"gemini": ...,
}, },
exclude_unset=True, exclude_unset=True,
) )
@ -1194,6 +1225,13 @@ class FrigateConfig(FrigateBaseModel):
camera_config.onvif.password = camera_config.onvif.password.format( camera_config.onvif.password = camera_config.onvif.password.format(
**FRIGATE_ENV_VARS **FRIGATE_ENV_VARS
) )
# Gemini substitution
if camera_config.gemini.api_key:
camera_config.gemini.api_key = camera_config.gemini.api_key.format(
**FRIGATE_ENV_VARS
)
# set config pre-value # set config pre-value
camera_config.record.enabled_in_config = camera_config.record.enabled camera_config.record.enabled_in_config = camera_config.record.enabled
camera_config.audio.enabled_in_config = camera_config.audio.enabled camera_config.audio.enabled_in_config = camera_config.audio.enabled

View File

@ -32,6 +32,7 @@ def should_update_db(prev_event: Event, current_event: Event) -> bool:
or prev_event["entered_zones"] != current_event["entered_zones"] or prev_event["entered_zones"] != current_event["entered_zones"]
or prev_event["thumbnail"] != current_event["thumbnail"] or prev_event["thumbnail"] != current_event["thumbnail"]
or prev_event["end_time"] != current_event["end_time"] or prev_event["end_time"] != current_event["end_time"]
or prev_event["sub_label"] != current_event["sub_label"]
): ):
return True return True
return False return False
@ -56,6 +57,7 @@ class EventProcessor(threading.Thread):
event_queue: Queue, event_queue: Queue,
event_processed_queue: Queue, event_processed_queue: Queue,
timeline_queue: Queue, timeline_queue: Queue,
gemini_queue: Queue,
stop_event: MpEvent, stop_event: MpEvent,
): ):
threading.Thread.__init__(self) threading.Thread.__init__(self)
@ -65,6 +67,7 @@ class EventProcessor(threading.Thread):
self.event_queue = event_queue self.event_queue = event_queue
self.event_processed_queue = event_processed_queue self.event_processed_queue = event_processed_queue
self.timeline_queue = timeline_queue self.timeline_queue = timeline_queue
self.gemini_queue = gemini_queue
self.events_in_process: Dict[str, Event] = {} self.events_in_process: Dict[str, Event] = {}
self.stop_event = stop_event self.stop_event = stop_event
@ -102,6 +105,14 @@ class EventProcessor(threading.Thread):
continue continue
self.handle_object_detection(event_type, camera, event_data) self.handle_object_detection(event_type, camera, event_data)
if event_type == "end" and self.config.cameras[camera].gemini.enabled:
self.gemini_queue.put(
(
camera,
event_data,
)
)
elif source_type == EventTypeEnum.api: elif source_type == EventTypeEnum.api:
self.handle_external_detection(event_type, event_data) self.handle_external_detection(event_type, event_data)

View File

@ -797,11 +797,10 @@ function Event({
</div> </div>
<div className="m-2 flex grow"> <div className="m-2 flex grow">
<div className="flex flex-col grow"> <div className="flex flex-col grow">
<div className="capitalize text-lg font-bold"> <div className="capitalize text-lg font-bold">{event.label.replaceAll('_', ' ')}</div>
{event.label.replaceAll('_', ' ')} <div className="text-sm flex pb-2">
{event.sub_label ? `: ${event.sub_label.replaceAll('_', ' ')}` : null} {event.sub_label ? `${event.sub_label.replaceAll('_', ' ')}` : null}
</div> </div>
<div className="text-sm flex"> <div className="text-sm flex">
<Clock className="h-5 w-5 mr-2 inline" /> <Clock className="h-5 w-5 mr-2 inline" />
{formatUnixTimestampToDateTime(event.start_time, { ...config.ui })} {formatUnixTimestampToDateTime(event.start_time, { ...config.ui })}