diff --git a/frigate/data_processing/post/types.py b/frigate/data_processing/post/types.py index 44bb09fb0..beb746658 100644 --- a/frigate/data_processing/post/types.py +++ b/frigate/data_processing/post/types.py @@ -4,20 +4,24 @@ from pydantic import BaseModel, ConfigDict, Field class ReviewMetadata(BaseModel): model_config = ConfigDict(extra="ignore", protected_namespaces=()) - title: str = Field(description="A concise title for the activity.") + title: str = Field( + description="A short title characterizing what took place and where, under 10 words." + ) scene: str = Field( - description="A comprehensive description of the setting and entities, including relevant context and plausible inferences if supported by visual evidence." + description="A chronological narrative of what happens from start to finish." ) shortSummary: str = Field( - description="A brief 2-sentence summary of the scene, suitable for notifications. Should capture the key activity and context without full detail." + description="A brief 2-sentence summary of the scene, suitable for notifications." ) confidence: float = Field( - description="A float between 0 and 1 representing your overall confidence in this analysis." + ge=0.0, + le=1.0, + description="Confidence in the analysis, from 0 to 1.", ) potential_threat_level: int = Field( ge=0, - le=3, - description="An integer representing the potential threat level (1-3). 1: Minor anomaly. 2: Moderate concern. 3: High threat. Only include this field if a clear security concern is observable; otherwise, omit it.", + le=2, + description="Threat level: 0 = normal, 1 = suspicious, 2 = critical threat.", ) other_concerns: list[str] | None = Field( default=None, diff --git a/frigate/genai/__init__.py b/frigate/genai/__init__.py index fa90f1463..95e1ca046 100644 --- a/frigate/genai/__init__.py +++ b/frigate/genai/__init__.py @@ -89,12 +89,7 @@ Your task is to analyze a sequence of images taken in chronological order from a ## Task Instructions -Your task is to provide a clear, accurate description of the scene that: -1. States exactly what is happening based on observable actions and movements. -2. Evaluates the activity against the Normal and Suspicious Activity Indicators above. -3. Assigns a potential_threat_level (0, 1, or 2) based on the threat level indicators defined above, applying them consistently. - -**Use the activity patterns above as guidance to calibrate your assessment. Match the activity against both normal and suspicious indicators, then use your judgment based on the complete context.** +Describe the scene based on observable actions and movements, evaluate the activity against the Activity Indicators above, and assign a potential_threat_level (0, 1, or 2) by applying the threat level indicators consistently. ## Analysis Guidelines @@ -108,14 +103,12 @@ When forming your description: - **Consider duration as a primary factor**: Apply the duration thresholds defined in the activity patterns above. Brief sequences during normal hours with apparent purpose typically indicate normal activity unless explicit suspicious actions are visible. - **Weigh all evidence holistically**: Match the activity against the normal and suspicious patterns defined above, then evaluate based on the complete context (zone, objects, time, actions, duration). Apply the threat level indicators consistently. Use your judgment for edge cases. -## Response Format +## Response Field Guidelines -Your response MUST be a flat JSON object with: -- `scene` (string): A narrative description of what happens across the sequence from start to finish, in chronological order. Start by describing how the sequence begins, then describe the progression of events. **Describe all significant movements and actions in the order they occur.** For example, if a vehicle arrives and then a person exits, describe both actions sequentially. **Only describe actions you can actually observe happening in the frames provided.** Do not infer or assume actions that aren't visible (e.g., if you see someone walking but never see them sit, don't say they sat down). Include setting, detected objects, and their observable actions. Avoid speculation or filling in assumed behaviors. Your description should align with and support the threat level you assign. -- `title` (string): A concise, grammatically complete title in the format "[Subject] [action verb] [context]" that matches your scene description. Use names from "Objects in Scene" when you visually observe them. -- `shortSummary` (string): A brief 2-sentence summary of the scene, suitable for notifications. Should capture the key activity and context without full detail. This should be a condensed version of the scene description above. -- `confidence` (float): 0-1 confidence in your analysis. Higher confidence when objects/actions are clearly visible and context is unambiguous. Lower confidence when the sequence is unclear, objects are partially obscured, or context is ambiguous. -- `potential_threat_level` (integer): 0, 1, or 2 as defined in "Normal Activity Patterns for This Property" above. Your threat level must be consistent with your scene description and the guidance above. +Respond with a JSON object matching the provided schema. Field-specific guidance: +- `scene`: Describe how the sequence begins, then the progression of events — all significant movements and actions in order. For example, if a vehicle arrives and then a person exits, describe both sequentially. Your description should align with and support the threat level you assign. +- `title`: Characterize **what took place and where** — interpret the overall purpose or outcome, do not simply compress the scene description into fewer words. Include the relevant location (zone, area, or entry point). Always include subject names from "Objects in Scene" — do not replace named subjects with generic terms. No editorial qualifiers like "routine" or "suspicious." +- `potential_threat_level`: Must be consistent with your scene description and the activity patterns above. {get_concern_prompt()} ## Sequence Details @@ -134,10 +127,6 @@ Each line represents a detection state, not necessarily unique individuals. Pare **Note: Unidentified objects (without names) are NOT indicators of suspicious activity—they simply mean the system hasn't identified that object.** {get_objects_list()} -## Important Notes -- Values must be plain strings, floats, or integers — no nested objects, no extra commentary. -- Only describe objects from the "Objects in Scene" list above. Do not hallucinate additional objects. -- When describing people or vehicles, use the exact names provided. {get_language_prompt()} """ logger.debug( @@ -153,7 +142,27 @@ Each line represents a detection state, not necessarily unique individuals. Pare ) as f: f.write(context_prompt) - response = self._send(context_prompt, thumbnails) + # Build JSON schema for structured output from ReviewMetadata model + schema = ReviewMetadata.model_json_schema() + schema.get("properties", {}).pop("time", None) + + if "time" in schema.get("required", []): + schema["required"].remove("time") + if not concerns: + schema.get("properties", {}).pop("other_concerns", None) + if "other_concerns" in schema.get("required", []): + schema["required"].remove("other_concerns") + + response_format = { + "type": "json_schema", + "json_schema": { + "name": "review_metadata", + "strict": True, + "schema": schema, + }, + } + + response = self._send(context_prompt, thumbnails, response_format) if debug_save and response: with open( @@ -297,7 +306,12 @@ Guidelines: """Initialize the client.""" return None - def _send(self, prompt: str, images: list[bytes]) -> Optional[str]: + def _send( + self, + prompt: str, + images: list[bytes], + response_format: Optional[dict] = None, + ) -> Optional[str]: """Submit a request to the provider.""" return None diff --git a/frigate/genai/azure-openai.py b/frigate/genai/azure-openai.py index 9122ca14e..f424f7610 100644 --- a/frigate/genai/azure-openai.py +++ b/frigate/genai/azure-openai.py @@ -42,13 +42,18 @@ class OpenAIClient(GenAIClient): azure_endpoint=azure_endpoint, ) - def _send(self, prompt: str, images: list[bytes]) -> Optional[str]: + def _send( + self, + prompt: str, + images: list[bytes], + response_format: Optional[dict] = None, + ) -> Optional[str]: """Submit a request to Azure OpenAI.""" encoded_images = [base64.b64encode(image).decode("utf-8") for image in images] try: - result = self.provider.chat.completions.create( - model=self.genai_config.model, - messages=[ + request_params = { + "model": self.genai_config.model, + "messages": [ { "role": "user", "content": [{"type": "text", "text": prompt}] @@ -64,9 +69,12 @@ class OpenAIClient(GenAIClient): ], }, ], - timeout=self.timeout, + "timeout": self.timeout, **self.genai_config.runtime_options, - ) + } + if response_format: + request_params["response_format"] = response_format + result = self.provider.chat.completions.create(**request_params) except Exception as e: logger.warning("Azure OpenAI returned an error: %s", str(e)) return None diff --git a/frigate/genai/gemini.py b/frigate/genai/gemini.py index 418d633b2..9e01192dc 100644 --- a/frigate/genai/gemini.py +++ b/frigate/genai/gemini.py @@ -42,7 +42,12 @@ class GeminiClient(GenAIClient): http_options=types.HttpOptions(**http_options_dict), ) - def _send(self, prompt: str, images: list[bytes]) -> Optional[str]: + def _send( + self, + prompt: str, + images: list[bytes], + response_format: Optional[dict] = None, + ) -> Optional[str]: """Submit a request to Gemini.""" contents = [ types.Part.from_bytes(data=img, mime_type="image/jpeg") for img in images @@ -52,6 +57,12 @@ class GeminiClient(GenAIClient): generation_config_dict = {"candidate_count": 1} generation_config_dict.update(self.genai_config.runtime_options) + if response_format and response_format.get("type") == "json_schema": + generation_config_dict["response_mime_type"] = "application/json" + schema = response_format.get("json_schema", {}).get("schema") + if schema: + generation_config_dict["response_schema"] = schema + response = self.provider.models.generate_content( model=self.genai_config.model, contents=contents, diff --git a/frigate/genai/llama_cpp.py b/frigate/genai/llama_cpp.py index f9c251790..87443ac4f 100644 --- a/frigate/genai/llama_cpp.py +++ b/frigate/genai/llama_cpp.py @@ -57,7 +57,12 @@ class LlamaCppClient(GenAIClient): else None ) - def _send(self, prompt: str, images: list[bytes]) -> Optional[str]: + def _send( + self, + prompt: str, + images: list[bytes], + response_format: Optional[dict] = None, + ) -> Optional[str]: """Submit a request to llama.cpp server.""" if self.provider is None: logger.warning( @@ -96,6 +101,9 @@ class LlamaCppClient(GenAIClient): **self.provider_options, } + if response_format: + payload["response_format"] = response_format + response = requests.post( f"{self.provider}/v1/chat/completions", json=payload, diff --git a/frigate/genai/ollama.py b/frigate/genai/ollama.py index e98f6ab07..90bf3f05e 100644 --- a/frigate/genai/ollama.py +++ b/frigate/genai/ollama.py @@ -53,7 +53,12 @@ class OllamaClient(GenAIClient): logger.warning("Error initializing Ollama: %s", str(e)) return None - def _send(self, prompt: str, images: list[bytes]) -> Optional[str]: + def _send( + self, + prompt: str, + images: list[bytes], + response_format: Optional[dict] = None, + ) -> Optional[str]: """Submit a request to Ollama""" if self.provider is None: logger.warning( @@ -65,6 +70,10 @@ class OllamaClient(GenAIClient): **self.provider_options, **self.genai_config.runtime_options, } + if response_format and response_format.get("type") == "json_schema": + schema = response_format.get("json_schema", {}).get("schema") + if schema: + ollama_options["format"] = schema result = self.provider.generate( self.genai_config.model, prompt, diff --git a/frigate/genai/openai.py b/frigate/genai/openai.py index b3031ff33..7d8700579 100644 --- a/frigate/genai/openai.py +++ b/frigate/genai/openai.py @@ -36,7 +36,12 @@ class OpenAIClient(GenAIClient): return OpenAI(api_key=self.genai_config.api_key, **provider_opts) - def _send(self, prompt: str, images: list[bytes]) -> Optional[str]: + def _send( + self, + prompt: str, + images: list[bytes], + response_format: Optional[dict] = None, + ) -> Optional[str]: """Submit a request to OpenAI.""" encoded_images = [base64.b64encode(image).decode("utf-8") for image in images] messages_content = [] @@ -57,17 +62,20 @@ class OpenAIClient(GenAIClient): } ) try: - result = self.provider.chat.completions.create( - model=self.genai_config.model, - messages=[ + request_params = { + "model": self.genai_config.model, + "messages": [ { "role": "user", "content": messages_content, }, ], - timeout=self.timeout, + "timeout": self.timeout, **self.genai_config.runtime_options, - ) + } + if response_format: + request_params["response_format"] = response_format + result = self.provider.chat.completions.create(**request_params) if ( result is not None and hasattr(result, "choices")