diff --git a/frigate/data_processing/post/review_descriptions.py b/frigate/data_processing/post/review_descriptions.py index e99f3d9d7..0548876e9 100644 --- a/frigate/data_processing/post/review_descriptions.py +++ b/frigate/data_processing/post/review_descriptions.py @@ -95,6 +95,7 @@ class ReviewDescriptionProcessor(PostProcessorApi): "objects": final_data["data"]["objects"], "recognized_objects": final_data["data"]["sub_labels"], "zones": final_data["data"]["zones"], + "timestamp": final_data["end_time"], }, [r[1] for r in self.tracked_review_items[id]], ) diff --git a/frigate/genai/__init__.py b/frigate/genai/__init__.py index 13f8cfb3f..d206a87a5 100644 --- a/frigate/genai/__init__.py +++ b/frigate/genai/__init__.py @@ -39,15 +39,16 @@ class GenAIClient: ) -> None: """Generate a description for the review item activity.""" context_prompt = f""" - Here is additional context about the scene from a security camera: - The following objects were detected: {review_data['objects']} - The following recognized objects were detected: {review_data['recognized_objects']} - The activity happened in the following zones: {review_data['zones']} - Please analyze the image(s), which are in chronological order, strictly from the perspective of the {review_data["camera"].replace("_", " ")} security camera. - Your task is to provide a **neutral, factual, and objective description** of the scene. + Your task is to provide a **neutral, factual, and objective description** of the scene and the objects interacting with it. Focus solely on observable actions, visible entities, and the environment. + Here is some information we already know: + - the following activity occurred at {review_data['timestamp'].strftime('%I:%M %p')} + - the following objects were detected: {review_data['objects']} + - the following recognized objects were detected: {review_data['recognized_objects']} + - the activity happened in the following zones: {review_data['zones']} + Your response **MUST** be a flat JSON object with the following fields: - `scene` (string): A single, comprehensive description of the entire visual scene. - `action` (string): A single description of any key actions or movements.