diff --git a/frigate/api/chat.py b/frigate/api/chat.py index a701a04bd5..291503dbba 100644 --- a/frigate/api/chat.py +++ b/frigate/api/chat.py @@ -1185,6 +1185,13 @@ async def chat_completion( ) + b"\n" ) + elif kind == "reasoning_delta": + yield ( + json.dumps({"type": "reasoning", "delta": value}).encode( + "utf-8" + ) + + b"\n" + ) elif kind == "stats": yield ( json.dumps({"type": "stats", **value}).encode("utf-8") @@ -1285,6 +1292,7 @@ async def chat_completion( final_content = response.get("content") or "" if body.stream: + final_reasoning = response.get("reasoning") async def stream_body() -> Any: if tool_calls: @@ -1299,6 +1307,15 @@ async def chat_completion( ).encode("utf-8") + b"\n" ) + # Emit the full reasoning trace up front when the + # underlying client did not stream it + if final_reasoning: + yield ( + json.dumps( + {"type": "reasoning", "delta": final_reasoning} + ).encode("utf-8") + + b"\n" + ) # Stream content in word-sized chunks for smooth UX for part in chunk_content(final_content): yield ( @@ -1319,6 +1336,7 @@ async def chat_completion( message=ChatMessageResponse( role="assistant", content=final_content, + reasoning=response.get("reasoning"), tool_calls=None, ), finish_reason=response.get("finish_reason", "stop"), diff --git a/frigate/api/defs/response/chat_response.py b/frigate/api/defs/response/chat_response.py index 0bc864ba68..c2b3e6b1f2 100644 --- a/frigate/api/defs/response/chat_response.py +++ b/frigate/api/defs/response/chat_response.py @@ -20,6 +20,10 @@ class ChatMessageResponse(BaseModel): content: Optional[str] = Field( default=None, description="Message content (None if tool calls present)" ) + reasoning: Optional[str] = Field( + default=None, + description="Separated reasoning/thinking trace if the model emitted one", + ) tool_calls: Optional[list[ToolCallInvocation]] = Field( default=None, description="Tool calls if LLM wants to call tools" ) diff --git a/frigate/genai/__init__.py b/frigate/genai/__init__.py index 76ee8b888c..864092df58 100644 --- a/frigate/genai/__init__.py +++ b/frigate/genai/__init__.py @@ -300,6 +300,10 @@ class GenAIClient: Returns: Dictionary with: - 'content': Optional[str] - The text response from the LLM, None if tool calls + - 'reasoning': Optional[str] - The separated reasoning/thinking trace + if the model emitted one (e.g. via OpenAI-compatible + `reasoning_content`). None when the model does not surface a + trace or the provider does not parse it. - 'tool_calls': Optional[List[Dict]] - List of tool calls if LLM wants to call tools. Each tool call dict has: - 'id': str - Unique identifier for this tool call @@ -311,6 +315,14 @@ class GenAIClient: - 'length': Hit token limit - 'error': An error occurred + Streaming counterpart `chat_with_tools_stream` yields + ``(kind, value)`` tuples where ``kind`` is one of: + - 'content_delta': value is a string fragment of the answer + - 'reasoning_delta': value is a string fragment of the reasoning + trace (emitted before content for thinking models) + - 'stats': value is a usage stats dict + - 'message': value is the final dict shape described above + Raises: NotImplementedError: If the provider doesn't implement this method. """ @@ -321,6 +333,7 @@ class GenAIClient: ) return { "content": None, + "reasoning": None, "tool_calls": None, "finish_reason": "error", } diff --git a/frigate/genai/plugins/llama_cpp.py b/frigate/genai/plugins/llama_cpp.py index 6e2fc910c4..830dd6817b 100644 --- a/frigate/genai/plugins/llama_cpp.py +++ b/frigate/genai/plugins/llama_cpp.py @@ -531,16 +531,24 @@ class LlamaCppClient(GenAIClient): return payload def _message_from_choice(self, choice: dict[str, Any]) -> dict[str, Any]: - """Parse OpenAI-style choice into {content, tool_calls, finish_reason}.""" + """Parse OpenAI-style choice into {content, reasoning, tool_calls, finish_reason}. + + llama.cpp's `--reasoning-format` puts the trace in + `message.reasoning_content` (preferred) or `message.thinking`; both + keys are accepted so different builds work without configuration. + """ message = choice.get("message", {}) content = message.get("content") content = content.strip() if content else None + reasoning = message.get("reasoning_content") or message.get("thinking") + reasoning = reasoning.strip() if reasoning else None tool_calls = parse_tool_calls_from_message(message) finish_reason = choice.get("finish_reason") or ( "tool_calls" if tool_calls else "stop" if content else "error" ) return { "content": content, + "reasoning": reasoning, "tool_calls": tool_calls, "finish_reason": finish_reason, } @@ -803,6 +811,7 @@ class LlamaCppClient(GenAIClient): try: payload = self._build_payload(messages, tools, tool_choice, stream=True) content_parts: list[str] = [] + reasoning_parts: list[str] = [] tool_calls_by_index: dict[int, dict[str, Any]] = {} finish_reason = "stop" @@ -832,6 +841,15 @@ class LlamaCppClient(GenAIClient): delta = choices[0].get("delta", {}) if choices[0].get("finish_reason"): finish_reason = choices[0]["finish_reason"] + # llama.cpp emits separated thinking under + # reasoning_content (preferred) or thinking before any + # content tokens arrive + reasoning_delta = delta.get("reasoning_content") or delta.get( + "thinking" + ) + if reasoning_delta: + reasoning_parts.append(reasoning_delta) + yield ("reasoning_delta", reasoning_delta) if delta.get("content"): content_parts.append(delta["content"]) yield ("content_delta", delta["content"]) @@ -857,6 +875,7 @@ class LlamaCppClient(GenAIClient): ) full_content = "".join(content_parts).strip() or None + full_reasoning = "".join(reasoning_parts).strip() or None tool_calls_list = self._streamed_tool_calls_to_list(tool_calls_by_index) if tool_calls_list: finish_reason = "tool_calls" @@ -864,6 +883,7 @@ class LlamaCppClient(GenAIClient): "message", { "content": full_content, + "reasoning": full_reasoning, "tool_calls": tool_calls_list, "finish_reason": finish_reason, }, diff --git a/web/public/locales/en/views/chat.json b/web/public/locales/en/views/chat.json index bc320c2049..4cd3ad20f6 100644 --- a/web/public/locales/en/views/chat.json +++ b/web/public/locales/en/views/chat.json @@ -60,5 +60,10 @@ "stats": { "context": "{{tokens}} tokens", "tokens_per_second": "{{rate}} t/s" + }, + "reasoning": { + "thinking": "Thinking…", + "show": "Show reasoning", + "hide": "Hide reasoning" } } diff --git a/web/src/components/chat/ReasoningBubble.tsx b/web/src/components/chat/ReasoningBubble.tsx new file mode 100644 index 0000000000..580a99c45d --- /dev/null +++ b/web/src/components/chat/ReasoningBubble.tsx @@ -0,0 +1,87 @@ +import { useState, useEffect, useRef } from "react"; +import { useTranslation } from "react-i18next"; +import { LuBrain, LuChevronDown, LuChevronRight } from "react-icons/lu"; +import { + Collapsible, + CollapsibleContent, + CollapsibleTrigger, +} from "@/components/ui/collapsible"; +import { Button } from "@/components/ui/button"; +import { cn } from "@/lib/utils"; + +type ReasoningBubbleProps = { + /** The accumulated reasoning text from the model. */ + reasoning: string; + /** + * Whether the assistant has begun producing the user-facing answer. + * While false the reasoning is still streaming and we keep the panel + * open with a "Thinking…" label. Once true, the panel auto-collapses + * so the answer is the primary focus, but stays expandable. + */ + answerStarted: boolean; +}; + +export function ReasoningBubble({ + reasoning, + answerStarted, +}: ReasoningBubbleProps) { + const { t } = useTranslation(["views/chat"]); + // Open while the model is still mid-thought (no answer tokens yet); + // once the answer begins, collapse on its own but let the user reopen. + const [open, setOpen] = useState(true); + const userInteractedRef = useRef(false); + const lastAutoState = useRef(true); + + useEffect(() => { + if (userInteractedRef.current) return; + const desired = !answerStarted; + if (desired !== lastAutoState.current) { + lastAutoState.current = desired; + setOpen(desired); + } + }, [answerStarted]); + + const handleOpenChange = (next: boolean) => { + userInteractedRef.current = true; + setOpen(next); + }; + + const label = !answerStarted + ? t("reasoning.thinking") + : open + ? t("reasoning.hide") + : t("reasoning.show"); + + return ( +
+ {reasoning}
+
+