Support Dynamic Thinking Models (#23281)

* Add ability to toggle thinking
* Disable thinking for descriptions automatically
* mypy
* Cleanup
2026-06-21 03:41:55 +03:00 · 2026-05-21 11:54:23 -06:00 · 2026-05-21 11:54:23 -06:00 · 66a2417229
commit 66a2417229
parent 555ef89800
16 changed files with 410 additions and 175 deletions
--- a/frigate/api/chat.py
+++ b/frigate/api/chat.py
@ -1173,6 +1173,7 @@ async def chat_completion(
                    messages=conversation,
                    tools=tools if tools else None,
                    tool_choice="auto",
                    enable_thinking=body.enable_thinking,
                ):
                    if await request.is_disconnected():
                        logger.debug("Client disconnected, stopping chat stream")
@ -1267,6 +1268,7 @@ async def chat_completion(
                messages=conversation,
                tools=tools if tools else None,
                tool_choice="auto",
                enable_thinking=body.enable_thinking,
            )
            if response.get("finish_reason") == "error":
--- a/frigate/api/defs/request/chat_body.py
+++ b/frigate/api/defs/request/chat_body.py
@ -36,3 +36,10 @@ class ChatCompletionRequest(BaseModel):
        default=False,
        description="If true, stream the final assistant response in the body as newline-delimited JSON.",
    )
    enable_thinking: Optional[bool] = Field(
        default=None,
        description=(
            "Per-request thinking toggle. None means use the provider default. "
            "Ignored by providers that do not expose a per-request thinking switch."
        ),
    )
--- a/frigate/genai/init.py
+++ b/frigate/genai/init.py
@ -222,8 +222,15 @@ class GenAIClient:
        prompt: str,
        images: list[bytes],
        response_format: Optional[dict] = None,
        enable_thinking: bool = False,
    ) -> Optional[str]:
-        """Submit a request to the provider."""
+        """Submit a request to the provider.
        ``enable_thinking`` is honored only by providers that report
        ``supports_toggleable_thinking``. Description-style callers leave it
        at the default (off) since synthesis tasks don't benefit from
        reasoning traces.
        """
        return None
    @property
@ -235,6 +242,11 @@ class GenAIClient:
        """
        return True
    @property
    def supports_toggleable_thinking(self) -> bool:
        """Whether the configured model exposes a per-request thinking toggle.

        This base implementation always returns False; a provider client that
        can switch thinking on or off per request overrides this property.
        """
        return False
    def list_models(self) -> list[str]:
        """Return the list of model names available from this provider.
@ -278,6 +290,7 @@ class GenAIClient:
        messages: list[dict[str, Any]],
        tools: Optional[list[dict[str, Any]]] = None,
        tool_choice: Optional[str] = "auto",
        enable_thinking: Optional[bool] = None,
    ) -> dict[str, Any]:
        """
        Send chat messages to LLM with optional tool definitions.
@ -301,7 +314,9 @@ class GenAIClient:
                - 'none': Model must not call tools
                - 'required': Model must call at least one tool
                - Or a dict specifying a specific tool to call
-            **kwargs: Additional provider-specific parameters.
+            enable_thinking: Per-request thinking toggle. None means use the
                provider default. Ignored by providers without a per-request
                toggle (see `supports_toggleable_thinking`).
        Returns:
            Dictionary with:
--- a/frigate/genai/manager.py
+++ b/frigate/genai/manager.py
@ -6,7 +6,7 @@ no chat feature is active) are never initialized.
 """
 import logging
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, Any, Optional
 from frigate.config import FrigateConfig
 from frigate.config.camera.genai import GenAIConfig, GenAIRoleEnum
@ -108,11 +108,16 @@ class GenAIClientManager:
        name = self._role_map.get(GenAIRoleEnum.embeddings)
        return self._get_client(name) if name else None
-    def list_models(self) -> dict[str, list[str]]:
+    def list_models(self) -> dict[str, dict[str, Any]]:
-        """Return available models keyed by config entry name."""
+        """Return per-entry model lists and capabilities, keyed by config entry name."""
-        result: dict[str, list[str]] = {}
+        result: dict[str, dict[str, Any]] = {}
-        for name in self._configs:
+        for name, genai_cfg in self._configs.items():
            client = self._get_client(name)
-            if client:
+            if not client:
-                result[name] = client.list_models()
+                continue
            result[name] = {
                "models": client.list_models(),
                "roles": [r.value for r in genai_cfg.roles],
                "supports_toggleable_thinking": client.supports_toggleable_thinking,
            }
        return result
--- a/frigate/genai/plugins/gemini.py
+++ b/frigate/genai/plugins/gemini.py
@ -62,6 +62,7 @@ class GeminiClient(GenAIClient):
        prompt: str,
        images: list[bytes],
        response_format: Optional[dict] = None,
        enable_thinking: bool = False,
    ) -> Optional[str]:
        """Submit a request to Gemini."""
        contents = [prompt] + [
@ -119,11 +120,14 @@ class GeminiClient(GenAIClient):
        messages: list[dict[str, Any]],
        tools: Optional[list[dict[str, Any]]] = None,
        tool_choice: Optional[str] = "auto",
        enable_thinking: Optional[bool] = None,
    ) -> dict[str, Any]:
        """
        Send chat messages to Gemini with optional tool definitions.
-        Implements function calling/tool usage for Gemini models.
+        Implements function calling/tool usage for Gemini models. Thinking is
        configured at the model level for Gemini, so ``enable_thinking`` is
        accepted for interface parity and ignored.
        """
        try:
            # Convert messages to Gemini format
--- a/frigate/genai/plugins/llama_cpp.py
+++ b/frigate/genai/plugins/llama_cpp.py
@ -122,6 +122,7 @@ class LlamaCppClient(GenAIClient):
    _supports_vision: bool
    _supports_audio: bool
    _supports_tools: bool
    _supports_reasoning: bool
    _image_token_cache: dict[tuple[int, int], int]
    _text_baseline_tokens: int | None
    _media_marker: str
@ -135,6 +136,7 @@ class LlamaCppClient(GenAIClient):
        self._supports_vision = False
        self._supports_audio = False
        self._supports_tools = False
        self._supports_reasoning = False
        self._image_token_cache = {}
        self._text_baseline_tokens = None
        self._media_marker = "<__media__>"
@ -164,15 +166,17 @@ class LlamaCppClient(GenAIClient):
        self._supports_vision = info["supports_vision"]
        self._supports_audio = info["supports_audio"]
        self._supports_tools = info["supports_tools"]
        self._supports_reasoning = info["supports_reasoning"]
        self._media_marker = info["media_marker"]
        logger.info(
-            "llama.cpp model '%s' initialized — context: %s, vision: %s, audio: %s, tools: %s",
+            "llama.cpp model '%s' initialized — context: %s, vision: %s, audio: %s, tools: %s, reasoning: %s",
            configured_model,
            self._context_size or "unknown",
            self._supports_vision,
            self._supports_audio,
            self._supports_tools,
            self._supports_reasoning,
        )
        return base_url
@ -200,6 +204,7 @@ class LlamaCppClient(GenAIClient):
            "supports_vision": False,
            "supports_audio": False,
            "supports_tools": False,
            "supports_reasoning": False,
            "media_marker": "<__media__>",
        }
@ -279,10 +284,17 @@ class LlamaCppClient(GenAIClient):
                info["supports_vision"] = bool(modalities.get("vision", False))
                info["supports_audio"] = bool(modalities.get("audio", False))
            chat_caps = props.get("chat_template_caps") or {}
            if not info["supports_tools"]:
                chat_caps = props.get("chat_template_caps", {})
                info["supports_tools"] = bool(chat_caps.get("supports_tools", False))
            # llama.cpp does not advertise per-template reasoning support, so
            # detect it by looking for the `enable_thinking` toggle variable
            # in the Jinja chat template itself.
            chat_template = props.get("chat_template") or ""
            info["supports_reasoning"] = "enable_thinking" in chat_template
            media_marker = props.get("media_marker")
            if isinstance(media_marker, str) and media_marker:
                info["media_marker"] = media_marker
@ -300,6 +312,7 @@ class LlamaCppClient(GenAIClient):
        prompt: str,
        images: list[bytes],
        response_format: Optional[dict] = None,
        enable_thinking: bool = False,
    ) -> Optional[str]:
        """Submit a request to llama.cpp server."""
        if self.provider is None:
@ -327,7 +340,7 @@ class LlamaCppClient(GenAIClient):
                )
            # Build request payload with llama.cpp native options
-            payload = {
+            payload: dict[str, Any] = {
                "model": self.genai_config.model,
                "messages": [
                    {
@ -341,6 +354,9 @@ class LlamaCppClient(GenAIClient):
            if response_format:
                payload["response_format"] = response_format
            if self.supports_toggleable_thinking:
                payload["chat_template_kwargs"] = {"enable_thinking": enable_thinking}
            response = requests.post(
                f"{self.provider}/v1/chat/completions",
                json=payload,
@ -377,6 +393,10 @@ class LlamaCppClient(GenAIClient):
        """Whether the loaded model supports tool/function calling."""
        return self._supports_tools
    @property
    def supports_toggleable_thinking(self) -> bool:
        """Whether the loaded model exposes a per-request thinking toggle.

        Returns the ``_supports_reasoning`` flag stored on this client.
        """
        return self._supports_reasoning
    def list_models(self) -> list[str]:
        """Return available model IDs from the llama.cpp server."""
        base_url = self.provider or (
@ -504,6 +524,7 @@ class LlamaCppClient(GenAIClient):
        tools: Optional[list[dict[str, Any]]],
        tool_choice: Optional[str],
        stream: bool = False,
        enable_thinking: Optional[bool] = None,
    ) -> dict[str, Any]:
        """Build request payload for chat completions (sync or stream)."""
        openai_tool_choice = None
@ -519,14 +540,21 @@ class LlamaCppClient(GenAIClient):
            "messages": messages,
            "model": self.genai_config.model,
        }
        if stream:
            payload["stream"] = True
            payload["stream_options"] = {"include_usage": True}
            payload["timings_per_token"] = True
        if tools:
            payload["tools"] = tools
            if openai_tool_choice is not None:
                payload["tool_choice"] = openai_tool_choice
        if enable_thinking is not None and self._supports_reasoning:
            payload["chat_template_kwargs"] = {"enable_thinking": enable_thinking}
        provider_opts = {
            k: v for k, v in self.provider_options.items() if k != "context_size"
        }
@ -732,6 +760,7 @@ class LlamaCppClient(GenAIClient):
        messages: list[dict[str, Any]],
        tools: Optional[list[dict[str, Any]]] = None,
        tool_choice: Optional[str] = "auto",
        enable_thinking: Optional[bool] = None,
    ) -> dict[str, Any]:
        """
        Send chat messages to llama.cpp server with optional tool definitions.
@ -749,7 +778,13 @@ class LlamaCppClient(GenAIClient):
                "finish_reason": "error",
            }
        try:
-            payload = self._build_payload(messages, tools, tool_choice, stream=False)
+            payload = self._build_payload(
                messages,
                tools,
                tool_choice,
                stream=False,
                enable_thinking=enable_thinking,
            )
            response = requests.post(
                f"{self.provider}/v1/chat/completions",
                json=payload,
@ -797,6 +832,7 @@ class LlamaCppClient(GenAIClient):
        messages: list[dict[str, Any]],
        tools: Optional[list[dict[str, Any]]] = None,
        tool_choice: Optional[str] = "auto",
        enable_thinking: Optional[bool] = None,
    ) -> AsyncGenerator[tuple[str, Any], None]:
        """Stream chat with tools via OpenAI-compatible streaming API."""
        if self.provider is None:
@ -813,7 +849,13 @@ class LlamaCppClient(GenAIClient):
            )
            return
        try:
-            payload = self._build_payload(messages, tools, tool_choice, stream=True)
+            payload = self._build_payload(
                messages,
                tools,
                tool_choice,
                stream=True,
                enable_thinking=enable_thinking,
            )
            content_parts: list[str] = []
            reasoning_parts: list[str] = []
            tool_calls_by_index: dict[int, dict[str, Any]] = {}
--- a/frigate/genai/plugins/ollama.py
+++ b/frigate/genai/plugins/ollama.py
@ -98,6 +98,22 @@ class OllamaClient(GenAIClient):
    provider: ApiClient | None
    provider_options: dict[str, Any]
    _supports_thinking_cache: Optional[bool] = None
    @property
    def supports_toggleable_thinking(self) -> bool:
        """Whether the configured Ollama model reports the "thinking" capability.

        The answer is looked up with ``self.provider.show(...)`` and memoized in
        ``_supports_thinking_cache``. When no provider is set, False is returned
        without touching the cache, so a later call performs the lookup again.
        Any exception raised during the lookup is logged at debug level and
        cached as False.
        """
        # Memoized result from an earlier lookup (either True or False).
        if self._supports_thinking_cache is not None:
            return self._supports_thinking_cache
        # No client available: report unsupported but leave the cache unset.
        if self.provider is None:
            return False
        try:
            response = self.provider.show(self.genai_config.model)
            # A missing or falsy "capabilities" entry is treated as an empty list.
            capabilities = response.get("capabilities") or []
            self._supports_thinking_cache = "thinking" in capabilities
        except Exception as e:
            # Best-effort probe: a failure is remembered as "not supported";
            # nothing in this block resets the cache afterwards.
            logger.debug("Failed to query Ollama model capabilities: %s", e)
            self._supports_thinking_cache = False
        return self._supports_thinking_cache
    def _auth_headers(self) -> dict | None:
        if self.genai_config.api_key:
@ -178,6 +194,7 @@ class OllamaClient(GenAIClient):
        prompt: str,
        images: list[bytes],
        response_format: Optional[dict] = None,
        enable_thinking: bool = False,
    ) -> Optional[str]:
        """Submit a request to Ollama"""
        if self.provider is None:
@ -194,6 +211,8 @@ class OllamaClient(GenAIClient):
                schema = response_format.get("json_schema", {}).get("schema")
                if schema:
                    ollama_options["format"] = self._clean_schema_for_ollama(schema)
            if self.supports_toggleable_thinking:
                ollama_options["think"] = enable_thinking
            logger.debug(
                "Ollama generate request: model=%s, prompt_len=%s, image_count=%s, "
                "has_format=%s, options=%s",
@ -274,6 +293,7 @@ class OllamaClient(GenAIClient):
        tools: Optional[list[dict[str, Any]]],
        tool_choice: Optional[str],
        stream: bool = False,
        enable_thinking: Optional[bool] = None,
    ) -> dict[str, Any]:
        """Build request_messages and params for chat (sync or stream)."""
        request_messages = []
@ -318,6 +338,8 @@ class OllamaClient(GenAIClient):
            request_params["stream"] = True
        if tools:
            request_params["tools"] = tools
        if enable_thinking is not None and self.supports_toggleable_thinking:
            request_params["think"] = enable_thinking
        return request_params
    def _message_from_response(self, response: dict[str, Any]) -> dict[str, Any]:
@ -365,6 +387,7 @@ class OllamaClient(GenAIClient):
        messages: list[dict[str, Any]],
        tools: Optional[list[dict[str, Any]]] = None,
        tool_choice: Optional[str] = "auto",
        enable_thinking: Optional[bool] = None,
    ) -> dict[str, Any]:
        if self.provider is None:
            logger.warning(
@ -377,7 +400,11 @@ class OllamaClient(GenAIClient):
            }
        try:
            request_params = self._build_request_params(
-                messages, tools, tool_choice, stream=False
+                messages,
                tools,
                tool_choice,
                stream=False,
                enable_thinking=enable_thinking,
            )
            response = self.provider.chat(**request_params)
            return self._message_from_response(response)
@ -401,6 +428,7 @@ class OllamaClient(GenAIClient):
        messages: list[dict[str, Any]],
        tools: Optional[list[dict[str, Any]]] = None,
        tool_choice: Optional[str] = "auto",
        enable_thinking: Optional[bool] = None,
    ) -> AsyncGenerator[tuple[str, Any], None]:
        """Stream chat with tools; yields content deltas then final message.
@ -430,7 +458,11 @@ class OllamaClient(GenAIClient):
                    "Ollama: tools provided, using non-streaming call for tool support"
                )
                request_params = self._build_request_params(
-                    messages, tools, tool_choice, stream=False
+                    messages,
                    tools,
                    tool_choice,
                    stream=False,
                    enable_thinking=enable_thinking,
                )
                async_client = OllamaAsyncClient(
                    host=self.genai_config.base_url,
@ -452,7 +484,11 @@ class OllamaClient(GenAIClient):
                return
            request_params = self._build_request_params(
-                messages, tools, tool_choice, stream=True
+                messages,
                tools,
                tool_choice,
                stream=True,
                enable_thinking=enable_thinking,
            )
            async_client = OllamaAsyncClient(
                host=self.genai_config.base_url,
--- a/frigate/genai/plugins/openai.py
+++ b/frigate/genai/plugins/openai.py
@ -61,6 +61,7 @@ class OpenAIClient(GenAIClient):
        prompt: str,
        images: list[bytes],
        response_format: Optional[dict] = None,
        enable_thinking: bool = False,
    ) -> Optional[str]:
        """Submit a request to OpenAI."""
        encoded_images = [base64.b64encode(image).decode("utf-8") for image in images]
@ -187,11 +188,14 @@ class OpenAIClient(GenAIClient):
        messages: list[dict[str, Any]],
        tools: Optional[list[dict[str, Any]]] = None,
        tool_choice: Optional[str] = "auto",
        enable_thinking: Optional[bool] = None,
    ) -> dict[str, Any]:
        """
        Send chat messages to OpenAI with optional tool definitions.
-        Implements function calling/tool usage for OpenAI models.
+        Implements function calling/tool usage for OpenAI models. The OpenAI
        chat completions API does not expose a per-request thinking toggle,
        so ``enable_thinking`` is accepted for interface parity and ignored.
        """
        try:
            openai_tool_choice = None
--- a/web/public/locales/en/views/chat.json
+++ b/web/public/locales/en/views/chat.json
@ -65,5 +65,8 @@
    "active": "Reasoning…",
    "show": "Show reasoning",
    "hide": "Hide reasoning"
  },
  "thinking": {
    "toggle": "Toggle thinking"
  }
 }
--- a/web/src/components/chat/ChatComposer.tsx
+++ b/web/src/components/chat/ChatComposer.tsx
@ -0,0 +1,147 @@
 import { Button } from "@/components/ui/button";
 import { Input } from "@/components/ui/input";
 import { FaArrowUpLong, FaStop } from "react-icons/fa6";
 import { LuBrain } from "react-icons/lu";
 import { useTranslation } from "react-i18next";
 import { cn } from "@/lib/utils";
 import {
  Tooltip,
  TooltipContent,
  TooltipProvider,
  TooltipTrigger,
 } from "@/components/ui/tooltip";
 import { ChatAttachmentChip } from "@/components/chat/ChatAttachmentChip";
 import { ChatQuickReplies } from "@/components/chat/ChatQuickReplies";
 import { ChatPaperclipButton } from "@/components/chat/ChatPaperclipButton";
 // Props accepted by the ChatComposer component.
 type ChatComposerProps = {
  // Current text shown in the input field.
  input: string;
  // Called with the new text on every input change.
  setInput: (value: string) => void;
  // Called with no argument on Enter (without Shift) and on send-button click;
  // called with the reply text when a quick reply is chosen.
  sendMessage: (textOverride?: string) => void;
  // Placeholder text for the input field.
  placeholder: string;
  // When true, the thinking toggle button is rendered.
  supportsThinking: boolean;
  // Current toggle state; drives the button variant and aria-pressed.
  thinkingEnabled: boolean;
  // Called with the negated thinkingEnabled value when the toggle is clicked.
  // NOTE(review): the `undefined` in the parameter type is never passed by
  // this component — presumably it mirrors the caller's setter type; confirm.
  setThinkingEnabled: (value: boolean | undefined) => void;
  // Defaults to false. While true, the send button, quick replies, paperclip
  // button and thinking toggle are disabled and the input is marked aria-busy.
  isLoading?: boolean;
  // Click handler for the stop button, which replaces the send button only
  // while isLoading is true and this handler is provided.
  onStop?: () => void;
  // When set, quick replies are shown and the paperclip button is disabled.
  attachedEventId?: string | null;
  // When provided together with attachedEventId, the attachment chip is shown
  // and this is passed to it as its remove handler.
  onClearAttachment?: () => void;
  // When provided, the paperclip button is rendered and receives this handler.
  onAttach?: (eventId: string) => void;
  // Passed to the paperclip button; an empty array is used when omitted.
  recentEventIds?: string[];
  // Defaults to false. When true, the input gets the "h-12 text-base" classes.
  large?: boolean;
 };
 /**
  * Chat input row: optional attachment chip and quick replies on top, then a
  * row with an optional paperclip button, an optional thinking toggle, the
  * text input, and either a stop button or a send button.
  *
  * The component keeps no state of its own; the input text, thinking state
  * and loading state are all supplied through props.
  */
 export function ChatComposer({
  input,
  setInput,
  sendMessage,
  placeholder,
  supportsThinking,
  thinkingEnabled,
  setThinkingEnabled,
  isLoading = false,
  onStop,
  attachedEventId,
  onClearAttachment,
  onAttach,
  recentEventIds,
  large = false,
 }: ChatComposerProps) {
  const { t } = useTranslation(["views/chat"]);
  // Enter without Shift sends the message. This handler checks neither
  // isLoading nor empty input itself — NOTE(review): presumably sendMessage
  // guards against both; confirm in the callers.
  const handleKeyDown = (e: React.KeyboardEvent<HTMLInputElement>) => {
    if (e.key === "Enter" && !e.shiftKey) {
      e.preventDefault();
      sendMessage();
    }
  };
  // The paperclip button is rendered only when an attach handler is supplied.
  const showPaperclip = !!onAttach;
  // The stop button replaces the send button only while loading and only when
  // a stop handler is supplied.
  const showStop = isLoading && !!onStop;
  return (
    <div className="flex w-full flex-col items-stretch justify-center gap-2 rounded-xl bg-secondary p-3">
      {attachedEventId && onClearAttachment && (
        <div className="flex items-center">
          <ChatAttachmentChip
            eventId={attachedEventId}
            mode="composer"
            onRemove={onClearAttachment}
          />
        </div>
      )}
      {attachedEventId && (
        <ChatQuickReplies
          onSend={(text) => sendMessage(text)}
          disabled={isLoading}
        />
      )}
      <div className="flex w-full flex-row items-center gap-2">
        {showPaperclip && (
          <ChatPaperclipButton
            recentEventIds={recentEventIds ?? []}
            onAttach={onAttach!}
            disabled={isLoading || attachedEventId != null}
          />
        )}
        {supportsThinking && (
          <TooltipProvider>
            <Tooltip>
              <TooltipTrigger asChild>
                <Button
                  type="button"
                  size="sm"
                  variant={thinkingEnabled ? "select" : "ghost"}
                  aria-pressed={thinkingEnabled}
                  aria-label={t("thinking.toggle")}
                  className={cn(
                    "flex size-9 shrink-0 items-center justify-center rounded-full p-0",
                    !thinkingEnabled && "text-secondary-foreground",
                  )}
                  onClick={() => setThinkingEnabled(!thinkingEnabled)}
                  disabled={isLoading}
                >
                  <LuBrain className="size-4" />
                </Button>
              </TooltipTrigger>
              <TooltipContent>{t("thinking.toggle")}</TooltipContent>
            </Tooltip>
          </TooltipProvider>
        )}
        <Input
          className={cn(
            "w-full flex-1 border-transparent bg-transparent shadow-none focus-visible:ring-0 dark:bg-transparent",
            large && "h-12 text-base",
          )}
          placeholder={placeholder}
          value={input}
          onChange={(e) => setInput(e.target.value)}
          onKeyDown={handleKeyDown}
          aria-busy={isLoading}
        />
        {showStop ? (
          <Button
            variant="destructive"
            className="size-10 shrink-0 rounded-full"
            onClick={onStop}
          >
            <FaStop className="size-3" />
          </Button>
        ) : (
          <Button
            variant="select"
            className="size-10 shrink-0 rounded-full"
            disabled={!input.trim() || isLoading}
            onClick={() => sendMessage()}
          >
            <FaArrowUpLong className="size-4" />
          </Button>
        )}
      </div>
    </div>
  );
 }
--- a/web/src/components/chat/ChatStartingState.tsx
+++ b/web/src/components/chat/ChatStartingState.tsx
@ -1,15 +1,22 @@
 import { Button } from "@/components/ui/button";
 import { Input } from "@/components/ui/input";
 import { FaArrowUpLong } from "react-icons/fa6";
 import { useTranslation } from "react-i18next";
 import { useState } from "react";
 import type { StartingRequest } from "@/types/chat";
 import { ChatComposer } from "@/components/chat/ChatComposer";
 type ChatStartingStateProps = {
  onSendMessage: (message: string) => void;
  supportsThinking: boolean;
  thinkingEnabled: boolean;
  setThinkingEnabled: (value: boolean | undefined) => void;
 };
-export function ChatStartingState({ onSendMessage }: ChatStartingStateProps) {
+export function ChatStartingState({
  onSendMessage,
  supportsThinking,
  thinkingEnabled,
  setThinkingEnabled,
 }: ChatStartingStateProps) {
  const { t } = useTranslation(["views/chat"]);
  const [input, setInput] = useState("");
@ -36,20 +43,13 @@ export function ChatStartingState({ onSendMessage }: ChatStartingStateProps) {
    onSendMessage(prompt);
  };
-  const handleSubmit = () => {
+  const handleSend = (textOverride?: string) => {
-    const text = input.trim();
+    const text = (textOverride ?? input).trim();
    if (!text) return;
    onSendMessage(text);
    setInput("");
  };
  const handleKeyDown = (e: React.KeyboardEvent<HTMLInputElement>) => {
    if (e.key === "Enter" && !e.shiftKey) {
      e.preventDefault();
      handleSubmit();
    }
  };
  return (
    <div className="flex size-full flex-col items-center justify-center gap-6 p-8">
      <div className="flex flex-col items-center gap-2">
@ -77,22 +77,17 @@ export function ChatStartingState({ onSendMessage }: ChatStartingStateProps) {
        </div>
      </div>
-      <div className="flex w-full max-w-2xl flex-row items-center gap-2 rounded-xl bg-secondary p-3">
+      <div className="w-full max-w-2xl">
-        <Input
+        <ChatComposer
-          className="h-12 w-full flex-1 border-transparent bg-transparent text-base shadow-none focus-visible:ring-0 dark:bg-transparent"
+          input={input}
          setInput={setInput}
          sendMessage={handleSend}
          placeholder={t("placeholder")}
-          value={input}
+          supportsThinking={supportsThinking}
-          onChange={(e) => setInput(e.target.value)}
+          thinkingEnabled={thinkingEnabled}
-          onKeyDown={handleKeyDown}
+          setThinkingEnabled={setThinkingEnabled}
          large
        />
        <Button
          variant="select"
          className="size-10 shrink-0 rounded-full"
          disabled={!input.trim()}
          onClick={handleSubmit}
        >
          <FaArrowUpLong size="18" />
        </Button>
      </div>
    </div>
  );
--- a/web/src/components/chat/ReasoningBubble.tsx
+++ b/web/src/components/chat/ReasoningBubble.tsx
@ -8,6 +8,12 @@ import {
 } from "@/components/ui/collapsible";
 import { Button } from "@/components/ui/button";
 import { cn } from "@/lib/utils";
 import {
  Tooltip,
  TooltipContent,
  TooltipProvider,
  TooltipTrigger,
 } from "@/components/ui/tooltip";
 type ReasoningBubbleProps = {
  /** The accumulated reasoning text from the model. */
@ -54,34 +60,42 @@ export function ReasoningBubble({
  return (
    <div className="self-start rounded-2xl bg-muted/60 px-3 py-2 text-muted-foreground">
-      <Collapsible open={open} onOpenChange={handleOpenChange}>
+      <TooltipProvider>
-        <CollapsibleTrigger asChild>
+        <Collapsible open={open} onOpenChange={handleOpenChange}>
-          <Button
+          <CollapsibleTrigger asChild>
-            variant="ghost"
+            <Button
-            size="sm"
+              variant="ghost"
-            className="h-auto w-full min-w-0 justify-start gap-2 whitespace-normal p-0 text-left text-xs hover:bg-transparent"
+              size="sm"
-          >
+              className="h-auto w-full min-w-0 justify-start gap-2 whitespace-normal p-0 text-left text-xs hover:bg-transparent"
-            <LuBrain
+            >
-              className={cn(
+              <Tooltip>
-                "size-3 shrink-0",
+                <TooltipTrigger asChild>
-                !answerStarted && "animate-pulse",
+                  <div className="flex items-center gap-2">
-              )}
+                    <LuBrain
-            />
+                      className={cn(
-            <span className="break-words font-medium">{label}</span>
+                        "size-3 shrink-0",
-            {answerStarted &&
+                        !answerStarted && "animate-pulse",
-              (open ? (
+                      )}
-                <LuChevronDown className="ml-auto size-3 shrink-0" />
+                    />
-              ) : (
+                  </div>
-                <LuChevronRight className="ml-auto size-3 shrink-0" />
+                </TooltipTrigger>
-              ))}
+                <TooltipContent>{label}</TooltipContent>
-          </Button>
+              </Tooltip>
-        </CollapsibleTrigger>
+              {answerStarted &&
-        <CollapsibleContent>
+                (open ? (
-          <pre className="scrollbar-container mt-2 max-h-64 overflow-auto whitespace-pre-wrap break-words rounded bg-muted/50 p-2 font-sans text-xs leading-relaxed">
+                  <LuChevronDown className="ml-auto size-3 shrink-0" />
-            {reasoning}
+                ) : (
-          </pre>
+                  <LuChevronRight className="ml-auto size-3 shrink-0" />
-        </CollapsibleContent>
+                ))}
-      </Collapsible>
+            </Button>
          </CollapsibleTrigger>
          <CollapsibleContent>
            <pre className="scrollbar-container mt-2 max-h-64 overflow-auto whitespace-pre-wrap break-words rounded bg-muted/50 p-2 font-sans text-xs leading-relaxed">
              {reasoning}
            </pre>
          </CollapsibleContent>
        </Collapsible>
      </TooltipProvider>
    </div>
  );
 }
--- a/web/src/components/config-form/theme/widgets/GenAIModelWidget.tsx
+++ b/web/src/components/config-form/theme/widgets/GenAIModelWidget.tsx
@ -23,6 +23,7 @@ import {
  PopoverTrigger,
 } from "@/components/ui/popover";
 import type { ConfigFormContext, JsonObject } from "@/types/configForm";
 import type { GenAIModelsResponse } from "@/types/chat";
 import { getSizedFieldClassName } from "../utils";
 type ProbeResponse =
@ -73,11 +74,12 @@ export function GenAIModelWidget(props: WidgetProps) {
    return `${e.provider ?? ""}|${e.base_url ?? ""}`;
  }, [providerKey, formContext?.fullConfig]);
-  const { data: allModels, mutate: mutateModels } = useSWR<
+  const { data: allModels, mutate: mutateModels } = useSWR<GenAIModelsResponse>(
-    Record<string, string[]>
+    "genai/models",
-  >("genai/models", {
+    {
-    revalidateOnFocus: false,
+      revalidateOnFocus: false,
-  });
+    },
  );
  // Revalidate models when the saved config fingerprint changes (e.g. after
  // switching provider or base_url and saving).
@ -89,9 +91,9 @@ export function GenAIModelWidget(props: WidgetProps) {
    }
  }, [configFingerprint, mutateModels]);
-  const fetchedModels = useMemo(() => {
+  const fetchedModels = useMemo<string[]>(() => {
    if (!allModels || !providerKey) return [];
-    return allModels[providerKey] ?? [];
+    return allModels[providerKey]?.models ?? [];
  }, [allModels, providerKey]);
  const [probeStatus, setProbeStatus] = useState<ProbeStatus>("idle");
--- a/web/src/pages/Chat.tsx
+++ b/web/src/pages/Chat.tsx
@ -1,20 +1,21 @@
 import { Button } from "@/components/ui/button";
 import { Input } from "@/components/ui/input";
 import { FaArrowUpLong, FaStop } from "react-icons/fa6";
 import { LuCircleAlert, LuMessageSquarePlus } from "react-icons/lu";
 import { useTranslation } from "react-i18next";
 import { useState, useCallback, useRef, useEffect, useMemo } from "react";
 import axios from "axios";
 import useSWR from "swr";
 import { ChatEventThumbnailsRow } from "@/components/chat/ChatEventThumbnailsRow";
 import { MessageBubble } from "@/components/chat/ChatMessage";
 import { ReasoningBubble } from "@/components/chat/ReasoningBubble";
 import { ToolCallsGroup } from "@/components/chat/ToolCallsGroup";
 import { ChatStartingState } from "@/components/chat/ChatStartingState";
-import { ChatAttachmentChip } from "@/components/chat/ChatAttachmentChip";
+import { ChatComposer } from "@/components/chat/ChatComposer";
 import { ChatQuickReplies } from "@/components/chat/ChatQuickReplies";
 import { ChatPaperclipButton } from "@/components/chat/ChatPaperclipButton";
 import ChatSettings from "@/components/chat/ChatSettings";
-import type { ChatMessage, ShowStatsMode } from "@/types/chat";
+import type {
  ChatMessage,
  GenAIModelsResponse,
  ShowStatsMode,
 } from "@/types/chat";
 import { usePersistence } from "@/hooks/use-persistence";
 import {
  getEventIdsFromSearchObjectsToolCalls,
@ -38,9 +39,26 @@ export default function ChatPage() {
    "chat-auto-scroll",
    true,
  );
  const [thinkingEnabled, setThinkingEnabled] = usePersistence<boolean>(
    "chat-thinking-enabled",
    false,
  );
  const scrollRef = useRef<HTMLDivElement>(null);
  const abortRef = useRef<AbortController | null>(null);
  const { data: genaiInfo } = useSWR<GenAIModelsResponse>("genai/models", {
    revalidateOnFocus: false,
  });
  // True when at least one provider that serves the "chat" role reports
  // that its thinking mode can be toggled per request.
  const supportsThinking = useMemo(() => {
    if (!genaiInfo) {
      return false;
    }
    return Object.values(genaiInfo).some(
      (provider) =>
        provider.roles?.includes("chat") &&
        provider.supports_toggleable_thinking,
    );
  }, [genaiInfo]);
  useEffect(() => {
    document.title = t("documentTitle");
  }, [t]);
@ -100,9 +118,10 @@ export default function ChatPage() {
          defaultErrorMessage: t("error"),
        },
        controller.signal,
        supportsThinking ? { enableThinking: !!thinkingEnabled } : {},
      );
    },
-    [isLoading, t],
+    [isLoading, supportsThinking, t, thinkingEnabled],
  );
  const recentEventIds = useMemo(() => {
@ -305,6 +324,9 @@ export default function ChatPage() {
                  setInput("");
                  submitConversation([{ role: "user", content: message }]);
                }}
                supportsThinking={supportsThinking}
                thinkingEnabled={!!thinkingEnabled}
                setThinkingEnabled={setThinkingEnabled}
              />
            )}
          </div>
@ -313,7 +335,7 @@ export default function ChatPage() {
      {hasStarted && (
        <div className="flex shrink-0 justify-center p-2 md:px-4 md:pb-4">
          <div className="flex w-full xl:w-[50%] 3xl:w-[35%]">
-            <ChatEntry
+            <ChatComposer
              input={input}
              setInput={setInput}
              sendMessage={sendMessage}
@ -324,6 +346,9 @@ export default function ChatPage() {
              onAttach={setAttachedEventId}
              onStop={stopGeneration}
              recentEventIds={recentEventIds}
              supportsThinking={supportsThinking}
              thinkingEnabled={!!thinkingEnabled}
              setThinkingEnabled={setThinkingEnabled}
            />
          </div>
        </div>
@ -331,89 +356,3 @@ export default function ChatPage() {
    </div>
  );
 }
 /** Props accepted by the ChatEntry composer row. */
 type ChatEntryProps = {
  // Current text shown in the input field (controlled value).
  input: string;
  // Called with the new text on every input change.
  setInput: (value: string) => void;
  // Submits a message; called with no argument from the Enter key and the
  // send button, and with the chosen text from the quick replies.
  sendMessage: (textOverride?: string) => void;
  // While true, the send button is replaced by a stop button and the
  // quick replies and paperclip button are disabled.
  isLoading: boolean;
  // Placeholder text for the input field.
  placeholder: string;
  // When non-null, the attachment chip and quick replies are rendered and
  // the paperclip button is disabled.
  attachedEventId: string | null;
  // Passed to the attachment chip as its remove handler.
  onClearAttachment: () => void;
  // Passed to the paperclip button as its attach handler.
  onAttach: (eventId: string) => void;
  // Invoked when the stop button is clicked.
  onStop: () => void;
  // Forwarded to the paperclip button.
  recentEventIds: string[];
 };
 /**
  * Composer row for the chat page: an optional attachment chip and quick
  * replies (shown only while an event is attached), followed by the
  * paperclip button, the text input, and a send/stop button.
  *
  * Pressing Enter (without Shift) submits via `sendMessage()`. Enter presses
  * that belong to an active IME composition are ignored so that confirming a
  * candidate does not send a half-composed message.
  */
 function ChatEntry({
  input,
  setInput,
  sendMessage,
  isLoading,
  placeholder,
  attachedEventId,
  onClearAttachment,
  onAttach,
  onStop,
  recentEventIds,
 }: ChatEntryProps) {
  const handleKeyDown = (e: React.KeyboardEvent<HTMLInputElement>) => {
    // While an IME composition is in progress, Enter confirms the candidate
    // text; it must not be treated as "send".
    if (e.nativeEvent.isComposing) {
      return;
    }
    if (e.key === "Enter" && !e.shiftKey) {
      // Prevent the default Enter handling before submitting.
      e.preventDefault();
      sendMessage();
    }
  };
  return (
    <div className="flex w-full flex-col items-stretch justify-center gap-2 rounded-xl bg-secondary p-3">
      {/* Attachment chip and quick replies only appear with an attached event. */}
      {attachedEventId && (
        <div className="flex items-center">
          <ChatAttachmentChip
            eventId={attachedEventId}
            mode="composer"
            onRemove={onClearAttachment}
          />
        </div>
      )}
      {attachedEventId && (
        <ChatQuickReplies
          onSend={(text) => sendMessage(text)}
          disabled={isLoading}
        />
      )}
      <div className="flex w-full flex-row items-center gap-2">
        {/* Only one attachment at a time: disabled once an event is attached. */}
        <ChatPaperclipButton
          recentEventIds={recentEventIds}
          onAttach={onAttach}
          disabled={isLoading || attachedEventId != null}
        />
        <Input
          className="w-full flex-1 border-transparent bg-transparent shadow-none focus-visible:ring-0 dark:bg-transparent"
          placeholder={placeholder}
          value={input}
          onChange={(e) => setInput(e.target.value)}
          onKeyDown={handleKeyDown}
          aria-busy={isLoading}
        />
        {/* While a response is loading the send button becomes a stop button. */}
        {isLoading ? (
          <Button
            variant="destructive"
            className="size-10 shrink-0 rounded-full"
            onClick={onStop}
          >
            <FaStop className="size-3" />
          </Button>
        ) : (
          <Button
            variant="select"
            className="size-10 shrink-0 rounded-full"
            disabled={!input.trim()}
            onClick={() => sendMessage()}
          >
            <FaArrowUpLong className="size-4" />
          </Button>
        )}
      </div>
    </div>
  );
 }
--- a/web/src/types/chat.ts
+++ b/web/src/types/chat.ts
@ -25,3 +25,11 @@ export type ChatStats = {
 };
 export type ShowStatsMode = "while_generating" | "always";
 /** Per-provider entry in the `genai/models` response. */
 export type GenAIProviderInfo = {
  // Model identifiers listed for this provider.
  models: string[];
  // Roles this provider is configured for (e.g. "chat").
  roles: string[];
  // Whether the provider reports a thinking mode that can be toggled.
  supports_toggleable_thinking: boolean;
 };
 /** Shape of the `genai/models` response: provider key -> provider info. */
 export type GenAIModelsResponse = Record<string, GenAIProviderInfo>;
--- a/web/src/utils/chatUtil.ts
+++ b/web/src/utils/chatUtil.ts
@ -34,12 +34,17 @@ type StreamChunk =
 * POST to chat/completion with stream: true, parse NDJSON stream, and invoke
 * callbacks so the caller can update UI (e.g. React state).
 */
 export type StreamChatOptions = {
  // NOTE(review): the doc comment directly above this type describes
  // streamChatCompletion, not these options — consider moving this type
  // above that comment.
  //
  // Optional per-request thinking toggle. When defined it is sent as
  // `enable_thinking` in the request body; when undefined the field is
  // omitted from the request.
  enableThinking?: boolean;
 };
 export async function streamChatCompletion(
  url: string,
  headers: Record<string, string>,
  apiMessages: { role: string; content: string }[],
  callbacks: StreamChatCallbacks,
  signal?: AbortSignal,
  options: StreamChatOptions = {},
 ): Promise<void> {
  const {
    updateMessages,
@ -50,10 +55,17 @@ export async function streamChatCompletion(
  } = callbacks;
  try {
    const body: Record<string, unknown> = {
      messages: apiMessages,
      stream: true,
    };
    if (options.enableThinking !== undefined) {
      body.enable_thinking = options.enableThinking;
    }
    const res = await fetch(url, {
      method: "POST",
      headers,
-      body: JSON.stringify({ messages: apiMessages, stream: true }),
+      body: JSON.stringify(body),
      signal,
    });