Implement reasoning traces in the UI

This commit is contained in:
Nicolas Mowen 2026-05-19 09:55:49 -06:00
parent 31f9611d34
commit c0d784a45b
9 changed files with 176 additions and 4 deletions

View File

@ -1185,6 +1185,13 @@ async def chat_completion(
) )
+ b"\n" + b"\n"
) )
elif kind == "reasoning_delta":
yield (
json.dumps({"type": "reasoning", "delta": value}).encode(
"utf-8"
)
+ b"\n"
)
elif kind == "stats": elif kind == "stats":
yield ( yield (
json.dumps({"type": "stats", **value}).encode("utf-8") json.dumps({"type": "stats", **value}).encode("utf-8")
@ -1285,6 +1292,7 @@ async def chat_completion(
final_content = response.get("content") or "" final_content = response.get("content") or ""
if body.stream: if body.stream:
final_reasoning = response.get("reasoning")
async def stream_body() -> Any: async def stream_body() -> Any:
if tool_calls: if tool_calls:
@ -1299,6 +1307,15 @@ async def chat_completion(
).encode("utf-8") ).encode("utf-8")
+ b"\n" + b"\n"
) )
# Emit the full reasoning trace up front when the
# underlying client did not stream it
if final_reasoning:
yield (
json.dumps(
{"type": "reasoning", "delta": final_reasoning}
).encode("utf-8")
+ b"\n"
)
# Stream content in word-sized chunks for smooth UX # Stream content in word-sized chunks for smooth UX
for part in chunk_content(final_content): for part in chunk_content(final_content):
yield ( yield (
@ -1319,6 +1336,7 @@ async def chat_completion(
message=ChatMessageResponse( message=ChatMessageResponse(
role="assistant", role="assistant",
content=final_content, content=final_content,
reasoning=response.get("reasoning"),
tool_calls=None, tool_calls=None,
), ),
finish_reason=response.get("finish_reason", "stop"), finish_reason=response.get("finish_reason", "stop"),

View File

@ -20,6 +20,10 @@ class ChatMessageResponse(BaseModel):
content: Optional[str] = Field( content: Optional[str] = Field(
default=None, description="Message content (None if tool calls present)" default=None, description="Message content (None if tool calls present)"
) )
reasoning: Optional[str] = Field(
default=None,
description="Separated reasoning/thinking trace if the model emitted one",
)
tool_calls: Optional[list[ToolCallInvocation]] = Field( tool_calls: Optional[list[ToolCallInvocation]] = Field(
default=None, description="Tool calls if LLM wants to call tools" default=None, description="Tool calls if LLM wants to call tools"
) )

View File

@ -300,6 +300,10 @@ class GenAIClient:
Returns: Returns:
Dictionary with: Dictionary with:
- 'content': Optional[str] - The text response from the LLM, None if tool calls - 'content': Optional[str] - The text response from the LLM, None if tool calls
- 'reasoning': Optional[str] - The separated reasoning/thinking trace
if the model emitted one (e.g. via OpenAI-compatible
`reasoning_content`). None when the model does not surface a
trace or the provider does not parse it.
- 'tool_calls': Optional[List[Dict]] - List of tool calls if LLM wants to call tools. - 'tool_calls': Optional[List[Dict]] - List of tool calls if LLM wants to call tools.
Each tool call dict has: Each tool call dict has:
- 'id': str - Unique identifier for this tool call - 'id': str - Unique identifier for this tool call
@ -311,6 +315,14 @@ class GenAIClient:
- 'length': Hit token limit - 'length': Hit token limit
- 'error': An error occurred - 'error': An error occurred
Streaming counterpart `chat_with_tools_stream` yields
``(kind, value)`` tuples where ``kind`` is one of:
- 'content_delta': value is a string fragment of the answer
- 'reasoning_delta': value is a string fragment of the reasoning
trace (emitted before content for thinking models)
- 'stats': value is a usage stats dict
- 'message': value is the final dict shape described above
Raises: Raises:
NotImplementedError: If the provider doesn't implement this method. NotImplementedError: If the provider doesn't implement this method.
""" """
@ -321,6 +333,7 @@ class GenAIClient:
) )
return { return {
"content": None, "content": None,
"reasoning": None,
"tool_calls": None, "tool_calls": None,
"finish_reason": "error", "finish_reason": "error",
} }

View File

@ -531,16 +531,24 @@ class LlamaCppClient(GenAIClient):
return payload return payload
def _message_from_choice(self, choice: dict[str, Any]) -> dict[str, Any]: def _message_from_choice(self, choice: dict[str, Any]) -> dict[str, Any]:
"""Parse OpenAI-style choice into {content, tool_calls, finish_reason}.""" """Parse OpenAI-style choice into {content, reasoning, tool_calls, finish_reason}.
llama.cpp's `--reasoning-format` puts the trace in
`message.reasoning_content` (preferred) or `message.thinking`; both
keys are accepted so different builds work without configuration.
"""
message = choice.get("message", {}) message = choice.get("message", {})
content = message.get("content") content = message.get("content")
content = content.strip() if content else None content = content.strip() if content else None
reasoning = message.get("reasoning_content") or message.get("thinking")
reasoning = reasoning.strip() if reasoning else None
tool_calls = parse_tool_calls_from_message(message) tool_calls = parse_tool_calls_from_message(message)
finish_reason = choice.get("finish_reason") or ( finish_reason = choice.get("finish_reason") or (
"tool_calls" if tool_calls else "stop" if content else "error" "tool_calls" if tool_calls else "stop" if content else "error"
) )
return { return {
"content": content, "content": content,
"reasoning": reasoning,
"tool_calls": tool_calls, "tool_calls": tool_calls,
"finish_reason": finish_reason, "finish_reason": finish_reason,
} }
@ -803,6 +811,7 @@ class LlamaCppClient(GenAIClient):
try: try:
payload = self._build_payload(messages, tools, tool_choice, stream=True) payload = self._build_payload(messages, tools, tool_choice, stream=True)
content_parts: list[str] = [] content_parts: list[str] = []
reasoning_parts: list[str] = []
tool_calls_by_index: dict[int, dict[str, Any]] = {} tool_calls_by_index: dict[int, dict[str, Any]] = {}
finish_reason = "stop" finish_reason = "stop"
@ -832,6 +841,15 @@ class LlamaCppClient(GenAIClient):
delta = choices[0].get("delta", {}) delta = choices[0].get("delta", {})
if choices[0].get("finish_reason"): if choices[0].get("finish_reason"):
finish_reason = choices[0]["finish_reason"] finish_reason = choices[0]["finish_reason"]
# llama.cpp emits separated thinking under
# reasoning_content (preferred) or thinking before any
# content tokens arrive
reasoning_delta = delta.get("reasoning_content") or delta.get(
"thinking"
)
if reasoning_delta:
reasoning_parts.append(reasoning_delta)
yield ("reasoning_delta", reasoning_delta)
if delta.get("content"): if delta.get("content"):
content_parts.append(delta["content"]) content_parts.append(delta["content"])
yield ("content_delta", delta["content"]) yield ("content_delta", delta["content"])
@ -857,6 +875,7 @@ class LlamaCppClient(GenAIClient):
) )
full_content = "".join(content_parts).strip() or None full_content = "".join(content_parts).strip() or None
full_reasoning = "".join(reasoning_parts).strip() or None
tool_calls_list = self._streamed_tool_calls_to_list(tool_calls_by_index) tool_calls_list = self._streamed_tool_calls_to_list(tool_calls_by_index)
if tool_calls_list: if tool_calls_list:
finish_reason = "tool_calls" finish_reason = "tool_calls"
@ -864,6 +883,7 @@ class LlamaCppClient(GenAIClient):
"message", "message",
{ {
"content": full_content, "content": full_content,
"reasoning": full_reasoning,
"tool_calls": tool_calls_list, "tool_calls": tool_calls_list,
"finish_reason": finish_reason, "finish_reason": finish_reason,
}, },

View File

@ -60,5 +60,10 @@
"stats": { "stats": {
"context": "{{tokens}} tokens", "context": "{{tokens}} tokens",
"tokens_per_second": "{{rate}} t/s" "tokens_per_second": "{{rate}} t/s"
},
"reasoning": {
"thinking": "Thinking…",
"show": "Show reasoning",
"hide": "Hide reasoning"
} }
} }

View File

@ -0,0 +1,87 @@
import { useState, useEffect, useRef } from "react";
import { useTranslation } from "react-i18next";
import { LuBrain, LuChevronDown, LuChevronRight } from "react-icons/lu";
import {
Collapsible,
CollapsibleContent,
CollapsibleTrigger,
} from "@/components/ui/collapsible";
import { Button } from "@/components/ui/button";
import { cn } from "@/lib/utils";
/** Props accepted by the ReasoningBubble component. */
type ReasoningBubbleProps = {
/** The accumulated reasoning text from the model, rendered verbatim inside the panel. */
reasoning: string;
/**
* Whether the assistant has begun producing the user-facing answer.
* While false the reasoning is still streaming and we keep the panel
* open with a "Thinking…" label. Once true, the panel auto-collapses
* so the answer is the primary focus, but stays expandable.
* Automatic open/close stops for good once the user toggles the panel
* manually.
*/
answerStarted: boolean;
};
/**
 * Collapsible panel that displays a model's reasoning trace.
 *
 * While `answerStarted` is false the panel is expanded and labelled
 * "Thinking…" with a pulsing icon. When `answerStarted` turns true the panel
 * collapses by itself and the label switches to a show/hide toggle with a
 * chevron. As soon as the user toggles the panel manually, automatic
 * open/close is disabled for the lifetime of the component so their choice
 * is never overridden.
 *
 * @param reasoning - Accumulated reasoning text, rendered verbatim.
 * @param answerStarted - Whether answer content has begun to arrive.
 */
export function ReasoningBubble({
  reasoning,
  answerStarted,
}: ReasoningBubbleProps) {
  const { t } = useTranslation(["views/chat"]);

  // Seed the open state from the prop instead of a hard-coded `true`: a
  // bubble that mounts after the answer has already started (remount, or
  // reasoning and answer arriving in the same update) must render collapsed
  // on the very first paint rather than flashing open until the effect runs.
  const initiallyOpen = !answerStarted;
  const [open, setOpen] = useState(initiallyOpen);
  // Set once the user clicks the trigger; from then on we never auto-toggle.
  const userInteractedRef = useRef(false);
  // Last value applied automatically, so the effect only fires on transitions.
  const lastAutoState = useRef(initiallyOpen);

  useEffect(() => {
    if (userInteractedRef.current) return;
    const desired = !answerStarted;
    if (desired !== lastAutoState.current) {
      lastAutoState.current = desired;
      setOpen(desired);
    }
  }, [answerStarted]);

  const handleOpenChange = (next: boolean) => {
    userInteractedRef.current = true;
    setOpen(next);
  };

  // "Thinking…" takes priority while no answer tokens exist; afterwards the
  // label mirrors the action the trigger will perform.
  let label: string;
  if (!answerStarted) {
    label = t("reasoning.thinking");
  } else if (open) {
    label = t("reasoning.hide");
  } else {
    label = t("reasoning.show");
  }

  return (
    <div className="self-start rounded-2xl bg-muted/60 px-3 py-2 text-muted-foreground">
      <Collapsible open={open} onOpenChange={handleOpenChange}>
        <CollapsibleTrigger asChild>
          <Button
            variant="ghost"
            size="sm"
            className="h-auto w-full min-w-0 justify-start gap-2 whitespace-normal p-0 text-left text-xs hover:bg-transparent"
          >
            <LuBrain
              className={cn(
                "size-3 shrink-0",
                !answerStarted && "animate-pulse",
              )}
            />
            <span className="break-words font-medium">{label}</span>
            {/* The chevron only appears once the panel acts as a show/hide toggle. */}
            {answerStarted &&
              (open ? (
                <LuChevronDown className="ml-auto size-3 shrink-0" />
              ) : (
                <LuChevronRight className="ml-auto size-3 shrink-0" />
              ))}
          </Button>
        </CollapsibleTrigger>
        <CollapsibleContent>
          <pre className="scrollbar-container mt-2 max-h-64 overflow-auto whitespace-pre-wrap break-words rounded bg-muted/50 p-2 font-sans text-xs leading-relaxed">
            {reasoning}
          </pre>
        </CollapsibleContent>
      </Collapsible>
    </div>
  );
}

View File

@ -7,6 +7,7 @@ import { useState, useCallback, useRef, useEffect, useMemo } from "react";
import axios from "axios"; import axios from "axios";
import { ChatEventThumbnailsRow } from "@/components/chat/ChatEventThumbnailsRow"; import { ChatEventThumbnailsRow } from "@/components/chat/ChatEventThumbnailsRow";
import { MessageBubble } from "@/components/chat/ChatMessage"; import { MessageBubble } from "@/components/chat/ChatMessage";
import { ReasoningBubble } from "@/components/chat/ReasoningBubble";
import { ToolCallsGroup } from "@/components/chat/ToolCallsGroup"; import { ToolCallsGroup } from "@/components/chat/ToolCallsGroup";
import { ChatStartingState } from "@/components/chat/ChatStartingState"; import { ChatStartingState } from "@/components/chat/ChatStartingState";
import { ChatAttachmentChip } from "@/components/chat/ChatAttachmentChip"; import { ChatAttachmentChip } from "@/components/chat/ChatAttachmentChip";
@ -200,15 +201,18 @@ export default function ChatPage() {
const hasToolCalls = const hasToolCalls =
msg.toolCalls && msg.toolCalls.length > 0; msg.toolCalls && msg.toolCalls.length > 0;
const hasContent = !!msg.content?.trim(); const hasContent = !!msg.content?.trim();
const hasReasoning = !!msg.reasoning?.trim();
const showProcessing = const showProcessing =
isLastAssistant && isLoading && !hasContent; isLastAssistant && isLoading && !hasContent && !hasReasoning;
// Hide empty placeholder only when there are no tool calls yet // Hide empty placeholder only when there are no tool calls
// and no reasoning streaming yet
if ( if (
isLastAssistant && isLastAssistant &&
isLoading && isLoading &&
!hasContent && !hasContent &&
!hasToolCalls !hasToolCalls &&
!hasReasoning
) )
return ( return (
<div <div
@ -226,6 +230,12 @@ export default function ChatPage() {
{msg.role === "assistant" && hasToolCalls && ( {msg.role === "assistant" && hasToolCalls && (
<ToolCallsGroup toolCalls={msg.toolCalls!} /> <ToolCallsGroup toolCalls={msg.toolCalls!} />
)} )}
{msg.role === "assistant" && hasReasoning && (
<ReasoningBubble
reasoning={msg.reasoning!}
answerStarted={hasContent}
/>
)}
{showProcessing ? ( {showProcessing ? (
<div className="flex items-center gap-2 self-start rounded-2xl bg-muted px-5 py-4"> <div className="flex items-center gap-2 self-start rounded-2xl bg-muted px-5 py-4">
<span className="size-2 animate-bounce rounded-full bg-muted-foreground/60 [animation-delay:-0.3s]" /> <span className="size-2 animate-bounce rounded-full bg-muted-foreground/60 [animation-delay:-0.3s]" />

View File

@ -7,6 +7,7 @@ export type ToolCall = {
export type ChatMessage = { export type ChatMessage = {
role: "user" | "assistant"; role: "user" | "assistant";
content: string; content: string;
reasoning?: string;
toolCalls?: ToolCall[]; toolCalls?: ToolCall[];
stats?: ChatStats; stats?: ChatStats;
}; };

View File

@ -27,6 +27,7 @@ type StreamChunk =
| { type: "error"; error: string } | { type: "error"; error: string }
| { type: "tool_calls"; tool_calls: ToolCall[] } | { type: "tool_calls"; tool_calls: ToolCall[] }
| { type: "content"; delta: string } | { type: "content"; delta: string }
| { type: "reasoning"; delta: string }
| StatsChunk; | StatsChunk;
/** /**
@ -109,6 +110,19 @@ export async function streamChatCompletion(
}); });
return "continue"; return "continue";
} }
if (data.type === "reasoning" && data.delta !== undefined) {
updateMessages((prev) => {
const next = [...prev];
const lastMsg = next[next.length - 1];
if (lastMsg?.role === "assistant")
next[next.length - 1] = {
...lastMsg,
reasoning: (lastMsg.reasoning ?? "") + data.delta,
};
return next;
});
return "continue";
}
if (data.type === "stats") { if (data.type === "stats") {
const stats: ChatStats = { const stats: ChatStats = {
promptTokens: data.prompt_tokens, promptTokens: data.prompt_tokens,