diff --git a/frigate/api/chat.py b/frigate/api/chat.py
index 2e6283a525..428d23cb79 100644
--- a/frigate/api/chat.py
+++ b/frigate/api/chat.py
@@ -1138,19 +1138,23 @@ async def chat_completion(
     )
     conversation = []
 
-    system_prompt = build_chat_system_prompt(
-        config=config,
-        allowed_cameras=allowed_cameras,
-        semantic_search_enabled=semantic_search_enabled,
-        attribute_classifications=attribute_classifications,
-    )
-
-    conversation.append(
-        {
-            "role": "system",
-            "content": system_prompt,
-        }
-    )
+    # Build the system message only when the client hasn't already pinned one.
+    # The first turn has no system message; we generate it (with the current
+    # timestamp) and return the whole chain so the client persists it. Later
+    # turns send it back verbatim, freezing the timestamp so the prompt prefix
+    # stays byte-identical and the model server's prompt cache keeps hitting.
+    if not body.messages or body.messages[0].role != "system":
+        conversation.append(
+            {
+                "role": "system",
+                "content": build_chat_system_prompt(
+                    config=config,
+                    allowed_cameras=allowed_cameras,
+                    semantic_search_enabled=semantic_search_enabled,
+                    attribute_classifications=attribute_classifications,
+                ),
+            }
+        )
 
     for msg in body.messages:
         msg_dict = {
@@ -1166,9 +1170,6 @@ async def chat_completion(
 
         conversation.append(msg_dict)
 
-    # Messages appended past this point form this turn's replay record.
-    turn_start_len = len(conversation)
-
     tool_iterations = 0
     tool_calls: list[ToolCall] = []
     max_iterations = body.max_tool_iterations
@@ -1185,12 +1186,12 @@ async def chat_completion(
         async def stream_body_llm():
             nonlocal conversation, stream_iterations
 
-            def _emit_replay_messages(extra: Optional[list[dict[str, Any]]] = None):
-                turn_messages = conversation[turn_start_len:] + (extra or [])
+            def _emit_chain(extra: Optional[list[dict[str, Any]]] = None):
+                # Return the full conversation (including the system message) so
+                # the client persists and replays it verbatim next turn.
+                chain = conversation + (extra or [])
                 return (
-                    json.dumps({"type": "messages", "messages": turn_messages}).encode(
-                        "utf-8"
-                    )
+                    json.dumps({"type": "messages", "messages": chain}).encode("utf-8")
                     + b"\n"
                 )
 
@@ -1266,14 +1267,14 @@ async def chat_completion(
                             )
                             conversation.extend(tool_results)
                             conversation.extend(extra_msgs)
-                            # Running turn slice: lets the client render tool
+                            # Emit the running chain so the client can render tool
                             # calls live and replay them verbatim next turn.
-                            yield _emit_replay_messages()
+                            yield _emit_chain()
                             break
                         else:
                             # Streaming never appends the final assistant message
-                            # to the conversation, so add it to the replay slice.
-                            yield _emit_replay_messages(
+                            # to the conversation, so add it to the chain.
+                            yield _emit_chain(
                                 extra=[
                                     {
                                         "role": "assistant",
@@ -1284,7 +1285,7 @@ async def chat_completion(
                             yield (json.dumps({"type": "done"}).encode("utf-8") + b"\n")
                             return
             else:
-                yield _emit_replay_messages()
+                yield _emit_chain()
                 yield json.dumps({"type": "done"}).encode("utf-8") + b"\n"
 
         return StreamingResponse(
@@ -1331,12 +1332,12 @@ async def chat_completion(
                 if body.stream:
                     final_reasoning = response.get("reasoning")
 
-                    turn_messages = conversation[turn_start_len:]
+                    chain = list(conversation)
 
                     async def stream_body() -> Any:
                         yield (
                             json.dumps(
-                                {"type": "messages", "messages": turn_messages}
+                                {"type": "messages", "messages": chain}
                             ).encode("utf-8")
                             + b"\n"
                         )
@@ -1375,7 +1376,7 @@ async def chat_completion(
                         finish_reason=response.get("finish_reason", "stop"),
                         tool_iterations=tool_iterations,
                         tool_calls=tool_calls,
-                        messages=conversation[turn_start_len:],
+                        messages=list(conversation),
                     ).model_dump(),
                 )
 
@@ -1408,7 +1409,7 @@ async def chat_completion(
                 finish_reason="length",
                 tool_iterations=tool_iterations,
                 tool_calls=tool_calls,
-                messages=conversation[turn_start_len:],
+                messages=list(conversation),
             ).model_dump(),
         )
 
diff --git a/frigate/api/defs/response/chat_response.py b/frigate/api/defs/response/chat_response.py
index 105104baa4..59c8549e73 100644
--- a/frigate/api/defs/response/chat_response.py
+++ b/frigate/api/defs/response/chat_response.py
@@ -59,9 +59,9 @@ class ChatCompletionResponse(BaseModel):
     messages: list[dict[str, Any]] = Field(
         default_factory=list,
         description=(
-            "The exact conversation messages appended for this assistant turn "
-            "(assistant tool-call turns, tool results, and the final assistant "
-            "message). Replay these verbatim as conversation history on the next "
-            "request to keep the model server's prompt cache prefix intact."
+            "The full conversation chain, including the system message. Persist "
+            "and replay this verbatim on the next request so the prompt prefix "
+            "stays byte-identical and the model server's prompt cache keeps "
+            "hitting."
         ),
     )
diff --git a/web/src/pages/Chat.tsx b/web/src/pages/Chat.tsx
index ac708fd05e..bd92097873 100644
--- a/web/src/pages/Chat.tsx
+++ b/web/src/pages/Chat.tsx
@@ -30,7 +30,7 @@ import {
 type StreamingTurn = {
   content: string;
   reasoning: string;
-  turn: ChatMessage[];
+  chain: ChatMessage[];
   stats?: ChatStats;
 };
 
@@ -99,7 +99,7 @@ export default function ChatPage() {
 
       setError(null);
       setMessages(messagesToSend);
-      setStreaming({ content: "", reasoning: "", turn: [] });
+      setStreaming({ content: "", reasoning: "", chain: [] });
       setIsLoading(true);
 
       const baseURL = axios.defaults.baseURL ?? "";
@@ -112,7 +112,7 @@ export default function ChatPage() {
       const controller = new AbortController();
       abortRef.current = controller;
 
-      let turn: ChatMessage[] = [];
+      let chain: ChatMessage[] = [];
       let stats: ChatStats | undefined;
       let reasoning = "";
       let hadError = false;
@@ -130,9 +130,9 @@ export default function ChatPage() {
               s ? { ...s, reasoning: s.reasoning + delta } : s,
             );
           },
-          onTurnMessages: (turnMessages) => {
-            turn = turnMessages;
-            setStreaming((s) => (s ? { ...s, turn: turnMessages } : s));
+          onChain: (fullChain) => {
+            chain = fullChain;
+            setStreaming((s) => (s ? { ...s, chain: fullChain } : s));
           },
           onStats: (s) => {
             stats = s;
@@ -146,14 +146,15 @@ export default function ChatPage() {
             abortRef.current = null;
             setIsLoading(false);
             setStreaming(null);
-            const lastMsg = turn[turn.length - 1];
+            const lastMsg = chain[chain.length - 1];
             if (!hadError && lastMsg?.role === "assistant") {
-              const committed = turn.map((m, i) =>
-                i === turn.length - 1
-                  ? { ...m, reasoning: reasoning || undefined, stats }
-                  : m,
+              setMessages(
+                chain.map((m, i) =>
+                  i === chain.length - 1
+                    ? { ...m, reasoning: reasoning || undefined, stats }
+                    : m,
+                ),
               );
-              setMessages((prev) => [...prev, ...committed]);
             }
           },
           defaultErrorMessage: t("error"),
@@ -228,15 +229,17 @@ export default function ChatPage() {
 
   const hasStarted = messages.length > 0 || streaming != null;
 
-  // The conversation plus any in-flight turn, rendered as one flat list.
-  const renderList = streaming ? [...messages, ...streaming.turn] : messages;
+  // Once the backend has sent an in-flight chain, it is the source of truth;
+  // before that, and when not streaming, the committed conversation is.
+  const renderList =
+    streaming && streaming.chain.length ? streaming.chain : messages;
   const responses = toolResponsesById(renderList);
-  const streamingTail = streaming?.turn[streaming.turn.length - 1];
+  const renderTail = renderList[renderList.length - 1];
   const finalShown =
-    streamingTail?.role === "assistant" && hasText(streamingTail.content);
+    renderTail?.role === "assistant" && hasText(renderTail.content);
 
   const renderMessage = (msg: ChatMessage, i: number) => {
-    if (msg.role === "tool") return null;
+    if (msg.role === "system" || msg.role === "tool") return null;
 
     if (msg.role === "user") {
       if (!hasText(msg.content)) return null;
diff --git a/web/src/types/chat.ts b/web/src/types/chat.ts
index 08e47bd3ac..3b497cb565 100644
--- a/web/src/types/chat.ts
+++ b/web/src/types/chat.ts
@@ -10,7 +10,7 @@ export type WireToolCall = {
 };
 
 export type ChatMessage = {
-  role: "user" | "assistant" | "tool";
+  role: "system" | "user" | "assistant" | "tool";
   content: unknown;
   tool_call_id?: string;
   name?: string;
diff --git a/web/src/utils/chatUtil.ts b/web/src/utils/chatUtil.ts
index 5260fe804e..b7aeb8088d 100644
--- a/web/src/utils/chatUtil.ts
+++ b/web/src/utils/chatUtil.ts
@@ -5,9 +5,10 @@ export type StreamChatCallbacks = {
   onContentDelta: (delta: string) => void;
   /** Streamed delta of the assistant's reasoning trace. */
   onReasoningDelta: (delta: string) => void;
-  /** The exact wire messages appended for this turn so far (tool-call turns,
-   * tool results, and — on the final emission — the final assistant message). */
-  onTurnMessages: (messages: ChatMessage[]) => void;
+  /** The full conversation chain so far (system message, history, this turn's
+   * tool-call turns, tool results, and — on the final emission — the final
+   * assistant message). */
+  onChain: (chain: ChatMessage[]) => void;
   /** Token/timing stats for the turn. */
   onStats: (stats: ChatStats) => void;
   /** Called when the stream sends an error or fetch fails. */
@@ -52,7 +53,7 @@ export async function streamChatCompletion(
   const {
     onContentDelta,
     onReasoningDelta,
-    onTurnMessages,
+    onChain,
     onStats,
     onError,
     onDone,
@@ -99,7 +100,7 @@ export async function streamChatCompletion(
         return "break";
       }
       if (data.type === "messages") {
-        onTurnMessages(data.messages ?? []);
+        onChain(data.messages ?? []);
         return "continue";
       }
       if (data.type === "content" && data.delta !== undefined) {