mirror of
https://github.com/blakeblackshear/frigate.git
synced 2026-06-23 04:41:54 +03:00
Support token streaming stats
This commit is contained in:
parent
78fc472026
commit
475f87d40b
@ -10,6 +10,7 @@ from openai import AzureOpenAI
|
|||||||
|
|
||||||
from frigate.config import GenAIProviderEnum
|
from frigate.config import GenAIProviderEnum
|
||||||
from frigate.genai import GenAIClient, register_genai_provider
|
from frigate.genai import GenAIClient, register_genai_provider
|
||||||
|
from frigate.genai.openai import _stats_from_openai_usage
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -210,6 +211,7 @@ class OpenAIClient(GenAIClient):
|
|||||||
"messages": messages,
|
"messages": messages,
|
||||||
"timeout": self.timeout,
|
"timeout": self.timeout,
|
||||||
"stream": True,
|
"stream": True,
|
||||||
|
"stream_options": {"include_usage": True},
|
||||||
}
|
}
|
||||||
|
|
||||||
if tools:
|
if tools:
|
||||||
@ -221,10 +223,15 @@ class OpenAIClient(GenAIClient):
|
|||||||
content_parts: list[str] = []
|
content_parts: list[str] = []
|
||||||
tool_calls_by_index: dict[int, dict[str, Any]] = {}
|
tool_calls_by_index: dict[int, dict[str, Any]] = {}
|
||||||
finish_reason = "stop"
|
finish_reason = "stop"
|
||||||
|
usage_stats: Optional[dict[str, Any]] = None
|
||||||
|
|
||||||
stream = self.provider.chat.completions.create(**request_params) # type: ignore[call-overload]
|
stream = self.provider.chat.completions.create(**request_params) # type: ignore[call-overload]
|
||||||
|
|
||||||
for chunk in stream:
|
for chunk in stream:
|
||||||
|
chunk_usage = getattr(chunk, "usage", None)
|
||||||
|
if chunk_usage is not None:
|
||||||
|
usage_stats = _stats_from_openai_usage(chunk_usage)
|
||||||
|
|
||||||
if not chunk or not chunk.choices:
|
if not chunk or not chunk.choices:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -284,6 +291,9 @@ class OpenAIClient(GenAIClient):
|
|||||||
)
|
)
|
||||||
finish_reason = "tool_calls"
|
finish_reason = "tool_calls"
|
||||||
|
|
||||||
|
if usage_stats is not None:
|
||||||
|
yield ("stats", usage_stats)
|
||||||
|
|
||||||
yield (
|
yield (
|
||||||
"message",
|
"message",
|
||||||
{
|
{
|
||||||
|
|||||||
@ -14,6 +14,20 @@ from frigate.genai import GenAIClient, register_genai_provider
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _stats_from_gemini_usage(usage: Any) -> Optional[dict[str, Any]]:
    """Build a stats dict from a Gemini usage_metadata object.

    Reads ``prompt_token_count`` and ``candidates_token_count`` from *usage*
    with ``getattr`` and reports them as ``prompt_tokens`` and
    ``completion_tokens``. A field is only reported when it holds a real
    integer; missing attributes, ``None`` and non-integer values are skipped.

    Args:
        usage: Object exposing the Gemini usage attributes (may be ``None``).

    Returns:
        A dict with ``prompt_tokens`` and/or ``completion_tokens``, or ``None``
        when neither count is usable.
    """
    field_map = (
        ("prompt_tokens", "prompt_token_count"),
        ("completion_tokens", "candidates_token_count"),
    )

    stats: dict[str, Any] = {}
    for stat_key, attr_name in field_map:
        count = getattr(usage, attr_name, None)
        # bool is a subclass of int; a stray True/False is not a token count.
        if isinstance(count, int) and not isinstance(count, bool):
            stats[stat_key] = count

    # An empty dict means nothing usable was reported.
    return stats or None
|
||||||
|
|
||||||
|
|
||||||
@register_genai_provider(GenAIProviderEnum.gemini)
|
@register_genai_provider(GenAIProviderEnum.gemini)
|
||||||
class GeminiClient(GenAIClient):
|
class GeminiClient(GenAIClient):
|
||||||
"""Generative AI client for Frigate using Gemini."""
|
"""Generative AI client for Frigate using Gemini."""
|
||||||
@ -471,6 +485,7 @@ class GeminiClient(GenAIClient):
|
|||||||
content_parts: list[str] = []
|
content_parts: list[str] = []
|
||||||
tool_calls_by_index: dict[int, dict[str, Any]] = {}
|
tool_calls_by_index: dict[int, dict[str, Any]] = {}
|
||||||
finish_reason = "stop"
|
finish_reason = "stop"
|
||||||
|
usage_stats: Optional[dict[str, Any]] = None
|
||||||
|
|
||||||
stream = await self.provider.aio.models.generate_content_stream(
|
stream = await self.provider.aio.models.generate_content_stream(
|
||||||
model=self.genai_config.model,
|
model=self.genai_config.model,
|
||||||
@ -479,6 +494,12 @@ class GeminiClient(GenAIClient):
|
|||||||
)
|
)
|
||||||
|
|
||||||
async for chunk in stream:
|
async for chunk in stream:
|
||||||
|
chunk_usage = getattr(chunk, "usage_metadata", None)
|
||||||
|
if chunk_usage is not None:
|
||||||
|
maybe_stats = _stats_from_gemini_usage(chunk_usage)
|
||||||
|
if maybe_stats is not None:
|
||||||
|
usage_stats = maybe_stats
|
||||||
|
|
||||||
if not chunk or not chunk.candidates:
|
if not chunk or not chunk.candidates:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -565,6 +586,9 @@ class GeminiClient(GenAIClient):
|
|||||||
)
|
)
|
||||||
finish_reason = "tool_calls"
|
finish_reason = "tool_calls"
|
||||||
|
|
||||||
|
if usage_stats is not None:
|
||||||
|
yield ("stats", usage_stats)
|
||||||
|
|
||||||
yield (
|
yield (
|
||||||
"message",
|
"message",
|
||||||
{
|
{
|
||||||
|
|||||||
@ -18,6 +18,52 @@ from frigate.genai.utils import parse_tool_calls_from_message
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _stats_from_llama_cpp_chunk(data: dict[str, Any]) -> Optional[dict[str, Any]]:
    """Build a stats dict from a llama.cpp streaming chunk.

    Final-chunk `usage` carries authoritative token counts. Per-chunk
    `timings` (enabled via timings_per_token) carries the running token
    counts (prompt_n, predicted_n) and generation rate, so live updates
    work mid-stream.

    Token counts are taken from `usage` when it holds an integer, otherwise
    from the matching `timings` field. `predicted_ms` and
    `predicted_per_second` are added only when they are positive numbers.

    Args:
        data: Decoded JSON payload of one streamed chunk.

    Returns:
        A dict with any of ``prompt_tokens``, ``completion_tokens``,
        ``completion_duration_ms`` and ``tokens_per_second``; ``None`` when
        the chunk carries no usable token count.
    """

    def section(key: str) -> dict[str, Any]:
        # Tolerate a missing, null or unexpectedly-typed section instead of
        # raising AttributeError on `.get` in the middle of a stream.
        value = data.get(key) if isinstance(data, dict) else None
        return value if isinstance(value, dict) else {}

    def is_count(value: Any) -> bool:
        # bool is a subclass of int; True/False is not a token count.
        return isinstance(value, int) and not isinstance(value, bool)

    usage = section("usage")
    timings = section("timings")

    stats: dict[str, Any] = {}
    count_sources = (
        ("prompt_tokens", "prompt_tokens", "prompt_n"),
        ("completion_tokens", "completion_tokens", "predicted_n"),
    )
    for stat_key, usage_key, timings_key in count_sources:
        count = usage.get(usage_key)
        if not is_count(count):
            # Fall back to the running per-token counters.
            count = timings.get(timings_key)
        if is_count(count):
            stats[stat_key] = count

    if not stats:
        # Timing figures alone are not reported without a token count.
        return None

    rate_sources = (
        ("completion_duration_ms", "predicted_ms"),
        ("tokens_per_second", "predicted_per_second"),
    )
    for stat_key, timings_key in rate_sources:
        value = timings.get(timings_key)
        if (
            isinstance(value, (int, float))
            and not isinstance(value, bool)
            and value > 0
        ):
            stats[stat_key] = float(value)

    return stats
|
||||||
|
|
||||||
|
|
||||||
def _parse_launch_arg(args: list[str], flag: str) -> str | None:
|
def _parse_launch_arg(args: list[str], flag: str) -> str | None:
|
||||||
"""Return the value following `flag` in a positional argv list, or None."""
|
"""Return the value following `flag` in a positional argv list, or None."""
|
||||||
try:
|
try:
|
||||||
@ -462,6 +508,8 @@ class LlamaCppClient(GenAIClient):
|
|||||||
}
|
}
|
||||||
if stream:
|
if stream:
|
||||||
payload["stream"] = True
|
payload["stream"] = True
|
||||||
|
payload["stream_options"] = {"include_usage": True}
|
||||||
|
payload["timings_per_token"] = True
|
||||||
if tools:
|
if tools:
|
||||||
payload["tools"] = tools
|
payload["tools"] = tools
|
||||||
if openai_tool_choice is not None:
|
if openai_tool_choice is not None:
|
||||||
@ -724,6 +772,9 @@ class LlamaCppClient(GenAIClient):
|
|||||||
data = json.loads(data_str)
|
data = json.loads(data_str)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
continue
|
continue
|
||||||
|
maybe_stats = _stats_from_llama_cpp_chunk(data)
|
||||||
|
if maybe_stats is not None:
|
||||||
|
yield ("stats", maybe_stats)
|
||||||
choices = data.get("choices") or []
|
choices = data.get("choices") or []
|
||||||
if not choices:
|
if not choices:
|
||||||
continue
|
continue
|
||||||
|
|||||||
@ -18,6 +18,37 @@ from frigate.genai.utils import parse_tool_calls_from_message
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_ollama_stats(response: Any) -> Optional[dict[str, Any]]:
    """Build a stats dict from Ollama's response metadata.

    Ollama reports eval_count/eval_duration (generation) and
    prompt_eval_count (context size). Durations are nanoseconds.

    *response* may be a mapping-like object (anything with ``.get``) or a
    plain object whose fields are read as attributes.

    Args:
        response: Ollama response or final stream chunk; may be ``None``.

    Returns:
        A dict with any of ``prompt_tokens``, ``completion_tokens``,
        ``completion_duration_ms`` and ``tokens_per_second``; ``None`` when
        *response* is falsy or carries no integer token count.
    """
    if not response:
        return None

    if hasattr(response, "get"):
        read = response.get
    else:

        def read(key: str) -> Any:
            return getattr(response, key, None)

    def is_int(value: Any) -> bool:
        # bool is a subclass of int; True/False is not a count or duration.
        return isinstance(value, int) and not isinstance(value, bool)

    prompt_eval_count = read("prompt_eval_count")
    eval_count = read("eval_count")
    eval_duration_ns = read("eval_duration")

    stats: dict[str, Any] = {}
    if is_int(prompt_eval_count):
        stats["prompt_tokens"] = prompt_eval_count
    if is_int(eval_count):
        stats["completion_tokens"] = eval_count

    if not stats:
        # Without a usable token count there is nothing meaningful to report.
        return None

    if is_int(eval_duration_ns) and eval_duration_ns > 0:
        # Nanoseconds -> milliseconds.
        stats["completion_duration_ms"] = eval_duration_ns / 1_000_000
        if is_int(eval_count) and eval_count > 0:
            # Nanoseconds -> seconds for the rate.
            stats["tokens_per_second"] = eval_count / (
                eval_duration_ns / 1_000_000_000
            )

    return stats
|
||||||
|
|
||||||
|
|
||||||
def _normalize_multimodal_content(
|
def _normalize_multimodal_content(
|
||||||
content: Any,
|
content: Any,
|
||||||
) -> tuple[Optional[str], Optional[list[bytes]]]:
|
) -> tuple[Optional[str], Optional[list[bytes]]]:
|
||||||
@ -403,6 +434,9 @@ class OllamaClient(GenAIClient):
|
|||||||
content = result.get("content")
|
content = result.get("content")
|
||||||
if content:
|
if content:
|
||||||
yield ("content_delta", content)
|
yield ("content_delta", content)
|
||||||
|
stats = _extract_ollama_stats(response)
|
||||||
|
if stats is not None:
|
||||||
|
yield ("stats", stats)
|
||||||
yield ("message", result)
|
yield ("message", result)
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -416,6 +450,7 @@ class OllamaClient(GenAIClient):
|
|||||||
)
|
)
|
||||||
content_parts: list[str] = []
|
content_parts: list[str] = []
|
||||||
final_message: dict[str, Any] | None = None
|
final_message: dict[str, Any] | None = None
|
||||||
|
final_chunk: Any = None
|
||||||
stream = await async_client.chat(**request_params)
|
stream = await async_client.chat(**request_params)
|
||||||
async for chunk in stream:
|
async for chunk in stream:
|
||||||
if not chunk or "message" not in chunk:
|
if not chunk or "message" not in chunk:
|
||||||
@ -426,6 +461,7 @@ class OllamaClient(GenAIClient):
|
|||||||
content_parts.append(delta)
|
content_parts.append(delta)
|
||||||
yield ("content_delta", delta)
|
yield ("content_delta", delta)
|
||||||
if chunk.get("done"):
|
if chunk.get("done"):
|
||||||
|
final_chunk = chunk
|
||||||
full_content = "".join(content_parts).strip() or None
|
full_content = "".join(content_parts).strip() or None
|
||||||
final_message = {
|
final_message = {
|
||||||
"content": full_content,
|
"content": full_content,
|
||||||
@ -434,6 +470,10 @@ class OllamaClient(GenAIClient):
|
|||||||
}
|
}
|
||||||
break
|
break
|
||||||
|
|
||||||
|
stats = _extract_ollama_stats(final_chunk)
|
||||||
|
if stats is not None:
|
||||||
|
yield ("stats", stats)
|
||||||
|
|
||||||
if final_message is not None:
|
if final_message is not None:
|
||||||
yield ("message", final_message)
|
yield ("message", final_message)
|
||||||
else:
|
else:
|
||||||
|
|||||||
@ -14,6 +14,22 @@ from frigate.genai import GenAIClient, register_genai_provider
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _stats_from_openai_usage(usage: Any) -> Optional[dict[str, Any]]:
    """Build a stats dict from an OpenAI-compatible usage object.

    Reads ``prompt_tokens`` and ``completion_tokens`` from *usage*. The
    value may be an object exposing those attributes or a plain mapping
    with those keys. Only real integer counts are reported.

    Args:
        usage: Usage payload of a completion chunk; may be ``None``.

    Returns:
        A dict with ``prompt_tokens`` and/or ``completion_tokens``, or ``None``
        when *usage* is ``None`` or neither count is usable.
    """
    if usage is None:
        return None

    if isinstance(usage, dict):
        read = usage.get
    else:

        def read(name: str) -> Any:
            return getattr(usage, name, None)

    stats: dict[str, Any] = {}
    for key in ("prompt_tokens", "completion_tokens"):
        count = read(key)
        # bool is a subclass of int; a stray True/False is not a token count.
        if isinstance(count, int) and not isinstance(count, bool):
            stats[key] = count

    # An empty dict means nothing usable was reported.
    return stats or None
|
||||||
|
|
||||||
|
|
||||||
@register_genai_provider(GenAIProviderEnum.openai)
|
@register_genai_provider(GenAIProviderEnum.openai)
|
||||||
class OpenAIClient(GenAIClient):
|
class OpenAIClient(GenAIClient):
|
||||||
"""Generative AI client for Frigate using OpenAI."""
|
"""Generative AI client for Frigate using OpenAI."""
|
||||||
@ -298,6 +314,7 @@ class OpenAIClient(GenAIClient):
|
|||||||
"messages": messages,
|
"messages": messages,
|
||||||
"timeout": self.timeout,
|
"timeout": self.timeout,
|
||||||
"stream": True,
|
"stream": True,
|
||||||
|
"stream_options": {"include_usage": True},
|
||||||
}
|
}
|
||||||
|
|
||||||
if tools:
|
if tools:
|
||||||
@ -318,10 +335,15 @@ class OpenAIClient(GenAIClient):
|
|||||||
content_parts: list[str] = []
|
content_parts: list[str] = []
|
||||||
tool_calls_by_index: dict[int, dict[str, Any]] = {}
|
tool_calls_by_index: dict[int, dict[str, Any]] = {}
|
||||||
finish_reason = "stop"
|
finish_reason = "stop"
|
||||||
|
usage_stats: Optional[dict[str, Any]] = None
|
||||||
|
|
||||||
stream = self.provider.chat.completions.create(**request_params) # type: ignore[call-overload]
|
stream = self.provider.chat.completions.create(**request_params) # type: ignore[call-overload]
|
||||||
|
|
||||||
for chunk in stream:
|
for chunk in stream:
|
||||||
|
chunk_usage = getattr(chunk, "usage", None)
|
||||||
|
if chunk_usage is not None:
|
||||||
|
usage_stats = _stats_from_openai_usage(chunk_usage)
|
||||||
|
|
||||||
if not chunk or not chunk.choices:
|
if not chunk or not chunk.choices:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -381,6 +403,9 @@ class OpenAIClient(GenAIClient):
|
|||||||
)
|
)
|
||||||
finish_reason = "tool_calls"
|
finish_reason = "tool_calls"
|
||||||
|
|
||||||
|
if usage_stats is not None:
|
||||||
|
yield ("stats", usage_stats)
|
||||||
|
|
||||||
yield (
|
yield (
|
||||||
"message",
|
"message",
|
||||||
{
|
{
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user