From c5d4e301d12884b7970603e0b173367235b0402b Mon Sep 17 00:00:00 2001 From: Josh Hawkins <32435876+hawkeye217@users.noreply.github.com> Date: Wed, 9 Oct 2024 15:43:53 -0500 Subject: [PATCH] manually download and cache feature extractor config --- frigate/embeddings/embeddings.py | 8 ++--- frigate/embeddings/functions/onnx.py | 48 ++++++++++++---------------- web/src/pages/Explore.tsx | 2 +- 3 files changed, 26 insertions(+), 32 deletions(-) diff --git a/frigate/embeddings/embeddings.py b/frigate/embeddings/embeddings.py index 9334f9a04..99a2d8ab0 100644 --- a/frigate/embeddings/embeddings.py +++ b/frigate/embeddings/embeddings.py @@ -91,7 +91,7 @@ class Embeddings: "jinaai/jina-clip-v1-text_model_fp16.onnx", "jinaai/jina-clip-v1-tokenizer", "jinaai/jina-clip-v1-vision_model_fp16.onnx", - "jinaai/jina-clip-v1-feature_extractor", + "jinaai/jina-clip-v1-preprocessor_config.json", ] for model in models: @@ -114,7 +114,7 @@ class Embeddings: model_file="text_model_fp16.onnx", tokenizer_file="tokenizer", download_urls={ - "text_model_fp16.onnx": "https://huggingface.co/jinaai/jina-clip-v1/resolve/main/onnx/text_model_fp16.onnx" + "text_model_fp16.onnx": "https://huggingface.co/jinaai/jina-clip-v1/resolve/main/onnx/text_model_fp16.onnx", }, embedding_function=jina_text_embedding_function, model_type="text", @@ -124,9 +124,9 @@ class Embeddings: self.vision_embedding = GenericONNXEmbedding( model_name="jinaai/jina-clip-v1", model_file="vision_model_fp16.onnx", - tokenizer_file="feature_extractor", download_urls={ - "vision_model_fp16.onnx": "https://huggingface.co/jinaai/jina-clip-v1/resolve/main/onnx/vision_model_fp16.onnx" + "vision_model_fp16.onnx": "https://huggingface.co/jinaai/jina-clip-v1/resolve/main/onnx/vision_model_fp16.onnx", + "preprocessor_config.json": "https://huggingface.co/jinaai/jina-clip-v1/resolve/main/preprocessor_config.json", }, embedding_function=jina_vision_embedding_function, model_type="vision", diff --git a/frigate/embeddings/functions/onnx.py b/frigate/embeddings/functions/onnx.py index 454fe3faf..7b437c5d2 100644 --- a/frigate/embeddings/functions/onnx.py +++ b/frigate/embeddings/functions/onnx.py @@ -2,7 +2,7 @@ import logging import os import warnings from io import BytesIO -from typing import Callable, Dict, List, Union +from typing import Callable, Dict, List, Optional, Union import numpy as np import onnxruntime as ort @@ -37,11 +37,11 @@ class GenericONNXEmbedding: self, model_name: str, model_file: str, - tokenizer_file: str, download_urls: Dict[str, str], embedding_function: Callable[[List[np.ndarray]], np.ndarray], model_type: str, preferred_providers: List[str] = ["CPUExecutionProvider"], + tokenizer_file: Optional[str] = None, ): self.model_name = model_name self.model_file = model_file @@ -59,7 +59,8 @@ class GenericONNXEmbedding: self.downloader = ModelDownloader( model_name=self.model_name, download_path=self.download_path, - file_names=[self.model_file, self.tokenizer_file], + file_names=list(self.download_urls.keys()) + + ([self.tokenizer_file] if self.tokenizer_file else []), download_func=self._download_model, ) self.downloader.ensure_model_files() @@ -69,26 +70,22 @@ class GenericONNXEmbedding: file_name = os.path.basename(path) if file_name in self.download_urls: ModelDownloader.download_from_url(self.download_urls[file_name], path) - elif file_name == self.tokenizer_file: - if self.model_type == "text": - if not os.path.exists(path + "/" + self.model_name): - logger.info(f"Downloading {self.model_name} tokenizer") - tokenizer = AutoTokenizer.from_pretrained( - self.model_name, - trust_remote_code=True, - cache_dir=f"{MODEL_CACHE_DIR}/{self.model_name}/tokenizer", - clean_up_tokenization_spaces=True, - ) - tokenizer.save_pretrained(path) - else: - if not os.path.exists(path + "/" + self.model_name): - logger.info(f"Downloading {self.model_name} feature extractor") - feature_extractor = AutoFeatureExtractor.from_pretrained( - self.model_name, - trust_remote_code=True, - cache_dir=f"{MODEL_CACHE_DIR}/{self.model_name}/feature_extractor", - ) - feature_extractor.save_pretrained(path) + elif file_name == self.tokenizer_file and self.model_type == "text": + if not os.path.exists(path + "/" + self.model_name): + logger.info(f"Downloading {self.model_name} tokenizer") + tokenizer = AutoTokenizer.from_pretrained( + self.model_name, + trust_remote_code=True, + cache_dir=f"{MODEL_CACHE_DIR}/{self.model_name}/tokenizer", + clean_up_tokenization_spaces=True, + ) + tokenizer.save_pretrained(path) + else: + if not os.path.exists(path + "/" + self.model_name): + logger.info(f"Downloading {self.model_name} feature extractor") + self.feature_extractor = AutoFeatureExtractor.from_pretrained( + f"{MODEL_CACHE_DIR}/{self.model_name}", + ) self.downloader.requestor.send_data( UPDATE_MODEL_STATE, @@ -128,11 +125,8 @@ class GenericONNXEmbedding: ) def _load_feature_extractor(self): - feature_extractor_path = os.path.join( - f"{MODEL_CACHE_DIR}/{self.model_name}/feature_extractor" - ) return AutoFeatureExtractor.from_pretrained( - self.model_name, trust_remote_code=True, cache_dir=feature_extractor_path + f"{MODEL_CACHE_DIR}/{self.model_name}", ) def _load_model(self, path: str, providers: List[str]): diff --git a/web/src/pages/Explore.tsx b/web/src/pages/Explore.tsx index 601260e0f..5a1ed6145 100644 --- a/web/src/pages/Explore.tsx +++ b/web/src/pages/Explore.tsx @@ -194,7 +194,7 @@ export default function Explore() { "jinaai/jina-clip-v1-vision_model_fp16.onnx", ); const { payload: visionFeatureExtractorState } = useModelState( - "jinaai/jina-clip-v1-feature_extractor", + "jinaai/jina-clip-v1-preprocessor_config.json", ); const allModelsLoaded = useMemo(() => {