Use thread lock for openvino to avoid concurrent requests with JinaV2

This commit is contained in:
Nicolas Mowen 2025-11-07 08:46:43 -07:00
parent 2376bcaf97
commit ab3ded38e6

View File

@ -3,6 +3,7 @@
import logging
import os
import platform
import threading
from abc import ABC, abstractmethod
from typing import Any
@ -290,6 +291,10 @@ class OpenVINOModelRunner(BaseModelRunner):
        self.infer_request = self.compiled_model.create_infer_request()
        self.input_tensor: ov.Tensor | None = None
# Thread lock to prevent concurrent inference (needed for JinaV2 which shares
# one runner between text and vision embeddings called from different threads)
self._inference_lock = threading.Lock()
        if not self.complex_model:
            try:
                input_shape = self.compiled_model.inputs[0].get_shape()
@ -333,6 +338,9 @@ class OpenVINOModelRunner(BaseModelRunner):
        Returns:
            List of output tensors
        """
# Lock prevents concurrent access to infer_request
# Needed for JinaV2: genai thread (text) + embeddings thread (vision)
with self._inference_lock:
            # Handle single input case for backward compatibility
            if (
                len(inputs) == 1