add scale_factor and bias to description zscore normalization

This commit is contained in:
Josh Hawkins 2024-10-07 15:20:45 -05:00
parent 5cda95f5bf
commit e3a81db0bb
4 changed files with 17 additions and 29 deletions

View File

@ -73,7 +73,7 @@ class EmbeddingsContext:
def __init__(self, db: SqliteVecQueueDatabase):
self.embeddings = Embeddings(db)
self.thumb_stats = ZScoreNormalization()
self.desc_stats = ZScoreNormalization()
self.desc_stats = ZScoreNormalization(scale_factor=2.5, bias=0.5)
# load stats from disk
try:

View File

@ -46,7 +46,7 @@ class MiniLMEmbedding:
elif os.path.basename(path) == self.TOKENIZER_FILE:
logger.info("Downloading MiniLM tokenizer")
tokenizer = AutoTokenizer.from_pretrained(
self.MODEL_NAME, clean_up_tokenization_spaces=False
self.MODEL_NAME, clean_up_tokenization_spaces=True
)
tokenizer.save_pretrained(path)
@ -78,7 +78,7 @@ class MiniLMEmbedding:
def _load_tokenizer(self):
tokenizer_path = os.path.join(self.DOWNLOAD_PATH, self.TOKENIZER_FILE)
return AutoTokenizer.from_pretrained(
tokenizer_path, clean_up_tokenization_spaces=False
tokenizer_path, clean_up_tokenization_spaces=True
)
def _load_model(self, path: str, providers: List[str]):

View File

@ -4,12 +4,15 @@ import math
class ZScoreNormalization:
"""Running Z-score normalization for search distance."""
def __init__(self):
def __init__(self, scale_factor: float = 1.0, bias: float = 0.0):
    """Initialize empty running statistics with optional output adjustments.

    Args:
        scale_factor: Adjusts the magnitude of each normalized score.
            The default of 1.0 leaves scores unscaled.
        bias: Offset for the normalized scores; a positive value
            artificially shifts the entire distribution upwards.
            The default of 0.0 applies no shift.
    """
    # Running statistics start out empty (no samples seen yet).
    self.n = 0
    self.mean = 0
    self.m2 = 0
    # Output adjustments, stored for later use by the instance.
    self.scale_factor = scale_factor
    self.bias = bias
@property
def variance(self):
@ -23,7 +26,10 @@ class ZScoreNormalization:
self._update(distances)
if self.stddev == 0:
return distances
return [(x - self.mean) / self.stddev for x in distances]
return [
(x - self.mean) / self.stddev * self.scale_factor + self.bias
for x in distances
]
def _update(self, distances: list[float]):
for x in distances:

View File

@ -189,19 +189,9 @@ export default function SearchView({
// confidence score - probably needs tweaking
const zScoreToConfidence = (score: number, source: string) => {
let midpoint, scale;
if (source === "thumbnail") {
midpoint = 2;
scale = 0.5;
} else {
midpoint = 0.5;
scale = 1.5;
}
const zScoreToConfidence = (score: number) => {
// Reflected sigmoid, 1 / (1 + e^x): decreases as x grows, so a lower score maps to a higher confidence
const confidence = 1 / (1 + Math.exp((score - midpoint) * scale));
const confidence = 1 / (1 + Math.exp(score));
return Math.round(confidence * 100);
};
@ -412,21 +402,13 @@ export default function SearchView({
) : (
<LuText className="mr-1 size-3" />
)}
{zScoreToConfidence(
value.search_distance,
value.search_source,
)}
%
{zScoreToConfidence(value.search_distance)}%
</Chip>
</TooltipTrigger>
<TooltipPortal>
<TooltipContent>
Matched {value.search_source} at{" "}
{zScoreToConfidence(
value.search_distance,
value.search_source,
)}
%
{zScoreToConfidence(value.search_distance)}%
</TooltipContent>
</TooltipPortal>
</Tooltip>