4a82352391
Model downloads automatically from HuggingFace Hub on first use. No manual download required. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
26 lines
1008 B
Python
26 lines
1008 B
Python
from sentence_transformers import SentenceTransformer
|
|
from PIL import Image
|
|
|
|
# Process-wide cache for the CLIP model; populated on the first _get_model() call.
_model = None


def _get_model():
    """Return the shared CLIP model, creating it on first use.

    The model is loaded lazily: it is ~600 MB and takes several seconds to
    initialise, so deferring the load avoids paying that cost at import time
    and during indexing warmup.

    The weights are downloaded automatically from Hugging Face Hub on first
    use (https://huggingface.co/sentence-transformers/clip-ViT-B-32) and
    cached in ~/.cache/huggingface/hub/.
    """
    global _model
    # Fast path: a previous call already built the model.
    if _model is not None:
        return _model
    _model = SentenceTransformer("clip-ViT-B-32")
    return _model
|
|
|
|
def embed_image(path: str) -> list[float]:
    """Return the CLIP embedding of the image file at *path*.

    Args:
        path: Filesystem path of the image to embed.

    Returns:
        The embedding produced by the model's ``encode`` call, converted to a
        plain Python list of floats.

    Any exception raised while opening or decoding the image propagates to
    the caller unchanged.
    """
    # Image.open() is lazy and may keep the underlying file open; the context
    # manager releases the handle deterministically instead of waiting for
    # garbage collection, which matters when indexing many files.
    with Image.open(path) as source:
        # CLIP requires RGB — some JPEGs are stored as CMYK or grayscale.
        # convert() loads the pixel data and returns a separate image, so the
        # result remains usable after the source file has been closed.
        rgb = source.convert("RGB")
    return _get_model().encode(rgb).tolist()
|
|
|
|
def embed_text(text: str) -> list[float]:
    """Return the CLIP embedding of *text* as a plain list of floats.

    Text and images share the same 512-dimensional vector space in CLIP, so
    the returned vector is directly comparable to image embeddings.
    """
    model = _get_model()
    vector = model.encode(text)
    return vector.tolist()
|