"""CLIP embedding helpers for images and text."""

from sentence_transformers import SentenceTransformer
from PIL import Image

# Process-wide model cache, populated on the first call to _get_model().
_model = None


def _get_model():
    """Return the shared CLIP model, loading it on first use.

    Lazy load: the CLIP model is ~600 MB and takes several seconds to
    initialise. Loading on first call avoids the cost at import time and
    during indexing warmup.

    Downloaded automatically from Hugging Face Hub on first use:
    https://huggingface.co/sentence-transformers/clip-ViT-B-32
    Cached in ~/.cache/huggingface/hub/
    """
    global _model
    if _model is None:
        _model = SentenceTransformer("clip-ViT-B-32")
    return _model


def embed_image(path: str) -> list[float]:
    """Return the CLIP embedding of the image stored at *path*.

    Args:
        path: Filesystem path of the image to embed.

    Returns:
        The embedding as a plain list of floats.
    """
    # The context manager closes the underlying file deterministically
    # (also when decoding fails) instead of leaving it to garbage collection.
    with Image.open(path) as source:
        # CLIP requires RGB — some JPEGs are stored as CMYK or grayscale.
        # convert() yields a separate image, so it is safe to use after
        # the source file has been closed.
        rgb = source.convert("RGB")
    return _get_model().encode(rgb).tolist()


def embed_text(text: str) -> list[float]:
    """Return the CLIP embedding of *text*.

    Text and images share the same 512-dimensional vector space in CLIP,
    so the returned vector is directly comparable to image embeddings.

    Args:
        text: The text to embed.

    Returns:
        The embedding as a plain list of floats.
    """
    return _get_model().encode(text).tolist()