# vector-search-demo/oravector-demo/backend/embedder.py

from sentence_transformers import SentenceTransformer
from PIL import Image

# Module-level cache for the CLIP model; stays None until _get_model() fills it.
_model = None

def _get_model():
    """Return the shared CLIP model, constructing it on the first call.

    The model is loaded lazily: it is ~600 MB and takes several seconds to
    initialise, so deferring construction avoids paying that cost at import
    time and during indexing warmup.

    On first use the weights are downloaded automatically from the
    Hugging Face Hub (https://huggingface.co/sentence-transformers/clip-ViT-B-32)
    and cached in ~/.cache/huggingface/hub/.
    """
    global _model
    # Fast path: a previous call already built the model.
    if _model is not None:
        return _model
    _model = SentenceTransformer("clip-ViT-B-32")
    return _model

def embed_image(path: str) -> list[float]:
    """Embed the image stored at *path* with the CLIP model.

    Args:
        path: Filesystem path of the image to embed.

    Returns:
        The model's embedding for the image, converted to a plain list.

    Raises:
        Whatever ``PIL.Image.open`` raises when the file is missing or
        cannot be identified as an image.
    """
    # Image.open is lazy and keeps the underlying file open; the context
    # manager guarantees the handle is released even if conversion fails or
    # the format (e.g. multi-frame files) would otherwise keep it open.
    with Image.open(path) as source:
        # CLIP requires RGB — some JPEGs are stored as CMYK or grayscale.
        # convert() yields a fully loaded image that outlives the file handle.
        rgb_image = source.convert("RGB")
    return _get_model().encode(rgb_image).tolist()

def embed_text(text: str) -> list[float]:
    """Embed *text* with the CLIP model and return the vector as a list.

    CLIP places text and images in the same 512-dimensional vector space,
    so the result can be compared directly against image embeddings.
    """
    model = _get_model()
    vector = model.encode(text)
    return vector.tolist()