Files
vector-search-demo/pgvector-demo/backend/index_images.py
T
dierk 3ef43019be Add in-DB indexing script, benchmark results, schema names in presentation
- index_images_indb.py: new script indexing via VECTOR_EMBEDDING(CLIP_IMG)
  using a two-step INSERT+UPDATE to work around ORA-24816
- index_images_oracle.py / index_images.py: add timing output
- Presentation: schema names VECTORS_USER/VECTOR in diagram and comparison,
  ONNX expansion, HNSW index note on slide 11,
  indexing times updated from 3-run benchmark (avg: PG 12.1s, Ora 12.1s, InDB 13.6s)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-20 10:42:13 +02:00

59 lines
1.5 KiB
Python

import os
import time
from dotenv import load_dotenv
from db import get_connection
from embedder import embed_image
# Load variables from a .env file into the process environment. This has to
# run before the os.getenv() call below so values defined there are visible.
load_dotenv()
# Directory scanned for photos to index. os.getenv() returns None when the
# PHOTOS_DIR variable is not set.
PHOTOS_DIR = os.getenv("PHOTOS_DIR")
# DDL for the table that stores one row per indexed photo. IF NOT EXISTS makes
# the statement safe to execute on every run. The UNIQUE constraint on
# filename is what the ON CONFLICT clause of the INSERT statement relies on.
# NOTE(review): the `vector` column type comes from the pgvector extension,
# which is presumably enabled elsewhere (database setup) -- confirm.
# NOTE(review): the dimension 512 must equal the length of the value returned
# by embed_image() -- verify against the embedder module.
CREATE_TABLE = """
CREATE TABLE IF NOT EXISTS images (
id SERIAL PRIMARY KEY,
filename TEXT NOT NULL UNIQUE,
filepath TEXT NOT NULL,
embedding vector(512)
);
"""
# DDL for an HNSW index over the embedding column using the cosine-distance
# operator class (vector_cosine_ops). IF NOT EXISTS makes it safe to execute
# on every run.
CREATE_INDEX = """
CREATE INDEX IF NOT EXISTS images_embedding_idx
ON images USING hnsw (embedding vector_cosine_ops);
"""
# Parameterized insert of one photo (filename, filepath, embedding). The %s
# placeholders are filled in by the database driver, not by string formatting.
# ON CONFLICT (filename) DO NOTHING turns a duplicate filename into a no-op
# instead of a unique-constraint error.
INSERT = """
INSERT INTO images (filename, filepath, embedding)
VALUES (%s, %s, %s)
ON CONFLICT (filename) DO NOTHING;
"""
def main():
    """Index every JPEG found in PHOTOS_DIR into the ``images`` table.

    Creates the table and its index if they do not exist yet, then walks the
    ``.jpg``/``.jpeg`` files directly inside PHOTOS_DIR (no recursion). Files
    whose name is already present in the table are skipped; for all others an
    embedding is computed with ``embed_image`` and a row is inserted. Progress
    and the elapsed indexing time are printed to stdout.

    Raises:
        SystemExit: if PHOTOS_DIR is unset, empty, or not an existing
            directory.
    """
    # os.listdir(None) silently lists the current working directory, so a
    # missing PHOTOS_DIR must be rejected explicitly before any work is done.
    if not PHOTOS_DIR or not os.path.isdir(PHOTOS_DIR):
        raise SystemExit(
            f"PHOTOS_DIR is not set or is not a directory: {PHOTOS_DIR!r}"
        )

    conn = get_connection()
    try:
        cur = conn.cursor()
        try:
            cur.execute(CREATE_TABLE)
            cur.execute(CREATE_INDEX)
            conn.commit()

            # Sorted so that processing order and progress output are the
            # same on every run (os.listdir returns entries in arbitrary
            # order).
            files = sorted(
                f for f in os.listdir(PHOTOS_DIR)
                if f.lower().endswith((".jpg", ".jpeg"))
            )
            total = len(files)
            print(f"Found {total} photos in {PHOTOS_DIR}")

            start = time.time()
            for i, filename in enumerate(files, 1):
                filepath = os.path.join(PHOTOS_DIR, filename)

                # Check first so the embedding is not computed for photos
                # that are already in the table.
                cur.execute(
                    "SELECT 1 FROM images WHERE filename = %s", (filename,)
                )
                if cur.fetchone():
                    print(f"[{i}/{total}] Skipping {filename} (already indexed)")
                    continue

                embedding = embed_image(filepath)
                cur.execute(INSERT, (filename, filepath, embedding))
                # Commit per photo so an interrupted run keeps its progress.
                conn.commit()
                print(f"[{i}/{total}] Indexed {filename}")
        finally:
            # Release the cursor even when embedding or a query fails.
            cur.close()
    finally:
        conn.close()

    print(f"Done in {time.time() - start:.1f} seconds.")
# Run the indexer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()