Files
vector-search-demo/oravector-demo/backend/index_images_oracle.py
T
dierk 1c5e00d8e4 Add targeted comments explaining non-obvious behaviour
- embedder.py: lazy model load rationale, RGB conversion, shared vector space
- main.py: why vec appears twice, ::vector cast, 1-distance score formula
- main_oracle.py: why array.array("f") is required instead of plain list
- main_oracle_indb.py: no embedder import — embedding done inside Oracle SQL
- index_images_oracle.py: same array.array requirement on indexing path

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 14:39:40 +02:00

68 lines
2.0 KiB
Python

import os
import array
from dotenv import load_dotenv
from db_oracle import get_connection
from embedder import embed_image
# Load variables from a .env file (if any) before configuration is read below.
load_dotenv()
# Directory that is scanned for photos. os.getenv returns None when the
# PHOTOS_DIR variable is not set.
PHOTOS_DIR = os.getenv("PHOTOS_DIR")
# DDL for the images table: identity primary key, unique filename, file path,
# and a 512-dimensional FLOAT32 vector column holding the image embedding.
CREATE_TABLE = """
CREATE TABLE images (
id NUMBER GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
filename VARCHAR2(255) NOT NULL UNIQUE,
filepath VARCHAR2(1000) NOT NULL,
embedding VECTOR(512, FLOAT32)
)
"""
# DDL for the vector index on images.embedding: in-memory neighbor graph
# (HNSW) using cosine distance with a target accuracy of 95.
CREATE_INDEX = """
CREATE VECTOR INDEX images_embedding_idx
ON images(embedding)
ORGANIZATION INMEMORY NEIGHBOR GRAPH
WITH DISTANCE COSINE
WITH TARGET ACCURACY 95
PARAMETERS (type HNSW, neighbors 32, efconstruction 200)
"""
# Positional binds, in order: filename, filepath, embedding.
INSERT = "INSERT INTO images (filename, filepath, embedding) VALUES (:1, :2, :3)"
def table_exists(cur, table_name="IMAGES"):
    """Return True if the current schema owns a table called *table_name*.

    Args:
        cur: An open database cursor (anything with ``execute``/``fetchone``).
        table_name: Table to look for. Defaults to ``"IMAGES"``, the table
            this script creates. The name is upper-cased before the lookup
            because Oracle stores unquoted identifiers in upper case in the
            data dictionary; quoted mixed-case table names are not supported.

    Returns:
        bool: True when ``user_tables`` contains a matching row.
    """
    # Bind the name instead of embedding it in the SQL text.
    cur.execute(
        "SELECT COUNT(*) FROM user_tables WHERE table_name = :1",
        (table_name.upper(),),
    )
    return cur.fetchone()[0] > 0
def _ensure_schema(conn, cur):
    """Create the images table and its vector index unless the table exists."""
    if table_exists(cur):
        print("Table already exists, skipping creation.")
        return
    cur.execute(CREATE_TABLE)
    cur.execute(CREATE_INDEX)
    conn.commit()
    print("Table and index created.")


def _index_photos(conn, cur):
    """Embed and insert every not-yet-indexed .jpg/.jpeg file in PHOTOS_DIR."""
    files = [f for f in os.listdir(PHOTOS_DIR) if f.lower().endswith((".jpg", ".jpeg"))]
    total = len(files)
    print(f"Found {total} photos in {PHOTOS_DIR}")
    for i, filename in enumerate(files, 1):
        filepath = os.path.join(PHOTOS_DIR, filename)
        cur.execute("SELECT 1 FROM images WHERE filename = :1", (filename,))
        if cur.fetchone():
            print(f"[{i}/{total}] Skipping {filename} (already indexed)")
            continue
        # oracledb requires array.array("f") for VECTOR(512, FLOAT32) — plain list is rejected.
        embedding = array.array("f", embed_image(filepath))
        cur.execute(INSERT, (filename, filepath, embedding))
        # Commit per file so an interrupted run keeps the rows already
        # inserted; the skip check above lets a re-run pick up where it stopped.
        conn.commit()
        print(f"[{i}/{total}] Indexed {filename}")


def main():
    """Index all JPEG photos in PHOTOS_DIR into the Oracle images table.

    Creates the table and vector index on first run, then inserts one row
    (filename, filepath, embedding) per photo that is not already present.

    Raises:
        SystemExit: If the PHOTOS_DIR environment variable is not set.
    """
    # Without this guard os.listdir(None) would silently scan the current
    # working directory instead of failing.
    if not PHOTOS_DIR:
        raise SystemExit("PHOTOS_DIR is not set; point it at the directory containing the photos.")

    conn = get_connection()
    try:
        cur = conn.cursor()
        try:
            _ensure_schema(conn, cur)
            _index_photos(conn, cur)
        finally:
            # Release the cursor even if embedding or a DB call raised.
            cur.close()
    finally:
        conn.close()
    print("Done.")
if __name__ == "__main__":
main()