3ef43019be
- index_images_indb.py: new script indexing via VECTOR_EMBEDDING(CLIP_IMG) using a two-step INSERT+UPDATE to work around ORA-24816 - index_images_oracle.py / index_images.py: add timing output - Presentation: schema names VECTORS_USER/VECTOR in diagram and comparison, ONNX expansion, HNSW index note on slide 11, indexing times updated from 3-run benchmark (avg: PG 12.1s, Ora 12.1s, InDB 13.6s) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
70 lines
2.0 KiB
Python
70 lines
2.0 KiB
Python
import os
|
|
import array
|
|
import time
|
|
from dotenv import load_dotenv
|
|
from db_oracle import get_connection
|
|
from embedder import embed_image
|
|
|
|
# Load settings from a .env file (python-dotenv) before any of them are read.
load_dotenv()

# Directory that is scanned for photos; None when the variable is not set.
PHOTOS_DIR = os.environ.get("PHOTOS_DIR")

# DDL for the image catalogue: one row per file, with a unique filename and a
# 512-dimensional FLOAT32 embedding column.
CREATE_TABLE = """
CREATE TABLE images (
id NUMBER GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
filename VARCHAR2(255) NOT NULL UNIQUE,
filepath VARCHAR2(1000) NOT NULL,
embedding VECTOR(512, FLOAT32)
)
"""

# DDL for the in-memory HNSW vector index over the embedding column, using
# cosine distance and a 95% target accuracy.
CREATE_INDEX = """
CREATE VECTOR INDEX images_embedding_idx
ON images(embedding)
ORGANIZATION INMEMORY NEIGHBOR GRAPH
WITH DISTANCE COSINE
WITH TARGET ACCURACY 95
PARAMETERS (type HNSW, neighbors 32, efconstruction 200)
"""

# Positional-bind insert used for every newly indexed photo.
INSERT = "INSERT INTO images (filename, filepath, embedding) VALUES (:1, :2, :3)"
|
|
|
|
def table_exists(cur):
    """Report whether the current schema already has an IMAGES table.

    Args:
        cur: An open database cursor; it is used to run one query against
            the ``user_tables`` dictionary view.

    Returns:
        True when at least one table named ``IMAGES`` is listed, else False.
    """
    cur.execute("SELECT COUNT(*) FROM user_tables WHERE table_name = 'IMAGES'")
    matching_tables = cur.fetchone()[0]
    return matching_tables > 0
|
|
|
|
def main():
    """Index every JPEG found in PHOTOS_DIR into the ``images`` table.

    Creates the table and its vector index when the table is missing, then
    embeds and inserts each photo whose filename is not stored yet. Each
    insert is committed on its own, so files indexed before an interruption
    are skipped on the next run. Progress and the elapsed time are printed.

    Raises:
        SystemExit: If the PHOTOS_DIR environment variable is not set.
    """
    if not PHOTOS_DIR:
        # os.listdir(None) would silently scan the current directory and
        # os.path.join(None, ...) would fail later with an unrelated
        # TypeError, so stop with a clear message before touching the database.
        raise SystemExit("PHOTOS_DIR is not set; define it in the environment or .env file.")

    conn = get_connection()
    try:
        cur = conn.cursor()
        try:
            if not table_exists(cur):
                cur.execute(CREATE_TABLE)
                cur.execute(CREATE_INDEX)
                conn.commit()
                print("Table and index created.")
            else:
                print("Table already exists, skipping creation.")

            # os.listdir returns entries in arbitrary order; sorting makes the
            # progress output reproducible between runs.
            files = sorted(
                f for f in os.listdir(PHOTOS_DIR) if f.lower().endswith((".jpg", ".jpeg"))
            )
            total = len(files)
            print(f"Found {total} photos in {PHOTOS_DIR}")

            # perf_counter is monotonic, so the measurement is not affected by
            # system clock adjustments.
            start = time.perf_counter()
            for i, filename in enumerate(files, 1):
                filepath = os.path.join(PHOTOS_DIR, filename)
                cur.execute("SELECT 1 FROM images WHERE filename = :1", (filename,))
                if cur.fetchone():
                    print(f"[{i}/{total}] Skipping {filename} (already indexed)")
                    continue
                # oracledb requires array.array("f") for VECTOR(512, FLOAT32) — plain list is rejected.
                embedding = array.array("f", embed_image(filepath))
                cur.execute(INSERT, (filename, filepath, embedding))
                conn.commit()
                print(f"[{i}/{total}] Indexed {filename}")
        finally:
            # Release the cursor even when embedding or an insert fails.
            cur.close()
    finally:
        conn.close()

    print(f"Done in {time.perf_counter() - start:.1f} seconds.")
|
|
|
|
# Run the indexer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|