Add in-DB indexing script, benchmark results, schema names in presentation
- index_images_indb.py: new script that indexes images via VECTOR_EMBEDDING(CLIP_IMG), using a two-step INSERT + UPDATE to work around ORA-24816
- index_images_oracle.py / index_images.py: add timing output
- Presentation: schema names VECTORS_USER/VECTOR in diagram and comparison, ONNX expansion, HNSW index note on slide 11, indexing times updated from 3-run benchmark (avg: PG 12.1s, Ora 12.1s, InDB 13.6s)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,49 @@
|
||||
import os
|
||||
import time
|
||||
from dotenv import load_dotenv
|
||||
from db_oracle import get_connection_indb
|
||||
|
||||
# Load variables from a local .env file (if present) into the process
# environment before any of them are read below.
load_dotenv()

# Directory that holds the photos to index. os.getenv returns None when the
# PHOTOS_DIR variable is not set.
PHOTOS_DIR = os.getenv("PHOTOS_DIR")
|
||||
|
||||
def main():
    """Index every JPEG in PHOTOS_DIR into VECTOR.FOTO_VEKTOR.

    For each ``.jpg`` / ``.jpeg`` file (case-insensitive) that is not yet
    present in the table, the image bytes are stored and the embedding is
    computed inside the database with ``VECTOR_EMBEDDING(CLIP_IMG ...)``.
    Progress and the total elapsed time are printed to stdout.

    Raises:
        SystemExit: if the PHOTOS_DIR environment variable is not set.
    """
    # Fail early with a clear message: os.listdir(None) would silently list
    # the current working directory, and os.path.join(None, ...) would then
    # raise an opaque TypeError.
    if not PHOTOS_DIR:
        raise SystemExit(
            "PHOTOS_DIR is not set; define it in the environment or in .env"
        )

    conn = get_connection_indb()
    try:
        cur = conn.cursor()
        try:
            cur.execute("SELECT COUNT(*) FROM VECTOR.FOTO_VEKTOR")
            print(f"Rows before: {cur.fetchone()[0]}")

            # Sorted so that progress output is reproducible between runs;
            # os.listdir returns entries in arbitrary order.
            files = sorted(
                name
                for name in os.listdir(PHOTOS_DIR)
                if name.lower().endswith((".jpg", ".jpeg"))
            )
            total = len(files)
            print(f"Found {total} photos in {PHOTOS_DIR}")

            # perf_counter is monotonic, so the measured duration is not
            # affected by system clock adjustments.
            start = time.perf_counter()
            for i, filename in enumerate(files, 1):
                if _index_photo(conn, cur, filename):
                    print(f"[{i}/{total}] Indexed {filename}")
                else:
                    print(f"[{i}/{total}] Skipping {filename} (already indexed)")
            elapsed = time.perf_counter() - start
            print(f"Done in {elapsed:.1f} seconds.")
        finally:
            cur.close()
    finally:
        conn.close()


def _index_photo(conn, cur, filename):
    """Store one photo and its in-database embedding; return False if it already exists."""
    cur.execute("SELECT 1 FROM VECTOR.FOTO_VEKTOR WHERE filename = :1", (filename,))
    if cur.fetchone():
        return False

    filepath = os.path.join(PHOTOS_DIR, filename)
    with open(filepath, "rb") as f:
        blob_data = f.read()

    # ORA-24816: Oracle cannot bind the same BLOB as both column value and
    # VECTOR_EMBEDDING() input in one statement. Insert the BLOB first, then
    # let Oracle compute the embedding from the stored data in a second step.
    try:
        cur.execute(
            "INSERT INTO VECTOR.FOTO_VEKTOR (filename, foto) VALUES (:1, :2)",
            (filename, blob_data),
        )
        cur.execute(
            """UPDATE VECTOR.FOTO_VEKTOR
               SET foto_vek = VECTOR_EMBEDDING(CLIP_IMG USING foto AS data)
               WHERE filename = :1""",
            (filename,),
        )
    except Exception:
        # Undo the INSERT if the embedding step failed, so the session does
        # not keep a pending row that has image data but no vector.
        conn.rollback()
        raise
    conn.commit()
    return True
|
||||
|
||||
# Run the indexer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user