Initial implementation of pgvector and Oracle 26ai vector search demo
Three FastAPI backends comparing PostgreSQL/pgvector and Oracle 26ai for semantic image search using CLIP embeddings: Python-side embedding for both databases, plus Oracle in-database embedding via VECTOR_EMBEDDING(CLIP_TXT). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,19 @@
|
||||
import os
|
||||
import oracledb
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
def get_connection():
    """Open a new Oracle connection from the ORA_* environment settings.

    Credentials come from ORA_USER / ORA_PASSWORD; the DSN is assembled as
    ``host:port/service`` from ORA_HOST, ORA_PORT and ORA_SERVICE. Every call
    invokes ``oracledb.connect`` again, so the caller owns the returned
    connection.
    """
    host = os.getenv("ORA_HOST")
    port = os.getenv("ORA_PORT")
    service = os.getenv("ORA_SERVICE")
    dsn = f"{host}:{port}/{service}"

    username = os.getenv("ORA_USER")
    secret = os.getenv("ORA_PASSWORD")
    return oracledb.connect(user=username, password=secret, dsn=dsn)
|
||||
|
||||
def get_connection_indb():
    """Open a new Oracle connection for the in-database embedding account.

    Uses ORA_USER_INDB / ORA_PASSWORD_INDB as credentials and the same
    ``host:port/service`` DSN (ORA_HOST, ORA_PORT, ORA_SERVICE) as
    ``get_connection``. The caller owns the returned connection.
    """
    host = os.getenv("ORA_HOST")
    port = os.getenv("ORA_PORT")
    service = os.getenv("ORA_SERVICE")
    dsn = f"{host}:{port}/{service}"

    username = os.getenv("ORA_USER_INDB")
    secret = os.getenv("ORA_PASSWORD_INDB")
    return oracledb.connect(user=username, password=secret, dsn=dsn)
|
||||
@@ -0,0 +1,17 @@
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from PIL import Image
|
||||
|
||||
_model = None
|
||||
|
||||
def _get_model():
    """Return the module-wide CLIP model, loading it on the first call.

    The instance is stored in the module-level ``_model`` so later calls
    reuse it instead of constructing ``SentenceTransformer`` again.
    """
    global _model
    if _model is not None:
        return _model
    _model = SentenceTransformer("clip-ViT-B-32")
    return _model
|
||||
|
||||
def embed_image(path: str) -> list[float]:
    """Embed the image file at ``path`` and return the vector as a list.

    The image is converted to RGB before being passed to the shared model's
    ``encode``; the result is turned into a plain Python list via
    ``tolist()``.
    """
    picture = Image.open(path)
    rgb_picture = picture.convert("RGB")
    vector = _get_model().encode(rgb_picture)
    return vector.tolist()
|
||||
|
||||
def embed_text(text: str) -> list[float]:
    """Embed ``text`` with the shared model and return the vector as a list."""
    vector = _get_model().encode(text)
    return vector.tolist()
|
||||
@@ -0,0 +1,66 @@
|
||||
import os
|
||||
import array
|
||||
from dotenv import load_dotenv
|
||||
from db_oracle import get_connection
|
||||
from embedder import embed_image
|
||||
|
||||
load_dotenv()
|
||||
|
||||
PHOTOS_DIR = os.getenv("PHOTOS_DIR")
|
||||
|
||||
# DDL for the image catalogue: one row per photo with a unique file name,
# its path, and a 512-dimension FLOAT32 embedding vector.
CREATE_TABLE = """
CREATE TABLE images (
id NUMBER GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
filename VARCHAR2(255) NOT NULL UNIQUE,
filepath VARCHAR2(1000) NOT NULL,
embedding VECTOR(512, FLOAT32)
)
"""

# DDL for the vector index on images.embedding: an in-memory neighbor graph
# (HNSW) using cosine distance with a target accuracy of 95.
CREATE_INDEX = """
CREATE VECTOR INDEX images_embedding_idx
ON images(embedding)
ORGANIZATION INMEMORY NEIGHBOR GRAPH
WITH DISTANCE COSINE
WITH TARGET ACCURACY 95
PARAMETERS (type HNSW, neighbors 32, efconstruction 200)
"""

# Positional binds, in order: filename, filepath, embedding.
INSERT = "INSERT INTO images (filename, filepath, embedding) VALUES (:1, :2, :3)"
|
||||
|
||||
def table_exists(cur):
    """Report whether the current schema already has an IMAGES table.

    Args:
        cur: an open database cursor; it is used to run a COUNT query
            against ``user_tables``.

    Returns:
        True when at least one matching table is found, otherwise False.
    """
    cur.execute("SELECT COUNT(*) FROM user_tables WHERE table_name = 'IMAGES'")
    row = cur.fetchone()
    matches = row[0]
    return matches > 0
|
||||
|
||||
def main():
    """Index every JPEG in PHOTOS_DIR into the Oracle ``images`` table.

    Creates the table and its vector index when the table does not exist yet,
    then embeds each ``.jpg`` / ``.jpeg`` file and inserts it. Files whose
    name is already present in the table are skipped, and each insert is
    committed on its own, so an interrupted run can simply be restarted.

    Raises:
        SystemExit: if PHOTOS_DIR is unset or does not point to a directory.
    """
    # Fail fast: os.listdir(None) would silently list the current directory
    # and the run would only crash later inside os.path.join.
    if not PHOTOS_DIR or not os.path.isdir(PHOTOS_DIR):
        raise SystemExit(
            f"PHOTOS_DIR is not set or is not a directory: {PHOTOS_DIR!r}"
        )

    # Context managers close the cursor and connection even when embedding
    # or an insert raises part-way through the run.
    with get_connection() as conn, conn.cursor() as cur:
        if not table_exists(cur):
            cur.execute(CREATE_TABLE)
            cur.execute(CREATE_INDEX)
            conn.commit()
            print("Table and index created.")
        else:
            print("Table already exists, skipping creation.")

        files = [
            f for f in os.listdir(PHOTOS_DIR)
            if f.lower().endswith((".jpg", ".jpeg"))
        ]
        total = len(files)
        print(f"Found {total} photos in {PHOTOS_DIR}")

        for i, filename in enumerate(files, 1):
            filepath = os.path.join(PHOTOS_DIR, filename)
            cur.execute("SELECT 1 FROM images WHERE filename = :1", (filename,))
            if cur.fetchone():
                print(f"[{i}/{total}] Skipping {filename} (already indexed)")
                continue
            # The vector is bound as a float32 array.
            embedding = array.array("f", embed_image(filepath))
            cur.execute(INSERT, (filename, filepath, embedding))
            conn.commit()
            print(f"[{i}/{total}] Indexed {filename}")

    print("Done.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,49 @@
|
||||
import os
|
||||
import array
|
||||
from fastapi import FastAPI, Query
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import FileResponse
|
||||
from dotenv import load_dotenv
|
||||
from db_oracle import get_connection
|
||||
from embedder import embed_text
|
||||
|
||||
load_dotenv()
|
||||
|
||||
PHOTOS_DIR = os.getenv("PHOTOS_DIR")
|
||||
|
||||
app = FastAPI()
|
||||
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
|
||||
|
||||
@app.get("/search")
def search(q: str = Query(...), limit: int = Query(12)):
    """Return the images most similar to the text query ``q``.

    The query text is embedded on the Python side and compared against
    ``images.embedding`` using cosine distance.

    Args:
        q: free-text search query (required).
        limit: maximum number of rows to return (default 12).

    Returns:
        A list of ``{"filename", "score"}`` dicts ordered from most to least
        similar, where ``score`` is ``1 - cosine distance`` rounded to four
        decimals.
    """
    vec = array.array("f", embed_text(q))
    sql = """
        SELECT filename, 1 - VECTOR_DISTANCE(embedding, :vec, COSINE) AS score
        FROM images
        ORDER BY VECTOR_DISTANCE(embedding, :vec, COSINE)
        FETCH FIRST :lim ROWS ONLY
    """
    # Context managers release the cursor and connection even if the query
    # fails, so a failing request does not leak a database session.
    with get_connection() as conn, conn.cursor() as cur:
        cur.execute(sql, {"vec": vec, "lim": limit})
        rows = cur.fetchall()
    return [{"filename": r[0], "score": round(r[1], 4)} for r in rows]
|
||||
|
||||
@app.get("/stats")
def stats():
    """Return the number of rows in the ``images`` table as ``{"count": n}``."""
    # Context managers release the cursor and connection even if the query
    # fails, so a failing request does not leak a database session.
    with get_connection() as conn, conn.cursor() as cur:
        cur.execute("SELECT COUNT(*) FROM images")
        count = cur.fetchone()[0]
    return {"count": count}
|
||||
|
||||
@app.get("/photos/{filename}")
def get_photo(filename: str):
    """Serve a photo from PHOTOS_DIR as ``image/jpeg``.

    Args:
        filename: bare file name of the photo inside PHOTOS_DIR.

    Raises:
        HTTPException: 404 when ``filename`` is not a plain file name or no
            such file exists in PHOTOS_DIR.
    """
    # Local import so this handler does not depend on the module import list.
    from fastapi import HTTPException

    # Only a bare file name is accepted; anything carrying a directory part
    # (or "." / "..") could resolve outside PHOTOS_DIR.
    if filename in ("", ".", "..") or os.path.basename(filename) != filename:
        raise HTTPException(status_code=404, detail="Photo not found")

    path = os.path.join(PHOTOS_DIR, filename)
    # Answer a missing file with 404 instead of failing while sending it.
    if not os.path.isfile(path):
        raise HTTPException(status_code=404, detail="Photo not found")
    return FileResponse(path, media_type="image/jpeg")
|
||||
@@ -0,0 +1,55 @@
|
||||
import os
|
||||
from fastapi import FastAPI, Query
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import FileResponse
|
||||
from dotenv import load_dotenv
|
||||
from db_oracle import get_connection_indb
|
||||
|
||||
load_dotenv()
|
||||
|
||||
PHOTOS_DIR = os.getenv("PHOTOS_DIR")
|
||||
|
||||
app = FastAPI()
|
||||
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
|
||||
|
||||
@app.get("/search")
def search(q: str = Query(...), limit: int = Query(12)):
    """Return the images most similar to ``q``, embedding the text in Oracle.

    The query text is bound as-is; the database computes its embedding with
    ``VECTOR_EMBEDDING(CLIP_TXT ...)`` and compares it against
    ``VECTOR.FOTO_VEKTOR.foto_vek`` using cosine distance.

    Args:
        q: free-text search query (required).
        limit: maximum number of rows to return (default 12).

    Returns:
        A list of ``{"filename", "score"}`` dicts ordered from most to least
        similar, where ``score`` is ``1 - cosine distance`` rounded to four
        decimals.
    """
    sql = """
        SELECT filename,
               1 - VECTOR_DISTANCE(
                   foto_vek,
                   VECTOR_EMBEDDING(CLIP_TXT USING :q AS data),
                   COSINE
               ) AS score
        FROM VECTOR.FOTO_VEKTOR
        ORDER BY VECTOR_DISTANCE(
            foto_vek,
            VECTOR_EMBEDDING(CLIP_TXT USING :q AS data),
            COSINE
        )
        FETCH FIRST :lim ROWS ONLY
    """
    # Context managers release the cursor and connection even if the query
    # fails, so a failing request does not leak a database session.
    with get_connection_indb() as conn, conn.cursor() as cur:
        cur.execute(sql, {"q": q, "lim": limit})
        rows = cur.fetchall()
    return [{"filename": r[0], "score": round(r[1], 4)} for r in rows]
|
||||
|
||||
@app.get("/stats")
def stats():
    """Return the row count of ``VECTOR.FOTO_VEKTOR`` as ``{"count": n}``."""
    # Context managers release the cursor and connection even if the query
    # fails, so a failing request does not leak a database session.
    with get_connection_indb() as conn, conn.cursor() as cur:
        cur.execute("SELECT COUNT(*) FROM VECTOR.FOTO_VEKTOR")
        count = cur.fetchone()[0]
    return {"count": count}
|
||||
|
||||
@app.get("/photos/{filename}")
def get_photo(filename: str):
    """Serve a photo from PHOTOS_DIR as ``image/jpeg``.

    Args:
        filename: bare file name of the photo inside PHOTOS_DIR.

    Raises:
        HTTPException: 404 when ``filename`` is not a plain file name or no
            such file exists in PHOTOS_DIR.
    """
    # Local import so this handler does not depend on the module import list.
    from fastapi import HTTPException

    # Only a bare file name is accepted; anything carrying a directory part
    # (or "." / "..") could resolve outside PHOTOS_DIR.
    if filename in ("", ".", "..") or os.path.basename(filename) != filename:
        raise HTTPException(status_code=404, detail="Photo not found")

    path = os.path.join(PHOTOS_DIR, filename)
    # Answer a missing file with 404 instead of failing while sending it.
    if not os.path.isfile(path):
        raise HTTPException(status_code=404, detail="Photo not found")
    return FileResponse(path, media_type="image/jpeg")
|
||||
Reference in New Issue
Block a user