Initial implementation of generic Excel-to-DB import tool

Supports .xls and .xlsx, Oracle and PostgreSQL via SQLAlchemy.
Includes CLI (run/inspect/generate-config), YAML config, auto schema
detection, and append/replace/upsert modes.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-13 11:31:47 +02:00
commit 8f7399de58
26 changed files with 663 additions and 0 deletions
+4
View File
@@ -0,0 +1,4 @@
# Package entry point: re-export the reader and importer classes so callers
# can import them directly from the package.
from .reader import ExcelReader
from .importer import Importer
# Names exported by `from <package> import *`.
__all__ = ["ExcelReader", "Importer"]
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
+87
View File
@@ -0,0 +1,87 @@
from __future__ import annotations
import logging
import sys
from pathlib import Path
import click
from .config import ImportConfig, SheetConfig
from .importer import Importer
from .reader import ExcelReader
def _setup_logging(verbose: bool):
level = logging.DEBUG if verbose else logging.INFO
logging.basicConfig(format="%(levelname)s %(message)s", level=level)
# Root click command group. The `run`, `inspect` and `generate-config`
# commands in this module register themselves on it via `@main.command(...)`.
# NOTE(review): click is expected to surface the docstring as the CLI help
# text, so treat it as user-facing wording.
@click.group()
def main():
    """Generic Excel-to-database import tool (Oracle & PostgreSQL)."""
@main.command()
@click.argument("excel_file", type=click.Path(exists=True))
@click.argument("config_file", type=click.Path(exists=True))
@click.option("-v", "--verbose", is_flag=True)
def run(excel_file: str, config_file: str, verbose: bool):
    """Import EXCEL_FILE using CONFIG_FILE (YAML)."""
    # Exits with status 1 and a one-line "ERROR: ..." message on stderr when
    # anything fails; otherwise prints one summary line per target table.
    _setup_logging(verbose)
    try:
        # Loading the config and creating the engine can fail just like the
        # import itself (bad YAML, missing "dsn", malformed DSN), so they are
        # reported through the same handler instead of a raw traceback.
        cfg = ImportConfig.from_yaml(config_file)
        results = Importer(cfg).run(excel_file)
    except Exception as exc:  # top-level CLI boundary
        # Keep the full traceback available when running with --verbose.
        logging.getLogger(__name__).debug("Import failed", exc_info=True)
        click.echo(f"ERROR: {exc}", err=True)
        sys.exit(1)
    for table, rows in results.items():
        click.echo(f" {table}: {rows} rows imported")
@main.command()
@click.argument("excel_file", type=click.Path(exists=True))
def inspect(excel_file: str):
    """Show sheet names and column preview of EXCEL_FILE."""
    # At most this many column names are listed per sheet.
    reader = ExcelReader(excel_file)
    names = reader.sheet_names()
    click.echo(f"Sheets in {Path(excel_file).name}:")
    for i, name in enumerate(names):
        click.echo(f" [{i}] {name}")
        # Sheets are loaded by position with default settings; the frame
        # supplies both the column preview and the row count.
        df = reader.read(SheetConfig(sheet=i))
        click.echo(f" Columns ({len(df.columns)}): {', '.join(str(c) for c in df.columns[:8])}")
        has_more_columns = len(df.columns) > 8
        if has_more_columns:
            click.echo(f" ... and {len(df.columns) - 8} more")
        click.echo(f" Rows: {len(df)}")
@main.command("generate-config")
@click.argument("excel_file", type=click.Path(exists=True))
@click.option("--dsn", default="postgresql+psycopg2://user:pass@localhost/dbname", show_default=True)
@click.option("--output", "-o", default="import_config.yaml", show_default=True)
def generate_config(excel_file: str, dsn: str, output: str):
    """Generate a starter YAML config from EXCEL_FILE's structure."""
    # One entry is produced per sheet: the table name and column targets are
    # the sheet/column names lower-cased with spaces replaced by underscores.
    import yaml

    reader = ExcelReader(excel_file)
    sheets = []
    for i, name in enumerate(reader.sheet_names()):
        # Read by position, but record the sheet by name in the config.
        df = reader.read(SheetConfig(sheet=i))
        columns = [
            {"source": str(c), "target": str(c).lower().replace(" ", "_")}
            for c in df.columns
        ]
        sheets.append({
            "sheet": name,
            "header_row": 0,
            "target_table": name.lower().replace(" ", "_"),
            "mode": "append",
            "columns": columns,
        })
    config = {"dsn": dsn, "sheets": sheets}
    # allow_unicode=True writes non-ASCII names verbatim, so the file must be
    # opened as UTF-8 explicitly; the platform default encoding may not be
    # able to represent them.
    with open(output, "w", encoding="utf-8") as f:
        yaml.dump(config, f, allow_unicode=True, sort_keys=False)
    click.echo(f"Config written to {output}")
+48
View File
@@ -0,0 +1,48 @@
from __future__ import annotations
from dataclasses import dataclass, field
from pathlib import Path
from typing import Literal
import yaml
@dataclass
class ColumnMapping:
    """Mapping of one Excel column onto a database column."""

    source: str  # column header as it appears in the sheet
    target: str  # name the column is renamed to after reading
    dtype: str | None = None  # override detected type, e.g. "VARCHAR(100)", "NUMBER"
    skip: bool = False  # when true, the source column is dropped after reading
@dataclass
class SheetConfig:
    """Import settings for a single worksheet.

    Raises:
        ValueError: if ``mode`` is not "append", "replace" or "upsert".
    """

    sheet: str | int = 0  # sheet name or index
    header_row: int = 0  # 0-based row index of the header
    skip_rows: int = 0  # rows to skip before header
    target_table: str = ""  # name of the destination table
    columns: list[ColumnMapping] = field(default_factory=list)  # optional rename/skip/type rules
    mode: Literal["append", "replace", "upsert"] = "append"
    upsert_keys: list[str] = field(default_factory=list)  # column names for upsert PK

    def __post_init__(self):
        # A typo such as "Append" or "update" would otherwise be accepted here
        # and only show up later as an unintended plain append.
        if self.mode not in ("append", "replace", "upsert"):
            raise ValueError(
                f"Invalid mode {self.mode!r}; expected 'append', 'replace' or 'upsert'"
            )
@dataclass
class ImportConfig:
    """Top-level import configuration: database DSN plus per-sheet settings."""

    dsn: str  # SQLAlchemy DSN
    sheets: list[SheetConfig] = field(default_factory=list)
    default_varchar_length: int = 255

    @classmethod
    def from_yaml(cls, path: str | Path) -> "ImportConfig":
        """Build an ImportConfig from the YAML file at ``path``.

        Raises:
            ValueError: if the file does not contain a top-level mapping
                (for example, it is empty).
            KeyError: if the mapping has no "dsn" entry.
        """
        try:
            # Prefer UTF-8 so non-ASCII sheet and column names load the same
            # way on every platform.
            with open(path, encoding="utf-8") as f:
                raw = yaml.safe_load(f)
        except UnicodeDecodeError:
            # Fall back to the platform default for files written that way.
            with open(path) as f:
                raw = yaml.safe_load(f)
        if not isinstance(raw, dict):
            raise ValueError(f"Config file {path} must contain a YAML mapping at the top level")

        sheets = []
        # "sheets:", "columns:" or "upsert_keys:" left empty load as None.
        for entry in raw.get("sheets") or []:
            settings = dict(entry)  # work on a copy; leave the parsed data intact
            columns = [ColumnMapping(**c) for c in settings.pop("columns", None) or []]
            upsert_keys = settings.pop("upsert_keys", None) or []
            sheets.append(SheetConfig(**settings, columns=columns, upsert_keys=upsert_keys))
        return cls(
            dsn=raw["dsn"],
            default_varchar_length=raw.get("default_varchar_length", 255),
            sheets=sheets,
        )
+127
View File
@@ -0,0 +1,127 @@
from __future__ import annotations
import logging
from pathlib import Path
import pandas as pd
from sqlalchemy import create_engine, text, MetaData, Table, inspect
from sqlalchemy.dialects.postgresql import insert as pg_insert
from .config import ImportConfig, SheetConfig
from .reader import ExcelReader
from .schema import build_columns
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class Importer:
    """Import Excel sheets into database tables as described by an ImportConfig.

    A SQLAlchemy engine is created from ``config.dsn``. Each sheet is written
    inside its own ``engine.begin()`` transaction.
    """

    _VALID_MODES = ("append", "replace", "upsert")

    def __init__(self, config: ImportConfig):
        self.config = config
        self.engine = create_engine(config.dsn)

    def run(self, excel_path: str | Path) -> dict[str, int]:
        """Import all configured sheets. Returns {table_name: rows_imported}."""
        reader = ExcelReader(excel_path)
        results: dict[str, int] = {}
        for sheet_cfg in self.config.sheets:
            rows = self._import_sheet(reader, sheet_cfg)
            # Several sheets may feed the same table; report their total
            # instead of letting the last sheet overwrite the count.
            results[sheet_cfg.target_table] = results.get(sheet_cfg.target_table, 0) + rows
        return results

    def _import_sheet(self, reader: ExcelReader, cfg: SheetConfig) -> int:
        """Read one sheet and write it to ``cfg.target_table``.

        Returns the number of rows written (0 for an empty sheet).

        Raises:
            ValueError: if ``cfg.mode`` is not a supported mode.
        """
        # Reject unknown modes before touching the file or the database; they
        # must not silently fall through to a plain append.
        if cfg.mode not in self._VALID_MODES:
            raise ValueError(
                f"Invalid mode {cfg.mode!r} for sheet {cfg.sheet!r}; "
                f"expected one of {self._VALID_MODES}"
            )
        df = reader.read(cfg)
        if df.empty:
            logger.warning("Sheet %r is empty, skipping.", cfg.sheet)
            return 0
        logger.info("Read %d rows from sheet %r -> table %r", len(df), cfg.sheet, cfg.target_table)
        with self.engine.begin() as conn:
            self._ensure_table(conn, df, cfg)
            if cfg.mode == "replace":
                self._clear_table(conn, cfg.target_table)
                rows = self._bulk_insert(conn, df, cfg.target_table)
            elif cfg.mode == "upsert":
                rows = self._upsert(conn, df, cfg)
            else:  # append
                rows = self._bulk_insert(conn, df, cfg.target_table)
        logger.info("Imported %d rows into %r (mode=%s)", rows, cfg.target_table, cfg.mode)
        return rows

    def _clear_table(self, conn, table_name: str):
        """Remove every row from ``table_name`` (DELETE on sqlite, TRUNCATE otherwise)."""
        # Quote the name with the dialect's own rules so it matches the
        # identifier SQLAlchemy emitted when the table was created
        # (mixed-case or reserved-word names are quoted there too).
        quoted = self.engine.dialect.identifier_preparer.quote(table_name)
        if self.engine.dialect.name == "sqlite":
            statement = f"DELETE FROM {quoted}"
        else:
            # NOTE(review): on some databases TRUNCATE is not transactional;
            # confirm whether a failed insert can still roll back the clear.
            statement = f"TRUNCATE TABLE {quoted}"
        conn.execute(text(statement))

    def _ensure_table(self, conn, df: pd.DataFrame, cfg: SheetConfig):
        """Create ``cfg.target_table`` from the frame's columns if it does not exist."""
        if inspect(conn).has_table(cfg.target_table):
            return
        meta = MetaData()
        cols = build_columns(df, cfg.columns, self.config.default_varchar_length)
        # Constructing the Table registers it on ``meta``; create_all emits the DDL.
        Table(cfg.target_table, meta, *cols)
        meta.create_all(conn)
        logger.info("Created table %r", cfg.target_table)

    @staticmethod
    def _reflect_table(conn, table_name: str) -> Table:
        """Load the definition of ``table_name`` from the database."""
        meta = MetaData()
        meta.reflect(bind=conn, only=[table_name])
        return meta.tables[table_name]

    def _bulk_insert(self, conn, df: pd.DataFrame, table_name: str) -> int:
        """Insert every row of ``df`` into ``table_name``; returns the row count."""
        records = _df_to_records(df)
        if not records:
            return 0
        table = self._reflect_table(conn, table_name)
        conn.execute(table.insert(), records)
        return len(records)

    def _upsert(self, conn, df: pd.DataFrame, cfg: SheetConfig) -> int:
        """Insert-or-update the rows of ``df`` keyed on ``cfg.upsert_keys``.

        Returns the number of records sent to the database.

        Raises:
            ValueError: if ``cfg.upsert_keys`` is empty.
            NotImplementedError: for dialects other than PostgreSQL and Oracle.
        """
        # Without keys neither ON CONFLICT nor MERGE can be expressed.
        if not cfg.upsert_keys:
            raise ValueError(
                f"Upsert into {cfg.target_table!r} requires at least one column in upsert_keys"
            )
        dialect = self.engine.dialect.name
        records = _df_to_records(df)
        if not records:
            return 0
        table = self._reflect_table(conn, cfg.target_table)
        if dialect == "postgresql":
            stmt = pg_insert(table).values(records)
            update_cols = {
                c.key: stmt.excluded[c.key]
                for c in table.columns
                if c.key not in cfg.upsert_keys
            }
            if update_cols:
                stmt = stmt.on_conflict_do_update(index_elements=cfg.upsert_keys, set_=update_cols)
            else:
                # Every column is part of the key, so there is nothing to
                # update on conflict; an empty SET clause would be rejected.
                stmt = stmt.on_conflict_do_nothing(index_elements=cfg.upsert_keys)
            conn.execute(stmt)
        elif dialect == "oracle":
            # Oracle MERGE via raw SQL, one statement per record
            for record in records:
                _oracle_merge(conn, table, record, cfg.upsert_keys)
        else:
            raise NotImplementedError(f"Upsert not implemented for dialect: {dialect}")
        return len(records)
def _df_to_records(df: pd.DataFrame) -> list[dict]:
# Replace pandas NA/NaT with None so SQLAlchemy handles nulls correctly
return [
{k: (None if pd.isna(v) else v) for k, v in row.items()}
for row in df.to_dict(orient="records")
]
def _oracle_merge(conn, table: Table, record: dict, keys: list[str]):
key_clauses = " AND ".join(f"t.{k} = s.{k}" for k in keys)
all_cols = list(record.keys())
non_keys = [c for c in all_cols if c not in keys]
select_parts = ", ".join(f":{c} AS {c}" for c in all_cols)
update_parts = ", ".join(f"t.{c} = s.{c}" for c in non_keys)
insert_cols = ", ".join(all_cols)
insert_vals = ", ".join(f"s.{c}" for c in all_cols)
sql = f"""
MERGE INTO {table.name} t
USING (SELECT {select_parts} FROM dual) s
ON ({key_clauses})
WHEN MATCHED THEN UPDATE SET {update_parts}
WHEN NOT MATCHED THEN INSERT ({insert_cols}) VALUES ({insert_vals})
"""
conn.execute(text(sql), record)
+49
View File
@@ -0,0 +1,49 @@
from __future__ import annotations
from pathlib import Path
import pandas as pd
from .config import SheetConfig
def _engine_for(path: Path) -> str:
return "xlrd" if path.suffix.lower() == ".xls" else "openpyxl"
class ExcelReader:
    """Read worksheets of one Excel workbook into pandas DataFrames."""

    def __init__(self, path: str | Path):
        """Remember ``path`` after checking it exists and has a supported suffix.

        Raises:
            FileNotFoundError: if ``path`` does not exist.
            ValueError: if the suffix is not .xls, .xlsx, .xlsm or .xlsb.
        """
        self.path = Path(path)
        if not self.path.exists():
            raise FileNotFoundError(f"Excel file not found: {self.path}")
        if self.path.suffix.lower() not in {".xls", ".xlsx", ".xlsm", ".xlsb"}:
            raise ValueError(f"Unsupported file type: {self.path.suffix}")

    def sheet_names(self) -> list[str]:
        """Return the names of all sheets in the workbook."""
        engine = _engine_for(self.path)
        # The context manager closes the workbook handle once the names are read.
        with pd.ExcelFile(self.path, engine=engine) as xf:
            return list(xf.sheet_names)

    def read(self, cfg: SheetConfig) -> pd.DataFrame:
        """Load the sheet described by ``cfg`` and apply its column mapping.

        Rows that are entirely empty are dropped. Columns marked ``skip`` are
        removed and the remaining mapped columns are renamed from ``source``
        to ``target``; columns without a mapping are kept unchanged.
        """
        engine = _engine_for(self.path)
        df = pd.read_excel(
            self.path,
            sheet_name=cfg.sheet,
            header=cfg.header_row,
            skiprows=range(cfg.skip_rows) if cfg.skip_rows else None,
            engine=engine,
        )
        # drop completely empty rows
        df.dropna(how="all", inplace=True)
        # apply column mapping: rename and drop skipped columns
        if cfg.columns:
            skip_sources = {c.source for c in cfg.columns if c.skip}
            # Skipped columns that are absent from the sheet are ignored.
            df.drop(columns=[c for c in skip_sources if c in df.columns], inplace=True)
            rename_map = {
                c.source: c.target
                for c in cfg.columns
                if not c.skip and c.source != c.target
            }
            df.rename(columns=rename_map, inplace=True)
        return df
+72
View File
@@ -0,0 +1,72 @@
from __future__ import annotations
import pandas as pd
from sqlalchemy import (
Column, Integer, Float, String, DateTime, Date, Boolean, Numeric, Text
)
from .config import ColumnMapping
def _pandas_dtype_to_sqla(series: pd.Series, varchar_length: int):
    """Pick a SQLAlchemy column type for ``series`` from its pandas dtype.

    Bool, integer, float and datetime dtypes map to Boolean, Integer, Float
    and DateTime. Object columns become DateTime when a sample of their
    non-numeric values parses as dates, otherwise a String at least
    ``varchar_length`` wide. Any other dtype falls back to Text.
    """
    import numbers

    dtype = series.dtype
    if pd.api.types.is_bool_dtype(dtype):
        return Boolean()
    if pd.api.types.is_integer_dtype(dtype):
        return Integer()
    if pd.api.types.is_float_dtype(dtype):
        return Float()
    if pd.api.types.is_datetime64_any_dtype(dtype):
        return DateTime()
    # object columns: check if they look like dates
    if dtype == object:
        non_null = series.dropna()
        sample = non_null.head(100)
        # pd.to_datetime happily converts plain numbers (as epoch offsets), so
        # a sample containing numbers must not be classified as dates.
        has_numbers = any(isinstance(v, numbers.Number) for v in sample)
        if len(sample) > 0 and not has_numbers:
            try:
                pd.to_datetime(sample)
                return DateTime()
            except Exception:
                # Best-effort probe: anything unparseable means "not dates".
                pass
        max_len = int(non_null.astype(str).str.len().max()) if len(non_null) > 0 else 1
        # Leave some headroom over the longest value seen so far.
        return String(max(max_len + 10, varchar_length))
    return Text()
def _override_to_sqla(dtype_str: str):
"""Convert a user-supplied type string like 'VARCHAR(100)' to a SQLAlchemy type."""
s = dtype_str.upper().strip()
if s.startswith("VARCHAR"):
length = int(s.split("(")[1].rstrip(")")) if "(" in s else 255
return String(length)
if s in ("TEXT", "CLOB"):
return Text()
if s in ("INTEGER", "INT", "NUMBER"):
return Integer()
if s.startswith("NUMBER") or s.startswith("NUMERIC") or s.startswith("DECIMAL"):
if "(" in s:
parts = s.split("(")[1].rstrip(")").split(",")
p, sc = int(parts[0]), int(parts[1]) if len(parts) > 1 else 0
return Numeric(precision=p, scale=sc)
return Numeric()
if s in ("FLOAT", "REAL", "DOUBLE"):
return Float()
if s in ("DATETIME", "TIMESTAMP"):
return DateTime()
if s == "DATE":
return Date()
if s in ("BOOLEAN", "BOOL"):
return Boolean()
raise ValueError(f"Unknown dtype override: {dtype_str!r}")
def build_columns(df: pd.DataFrame, column_configs: list[ColumnMapping], varchar_length: int) -> list[Column]:
    """Build one SQLAlchemy Column per DataFrame column.

    A column whose name matches a non-skipped mapping with a ``dtype`` uses
    that override; every other column gets a type detected from its data.
    """
    # Overrides are keyed by the post-rename name (target, else source).
    overrides = {}
    for mapping in column_configs:
        if mapping.skip or not mapping.dtype:
            continue
        overrides[mapping.target or mapping.source] = mapping.dtype

    def _type_for(label):
        forced = overrides.get(str(label))
        if forced:
            return _override_to_sqla(forced)
        return _pandas_dtype_to_sqla(df[label], varchar_length)

    return [Column(str(label), _type_for(label)) for label in df.columns]