Initial implementation of generic Excel-to-DB import tool

Supports .xls and .xlsx files, and Oracle and PostgreSQL databases via SQLAlchemy.
Includes CLI (run/inspect/generate-config), YAML config, auto schema
detection, and append/replace/upsert modes.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-13 11:31:47 +02:00
commit 8f7399de58
26 changed files with 663 additions and 0 deletions
+49
View File
@@ -0,0 +1,49 @@
from __future__ import annotations
from pathlib import Path
import pandas as pd
from .config import SheetConfig
def _engine_for(path: Path) -> str:
    """Return the pandas Excel engine name to use for the workbook at *path*.

    The engine is chosen from the file suffix alone (case-insensitive):

    * ``.xls``  -> ``"xlrd"``
    * ``.xlsb`` -> ``"pyxlsb"``
    * anything else -> ``"openpyxl"``

    Args:
        path: Location of the workbook; only its suffix is inspected.

    Returns:
        The engine name to pass as ``engine=`` to pandas' Excel readers.
    """
    suffix = path.suffix.lower()
    if suffix == ".xls":
        return "xlrd"
    if suffix == ".xlsb":
        # openpyxl cannot parse the binary .xlsb format; pandas reads it
        # through the pyxlsb engine instead.
        return "pyxlsb"
    return "openpyxl"
class ExcelReader:
    """Read sheets of a single Excel workbook into pandas DataFrames.

    The workbook path is validated once at construction time; the file is
    opened anew on every call to :meth:`sheet_names` or :meth:`read`.
    """

    # File suffixes (lower-case) accepted by the constructor.
    _SUPPORTED_SUFFIXES = frozenset({".xls", ".xlsx", ".xlsm", ".xlsb"})

    def __init__(self, path: str | Path):
        """Remember *path* after checking that it exists and looks like Excel.

        Args:
            path: Location of the workbook.

        Raises:
            FileNotFoundError: If nothing exists at *path*.
            ValueError: If the suffix is not one of the supported Excel
                suffixes (compared case-insensitively).
        """
        self.path = Path(path)
        if not self.path.exists():
            raise FileNotFoundError(f"Excel file not found: {self.path}")
        if self.path.suffix.lower() not in self._SUPPORTED_SUFFIXES:
            raise ValueError(f"Unsupported file type: {self.path.suffix}")

    def sheet_names(self) -> list[str]:
        """Return the names of all sheets in the workbook."""
        engine = _engine_for(self.path)
        # Use the context manager so the underlying workbook handle is
        # closed as soon as the names have been read.
        with pd.ExcelFile(self.path, engine=engine) as workbook:
            return list(workbook.sheet_names)

    def read(self, cfg: SheetConfig) -> pd.DataFrame:
        """Load the sheet described by *cfg* and apply its column mapping.

        Args:
            cfg: Sheet configuration. This method reads ``cfg.sheet``,
                ``cfg.header_row``, ``cfg.skip_rows`` and ``cfg.columns``
                (whose items expose ``source``, ``target`` and ``skip``).

        Returns:
            The sheet contents with completely empty rows removed, columns
            flagged ``skip`` dropped, and remaining mapped columns renamed
            from ``source`` to ``target``.
        """
        engine = _engine_for(self.path)
        df = pd.read_excel(
            self.path,
            sheet_name=cfg.sheet,
            header=cfg.header_row,
            # A falsy skip_rows (0 or None) means "skip nothing".
            skiprows=range(cfg.skip_rows) if cfg.skip_rows else None,
            engine=engine,
        )

        # Drop completely empty rows.
        df.dropna(how="all", inplace=True)

        # Apply the column mapping: drop skipped columns, then rename.
        if cfg.columns:
            skip_sources = {col.source for col in cfg.columns if col.skip}
            # Only drop columns that are actually present, so a mapping
            # entry for a missing column does not raise.
            present = [name for name in skip_sources if name in df.columns]
            df.drop(columns=present, inplace=True)

            rename_map = {
                col.source: col.target
                for col in cfg.columns
                if not col.skip and col.source != col.target
            }
            df.rename(columns=rename_map, inplace=True)

        return df