Initial implementation of generic Excel-to-DB import tool

Supports .xls and .xlsx input files, and Oracle and PostgreSQL targets via SQLAlchemy. Includes CLI (run/inspect/generate-config), YAML config, auto schema detection, and append/replace/upsert modes.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-13 11:31:47 +02:00
commit 8f7399de58
26 changed files with 663 additions and 0 deletions
@@ -0,0 +1,49 @@
+from __future__ import annotations
+from pathlib import Path
+import pandas as pd
+
+from .config import SheetConfig
+
+
+def _engine_for(path: Path) -> str:
+    """Return the pandas Excel engine name matching *path*'s extension.
+
+    ``.xls`` maps to ``xlrd`` and ``.xlsb`` to ``pyxlsb``; every other
+    suffix falls back to ``openpyxl``. The suffix is compared
+    case-insensitively.
+    """
+    suffix = path.suffix.lower()
+    if suffix == ".xls":
+        return "xlrd"
+    if suffix == ".xlsb":
+        # openpyxl cannot parse the binary workbook format; pandas reads
+        # it through the pyxlsb engine instead.
+        return "pyxlsb"
+    return "openpyxl"
+
+
+class ExcelReader:
+    """Read sheets of a single Excel workbook into pandas DataFrames."""
+
+    def __init__(self, path: str | Path):
+        """Validate and remember the workbook location.
+
+        Args:
+            path: Location of the workbook on disk.
+
+        Raises:
+            FileNotFoundError: If *path* does not exist.
+            ValueError: If the suffix is not one of ``.xls``, ``.xlsx``,
+                ``.xlsm`` or ``.xlsb`` (compared case-insensitively).
+        """
+        self.path = Path(path)
+        if not self.path.exists():
+            raise FileNotFoundError(f"Excel file not found: {self.path}")
+        if self.path.suffix.lower() not in {".xls", ".xlsx", ".xlsm", ".xlsb"}:
+            raise ValueError(f"Unsupported file type: {self.path.suffix}")
+
+    def sheet_names(self) -> list[str]:
+        """Return the names of the sheets contained in the workbook."""
+        engine = _engine_for(self.path)
+        # Use the context manager so the workbook's file handle is closed
+        # as soon as the names have been read.
+        with pd.ExcelFile(self.path, engine=engine) as xf:
+            return xf.sheet_names
+
+    def read(self, cfg: SheetConfig) -> pd.DataFrame:
+        """Load the sheet described by *cfg* and apply its column mapping.
+
+        Args:
+            cfg: Sheet settings. This method reads ``cfg.sheet``,
+                ``cfg.header_row``, ``cfg.skip_rows`` and ``cfg.columns``;
+                each column entry supplies ``source``, ``target`` and
+                ``skip``.
+
+        Returns:
+            The sheet data with fully empty rows removed, skipped columns
+            dropped and the remaining mapped columns renamed to their
+            targets.
+        """
+        engine = _engine_for(self.path)
+        df = pd.read_excel(
+            self.path,
+            sheet_name=cfg.sheet,
+            header=cfg.header_row,
+            # A falsy skip_rows (0 or None) means "skip nothing".
+            skiprows=range(cfg.skip_rows) if cfg.skip_rows else None,
+            engine=engine,
+        )
+        # drop completely empty rows
+        df.dropna(how="all", inplace=True)
+
+        # apply column mapping: rename and drop skipped columns
+        if cfg.columns:
+            skip_sources = {c.source for c in cfg.columns if c.skip}
+            # Only drop columns that are actually present, so a configured
+            # but missing source column does not raise.
+            present = [name for name in skip_sources if name in df.columns]
+            df.drop(columns=present, inplace=True)
+
+            rename_map = {
+                c.source: c.target
+                for c in cfg.columns
+                if not c.skip and c.source != c.target
+            }
+            df.rename(columns=rename_map, inplace=True)
+
+        return df