Initial implementation of generic Excel-to-DB import tool

Supports .xls and .xlsx, Oracle and PostgreSQL via SQLAlchemy. Includes CLI (run/inspect/generate-config), YAML config, auto schema detection, and append/replace/upsert modes. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-13 11:31:47 +02:00
commit 8f7399de58
26 changed files with 663 additions and 0 deletions
@@ -0,0 +1 @@
+python3
@@ -0,0 +1 @@
+/usr/bin/python3
@@ -0,0 +1 @@
+python3
@@ -0,0 +1 @@
+lib
@@ -0,0 +1,5 @@
+home = /usr/bin
+include-system-site-packages = false
+version = 3.12.3
+executable = /usr/bin/python3.12
+command = /usr/bin/python3 -m venv /home/dierk/Programmierung/claude/excel-import/.venv
@@ -0,0 +1,40 @@
+# SQLAlchemy DSN — examples:
+#   PostgreSQL:  postgresql+psycopg2://user:pass@localhost/mydb
+#   Oracle:      oracle+oracledb://user:pass@localhost:1521/?service_name=MYDB
+dsn: "postgresql+psycopg2://user:pass@localhost/mydb"
+
+# Minimum VARCHAR length used for auto-detected text columns.
+default_varchar_length: 255
+
+sheets:
+  - sheet: "Artikel"           # sheet name or index (0, 1, ...)
+    header_row: 0              # 0-based row index of the header row
+    skip_rows: 0               # rows to skip before the header row
+    target_table: "artikel"
+    mode: "replace"            # append | replace | upsert
+    upsert_keys: []            # only used when mode is "upsert"
+    columns:
+      - source: "Artikelnummer"
+        target: "artikelnummer"
+        dtype: "VARCHAR(50)"
+      - source: "Bezeichnung"
+        target: "bezeichnung"
+      - source: "Preis"
+        target: "preis"
+        dtype: "NUMERIC(12,2)"
+      - source: "Interne Notiz"
+        target: "interne_notiz"
+        skip: true             # do not import this column
+
+  - sheet: "Kunden"
+    header_row: 0
+    target_table: "kunden"
+    mode: "upsert"
+    # Target column names that identify an existing row for the upsert.
+    upsert_keys: ["kundennummer"]
+    columns:
+      - source: "Kundennummer"
+        target: "kundennummer"
+        dtype: "VARCHAR(20)"
+      - source: "Name"
+        target: "name"
+      - source: "E-Mail"
+        target: "email"
@@ -0,0 +1,4 @@
+from .reader import ExcelReader
+from .importer import Importer
+
+__all__ = ["ExcelReader", "Importer"]
@@ -0,0 +1,87 @@
+from __future__ import annotations
+import logging
+import sys
+from pathlib import Path
+
+import click
+
+from .config import ImportConfig, SheetConfig
+from .importer import Importer
+from .reader import ExcelReader
+
+
+def _setup_logging(verbose: bool):
+    level = logging.DEBUG if verbose else logging.INFO
+    logging.basicConfig(format="%(levelname)s  %(message)s", level=level)
+
+
+# Root click command group; the subcommands in this module attach to it via
+# @main.command(). The docstring below is the user-facing help text.
+@click.group()
+def main():
+    """Generic Excel-to-database import tool (Oracle & PostgreSQL)."""
+
+
+@main.command()
+@click.argument("excel_file", type=click.Path(exists=True))
+@click.argument("config_file", type=click.Path(exists=True))
+@click.option("-v", "--verbose", is_flag=True)
+def run(excel_file: str, config_file: str, verbose: bool):
+    """Import EXCEL_FILE using CONFIG_FILE (YAML)."""
+    _setup_logging(verbose)
+    cfg = ImportConfig.from_yaml(config_file)
+    importer = Importer(cfg)
+    try:
+        results = importer.run(excel_file)
+    except Exception as exc:
+        click.echo(f"ERROR: {exc}", err=True)
+        sys.exit(1)
+
+    for table, rows in results.items():
+        click.echo(f"  {table}: {rows} rows imported")
+
+
+@main.command()
+@click.argument("excel_file", type=click.Path(exists=True))
+def inspect(excel_file: str):
+    """Show sheet names and column preview of EXCEL_FILE."""
+    reader = ExcelReader(excel_file)
+    names = reader.sheet_names()
+    click.echo(f"Sheets in {Path(excel_file).name}:")
+    for i, name in enumerate(names):
+        click.echo(f"  [{i}] {name}")
+        # read first few rows for preview
+        from .config import SheetConfig as SC
+        df = reader.read(SC(sheet=i))
+        click.echo(f"      Columns ({len(df.columns)}): {', '.join(str(c) for c in df.columns[:8])}")
+        if len(df.columns) > 8:
+            click.echo(f"      ... and {len(df.columns) - 8} more")
+        click.echo(f"      Rows: {len(df)}")
+
+
+@main.command("generate-config")
+@click.argument("excel_file", type=click.Path(exists=True))
+@click.option("--dsn", default="postgresql+psycopg2://user:pass@localhost/dbname", show_default=True)
+@click.option("--output", "-o", default="import_config.yaml", show_default=True)
+def generate_config(excel_file: str, dsn: str, output: str):
+    """Generate a starter YAML config from EXCEL_FILE's structure."""
+    import yaml
+    reader = ExcelReader(excel_file)
+    names = reader.sheet_names()
+
+    sheets = []
+    for i, name in enumerate(names):
+        from .config import SheetConfig as SC
+        df = reader.read(SC(sheet=i))
+        table_name = name.lower().replace(" ", "_")
+        columns = [{"source": str(c), "target": str(c).lower().replace(" ", "_")} for c in df.columns]
+        sheets.append({
+            "sheet": name,
+            "header_row": 0,
+            "target_table": table_name,
+            "mode": "append",
+            "columns": columns,
+        })
+
+    config = {"dsn": dsn, "sheets": sheets}
+    with open(output, "w") as f:
+        yaml.dump(config, f, allow_unicode=True, sort_keys=False)
+    click.echo(f"Config written to {output}")
@@ -0,0 +1,48 @@
+from __future__ import annotations
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Literal
+import yaml
+
+
+@dataclass
+class ColumnMapping:
+    """Mapping of one Excel column to one database column."""
+
+    source: str  # column header as it appears in the Excel sheet
+    target: str  # column name the source column is renamed to
+    dtype: str | None = None  # override detected type, e.g. "VARCHAR(100)", "NUMBER"
+    skip: bool = False  # when True the source column is dropped instead of imported
+
+
+@dataclass
+class SheetConfig:
+    """Settings for reading one sheet and writing it to one target table."""
+
+    sheet: str | int = 0          # sheet name or index
+    header_row: int = 0           # 0-based row index of the header
+    skip_rows: int = 0            # rows to skip before header
+    target_table: str = ""        # name of the database table to write to
+    # Optional per-column rename / type override / skip rules.
+    columns: list[ColumnMapping] = field(default_factory=list)
+    mode: Literal["append", "replace", "upsert"] = "append"
+    upsert_keys: list[str] = field(default_factory=list)  # column names for upsert PK
+
+
+@dataclass
+class ImportConfig:
+    dsn: str                        # SQLAlchemy DSN
+    sheets: list[SheetConfig] = field(default_factory=list)
+    default_varchar_length: int = 255
+
+    @classmethod
+    def from_yaml(cls, path: str | Path) -> "ImportConfig":
+        with open(path) as f:
+            raw = yaml.safe_load(f)
+
+        sheets = []
+        for s in raw.get("sheets", []):
+            columns = [ColumnMapping(**c) for c in s.pop("columns", [])]
+            upsert_keys = s.pop("upsert_keys", [])
+            sheets.append(SheetConfig(**s, columns=columns, upsert_keys=upsert_keys))
+
+        return cls(
+            dsn=raw["dsn"],
+            default_varchar_length=raw.get("default_varchar_length", 255),
+            sheets=sheets,
+        )
@@ -0,0 +1,127 @@
+from __future__ import annotations
+import logging
+from pathlib import Path
+
+import pandas as pd
+from sqlalchemy import create_engine, text, MetaData, Table, inspect
+from sqlalchemy.dialects.postgresql import insert as pg_insert
+
+from .config import ImportConfig, SheetConfig
+from .reader import ExcelReader
+from .schema import build_columns
+
+logger = logging.getLogger(__name__)
+
+
+class Importer:
+    def __init__(self, config: ImportConfig):
+        self.config = config
+        self.engine = create_engine(config.dsn)
+
+    def run(self, excel_path: str | Path) -> dict[str, int]:
+        """Import all configured sheets. Returns {table_name: rows_imported}."""
+        reader = ExcelReader(excel_path)
+        results = {}
+        for sheet_cfg in self.config.sheets:
+            rows = self._import_sheet(reader, sheet_cfg)
+            results[sheet_cfg.target_table] = rows
+        return results
+
+    def _import_sheet(self, reader: ExcelReader, cfg: SheetConfig) -> int:
+        df = reader.read(cfg)
+        if df.empty:
+            logger.warning("Sheet %r is empty, skipping.", cfg.sheet)
+            return 0
+
+        logger.info("Read %d rows from sheet %r -> table %r", len(df), cfg.sheet, cfg.target_table)
+
+        with self.engine.begin() as conn:
+            self._ensure_table(conn, df, cfg)
+
+            if cfg.mode == "replace":
+                dialect = self.engine.dialect.name
+                truncate_sql = (
+                    f"DELETE FROM {cfg.target_table}"
+                    if dialect == "sqlite"
+                    else f"TRUNCATE TABLE {cfg.target_table}"
+                )
+                conn.execute(text(truncate_sql))
+                rows = self._bulk_insert(conn, df, cfg.target_table)
+            elif cfg.mode == "upsert":
+                rows = self._upsert(conn, df, cfg)
+            else:  # append
+                rows = self._bulk_insert(conn, df, cfg.target_table)
+
+        logger.info("Imported %d rows into %r (mode=%s)", rows, cfg.target_table, cfg.mode)
+        return rows
+
+    def _ensure_table(self, conn, df: pd.DataFrame, cfg: SheetConfig):
+        insp = inspect(conn)
+        if not insp.has_table(cfg.target_table):
+            meta = MetaData()
+            cols = build_columns(df, cfg.columns, self.config.default_varchar_length)
+            table = Table(cfg.target_table, meta, *cols)
+            meta.create_all(conn)
+            logger.info("Created table %r", cfg.target_table)
+
+    def _bulk_insert(self, conn, df: pd.DataFrame, table_name: str) -> int:
+        records = _df_to_records(df)
+        if not records:
+            return 0
+        meta = MetaData()
+        meta.reflect(bind=conn, only=[table_name])
+        table = meta.tables[table_name]
+        conn.execute(table.insert(), records)
+        return len(records)
+
+    def _upsert(self, conn, df: pd.DataFrame, cfg: SheetConfig) -> int:
+        dialect = self.engine.dialect.name
+        records = _df_to_records(df)
+        if not records:
+            return 0
+
+        meta = MetaData()
+        meta.reflect(bind=conn, only=[cfg.target_table])
+        table = meta.tables[cfg.target_table]
+
+        if dialect == "postgresql":
+            stmt = pg_insert(table).values(records)
+            update_cols = {c.key: stmt.excluded[c.key] for c in table.columns if c.key not in cfg.upsert_keys}
+            stmt = stmt.on_conflict_do_update(index_elements=cfg.upsert_keys, set_=update_cols)
+            conn.execute(stmt)
+        elif dialect == "oracle":
+            # Oracle MERGE via raw SQL
+            for record in records:
+                _oracle_merge(conn, table, record, cfg.upsert_keys)
+        else:
+            raise NotImplementedError(f"Upsert not implemented for dialect: {dialect}")
+
+        return len(records)
+
+
+def _df_to_records(df: pd.DataFrame) -> list[dict]:
+    # Replace pandas NA/NaT with None so SQLAlchemy handles nulls correctly
+    return [
+        {k: (None if pd.isna(v) else v) for k, v in row.items()}
+        for row in df.to_dict(orient="records")
+    ]
+
+
+def _oracle_merge(conn, table: Table, record: dict, keys: list[str]):
+    key_clauses = " AND ".join(f"t.{k} = s.{k}" for k in keys)
+    all_cols = list(record.keys())
+    non_keys = [c for c in all_cols if c not in keys]
+
+    select_parts = ", ".join(f":{c} AS {c}" for c in all_cols)
+    update_parts = ", ".join(f"t.{c} = s.{c}" for c in non_keys)
+    insert_cols = ", ".join(all_cols)
+    insert_vals = ", ".join(f"s.{c}" for c in all_cols)
+
+    sql = f"""
+        MERGE INTO {table.name} t
+        USING (SELECT {select_parts} FROM dual) s
+        ON ({key_clauses})
+        WHEN MATCHED THEN UPDATE SET {update_parts}
+        WHEN NOT MATCHED THEN INSERT ({insert_cols}) VALUES ({insert_vals})
+    """
+    conn.execute(text(sql), record)
@@ -0,0 +1,49 @@
+from __future__ import annotations
+from pathlib import Path
+import pandas as pd
+
+from .config import SheetConfig
+
+
+def _engine_for(path: Path) -> str:
+    return "xlrd" if path.suffix.lower() == ".xls" else "openpyxl"
+
+
+class ExcelReader:
+    def __init__(self, path: str | Path):
+        self.path = Path(path)
+        if not self.path.exists():
+            raise FileNotFoundError(f"Excel file not found: {self.path}")
+        if self.path.suffix.lower() not in {".xls", ".xlsx", ".xlsm", ".xlsb"}:
+            raise ValueError(f"Unsupported file type: {self.path.suffix}")
+
+    def sheet_names(self) -> list[str]:
+        engine = _engine_for(self.path)
+        xf = pd.ExcelFile(self.path, engine=engine)
+        return xf.sheet_names
+
+    def read(self, cfg: SheetConfig) -> pd.DataFrame:
+        engine = _engine_for(self.path)
+        df = pd.read_excel(
+            self.path,
+            sheet_name=cfg.sheet,
+            header=cfg.header_row,
+            skiprows=range(cfg.skip_rows) if cfg.skip_rows else None,
+            engine=engine,
+        )
+        # drop completely empty rows
+        df.dropna(how="all", inplace=True)
+
+        # apply column mapping: rename and drop skipped columns
+        if cfg.columns:
+            skip_sources = {c.source for c in cfg.columns if c.skip}
+            df.drop(columns=[c for c in skip_sources if c in df.columns], inplace=True)
+
+            rename_map = {
+                c.source: c.target
+                for c in cfg.columns
+                if not c.skip and c.source != c.target
+            }
+            df.rename(columns=rename_map, inplace=True)
+
+        return df
@@ -0,0 +1,72 @@
+from __future__ import annotations
+import pandas as pd
+from sqlalchemy import (
+    Column, Integer, Float, String, DateTime, Date, Boolean, Numeric, Text
+)
+
+from .config import ColumnMapping
+
+
+def _pandas_dtype_to_sqla(series: pd.Series, varchar_length: int):
+    dtype = series.dtype
+    if pd.api.types.is_bool_dtype(dtype):
+        return Boolean()
+    if pd.api.types.is_integer_dtype(dtype):
+        return Integer()
+    if pd.api.types.is_float_dtype(dtype):
+        return Float()
+    if pd.api.types.is_datetime64_any_dtype(dtype):
+        return DateTime()
+    # object columns: check if they look like dates
+    if dtype == object:
+        sample = series.dropna().head(100)
+        if len(sample) > 0:
+            try:
+                pd.to_datetime(sample)
+                return DateTime()
+            except Exception:
+                pass
+        max_len = int(series.dropna().astype(str).str.len().max()) if len(series.dropna()) > 0 else 1
+        return String(max(max_len + 10, varchar_length))
+    return Text()
+
+
+def _override_to_sqla(dtype_str: str):
+    """Convert a user-supplied type string like 'VARCHAR(100)' to a SQLAlchemy type."""
+    s = dtype_str.upper().strip()
+    if s.startswith("VARCHAR"):
+        length = int(s.split("(")[1].rstrip(")")) if "(" in s else 255
+        return String(length)
+    if s in ("TEXT", "CLOB"):
+        return Text()
+    if s in ("INTEGER", "INT", "NUMBER"):
+        return Integer()
+    if s.startswith("NUMBER") or s.startswith("NUMERIC") or s.startswith("DECIMAL"):
+        if "(" in s:
+            parts = s.split("(")[1].rstrip(")").split(",")
+            p, sc = int(parts[0]), int(parts[1]) if len(parts) > 1 else 0
+            return Numeric(precision=p, scale=sc)
+        return Numeric()
+    if s in ("FLOAT", "REAL", "DOUBLE"):
+        return Float()
+    if s in ("DATETIME", "TIMESTAMP"):
+        return DateTime()
+    if s == "DATE":
+        return Date()
+    if s in ("BOOLEAN", "BOOL"):
+        return Boolean()
+    raise ValueError(f"Unknown dtype override: {dtype_str!r}")
+
+
+def build_columns(df: pd.DataFrame, column_configs: list[ColumnMapping], varchar_length: int) -> list[Column]:
+    override_map = {c.target or c.source: c.dtype for c in column_configs if c.dtype and not c.skip}
+
+    columns = []
+    for col in df.columns:
+        col_name = str(col)
+        if col_name in override_map and override_map[col_name]:
+            sqla_type = _override_to_sqla(override_map[col_name])
+        else:
+            sqla_type = _pandas_dtype_to_sqla(df[col], varchar_length)
+        columns.append(Column(col_name, sqla_type))
+    return columns
@@ -0,0 +1,24 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+# PEP 517 backend shipped with setuptools.
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "excel-import"
+version = "0.1.0"
+requires-python = ">=3.10"
+dependencies = [
+    "pandas>=2.0",
+    "openpyxl>=3.1",
+    "xlrd>=2.0",
+    "sqlalchemy>=2.0",
+    "psycopg2-binary>=2.9",
+    "oracledb>=2.0",
+    "pyyaml>=6.0",
+    "click>=8.1",
+]
+
+# Console entry point: the "excel-import" command calls main() in excel_import/cli.py.
+[project.scripts]
+excel-import = "excel_import.cli:main"
+
+[project.optional-dependencies]
+dev = ["pytest>=8.0", "pytest-mock>=3.0"]
@@ -0,0 +1,39 @@
+from pathlib import Path
+import pytest
+import yaml
+
+from excel_import.config import ImportConfig
+
+
+@pytest.fixture
+def config_file(tmp_path: Path) -> Path:
+    cfg = {
+        "dsn": "postgresql+psycopg2://u:p@localhost/db",
+        "sheets": [
+            {
+                "sheet": "Artikel",
+                "header_row": 0,
+                "target_table": "artikel",
+                "mode": "replace",
+                "columns": [
+                    {"source": "Artikelnummer", "target": "art_nr", "dtype": "VARCHAR(50)"},
+                    {"source": "Preis", "target": "preis"},
+                ],
+            }
+        ],
+    }
+    path = tmp_path / "config.yaml"
+    path.write_text(yaml.dump(cfg))
+    return path
+
+
+def test_load_from_yaml(config_file: Path):
+    cfg = ImportConfig.from_yaml(config_file)
+    assert cfg.dsn == "postgresql+psycopg2://u:p@localhost/db"
+    assert len(cfg.sheets) == 1
+    sheet = cfg.sheets[0]
+    assert sheet.sheet == "Artikel"
+    assert sheet.target_table == "artikel"
+    assert sheet.mode == "replace"
+    assert len(sheet.columns) == 2
+    assert sheet.columns[0].dtype == "VARCHAR(50)"
@@ -0,0 +1,80 @@
+from pathlib import Path
+import pandas as pd
+import pytest
+from sqlalchemy import create_engine, text
+
+from excel_import.config import ImportConfig, SheetConfig, ColumnMapping
+from excel_import.importer import Importer
+
+
+@pytest.fixture
+def xlsx_file(tmp_path: Path) -> Path:
+    path = tmp_path / "data.xlsx"
+    df = pd.DataFrame({
+        "id": [1, 2, 3],
+        "name": ["Alice", "Bob", "Carol"],
+        "amount": [100.0, 200.5, 300.0],
+    })
+    df.to_excel(path, index=False)
+    return path
+
+
+@pytest.fixture
+def sqlite_config(xlsx_file):
+    return ImportConfig(
+        dsn="sqlite:///:memory:",
+        sheets=[
+            SheetConfig(
+                sheet=0,
+                target_table="persons",
+                mode="append",
+            )
+        ],
+    )
+
+
+def test_import_append(xlsx_file, sqlite_config):
+    importer = Importer(sqlite_config)
+    results = importer.run(xlsx_file)
+    assert results["persons"] == 3
+
+    with importer.engine.connect() as conn:
+        rows = conn.execute(text("SELECT COUNT(*) FROM persons")).scalar()
+    assert rows == 3
+
+
+def test_import_replace(xlsx_file, tmp_path):
+    cfg = ImportConfig(
+        dsn="sqlite:///:memory:",
+        sheets=[SheetConfig(sheet=0, target_table="persons", mode="replace")],
+    )
+    importer = Importer(cfg)
+    importer.run(xlsx_file)
+    results = importer.run(xlsx_file)  # second run should truncate+insert
+    assert results["persons"] == 3
+
+    with importer.engine.connect() as conn:
+        rows = conn.execute(text("SELECT COUNT(*) FROM persons")).scalar()
+    assert rows == 3
+
+
+def test_import_creates_table(xlsx_file, sqlite_config):
+    importer = Importer(sqlite_config)
+    importer.run(xlsx_file)
+
+    from sqlalchemy import inspect
+    insp = inspect(importer.engine)
+    assert "persons" in insp.get_table_names()
+
+
+def test_import_empty_sheet(tmp_path):
+    path = tmp_path / "empty.xlsx"
+    pd.DataFrame({"a": [], "b": []}).to_excel(path, index=False)
+
+    cfg = ImportConfig(
+        dsn="sqlite:///:memory:",
+        sheets=[SheetConfig(sheet=0, target_table="empty_table", mode="append")],
+    )
+    importer = Importer(cfg)
+    results = importer.run(path)
+    assert results["empty_table"] == 0
@@ -0,0 +1,84 @@
+import io
+from pathlib import Path
+import pandas as pd
+import pytest
+
+from excel_import.reader import ExcelReader
+from excel_import.config import SheetConfig
+
+
+@pytest.fixture
+def xlsx_file(tmp_path: Path) -> Path:
+    path = tmp_path / "test.xlsx"
+    df = pd.DataFrame({
+        "Artikelnummer": ["A001", "A002", "A003"],
+        "Bezeichnung": ["Widget", "Gadget", None],
+        "Preis": [9.99, 14.50, 0.99],
+    })
+    df.to_excel(path, index=False)
+    return path
+
+
+def test_sheet_names(xlsx_file: Path):
+    reader = ExcelReader(xlsx_file)
+    assert reader.sheet_names() == ["Sheet1"]
+
+
+def test_read_basic(xlsx_file: Path):
+    reader = ExcelReader(xlsx_file)
+    df = reader.read(SheetConfig(sheet=0, target_table="t"))
+    assert len(df) == 3
+    assert list(df.columns) == ["Artikelnummer", "Bezeichnung", "Preis"]
+
+
+def test_read_drops_empty_rows(tmp_path: Path):
+    path = tmp_path / "empty_rows.xlsx"
+    df = pd.DataFrame({"A": ["x", None, "y"], "B": [1, None, 3]})
+    df.to_excel(path, index=False)
+
+    reader = ExcelReader(path)
+    result = reader.read(SheetConfig(sheet=0, target_table="t"))
+    assert len(result) == 2
+
+
+def test_read_column_rename(xlsx_file: Path):
+    from excel_import.config import ColumnMapping
+    cfg = SheetConfig(
+        sheet=0,
+        target_table="t",
+        columns=[
+            ColumnMapping(source="Artikelnummer", target="art_nr"),
+            ColumnMapping(source="Bezeichnung", target="bez"),
+            ColumnMapping(source="Preis", target="preis"),
+        ],
+    )
+    reader = ExcelReader(xlsx_file)
+    df = reader.read(cfg)
+    assert "art_nr" in df.columns
+    assert "Artikelnummer" not in df.columns
+
+
+def test_read_column_skip(xlsx_file: Path):
+    from excel_import.config import ColumnMapping
+    cfg = SheetConfig(
+        sheet=0,
+        target_table="t",
+        columns=[
+            ColumnMapping(source="Preis", target="Preis", skip=True),
+        ],
+    )
+    reader = ExcelReader(xlsx_file)
+    df = reader.read(cfg)
+    assert "Preis" not in df.columns
+
+
+def test_file_not_found():
+    with pytest.raises(FileNotFoundError):
+        ExcelReader("/nonexistent/path/file.xlsx")
+
+
+def test_unsupported_extension(tmp_path: Path):
+    f = tmp_path / "data.csv"
+    f.write_text("a,b\n1,2")
+    with pytest.raises(ValueError, match="Unsupported"):
+        ExcelReader(f)