commit 8f7399de58d3b197b5fd0c77149459dc6da4cb93 Author: Dierk Date: Wed May 13 11:31:47 2026 +0200 Initial implementation of generic Excel-to-DB import tool Supports .xls and .xlsx, Oracle and PostgreSQL via SQLAlchemy. Includes CLI (run/inspect/generate-config), YAML config, auto schema detection, and append/replace/upsert modes. Co-Authored-By: Claude Sonnet 4.6 diff --git a/.venv/bin/python b/.venv/bin/python new file mode 120000 index 0000000..b8a0adb --- /dev/null +++ b/.venv/bin/python @@ -0,0 +1 @@ +python3 \ No newline at end of file diff --git a/.venv/bin/python3 b/.venv/bin/python3 new file mode 120000 index 0000000..ae65fda --- /dev/null +++ b/.venv/bin/python3 @@ -0,0 +1 @@ +/usr/bin/python3 \ No newline at end of file diff --git a/.venv/bin/python3.12 b/.venv/bin/python3.12 new file mode 120000 index 0000000..b8a0adb --- /dev/null +++ b/.venv/bin/python3.12 @@ -0,0 +1 @@ +python3 \ No newline at end of file diff --git a/.venv/lib64 b/.venv/lib64 new file mode 120000 index 0000000..7951405 --- /dev/null +++ b/.venv/lib64 @@ -0,0 +1 @@ +lib \ No newline at end of file diff --git a/.venv/pyvenv.cfg b/.venv/pyvenv.cfg new file mode 100644 index 0000000..c57c6f3 --- /dev/null +++ b/.venv/pyvenv.cfg @@ -0,0 +1,5 @@ +home = /usr/bin +include-system-site-packages = false +version = 3.12.3 +executable = /usr/bin/python3.12 +command = /usr/bin/python3 -m venv /home/dierk/Programmierung/claude/excel-import/.venv diff --git a/examples/import_config.yaml b/examples/import_config.yaml new file mode 100644 index 0000000..7c61e6b --- /dev/null +++ b/examples/import_config.yaml @@ -0,0 +1,40 @@ +# SQLAlchemy DSN — Beispiele: +# PostgreSQL: postgresql+psycopg2://user:pass@localhost/mydb +# Oracle: oracle+oracledb://user:pass@localhost:1521/?service_name=MYDB +dsn: "postgresql+psycopg2://user:pass@localhost/mydb" + +default_varchar_length: 255 + +sheets: + - sheet: "Artikel" # Sheet-Name oder Index (0, 1, ...) 
+ header_row: 0 # 0-basierter Zeilenindex der Kopfzeile + skip_rows: 0 # Zeilen vor der Kopfzeile überspringen + target_table: "artikel" + mode: "replace" # append | replace | upsert + upsert_keys: [] + columns: + - source: "Artikelnummer" + target: "artikelnummer" + dtype: "VARCHAR(50)" + - source: "Bezeichnung" + target: "bezeichnung" + - source: "Preis" + target: "preis" + dtype: "NUMERIC(12,2)" + - source: "Interne Notiz" + target: "interne_notiz" + skip: true # Spalte nicht importieren + + - sheet: "Kunden" + header_row: 0 + target_table: "kunden" + mode: "upsert" + upsert_keys: ["kundennummer"] + columns: + - source: "Kundennummer" + target: "kundennummer" + dtype: "VARCHAR(20)" + - source: "Name" + target: "name" + - source: "E-Mail" + target: "email" diff --git a/excel_import/__init__.py b/excel_import/__init__.py new file mode 100644 index 0000000..1e35936 --- /dev/null +++ b/excel_import/__init__.py @@ -0,0 +1,4 @@ +from .reader import ExcelReader +from .importer import Importer + +__all__ = ["ExcelReader", "Importer"] diff --git a/excel_import/__pycache__/__init__.cpython-312.pyc b/excel_import/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..6be3ca2 Binary files /dev/null and b/excel_import/__pycache__/__init__.cpython-312.pyc differ diff --git a/excel_import/__pycache__/config.cpython-312.pyc b/excel_import/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000..340f9e0 Binary files /dev/null and b/excel_import/__pycache__/config.cpython-312.pyc differ diff --git a/excel_import/__pycache__/importer.cpython-312.pyc b/excel_import/__pycache__/importer.cpython-312.pyc new file mode 100644 index 0000000..dc164e5 Binary files /dev/null and b/excel_import/__pycache__/importer.cpython-312.pyc differ diff --git a/excel_import/__pycache__/reader.cpython-312.pyc b/excel_import/__pycache__/reader.cpython-312.pyc new file mode 100644 index 0000000..3bdd9ab Binary files /dev/null and 
def _setup_logging(verbose: bool):
    """Configure root logging: DEBUG when *verbose* is true, INFO otherwise."""
    level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(format="%(levelname)s %(message)s", level=level)


@click.group()
def main():
    """Generic Excel-to-database import tool (Oracle & PostgreSQL)."""


@main.command()
@click.argument("excel_file", type=click.Path(exists=True))
@click.argument("config_file", type=click.Path(exists=True))
@click.option("-v", "--verbose", is_flag=True)
def run(excel_file: str, config_file: str, verbose: bool):
    """Import EXCEL_FILE using CONFIG_FILE (YAML)."""
    _setup_logging(verbose)
    # Top-level CLI boundary: configuration problems and import failures are
    # both reported as a one-line error with exit status 1. Loading the config
    # inside the ``try`` keeps a broken YAML file from producing a traceback.
    try:
        cfg = ImportConfig.from_yaml(config_file)
        results = Importer(cfg).run(excel_file)
    except Exception as exc:
        # The full traceback is only shown with --verbose (DEBUG level).
        logging.getLogger(__name__).debug("Import failed", exc_info=True)
        click.echo(f"ERROR: {exc}", err=True)
        sys.exit(1)

    for table, rows in results.items():
        click.echo(f" {table}: {rows} rows imported")


@main.command()
@click.argument("excel_file", type=click.Path(exists=True))
def inspect(excel_file: str):
    """Show sheet names and column preview of EXCEL_FILE."""
    reader = ExcelReader(excel_file)
    names = reader.sheet_names()
    click.echo(f"Sheets in {Path(excel_file).name}:")
    for i, name in enumerate(names):
        click.echo(f" [{i}] {name}")
        # The whole sheet is read (not just a few rows) so that the reported
        # row count is exact; only the first eight column names are printed.
        df = reader.read(SheetConfig(sheet=i))
        click.echo(f" Columns ({len(df.columns)}): {', '.join(str(c) for c in df.columns[:8])}")
        if len(df.columns) > 8:
            click.echo(f" ... and {len(df.columns) - 8} more")
        click.echo(f" Rows: {len(df)}")


@main.command("generate-config")
@click.argument("excel_file", type=click.Path(exists=True))
@click.option("--dsn", default="postgresql+psycopg2://user:pass@localhost/dbname", show_default=True)
@click.option("--output", "-o", default="import_config.yaml", show_default=True)
def generate_config(excel_file: str, dsn: str, output: str):
    """Generate a starter YAML config from EXCEL_FILE's structure."""
    import yaml

    reader = ExcelReader(excel_file)

    sheets = []
    for i, name in enumerate(reader.sheet_names()):
        df = reader.read(SheetConfig(sheet=i))
        # Target identifiers are the lower-cased source names with spaces
        # replaced by underscores.
        columns = [
            {"source": str(c), "target": str(c).lower().replace(" ", "_")}
            for c in df.columns
        ]
        sheets.append({
            "sheet": name,
            "header_row": 0,
            "target_table": name.lower().replace(" ", "_"),
            "mode": "append",
            "columns": columns,
        })

    config = {"dsn": dsn, "sheets": sheets}
    # allow_unicode=True writes non-ASCII sheet/column names verbatim, so the
    # file must be opened as UTF-8 explicitly instead of relying on the
    # platform's locale encoding.
    with open(output, "w", encoding="utf-8") as f:
        yaml.dump(config, f, allow_unicode=True, sort_keys=False)
    click.echo(f"Config written to {output}")
reader.read(SC(sheet=i)) + click.echo(f" Columns ({len(df.columns)}): {', '.join(str(c) for c in df.columns[:8])}") + if len(df.columns) > 8: + click.echo(f" ... and {len(df.columns) - 8} more") + click.echo(f" Rows: {len(df)}") + + +@main.command("generate-config") +@click.argument("excel_file", type=click.Path(exists=True)) +@click.option("--dsn", default="postgresql+psycopg2://user:pass@localhost/dbname", show_default=True) +@click.option("--output", "-o", default="import_config.yaml", show_default=True) +def generate_config(excel_file: str, dsn: str, output: str): + """Generate a starter YAML config from EXCEL_FILE's structure.""" + import yaml + reader = ExcelReader(excel_file) + names = reader.sheet_names() + + sheets = [] + for i, name in enumerate(names): + from .config import SheetConfig as SC + df = reader.read(SC(sheet=i)) + table_name = name.lower().replace(" ", "_") + columns = [{"source": str(c), "target": str(c).lower().replace(" ", "_")} for c in df.columns] + sheets.append({ + "sheet": name, + "header_row": 0, + "target_table": table_name, + "mode": "append", + "columns": columns, + }) + + config = {"dsn": dsn, "sheets": sheets} + with open(output, "w") as f: + yaml.dump(config, f, allow_unicode=True, sort_keys=False) + click.echo(f"Config written to {output}") diff --git a/excel_import/config.py b/excel_import/config.py new file mode 100644 index 0000000..3433cc5 --- /dev/null +++ b/excel_import/config.py @@ -0,0 +1,48 @@ +from __future__ import annotations +from dataclasses import dataclass, field +from pathlib import Path +from typing import Literal +import yaml + + +@dataclass +class ColumnMapping: + source: str + target: str + dtype: str | None = None # override detected type, e.g. 
"VARCHAR(100)", "NUMBER" + skip: bool = False + + +@dataclass +class SheetConfig: + sheet: str | int = 0 # sheet name or index + header_row: int = 0 # 0-based row index of the header + skip_rows: int = 0 # rows to skip before header + target_table: str = "" + columns: list[ColumnMapping] = field(default_factory=list) + mode: Literal["append", "replace", "upsert"] = "append" + upsert_keys: list[str] = field(default_factory=list) # column names for upsert PK + + +@dataclass +class ImportConfig: + dsn: str # SQLAlchemy DSN + sheets: list[SheetConfig] = field(default_factory=list) + default_varchar_length: int = 255 + + @classmethod + def from_yaml(cls, path: str | Path) -> "ImportConfig": + with open(path) as f: + raw = yaml.safe_load(f) + + sheets = [] + for s in raw.get("sheets", []): + columns = [ColumnMapping(**c) for c in s.pop("columns", [])] + upsert_keys = s.pop("upsert_keys", []) + sheets.append(SheetConfig(**s, columns=columns, upsert_keys=upsert_keys)) + + return cls( + dsn=raw["dsn"], + default_varchar_length=raw.get("default_varchar_length", 255), + sheets=sheets, + ) diff --git a/excel_import/importer.py b/excel_import/importer.py new file mode 100644 index 0000000..478e9c8 --- /dev/null +++ b/excel_import/importer.py @@ -0,0 +1,127 @@ +from __future__ import annotations +import logging +from pathlib import Path + +import pandas as pd +from sqlalchemy import create_engine, text, MetaData, Table, inspect +from sqlalchemy.dialects.postgresql import insert as pg_insert + +from .config import ImportConfig, SheetConfig +from .reader import ExcelReader +from .schema import build_columns + +logger = logging.getLogger(__name__) + + +class Importer: + def __init__(self, config: ImportConfig): + self.config = config + self.engine = create_engine(config.dsn) + + def run(self, excel_path: str | Path) -> dict[str, int]: + """Import all configured sheets. 
Returns {table_name: rows_imported}.""" + reader = ExcelReader(excel_path) + results = {} + for sheet_cfg in self.config.sheets: + rows = self._import_sheet(reader, sheet_cfg) + results[sheet_cfg.target_table] = rows + return results + + def _import_sheet(self, reader: ExcelReader, cfg: SheetConfig) -> int: + df = reader.read(cfg) + if df.empty: + logger.warning("Sheet %r is empty, skipping.", cfg.sheet) + return 0 + + logger.info("Read %d rows from sheet %r -> table %r", len(df), cfg.sheet, cfg.target_table) + + with self.engine.begin() as conn: + self._ensure_table(conn, df, cfg) + + if cfg.mode == "replace": + dialect = self.engine.dialect.name + truncate_sql = ( + f"DELETE FROM {cfg.target_table}" + if dialect == "sqlite" + else f"TRUNCATE TABLE {cfg.target_table}" + ) + conn.execute(text(truncate_sql)) + rows = self._bulk_insert(conn, df, cfg.target_table) + elif cfg.mode == "upsert": + rows = self._upsert(conn, df, cfg) + else: # append + rows = self._bulk_insert(conn, df, cfg.target_table) + + logger.info("Imported %d rows into %r (mode=%s)", rows, cfg.target_table, cfg.mode) + return rows + + def _ensure_table(self, conn, df: pd.DataFrame, cfg: SheetConfig): + insp = inspect(conn) + if not insp.has_table(cfg.target_table): + meta = MetaData() + cols = build_columns(df, cfg.columns, self.config.default_varchar_length) + table = Table(cfg.target_table, meta, *cols) + meta.create_all(conn) + logger.info("Created table %r", cfg.target_table) + + def _bulk_insert(self, conn, df: pd.DataFrame, table_name: str) -> int: + records = _df_to_records(df) + if not records: + return 0 + meta = MetaData() + meta.reflect(bind=conn, only=[table_name]) + table = meta.tables[table_name] + conn.execute(table.insert(), records) + return len(records) + + def _upsert(self, conn, df: pd.DataFrame, cfg: SheetConfig) -> int: + dialect = self.engine.dialect.name + records = _df_to_records(df) + if not records: + return 0 + + meta = MetaData() + meta.reflect(bind=conn, 
only=[cfg.target_table]) + table = meta.tables[cfg.target_table] + + if dialect == "postgresql": + stmt = pg_insert(table).values(records) + update_cols = {c.key: stmt.excluded[c.key] for c in table.columns if c.key not in cfg.upsert_keys} + stmt = stmt.on_conflict_do_update(index_elements=cfg.upsert_keys, set_=update_cols) + conn.execute(stmt) + elif dialect == "oracle": + # Oracle MERGE via raw SQL + for record in records: + _oracle_merge(conn, table, record, cfg.upsert_keys) + else: + raise NotImplementedError(f"Upsert not implemented for dialect: {dialect}") + + return len(records) + + +def _df_to_records(df: pd.DataFrame) -> list[dict]: + # Replace pandas NA/NaT with None so SQLAlchemy handles nulls correctly + return [ + {k: (None if pd.isna(v) else v) for k, v in row.items()} + for row in df.to_dict(orient="records") + ] + + +def _oracle_merge(conn, table: Table, record: dict, keys: list[str]): + key_clauses = " AND ".join(f"t.{k} = s.{k}" for k in keys) + all_cols = list(record.keys()) + non_keys = [c for c in all_cols if c not in keys] + + select_parts = ", ".join(f":{c} AS {c}" for c in all_cols) + update_parts = ", ".join(f"t.{c} = s.{c}" for c in non_keys) + insert_cols = ", ".join(all_cols) + insert_vals = ", ".join(f"s.{c}" for c in all_cols) + + sql = f""" + MERGE INTO {table.name} t + USING (SELECT {select_parts} FROM dual) s + ON ({key_clauses}) + WHEN MATCHED THEN UPDATE SET {update_parts} + WHEN NOT MATCHED THEN INSERT ({insert_cols}) VALUES ({insert_vals}) + """ + conn.execute(text(sql), record) diff --git a/excel_import/reader.py b/excel_import/reader.py new file mode 100644 index 0000000..05d7619 --- /dev/null +++ b/excel_import/reader.py @@ -0,0 +1,49 @@ +from __future__ import annotations +from pathlib import Path +import pandas as pd + +from .config import SheetConfig + + +def _engine_for(path: Path) -> str: + return "xlrd" if path.suffix.lower() == ".xls" else "openpyxl" + + +class ExcelReader: + def __init__(self, path: str | Path): + 
self.path = Path(path) + if not self.path.exists(): + raise FileNotFoundError(f"Excel file not found: {self.path}") + if self.path.suffix.lower() not in {".xls", ".xlsx", ".xlsm", ".xlsb"}: + raise ValueError(f"Unsupported file type: {self.path.suffix}") + + def sheet_names(self) -> list[str]: + engine = _engine_for(self.path) + xf = pd.ExcelFile(self.path, engine=engine) + return xf.sheet_names + + def read(self, cfg: SheetConfig) -> pd.DataFrame: + engine = _engine_for(self.path) + df = pd.read_excel( + self.path, + sheet_name=cfg.sheet, + header=cfg.header_row, + skiprows=range(cfg.skip_rows) if cfg.skip_rows else None, + engine=engine, + ) + # drop completely empty rows + df.dropna(how="all", inplace=True) + + # apply column mapping: rename and drop skipped columns + if cfg.columns: + skip_sources = {c.source for c in cfg.columns if c.skip} + df.drop(columns=[c for c in skip_sources if c in df.columns], inplace=True) + + rename_map = { + c.source: c.target + for c in cfg.columns + if not c.skip and c.source != c.target + } + df.rename(columns=rename_map, inplace=True) + + return df diff --git a/excel_import/schema.py b/excel_import/schema.py new file mode 100644 index 0000000..017de04 --- /dev/null +++ b/excel_import/schema.py @@ -0,0 +1,72 @@ +from __future__ import annotations +import pandas as pd +from sqlalchemy import ( + Column, Integer, Float, String, DateTime, Date, Boolean, Numeric, Text +) + +from .config import ColumnMapping + + +def _pandas_dtype_to_sqla(series: pd.Series, varchar_length: int): + dtype = series.dtype + if pd.api.types.is_bool_dtype(dtype): + return Boolean() + if pd.api.types.is_integer_dtype(dtype): + return Integer() + if pd.api.types.is_float_dtype(dtype): + return Float() + if pd.api.types.is_datetime64_any_dtype(dtype): + return DateTime() + # object columns: check if they look like dates + if dtype == object: + sample = series.dropna().head(100) + if len(sample) > 0: + try: + pd.to_datetime(sample) + return DateTime() + except 
# Value range of a 32-bit signed INTEGER column.
_INT32_MIN = -(2 ** 31)
_INT32_MAX = 2 ** 31 - 1


def _pandas_dtype_to_sqla(series: pd.Series, varchar_length: int):
    """Pick a SQLAlchemy column type for *series* based on its pandas dtype.

    Integer columns whose values do not fit into 32 bits get ``BigInteger``.
    Object columns are typed ``DateTime`` only when every sampled value is a
    date/datetime object; otherwise they become ``String`` sized to the
    longest value plus 10, but at least *varchar_length*. Any other dtype
    falls back to ``Text``.
    """
    import datetime as _dt
    from sqlalchemy import BigInteger  # local: not among the module-level imports

    dtype = series.dtype
    if pd.api.types.is_bool_dtype(dtype):
        return Boolean()
    if pd.api.types.is_integer_dtype(dtype):
        values = series.dropna()
        if len(values) and (int(values.min()) < _INT32_MIN or int(values.max()) > _INT32_MAX):
            return BigInteger()
        return Integer()
    if pd.api.types.is_float_dtype(dtype):
        return Float()
    if pd.api.types.is_datetime64_any_dtype(dtype):
        return DateTime()
    if dtype == object:
        non_null = series.dropna()
        sample = non_null.head(100)
        # Only real date objects count. Parsing with pd.to_datetime would also
        # accept plain integers and date-looking text, yielding a DateTime
        # column while the values that get inserted are still ints/strings.
        if len(sample) > 0 and all(isinstance(v, _dt.date) for v in sample):
            return DateTime()
        max_len = int(non_null.astype(str).str.len().max()) if len(non_null) > 0 else 1
        return String(max(max_len + 10, varchar_length))
    return Text()


def _split_dtype(dtype_str: str) -> tuple[str, tuple[int, ...]]:
    """Split a type string into its upper-cased name and integer arguments.

    ``"numeric(12, 2)"`` becomes ``("NUMERIC", (12, 2))`` and ``"text"``
    becomes ``("TEXT", ())``.

    Raises:
        ValueError: if the parenthesised part is not a list of integers.
    """
    name, paren, rest = dtype_str.upper().strip().partition("(")
    name = name.strip()
    if not paren:
        return name, ()
    try:
        args = tuple(int(part) for part in rest.rstrip(") ").split(","))
    except ValueError as err:
        raise ValueError(f"Unknown dtype override: {dtype_str!r}") from err
    return name, args


def _override_to_sqla(dtype_str: str):
    """Convert a user-supplied type string like 'VARCHAR(100)' to a SQLAlchemy type.

    Raises:
        ValueError: if the type name is unknown or its arguments are malformed.
    """
    name, args = _split_dtype(dtype_str)
    if name.startswith("VARCHAR"):
        return String(args[0] if args else 255)
    if name in ("TEXT", "CLOB"):
        return Text()
    # NOTE(review): a bare "NUMBER" is mapped to Integer although Oracle's
    # NUMBER can hold decimals; kept as in the original mapping, confirm intent.
    if name in ("INTEGER", "INT") or (name == "NUMBER" and not args):
        return Integer()
    if name == "BIGINT":
        from sqlalchemy import BigInteger  # local: not among the module-level imports
        return BigInteger()
    if name.startswith(("NUMBER", "NUMERIC", "DECIMAL")):
        if args:
            # A missing scale means 0, e.g. NUMERIC(12).
            return Numeric(precision=args[0], scale=args[1] if len(args) > 1 else 0)
        return Numeric()
    if name in ("FLOAT", "REAL", "DOUBLE", "DOUBLE PRECISION"):
        return Float()
    if name in ("DATETIME", "TIMESTAMP"):
        return DateTime()
    if name == "DATE":
        return Date()
    if name in ("BOOLEAN", "BOOL"):
        return Boolean()
    raise ValueError(f"Unknown dtype override: {dtype_str!r}")


def build_columns(df: pd.DataFrame, column_configs: "list[ColumnMapping]", varchar_length: int) -> "list[Column]":
    """Return one SQLAlchemy ``Column`` per DataFrame column.

    A column uses the ``dtype`` override of the non-skipped mapping whose
    target (or source, when the target is empty) equals the column name;
    otherwise the type is derived from the column's data.
    """
    override_map = {c.target or c.source: c.dtype for c in column_configs if c.dtype and not c.skip}

    columns = []
    for col in df.columns:
        col_name = str(col)
        override = override_map.get(col_name)
        if override:
            sqla_type = _override_to_sqla(override)
        else:
            sqla_type = _pandas_dtype_to_sqla(df[col], varchar_length)
        columns.append(Column(col_name, sqla_type))
    return columns
"openpyxl>=3.1", + "xlrd>=2.0", + "sqlalchemy>=2.0", + "psycopg2-binary>=2.9", + "oracledb>=2.0", + "pyyaml>=6.0", + "click>=8.1", +] + +[project.scripts] +excel-import = "excel_import.cli:main" + +[project.optional-dependencies] +dev = ["pytest>=8.0", "pytest-mock>=3.0"] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/__pycache__/__init__.cpython-312.pyc b/tests/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..b5ddee5 Binary files /dev/null and b/tests/__pycache__/__init__.cpython-312.pyc differ diff --git a/tests/__pycache__/test_config.cpython-312-pytest-9.0.3.pyc b/tests/__pycache__/test_config.cpython-312-pytest-9.0.3.pyc new file mode 100644 index 0000000..bfb4583 Binary files /dev/null and b/tests/__pycache__/test_config.cpython-312-pytest-9.0.3.pyc differ diff --git a/tests/__pycache__/test_importer.cpython-312-pytest-9.0.3.pyc b/tests/__pycache__/test_importer.cpython-312-pytest-9.0.3.pyc new file mode 100644 index 0000000..0c973a2 Binary files /dev/null and b/tests/__pycache__/test_importer.cpython-312-pytest-9.0.3.pyc differ diff --git a/tests/__pycache__/test_reader.cpython-312-pytest-9.0.3.pyc b/tests/__pycache__/test_reader.cpython-312-pytest-9.0.3.pyc new file mode 100644 index 0000000..58c483d Binary files /dev/null and b/tests/__pycache__/test_reader.cpython-312-pytest-9.0.3.pyc differ diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..7556060 --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,39 @@ +from pathlib import Path +import pytest +import yaml + +from excel_import.config import ImportConfig + + +@pytest.fixture +def config_file(tmp_path: Path) -> Path: + cfg = { + "dsn": "postgresql+psycopg2://u:p@localhost/db", + "sheets": [ + { + "sheet": "Artikel", + "header_row": 0, + "target_table": "artikel", + "mode": "replace", + "columns": [ + {"source": "Artikelnummer", "target": "art_nr", "dtype": "VARCHAR(50)"}, + 
@pytest.fixture
def config_file(tmp_path: Path) -> Path:
    """Write a one-sheet import config to a temporary YAML file; return its path."""
    column_entries = [
        {"source": "Artikelnummer", "target": "art_nr", "dtype": "VARCHAR(50)"},
        {"source": "Preis", "target": "preis"},
    ]
    sheet_entry = {
        "sheet": "Artikel",
        "header_row": 0,
        "target_table": "artikel",
        "mode": "replace",
        "columns": column_entries,
    }
    document = {
        "dsn": "postgresql+psycopg2://u:p@localhost/db",
        "sheets": [sheet_entry],
    }
    target = tmp_path / "config.yaml"
    target.write_text(yaml.dump(document))
    return target


def test_load_from_yaml(config_file: Path):
    """The loaded config mirrors the DSN, sheet settings and column overrides."""
    loaded = ImportConfig.from_yaml(config_file)
    assert loaded.dsn == "postgresql+psycopg2://u:p@localhost/db"
    assert len(loaded.sheets) == 1

    first = loaded.sheets[0]
    assert first.sheet == "Artikel"
    assert first.target_table == "artikel"
    assert first.mode == "replace"
    assert len(first.columns) == 2
    assert first.columns[0].dtype == "VARCHAR(50)"
@pytest.fixture
def xlsx_file(tmp_path: Path) -> Path:
    """Create a workbook with three rows of id/name/amount data."""
    target = tmp_path / "data.xlsx"
    frame = pd.DataFrame({
        "id": [1, 2, 3],
        "name": ["Alice", "Bob", "Carol"],
        "amount": [100.0, 200.5, 300.0],
    })
    frame.to_excel(target, index=False)
    return target


@pytest.fixture
def sqlite_config(xlsx_file):
    """Append-mode config importing the first sheet into table ``persons``."""
    persons_sheet = SheetConfig(
        sheet=0,
        target_table="persons",
        mode="append",
    )
    return ImportConfig(dsn="sqlite:///:memory:", sheets=[persons_sheet])


def _count_persons(importer) -> int:
    """Return the number of rows currently stored in ``persons``."""
    with importer.engine.connect() as conn:
        return conn.execute(text("SELECT COUNT(*) FROM persons")).scalar()


def test_import_append(xlsx_file, sqlite_config):
    """A single append run reports and stores all three rows."""
    importer = Importer(sqlite_config)
    outcome = importer.run(xlsx_file)
    assert outcome["persons"] == 3
    assert _count_persons(importer) == 3


def test_import_replace(xlsx_file, tmp_path):
    """Running a replace import twice leaves only one copy of the rows."""
    config = ImportConfig(
        dsn="sqlite:///:memory:",
        sheets=[SheetConfig(sheet=0, target_table="persons", mode="replace")],
    )
    importer = Importer(config)
    importer.run(xlsx_file)
    # second run should truncate+insert
    outcome = importer.run(xlsx_file)
    assert outcome["persons"] == 3
    assert _count_persons(importer) == 3


def test_import_creates_table(xlsx_file, sqlite_config):
    """The target table is created when it does not exist yet."""
    importer = Importer(sqlite_config)
    importer.run(xlsx_file)

    from sqlalchemy import inspect
    table_names = inspect(importer.engine).get_table_names()
    assert "persons" in table_names


def test_import_empty_sheet(tmp_path):
    """A sheet without data rows is reported with zero imported rows."""
    workbook = tmp_path / "empty.xlsx"
    pd.DataFrame({"a": [], "b": []}).to_excel(workbook, index=False)

    config = ImportConfig(
        dsn="sqlite:///:memory:",
        sheets=[SheetConfig(sheet=0, target_table="empty_table", mode="append")],
    )
    outcome = Importer(config).run(workbook)
    assert outcome["empty_table"] == 0
@pytest.fixture
def xlsx_file(tmp_path: Path) -> Path:
    """Create a three-row article workbook with one missing description."""
    target = tmp_path / "test.xlsx"
    articles = pd.DataFrame({
        "Artikelnummer": ["A001", "A002", "A003"],
        "Bezeichnung": ["Widget", "Gadget", None],
        "Preis": [9.99, 14.50, 0.99],
    })
    articles.to_excel(target, index=False)
    return target


def test_sheet_names(xlsx_file: Path):
    """A workbook written by pandas has the single default sheet."""
    assert ExcelReader(xlsx_file).sheet_names() == ["Sheet1"]


def test_read_basic(xlsx_file: Path):
    """Reading without a column mapping keeps all rows and original headers."""
    frame = ExcelReader(xlsx_file).read(SheetConfig(sheet=0, target_table="t"))
    assert len(frame) == 3
    assert list(frame.columns) == ["Artikelnummer", "Bezeichnung", "Preis"]


def test_read_drops_empty_rows(tmp_path: Path):
    """Rows in which every cell is empty are removed."""
    workbook = tmp_path / "empty_rows.xlsx"
    source = pd.DataFrame({"A": ["x", None, "y"], "B": [1, None, 3]})
    source.to_excel(workbook, index=False)

    frame = ExcelReader(workbook).read(SheetConfig(sheet=0, target_table="t"))
    assert len(frame) == 2


def test_read_column_rename(xlsx_file: Path):
    """Mapped columns appear under their target names only."""
    from excel_import.config import ColumnMapping
    mappings = [
        ColumnMapping(source="Artikelnummer", target="art_nr"),
        ColumnMapping(source="Bezeichnung", target="bez"),
        ColumnMapping(source="Preis", target="preis"),
    ]
    sheet_cfg = SheetConfig(sheet=0, target_table="t", columns=mappings)
    frame = ExcelReader(xlsx_file).read(sheet_cfg)
    assert "art_nr" in frame.columns
    assert "Artikelnummer" not in frame.columns


def test_read_column_skip(xlsx_file: Path):
    """A mapping flagged ``skip`` removes the column from the result."""
    from excel_import.config import ColumnMapping
    sheet_cfg = SheetConfig(
        sheet=0,
        target_table="t",
        columns=[ColumnMapping(source="Preis", target="Preis", skip=True)],
    )
    frame = ExcelReader(xlsx_file).read(sheet_cfg)
    assert "Preis" not in frame.columns


def test_file_not_found():
    """A missing workbook path is rejected when the reader is constructed."""
    with pytest.raises(FileNotFoundError):
        ExcelReader("/nonexistent/path/file.xlsx")


def test_unsupported_extension(tmp_path: Path):
    """Files with a non-Excel suffix are rejected."""
    csv_path = tmp_path / "data.csv"
    csv_path.write_text("a,b\n1,2")
    with pytest.raises(ValueError, match="Unsupported"):
        ExcelReader(csv_path)