# excel-import/excel_import/reader.py

from __future__ import annotations
from pathlib import Path
import pandas as pd

from .config import SheetConfig


# Maps a lower-cased file suffix to the pandas reader engine used for it.
# The keys double as the set of supported file types.
_ENGINES = {
    ".xls": "xlrd",
    ".xlsx": "openpyxl",
    ".xlsm": "openpyxl",
    # Binary workbooks are not readable by openpyxl; pandas reads them
    # through the "pyxlsb" engine.
    ".xlsb": "pyxlsb",
    ".ods": "odf",
}


def _engine_for(path: Path) -> str:
    """Look up the pandas reader engine registered for *path*'s suffix.

    The suffix is lower-cased before the lookup. A suffix that has no entry
    in ``_ENGINES`` raises ``KeyError``.
    """
    suffix = path.suffix.lower()
    return _ENGINES[suffix]


class ExcelReader:
    """Read sheets of a spreadsheet file into pandas DataFrames.

    The pandas engine is selected from the file suffix via ``_engine_for``.
    """

    def __init__(self, path: str | Path):
        """Validate *path* and store it as ``self.path``.

        Raises:
            FileNotFoundError: if *path* does not exist.
            ValueError: if the file suffix has no entry in ``_ENGINES``.
        """
        self.path = Path(path)
        if not self.path.exists():
            raise FileNotFoundError(f"Excel file not found: {self.path}")
        if self.path.suffix.lower() not in _ENGINES:
            raise ValueError(f"Unsupported file type: {self.path.suffix}")

    def sheet_names(self) -> list[str]:
        """Return the names of the sheets in the workbook."""
        engine = _engine_for(self.path)
        # ExcelFile keeps the underlying file open; the context manager
        # guarantees it is closed once the names have been read.
        with pd.ExcelFile(self.path, engine=engine) as workbook:
            return list(workbook.sheet_names)

    def read(self, cfg: SheetConfig) -> pd.DataFrame:
        """Load the sheet described by *cfg* and apply its column mapping.

        ``cfg.sheet``, ``cfg.header_row`` and ``cfg.skip_rows`` are forwarded
        to ``pandas.read_excel`` as ``sheet_name``, ``header`` and (as
        ``range(cfg.skip_rows)``) ``skiprows``. Rows in which every cell is
        empty are dropped. When ``cfg.columns`` is non-empty, columns whose
        mapping has ``skip`` set are removed and the remaining mapped columns
        are renamed from ``source`` to ``target``.

        Returns:
            The resulting DataFrame.
        """
        # NOTE(review): assumes cfg.sheet selects a single sheet; pandas
        # returns a dict of frames for None or a list -- confirm against
        # SheetConfig.
        df = pd.read_excel(
            self.path,
            sheet_name=cfg.sheet,
            header=cfg.header_row,
            skiprows=range(cfg.skip_rows) if cfg.skip_rows else None,
            engine=_engine_for(self.path),
        )
        # Drop completely empty rows.
        df = df.dropna(how="all")

        if cfg.columns:
            # Remove skipped columns first, ignoring any that are absent
            # from the sheet.
            skip_sources = {c.source for c in cfg.columns if c.skip}
            df = df.drop(
                columns=[name for name in skip_sources if name in df.columns]
            )

            # Rename only the columns whose target differs from the source.
            rename_map = {
                c.source: c.target
                for c in cfg.columns
                if not c.skip and c.source != c.target
            }
            df = df.rename(columns=rename_map)

        return df