"""Read spreadsheet files into pandas DataFrames using a per-sheet config."""

from __future__ import annotations

from pathlib import Path

import pandas as pd

from .config import SheetConfig

# File suffix (lower-case, including the dot) -> pandas reader engine name.
_ENGINES = {
    ".xls": "xlrd",
    ".xlsx": "openpyxl",
    ".xlsm": "openpyxl",
    # openpyxl cannot open binary workbooks; pandas reads them via pyxlsb.
    ".xlsb": "pyxlsb",
    ".ods": "odf",
}


def _engine_for(path: Path) -> str:
    """Return the pandas engine name registered for *path*'s suffix.

    The suffix is compared case-insensitively.

    Raises:
        KeyError: If the suffix has no entry in ``_ENGINES``.
    """
    return _ENGINES[path.suffix.lower()]


class ExcelReader:
    """Reader for a single spreadsheet file on disk.

    The path is validated at construction time; the file itself is only
    opened when ``sheet_names`` or ``read`` is called.
    """

    def __init__(self, path: str | Path):
        """Store *path* and verify that it exists and has a supported suffix.

        Raises:
            FileNotFoundError: If nothing exists at *path*.
            ValueError: If the file suffix is not a key of ``_ENGINES``.
        """
        self.path = Path(path)
        if not self.path.exists():
            raise FileNotFoundError(f"Excel file not found: {self.path}")
        if self.path.suffix.lower() not in _ENGINES:
            raise ValueError(f"Unsupported file type: {self.path.suffix}")

    def sheet_names(self) -> list[str]:
        """Return the names of the sheets in the workbook."""
        engine = _engine_for(self.path)
        # Use the context manager so the underlying workbook handle is
        # closed as soon as the names have been read.
        with pd.ExcelFile(self.path, engine=engine) as xf:
            return list(xf.sheet_names)

    def read(self, cfg: SheetConfig) -> pd.DataFrame:
        """Load the sheet described by *cfg* into a DataFrame.

        ``cfg.sheet`` and ``cfg.header_row`` are forwarded to
        ``pandas.read_excel`` as ``sheet_name`` and ``header``. When
        ``cfg.skip_rows`` is truthy, row indices ``0 .. cfg.skip_rows - 1``
        are passed as ``skiprows``.

        After loading, rows in which every cell is missing are dropped.
        If ``cfg.columns`` is non-empty, columns whose mapping has ``skip``
        set are removed (when present in the sheet) and the remaining
        mapped columns are renamed from ``source`` to ``target``.
        """
        engine = _engine_for(self.path)
        df = pd.read_excel(
            self.path,
            sheet_name=cfg.sheet,
            header=cfg.header_row,
            skiprows=range(cfg.skip_rows) if cfg.skip_rows else None,
            engine=engine,
        )

        # Drop completely empty rows.
        df.dropna(how="all", inplace=True)

        # Apply column mapping: drop skipped columns, then rename the rest.
        if cfg.columns:
            skip_sources = {c.source for c in cfg.columns if c.skip}
            # Only drop columns that actually exist so that a mapping entry
            # for an absent column does not raise.
            df.drop(
                columns=[c for c in skip_sources if c in df.columns],
                inplace=True,
            )

            rename_map = {
                c.source: c.target
                for c in cfg.columns
                if not c.skip and c.source != c.target
            }
            df.rename(columns=rename_map, inplace=True)

        return df