Files
2026-05-13 11:48:03 +02:00

59 lines
1.7 KiB
Python

from __future__ import annotations
from pathlib import Path
import pandas as pd
from .config import SheetConfig
_ENGINES = {
".xls": "xlrd",
".xlsx": "openpyxl",
".xlsm": "openpyxl",
".xlsb": "openpyxl",
".ods": "odf",
}
def _engine_for(path: Path) -> str:
    """Look up the pandas reader engine registered for *path*'s suffix.

    The suffix is lower-cased before the lookup, so the match is
    case-insensitive. A suffix with no entry in ``_ENGINES`` raises
    ``KeyError``.
    """
    suffix = path.suffix.lower()
    return _ENGINES[suffix]
class ExcelReader:
    """Read worksheets from a spreadsheet file into pandas DataFrames.

    The pandas reader engine is selected from the file suffix via
    ``_ENGINES``.
    """

    def __init__(self, path: str | Path):
        """Validate *path* and store it as ``self.path``.

        Args:
            path: Location of the spreadsheet file.

        Raises:
            FileNotFoundError: If *path* does not exist.
            ValueError: If the file suffix has no entry in ``_ENGINES``.
        """
        self.path = Path(path)
        if not self.path.exists():
            raise FileNotFoundError(f"Excel file not found: {self.path}")
        if self.path.suffix.lower() not in _ENGINES:
            raise ValueError(f"Unsupported file type: {self.path.suffix}")

    def sheet_names(self) -> list[str]:
        """Return the names of the sheets in the workbook."""
        engine = _engine_for(self.path)
        # ExcelFile keeps the workbook open; the context manager makes sure
        # the handle is released once the names have been read.
        with pd.ExcelFile(self.path, engine=engine) as xf:
            return xf.sheet_names

    def read(self, cfg: SheetConfig) -> pd.DataFrame:
        """Load one sheet as described by *cfg* and apply its column mapping.

        Args:
            cfg: Sheet settings. ``cfg.sheet`` is passed as ``sheet_name``,
                ``cfg.header_row`` as ``header``, and a truthy
                ``cfg.skip_rows`` skips that many leading rows. Each entry
                of ``cfg.columns`` provides ``source``, ``target`` and
                ``skip``.

        Returns:
            The sheet data without fully empty rows, with skipped columns
            removed and the remaining mapped columns renamed.
        """
        engine = _engine_for(self.path)
        df = pd.read_excel(
            self.path,
            sheet_name=cfg.sheet,
            header=cfg.header_row,
            skiprows=range(cfg.skip_rows) if cfg.skip_rows else None,
            engine=engine,
        )
        # drop completely empty rows
        df = df.dropna(how="all")
        if not cfg.columns:
            return df

        # apply column mapping: drop skipped columns first, then rename
        skip_sources = {c.source for c in cfg.columns if c.skip}
        # only drop columns that are actually present in the sheet
        present = [name for name in skip_sources if name in df.columns]
        df = df.drop(columns=present)
        rename_map = {
            c.source: c.target
            for c in cfg.columns
            if not c.skip and c.source != c.target
        }
        return df.rename(columns=rename_map)