2d9bce014f
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
59 lines
1.7 KiB
Python
59 lines
1.7 KiB
Python
from __future__ import annotations
|
|
from pathlib import Path
|
|
import pandas as pd
|
|
|
|
from .config import SheetConfig
|
|
|
|
|
|
_ENGINES = {
|
|
".xls": "xlrd",
|
|
".xlsx": "openpyxl",
|
|
".xlsm": "openpyxl",
|
|
".xlsb": "openpyxl",
|
|
".ods": "odf",
|
|
}
|
|
|
|
|
|
def _engine_for(path: Path) -> str:
    """Return the engine name registered in ``_ENGINES`` for *path*'s suffix.

    The suffix is lowercased before the lookup. A suffix that is not a key
    of ``_ENGINES`` raises ``KeyError``.
    """
    suffix = path.suffix.lower()
    return _ENGINES[suffix]
|
|
|
|
|
|
class ExcelReader:
    """Read sheets of a spreadsheet file into pandas DataFrames.

    The file's existence and its suffix (which must be a key of
    ``_ENGINES``) are validated once, at construction time.
    """

    def __init__(self, path: str | Path):
        """Store *path* as a ``Path`` and validate it.

        Raises:
            FileNotFoundError: if the path does not exist.
            ValueError: if the lowercased suffix is not a key of ``_ENGINES``.
        """
        self.path = Path(path)
        if not self.path.exists():
            raise FileNotFoundError(f"Excel file not found: {self.path}")
        if self.path.suffix.lower() not in _ENGINES:
            raise ValueError(f"Unsupported file type: {self.path.suffix}")

    def sheet_names(self) -> list[str]:
        """Return the names of the sheets in the workbook."""
        engine = _engine_for(self.path)
        # Use the ExcelFile as a context manager so the underlying file
        # handle is closed as soon as the names have been read.
        with pd.ExcelFile(self.path, engine=engine) as xf:
            return xf.sheet_names

    def read(self, cfg: SheetConfig) -> pd.DataFrame:
        """Load the sheet described by *cfg* and apply its column mapping.

        Reads ``cfg.sheet`` with ``cfg.header_row`` as the header and the
        first ``cfg.skip_rows`` rows skipped (when that value is truthy),
        drops rows whose cells are all empty, then, if ``cfg.columns`` is
        non-empty, drops columns flagged ``skip`` and renames the remaining
        ``source`` columns to their ``target`` names.

        NOTE(review): assumes ``cfg.sheet`` selects a single sheet so that
        ``pd.read_excel`` returns one DataFrame -- confirm against
        ``SheetConfig``.
        """
        engine = _engine_for(self.path)
        df = pd.read_excel(
            self.path,
            sheet_name=cfg.sheet,
            header=cfg.header_row,
            skiprows=range(cfg.skip_rows) if cfg.skip_rows else None,
            engine=engine,
        )
        # drop completely empty rows
        df.dropna(how="all", inplace=True)

        # apply column mapping: rename and drop skipped columns
        if cfg.columns:
            skip_sources = {c.source for c in cfg.columns if c.skip}
            # Only drop columns actually present, so a configured column
            # that is missing from the sheet does not raise.
            df.drop(columns=[c for c in skip_sources if c in df.columns], inplace=True)

            rename_map = {
                c.source: c.target
                for c in cfg.columns
                if not c.skip and c.source != c.target
            }
            df.rename(columns=rename_map, inplace=True)

        return df
|