Initial implementation of generic Excel-to-DB import tool
Supports .xls and .xlsx, Oracle and PostgreSQL via SQLAlchemy. Includes CLI (run/inspect/generate-config), YAML config, auto schema detection, and append/replace/upsert modes. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,49 @@
|
||||
from __future__ import annotations
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
|
||||
from .config import SheetConfig
|
||||
|
||||
|
||||
def _engine_for(path: Path) -> str:
|
||||
return "xlrd" if path.suffix.lower() == ".xls" else "openpyxl"
|
||||
|
||||
|
||||
class ExcelReader:
    """Read sheets of a single Excel workbook into pandas DataFrames.

    The path is validated when the reader is constructed; the workbook
    itself is only opened when ``sheet_names`` or ``read`` is called.
    """

    def __init__(self, path: str | Path):
        """Store *path* as a ``Path`` and validate it.

        Raises:
            FileNotFoundError: if *path* does not exist.
            ValueError: if the suffix is not one of ``.xls``, ``.xlsx``,
                ``.xlsm`` or ``.xlsb`` (compared case-insensitively).
        """
        self.path = Path(path)
        if not self.path.exists():
            raise FileNotFoundError(f"Excel file not found: {self.path}")
        if self.path.suffix.lower() not in {".xls", ".xlsx", ".xlsm", ".xlsb"}:
            raise ValueError(f"Unsupported file type: {self.path.suffix}")

    def sheet_names(self) -> list[str]:
        """Return the names of all sheets in the workbook."""
        engine = _engine_for(self.path)
        # Use the context manager so the underlying file handle is closed
        # as soon as the names have been read, rather than whenever the
        # ExcelFile object happens to be garbage collected.
        with pd.ExcelFile(self.path, engine=engine) as xf:
            return list(xf.sheet_names)

    def read(self, cfg: SheetConfig) -> pd.DataFrame:
        """Load the sheet described by *cfg* and apply its column mapping.

        ``cfg.sheet`` and ``cfg.header_row`` are passed to
        ``pandas.read_excel`` as ``sheet_name`` and ``header``; when
        ``cfg.skip_rows`` is truthy, that many leading rows are skipped.
        Rows that are entirely empty are dropped.  If ``cfg.columns`` is
        set, columns whose mapping has ``skip`` set are removed (when
        present in the sheet) and the remaining mapped columns are renamed
        from ``source`` to ``target``.
        """
        engine = _engine_for(self.path)
        df = pd.read_excel(
            self.path,
            sheet_name=cfg.sheet,
            header=cfg.header_row,
            skiprows=range(cfg.skip_rows) if cfg.skip_rows else None,
            engine=engine,
        )

        # Drop completely empty rows.
        df = df.dropna(how="all")

        if not cfg.columns:
            return df

        # Remove columns the config marks as skipped; sources that are not
        # present in the sheet are ignored.
        skip_sources = {c.source for c in cfg.columns if c.skip}
        df = df.drop(columns=[name for name in skip_sources if name in df.columns])

        # Rename the remaining mapped columns whose target name differs.
        rename_map = {
            c.source: c.target
            for c in cfg.columns
            if not c.skip and c.source != c.target
        }
        return df.rename(columns=rename_map)
|
||||
Reference in New Issue
Block a user