"""Data loading module for Unsprawl.
This module handles CSV ingestion and schema normalization for HDB resale data.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
import pandas as pd
[docs]
@dataclass(frozen=True)
class Schema:
"""Canonical column names expected by the pipeline.
This class centralizes schema expectations while allowing flexible mapping from
real-world datasets where names may vary slightly in case or spacing.
"""
town: str = "town"
flat_type: str = "flat_type"
resale_price: str = "resale_price"
floor_area: str = "floor_area_sqm"
remaining_lease_raw: str = "remaining_lease"
# Engineered fields
remaining_lease_years: str = "remaining_lease_years"
price_efficiency: str = "price_efficiency"
z_price_efficiency: str = "z_price_efficiency"
valuation_score: str = "valuation_score"
[docs]
class HDBLoader:
"""Load and normalize HDB resale CSV data.
The loader focuses on robust file I/O and schema normalization. It lowercases and
strips column names to mitigate schema drift and attempts to coerce core numeric
columns into numeric dtype with proper NA handling.
"""
def __init__(self, schema: Schema | None = None) -> None:
self.schema = schema or Schema()
self.logger = logging.getLogger(self.__class__.__name__)
[docs]
def load(self, path: str) -> pd.DataFrame:
"""Load CSV into a pandas DataFrame with normalized column names.
Parameters
----------
path : str
Path to the CSV file.
Returns
-------
pd.DataFrame
DataFrame with normalized columns and raw types preserved where possible.
Raises
------
FileNotFoundError
If the file does not exist.
ValueError
If the CSV cannot be parsed.
"""
self.logger.info("Loading CSV file: %s", path)
try:
df = pd.read_csv(path)
except FileNotFoundError:
self.logger.error("File not found: %s", path)
raise
except Exception as exc: # noqa: BLE001
self.logger.exception("Failed to read CSV: %s", path)
raise ValueError(f"Failed to read CSV: {path}") from exc
# Normalize column names: lowercase, strip, replace spaces with underscore
df.columns = pd.Index(
[str(c).strip().lower().replace(" ", "_") for c in df.columns]
)
self.logger.debug("Normalized columns: %s", df.columns.tolist())
# Coerce common numeric fields when present
for col in [self.schema.resale_price, self.schema.floor_area]:
if col in df.columns:
df[col] = pd.to_numeric(df[col], errors="coerce")
return df