# Source code for unsprawl.loader

"""Data loading module for Unsprawl.

This module handles CSV ingestion and schema normalization for HDB resale data.
"""

from __future__ import annotations

import logging
from dataclasses import dataclass

import pandas as pd



@dataclass(frozen=True)
class Schema:
    """Canonical column names expected by the pipeline.

    This class centralizes schema expectations while allowing flexible mapping from
    real-world datasets where names may vary slightly in case or spacing.

    Each attribute holds the column name (a plain string) that the rest of the
    pipeline uses to address that field. Instances are frozen, so a customised
    schema must be created by passing overrides to the constructor, e.g.
    ``Schema(town="estate")``, rather than by mutating an existing instance.
    """

    # Source columns (names as they appear after column-name normalization).
    town: str = "town"
    flat_type: str = "flat_type"
    resale_price: str = "resale_price"
    floor_area: str = "floor_area_sqm"
    remaining_lease_raw: str = "remaining_lease"

    # Engineered fields
    remaining_lease_years: str = "remaining_lease_years"
    price_efficiency: str = "price_efficiency"
    z_price_efficiency: str = "z_price_efficiency"
    valuation_score: str = "valuation_score"




class HDBLoader:
    """Load and normalize HDB resale CSV data.

    The loader focuses on robust file I/O and schema normalization. It lowercases and
    strips column names to mitigate schema drift and attempts to coerce core numeric
    columns into numeric dtype with proper NA handling.
    """

    def __init__(self, schema: Schema | None = None) -> None:
        """Create a loader.

        Parameters
        ----------
        schema : Schema | None
            Column-name mapping to use. When omitted, a default ``Schema()`` is
            created.
        """
        # Explicit None check: only a missing schema triggers the default.
        self.schema = schema if schema is not None else Schema()
        self.logger = logging.getLogger(self.__class__.__name__)

    def load(self, path: str) -> pd.DataFrame:
        """Load CSV into a pandas DataFrame with normalized column names.

        Column names are converted to ``str``, stripped of surrounding whitespace,
        lowercased, and have spaces replaced by underscores. The schema's
        ``resale_price`` and ``floor_area`` columns, when present after
        normalization, are coerced to numeric dtype; values that cannot be parsed
        become NaN.

        Parameters
        ----------
        path : str
            Path to the CSV file.

        Returns
        -------
        pd.DataFrame
            DataFrame with normalized columns and raw types preserved where possible.

        Raises
        ------
        FileNotFoundError
            If the file does not exist.
        ValueError
            If the CSV cannot be parsed. Any other exception raised while reading
            the file is also wrapped in ``ValueError``, with the original exception
            chained as ``__cause__``.
        """
        self.logger.info("Loading CSV file: %s", path)
        try:
            df = pd.read_csv(path)
        except FileNotFoundError:
            # Propagate unchanged so callers can distinguish "missing file"
            # from "unreadable file".
            self.logger.error("File not found: %s", path)
            raise
        except Exception as exc:  # noqa: BLE001
            # Deliberately broad: every other read failure is reported to the
            # caller as a single ValueError, with the traceback logged here.
            self.logger.exception("Failed to read CSV: %s", path)
            raise ValueError(f"Failed to read CSV: {path}") from exc

        # Normalize column names: lowercase, strip, replace spaces with underscore
        df.columns = pd.Index(
            [str(c).strip().lower().replace(" ", "_") for c in df.columns]
        )
        self.logger.debug("Normalized columns: %s", df.columns.tolist())

        # Coerce common numeric fields when present; errors="coerce" turns
        # unparseable entries into NaN instead of raising.
        for col in [self.schema.resale_price, self.schema.floor_area]:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")

        return df