Source code for unsprawl.adapters.sg

"""unsprawl.adapters.sg.

Singapore adapter.

This module is allowed to carry Singapore-specific semantics.
Its job is to map GovSG datasets into universal ``Asset`` objects.
"""

from __future__ import annotations

import hashlib
import math
from typing import Any

import pandas as pd

from unsprawl.core.schemas import Asset
from unsprawl.providers.data.sg.govsg import GovSGProvider


[docs] def _parse_remaining_lease_years(value: Any) -> float: """Parse the SG 'remaining_lease' field into fractional years. Expected formats include: - '94 years' - '94 years 05 months' Returns 0.0 if the input cannot be parsed. """ if value is None: return 0.0 s = str(value).strip().lower() if not s: return 0.0 years = 0.0 months = 0.0 # very small, robust parser parts = s.replace("years", "year").replace("months", "month").split() for i, token in enumerate(parts): if token == "year" and i > 0: try: years = float(parts[i - 1]) except ValueError: years = 0.0 if token == "month" and i > 0: try: months = float(parts[i - 1]) except ValueError: months = 0.0 return float(years) + float(months) / 12.0
class SGAdapter:
    """Normalize Singapore datasets into core ``Asset`` objects.

    The adapter pulls the resale-price table from its provider and converts
    each row into an ``Asset``; Singapore-specific columns are kept under
    ``local_metadata``.
    """

    # Columns hashed together to form the deterministic asset id.
    # The order is part of the id: changing it changes every generated id.
    _ID_FIELDS = (
        "month",
        "town",
        "block",
        "street_name",
        "floor_area_sqm",
        "resale_price",
    )

    # SG-specific columns copied verbatim into ``Asset.local_metadata``.
    _METADATA_FIELDS = (
        "town",
        "flat_type",
        "flat_model",
        "storey_range",
        "block",
        "street_name",
        "month",
        "lease_commence_date",
    )

    def __init__(self, provider: GovSGProvider | None = None) -> None:
        """Store the data provider.

        Args:
            provider: Source of the resale-price DataFrame. When ``None`` a
                default ``GovSGProvider()`` is constructed.
        """
        # Compare against None explicitly so that a caller-supplied provider
        # that happens to be falsy is not silently replaced by the default.
        self.provider = provider if provider is not None else GovSGProvider()

    def fetch(self, region_code: str) -> list[Asset]:
        """Fetch assets from Singapore datasets.

        Args:
            region_code: Must be exactly ``"SG"``.

        Returns:
            One ``Asset`` per row of the provider's resale-price table.

        Raises:
            ValueError: If ``region_code`` is anything other than ``"SG"``.
        """
        if region_code != "SG":
            raise ValueError(
                f"SGAdapter only supports region_code='SG' (got {region_code!r})"
            )
        resale_df = self.provider.fetch_resale_prices()
        return self._assets_from_resale_prices(resale_df)

    @staticmethod
    def _float_or_zero(value: Any) -> float:
        """Convert a cell to float, mapping missing values to 0.0.

        ``None``, other falsy values and NaN all become 0.0. NaN is checked
        explicitly because it is truthy, so ``value or 0.0`` alone lets it
        through. Unparsable text still raises ``ValueError`` from ``float``.
        """
        number = float(value or 0.0)
        return 0.0 if pd.isna(number) else number

    def _assets_from_resale_prices(self, df: pd.DataFrame) -> list[Asset]:
        """Convert resale price DataFrame to Asset objects.

        Column labels are normalized (stripped, lower-cased, spaces replaced
        by underscores) on a copy, so the caller's frame is not modified.

        Args:
            df: Resale-price table as returned by the provider.

        Returns:
            A list with one ``Asset`` per row, in row order.
        """
        # Normalize columns like the legacy loader does.
        df = df.copy()
        df.columns = pd.Index(
            [str(c).strip().lower().replace(" ", "_") for c in df.columns]
        )

        assets: list[Asset] = []
        for idx, row in df.iterrows():
            # Deterministic ID from a small set of stable fields.
            # We avoid using idx alone because CSV ordering is not stable
            # across releases.
            stable = "|".join(
                str(row.get(field, "")) for field in self._ID_FIELDS
            )
            asset_id = hashlib.sha256(stable.encode("utf-8")).hexdigest()[:24]

            # We do not have per-asset geocoding yet in this dataset.
            # Use a sentinel location of (0,0) until adapters enrich with
            # coordinates. The schema enforces (lat, lon), so adapters must
            # fill it correctly.
            location = (0.0, 0.0)

            # Keep SG-specific fields quarantined in local_metadata.
            metadata: dict[str, Any] = {
                field: row.get(field) for field in self._METADATA_FIELDS
            }
            metadata["source_row_index"] = (
                int(idx) if isinstance(idx, int) else str(idx)
            )

            assets.append(
                Asset(
                    id=asset_id,
                    location=location,
                    asset_type="residential",
                    floor_area_sqm=self._float_or_zero(
                        row.get("floor_area_sqm")
                    ),
                    lease_remaining_years=_parse_remaining_lease_years(
                        row.get("remaining_lease")
                    ),
                    valuation_currency="SGD",
                    predicted_valuation=self._float_or_zero(
                        row.get("resale_price")
                    ),
                    local_metadata=metadata,
                )
            )
        return assets