"""unsprawl.adapters.sg.
Singapore adapter.
This module is allowed to carry Singapore-specific semantics.
Its job is to map GovSG datasets into universal ``Asset`` objects.
"""
from __future__ import annotations
import hashlib
from typing import Any
import pandas as pd
from unsprawl.core.schemas import Asset
from unsprawl.providers.data.sg.govsg import GovSGProvider
[docs]
def _parse_remaining_lease_years(value: Any) -> float:
"""Parse the SG 'remaining_lease' field into fractional years.
Expected formats include:
- '94 years'
- '94 years 05 months'
Returns 0.0 if the input cannot be parsed.
"""
if value is None:
return 0.0
s = str(value).strip().lower()
if not s:
return 0.0
years = 0.0
months = 0.0
# very small, robust parser
parts = s.replace("years", "year").replace("months", "month").split()
for i, token in enumerate(parts):
if token == "year" and i > 0:
try:
years = float(parts[i - 1])
except ValueError:
years = 0.0
if token == "month" and i > 0:
try:
months = float(parts[i - 1])
except ValueError:
months = 0.0
return float(years) + float(months) / 12.0
[docs]
class SGAdapter:
    """Normalize Singapore datasets into core ``Asset`` objects."""

    def __init__(self, provider: GovSGProvider | None = None) -> None:
        """Store the data provider.

        Args:
            provider: Source of the raw datasets. When omitted (or falsy), a
                default ``GovSGProvider()`` is constructed.
        """
        self.provider = provider or GovSGProvider()

    def fetch(self, region_code: str) -> list[Asset]:
        """Fetch assets from Singapore datasets.

        Args:
            region_code: Must be exactly ``"SG"``.

        Returns:
            One ``Asset`` per row of the provider's resale price table.

        Raises:
            ValueError: If ``region_code`` is anything other than ``"SG"``.
        """
        if region_code != "SG":
            raise ValueError(
                f"SGAdapter only supports region_code='SG' (got {region_code!r})"
            )
        df = self.provider.fetch_resale_prices()
        return self._assets_from_resale_prices(df)

    @staticmethod
    def _as_float(value: Any) -> float:
        """Coerce a numeric cell to ``float``, mapping missing values to 0.0.

        pandas represents missing cells as NaN (or ``pd.NA``). NaN is truthy,
        so a plain ``value or 0.0`` fallback lets it through, and ``pd.NA``
        raises on boolean evaluation; both are handled explicitly here.

        Raises:
            ValueError: If ``value`` is present but not convertible to float.
        """
        if pd.isna(value):
            return 0.0
        # ``or`` keeps the fallback for other falsy cells such as "".
        return float(value or 0.0)

    def _assets_from_resale_prices(self, df: pd.DataFrame) -> list[Asset]:
        """Convert resale price DataFrame to Asset objects.

        Args:
            df: Resale price table. Column names are normalized on a copy
                (stripped, lower-cased, spaces replaced with underscores);
                the caller's frame is not modified.

        Returns:
            One ``Asset`` per row, in row order. Missing ``floor_area_sqm``
            and ``resale_price`` values become ``0.0``.
        """
        assets: list[Asset] = []

        # Normalize columns like the legacy loader does.
        df = df.copy()
        df.columns = pd.Index(
            [str(c).strip().lower().replace(" ", "_") for c in df.columns]
        )

        # Fields hashed into the asset ID, in this exact order.
        id_fields = (
            "month",
            "town",
            "block",
            "street_name",
            "floor_area_sqm",
            "resale_price",
        )

        for idx, row in df.iterrows():
            # Deterministic ID from a small set of stable fields.
            # We avoid using idx alone because CSV ordering is not stable
            # across releases.
            stable = "|".join(str(row.get(name, "")) for name in id_fields)
            asset_id = hashlib.sha256(stable.encode("utf-8")).hexdigest()[:24]

            # We do not have per-asset geocoding yet in this dataset.
            # Use a sentinel location of (0,0) until adapters enrich with
            # coordinates. Future-proofing: the schema enforces (lat, lon),
            # so adapters must fill it correctly.
            location = (0.0, 0.0)

            floor_area = self._as_float(row.get("floor_area_sqm"))
            lease_remaining_years = _parse_remaining_lease_years(
                row.get("remaining_lease")
            )
            price = self._as_float(row.get("resale_price"))

            assets.append(
                Asset(
                    id=asset_id,
                    location=location,
                    asset_type="residential",
                    floor_area_sqm=floor_area,
                    lease_remaining_years=lease_remaining_years,
                    valuation_currency="SGD",
                    predicted_valuation=price,
                    local_metadata={
                        # Keep SG-specific fields quarantined.
                        "town": row.get("town"),
                        "flat_type": row.get("flat_type"),
                        "flat_model": row.get("flat_model"),
                        "storey_range": row.get("storey_range"),
                        "block": row.get("block"),
                        "street_name": row.get("street_name"),
                        "month": row.get("month"),
                        "lease_commence_date": row.get("lease_commence_date"),
                        # Plain ints are kept as ints; any other index label
                        # is stored as its string form.
                        "source_row_index": (
                            int(idx) if isinstance(idx, int) else str(idx)
                        ),
                    },
                )
            )
        return assets