Source code for unsprawl.geocoding

"""Lightweight geocoding helpers.

The core resale dataset published by data.gov.sg does not include coordinates.
For MRT accessibility scoring we need a (lat, lon) per record.

This module provides a conservative fallback: infer coordinates using a
predefined centroid per town.

Notes
-----
- This is an approximation intended to enable accessibility scoring out-of-the-box.
- If your dataset already has a coordinate column (lat/lon), it is kept (coerced
  to numeric) and never overwritten; only a column that is absent is inferred.
"""

from __future__ import annotations

from typing import Final

import numpy as np
import pandas as pd

# Approximate town centroids in WGS84, stored as (lat, lon) tuples and keyed by
# upper-case town name.
# ensure_lat_lon_from_town_centroids strips and upper-cases the town column
# before looking names up here, so every key must stay upper-case with no
# leading or trailing whitespace; a town that is not listed resolves to NaN.
# These are used only when a dataset lacks per-transaction coordinates.
# Values are intentionally coarse; for analysis requiring precise distances,
# provide true lat/lon for each record.
TOWN_CENTROIDS: Final[dict[str, tuple[float, float]]] = {
    "ANG MO KIO": (1.3692, 103.8460),
    "BEDOK": (1.3236, 103.9273),
    "BISHAN": (1.3508, 103.8485),
    "BUKIT BATOK": (1.3590, 103.7637),
    "BUKIT MERAH": (1.2804, 103.8220),
    "BUKIT PANJANG": (1.3806, 103.7626),
    "BUKIT TIMAH": (1.3294, 103.8021),
    "CENTRAL AREA": (1.2850, 103.8510),
    "CHOA CHU KANG": (1.3854, 103.7444),
    "CLEMENTI": (1.3151, 103.7650),
    "GEYLANG": (1.3180, 103.8840),
    "HOUGANG": (1.3612, 103.8863),
    "JURONG EAST": (1.3331, 103.7420),
    "JURONG WEST": (1.3405, 103.7057),
    "KALLANG/WHAMPOA": (1.3103, 103.8620),
    "MARINE PARADE": (1.3026, 103.9076),
    "PASIR RIS": (1.3721, 103.9493),
    "PUNGGOL": (1.3984, 103.9072),
    "QUEENSTOWN": (1.2942, 103.7862),
    "SEMBAWANG": (1.4491, 103.8200),
    "SENGKANG": (1.3868, 103.8914),
    "SERANGOON": (1.3499, 103.8737),
    "TAMPINES": (1.3547, 103.9447),
    "TOA PAYOH": (1.3343, 103.8563),
    "WOODLANDS": (1.4360, 103.7860),
    "YISHUN": (1.4304, 103.8354),
}


def ensure_lat_lon_from_town_centroids(
    df: pd.DataFrame,
    *,
    town_col: str = "town",
    lat_col: str = "lat",
    lon_col: str = "lon",
) -> pd.DataFrame:
    """Return a copy of ``df`` with numeric latitude and longitude columns.

    A coordinate column that already exists is coerced with
    ``pd.to_numeric(errors="coerce")`` and otherwise left as-is: its NaN
    cells are *not* back-filled from the town. A coordinate column that is
    absent is created from the town centroid in :data:`TOWN_CENTROIDS`.

    Parameters
    ----------
    df
        Input frame. It is never modified; the result is a copy.
    town_col
        Name of the column holding town names. Values are converted to
        ``str``, stripped and upper-cased before the centroid lookup.
    lat_col, lon_col
        Names of the latitude and longitude columns to coerce or create.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` containing both ``lat_col`` and ``lon_col``. Rows
        whose town is not in :data:`TOWN_CENTROIDS` get NaN in a created
        column, and a created column is all-NaN when ``town_col`` is absent.
    """
    out = df.copy()

    # (column name, position inside the centroid tuple) for every coordinate
    # column that is absent from the input and therefore has to be created.
    missing: list[tuple[str, int]] = []
    for position, col in enumerate((lat_col, lon_col)):
        if col in out.columns:
            out[col] = pd.to_numeric(out[col], errors="coerce")
        else:
            missing.append((col, position))

    if not missing:
        return out

    # Without a town column there is nothing to infer from.
    if town_col not in out.columns:
        for col, _ in missing:
            out[col] = np.nan
        return out

    towns = out[town_col].astype(str).str.strip().str.upper()
    for col, position in missing:
        # Series.map with a dict yields NaN for unmapped keys, so unknown
        # towns stay NaN without a per-row Python callback.
        lookup = {
            town: centroid[position]
            for town, centroid in TOWN_CENTROIDS.items()
        }
        out[col] = towns.map(lookup).astype(float)
    return out