"""Lightweight geocoding helpers.
The core resale dataset published by data.gov.sg does not include coordinates.
For MRT accessibility scoring we need a (lat, lon) per record.
This module provides a conservative fallback: infer coordinates using a
predefined centroid per town.
Notes
-----
- This is an approximation intended to enable accessibility scoring out-of-the-box.
- If your dataset has true coordinates (lat/lon), those always take precedence.
"""
from __future__ import annotations
from typing import Final
import numpy as np
import pandas as pd
# Coarse per-town centroids, stored as (lat, lon) pairs in WGS84 and keyed by
# upper-case town name.
# They exist purely as a fallback for datasets that carry no per-transaction
# coordinates. Whenever an analysis needs precise distances, supply the true
# lat/lon of each record instead of relying on these values.
TOWN_CENTROIDS: Final[dict[str, tuple[float, float]]] = {
    "ANG MO KIO": (1.3692, 103.8460),
    "BEDOK": (1.3236, 103.9273),
    "BISHAN": (1.3508, 103.8485),
    "BUKIT BATOK": (1.3590, 103.7637),
    "BUKIT MERAH": (1.2804, 103.8220),
    "BUKIT PANJANG": (1.3806, 103.7626),
    "BUKIT TIMAH": (1.3294, 103.8021),
    "CENTRAL AREA": (1.2850, 103.8510),
    "CHOA CHU KANG": (1.3854, 103.7444),
    "CLEMENTI": (1.3151, 103.7650),
    "GEYLANG": (1.3180, 103.8840),
    "HOUGANG": (1.3612, 103.8863),
    "JURONG EAST": (1.3331, 103.7420),
    "JURONG WEST": (1.3405, 103.7057),
    "KALLANG/WHAMPOA": (1.3103, 103.8620),
    "MARINE PARADE": (1.3026, 103.9076),
    "PASIR RIS": (1.3721, 103.9493),
    "PUNGGOL": (1.3984, 103.9072),
    "QUEENSTOWN": (1.2942, 103.7862),
    "SEMBAWANG": (1.4491, 103.8200),
    "SENGKANG": (1.3868, 103.8914),
    "SERANGOON": (1.3499, 103.8737),
    "TAMPINES": (1.3547, 103.9447),
    "TOA PAYOH": (1.3343, 103.8563),
    "WOODLANDS": (1.4360, 103.7860),
    "YISHUN": (1.4304, 103.8354),
}
def ensure_lat_lon_from_town_centroids(
    df: pd.DataFrame,
    *,
    town_col: str = "town",
    lat_col: str = "lat",
    lon_col: str = "lon",
) -> pd.DataFrame:
    """Return a copy of *df* that has numeric latitude and longitude columns.

    Coordinate columns that already exist are coerced to numeric (values that
    cannot be parsed become NaN) and are otherwise left as-is; NaN entries in
    an existing column are *not* back-filled from the town centroid. Only a
    column that is absent altogether is created, using the centroid of each
    row's town from :data:`TOWN_CENTROIDS`.

    Parameters
    ----------
    df
        Input records. The frame is copied; the caller's object is not
        modified.
    town_col
        Name of the column holding the town name. Values are converted to
        strings, stripped and upper-cased before the centroid lookup.
    lat_col
        Name of the latitude column to coerce or create.
    lon_col
        Name of the longitude column to coerce or create.

    Returns
    -------
    pd.DataFrame
        A copy of *df* containing both `lat_col` and `lon_col`. Rows whose
        town is not a key of :data:`TOWN_CENTROIDS` get NaN in any inferred
        column, and every inferred value is NaN when `town_col` is absent.
    """
    out = df.copy()
    has_lat = lat_col in out.columns
    has_lon = lon_col in out.columns

    # Existing coordinates win: normalise their dtype but never overwrite them.
    if has_lat:
        out[lat_col] = pd.to_numeric(out[lat_col], errors="coerce")
    if has_lon:
        out[lon_col] = pd.to_numeric(out[lon_col], errors="coerce")
    if has_lat and has_lon:
        return out

    # (column name, already present?, index into the (lat, lon) centroid tuple)
    targets = ((lat_col, has_lat, 0), (lon_col, has_lon, 1))

    # Without a town column there is nothing to infer from; still guarantee
    # that both coordinate columns exist in the result.
    if town_col not in out.columns:
        for column, present, _ in targets:
            if not present:
                out[column] = np.nan
        return out

    towns = out[town_col].astype(str).str.strip().str.upper()
    for column, present, axis in targets:
        if present:
            continue
        # Mapping through a dict leaves towns that are not keys as NaN.
        lookup = {town: coords[axis] for town, coords in TOWN_CENTROIDS.items()}
        out[column] = towns.map(lookup).astype(float)
    return out