From 66ab177faa1ef240d7928d6a8c6a7a16496efae4 Mon Sep 17 00:00:00 2001 From: bachir Date: Fri, 27 Mar 2026 09:57:35 +0100 Subject: [PATCH 1/7] feat(uc3-uhi): tessera integration --- .../heat_guatemala_full_fusion_avg_best.yaml | 34 +++++++++++++++++ .../heat_guatemala_full_fusion_cnn_best.yaml | 34 +++++++++++++++++ .../heat_guatemala_fusion_best.yaml | 27 +++++++++++++ .../heat_guatemala_geoclip_best.yaml | 27 +++++++++++++ .../heat_guatemala_tabular_best.yaml | 27 +++++++++++++ .../heat_guatemala_tessera_avg_best.yaml | 34 +++++++++++++++++ .../heat_guatemala_tessera_cnn_best.yaml | 34 +++++++++++++++++ configs/model/heat_full_fusion_avg_best.yaml | 34 +++++++++++++++++ configs/model/heat_full_fusion_cnn_best.yaml | 38 +++++++++++++++++++ configs/model/heat_fusion_best.yaml | 31 +++++++++++++++ configs/model/heat_geoclip_best.yaml | 23 +++++++++++ configs/model/heat_tabular_best.yaml | 25 ++++++++++++ configs/model/heat_tessera_avg_best.yaml | 24 ++++++++++++ configs/model/heat_tessera_cnn_best.yaml | 28 ++++++++++++++ src/data/base_dataset.py | 2 +- src/data/heat_guatemala_dataset.py | 15 +++++++- .../components/geo_encoders/cnn_encoder.py | 4 +- 17 files changed, 436 insertions(+), 5 deletions(-) create mode 100644 configs/experiment/heat_guatemala_full_fusion_avg_best.yaml create mode 100644 configs/experiment/heat_guatemala_full_fusion_cnn_best.yaml create mode 100644 configs/experiment/heat_guatemala_fusion_best.yaml create mode 100644 configs/experiment/heat_guatemala_geoclip_best.yaml create mode 100644 configs/experiment/heat_guatemala_tabular_best.yaml create mode 100644 configs/experiment/heat_guatemala_tessera_avg_best.yaml create mode 100644 configs/experiment/heat_guatemala_tessera_cnn_best.yaml create mode 100644 configs/model/heat_full_fusion_avg_best.yaml create mode 100644 configs/model/heat_full_fusion_cnn_best.yaml create mode 100644 configs/model/heat_fusion_best.yaml create mode 100644 configs/model/heat_geoclip_best.yaml create mode 100644 
configs/model/heat_tabular_best.yaml create mode 100644 configs/model/heat_tessera_avg_best.yaml create mode 100644 configs/model/heat_tessera_cnn_best.yaml diff --git a/configs/experiment/heat_guatemala_full_fusion_avg_best.yaml b/configs/experiment/heat_guatemala_full_fusion_avg_best.yaml new file mode 100644 index 0000000..72c2896 --- /dev/null +++ b/configs/experiment/heat_guatemala_full_fusion_avg_best.yaml @@ -0,0 +1,34 @@ +# @package _global_ +# Best config: R2=0.672, RMSE=1.104, MAE=0.896 +defaults: + - override /model: heat_full_fusion_avg_best + - override /data: heat_guatemala + - override /metrics: guatemala_regression +tags: ["heat_island", "guatemala", "full_fusion_avg", "best", "regression"] +seed: 12345 +trainer: + min_epochs: 1 + max_epochs: 100 +data: + batch_size: 64 + dataset: + modalities: + coords: {} + tessera: + year: 2024 + size: 10 + format: npy +callbacks: + model_checkpoint: + monitor: val_r2 + mode: max + early_stopping: + monitor: val_r2 + mode: max + patience: 20 +logger: + wandb: + tags: ${tags} + group: "heat_island" + aim: + experiment: "heat_island" diff --git a/configs/experiment/heat_guatemala_full_fusion_cnn_best.yaml b/configs/experiment/heat_guatemala_full_fusion_cnn_best.yaml new file mode 100644 index 0000000..d87c38a --- /dev/null +++ b/configs/experiment/heat_guatemala_full_fusion_cnn_best.yaml @@ -0,0 +1,34 @@ +# @package _global_ +# Best config: R2=0.647, RMSE=1.144, MAE=0.931 +defaults: + - override /model: heat_full_fusion_cnn_best + - override /data: heat_guatemala + - override /metrics: guatemala_regression +tags: ["heat_island", "guatemala", "full_fusion_cnn", "best", "regression"] +seed: 12345 +trainer: + min_epochs: 1 + max_epochs: 100 +data: + batch_size: 64 + dataset: + modalities: + coords: {} + tessera: + year: 2024 + size: 10 + format: npy +callbacks: + model_checkpoint: + monitor: val_r2 + mode: max + early_stopping: + monitor: val_r2 + mode: max + patience: 20 +logger: + wandb: + tags: ${tags} + group: 
"heat_island" + aim: + experiment: "heat_island" diff --git a/configs/experiment/heat_guatemala_fusion_best.yaml b/configs/experiment/heat_guatemala_fusion_best.yaml new file mode 100644 index 0000000..dbab3a1 --- /dev/null +++ b/configs/experiment/heat_guatemala_fusion_best.yaml @@ -0,0 +1,27 @@ +# @package _global_ +# Best config: R2=0.555, RMSE=1.285, MAE=1.039 +defaults: + - override /model: heat_fusion_best + - override /data: heat_guatemala + - override /metrics: guatemala_regression +tags: ["heat_island", "guatemala", "fusion", "best", "regression"] +seed: 12345 +trainer: + min_epochs: 1 + max_epochs: 100 +data: + batch_size: 64 +callbacks: + model_checkpoint: + monitor: val_r2 + mode: max + early_stopping: + monitor: val_r2 + mode: max + patience: 20 +logger: + wandb: + tags: ${tags} + group: "heat_island" + aim: + experiment: "heat_island" diff --git a/configs/experiment/heat_guatemala_geoclip_best.yaml b/configs/experiment/heat_guatemala_geoclip_best.yaml new file mode 100644 index 0000000..5c555af --- /dev/null +++ b/configs/experiment/heat_guatemala_geoclip_best.yaml @@ -0,0 +1,27 @@ +# @package _global_ +# Best config: R2=0.323, RMSE=1.607, MAE=1.344 +defaults: + - override /model: heat_geoclip_best + - override /data: heat_guatemala + - override /metrics: guatemala_regression +tags: ["heat_island", "guatemala", "coords", "best", "regression"] +seed: 12345 +trainer: + min_epochs: 1 + max_epochs: 100 +data: + batch_size: 64 +callbacks: + model_checkpoint: + monitor: val_r2 + mode: max + early_stopping: + monitor: val_r2 + mode: max + patience: 20 +logger: + wandb: + tags: ${tags} + group: "heat_island" + aim: + experiment: "heat_island" diff --git a/configs/experiment/heat_guatemala_tabular_best.yaml b/configs/experiment/heat_guatemala_tabular_best.yaml new file mode 100644 index 0000000..952b33f --- /dev/null +++ b/configs/experiment/heat_guatemala_tabular_best.yaml @@ -0,0 +1,27 @@ +# @package _global_ +# Best config: R2=0.562, RMSE=1.282, MAE=1.040 
+defaults: + - override /model: heat_tabular_best + - override /data: heat_guatemala + - override /metrics: guatemala_regression +tags: ["heat_island", "guatemala", "tabular", "best", "regression"] +seed: 12345 +trainer: + min_epochs: 1 + max_epochs: 100 +data: + batch_size: 64 +callbacks: + model_checkpoint: + monitor: val_r2 + mode: max + early_stopping: + monitor: val_r2 + mode: max + patience: 20 +logger: + wandb: + tags: ${tags} + group: "heat_island" + aim: + experiment: "heat_island" diff --git a/configs/experiment/heat_guatemala_tessera_avg_best.yaml b/configs/experiment/heat_guatemala_tessera_avg_best.yaml new file mode 100644 index 0000000..21bc135 --- /dev/null +++ b/configs/experiment/heat_guatemala_tessera_avg_best.yaml @@ -0,0 +1,34 @@ +# @package _global_ +# Best config: R2=0.733, RMSE=1.011, MAE=0.814 +defaults: + - override /model: heat_tessera_avg_best + - override /data: heat_guatemala + - override /metrics: guatemala_regression +tags: ["heat_island", "guatemala", "tessera_avg", "best", "regression"] +seed: 12345 +trainer: + min_epochs: 1 + max_epochs: 100 +data: + batch_size: 64 + dataset: + modalities: + coords: {} + tessera: + year: 2024 + size: 10 + format: npy +callbacks: + model_checkpoint: + monitor: val_r2 + mode: max + early_stopping: + monitor: val_r2 + mode: max + patience: 20 +logger: + wandb: + tags: ${tags} + group: "heat_island" + aim: + experiment: "heat_island" diff --git a/configs/experiment/heat_guatemala_tessera_cnn_best.yaml b/configs/experiment/heat_guatemala_tessera_cnn_best.yaml new file mode 100644 index 0000000..c7cc1d1 --- /dev/null +++ b/configs/experiment/heat_guatemala_tessera_cnn_best.yaml @@ -0,0 +1,34 @@ +# @package _global_ +# Best config: R2=0.694, RMSE=1.088, MAE=0.877 +defaults: + - override /model: heat_tessera_cnn_best + - override /data: heat_guatemala + - override /metrics: guatemala_regression +tags: ["heat_island", "guatemala", "tessera_cnn", "best", "regression"] +seed: 12345 +trainer: + min_epochs: 1 + 
max_epochs: 100 +data: + batch_size: 64 + dataset: + modalities: + coords: {} + tessera: + year: 2024 + size: 10 + format: npy +callbacks: + model_checkpoint: + monitor: val_r2 + mode: max + early_stopping: + monitor: val_r2 + mode: max + patience: 20 +logger: + wandb: + tags: ${tags} + group: "heat_island" + aim: + experiment: "heat_island" diff --git a/configs/model/heat_full_fusion_avg_best.yaml b/configs/model/heat_full_fusion_avg_best.yaml new file mode 100644 index 0000000..df5799d --- /dev/null +++ b/configs/model/heat_full_fusion_avg_best.yaml @@ -0,0 +1,34 @@ +_target_: src.models.predictive_model.PredictiveModel +geo_encoder: + _target_: src.models.components.geo_encoders.encoder_wrapper.EncoderWrapper + encoder_branches: + - encoder: + _target_: src.models.components.geo_encoders.geoclip.GeoClipCoordinateEncoder + - encoder: + _target_: src.models.components.geo_encoders.tabular_encoder.TabularEncoder + output_dim: 64 + geo_data_name: tabular + - encoder: + _target_: src.models.components.geo_encoders.average_encoder.AverageEncoder + geo_data_name: tessera + fusion_strategy: concat +prediction_head: + _target_: src.models.components.pred_heads.mlp_regression_head.MLPRegressionPredictionHead + nn_layers: 3 + hidden_dim: 512 +trainable_modules: [geo_encoder.encoder_branches.1, prediction_head] +normalize_features: false +metrics: ${metrics} +optimizer: + _target_: torch.optim.Adam + _partial_: true + lr: 0.0001 + weight_decay: 0.0 +scheduler: + _target_: torch.optim.lr_scheduler.ReduceLROnPlateau + _partial_: true + mode: min + factor: 0.1 + patience: 10 +loss_fn: + _target_: torch.nn.MSELoss diff --git a/configs/model/heat_full_fusion_cnn_best.yaml b/configs/model/heat_full_fusion_cnn_best.yaml new file mode 100644 index 0000000..ea07641 --- /dev/null +++ b/configs/model/heat_full_fusion_cnn_best.yaml @@ -0,0 +1,38 @@ +_target_: src.models.predictive_model.PredictiveModel +geo_encoder: + _target_: 
src.models.components.geo_encoders.encoder_wrapper.EncoderWrapper + encoder_branches: + - encoder: + _target_: src.models.components.geo_encoders.geoclip.GeoClipCoordinateEncoder + - encoder: + _target_: src.models.components.geo_encoders.tabular_encoder.TabularEncoder + output_dim: 64 + geo_data_name: tabular + - encoder: + _target_: src.models.components.geo_encoders.cnn_encoder.CNNEncoder + geo_data_name: tessera + resnet_version: 18 + pretrained_cnn: imagenet + freezing_strategy: all + output_dim: 256 + fusion_strategy: concat +prediction_head: + _target_: src.models.components.pred_heads.mlp_regression_head.MLPRegressionPredictionHead + nn_layers: 3 + hidden_dim: 256 +trainable_modules: [geo_encoder.encoder_branches.1, geo_encoder.encoder_branches.2, prediction_head] +normalize_features: true +metrics: ${metrics} +optimizer: + _target_: torch.optim.Adam + _partial_: true + lr: 0.001 + weight_decay: 0.0 +scheduler: + _target_: torch.optim.lr_scheduler.ReduceLROnPlateau + _partial_: true + mode: min + factor: 0.1 + patience: 10 +loss_fn: + _target_: torch.nn.MSELoss diff --git a/configs/model/heat_fusion_best.yaml b/configs/model/heat_fusion_best.yaml new file mode 100644 index 0000000..40205fc --- /dev/null +++ b/configs/model/heat_fusion_best.yaml @@ -0,0 +1,31 @@ +_target_: src.models.predictive_model.PredictiveModel +geo_encoder: + _target_: src.models.components.geo_encoders.encoder_wrapper.EncoderWrapper + encoder_branches: + - encoder: + _target_: src.models.components.geo_encoders.geoclip.GeoClipCoordinateEncoder + - encoder: + _target_: src.models.components.geo_encoders.tabular_encoder.TabularEncoder + output_dim: 64 + geo_data_name: tabular + fusion_strategy: concat +prediction_head: + _target_: src.models.components.pred_heads.mlp_regression_head.MLPRegressionPredictionHead + nn_layers: 2 + hidden_dim: 512 +trainable_modules: [geo_encoder.encoder_branches.1, prediction_head] +normalize_features: false +metrics: ${metrics} +optimizer: + _target_: 
torch.optim.Adam + _partial_: true + lr: 0.0001 + weight_decay: 0.0 +scheduler: + _target_: torch.optim.lr_scheduler.ReduceLROnPlateau + _partial_: true + mode: min + factor: 0.1 + patience: 10 +loss_fn: + _target_: torch.nn.MSELoss diff --git a/configs/model/heat_geoclip_best.yaml b/configs/model/heat_geoclip_best.yaml new file mode 100644 index 0000000..9363260 --- /dev/null +++ b/configs/model/heat_geoclip_best.yaml @@ -0,0 +1,23 @@ +_target_: src.models.predictive_model.PredictiveModel +geo_encoder: + _target_: src.models.components.geo_encoders.geoclip.GeoClipCoordinateEncoder +prediction_head: + _target_: src.models.components.pred_heads.mlp_regression_head.MLPRegressionPredictionHead + nn_layers: 2 + hidden_dim: 128 +trainable_modules: [prediction_head] +normalize_features: true +metrics: ${metrics} +optimizer: + _target_: torch.optim.Adam + _partial_: true + lr: 0.001 + weight_decay: 0.0 +scheduler: + _target_: torch.optim.lr_scheduler.ReduceLROnPlateau + _partial_: true + mode: min + factor: 0.1 + patience: 10 +loss_fn: + _target_: torch.nn.MSELoss diff --git a/configs/model/heat_tabular_best.yaml b/configs/model/heat_tabular_best.yaml new file mode 100644 index 0000000..a5b9f4d --- /dev/null +++ b/configs/model/heat_tabular_best.yaml @@ -0,0 +1,25 @@ +_target_: src.models.predictive_model.PredictiveModel +geo_encoder: + _target_: src.models.components.geo_encoders.tabular_encoder.TabularEncoder + output_dim: 64 + geo_data_name: tabular +prediction_head: + _target_: src.models.components.pred_heads.mlp_regression_head.MLPRegressionPredictionHead + nn_layers: 3 + hidden_dim: 512 +trainable_modules: [geo_encoder, prediction_head] +normalize_features: false +metrics: ${metrics} +optimizer: + _target_: torch.optim.Adam + _partial_: true + lr: 0.0001 + weight_decay: 0.0001 +scheduler: + _target_: torch.optim.lr_scheduler.ReduceLROnPlateau + _partial_: true + mode: min + factor: 0.1 + patience: 10 +loss_fn: + _target_: torch.nn.MSELoss diff --git 
a/configs/model/heat_tessera_avg_best.yaml b/configs/model/heat_tessera_avg_best.yaml new file mode 100644 index 0000000..b68ac43 --- /dev/null +++ b/configs/model/heat_tessera_avg_best.yaml @@ -0,0 +1,24 @@ +_target_: src.models.predictive_model.PredictiveModel +geo_encoder: + _target_: src.models.components.geo_encoders.average_encoder.AverageEncoder + geo_data_name: tessera +prediction_head: + _target_: src.models.components.pred_heads.mlp_regression_head.MLPRegressionPredictionHead + nn_layers: 2 + hidden_dim: 512 +trainable_modules: [prediction_head] +normalize_features: true +metrics: ${metrics} +optimizer: + _target_: torch.optim.Adam + _partial_: true + lr: 0.001 + weight_decay: 0.0 +scheduler: + _target_: torch.optim.lr_scheduler.ReduceLROnPlateau + _partial_: true + mode: min + factor: 0.1 + patience: 10 +loss_fn: + _target_: torch.nn.MSELoss diff --git a/configs/model/heat_tessera_cnn_best.yaml b/configs/model/heat_tessera_cnn_best.yaml new file mode 100644 index 0000000..aa4af72 --- /dev/null +++ b/configs/model/heat_tessera_cnn_best.yaml @@ -0,0 +1,28 @@ +_target_: src.models.predictive_model.PredictiveModel +geo_encoder: + _target_: src.models.components.geo_encoders.cnn_encoder.CNNEncoder + geo_data_name: tessera + resnet_version: 34 + pretrained_cnn: imagenet + freezing_strategy: all + output_dim: 256 +prediction_head: + _target_: src.models.components.pred_heads.mlp_regression_head.MLPRegressionPredictionHead + nn_layers: 2 + hidden_dim: 1024 +trainable_modules: [geo_encoder, prediction_head] +normalize_features: false +metrics: ${metrics} +optimizer: + _target_: torch.optim.Adam + _partial_: true + lr: 0.001 + weight_decay: 0.0 +scheduler: + _target_: torch.optim.lr_scheduler.ReduceLROnPlateau + _partial_: true + mode: min + factor: 0.1 + patience: 10 +loss_fn: + _target_: torch.nn.MSELoss diff --git a/src/data/base_dataset.py b/src/data/base_dataset.py index bd727d7..da276f1 100644 --- a/src/data/base_dataset.py +++ b/src/data/base_dataset.py @@ 
-237,7 +237,7 @@ def setup_tessera(self) -> None: if fname not in avail_files: print(f"Retrieving missing Tessera data: {fname}") gt = gt or GeoTessera(cache_dir=self.cache_dir) - get_tessera_embeds(rec.lon, rec.lat, rec.name_loc, year, dst_dir, size) + get_tessera_embeds(rec["lon"], rec["lat"], rec["name_loc"], year, dst_dir, size,gt) @final def setup_aef(self) -> None: diff --git a/src/data/heat_guatemala_dataset.py b/src/data/heat_guatemala_dataset.py index f4fabac..4607ba9 100644 --- a/src/data/heat_guatemala_dataset.py +++ b/src/data/heat_guatemala_dataset.py @@ -15,9 +15,9 @@ from typing import Any, Dict, override import torch - +import os from src.data.base_dataset import BaseDataset - +import numpy as np class HeatGuatemalaDataset(BaseDataset): """Dataset for the urban heat island use case (Guatemala City, LST regression). @@ -88,6 +88,17 @@ def __getitem__(self, idx: int) -> Dict[str, Any]: sample["eo"]["coords"] = torch.tensor( [row["lat"], row["lon"]], dtype=torch.float32 ) + elif modality == "tessera": + path = row["tessera_path"] + if path is not None and os.path.exists(path): + arr = np.load(path).transpose(2, 0, 1) + else: + arr = np.zeros((128, 10, 10), dtype=np.float32) + tess = torch.tensor(arr, dtype=torch.float32) + sample["eo"]["tessera"] = tess + sample["tessera"] = tess + + # --- Tabular features (always included if present in CSV) --- if self.use_features and self.feat_names: diff --git a/src/models/components/geo_encoders/cnn_encoder.py b/src/models/components/geo_encoders/cnn_encoder.py index 34f8f48..909074f 100644 --- a/src/models/components/geo_encoders/cnn_encoder.py +++ b/src/models/components/geo_encoders/cnn_encoder.py @@ -148,12 +148,12 @@ def forward( :param batch: input batch :return: extracted features """ - eo_data = batch.get("eo", {}) + eo_data = batch[self.geo_data_name] dtype = self.dtype if eo_data.dtype != dtype: eo_data = eo_data.to(dtype) - feats = self.geo_encoder(eo_data[self.geo_data_name]) + feats = 
self.geo_encoder(eo_data)#self.geo_encoder(eo_data[self.geo_data_name]) # n_nans = torch.sum(torch.isnan(feats)).item() # assert ( # n_nans == 0 From f1d8050f6a9c84b37715d216129b32c567f2fee2 Mon Sep 17 00:00:00 2001 From: bachir Date: Mon, 30 Mar 2026 12:19:08 +0200 Subject: [PATCH 2/7] fix size hardcoding --- src/data/heat_guatemala_dataset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/data/heat_guatemala_dataset.py b/src/data/heat_guatemala_dataset.py index 4607ba9..e9a640d 100644 --- a/src/data/heat_guatemala_dataset.py +++ b/src/data/heat_guatemala_dataset.py @@ -93,7 +93,10 @@ def __getitem__(self, idx: int) -> Dict[str, Any]: if path is not None and os.path.exists(path): arr = np.load(path).transpose(2, 0, 1) else: - arr = np.zeros((128, 10, 10), dtype=np.float32) + size = self.modalities["tessera"].get("size", 10) + arr = np.zeros((128, size, size), dtype=np.float32) + n_bands = self.modalities["tessera"].get("n_bands", 128) + arr = np.zeros((n_bands, size, size), dtype=np.float32) tess = torch.tensor(arr, dtype=torch.float32) sample["eo"]["tessera"] = tess sample["tessera"] = tess From c7fd3023562c6fee20e9252598005183a647a7a5 Mon Sep 17 00:00:00 2001 From: bachir Date: Mon, 30 Mar 2026 12:39:17 +0200 Subject: [PATCH 3/7] fix size hardcoding --- src/data/heat_guatemala_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/data/heat_guatemala_dataset.py b/src/data/heat_guatemala_dataset.py index e9a640d..6edc4db 100644 --- a/src/data/heat_guatemala_dataset.py +++ b/src/data/heat_guatemala_dataset.py @@ -94,7 +94,6 @@ def __getitem__(self, idx: int) -> Dict[str, Any]: arr = np.load(path).transpose(2, 0, 1) else: size = self.modalities["tessera"].get("size", 10) - arr = np.zeros((128, size, size), dtype=np.float32) n_bands = self.modalities["tessera"].get("n_bands", 128) arr = np.zeros((n_bands, size, size), dtype=np.float32) tess = torch.tensor(arr, dtype=torch.float32) From a219590c558128fb5afb8df604753e35aea00be7 Mon 
Sep 17 00:00:00 2001 From: bachir Date: Mon, 15 Jun 2026 17:40:41 +0200 Subject: [PATCH 4/7] fix(data): handle Hydra DictConfig in use_aux_data; safe setup_tessera dropping --- src/data/base_dataset.py | 124 ++++++++++++++++++++++++++++++--------- 1 file changed, 96 insertions(+), 28 deletions(-) diff --git a/src/data/base_dataset.py b/src/data/base_dataset.py index d39e66a..e8c606e 100644 --- a/src/data/base_dataset.py +++ b/src/data/base_dataset.py @@ -11,6 +11,7 @@ from src.utils.data_utils import center_crop_npy from src.utils.errors import MissingDataError +from omegaconf import DictConfig, OmegaConf TORCH_DTYPES = { "float32": torch.float32, @@ -121,23 +122,45 @@ def __init__( self.use_target_data = use_target_data self.use_features = use_features - if use_aux_data is None or use_aux_data == "all": + if isinstance(use_aux_data, DictConfig): + use_aux_data = OmegaConf.to_container(use_aux_data, resolve=True) + + if use_aux_data is None: + self.use_aux_data = None + + elif use_aux_data == "all": self.use_aux_data = { "aux": { "pattern": "^aux_(?!.*top).*", - # 'columns' : [] }, "top": { "pattern": "^aux_.*top.*", - # 'columns' : [] }, } - elif type(use_aux_data) is dict: + elif isinstance(use_aux_data, dict): self.use_aux_data = use_aux_data + else: self.use_aux_data = None + # if use_aux_data is None or use_aux_data == "all": + # self.use_aux_data = { + # "aux": { + # "pattern": "^aux_(?!.*top).*", + # # 'columns' : [] + # }, + # "top": { + # "pattern": "^aux_.*top.*", + # # 'columns' : [] + # }, + # } + + # elif type(use_aux_data) is dict: + # self.use_aux_data = use_aux_data + # else: + # self.use_aux_data = None + # More precise dataset name (with modalities) if isinstance(dataset_name, list): dataset_name = "+".join(dataset_name) @@ -280,35 +303,80 @@ def setup_tessera(self) -> None: else: from geotessera import GeoTessera - print("Downloading missing Tessera tiles...") - print("[Warning]: it may download tessera tiles filled with 0a") - - avail_files = 
os.listdir(dst_dir) + print("Checking missing Tessera tiles...") + avail_files = set(os.listdir(dst_dir)) gt = None - for i, rec in enumerate(self.records): + kept_records = [] + missing_files = [] + + for rec in self.records: fname = os.path.basename(rec["tessera_path"]) - if fname not in avail_files: - if download_missing_tiles: - print(f"Retrieving missing Tessera data: {fname}") - gt = gt or GeoTessera(cache_dir=self.cache_dir) - row = self.df[self.df["name_loc"] == rec["name_loc"]] - lon, lat = row.lon.item(), row.lat.item() - try: - get_tessera_embeds( - lon, - lat, - rec["name_loc"], - year=year, - save_dir=dst_dir, - tile_size=size, - tessera_con=gt, - ) + + if fname in avail_files: + kept_records.append(rec) + continue + + if download_missing_tiles: + print(f"Retrieving missing Tessera data: {fname}") + gt = gt or GeoTessera(cache_dir=self.cache_dir) + row = self.df[self.df["name_loc"] == rec["name_loc"]] + lon, lat = row.lon.item(), row.lat.item() + + try: + get_tessera_embeds( + lon, + lat, + rec["name_loc"], + year=year, + save_dir=dst_dir, + tile_size=size, + tessera_con=gt, + ) + if os.path.exists(rec["tessera_path"]): + kept_records.append(rec) + avail_files.add(fname) continue - except Exception as e: - print(f"Tile for {fname} could not be retrieved. Error: {e}") - self.records.pop(i) + except Exception as e: + print(f"Tile for {fname} could not be retrieved. 
Error: {e}") + + missing_files.append(fname) print(f"No tile found for {fname} thus it will not be used.") + self.records = kept_records + + if missing_files: + print(f"Dropped {len(missing_files)} records with missing Tessera tiles.") + # from geotessera import GeoTessera + + # print("Downloading missing Tessera tiles...") + # print("[Warning]: it may download tessera tiles filled with 0a") + + # avail_files = os.listdir(dst_dir) + # gt = None + # for i, rec in enumerate(self.records): + # fname = os.path.basename(rec["tessera_path"]) + # if fname not in avail_files: + # if download_missing_tiles: + # print(f"Retrieving missing Tessera data: {fname}") + # gt = gt or GeoTessera(cache_dir=self.cache_dir) + # row = self.df[self.df["name_loc"] == rec["name_loc"]] + # lon, lat = row.lon.item(), row.lat.item() + # try: + # get_tessera_embeds( + # lon, + # lat, + # rec["name_loc"], + # year=year, + # save_dir=dst_dir, + # tile_size=size, + # tessera_con=gt, + # ) + # continue + # except Exception as e: + # print(f"Tile for {fname} could not be retrieved. Error: {e}") + # self.records.pop(i) + # print(f"No tile found for {fname} thus it will not be used.") + @final def setup_aef(self) -> None: """Download full dataset or the missing AEF tiles. 
From d889382b6241cff4208e01cb5ffb78d13c70a12a Mon Sep 17 00:00:00 2001 From: bachir Date: Mon, 15 Jun 2026 17:43:48 +0200 Subject: [PATCH 5/7] feat(heat-guatemala): Tessera+text caption-alignment use case --- configs/data/heat_guatemala_tessera_text.yaml | 65 +++++++++++++++ configs/experiment/heat_alignment.yaml | 25 ++++++ configs/model/heat_tessera_alignment.yaml | 37 +++++++++ data/heat_guatemala/concept_captions/v1.json | 82 +++++++++++++++++++ .../location_caption_templates/v1.json | 30 +++++++ src/data/heat_guatemala_caption_builder.py | 64 +++++++++++++++ .../build_aux_from_original.py | 72 ++++++++++++++++ 7 files changed, 375 insertions(+) create mode 100644 configs/data/heat_guatemala_tessera_text.yaml create mode 100644 configs/experiment/heat_alignment.yaml create mode 100644 configs/model/heat_tessera_alignment.yaml create mode 100644 data/heat_guatemala/concept_captions/v1.json create mode 100644 data/heat_guatemala/location_caption_templates/v1.json create mode 100644 src/data/heat_guatemala_caption_builder.py create mode 100644 src/data_preprocessing/build_aux_from_original.py diff --git a/configs/data/heat_guatemala_tessera_text.yaml b/configs/data/heat_guatemala_tessera_text.yaml new file mode 100644 index 0000000..4ee12cd --- /dev/null +++ b/configs/data/heat_guatemala_tessera_text.yaml @@ -0,0 +1,65 @@ +# Alignment data config for the Guatemala LST use case (Tessera + expert-legend captions). +# +# Requirements before running: +# 1. The model-ready CSV the dataset loads from ${paths.data_dir} must contain the +# aux_* columns produced by src/data_preprocessing/build_aux_from_original.py (numeric + *_label). +# 2. Tessera tiles at ${paths.data_dir}/heat_guatemala/eo/tessera/tessera_.npy +# (10x10, 128-band) — see the tessera preprocessing step. 
+_target_: src.data.base_datamodule.BaseDataModule + +dataset: + _target_: src.data.heat_guatemala_dataset.HeatGuatemalaDataset + data_dir: ${paths.data_dir} + modalities: + tessera: + year: 2024 + size: 10 + format: npy + use_target_data: false # alignment is contrastive geo<->text; LST enters only as a concept + use_features: false # Tessera-only EO branch (your results: fusion hurts), avoids circularity + use_aux_data: + aux: # numeric raw values -> concept theta_k / retrieval ground truth + columns: + - aux_ndvi_mean + - aux_ndwi_mean + - aux_forest_cover_perc + - aux_tree_cover_perc + - aux_builtup_age_years + - aux_slope_perc + - aux_socioeconomic + - aux_lst + top: # expert-legend label strings -> caption text + columns: + - aux_ndvi_label + - aux_ndwi_label + - aux_forest_label + - aux_age_label + - aux_slope_label + - aux_socio_label + - aux_height_label + - aux_density_label + - aux_landuse + - aux_blocktype + - aux_interzone + seed: ${seed} + cache_dir: ${paths.cache_dir} + +caption_builder: + _target_: src.data.heat_guatemala_caption_builder.HeatGuatemalaCaptionBuilder + templates_fname: v1.json + concepts_fname: v1.json + data_dir: ${paths.data_dir}/heat_guatemala + seed: ${seed} + +batch_size: 64 +num_workers: 8 +pin_memory: true + +#split_mode: "spatial_clusters" # honest val/test: hold out whole areas (city blocks autocorrelate) +#spatial_split_distance_m: 500 +#split_mode: "random" +split_mode: "from_file" +saved_split_file_name: "split_indices_heat_guatemala_2026-02-20-1148.pth" +train_val_test_split: [0.7, 0.15, 0.15] +save_split: false +seed: ${seed} diff --git a/configs/experiment/heat_alignment.yaml b/configs/experiment/heat_alignment.yaml new file mode 100644 index 0000000..8a8e109 --- /dev/null +++ b/configs/experiment/heat_alignment.yaml @@ -0,0 +1,25 @@ +# @package _global_ +# Run: python src/train.py experiment=heat_alignment +# Smoke test (weak, expected): python src/train.py experiment=heat_alignment \ +# model=geoclip_alignment 
data.dataset.modalities='{coords: {}}' +defaults: + - override /model: heat_tessera_alignment + - override /data: heat_guatemala_tessera_text + - override /metrics: contrastive_similarities + +tags: ["alignment", "heat_island", "guatemala", "tessera"] +seed: 12345 + +trainer: + min_epochs: 10 + max_epochs: 100 + +data: + batch_size: 64 + +logger: + wandb: + tags: ${tags} + group: "heat_alignment" + aim: + experiment: "heat_alignment" diff --git a/configs/model/heat_tessera_alignment.yaml b/configs/model/heat_tessera_alignment.yaml new file mode 100644 index 0000000..622b015 --- /dev/null +++ b/configs/model/heat_tessera_alignment.yaml @@ -0,0 +1,37 @@ +# Tessera-only alignment model for the Guatemala LST use case. +# EO branch = average-pooled Tessera (your best regressor: Tessera avg, R2=0.733). +# No fusion (fusion hurt in your experiments). CLIP text tower + GeoCLIP mlp stay frozen; +# only the text projector, the auto-added geo projector, and the temperature train. +_target_: src.models.text_alignment_model.TextAlignmentModel + +geo_encoder: + _target_: src.models.components.geo_encoders.average_encoder.AverageEncoder + geo_data_name: tessera # averages the 10x10x128 tile -> 128-d vector + +text_encoder: + _target_: src.models.components.text_encoders.clip_text_encoder.ClipTextEncoder + hf_cache_dir: ${paths.huggingface_cache} + +# geo (128) != text (512); match_to_geo=false projects the GEO side up to 512. +# _setup() auto-adds & trains geo_encoder.extra_projector. 
+match_to_geo: false +trainable_modules: [text_encoder.projector, loss_fn.log_temp] + +metrics: ${metrics} + +optimizer: + _target_: torch.optim.Adam + _partial_: true + lr: 0.001 + weight_decay: 0.0 + +scheduler: + _target_: torch.optim.lr_scheduler.ReduceLROnPlateau + _partial_: true + mode: min + factor: 0.1 + patience: 10 + +loss_fn: + _target_: src.models.components.loss_fns.clip_loss.ClipLoss + temperature: 0.07 diff --git a/data/heat_guatemala/concept_captions/v1.json b/data/heat_guatemala/concept_captions/v1.json new file mode 100644 index 0000000..c4a4977 --- /dev/null +++ b/data/heat_guatemala/concept_captions/v1.json @@ -0,0 +1,82 @@ +[ + { + "concept_caption": "Densely vegetated green area with abundant healthy vegetation", + "is_max": true, + "theta_k": 0.334, + "col": "aux_ndvi_mean" + }, + { + "concept_caption": "Sparsely vegetated area with little greenery", + "is_max": false, + "theta_k": 0.112, + "col": "aux_ndvi_mean" + }, + { + "concept_caption": "Moist surface with no vegetation water stress", + "is_max": true, + "theta_k": 0.097, + "col": "aux_ndwi_mean" + }, + { + "concept_caption": "Dry surface with vegetation under drought stress", + "is_max": false, + "theta_k": -0.053, + "col": "aux_ndwi_mean" + }, + { + "concept_caption": "Forested area with substantial tree canopy cover", + "is_max": true, + "theta_k": 34.244, + "col": "aux_forest_cover_perc" + }, + { + "concept_caption": "Area with dense tropical tree cover", + "is_max": true, + "theta_k": 43.0, + "col": "aux_tree_cover_perc" + }, + { + "concept_caption": "Established, older built-up urban fabric", + "is_max": true, + "theta_k": 30.0, + "col": "aux_builtup_age_years" + }, + { + "concept_caption": "Flat, low-lying terrain", + "is_max": false, + "theta_k": 3.092, + "col": "aux_slope_perc" + }, + { + "concept_caption": "Hilly terrain with higher relief", + "is_max": true, + "theta_k": 19.775, + "col": "aux_slope_perc" + }, + { + "concept_caption": "High socioeconomic quality 
neighbourhood", + "is_max": true, + "theta_k": 4.0, + "col": "aux_socioeconomic" + }, + { + "concept_caption": "Low socioeconomic quality neighbourhood", + "is_max": false, + "theta_k": 2.0, + "col": "aux_socioeconomic" + }, + { + "concept_caption": "Hot surface above the urban heat-stress limit", + "is_max": true, + "theta_k": 26.592, + "col": "aux_lst", + "note": "month-confounded: compute/evaluate on March-only blocks for a clean signal" + }, + { + "concept_caption": "Cool surface well under the heat-stress limit", + "is_max": false, + "theta_k": 23.69, + "col": "aux_lst", + "note": "month-confounded: compute/evaluate on March-only blocks for a clean signal" + } +] \ No newline at end of file diff --git a/data/heat_guatemala/location_caption_templates/v1.json b/data/heat_guatemala/location_caption_templates/v1.json new file mode 100644 index 0000000..de16c76 --- /dev/null +++ b/data/heat_guatemala/location_caption_templates/v1.json @@ -0,0 +1,30 @@ +[ + "Urban location dominated by , with and .", + "Area of showing , under .", + "Built-up block of type , characterised by and .", + "Location of with , showing .", + "Neighbourhood of with , on .", + "Site characterised by , influenced by and .", + "Urban area with , showing and .", + "Location with and , on .", + "Area of on , with .", + "Built environment of type with , under .", + "Site with and , characterised by .", + "Location in interior zone , dominated by , with .", + "Built-up area with and , of .", + "Green location with and , under .", + "Area of with , showing .", + "Location on with , in interior zone .", + "Urban block of type with , under .", + "Site dominated by , with and .", + "Location characterised by , and .", + "Area of with , influenced by .", + "Residential area of , showing and .", + "Location dominated by with , on .", + "Built-up site of type , under with .", + "Area showing and , in interior zone .", + "Location of with , characterised by .", + "Urban location of with fabric, showing 
.", + "Site on with and .", + "Area dominated by , with , and ." +] \ No newline at end of file diff --git a/src/data/heat_guatemala_caption_builder.py b/src/data/heat_guatemala_caption_builder.py new file mode 100644 index 0000000..76d63e0 --- /dev/null +++ b/src/data/heat_guatemala_caption_builder.py @@ -0,0 +1,64 @@ +"""Caption builder for the Guatemala City urban-heat (LST) use case. + +Two aux categories are used (configured in the data yaml under `use_aux_data`): + + * ``aux`` – numeric raw columns (NDVI, NDWI, slope, built-up age, LST, ...). + These feed the concept retrieval evaluation: each concept's + ``theta_k`` is compared directly against these raw values. + * ``top`` – expert-legend *label* columns (e.g. ``aux_ndvi_label`` = + "high vegetation greenness", ``aux_density_label`` = + "very dense urban", ``aux_landuse`` = "discontinuous urban"). + These fill the ``<...>`` tokens in the location-caption templates, + so the training text uses the authoritative expert wording from + ``Heat_Guatemala.csv``. + +This mirrors the continuous (butterfly) paradigm for concepts, while taking the +caption *words* straight from the legend rather than re-deriving them — the two +are produced from the same block by build_aux_from_original.py, so they stay +consistent. The LST label is deliberately NOT used in any template (the target +must not leak into the training captions); LST appears only as a concept. 
+""" + +from typing import Dict, List, override + +import torch + +from src.data.base_caption_builder import BaseCaptionBuilder +from src.data.base_dataset import BaseDataset + + +class HeatGuatemalaCaptionBuilder(BaseCaptionBuilder): + @override + def sync_with_dataset(self, dataset: BaseDataset) -> None: + """Index numeric aux columns (for concepts) and label columns (for text).""" + # numeric aux -> id (position in the 'aux' tensor); used by sync_concepts() + self.column_to_metadata_map = {"aux": {}} + for i, col in enumerate(dataset.use_aux_data.get("aux", [])): + self.column_to_metadata_map["aux"][col] = {"id": i} + + # label aux -> position in the per-row 'top' list; used to fill template tokens + self.top_index: Dict[str, int] = { + col: i for i, col in enumerate(dataset.use_aux_data.get("top", [])) + } + + # wires concept["id"] from the numeric aux map (raises if a concept col is missing) + self.sync_concepts() + + @override + def _build_from_template( + self, template_idx: int, aux: torch.Tensor, top: List[str] | None = None + ) -> str: + template = self.templates[template_idx] + fillers: Dict[str, str] = {} + for token in self.tokens_in_template[template_idx]: + if token in self.top_index and top is not None: + fillers[token] = str(top[self.top_index[token]]) + elif token in self.column_to_metadata_map["aux"]: + # numeric fallback (default templates don't use numeric tokens) + fillers[token] = f"{aux[self.column_to_metadata_map['aux'][token]['id']].item():.2f}" + else: + raise KeyError( + f"Token '{token}' is neither a label ('top') nor a numeric ('aux') " + "column in the dataset. Check the template and use_aux_data config." 
+ ) + return self._fill(template, fillers) diff --git a/src/data_preprocessing/build_aux_from_original.py b/src/data_preprocessing/build_aux_from_original.py new file mode 100644 index 0000000..e1dd875 --- /dev/null +++ b/src/data_preprocessing/build_aux_from_original.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +""" +Build aux_* columns for the Guatemala LST captioning, by joining: + - RAW numbers (model_ready_heat_guatemala.csv, NON-standardized) -> concept theta_k + - EXPERT words (Heat_Guatemala.csv, the original legend) -> caption text + +Join key: BLOCK_ID = int(name_loc[5:]) (verified 1:1, lat diff 0, LST diff <0.005). + +Output: a copy of the raw model-ready CSV with aux_* columns appended, ready for +the alignment datamodule (which selects columns by the regex ^aux_). +""" +import argparse +import re + +import pandas as pd + + +def cls(s): + """Take the human class from a legend string ('<0.5 NDVI greenness : high' -> 'high').""" + if pd.isna(s): + return None + s = str(s).strip() + return s.split(":")[-1].strip() if ":" in s else s + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--raw", required=True, help="model_ready_heat_guatemala.csv (RAW, not _in_)") + ap.add_argument("--legend", required=True, help="Heat_Guatemala.csv (original legend)") + ap.add_argument("--out", required=True) + args = ap.parse_args() + + raw = pd.read_csv(args.raw, low_memory=False) + leg = pd.read_csv(args.legend, encoding="cp1252", low_memory=False) + raw["BLOCK_ID"] = raw["name_loc"].str.replace("heat_", "", regex=False).astype(int) + df = raw.merge(leg, on="BLOCK_ID", how="left") + + new = {} + # numeric aux (raw scale) -> concept ground truth / theta_k + num = { + "aux_ndvi_mean": "feat_ndvi_mean2022", "aux_ndwi_mean": "feat_ndwi_mean2022", + "aux_forest_cover_perc": "feat_forcov_meanperc", "aux_tree_cover_perc": "feat_troptreecovperc", + "aux_builtup_age_years": "feat_bua_gaia_age_mean", "aux_slope_perc": "feat_dem5mslopeperc_mean", + 
"aux_socioeconomic": "feat_estrato_s", "aux_lst": "target_lst", + } + for a, f in num.items(): + new[a] = df[f] + if "feat_measurement_month" in df.columns: # present only if you kept raw month + new["aux_month"] = df["feat_measurement_month"] + + # word aux (authoritative expert legend) -> caption text + new["aux_ndvi_label"] = df["NDVI_mean2022"].map(cls).map(lambda x: f"{x} vegetation greenness" if x else x) + new["aux_ndwi_label"] = df["NDWI_mean2022"].map(cls).map(lambda x: x if x and "stress" in x else (f"{x} drought stress" if x else x)) + new["aux_socio_label"] = df["SocioEconomicQuality"].map(cls).map(lambda x: f"{x} socioeconomic quality" if x else x) + new["aux_slope_label"] = df["DEM5m_Slope%"].map(cls).map(lambda x: x.lower() if x else x) + new["aux_forest_label"] = df["Hansen_ForestCover_meanPerc"].map(cls) + new["aux_age_label"] = df["BUA_GAIA_Age_Mean"].map(cls) + new["aux_height_label"] = df["CopenicusMSZ_BuildingHeightM"].map(cls) # words OK; raw number is unit-broken + new["aux_density_label"] = df["PopulationDensityPerKm2"].map(cls) # urban-form label + new["aux_lst_label"] = df["LST_mean_predictor_Classified"].map(cls) + new["aux_landuse"] = df["BlockMAGADominantLanduse"].astype(str).str.strip().str.lower() + new["aux_blocktype"] = df["BlockType"].astype(str).str.strip().str.lower() + new["aux_interzone"] = df["IntrZon"].astype(str).str.strip() + + out = pd.concat([raw.drop(columns=["BLOCK_ID"]), pd.DataFrame(new, index=raw.index)], axis=1) + out.to_csv(args.out, index=False) + print(f"wrote {out.shape[0]} rows, {sum(k.startswith('aux_') for k in out.columns)} aux columns -> {args.out}") + print("nulls in aux columns:", int(out[[c for c in out.columns if c.startswith('aux_')]].isna().sum().sum())) + + +if __name__ == "__main__": + main() From f25349ff096baea8bf259b0f2d8dab8ca2ae82c1 Mon Sep 17 00:00:00 2001 From: bachir Date: Wed, 17 Jun 2026 15:49:38 +0200 Subject: [PATCH 6/7] clean base_dataset.py --- src/data/base_dataset.py | 50 
+++------------------------------------- 1 file changed, 3 insertions(+), 47 deletions(-) diff --git a/src/data/base_dataset.py b/src/data/base_dataset.py index e8c606e..aef9390 100644 --- a/src/data/base_dataset.py +++ b/src/data/base_dataset.py @@ -144,23 +144,6 @@ def __init__( else: self.use_aux_data = None - # if use_aux_data is None or use_aux_data == "all": - # self.use_aux_data = { - # "aux": { - # "pattern": "^aux_(?!.*top).*", - # # 'columns' : [] - # }, - # "top": { - # "pattern": "^aux_.*top.*", - # # 'columns' : [] - # }, - # } - - # elif type(use_aux_data) is dict: - # self.use_aux_data = use_aux_data - # else: - # self.use_aux_data = None - # More precise dataset name (with modalities) if isinstance(dataset_name, list): dataset_name = "+".join(dataset_name) @@ -346,36 +329,9 @@ def setup_tessera(self) -> None: if missing_files: print(f"Dropped {len(missing_files)} records with missing Tessera tiles.") - # from geotessera import GeoTessera - - # print("Downloading missing Tessera tiles...") - # print("[Warning]: it may download tessera tiles filled with 0a") - - # avail_files = os.listdir(dst_dir) - # gt = None - # for i, rec in enumerate(self.records): - # fname = os.path.basename(rec["tessera_path"]) - # if fname not in avail_files: - # if download_missing_tiles: - # print(f"Retrieving missing Tessera data: {fname}") - # gt = gt or GeoTessera(cache_dir=self.cache_dir) - # row = self.df[self.df["name_loc"] == rec["name_loc"]] - # lon, lat = row.lon.item(), row.lat.item() - # try: - # get_tessera_embeds( - # lon, - # lat, - # rec["name_loc"], - # year=year, - # save_dir=dst_dir, - # tile_size=size, - # tessera_con=gt, - # ) - # continue - # except Exception as e: - # print(f"Tile for {fname} could not be retrieved. 
Error: {e}") - # self.records.pop(i) - # print(f"No tile found for {fname} thus it will not be used.") + + print("Downloading missing Tessera tiles...") + print("[Warning]: it may download tessera tiles filled with 0a") @final def setup_aef(self) -> None: From 69819b9d3e78be1fc7d394b08892f064f64dfbc4 Mon Sep 17 00:00:00 2001 From: bachir Date: Wed, 17 Jun 2026 15:51:43 +0200 Subject: [PATCH 7/7] rename preprocessing script --- .../build_heat_guatemala_aux.py | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 src/data_preprocessing/build_heat_guatemala_aux.py diff --git a/src/data_preprocessing/build_heat_guatemala_aux.py b/src/data_preprocessing/build_heat_guatemala_aux.py new file mode 100644 index 0000000..220f746 --- /dev/null +++ b/src/data_preprocessing/build_heat_guatemala_aux.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +""" +Build aux_* columns for the Guatemala LST captioning, by joining: + - RAW numbers (model_ready_heat_guatemala.csv, NON-standardized) -> concept theta_k + - EXPERT words (Heat_Guatemala.csv, the original legend) -> caption text + +Join key: BLOCK_ID = int(name_loc[5:]) (verified 1:1, lat diff 0, LST diff <0.005). + +Output: a copy of the raw model-ready CSV with aux_* columns appended, ready for +the alignment datamodule (which selects columns by the regex ^aux_). 
import argparse
import re

import pandas as pd

# Columns taken from the truly unstandardized source table. They are renamed
# with an explicit ``_unstd`` suffix before merging so they can never be
# shadowed by same-named columns coming from the raw or legend tables.
_UNSTD_COLUMNS = ("ForCov_meanPerc", "TropTreecovPerc", "BUA_GAIA_Age_Mean")


def cls(s):
    """Take the human class from a legend string ('<0.5 NDVI greenness : high' -> 'high').

    The class is the text after the last colon. Strings without a colon are
    returned stripped; missing values (NaN/None) are returned as ``None``.
    """
    if pd.isna(s):
        return None
    # rpartition yields ('', '', text) when there is no colon, so both the
    # "has colon" and "no colon" cases reduce to stripping the last piece.
    return str(s).strip().rpartition(":")[2].strip()


def _legend_labels(series, fmt=None):
    """Map legend cells to their class; non-empty classes are passed through *fmt*."""

    def convert(cell):
        label = cls(cell)
        # Empty / missing classes are passed through untouched so they stay
        # visible as nulls instead of becoming e.g. "None vegetation greenness".
        if fmt is None or not label:
            return label
        return fmt(label)

    return series.map(convert)


def _clean_text(series, lower=True):
    """Strip (and optionally lowercase) text cells while keeping missing values missing."""

    def convert(cell):
        # astype(str) would turn NaN into the literal string "nan", hiding
        # unmatched blocks from the null report printed at the end of main().
        if pd.isna(cell):
            return None
        text = str(cell).strip()
        return text.lower() if lower else text

    return series.map(convert)


def main(argv=None):
    """Join raw numbers, expert legend words and unstandardized values into aux_* columns.

    Reads three CSV files (``--raw``, ``--legend``, ``--unstd``), joins them on
    ``BLOCK_ID`` (derived from ``name_loc`` for the raw table and from
    ``OBJECTID`` for the unstandardized table) and writes a copy of the raw
    table with the ``aux_*`` columns appended to ``--out``.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv``, which is the script behaviour.

    Raises:
        pandas.errors.MergeError: If ``BLOCK_ID`` is duplicated in the legend
            or in the unstandardized table (the join would multiply rows and
            misalign the appended columns).
        KeyError: If an expected input column is missing.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--raw", required=True, help="model_ready_heat_guatemala.csv (RAW, not _in_)")
    ap.add_argument("--legend", required=True, help="Heat_Guatemala.csv (original legend)")
    ap.add_argument("--out", required=True)
    ap.add_argument("--unstd", required=True, help="Heat_Guatemala_raw.csv (true unstandardized source)")
    args = ap.parse_args(argv)

    raw = pd.read_csv(args.raw, low_memory=False)
    # Drop aux_* columns left over from a previous run so the script can be
    # re-applied to its own output without producing duplicate columns.
    raw = raw.drop(columns=[c for c in raw.columns if c.startswith("aux_")], errors="ignore")
    leg = pd.read_csv(args.legend, encoding="cp1252", low_memory=False)
    unstd = pd.read_csv(args.unstd, encoding="cp1252", low_memory=False)

    raw["BLOCK_ID"] = raw["name_loc"].str.replace("heat_", "", regex=False).astype(int)
    unstd = unstd.rename(columns={"OBJECTID": "BLOCK_ID"})
    unstd = unstd[["BLOCK_ID", *_UNSTD_COLUMNS]].rename(
        columns={c: f"{c}_unstd" for c in _UNSTD_COLUMNS}
    )

    # validate= makes a non-unique BLOCK_ID on the right-hand side fail loudly
    # instead of silently multiplying rows.
    df = raw.merge(leg, on="BLOCK_ID", how="left", validate="many_to_one")
    df = df.merge(unstd, on="BLOCK_ID", how="left", validate="many_to_one")

    new = {}
    # numeric aux (raw scale) -> concept ground truth / theta_k
    num = {
        "aux_ndvi_mean": "feat_ndvi_mean2022",
        "aux_ndwi_mean": "feat_ndwi_mean2022",
        "aux_forest_cover_perc": "ForCov_meanPerc_unstd",
        "aux_tree_cover_perc": "TropTreecovPerc_unstd",
        "aux_builtup_age_years": "BUA_GAIA_Age_Mean_unstd",
        "aux_slope_perc": "feat_dem5mslopeperc_mean",
        "aux_socioeconomic": "feat_estrato_s",
        "aux_lst": "target_lst",
    }
    for aux_name, source_col in num.items():
        new[aux_name] = df[source_col]
    if "feat_measurement_month" in df.columns:  # present only if you kept raw month
        new["aux_month"] = df["feat_measurement_month"]

    # word aux (authoritative expert legend) -> caption text
    new["aux_ndvi_label"] = _legend_labels(
        df["NDVI_mean2022"], lambda x: f"{x} vegetation greenness"
    )
    new["aux_ndwi_label"] = _legend_labels(
        df["NDWI_mean2022"], lambda x: x if "stress" in x else f"{x} drought stress"
    )
    new["aux_socio_label"] = _legend_labels(
        df["SocioEconomicQuality"], lambda x: f"{x} socioeconomic quality"
    )
    new["aux_slope_label"] = _legend_labels(df["DEM5m_Slope%"], str.lower)
    new["aux_forest_label"] = _legend_labels(df["Hansen_ForestCover_meanPerc"])
    new["aux_age_label"] = _legend_labels(df["BUA_GAIA_Age_Mean"])
    # words OK; raw number is unit-broken
    new["aux_height_label"] = _legend_labels(df["CopenicusMSZ_BuildingHeightM"])
    # urban-form label
    new["aux_density_label"] = _legend_labels(df["PopulationDensityPerKm2"])
    new["aux_lst_label"] = _legend_labels(df["LST_mean_predictor_Classified"])
    new["aux_landuse"] = _clean_text(df["BlockMAGADominantLanduse"])
    new["aux_blocktype"] = _clean_text(df["BlockType"])
    new["aux_interzone"] = _clean_text(df["IntrZon"], lower=False)

    out = pd.concat(
        [raw.drop(columns=["BLOCK_ID"]), pd.DataFrame(new, index=raw.index)], axis=1
    )
    out.to_csv(args.out, index=False)

    aux_cols = [c for c in out.columns if c.startswith("aux_")]
    print(f"wrote {out.shape[0]} rows, {sum(k.startswith('aux_') for k in out.columns)} aux columns -> {args.out}")
    print("nulls in aux columns:", int(out[aux_cols].isna().sum().sum()))


if __name__ == "__main__":
    main()