From 66ab177faa1ef240d7928d6a8c6a7a16496efae4 Mon Sep 17 00:00:00 2001
From: bachir <bbel@nilu.no>
Date: Fri, 27 Mar 2026 09:57:35 +0100
Subject: [PATCH 1/7] feat(uc3-uhi): tessera integration

---
 .../heat_guatemala_full_fusion_avg_best.yaml  | 34 +++++++++++++++++
 .../heat_guatemala_full_fusion_cnn_best.yaml  | 34 +++++++++++++++++
 .../heat_guatemala_fusion_best.yaml           | 27 +++++++++++++
 .../heat_guatemala_geoclip_best.yaml          | 27 +++++++++++++
 .../heat_guatemala_tabular_best.yaml          | 27 +++++++++++++
 .../heat_guatemala_tessera_avg_best.yaml      | 34 +++++++++++++++++
 .../heat_guatemala_tessera_cnn_best.yaml      | 34 +++++++++++++++++
 configs/model/heat_full_fusion_avg_best.yaml  | 34 +++++++++++++++++
 configs/model/heat_full_fusion_cnn_best.yaml  | 38 +++++++++++++++++++
 configs/model/heat_fusion_best.yaml           | 31 +++++++++++++++
 configs/model/heat_geoclip_best.yaml          | 23 +++++++++++
 configs/model/heat_tabular_best.yaml          | 25 ++++++++++++
 configs/model/heat_tessera_avg_best.yaml      | 24 ++++++++++++
 configs/model/heat_tessera_cnn_best.yaml      | 28 ++++++++++++++
 src/data/base_dataset.py                      |  2 +-
 src/data/heat_guatemala_dataset.py            | 15 +++++++-
 .../components/geo_encoders/cnn_encoder.py    |  4 +-
 17 files changed, 436 insertions(+), 5 deletions(-)
 create mode 100644 configs/experiment/heat_guatemala_full_fusion_avg_best.yaml
 create mode 100644 configs/experiment/heat_guatemala_full_fusion_cnn_best.yaml
 create mode 100644 configs/experiment/heat_guatemala_fusion_best.yaml
 create mode 100644 configs/experiment/heat_guatemala_geoclip_best.yaml
 create mode 100644 configs/experiment/heat_guatemala_tabular_best.yaml
 create mode 100644 configs/experiment/heat_guatemala_tessera_avg_best.yaml
 create mode 100644 configs/experiment/heat_guatemala_tessera_cnn_best.yaml
 create mode 100644 configs/model/heat_full_fusion_avg_best.yaml
 create mode 100644 configs/model/heat_full_fusion_cnn_best.yaml
 create mode 100644 configs/model/heat_fusion_best.yaml
 create mode 100644 configs/model/heat_geoclip_best.yaml
 create mode 100644 configs/model/heat_tabular_best.yaml
 create mode 100644 configs/model/heat_tessera_avg_best.yaml
 create mode 100644 configs/model/heat_tessera_cnn_best.yaml

diff --git a/configs/experiment/heat_guatemala_full_fusion_avg_best.yaml b/configs/experiment/heat_guatemala_full_fusion_avg_best.yaml
new file mode 100644
index 0000000..72c2896
--- /dev/null
+++ b/configs/experiment/heat_guatemala_full_fusion_avg_best.yaml
@@ -0,0 +1,34 @@
+# @package _global_
+# Best config: R2=0.672, RMSE=1.104, MAE=0.896
+defaults:
+  - override /model: heat_full_fusion_avg_best
+  - override /data: heat_guatemala
+  - override /metrics: guatemala_regression
+tags: ["heat_island", "guatemala", "full_fusion_avg", "best", "regression"]
+seed: 12345
+trainer:
+  min_epochs: 1
+  max_epochs: 100
+data:
+  batch_size: 64
+  dataset:
+    modalities:
+      coords: {}
+      tessera:
+        year: 2024
+        size: 10
+        format: npy
+callbacks:
+  model_checkpoint:
+    monitor: val_r2
+    mode: max
+  early_stopping:
+    monitor: val_r2
+    mode: max
+    patience: 20
+logger:
+  wandb:
+    tags: ${tags}
+    group: "heat_island"
+  aim:
+    experiment: "heat_island"
diff --git a/configs/experiment/heat_guatemala_full_fusion_cnn_best.yaml b/configs/experiment/heat_guatemala_full_fusion_cnn_best.yaml
new file mode 100644
index 0000000..d87c38a
--- /dev/null
+++ b/configs/experiment/heat_guatemala_full_fusion_cnn_best.yaml
@@ -0,0 +1,34 @@
+# @package _global_
+# Best config: R2=0.647, RMSE=1.144, MAE=0.931
+defaults:
+  - override /model: heat_full_fusion_cnn_best
+  - override /data: heat_guatemala
+  - override /metrics: guatemala_regression
+tags: ["heat_island", "guatemala", "full_fusion_cnn", "best", "regression"]
+seed: 12345
+trainer:
+  min_epochs: 1
+  max_epochs: 100
+data:
+  batch_size: 64
+  dataset:
+    modalities:
+      coords: {}
+      tessera:
+        year: 2024
+        size: 10
+        format: npy
+callbacks:
+  model_checkpoint:
+    monitor: val_r2
+    mode: max
+  early_stopping:
+    monitor: val_r2
+    mode: max
+    patience: 20
+logger:
+  wandb:
+    tags: ${tags}
+    group: "heat_island"
+  aim:
+    experiment: "heat_island"
diff --git a/configs/experiment/heat_guatemala_fusion_best.yaml b/configs/experiment/heat_guatemala_fusion_best.yaml
new file mode 100644
index 0000000..dbab3a1
--- /dev/null
+++ b/configs/experiment/heat_guatemala_fusion_best.yaml
@@ -0,0 +1,27 @@
+# @package _global_
+# Best config: R2=0.555, RMSE=1.285, MAE=1.039
+defaults:
+  - override /model: heat_fusion_best
+  - override /data: heat_guatemala
+  - override /metrics: guatemala_regression
+tags: ["heat_island", "guatemala", "fusion", "best", "regression"]
+seed: 12345
+trainer:
+  min_epochs: 1
+  max_epochs: 100
+data:
+  batch_size: 64
+callbacks:
+  model_checkpoint:
+    monitor: val_r2
+    mode: max
+  early_stopping:
+    monitor: val_r2
+    mode: max
+    patience: 20
+logger:
+  wandb:
+    tags: ${tags}
+    group: "heat_island"
+  aim:
+    experiment: "heat_island"
diff --git a/configs/experiment/heat_guatemala_geoclip_best.yaml b/configs/experiment/heat_guatemala_geoclip_best.yaml
new file mode 100644
index 0000000..5c555af
--- /dev/null
+++ b/configs/experiment/heat_guatemala_geoclip_best.yaml
@@ -0,0 +1,27 @@
+# @package _global_
+# Best config: R2=0.323, RMSE=1.607, MAE=1.344
+defaults:
+  - override /model: heat_geoclip_best
+  - override /data: heat_guatemala
+  - override /metrics: guatemala_regression
+tags: ["heat_island", "guatemala", "coords", "best", "regression"]
+seed: 12345
+trainer:
+  min_epochs: 1
+  max_epochs: 100
+data:
+  batch_size: 64
+callbacks:
+  model_checkpoint:
+    monitor: val_r2
+    mode: max
+  early_stopping:
+    monitor: val_r2
+    mode: max
+    patience: 20
+logger:
+  wandb:
+    tags: ${tags}
+    group: "heat_island"
+  aim:
+    experiment: "heat_island"
diff --git a/configs/experiment/heat_guatemala_tabular_best.yaml b/configs/experiment/heat_guatemala_tabular_best.yaml
new file mode 100644
index 0000000..952b33f
--- /dev/null
+++ b/configs/experiment/heat_guatemala_tabular_best.yaml
@@ -0,0 +1,27 @@
+# @package _global_
+# Best config: R2=0.562, RMSE=1.282, MAE=1.040
+defaults:
+  - override /model: heat_tabular_best
+  - override /data: heat_guatemala
+  - override /metrics: guatemala_regression
+tags: ["heat_island", "guatemala", "tabular", "best", "regression"]
+seed: 12345
+trainer:
+  min_epochs: 1
+  max_epochs: 100
+data:
+  batch_size: 64
+callbacks:
+  model_checkpoint:
+    monitor: val_r2
+    mode: max
+  early_stopping:
+    monitor: val_r2
+    mode: max
+    patience: 20
+logger:
+  wandb:
+    tags: ${tags}
+    group: "heat_island"
+  aim:
+    experiment: "heat_island"
diff --git a/configs/experiment/heat_guatemala_tessera_avg_best.yaml b/configs/experiment/heat_guatemala_tessera_avg_best.yaml
new file mode 100644
index 0000000..21bc135
--- /dev/null
+++ b/configs/experiment/heat_guatemala_tessera_avg_best.yaml
@@ -0,0 +1,34 @@
+# @package _global_
+# Best config: R2=0.733, RMSE=1.011, MAE=0.814
+defaults:
+  - override /model: heat_tessera_avg_best
+  - override /data: heat_guatemala
+  - override /metrics: guatemala_regression
+tags: ["heat_island", "guatemala", "tessera_avg", "best", "regression"]
+seed: 12345
+trainer:
+  min_epochs: 1
+  max_epochs: 100
+data:
+  batch_size: 64
+  dataset:
+    modalities:
+      coords: {}
+      tessera:
+        year: 2024
+        size: 10
+        format: npy
+callbacks:
+  model_checkpoint:
+    monitor: val_r2
+    mode: max
+  early_stopping:
+    monitor: val_r2
+    mode: max
+    patience: 20
+logger:
+  wandb:
+    tags: ${tags}
+    group: "heat_island"
+  aim:
+    experiment: "heat_island"
diff --git a/configs/experiment/heat_guatemala_tessera_cnn_best.yaml b/configs/experiment/heat_guatemala_tessera_cnn_best.yaml
new file mode 100644
index 0000000..c7cc1d1
--- /dev/null
+++ b/configs/experiment/heat_guatemala_tessera_cnn_best.yaml
@@ -0,0 +1,34 @@
+# @package _global_
+# Best config: R2=0.694, RMSE=1.088, MAE=0.877
+defaults:
+  - override /model: heat_tessera_cnn_best
+  - override /data: heat_guatemala
+  - override /metrics: guatemala_regression
+tags: ["heat_island", "guatemala", "tessera_cnn", "best", "regression"]
+seed: 12345
+trainer:
+  min_epochs: 1
+  max_epochs: 100
+data:
+  batch_size: 64
+  dataset:
+    modalities:
+      coords: {}
+      tessera:
+        year: 2024
+        size: 10
+        format: npy
+callbacks:
+  model_checkpoint:
+    monitor: val_r2
+    mode: max
+  early_stopping:
+    monitor: val_r2
+    mode: max
+    patience: 20
+logger:
+  wandb:
+    tags: ${tags}
+    group: "heat_island"
+  aim:
+    experiment: "heat_island"
diff --git a/configs/model/heat_full_fusion_avg_best.yaml b/configs/model/heat_full_fusion_avg_best.yaml
new file mode 100644
index 0000000..df5799d
--- /dev/null
+++ b/configs/model/heat_full_fusion_avg_best.yaml
@@ -0,0 +1,34 @@
+_target_: src.models.predictive_model.PredictiveModel
+geo_encoder:
+  _target_: src.models.components.geo_encoders.encoder_wrapper.EncoderWrapper
+  encoder_branches:
+    - encoder:
+        _target_: src.models.components.geo_encoders.geoclip.GeoClipCoordinateEncoder
+    - encoder:
+        _target_: src.models.components.geo_encoders.tabular_encoder.TabularEncoder
+        output_dim: 64
+        geo_data_name: tabular
+    - encoder:
+        _target_: src.models.components.geo_encoders.average_encoder.AverageEncoder
+        geo_data_name: tessera
+  fusion_strategy: concat
+prediction_head:
+  _target_: src.models.components.pred_heads.mlp_regression_head.MLPRegressionPredictionHead
+  nn_layers: 3
+  hidden_dim: 512
+trainable_modules: [geo_encoder.encoder_branches.1, prediction_head]
+normalize_features: false
+metrics: ${metrics}
+optimizer:
+  _target_: torch.optim.Adam
+  _partial_: true
+  lr: 0.0001
+  weight_decay: 0.0
+scheduler:
+  _target_: torch.optim.lr_scheduler.ReduceLROnPlateau
+  _partial_: true
+  mode: min
+  factor: 0.1
+  patience: 10
+loss_fn:
+  _target_: torch.nn.MSELoss
diff --git a/configs/model/heat_full_fusion_cnn_best.yaml b/configs/model/heat_full_fusion_cnn_best.yaml
new file mode 100644
index 0000000..ea07641
--- /dev/null
+++ b/configs/model/heat_full_fusion_cnn_best.yaml
@@ -0,0 +1,38 @@
+_target_: src.models.predictive_model.PredictiveModel
+geo_encoder:
+  _target_: src.models.components.geo_encoders.encoder_wrapper.EncoderWrapper
+  encoder_branches:
+    - encoder:
+        _target_: src.models.components.geo_encoders.geoclip.GeoClipCoordinateEncoder
+    - encoder:
+        _target_: src.models.components.geo_encoders.tabular_encoder.TabularEncoder
+        output_dim: 64
+        geo_data_name: tabular
+    - encoder:
+        _target_: src.models.components.geo_encoders.cnn_encoder.CNNEncoder
+        geo_data_name: tessera
+        resnet_version: 18
+        pretrained_cnn: imagenet
+        freezing_strategy: all
+        output_dim: 256
+  fusion_strategy: concat
+prediction_head:
+  _target_: src.models.components.pred_heads.mlp_regression_head.MLPRegressionPredictionHead
+  nn_layers: 3
+  hidden_dim: 256
+trainable_modules: [geo_encoder.encoder_branches.1, geo_encoder.encoder_branches.2, prediction_head]
+normalize_features: true
+metrics: ${metrics}
+optimizer:
+  _target_: torch.optim.Adam
+  _partial_: true
+  lr: 0.001
+  weight_decay: 0.0
+scheduler:
+  _target_: torch.optim.lr_scheduler.ReduceLROnPlateau
+  _partial_: true
+  mode: min
+  factor: 0.1
+  patience: 10
+loss_fn:
+  _target_: torch.nn.MSELoss
diff --git a/configs/model/heat_fusion_best.yaml b/configs/model/heat_fusion_best.yaml
new file mode 100644
index 0000000..40205fc
--- /dev/null
+++ b/configs/model/heat_fusion_best.yaml
@@ -0,0 +1,31 @@
+_target_: src.models.predictive_model.PredictiveModel
+geo_encoder:
+  _target_: src.models.components.geo_encoders.encoder_wrapper.EncoderWrapper
+  encoder_branches:
+    - encoder:
+        _target_: src.models.components.geo_encoders.geoclip.GeoClipCoordinateEncoder
+    - encoder:
+        _target_: src.models.components.geo_encoders.tabular_encoder.TabularEncoder
+        output_dim: 64
+        geo_data_name: tabular
+  fusion_strategy: concat
+prediction_head:
+  _target_: src.models.components.pred_heads.mlp_regression_head.MLPRegressionPredictionHead
+  nn_layers: 2
+  hidden_dim: 512
+trainable_modules: [geo_encoder.encoder_branches.1, prediction_head]
+normalize_features: false
+metrics: ${metrics}
+optimizer:
+  _target_: torch.optim.Adam
+  _partial_: true
+  lr: 0.0001
+  weight_decay: 0.0
+scheduler:
+  _target_: torch.optim.lr_scheduler.ReduceLROnPlateau
+  _partial_: true
+  mode: min
+  factor: 0.1
+  patience: 10
+loss_fn:
+  _target_: torch.nn.MSELoss
diff --git a/configs/model/heat_geoclip_best.yaml b/configs/model/heat_geoclip_best.yaml
new file mode 100644
index 0000000..9363260
--- /dev/null
+++ b/configs/model/heat_geoclip_best.yaml
@@ -0,0 +1,23 @@
+_target_: src.models.predictive_model.PredictiveModel
+geo_encoder:
+  _target_: src.models.components.geo_encoders.geoclip.GeoClipCoordinateEncoder
+prediction_head:
+  _target_: src.models.components.pred_heads.mlp_regression_head.MLPRegressionPredictionHead
+  nn_layers: 2
+  hidden_dim: 128
+trainable_modules: [prediction_head]
+normalize_features: true
+metrics: ${metrics}
+optimizer:
+  _target_: torch.optim.Adam
+  _partial_: true
+  lr: 0.001
+  weight_decay: 0.0
+scheduler:
+  _target_: torch.optim.lr_scheduler.ReduceLROnPlateau
+  _partial_: true
+  mode: min
+  factor: 0.1
+  patience: 10
+loss_fn:
+  _target_: torch.nn.MSELoss
diff --git a/configs/model/heat_tabular_best.yaml b/configs/model/heat_tabular_best.yaml
new file mode 100644
index 0000000..a5b9f4d
--- /dev/null
+++ b/configs/model/heat_tabular_best.yaml
@@ -0,0 +1,25 @@
+_target_: src.models.predictive_model.PredictiveModel
+geo_encoder:
+  _target_: src.models.components.geo_encoders.tabular_encoder.TabularEncoder
+  output_dim: 64
+  geo_data_name: tabular
+prediction_head:
+  _target_: src.models.components.pred_heads.mlp_regression_head.MLPRegressionPredictionHead
+  nn_layers: 3
+  hidden_dim: 512
+trainable_modules: [geo_encoder, prediction_head]
+normalize_features: false
+metrics: ${metrics}
+optimizer:
+  _target_: torch.optim.Adam
+  _partial_: true
+  lr: 0.0001
+  weight_decay: 0.0001
+scheduler:
+  _target_: torch.optim.lr_scheduler.ReduceLROnPlateau
+  _partial_: true
+  mode: min
+  factor: 0.1
+  patience: 10
+loss_fn:
+  _target_: torch.nn.MSELoss
diff --git a/configs/model/heat_tessera_avg_best.yaml b/configs/model/heat_tessera_avg_best.yaml
new file mode 100644
index 0000000..b68ac43
--- /dev/null
+++ b/configs/model/heat_tessera_avg_best.yaml
@@ -0,0 +1,24 @@
+_target_: src.models.predictive_model.PredictiveModel
+geo_encoder:
+  _target_: src.models.components.geo_encoders.average_encoder.AverageEncoder
+  geo_data_name: tessera
+prediction_head:
+  _target_: src.models.components.pred_heads.mlp_regression_head.MLPRegressionPredictionHead
+  nn_layers: 2
+  hidden_dim: 512
+trainable_modules: [prediction_head]
+normalize_features: true
+metrics: ${metrics}
+optimizer:
+  _target_: torch.optim.Adam
+  _partial_: true
+  lr: 0.001
+  weight_decay: 0.0
+scheduler:
+  _target_: torch.optim.lr_scheduler.ReduceLROnPlateau
+  _partial_: true
+  mode: min
+  factor: 0.1
+  patience: 10
+loss_fn:
+  _target_: torch.nn.MSELoss
diff --git a/configs/model/heat_tessera_cnn_best.yaml b/configs/model/heat_tessera_cnn_best.yaml
new file mode 100644
index 0000000..aa4af72
--- /dev/null
+++ b/configs/model/heat_tessera_cnn_best.yaml
@@ -0,0 +1,28 @@
+_target_: src.models.predictive_model.PredictiveModel
+geo_encoder:
+  _target_: src.models.components.geo_encoders.cnn_encoder.CNNEncoder
+  geo_data_name: tessera
+  resnet_version: 34
+  pretrained_cnn: imagenet
+  freezing_strategy: all
+  output_dim: 256
+prediction_head:
+  _target_: src.models.components.pred_heads.mlp_regression_head.MLPRegressionPredictionHead
+  nn_layers: 2
+  hidden_dim: 1024
+trainable_modules: [geo_encoder, prediction_head]
+normalize_features: false
+metrics: ${metrics}
+optimizer:
+  _target_: torch.optim.Adam
+  _partial_: true
+  lr: 0.001
+  weight_decay: 0.0
+scheduler:
+  _target_: torch.optim.lr_scheduler.ReduceLROnPlateau
+  _partial_: true
+  mode: min
+  factor: 0.1
+  patience: 10
+loss_fn:
+  _target_: torch.nn.MSELoss
diff --git a/src/data/base_dataset.py b/src/data/base_dataset.py
index bd727d7..da276f1 100644
--- a/src/data/base_dataset.py
+++ b/src/data/base_dataset.py
@@ -237,7 +237,7 @@ def setup_tessera(self) -> None:
                 if fname not in avail_files:
                     print(f"Retrieving missing Tessera data: {fname}")
                     gt = gt or GeoTessera(cache_dir=self.cache_dir)
-                    get_tessera_embeds(rec.lon, rec.lat, rec.name_loc, year, dst_dir, size)
+                    get_tessera_embeds(rec["lon"], rec["lat"], rec["name_loc"], year, dst_dir, size,gt)
 
     @final
     def setup_aef(self) -> None:
diff --git a/src/data/heat_guatemala_dataset.py b/src/data/heat_guatemala_dataset.py
index f4fabac..4607ba9 100644
--- a/src/data/heat_guatemala_dataset.py
+++ b/src/data/heat_guatemala_dataset.py
@@ -15,9 +15,9 @@
 from typing import Any, Dict, override
 
 import torch
-
+import os
 from src.data.base_dataset import BaseDataset
-
+import numpy as np
 
 class HeatGuatemalaDataset(BaseDataset):
     """Dataset for the urban heat island use case (Guatemala City, LST regression).
@@ -88,6 +88,17 @@ def __getitem__(self, idx: int) -> Dict[str, Any]:
                 sample["eo"]["coords"] = torch.tensor(
                     [row["lat"], row["lon"]], dtype=torch.float32
                 )
+            elif modality == "tessera":
+                path = row["tessera_path"]
+                if path is not None and os.path.exists(path):
+                    arr = np.load(path).transpose(2, 0, 1)
+                else:
+                    arr = np.zeros((128, 10, 10), dtype=np.float32)
+                tess = torch.tensor(arr, dtype=torch.float32)
+                sample["eo"]["tessera"] = tess
+                sample["tessera"] = tess
+
+
 
         # --- Tabular features (always included if present in CSV) ---
         if self.use_features and self.feat_names:
diff --git a/src/models/components/geo_encoders/cnn_encoder.py b/src/models/components/geo_encoders/cnn_encoder.py
index 34f8f48..909074f 100644
--- a/src/models/components/geo_encoders/cnn_encoder.py
+++ b/src/models/components/geo_encoders/cnn_encoder.py
@@ -148,12 +148,12 @@ def forward(
         :param batch: input batch
         :return: extracted features
         """
-        eo_data = batch.get("eo", {})
+        eo_data = batch[self.geo_data_name]
 
         dtype = self.dtype
         if eo_data.dtype != dtype:
             eo_data = eo_data.to(dtype)
-        feats = self.geo_encoder(eo_data[self.geo_data_name])
+        feats = self.geo_encoder(eo_data)#self.geo_encoder(eo_data[self.geo_data_name])
         # n_nans = torch.sum(torch.isnan(feats)).item()
         # assert (
         #     n_nans == 0

From f1d8050f6a9c84b37715d216129b32c567f2fee2 Mon Sep 17 00:00:00 2001
From: bachir <bbel@nilu.no>
Date: Mon, 30 Mar 2026 12:19:08 +0200
Subject: [PATCH 2/7] fix size hardcoding

---
 src/data/heat_guatemala_dataset.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/data/heat_guatemala_dataset.py b/src/data/heat_guatemala_dataset.py
index 4607ba9..e9a640d 100644
--- a/src/data/heat_guatemala_dataset.py
+++ b/src/data/heat_guatemala_dataset.py
@@ -93,7 +93,10 @@ def __getitem__(self, idx: int) -> Dict[str, Any]:
                 if path is not None and os.path.exists(path):
                     arr = np.load(path).transpose(2, 0, 1)
                 else:
-                    arr = np.zeros((128, 10, 10), dtype=np.float32)
+                    size = self.modalities["tessera"].get("size", 10)
+                    arr = np.zeros((128, size, size), dtype=np.float32)
+                    n_bands = self.modalities["tessera"].get("n_bands", 128)
+                    arr = np.zeros((n_bands, size, size), dtype=np.float32)
                 tess = torch.tensor(arr, dtype=torch.float32)
                 sample["eo"]["tessera"] = tess
                 sample["tessera"] = tess

From c7fd3023562c6fee20e9252598005183a647a7a5 Mon Sep 17 00:00:00 2001
From: bachir <bbel@nilu.no>
Date: Mon, 30 Mar 2026 12:39:17 +0200
Subject: [PATCH 3/7] fix size hardcoding

---
 src/data/heat_guatemala_dataset.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/data/heat_guatemala_dataset.py b/src/data/heat_guatemala_dataset.py
index e9a640d..6edc4db 100644
--- a/src/data/heat_guatemala_dataset.py
+++ b/src/data/heat_guatemala_dataset.py
@@ -94,7 +94,6 @@ def __getitem__(self, idx: int) -> Dict[str, Any]:
                     arr = np.load(path).transpose(2, 0, 1)
                 else:
                     size = self.modalities["tessera"].get("size", 10)
-                    arr = np.zeros((128, size, size), dtype=np.float32)
                     n_bands = self.modalities["tessera"].get("n_bands", 128)
                     arr = np.zeros((n_bands, size, size), dtype=np.float32)
                 tess = torch.tensor(arr, dtype=torch.float32)

From a219590c558128fb5afb8df604753e35aea00be7 Mon Sep 17 00:00:00 2001
From: bachir <bbel@nilu.no>
Date: Mon, 15 Jun 2026 17:40:41 +0200
Subject: [PATCH 4/7] fix(data): handle Hydra DictConfig in use_aux_data; safe
 setup_tessera dropping

---
 src/data/base_dataset.py | 124 ++++++++++++++++++++++++++++++---------
 1 file changed, 96 insertions(+), 28 deletions(-)

diff --git a/src/data/base_dataset.py b/src/data/base_dataset.py
index d39e66a..e8c606e 100644
--- a/src/data/base_dataset.py
+++ b/src/data/base_dataset.py
@@ -11,6 +11,7 @@
 
 from src.utils.data_utils import center_crop_npy
 from src.utils.errors import MissingDataError
+from omegaconf import DictConfig, OmegaConf
 
 TORCH_DTYPES = {
     "float32": torch.float32,
@@ -121,23 +122,45 @@ def __init__(
         self.use_target_data = use_target_data
         self.use_features = use_features
 
-        if use_aux_data is None or use_aux_data == "all":
+        if isinstance(use_aux_data, DictConfig):
+            use_aux_data = OmegaConf.to_container(use_aux_data, resolve=True)
+
+        if use_aux_data is None:
+            self.use_aux_data = None
+
+        elif use_aux_data == "all":
             self.use_aux_data = {
                 "aux": {
                     "pattern": "^aux_(?!.*top).*",
-                    #     'columns' : []
                 },
                 "top": {
                     "pattern": "^aux_.*top.*",
-                    #     'columns' : []
                 },
             }
 
-        elif type(use_aux_data) is dict:
+        elif isinstance(use_aux_data, dict):
             self.use_aux_data = use_aux_data
+
         else:
             self.use_aux_data = None
 
+        # if use_aux_data is None or use_aux_data == "all":
+        #     self.use_aux_data = {
+        #         "aux": {
+        #             "pattern": "^aux_(?!.*top).*",
+        #             #     'columns' : []
+        #         },
+        #         "top": {
+        #             "pattern": "^aux_.*top.*",
+        #             #     'columns' : []
+        #         },
+        #     }
+
+        # elif type(use_aux_data) is dict:
+        #     self.use_aux_data = use_aux_data
+        # else:
+        #     self.use_aux_data = None
+
         # More precise dataset name (with modalities)
         if isinstance(dataset_name, list):
             dataset_name = "+".join(dataset_name)
@@ -280,35 +303,80 @@ def setup_tessera(self) -> None:
         else:
             from geotessera import GeoTessera
 
-            print("Downloading missing Tessera tiles...")
-            print("[Warning]: it may download tessera tiles filled with 0a")
-
-            avail_files = os.listdir(dst_dir)
+            print("Checking missing Tessera tiles...")
+            avail_files = set(os.listdir(dst_dir))
             gt = None
-            for i, rec in enumerate(self.records):
+            kept_records = []
+            missing_files = []
+
+            for rec in self.records:
                 fname = os.path.basename(rec["tessera_path"])
-                if fname not in avail_files:
-                    if download_missing_tiles:
-                        print(f"Retrieving missing Tessera data: {fname}")
-                        gt = gt or GeoTessera(cache_dir=self.cache_dir)
-                        row = self.df[self.df["name_loc"] == rec["name_loc"]]
-                        lon, lat = row.lon.item(), row.lat.item()
-                        try:
-                            get_tessera_embeds(
-                                lon,
-                                lat,
-                                rec["name_loc"],
-                                year=year,
-                                save_dir=dst_dir,
-                                tile_size=size,
-                                tessera_con=gt,
-                            )
+
+                if fname in avail_files:
+                    kept_records.append(rec)
+                    continue
+
+                if download_missing_tiles:
+                    print(f"Retrieving missing Tessera data: {fname}")
+                    gt = gt or GeoTessera(cache_dir=self.cache_dir)
+                    row = self.df[self.df["name_loc"] == rec["name_loc"]]
+                    lon, lat = row.lon.item(), row.lat.item()
+
+                    try:
+                        get_tessera_embeds(
+                            lon,
+                            lat,
+                            rec["name_loc"],
+                            year=year,
+                            save_dir=dst_dir,
+                            tile_size=size,
+                            tessera_con=gt,
+                        )
+                        if os.path.exists(rec["tessera_path"]):
+                            kept_records.append(rec)
+                            avail_files.add(fname)
                             continue
-                        except Exception as e:
-                            print(f"Tile for {fname} could not be retrieved. Error: {e}")
-                self.records.pop(i)
+                    except Exception as e:
+                        print(f"Tile for {fname} could not be retrieved. Error: {e}")
+
+                missing_files.append(fname)
                 print(f"No tile found for {fname} thus it will not be used.")
 
+            self.records = kept_records
+
+            if missing_files:
+                print(f"Dropped {len(missing_files)} records with missing Tessera tiles.")
+            # from geotessera import GeoTessera
+
+            # print("Downloading missing Tessera tiles...")
+            # print("[Warning]: it may download tessera tiles filled with 0a")
+
+            # avail_files = os.listdir(dst_dir)
+            # gt = None
+            # for i, rec in enumerate(self.records):
+            #     fname = os.path.basename(rec["tessera_path"])
+            #     if fname not in avail_files:
+            #         if download_missing_tiles:
+            #             print(f"Retrieving missing Tessera data: {fname}")
+            #             gt = gt or GeoTessera(cache_dir=self.cache_dir)
+            #             row = self.df[self.df["name_loc"] == rec["name_loc"]]
+            #             lon, lat = row.lon.item(), row.lat.item()
+            #             try:
+            #                 get_tessera_embeds(
+            #                     lon,
+            #                     lat,
+            #                     rec["name_loc"],
+            #                     year=year,
+            #                     save_dir=dst_dir,
+            #                     tile_size=size,
+            #                     tessera_con=gt,
+            #                 )
+            #                 continue
+            #             except Exception as e:
+            #                 print(f"Tile for {fname} could not be retrieved. Error: {e}")
+            #     self.records.pop(i)
+            #     print(f"No tile found for {fname} thus it will not be used.")
+
     @final
     def setup_aef(self) -> None:
         """Download full dataset or the missing AEF tiles.

From d889382b6241cff4208e01cb5ffb78d13c70a12a Mon Sep 17 00:00:00 2001
From: bachir <bbel@nilu.no>
Date: Mon, 15 Jun 2026 17:43:48 +0200
Subject: [PATCH 5/7] feat(heat-guatemala): Tessera+text caption-alignment use
 case

---
 configs/data/heat_guatemala_tessera_text.yaml | 65 +++++++++++++++
 configs/experiment/heat_alignment.yaml        | 25 ++++++
 configs/model/heat_tessera_alignment.yaml     | 37 +++++++++
 data/heat_guatemala/concept_captions/v1.json  | 82 +++++++++++++++++++
 .../location_caption_templates/v1.json        | 30 +++++++
 src/data/heat_guatemala_caption_builder.py    | 64 +++++++++++++++
 .../build_aux_from_original.py                | 72 ++++++++++++++++
 7 files changed, 375 insertions(+)
 create mode 100644 configs/data/heat_guatemala_tessera_text.yaml
 create mode 100644 configs/experiment/heat_alignment.yaml
 create mode 100644 configs/model/heat_tessera_alignment.yaml
 create mode 100644 data/heat_guatemala/concept_captions/v1.json
 create mode 100644 data/heat_guatemala/location_caption_templates/v1.json
 create mode 100644 src/data/heat_guatemala_caption_builder.py
 create mode 100644 src/data_preprocessing/build_aux_from_original.py

diff --git a/configs/data/heat_guatemala_tessera_text.yaml b/configs/data/heat_guatemala_tessera_text.yaml
new file mode 100644
index 0000000..4ee12cd
--- /dev/null
+++ b/configs/data/heat_guatemala_tessera_text.yaml
@@ -0,0 +1,65 @@
+# Alignment data config for the Guatemala LST use case (Tessera + expert-legend captions).
+#
+# Requirements before running:
+#   1. The model-ready CSV the dataset loads from ${paths.data_dir} must contain the
+#      aux_* columns produced by src/data_preprocessing/build_aux_from_original.py (numeric + *_label).
+#   2. Tessera tiles at ${paths.data_dir}/heat_guatemala/eo/tessera/tessera_<name_loc>.npy
+#      (10x10, 128-band) — see the tessera preprocessing step.
+_target_: src.data.base_datamodule.BaseDataModule
+
+dataset:
+  _target_: src.data.heat_guatemala_dataset.HeatGuatemalaDataset
+  data_dir: ${paths.data_dir}
+  modalities:
+    tessera:
+      year: 2024
+      size: 10
+      format: npy
+  use_target_data: false        # alignment is contrastive geo<->text; LST enters only as a concept
+  use_features: false           # Tessera-only EO branch (fusion lowered R2 in the regression runs); avoids circularity
+  use_aux_data:
+    aux:                        # numeric raw values -> concept theta_k / retrieval ground truth
+      columns:
+        - aux_ndvi_mean
+        - aux_ndwi_mean
+        - aux_forest_cover_perc
+        - aux_tree_cover_perc
+        - aux_builtup_age_years
+        - aux_slope_perc
+        - aux_socioeconomic
+        - aux_lst
+    top:                        # expert-legend label strings -> caption text
+      columns:
+        - aux_ndvi_label
+        - aux_ndwi_label
+        - aux_forest_label
+        - aux_age_label
+        - aux_slope_label
+        - aux_socio_label
+        - aux_height_label
+        - aux_density_label
+        - aux_landuse
+        - aux_blocktype
+        - aux_interzone
+  seed: ${seed}
+  cache_dir: ${paths.cache_dir}
+
+caption_builder:
+  _target_: src.data.heat_guatemala_caption_builder.HeatGuatemalaCaptionBuilder
+  templates_fname: v1.json
+  concepts_fname: v1.json
+  data_dir: ${paths.data_dir}/heat_guatemala
+  seed: ${seed}
+
+batch_size: 64
+num_workers: 8
+pin_memory: true
+
+#split_mode: "spatial_clusters"      # honest val/test: hold out whole areas (city blocks autocorrelate)
+#spatial_split_distance_m: 500
+#split_mode: "random"
+split_mode: "from_file"
+saved_split_file_name: "split_indices_heat_guatemala_2026-02-20-1148.pth"
+train_val_test_split: [0.7, 0.15, 0.15]
+save_split: false
+seed: ${seed}
diff --git a/configs/experiment/heat_alignment.yaml b/configs/experiment/heat_alignment.yaml
new file mode 100644
index 0000000..8a8e109
--- /dev/null
+++ b/configs/experiment/heat_alignment.yaml
@@ -0,0 +1,25 @@
+# @package _global_
+# Run: python src/train.py experiment=heat_alignment
+# Smoke test (weak, expected): python src/train.py experiment=heat_alignment \
+#     model=geoclip_alignment data.dataset.modalities='{coords: {}}'
+defaults:
+  - override /model: heat_tessera_alignment
+  - override /data: heat_guatemala_tessera_text
+  - override /metrics: contrastive_similarities
+
+tags: ["alignment", "heat_island", "guatemala", "tessera"]
+seed: 12345
+
+trainer:
+  min_epochs: 10
+  max_epochs: 100
+
+data:
+  batch_size: 64
+
+logger:
+  wandb:
+    tags: ${tags}
+    group: "heat_alignment"
+  aim:
+    experiment: "heat_alignment"
diff --git a/configs/model/heat_tessera_alignment.yaml b/configs/model/heat_tessera_alignment.yaml
new file mode 100644
index 0000000..622b015
--- /dev/null
+++ b/configs/model/heat_tessera_alignment.yaml
@@ -0,0 +1,37 @@
+# Tessera-only alignment model for the Guatemala LST use case.
+# EO branch = average-pooled Tessera (best regressor in our experiments: Tessera avg, R2=0.733).
+# No fusion (it hurt in our experiments). CLIP text tower + GeoCLIP mlp stay frozen;
+# only the text projector, the auto-added geo projector, and the temperature train.
+_target_: src.models.text_alignment_model.TextAlignmentModel
+
+geo_encoder:
+  _target_: src.models.components.geo_encoders.average_encoder.AverageEncoder
+  geo_data_name: tessera        # averages the 10x10x128 tile -> 128-d vector
+
+text_encoder:
+  _target_: src.models.components.text_encoders.clip_text_encoder.ClipTextEncoder
+  hf_cache_dir: ${paths.huggingface_cache}
+
+# geo (128) != text (512); match_to_geo=false projects the GEO side up to 512.
+# _setup() auto-adds & trains geo_encoder.extra_projector.
+match_to_geo: false
+trainable_modules: [text_encoder.projector, loss_fn.log_temp]
+
+metrics: ${metrics}
+
+optimizer:
+  _target_: torch.optim.Adam
+  _partial_: true
+  lr: 0.001
+  weight_decay: 0.0
+
+scheduler:
+  _target_: torch.optim.lr_scheduler.ReduceLROnPlateau
+  _partial_: true
+  mode: min
+  factor: 0.1
+  patience: 10
+
+loss_fn:
+  _target_: src.models.components.loss_fns.clip_loss.ClipLoss
+  temperature: 0.07
diff --git a/data/heat_guatemala/concept_captions/v1.json b/data/heat_guatemala/concept_captions/v1.json
new file mode 100644
index 0000000..c4a4977
--- /dev/null
+++ b/data/heat_guatemala/concept_captions/v1.json
@@ -0,0 +1,82 @@
+[
+  {
+    "concept_caption": "Densely vegetated green area with abundant healthy vegetation",
+    "is_max": true,
+    "theta_k": 0.334,
+    "col": "aux_ndvi_mean"
+  },
+  {
+    "concept_caption": "Sparsely vegetated area with little greenery",
+    "is_max": false,
+    "theta_k": 0.112,
+    "col": "aux_ndvi_mean"
+  },
+  {
+    "concept_caption": "Moist surface with no vegetation water stress",
+    "is_max": true,
+    "theta_k": 0.097,
+    "col": "aux_ndwi_mean"
+  },
+  {
+    "concept_caption": "Dry surface with vegetation under drought stress",
+    "is_max": false,
+    "theta_k": -0.053,
+    "col": "aux_ndwi_mean"
+  },
+  {
+    "concept_caption": "Forested area with substantial tree canopy cover",
+    "is_max": true,
+    "theta_k": 34.244,
+    "col": "aux_forest_cover_perc"
+  },
+  {
+    "concept_caption": "Area with dense tropical tree cover",
+    "is_max": true,
+    "theta_k": 43.0,
+    "col": "aux_tree_cover_perc"
+  },
+  {
+    "concept_caption": "Established, older built-up urban fabric",
+    "is_max": true,
+    "theta_k": 30.0,
+    "col": "aux_builtup_age_years"
+  },
+  {
+    "concept_caption": "Flat, low-lying terrain",
+    "is_max": false,
+    "theta_k": 3.092,
+    "col": "aux_slope_perc"
+  },
+  {
+    "concept_caption": "Hilly terrain with higher relief",
+    "is_max": true,
+    "theta_k": 19.775,
+    "col": "aux_slope_perc"
+  },
+  {
+    "concept_caption": "High socioeconomic quality neighbourhood",
+    "is_max": true,
+    "theta_k": 4.0,
+    "col": "aux_socioeconomic"
+  },
+  {
+    "concept_caption": "Low socioeconomic quality neighbourhood",
+    "is_max": false,
+    "theta_k": 2.0,
+    "col": "aux_socioeconomic"
+  },
+  {
+    "concept_caption": "Hot surface above the urban heat-stress limit",
+    "is_max": true,
+    "theta_k": 26.592,
+    "col": "aux_lst",
+    "note": "month-confounded: compute/evaluate on March-only blocks for a clean signal"
+  },
+  {
+    "concept_caption": "Cool surface well under the heat-stress limit",
+    "is_max": false,
+    "theta_k": 23.69,
+    "col": "aux_lst",
+    "note": "month-confounded: compute/evaluate on March-only blocks for a clean signal"
+  }
+]
\ No newline at end of file
diff --git a/data/heat_guatemala/location_caption_templates/v1.json b/data/heat_guatemala/location_caption_templates/v1.json
new file mode 100644
index 0000000..de16c76
--- /dev/null
+++ b/data/heat_guatemala/location_caption_templates/v1.json
@@ -0,0 +1,30 @@
+[
+  "Urban location dominated by <aux_landuse>, with <aux_ndvi_label> and <aux_ndwi_label>.",
+  "Area of <aux_landuse> showing <aux_ndvi_label>, under <aux_ndwi_label>.",
+  "Built-up block of type <aux_blocktype>, characterised by <aux_ndvi_label> and <aux_age_label>.",
+  "Location of <aux_landuse> with <aux_density_label>, showing <aux_ndvi_label>.",
+  "Neighbourhood of <aux_density_label> with <aux_socio_label>, on <aux_slope_label>.",
+  "Site characterised by <aux_landuse>, influenced by <aux_ndwi_label> and <aux_slope_label>.",
+  "Urban area with <aux_age_label>, showing <aux_ndvi_label> and <aux_density_label>.",
+  "Location with <aux_ndvi_label> and <aux_forest_label>, on <aux_slope_label>.",
+  "Area of <aux_landuse> on <aux_slope_label>, with <aux_ndwi_label>.",
+  "Built environment of type <aux_blocktype> with <aux_density_label>, under <aux_ndwi_label>.",
+  "Site with <aux_forest_label> and <aux_ndvi_label>, characterised by <aux_height_label>.",
+  "Location in interior zone <aux_interzone>, dominated by <aux_landuse>, with <aux_ndvi_label>.",
+  "Built-up area with <aux_height_label> and <aux_age_label>, of <aux_socio_label>.",
+  "Green location with <aux_ndvi_label> and <aux_forest_label>, under <aux_ndwi_label>.",
+  "Area of <aux_landuse> with <aux_socio_label>, showing <aux_ndwi_label>.",
+  "Location on <aux_slope_label> with <aux_ndvi_label>, in interior zone <aux_interzone>.",
+  "Urban block of type <aux_blocktype> with <aux_density_label>, under <aux_ndwi_label>.",
+  "Site dominated by <aux_landuse>, with <aux_age_label> and <aux_slope_label>.",
+  "Location characterised by <aux_ndvi_label>, <aux_ndwi_label> and <aux_forest_label>.",
+  "Area of <aux_density_label> with <aux_ndvi_label>, influenced by <aux_age_label>.",
+  "Residential area of <aux_socio_label>, showing <aux_ndvi_label> and <aux_height_label>.",
+  "Location dominated by <aux_landuse> with <aux_forest_label>, on <aux_slope_label>.",
+  "Built-up site of type <aux_blocktype>, under <aux_ndwi_label> with <aux_ndvi_label>.",
+  "Area showing <aux_ndvi_label> and <aux_age_label>, in interior zone <aux_interzone>.",
+  "Location of <aux_landuse> with <aux_density_label>, characterised by <aux_ndwi_label>.",
+  "Urban location of <aux_socio_label> with <aux_blocktype> fabric, showing <aux_ndvi_label>.",
+  "Site on <aux_slope_label> with <aux_forest_label> and <aux_ndvi_label>.",
+  "Area dominated by <aux_landuse>, with <aux_ndvi_label>, <aux_ndwi_label> and <aux_age_label>."
+]
\ No newline at end of file
diff --git a/src/data/heat_guatemala_caption_builder.py b/src/data/heat_guatemala_caption_builder.py
new file mode 100644
index 0000000..76d63e0
--- /dev/null
+++ b/src/data/heat_guatemala_caption_builder.py
@@ -0,0 +1,64 @@
+"""Caption builder for the Guatemala City urban-heat (LST) use case.
+
+Two aux categories are used (configured in the data yaml under `use_aux_data`):
+
+  * ``aux``  – numeric raw columns (NDVI, NDWI, slope, built-up age, LST, ...).
+               These feed the concept retrieval evaluation: each concept's
+               ``theta_k`` is compared directly against these raw values.
+  * ``top``  – expert-legend *label* columns (e.g. ``aux_ndvi_label`` =
+               "high vegetation greenness", ``aux_density_label`` =
+               "very dense urban", ``aux_landuse`` = "discontinuous urban").
+               These fill the ``<...>`` tokens in the location-caption templates,
+               so the training text uses the authoritative expert wording from
+               ``Heat_Guatemala.csv``.
+
+This mirrors the continuous (butterfly) paradigm for concepts, while taking the
+caption *words* straight from the legend rather than re-deriving them — the two
+are produced from the same block by build_aux_from_original.py, so they stay
+consistent. The LST label is deliberately NOT used in any template (the target
+must not leak into the training captions); LST appears only as a concept.
+"""
+
+from typing import Dict, List, override
+
+import torch
+
+from src.data.base_caption_builder import BaseCaptionBuilder
+from src.data.base_dataset import BaseDataset
+
+
+class HeatGuatemalaCaptionBuilder(BaseCaptionBuilder):
+    @override
+    def sync_with_dataset(self, dataset: BaseDataset) -> None:
+        """Index numeric aux columns (for concepts) and label columns (for text)."""
+        aux_cfg = dataset.use_aux_data or {}  # BaseDataset may leave use_aux_data as None
+        # numeric aux -> id (position in the 'aux' tensor); used by sync_concepts()
+        numeric_cols = aux_cfg.get("aux", [])
+        self.column_to_metadata_map = {"aux": {c: {"id": i} for i, c in enumerate(numeric_cols)}}
+        # label aux -> position in the per-row 'top' list; used to fill template tokens
+        self.top_index: Dict[str, int] = {c: i for i, c in enumerate(aux_cfg.get("top", []))}
+
+        # wires concept["id"] from the numeric aux map (raises if a concept col is missing)
+        self.sync_concepts()
+
+    @override
+    def _build_from_template(
+        self, template_idx: int, aux: torch.Tensor, top: List[str] | None = None
+    ) -> str:
+        """Fill template ``template_idx``; raise KeyError when a token cannot be resolved."""
+        template = self.templates[template_idx]
+        fillers: Dict[str, str] = {}
+        for token in self.tokens_in_template[template_idx]:
+            if token in self.top_index and top is not None:
+                fillers[token] = str(top[self.top_index[token]])
+            elif token in self.column_to_metadata_map["aux"]:
+                # numeric fallback (default templates don't use numeric tokens)
+                fillers[token] = f"{aux[self.column_to_metadata_map['aux'][token]['id']].item():.2f}"
+            elif token in self.top_index:
+                # label token but no labels were passed: say so instead of "unknown column"
+                raise KeyError(f"Token '{token}' is a label ('top') column, but `top` is None.")
+            else:
+                raise KeyError(
+                    f"Token '{token}' is neither a label ('top') nor a numeric ('aux') "
+                    "column in the dataset. Check the template and use_aux_data config."
+                )
+        return self._fill(template, fillers)
diff --git a/src/data_preprocessing/build_aux_from_original.py b/src/data_preprocessing/build_aux_from_original.py
new file mode 100644
index 0000000..e1dd875
--- /dev/null
+++ b/src/data_preprocessing/build_aux_from_original.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+"""
+Build aux_* columns for the Guatemala LST captioning, by joining:
+  - RAW numbers  (model_ready_heat_guatemala.csv, NON-standardized)  -> concept theta_k
+  - EXPERT words (Heat_Guatemala.csv, the original legend)           -> caption text
+
+Join key: BLOCK_ID = int(name_loc[5:])  (verified 1:1, lat diff 0, LST diff <0.005).
+
+Output: a copy of the raw model-ready CSV with aux_* columns appended, ready for
+the alignment datamodule (which selects columns by the regex ^aux_).
+"""
+import argparse
+import re
+
+import pandas as pd
+
+
+def cls(s):
+    """Extract the class word from a legend entry, e.g. '<0.5 NDVI greenness : high' -> 'high'."""
+    if pd.isna(s):
+        return None
+    _, _, label = str(s).strip().rpartition(":")
+    return label.strip()
+
+
+def main():
+    """Join the raw CSV with the expert legend on BLOCK_ID and write it with aux_* columns."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--raw", required=True, help="model_ready_heat_guatemala.csv (RAW, not _in_)")
+    ap.add_argument("--legend", required=True, help="Heat_Guatemala.csv (original legend)")
+    ap.add_argument("--out", required=True)
+    args = ap.parse_args()
+
+    raw = pd.read_csv(args.raw, low_memory=False)
+    leg = pd.read_csv(args.legend, encoding="cp1252", low_memory=False)
+    raw["BLOCK_ID"] = raw["name_loc"].str.replace("heat_", "", regex=False).astype(int)
+    # m:1 -> fail loudly on duplicated legend BLOCK_IDs, which would add rows and misalign `raw`
+    df = raw.merge(leg, on="BLOCK_ID", how="left", validate="m:1")
+
+    # numeric aux (raw scale) -> concept ground truth / theta_k
+    num = {
+        "aux_ndvi_mean": "feat_ndvi_mean2022", "aux_ndwi_mean": "feat_ndwi_mean2022",
+        "aux_forest_cover_perc": "feat_forcov_meanperc", "aux_tree_cover_perc": "feat_troptreecovperc",
+        "aux_builtup_age_years": "feat_bua_gaia_age_mean", "aux_slope_perc": "feat_dem5mslopeperc_mean",
+        "aux_socioeconomic": "feat_estrato_s", "aux_lst": "target_lst",
+    }
+    new = {a: df[f] for a, f in num.items()}
+    if "feat_measurement_month" in df.columns:          # present only if you kept raw month
+        new["aux_month"] = df["feat_measurement_month"]
+
+    # word aux (authoritative expert legend) -> caption text; "string" dtype keeps NaN missing, not "nan"
+    new["aux_ndvi_label"]    = df["NDVI_mean2022"].map(cls).map(lambda x: f"{x} vegetation greenness" if x else x)
+    new["aux_ndwi_label"]    = df["NDWI_mean2022"].map(cls).map(lambda x: x if x and "stress" in x else (f"{x} drought stress" if x else x))
+    new["aux_socio_label"]   = df["SocioEconomicQuality"].map(cls).map(lambda x: f"{x} socioeconomic quality" if x else x)
+    new["aux_slope_label"]   = df["DEM5m_Slope%"].map(cls).map(lambda x: x.lower() if x else x)
+    new["aux_forest_label"]  = df["Hansen_ForestCover_meanPerc"].map(cls)
+    new["aux_age_label"]     = df["BUA_GAIA_Age_Mean"].map(cls)
+    new["aux_height_label"]  = df["CopenicusMSZ_BuildingHeightM"].map(cls)   # words OK; raw number is unit-broken
+    new["aux_density_label"] = df["PopulationDensityPerKm2"].map(cls)        # urban-form label
+    new["aux_lst_label"]     = df["LST_mean_predictor_Classified"].map(cls)
+    new["aux_landuse"]       = df["BlockMAGADominantLanduse"].astype("string").str.strip().str.lower()
+    new["aux_blocktype"]     = df["BlockType"].astype("string").str.strip().str.lower()
+    new["aux_interzone"]     = df["IntrZon"].astype("string").str.strip()
+
+    out = pd.concat([raw.drop(columns=["BLOCK_ID"]), pd.DataFrame(new, index=raw.index)], axis=1)
+    out.to_csv(args.out, index=False)
+    print(f"wrote {out.shape[0]} rows, {sum(k.startswith('aux_') for k in out.columns)} aux columns -> {args.out}")
+    print("nulls in aux columns:", int(out[[c for c in out.columns if c.startswith('aux_')]].isna().sum().sum()))
+
+
+if __name__ == "__main__":
+    main()

From f25349ff096baea8bf259b0f2d8dab8ca2ae82c1 Mon Sep 17 00:00:00 2001
From: bachir <bbel@nilu.no>
Date: Wed, 17 Jun 2026 15:49:38 +0200
Subject: [PATCH 6/7] clean base_dataset.py

---
 src/data/base_dataset.py | 50 +++-------------------------------------
 1 file changed, 3 insertions(+), 47 deletions(-)

diff --git a/src/data/base_dataset.py b/src/data/base_dataset.py
index e8c606e..aef9390 100644
--- a/src/data/base_dataset.py
+++ b/src/data/base_dataset.py
@@ -144,23 +144,6 @@ def __init__(
         else:
             self.use_aux_data = None
 
-        # if use_aux_data is None or use_aux_data == "all":
-        #     self.use_aux_data = {
-        #         "aux": {
-        #             "pattern": "^aux_(?!.*top).*",
-        #             #     'columns' : []
-        #         },
-        #         "top": {
-        #             "pattern": "^aux_.*top.*",
-        #             #     'columns' : []
-        #         },
-        #     }
-
-        # elif type(use_aux_data) is dict:
-        #     self.use_aux_data = use_aux_data
-        # else:
-        #     self.use_aux_data = None
-
         # More precise dataset name (with modalities)
         if isinstance(dataset_name, list):
             dataset_name = "+".join(dataset_name)
@@ -346,36 +329,9 @@ def setup_tessera(self) -> None:
 
             if missing_files:
                 print(f"Dropped {len(missing_files)} records with missing Tessera tiles.")
-            # from geotessera import GeoTessera
-
-            # print("Downloading missing Tessera tiles...")
-            # print("[Warning]: it may download tessera tiles filled with 0a")
-
-            # avail_files = os.listdir(dst_dir)
-            # gt = None
-            # for i, rec in enumerate(self.records):
-            #     fname = os.path.basename(rec["tessera_path"])
-            #     if fname not in avail_files:
-            #         if download_missing_tiles:
-            #             print(f"Retrieving missing Tessera data: {fname}")
-            #             gt = gt or GeoTessera(cache_dir=self.cache_dir)
-            #             row = self.df[self.df["name_loc"] == rec["name_loc"]]
-            #             lon, lat = row.lon.item(), row.lat.item()
-            #             try:
-            #                 get_tessera_embeds(
-            #                     lon,
-            #                     lat,
-            #                     rec["name_loc"],
-            #                     year=year,
-            #                     save_dir=dst_dir,
-            #                     tile_size=size,
-            #                     tessera_con=gt,
-            #                 )
-            #                 continue
-            #             except Exception as e:
-            #                 print(f"Tile for {fname} could not be retrieved. Error: {e}")
-            #     self.records.pop(i)
-            #     print(f"No tile found for {fname} thus it will not be used.")
+
+            # NOTE: missing tiles are not downloaded at this point; records without a
+            # Tessera tile are only reported as dropped by the message above.
 
     @final
     def setup_aef(self) -> None:

From 69819b9d3e78be1fc7d394b08892f064f64dfbc4 Mon Sep 17 00:00:00 2001
From: bachir <bbel@nilu.no>
Date: Wed, 17 Jun 2026 15:51:43 +0200
Subject: [PATCH 7/7] rename preprocessing script

---
 .../build_heat_guatemala_aux.py               | 81 +++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 src/data_preprocessing/build_heat_guatemala_aux.py

diff --git a/src/data_preprocessing/build_heat_guatemala_aux.py b/src/data_preprocessing/build_heat_guatemala_aux.py
new file mode 100644
index 0000000..220f746
--- /dev/null
+++ b/src/data_preprocessing/build_heat_guatemala_aux.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+"""
+Build aux_* columns for the Guatemala LST captioning, by joining:
+  - RAW numbers  (model_ready_heat_guatemala.csv, NON-standardized; forest/tree cover and built-up age are read from the --unstd CSV)  -> concept theta_k
+  - EXPERT words (Heat_Guatemala.csv, the original legend)           -> caption text
+
+Join key: BLOCK_ID = int(name_loc[5:])  (verified 1:1, lat diff 0, LST diff <0.005).
+
+Output: a copy of the raw model-ready CSV with aux_* columns appended, ready for
+the alignment datamodule (which selects columns by the regex ^aux_).
+"""
+import argparse
+import re
+
+import pandas as pd
+
+
+def cls(s):
+    """Extract the class word from a legend entry, e.g. '<0.5 NDVI greenness : high' -> 'high'."""
+    if pd.isna(s):
+        return None
+    _, _, label = str(s).strip().rpartition(":")
+    return label.strip()
+
+
+def main():
+    """Join the raw CSV, expert legend and unstandardized source on BLOCK_ID; write aux_* columns."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--raw", required=True, help="model_ready_heat_guatemala.csv (RAW, not _in_)")
+    ap.add_argument("--legend", required=True, help="Heat_Guatemala.csv (original legend)")
+    ap.add_argument("--out", required=True)
+    ap.add_argument("--unstd", required=True, help="Heat_Guatemala_raw.csv (true unstandardized source)")
+    args = ap.parse_args()
+
+    raw = pd.read_csv(args.raw, low_memory=False)
+    # drop aux_* columns already present in the input so the ones built below do not duplicate them
+    raw = raw.drop(columns=[c for c in raw.columns if c.startswith("aux_")], errors="ignore")
+    leg = pd.read_csv(args.legend, encoding="cp1252", low_memory=False)
+    unstd = pd.read_csv(args.unstd, encoding="cp1252", low_memory=False)
+
+    raw["BLOCK_ID"] = raw["name_loc"].str.replace("heat_", "", regex=False).astype(int)
+    # m:1 -> fail loudly on duplicated BLOCK_IDs, which would add rows and misalign them with `raw`
+    df = raw.merge(leg, on="BLOCK_ID", how="left", validate="m:1")
+    # rename explicitly: merge suffixes are only applied to column names that happen to collide
+    unstd_cols = {c: f"{c}_unstd" for c in ("ForCov_meanPerc", "TropTreecovPerc", "BUA_GAIA_Age_Mean")}
+    unstd = unstd.rename(columns={"OBJECTID": "BLOCK_ID", **unstd_cols})
+    df = df.merge(unstd[["BLOCK_ID", *unstd_cols.values()]], on="BLOCK_ID", how="left", validate="m:1")
+
+    # numeric aux (raw scale) -> concept ground truth / theta_k
+    num = {
+        "aux_ndvi_mean": "feat_ndvi_mean2022", "aux_ndwi_mean": "feat_ndwi_mean2022",
+        "aux_forest_cover_perc": "ForCov_meanPerc_unstd", "aux_tree_cover_perc": "TropTreecovPerc_unstd",
+        "aux_builtup_age_years": "BUA_GAIA_Age_Mean_unstd", "aux_slope_perc": "feat_dem5mslopeperc_mean",
+        "aux_socioeconomic": "feat_estrato_s", "aux_lst": "target_lst",
+    }
+    new = {a: df[f] for a, f in num.items()}
+    if "feat_measurement_month" in df.columns:          # present only if you kept raw month
+        new["aux_month"] = df["feat_measurement_month"]
+
+    # word aux (authoritative expert legend) -> caption text; "string" dtype keeps NaN missing, not "nan"
+    new["aux_ndvi_label"]    = df["NDVI_mean2022"].map(cls).map(lambda x: f"{x} vegetation greenness" if x else x)
+    new["aux_ndwi_label"]    = df["NDWI_mean2022"].map(cls).map(lambda x: x if x and "stress" in x else (f"{x} drought stress" if x else x))
+    new["aux_socio_label"]   = df["SocioEconomicQuality"].map(cls).map(lambda x: f"{x} socioeconomic quality" if x else x)
+    new["aux_slope_label"]   = df["DEM5m_Slope%"].map(cls).map(lambda x: x.lower() if x else x)
+    new["aux_forest_label"]  = df["Hansen_ForestCover_meanPerc"].map(cls)
+    new["aux_age_label"]     = df["BUA_GAIA_Age_Mean"].map(cls)
+    new["aux_height_label"]  = df["CopenicusMSZ_BuildingHeightM"].map(cls)   # words OK; raw number is unit-broken
+    new["aux_density_label"] = df["PopulationDensityPerKm2"].map(cls)        # urban-form label
+    new["aux_lst_label"]     = df["LST_mean_predictor_Classified"].map(cls)
+    new["aux_landuse"]       = df["BlockMAGADominantLanduse"].astype("string").str.strip().str.lower()
+    new["aux_blocktype"]     = df["BlockType"].astype("string").str.strip().str.lower()
+    new["aux_interzone"]     = df["IntrZon"].astype("string").str.strip()
+
+    out = pd.concat([raw.drop(columns=["BLOCK_ID"]), pd.DataFrame(new, index=raw.index)], axis=1)
+    out.to_csv(args.out, index=False)
+    print(f"wrote {out.shape[0]} rows, {sum(k.startswith('aux_') for k in out.columns)} aux columns -> {args.out}")
+    print("nulls in aux columns:", int(out[[c for c in out.columns if c.startswith('aux_')]].isna().sum().sum()))
+
+
+if __name__ == "__main__":
+    main()