HUBioDataLab · furkannecatiinan · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026 · Jun 28, 2026
diff --git a/shap/USAGE.md b/shap/USAGE.md
@@ -0,0 +1,84 @@
+# SHAP Vis — Usage
+
+End-to-end pipeline that turns SHAP `.npz` outputs into per-molecule attention
+PNGs, packed into one ZIP per model. `app8.py` is the interactive Streamlit
+version of the same flow.
+
+## Pipeline overview
+
+```
+prepare_csv.py  ->  molecules.csv      (fetch SMILES from ChEMBL / PubChem)
+generate_images.py  ->  mol_images/    (transparent 300x300 molecule PNGs)
+shap_cli.py     ->  output_zips/<model>.zip   (terminal version of app7/app8 "Download ZIP")
+```
+
+## Requirements
+
+```bash
+pip install numpy opencv-python matplotlib pillow rdkit pandas requests \
+            chembl_webresource_client streamlit
+```
+
+## 1. Build `molecules.csv`
+
+Recursively scans `shap_numpy_data/` for files named `<molecule_id>_0.npz`,
+collects the unique molecule IDs, and looks up each SMILES (ChEMBL first,
+PubChem fallback).
+
+```bash
+python prepare_csv.py          # writes ./molecules.csv
+```
+
+CSV columns: `molecule_id, smiles`. The `smiles` cell is left empty for any ID
+that neither ChEMBL nor PubChem could resolve.
+
+## 2. Generate molecule images
+
+Renders transparent-background PNGs from the SMILES list.
+
+```bash
+python generate_images.py --input molecules.csv --output_dir ./mol_images
+```
+
+Missing PNGs are also generated on demand by `shap_cli.py`, so this step is
+optional if `molecules.csv` exists.
+
+## 3. Render attention ZIPs
+
+Provide either a single model folder or a parent folder of model subfolders.
+
+```bash
+# single model folder -> CHEMBL301_cnn.zip
+python shap_cli.py -i shap_numpy_data/CHEMBL301_cnn -o ./output_zips
+
+# parent folder -> one ZIP per subfolder
+python shap_cli.py -i shap_numpy_data -o ./output_zips
+```
+
+Key options (defaults match the app7/app8 sliders):
+
+| Flag | Default | Meaning |
+|------|---------|---------|
+| `--input`, `-i` | — | NPZ folder or parent of model subfolders (required) |
+| `--output_dir`, `-o` | `./output_zips` | Where ZIPs are written |
+| `--mol_images_dir` | `./mol_images` | Transparent molecule PNGs |
+| `--molecules_csv` | `./molecules.csv` | SMILES source for missing PNGs |
+| `--hotspot_p` | `95.0` | Focus threshold (Top %) |
+| `--blur_sigma` | `5.7` | Smoothing |
+| `--gamma` | `1.50` | Intensity |
+| `--alpha` | `0.75` | Opacity |
+
+## Run everything at once
+
+```bash
+./run_pipeline.sh                              # defaults
+./run_pipeline.sh --input shap_numpy_data
+./run_pipeline.sh --skip-csv --skip-images     # only re-render the ZIPs
+```
+
+## Interactive version
+
+```bash
+streamlit run app8.py
+```
+
+Same parameters exposed as sliders, with a "Download ZIP" button equivalent to
+`shap_cli.py`.
diff --git a/shap/generate_images.py b/shap/generate_images.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Molecule Image Generator — Transparent Background
+=================================================
+Renders 300x300 PNGs with a transparent background from a list of SMILES.
+Output: ./mol_images/<molecule_id>.png
+
+Usage:
+    python generate_images.py --input molecules.csv --output_dir ./mol_images
+
+CSV format (required columns):
+    molecule_id, smiles
+    CHEMBL100675, CCOc1ccc(...)cc1
+"""
+
+import os
+import argparse
+import numpy as np
+import pandas as pd
+from io import BytesIO
+
+try:
+    from rdkit import Chem
+    from rdkit.Chem import Draw
+    from rdkit.Chem.Draw import rdMolDraw2D
+    HAS_RDKIT = True
+except ImportError:
+    HAS_RDKIT = False
+    print("⚠️  RDKit bulunamadı. 'pip install rdkit' ile kurun.")
+
+try:
+    from PIL import Image
+    HAS_PIL = True
+except ImportError:
+    HAS_PIL = False
+    print("⚠️  Pillow bulunamadı. 'pip install Pillow' ile kurun.")
+
+
+def smiles_to_transparent_png(smiles: str, size: int = 300) -> "Image":
+    """
+    SMILES'tan saydam arka planlı PIL Image üretir.
+    Renk bazlı ayırt etme: N (mavi), O (kırmızı), Br (kahve) gibi atom etiketleri
+    tam opak korunur; beyaz/gri arka plan kademeli olarak saydam yapılır.
+    """
+    mol = Chem.MolFromSmiles(smiles)
+    if mol is None:
+        raise ValueError(f"Geçersiz SMILES: {smiles}")
+
+    # RDKit SVG/PNG renderer — beyaz arka plan
+    drawer = rdMolDraw2D.MolDraw2DCairo(size, size)
+    drawer.drawOptions().clearBackground = True
+    drawer.DrawMolecule(mol)
+    drawer.FinishDrawing()
+
+    png_bytes = drawer.GetDrawingText()
+    img = Image.open(BytesIO(png_bytes)).convert("RGBA")
+
+    data = np.array(img, dtype=float)
+    r = data[:, :, 0]
+    g = data[:, :, 1]
+    b = data[:, :, 2]
+
+    # Parlaklık: pikselin ne kadar açık olduğu
+    brightness = (r + g + b) / 3.0
+
+    # Renklililik: R/G/B kanalları arasındaki maksimum fark
+    # Yüksekse renkli piksel (N, O, Br atom etiketleri), düşükse gri/beyaz (arka plan)
+    colorfulness = (np.max(data[:, :, :3], axis=2) -
+                    np.min(data[:, :, :3], axis=2))
+
+    # Arka plan maskesi: hem açık (brightness > 200) hem renksiz (colorfulness < 15)
+    is_background = (brightness > 200) & (colorfulness < 15)
+
+    # Alpha hesaplama:
+    # - Arka plan: brightness'a göre kademeli saydamlık (anti-aliasing kenarları için)
+    # - Renkli/koyu pikseller: tam opak (255)
+    background_alpha = np.clip((255.0 - brightness) * 3.0, 0, 255)
+    alpha_channel = np.where(is_background, background_alpha, 255.0)
+
+    data[:, :, 3] = alpha_channel
+    return Image.fromarray(data.astype(np.uint8))
+
+
+def generate_from_csv(input_csv: str, output_dir: str, size: int = 300):
+    """CSV dosyasından toplu görüntü üretir."""
+    os.makedirs(output_dir, exist_ok=True)
+    df = pd.read_csv(input_csv)
+
+    required = {'molecule_id', 'smiles'}
+    if not required.issubset(df.columns):
+        raise ValueError(f"CSV'de şu sütunlar olmalı: {required}. Bulunanlar: {list(df.columns)}")
+
+    success, fail = 0, 0
+    for _, row in df.iterrows():
+        mol_id = str(row['molecule_id'])
+        smiles = str(row['smiles'])
+        out_path = os.path.join(output_dir, f"{mol_id}.png")
+
+        try:
+            img = smiles_to_transparent_png(smiles, size=size)
+            img.save(out_path, format="PNG")
+            print(f"✓ {mol_id} → {out_path}")
+            success += 1
+        except Exception as e:
+            print(f"✗ {mol_id}: {e}")
+            fail += 1
+
+    print(f"\n✅ Tamamlandı: {success} başarılı, {fail} başarısız.")
+
+
+def generate_from_dict(molecules: dict, output_dir: str, size: int = 300):
+    """
+    Dict'ten toplu görüntü üretir.
+    molecules = {"CHEMBL100675": "CCO...", "DILI1": "c1ccc..."}
+    """
+    os.makedirs(output_dir, exist_ok=True)
+    success, fail = 0, 0
+
+    for mol_id, smiles in molecules.items():
+        out_path = os.path.join(output_dir, f"{mol_id}.png")
+        try:
+            img = smiles_to_transparent_png(smiles, size=size)
+            img.save(out_path, format="PNG")
+            print(f"✓ {mol_id} → {out_path}")
+            success += 1
+        except Exception as e:
+            print(f"✗ {mol_id}: {e}")
+            fail += 1
+
+    print(f"\n✅ Tamamlandı: {success} başarılı, {fail} başarısız.")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Saydam arka planlı molekül PNG üretici")
+    parser.add_argument("--input", type=str, required=True,
+                        help="CSV dosyası (sütunlar: molecule_id, smiles)")
+    parser.add_argument("--output_dir", type=str, default="./mol_images",
+                        help="Çıktı klasörü (varsayılan: ./mol_images)")
+    parser.add_argument("--size", type=int, default=300,
+                        help="Görüntü boyutu piksel (varsayılan: 300)")
+    args = parser.parse_args()
+
+    if not HAS_RDKIT or not HAS_PIL:
+        print("Eksik bağımlılık. Çıkılıyor.")
+        return
+
+    generate_from_csv(args.input, args.output_dir, size=args.size)
+
+
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
diff --git a/shap/prepare_csv.py b/shap/prepare_csv.py
@@ -0,0 +1,60 @@
+import os
+import glob
+import pandas as pd
+import requests
+from chembl_webresource_client.new_client import new_client
+
+def get_smiles_from_chembl(chembl_id):
+    """ChEMBL ID'den SMILES çeker."""
+    try:
+        molecule = new_client.molecule
+        res = molecule.filter(molecule_chembl_id=chembl_id).only(['molecule_structures'])
+        if res:
+            return res[0]['molecule_structures']['canonical_smiles']
+    except Exception:
+        return None
+    return None
+
+def get_smiles_from_pubchem(name):
+    """DILI veya diğer ID'leri PubChem üzerinden aramayı dener."""
+    try:
+        url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{name}/property/CanonicalSMILES/JSON"
+        response = requests.get(url, timeout=5)
+        if response.status_status == 200:
+            return response.json()['PropertyTable']['Properties'][0]['CanonicalSMILES']
+    except Exception:
+        return None
+    return None
+
+def update_csv_with_smiles(data_dir, output_csv="molecules.csv"):
+    print(f"🔍 {data_dir} taranıyor...")
+    search_pattern = os.path.join(data_dir, "**", "*_0.npz")
+    files = glob.glob(search_pattern, recursive=True)
+
+    mol_ids = sorted(list(set([os.path.basename(f).replace("_0.npz", "") for f in files])))
+    print(f"Bulunan benzersiz molekül sayısı: {len(mol_ids)}")
+
+    results = []
+    for m_id in mol_ids:
+        print(f"📡 Veri çekiliyor: {m_id}...", end=" ", flush=True)
+
+        # 1. Yol: ChEMBL API
+        smiles = get_smiles_from_chembl(m_id)
+
+        # 2. Yol: PubChem (ChEMBL bulamazsa veya DILI id'si ise)
+        if not smiles:
+            smiles = get_smiles_from_pubchem(m_id)
+
+        if smiles:
+            print("✅ Bulundu")
+        else:
+            print("❌ Bulunamadı")
+
+        results.append({"molecule_id": m_id, "smiles": smiles})
+
+    df = pd.DataFrame(results)
+    df.to_csv(output_csv, index=False)
+    print(f"\n📁 İşlem bitti! '{output_csv}' kontrol edebilirsin.")
+
+# Script entry point: scan ./shap_numpy_data and write the default
+# molecules.csv (relative to the current working directory).
+if __name__ == "__main__":
+    update_csv_with_smiles("./shap_numpy_data")