diff --git a/docs/guides/add-estimation-package.md b/docs/guides/add-estimation-package.md
index 3f75220..0babb67 100644
--- a/docs/guides/add-estimation-package.md
+++ b/docs/guides/add-estimation-package.md
@@ -45,6 +45,9 @@
   - `overlap_max_basic.sh`
   - `gpu_kernel_lightgbm_v10.sh`
   - `gpu_kernel_mlp_v15.sh`
+  - `gpu_kernel_mlp_v21.sh`
+  - `gpu_kernel_mlp_v40.sh`
+  - `gpu_kernel_mlp_v41.sh`
 
 ## 3. top-level package の責務
 
@@ -75,6 +78,18 @@ GPU kernel 単位の外部推定ツールは、通常は section package とし
 - `gpu_kernel_mlp_v15`
   - PerfTools `MLP_NN/v1.5`
   - 主な依存: numpy/pandas/torch
+- `gpu_kernel_mlp_v21`
+  - PerfTools `MLP_NN/v2.1`
+  - v1.5 NN と analytical anchor を組み合わせた hybrid/reference 系
+  - 主な依存: numpy/pandas/torch
+- `gpu_kernel_mlp_v40`
+  - PerfTools `MLP_NN/v4.0`
+  - no-ET pure NN 系
+  - 主な依存: numpy/pandas/torch
+- `gpu_kernel_mlp_v41`
+  - PerfTools `MLP_NN/v4.1`
+  - v4.0 に single-axis trend 対応を加えた NN 系
+  - 主な依存: numpy/pandas/torch
 - `gpu_kernel_lightgbm_v10`
   - PerfTools `LightGBM_model/1.0`
   - 主な依存: numpy/pandas/lightgbm/pyyaml と `libgomp`
diff --git a/programs/genesis/README.md b/programs/genesis/README.md
index 0312566..a66a188 100644
--- a/programs/genesis/README.md
+++ b/programs/genesis/README.md
@@ -124,13 +124,19 @@ Single-package selection:
 ```bash
 BK_GENESIS_GPU_SECTION_PACKAGE=gpu_kernel_mlp_v15
 # or
+BK_GENESIS_GPU_SECTION_PACKAGE=gpu_kernel_mlp_v21
+# or
+BK_GENESIS_GPU_SECTION_PACKAGE=gpu_kernel_mlp_v40
+# or
+BK_GENESIS_GPU_SECTION_PACKAGE=gpu_kernel_mlp_v41
+# or
 BK_GENESIS_GPU_SECTION_PACKAGE=gpu_kernel_lightgbm_v10
 ```
 
 Multiple-package comparison:
 
 ```bash
-BK_GENESIS_GPU_SECTION_PACKAGES=gpu_kernel_lightgbm_v10,gpu_kernel_mlp_v15
+BK_GENESIS_GPU_SECTION_PACKAGES=gpu_kernel_lightgbm_v10,gpu_kernel_mlp_v15,gpu_kernel_mlp_v21,gpu_kernel_mlp_v40,gpu_kernel_mlp_v41
 ```
 
 When multiple packages are selected, the app wrapper asks for
diff --git a/programs/genesis/estimate.sh b/programs/genesis/estimate.sh
index 2fffbd7..1bd4d7c 100644
--- a/programs/genesis/estimate.sh
+++ b/programs/genesis/estimate.sh
@@ -9,7 +9,7 @@ genesis_gpu_section_packages() {
   elif [[ -n "${BK_GENESIS_GPU_SECTION_PACKAGE:-}" ]]; then
     raw="$BK_GENESIS_GPU_SECTION_PACKAGE"
   else
-    raw="gpu_kernel_lightgbm_v10,gpu_kernel_mlp_v15"
+    raw="gpu_kernel_lightgbm_v10,gpu_kernel_mlp_v15,gpu_kernel_mlp_v21,gpu_kernel_mlp_v40,gpu_kernel_mlp_v41"
   fi
 
   printf '%s\n' "$raw" |
@@ -340,7 +340,6 @@ genesis_run_single_estimate() {
     "${BK_ESTIMATION_CURRENT_TARGET_NODES:-1}" \
     "${BK_ESTIMATION_CURRENT_PACKAGE:-weakscaling}"
   est_current_fom="${est_current_bench_fom:-$est_current_fom}"
-  est_current_fom_breakdown=""
 
   if [[ "$synthetic_breakdown" -eq 1 ]]; then
     genesis_mark_gpu_section_time_missing
diff --git a/result_server/templates/_estimated_breakdown_card.html b/result_server/templates/_estimated_breakdown_card.html
index c63b97d..ba39f2a 100644
--- a/result_server/templates/_estimated_breakdown_card.html
+++ b/result_server/templates/_estimated_breakdown_card.html
@@ -10,13 +10,92 @@
     {% endif %}
 {%- endmacro %}
 
+{% macro render_kernel_package_comparisons(item) -%}
+    {# Renders one table per entry of item.metrics.kernel_summaries: a row per estimation package, #}
+    {# followed by a nested per-metric table when the package carries metric_comparisons. #}
+    {# A key that is present with value None (e.g. a JSON null) makes dict.get(key, default) return None, #}
+    {# so None is mapped to 'N/A' explicitly (0 and '' stay as real values) and absent lists become empty. #}
+    {% set kernel_summaries = (item.get('metrics') or {}).get('kernel_summaries') or [] %}
+    {% if kernel_summaries %}
+    <div class="kernel-comparisons">
+        <div class="kernel-comparisons-title">Kernel package comparison; metrics are shown as reported by each package.</div>
+        {% for kernel in kernel_summaries %}
+        <div class="kernel-comparison">
+            <div class="kernel-name">{{ kernel.get('name', 'N/A') }}</div>
+            <table class="kernel-package-table">
+                <thead>
+                    <tr>
+                        <th>Package</th>
+                        <th>Samples</th>
+                        <th>Source Mean (ns)</th>
+                        <th>Predicted Mean (ns)</th>
+                        <th>Ratio</th>
+                        <th>Source GPU</th>
+                        <th>Target GPU</th>
+                    </tr>
+                </thead>
+                <tbody>
+                {% for package in (kernel.get('package_summaries') or []) %}
+                    <tr>
+                        {% for key in ['estimation_package', 'sample_count', 'source_time_ns_mean', 'predicted_time_ns_mean', 'mean_time_ratio_predicted_over_source'] %}
+                        <td>{{ package.get(key) if package.get(key) is not none else 'N/A' }}</td>
+                        {% endfor %}
+                        <td>{{ (package.get('source_gpus') or []) | join(', ') }}</td>
+                        <td>{{ (package.get('target_gpus') or []) | join(', ') }}</td>
+                    </tr>
+                    {% if package.get('metric_comparisons') %}
+                    <tr>
+                        <td colspan="7">
+                            <div class="kernel-metrics">
+                                <div class="kernel-metrics-summary">{{ package.get('estimation_package', 'N/A') }} metrics</div>
+                                <table class="kernel-metrics-table">
+                                    <thead>
+                                        <tr>
+                                            <th>Metric</th>
+                                            <th>Samples</th>
+                                            <th>Source Mean</th>
+                                            <th>Predicted Mean</th>
+                                            <th>Ratio</th>
+                                        </tr>
+                                    </thead>
+                                    <tbody>
+                                    {% for metric in package.get('metric_comparisons', []) %}
+                                        <tr>
+                                            {% for key in ['name', 'sample_count', 'source_value_mean', 'predicted_value_mean', 'ratio_predicted_over_source_mean'] %}
+                                            <td>{{ metric.get(key) if metric.get(key) is not none else 'N/A' }}</td>
+                                            {% endfor %}
+                                        </tr>
+                                    {% endfor %}
+                                    </tbody>
+                                </table>
+                            </div>
+                        </td>
+                    </tr>
+                    {% endif %}
+                {% endfor %}
+                </tbody>
+            </table>
+        </div>
+        {% endfor %}
+    </div>
+    {% endif %}
+{%- endmacro %}
+
 {% macro render_breakdown_table(heading, items, first_column_label, first_column_key, join_list_values=False) -%}
     {% if items %}
     <h3>{{ heading }}</h3>
     <div class="table-wrap">
     <table class="breakdown-table">
         <thead>
-            <tr><th>{{ first_column_label }}</th><th>Time</th><th>Package</th><th>Scaling</th><th>Fallback</th><th>Applicability</th></tr>
+            <tr>
+                <th>{{ first_column_label }}</th>
+                <th>Bench Time</th>
+                <th>Estimated Time</th>
+                <th>Package</th>
+                <th>Scaling</th>
+                <th>Fallback</th>
+                <th>Applicability</th>
+            </tr>
         </thead>
         <tbody>
         {% for item in items %}
@@ -26,6 +105,7 @@ <h3>{{ heading }}</h3>
             {% endif %}
             <tr>
                 <td>{{ first_value }}</td>
+                <td>{{ item.get('bench_time', item.get('time', 'N/A')) }}</td>
                 <td>{{ item.get('time', 'N/A') }}</td>
                 <td>{{ item.get('estimation_package', 'N/A') }}</td>
                 <td>{{ item.get('scaling_method', 'N/A') }}</td>
@@ -34,7 +114,7 @@ <h3>{{ heading }}</h3>
             </tr>
             {% if item.get('candidate_estimates') %}
             <tr>
-                <td colspan="6">
+                <td colspan="7">
                     <div class="candidate-estimates">
                         <div class="candidate-estimates-title">Candidate estimates; mean time is used for FOM composition.</div>
                         <table class="candidate-estimates-table">
@@ -57,6 +137,13 @@ <h3>{{ heading }}</h3>
                 </td>
             </tr>
             {% endif %}
+            {% if item.get('metrics', {}).get('kernel_summaries') %}
+            <tr>
+                <td colspan="7">
+                    {{ render_kernel_package_comparisons(item) }}
+                </td>
+            </tr>
+            {% endif %}
         {% endfor %}
         </tbody>
     </table>
diff --git a/result_server/templates/estimated_detail.html b/result_server/templates/estimated_detail.html
index 2e26b4c..68c0835 100644
--- a/result_server/templates/estimated_detail.html
+++ b/result_server/templates/estimated_detail.html
@@ -14,6 +14,28 @@
         .detail-table { margin-bottom: 20px; }
         .detail-table th { text-align: left; min-width: 200px; background-color: #eef6f8; }
         .detail-table td { min-width: 260px; white-space: normal; }
+        .system-comparison-table {
+            width: 100%;
+            table-layout: fixed;
+            margin-bottom: 20px;
+        }
+        .system-comparison-table th {
+            text-align: left;
+            background-color: #eef6f8;
+        }
+        .system-comparison-table th,
+        .system-comparison-table td {
+            padding: 8px 10px;
+            border: 1px solid #d8e3e8;
+            white-space: normal;
+            overflow-wrap: anywhere;
+        }
+        .system-comparison-table th:first-child,
+        .system-comparison-table td:first-child {
+            width: 26%;
+            font-weight: 700;
+            color: #102a43;
+        }
         .applicability-summary {
             margin-bottom: 20px;
             padding: 16px 18px;
@@ -93,6 +115,63 @@
             padding: 5px 7px;
             border: 1px solid #e2e8f0;
         }
+        .kernel-comparisons {
+            margin-top: 10px;
+            padding: 10px 12px;
+            border-radius: 10px;
+            background: #f8fafc;
+            border: 1px solid #e2e8f0;
+        }
+        .kernel-comparisons-title {
+            margin-bottom: 8px;
+            color: #475569;
+            font-size: 12px;
+            font-weight: 600;
+        }
+        .kernel-comparison {
+            margin-top: 10px;
+            padding-top: 10px;
+            border-top: 1px solid #e2e8f0;
+        }
+        .kernel-comparison:first-of-type {
+            margin-top: 0;
+            padding-top: 0;
+            border-top: 0;
+        }
+        .kernel-name {
+            margin-bottom: 6px;
+            font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+            font-size: 12px;
+            color: #334155;
+            overflow-wrap: anywhere;
+        }
+        .kernel-package-table,
+        .kernel-metrics-table {
+            width: 100%;
+            border-collapse: collapse;
+            font-size: 12px;
+        }
+        .kernel-package-table th,
+        .kernel-package-table td,
+        .kernel-metrics-table th,
+        .kernel-metrics-table td {
+            padding: 5px 7px;
+            border: 1px solid #e2e8f0;
+            vertical-align: top;
+        }
+        .kernel-package-table th,
+        .kernel-metrics-table th {
+            background: #eef6f8;
+        }
+        .kernel-metrics {
+            margin-top: 8px;
+        }
+        .kernel-metrics-summary {
+            margin-bottom: 5px;
+            color: #475569;
+            font-size: 12px;
+            font-weight: 600;
+        }
         .empty-note { color: #6b7280; font-size: 13px; }
     </style>
 
@@ -121,9 +200,29 @@ <h2>Applicability Summary</h2>
             {{ render_json_block("Confidence", confidence_json) }}
         </div>
 
-        {{ render_titled_key_value_table("Current System", current_rows, "detail-table", "detail-table-wrap", "detail-card") }}
-
-        {{ render_titled_key_value_table("Future System", future_rows, "detail-table", "detail-table-wrap", "detail-card") }}
+        <div class="detail-card">
+            <h2>System Comparison</h2>
+            <div class="detail-table-wrap">
+                <table class="system-comparison-table">
+                    <thead>
+                        <tr>
+                            <th>Item</th>
+                            <th>Current System</th>
+                            <th>Future System</th>
+                        </tr>
+                    </thead>
+                    <tbody>
+                    {% for row in system_comparison_rows %}
+                        <tr>
+                            <td>{{ row.label }}</td>
+                            <td{% if row.current_class %} class="{{ row.current_class }}"{% endif %}>{{ row.current }}</td>
+                            <td{% if row.future_class %} class="{{ row.future_class }}"{% endif %}>{{ row.future }}</td>
+                        </tr>
+                    {% endfor %}
+                    </tbody>
+                </table>
+            </div>
+        </div>
     </div>
 
     <div class="detail-card">
diff --git a/result_server/tests/test_estimated_detail_template.py b/result_server/tests/test_estimated_detail_template.py
index c57c0e1..ce1e461 100644
--- a/result_server/tests/test_estimated_detail_template.py
+++ b/result_server/tests/test_estimated_detail_template.py
@@ -73,7 +73,7 @@
         "fom": 0.944,
         "target_nodes": "1024",
         "scaling_method": "weakscaling",
-        "benchmark": {"system": "Fugaku", "fom": 0.386, "nodes": "1"},
+        "benchmark": {"system": "Fugaku", "fom": 0.386, "nodes": "1", "numproc_node": "4"},
         "model": {"name": "weakscaling-current", "type": "intra_system_scaling_model"},
         "fom_breakdown": {
             "sections": [
@@ -109,7 +109,7 @@
         "fom": 9.054,
         "target_nodes": "256",
         "scaling_method": "instrumented-app-sections-dummy",
-        "benchmark": {"system": "MiyabiG", "fom": 5.712, "nodes": "1"},
+        "benchmark": {"system": "MiyabiG", "fom": 5.712, "nodes": "1", "numproc_node": "8"},
         "model": {"name": "instrumented-app-sections-future-projection", "type": "cross_system_projection_model"},
         "fom_breakdown": {
             "sections": [
@@ -140,6 +140,51 @@
                             "metrics": {"time_ratio_predicted_over_source": 2.6666666667},
                         },
                     ],
+                    "metrics": {
+                        "kernel_summaries": [
+                            {
+                                "name": "kern_build_pairlist",
+                                "package_summaries": [
+                                    {
+                                        "estimation_package": "gpu_kernel_lightgbm_v10",
+                                        "sample_count": 1,
+                                        "source_time_ns_mean": 310816,
+                                        "predicted_time_ns_mean": 294442.45,
+                                        "mean_time_ratio_predicted_over_source": 0.9473,
+                                        "source_gpus": ["H100"],
+                                        "target_gpus": ["GB200"],
+                                        "metric_comparisons": [
+                                            {
+                                                "name": "O-Memory Throughput [%]",
+                                                "sample_count": 1,
+                                                "source_value_mean": 52.12,
+                                                "predicted_value_mean": 49.10,
+                                                "ratio_predicted_over_source_mean": 0.9421,
+                                            }
+                                        ],
+                                    },
+                                    {
+                                        "estimation_package": "gpu_kernel_mlp_v15",
+                                        "sample_count": 5,
+                                        "source_time_ns_mean": 159104,
+                                        "predicted_time_ns_mean": 70311.2,
+                                        "mean_time_ratio_predicted_over_source": 0.4423,
+                                        "source_gpus": ["H100"],
+                                        "target_gpus": ["GB200"],
+                                        "metric_comparisons": [
+                                            {
+                                                "name": "Memory Throughput [%]",
+                                                "sample_count": 5,
+                                                "source_value_mean": 52.55,
+                                                "predicted_value_mean": 41.34,
+                                                "ratio_predicted_over_source_mean": 0.7866,
+                                            }
+                                        ],
+                                    },
+                                ],
+                            }
+                        ]
+                    },
                 }
             ],
             "overlaps": [],
@@ -201,8 +246,10 @@ def test_estimated_detail_template_renders_sections(app):
     assert "Applicability Summary" in html
     assert "Package Resolution" in html
     assert "Re-Estimation Context" in html
+    assert "System Comparison" in html
     assert "Current System" in html
     assert "Future System" in html
+    assert "Benchmark Processes/Node" in html
     assert "Estimate succeeded, but part of the breakdown used fallback handling." in html
     assert "required action: collect-section-specific-package-inputs" in html
     assert "weakscaling" in html
@@ -230,6 +277,15 @@ def test_estimated_detail_template_renders_sections(app):
     assert "overlap_package_unsupported:half" in html
     assert "Candidate estimates" in html
     assert "Time Ratio" in html
+    assert "Bench Time" in html
+    assert "Estimated Time" in html
     assert "gpu_kernel_ensemble_average" in html
     assert "gpu_kernel_lightgbm_v10" in html
     assert "gpu_kernel_mlp_v15" in html
+    assert "Kernel package comparison" in html
+    assert "kern_build_pairlist" in html
+    assert "Source Mean (ns)" in html
+    assert "Predicted Mean (ns)" in html
+    assert "O-Memory Throughput [%]" in html
+    assert "Memory Throughput [%]" in html
+    assert "GB200" in html
diff --git a/result_server/utils/estimated_detail_view.py b/result_server/utils/estimated_detail_view.py
index 5755c2f..52e4084 100644
--- a/result_server/utils/estimated_detail_view.py
+++ b/result_server/utils/estimated_detail_view.py
@@ -20,6 +20,7 @@ def build_estimated_detail_context(result):
         "reestimation_rows": _build_reestimation_rows(reestimation),
         "current_rows": _build_system_rows(current),
         "future_rows": _build_system_rows(future),
+        "system_comparison_rows": _build_system_comparison_rows(current, future),
         "measurement_json": result.get("measurement", {}),
         "confidence_json": result.get("confidence", {}),
         "assumptions_json": result.get("assumptions", {}),
@@ -63,13 +64,19 @@ def _build_package_rows(estimate_meta, applicability):
     current_package = estimate_meta.get("current_package", {})
     future_package = estimate_meta.get("future_package", {})
     rows = build_labeled_value_rows([
-        ("Top-Level Requested", estimate_meta.get("requested_estimation_package", "N/A")),
-        ("Top-Level Applied", estimate_meta.get("estimation_package", "N/A")),
+        ("Top-Level Package", _format_package_resolution(
+            estimate_meta.get("requested_estimation_package", "N/A"),
+            estimate_meta.get("estimation_package", "N/A"),
+        )),
         ("Top-Level Fallback", applicability.get("fallback_used", "none")),
-        ("Current Requested", current_package.get("requested_estimation_package", "N/A")),
-        ("Current Applied", current_package.get("estimation_package", "N/A")),
-        ("Future Requested", future_package.get("requested_estimation_package", "N/A")),
-        ("Future Applied", future_package.get("estimation_package", "N/A")),
+        ("Current Package", _format_package_resolution(
+            current_package.get("requested_estimation_package", "N/A"),
+            current_package.get("estimation_package", "N/A"),
+        )),
+        ("Future Package", _format_package_resolution(
+            future_package.get("requested_estimation_package", "N/A"),
+            future_package.get("estimation_package", "N/A"),
+        )),
     ])
 
     _append_list_row(rows, "Missing Inputs", applicability.get("missing_inputs", []))
@@ -78,6 +85,12 @@ def _build_package_rows(estimate_meta, applicability):
     return rows
 
 
+def _format_package_resolution(requested, applied):
+    """Return the applied package, annotated with the requested one when they differ."""
+    substituted = requested != applied
+    return f"{applied} (requested: {requested})" if substituted else applied
+
+
 def _build_system_rows(system_data):
     benchmark = system_data.get("benchmark", {})
     breakdown = system_data.get("fom_breakdown", {})
@@ -97,6 +110,40 @@ def _build_system_rows(system_data):
     ])
 
 
+def _build_system_comparison_rows(current, future):
+    """Pair the current-system rows with the future-system rows that share a label."""
+    current_entries = _build_comparison_system_rows(current)
+    future_lookup = {entry["label"]: entry for entry in _build_comparison_system_rows(future)}
+    paired = []
+    for entry in current_entries:
+        # A label with no future-side row falls back to "N/A" and an empty class.
+        counterpart = future_lookup.get(entry["label"], {})
+        paired.append({
+            "label": entry["label"],
+            "current": entry.get("value", "N/A"),
+            "future": counterpart.get("value", "N/A"),
+            "current_class": entry.get("value_class", ""),
+            "future_class": counterpart.get("value_class", ""),
+        })
+    return paired
+
+
+def _build_comparison_system_rows(system_data):
+    """Build the labeled rows describing one system for the comparison table."""
+    bench = system_data.get("benchmark", {})
+    return build_labeled_value_rows([
+        ("System", system_data.get("system", "N/A")),
+        ("FOM", format_numeric_value(system_data.get("fom", "N/A"))),
+        ("Target Nodes", system_data.get("target_nodes", "N/A")),
+        ("Benchmark System", bench.get("system", "N/A")),
+        ("Benchmark FOM", format_numeric_value(bench.get("fom", "N/A"))),
+        ("Benchmark Nodes", bench.get("nodes", "N/A")),
+        ("Benchmark Processes/Node", bench.get("numproc_node", "N/A")),
+        ("Sections", len(system_data.get("fom_breakdown", {}).get("sections", []))),
+        ("Overlaps", len(system_data.get("fom_breakdown", {}).get("overlaps", []))),
+    ])
+
+
 def _build_reestimation_rows(reestimation):
     if not reestimation:
         return []
diff --git a/scripts/estimation/common.sh b/scripts/estimation/common.sh
index 157c23b..7ec9158 100644
--- a/scripts/estimation/common.sh
+++ b/scripts/estimation/common.sh
@@ -282,6 +282,8 @@ bk_estimation_run_recorded_current_with_weakscaling() {
   local current_package="${4:-${BK_ESTIMATION_CURRENT_PACKAGE:-weakscaling}}"
   local current_model_version=""
   local baseline_breakdown=""
+  local current_breakdown_total=""
+  local current_breakdown_factor=""
 
   bk_estimation_load_package "$current_package"
   current_model_version="${BK_ESTIMATION_PACKAGE_VERSION:-0.1}"
@@ -305,6 +307,16 @@ bk_estimation_run_recorded_current_with_weakscaling() {
     "1" \
     "identity" \
     "identity")
+  if [[ -n "$est_current_fom_breakdown" && "$est_current_fom_breakdown" != "null" && -n "${est_current_bench_fom:-}" ]]; then # only when a breakdown and a benchmark FOM are both available
+    current_breakdown_total=$(bk_top_level_breakdown_total_time "$est_current_fom_breakdown")
+    if awk -v source="$current_breakdown_total" 'BEGIN {exit !(source + 0 != 0)}'; then # numeric test: "", "null", "0.0" or non-numeric text must not reach the division below
+      current_breakdown_factor=$(awk -v target="$est_current_bench_fom" -v source="$current_breakdown_total" 'BEGIN {printf "%.12f", target / source}') # factor = benchmark FOM / breakdown total
+      est_current_fom_breakdown=$(bk_top_level_scale_breakdown_times \
+        "$est_current_fom_breakdown" \
+        "$current_breakdown_factor" \
+        "$current_package")
+    fi
+  fi
   est_current_fom=$(bk_top_level_breakdown_total_time "$est_current_fom_breakdown")
   if declare -F bk_estimation_package_build_recorded_current_model_json >/dev/null 2>&1; then
     est_current_model_json=$(bk_estimation_package_build_recorded_current_model_json "$baseline_system" "$current_model_version")
diff --git a/scripts/estimation/packages/instrumented_app_sections_dummy.sh b/scripts/estimation/packages/instrumented_app_sections_dummy.sh
index ecbe9bd..a59a263 100644
--- a/scripts/estimation/packages/instrumented_app_sections_dummy.sh
+++ b/scripts/estimation/packages/instrumented_app_sections_dummy.sh
@@ -34,6 +34,9 @@ bk_estimation_package_metadata() {
     "gpu_kernel_ensemble_average",
     "gpu_kernel_lightgbm_v10",
     "gpu_kernel_mlp_v15",
+    "gpu_kernel_mlp_v21",
+    "gpu_kernel_mlp_v40",
+    "gpu_kernel_mlp_v41",
     "logp"
   ],
   "supported_overlap_packages": [
diff --git a/scripts/estimation/prepare_gpu_mlp_ncu_input.py b/scripts/estimation/prepare_gpu_mlp_ncu_input.py
index 6178022..8c73442 100644
--- a/scripts/estimation/prepare_gpu_mlp_ncu_input.py
+++ b/scripts/estimation/prepare_gpu_mlp_ncu_input.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-"""Prepare a PerfTools MLP_NN/v1.5 input CSV from an Nsight Compute archive.
+"""Prepare a PerfTools MLP_NN input CSV from an Nsight Compute archive.
 
 This is a small compatibility bridge for BenchKit.  It converts the wide
 Nsight Compute raw CSV exported from ``profile.ncu-rep`` into the CSV layout
@@ -354,7 +354,7 @@ def finalize_prepared_input(
         "TPC.TriageCompute.sm__inst_executed_realtime.avg.per_cycle_active",
     ).reset_index(drop=True)
     if "Executed Ipc Active [inst/cycle]" in df.columns:
-        df["Executed Ipc Active [inst/cycle]"] = ipc.iloc[: len(df)].to_numpy()
+        df["Executed Ipc Active [inst/cycle]"] = ipc.reindex(df.index).to_numpy()
         mean_ipc = df["Executed Ipc Active [inst/cycle]"].mean()
         df["Executed Ipc Active [inst/cycle]"] = df[
             "Executed Ipc Active [inst/cycle]"
@@ -411,7 +411,8 @@ def main() -> None:
             allowed_nan=ALLOWED_NAN_COLUMNS | set(args.allow_nan),
             target_gpu=args.target_gpu,
         )
-        print(f"wrote {out_csv}: {kernel_count} kernels")
+        final_count = len(pd.read_csv(out_csv))
+        print(f"wrote {out_csv}: {final_count} kernels")
     finally:
         if work_dir_owned and not args.keep_work:
             shutil.rmtree(work_dir, ignore_errors=True)
diff --git a/scripts/estimation/section_packages/gpu_kernel_ensemble_average.sh b/scripts/estimation/section_packages/gpu_kernel_ensemble_average.sh
index 1be6f3d..d837c54 100644
--- a/scripts/estimation/section_packages/gpu_kernel_ensemble_average.sh
+++ b/scripts/estimation/section_packages/gpu_kernel_ensemble_average.sh
@@ -165,7 +165,10 @@ bk_section_package_transform_gpu_kernel_ensemble_average() {
                   target_gpu: ($kernel.target_gpu // null),
                   estimation_package: ($candidate.estimation_package // ""),
                   predicted_time_ns: ($kernel.predicted_time_ns // null),
-                  time_ratio_predicted_over_source: $time_ratio
+                  time_ratio_predicted_over_source: $time_ratio,
+                  source_metrics: ($kernel.source_metrics // {}),
+                  predicted_metrics: ($kernel.metrics // {}),
+                  metric_comparisons: ($kernel.metric_comparisons // [])
                 }
             )
         )
@@ -177,6 +180,35 @@ bk_section_package_transform_gpu_kernel_ensemble_average() {
     | (blocking_candidates | length) as $blocking_count
     | ($candidates | length) as $candidate_count
     | ($usable | map(candidate_time_ratio) | map(select(. != null and . > 0))) as $usable_ratios
+    | (.bench_time // .time // null) as $app_section_time
+    | (
+        $candidates
+        | map(
+            . as $candidate
+            | (candidate_time_ratio) as $ratio
+            | {
+                estimation_package: ($candidate.estimation_package // ""),
+                scaling_method: ($candidate.scaling_method // ""),
+                applicability_status: ($candidate.package_applicability.status // ""),
+                source_section_time: $app_section_time,
+                projected_section_time: ($candidate.time // null),
+                time_ratio_predicted_over_source: $ratio,
+                source_gpus: ($candidate.metrics.source_gpus // []),
+                target_gpus: ($candidate.metrics.target_gpus // []),
+                kernel_count: ($candidate.metrics.kernel_count // (($candidate.metrics.matched_kernels // $candidate.metrics.kernels // []) | length)),
+                unique_kernel_count: (($candidate.metrics.kernel_names // (($candidate.metrics.matched_kernels // $candidate.metrics.kernels // []) | map(.name // "") | unique)) | length),
+                kernel_names: ($candidate.metrics.kernel_names // (($candidate.metrics.matched_kernels // $candidate.metrics.kernels // []) | map(.name // "") | unique)),
+                ncu_sample: {
+                  kernel_count: ($candidate.metrics.kernel_count // (($candidate.metrics.matched_kernels // $candidate.metrics.kernels // []) | length)),
+                  source_time: ($candidate.metrics.total_source_time // (if ($candidate.metrics.total_source_time_ns // null) != null then ($candidate.metrics.total_source_time_ns / 1000000000) else null end)),
+                  source_time_ns: ($candidate.metrics.total_source_time_ns // null),
+                  predicted_time: ($candidate.metrics.sample_predicted_time // (if ($candidate.metrics.total_predicted_time_ns // null) != null then ($candidate.metrics.total_predicted_time_ns / 1000000000) else null end)),
+                  predicted_time_ns: ($candidate.metrics.total_predicted_time_ns // null)
+                },
+                artifacts: ($candidate.artifacts // [])
+              }
+          )
+      ) as $package_summaries
     | ($usable | candidate_kernel_records) as $kernel_records
     | ($kernel_records | map(.name) | unique | sort) as $kernel_names
     | ($kernel_names | length) as $unique_kernel_count
@@ -207,14 +239,69 @@ bk_section_package_transform_gpu_kernel_ensemble_average() {
               }
           )
       ) as $kernel_means
+    | (
+        $kernel_records
+        | sort_by(.name)
+        | group_by(.name)
+        | map(
+            . as $kernel_group
+            | {
+                name: $kernel_group[0].name,
+                package_summaries: (
+                  $kernel_group
+                  | sort_by(.estimation_package)
+                  | group_by(.estimation_package)
+                  | map(
+                      . as $package_group
+                      | ($package_group | map(.source_time_ns) | map(select(. != null))) as $source_times_ns
+                      | ($package_group | map(.predicted_time_ns) | map(select(. != null))) as $predicted_times_ns
+                      | ($package_group | map(.time_ratio_predicted_over_source) | map(select(. != null))) as $ratios
+                      | (
+                          $package_group
+                          | map(.metric_comparisons // [])
+                          | add // []
+                          | sort_by(.name)
+                          | group_by(.name)
+                          | map(
+                              . as $metric_group
+                              | ($metric_group | map(.source_value // null) | map(select(. != null))) as $source_values
+                              | ($metric_group | map(.predicted_value // null) | map(select(. != null))) as $predicted_values
+                              | ($metric_group | map(.ratio_predicted_over_source // null) | map(select(. != null))) as $metric_ratios
+                              | {
+                                  name: $metric_group[0].name,
+                                  sample_count: ($metric_group | length),
+                                  source_value_mean: (if ($source_values | length) > 0 then (($source_values | add) / ($source_values | length)) else null end),
+                                  predicted_value_mean: (if ($predicted_values | length) > 0 then (($predicted_values | add) / ($predicted_values | length)) else null end),
+                                  ratio_predicted_over_source_mean: (if ($metric_ratios | length) > 0 then (($metric_ratios | add) / ($metric_ratios | length)) else null end),
+                                  samples: $metric_group
+                                }
+                            )
+                        ) as $metric_comparisons
+                      | {
+                          estimation_package: $package_group[0].estimation_package,
+                          sample_count: ($package_group | length),
+                          source_gpus: ($package_group | map(.source_gpu // empty) | unique | sort),
+                          target_gpus: ($package_group | map(.target_gpu // empty) | unique | sort),
+                          source_time_ns_total: (if ($source_times_ns | length) > 0 then ($source_times_ns | add) else null end),
+                          source_time_ns_mean: (if ($source_times_ns | length) > 0 then (($source_times_ns | add) / ($source_times_ns | length)) else null end),
+                          predicted_time_ns_total: (if ($predicted_times_ns | length) > 0 then ($predicted_times_ns | add) else null end),
+                          predicted_time_ns_mean: (if ($predicted_times_ns | length) > 0 then (($predicted_times_ns | add) / ($predicted_times_ns | length)) else null end),
+                          mean_time_ratio_predicted_over_source: (if ($ratios | length) > 0 then (($ratios | add) / ($ratios | length)) else null end),
+                          metric_comparisons: $metric_comparisons
+                        }
+                    )
+                )
+              }
+          )
+      ) as $kernel_summaries
     | (if ($usable_ratios | length) > 0 then (($usable_ratios | add) / ($usable_ratios | length)) else null end) as $mean_ratio
-    | (.bench_time // .time // null) as $app_section_time
     | ($blocking_count == 0 and $usable_count > 0 and $unique_kernel_count == 1 and $mean_ratio != null and $app_section_time != null) as $can_project_section
     | (if $can_project_section then ($app_section_time * $mean_ratio) else $app_section_time end) as $output_time
     | .
     + {
         estimation_package: (if $can_project_section then "gpu_kernel_ensemble_average" else "identity" end),
         requested_estimation_package: (.requested_estimation_package // "gpu_kernel_ensemble_average"),
+        bench_time: $app_section_time,
         time: $output_time,
         scaling_method: (if $can_project_section then "gpu-kernel-ensemble-average" else "identity" end),
         package_applicability: {
@@ -262,9 +349,11 @@ bk_section_package_transform_gpu_kernel_ensemble_average() {
             time_ratio_predicted_over_source: candidate_time_ratio,
             applicability_status: (.package_applicability.status // "")
           })),
+          package_summaries: $package_summaries,
           kernel_count: ($kernel_records | length),
           unique_kernel_count: $unique_kernel_count,
           kernel_names: $kernel_names,
+          kernel_summaries: $kernel_summaries,
           kernel_candidate_ratios: $kernel_means,
           app_gpu_section_time: $app_section_time,
           mean_time: (if $can_project_section then $output_time else null end),
diff --git a/scripts/estimation/section_packages/gpu_kernel_lightgbm_v10.sh b/scripts/estimation/section_packages/gpu_kernel_lightgbm_v10.sh
index 8f95c65..7d9fedb 100644
--- a/scripts/estimation/section_packages/gpu_kernel_lightgbm_v10.sh
+++ b/scripts/estimation/section_packages/gpu_kernel_lightgbm_v10.sh
@@ -502,6 +502,58 @@ def source_time_by_row(path):
     return [as_number(row.get(time_column)) for row in rows], time_column
 
 
+def source_metric_candidates(metric_name):  # Ordered list of source-CSV column names to try for one predicted metric name.
+    candidates = [metric_name]  # the exact name is always tried first
+    if metric_name.startswith("O-"):
+        candidates.append(metric_name[2:])  # same name without the leading "O-"
+    if metric_name.startswith("brk_"):
+        candidates.append("breakdown_" + metric_name[4:])  # brk_X -> breakdown_X
+    if metric_name.startswith("breakdown_"):
+        candidates.append("brk_" + metric_name[len("breakdown_"):])  # breakdown_X -> brk_X
+    return list(dict.fromkeys(candidates))  # remove duplicates while keeping first-seen order
+
+
+def source_metrics_by_row(path):  # Return one dict per CSV row of source metric values, keyed by predicted metric name.
+    if not path:  # no path given: nothing to compare against
+        return []
+    candidate = Path(path)
+    if not candidate.is_file():  # a missing file is tolerated and yields no source metrics
+        return []
+
+    rows, fieldnames = read_csv_rows(path)  # read_csv_rows is defined elsewhere in this script
+    if not fieldnames:  # CSV without a header
+        return []
+
+    source_rows = []
+    for row in rows:  # one output dict per CSV row, in file order
+        source_metrics = {}
+        for metric_name in metric_columns:  # metric_columns is a module-level name defined outside this function
+            for source_name in source_metric_candidates(metric_name):
+                if source_name in fieldnames:
+                    value = as_number(row.get(source_name))
+                    if value is not None:  # a None from as_number falls through to the next candidate column
+                        source_metrics[metric_name] = value
+                        break  # first usable candidate wins
+        source_rows.append(source_metrics)  # appended even when empty, so list indices match CSV row order
+    return source_rows
+
+
+def metric_comparisons(source_metrics, predicted_metrics):  # Build one comparison record per metric name found in either dict.
+    comparisons = []
+    for metric_name in sorted(set(source_metrics) | set(predicted_metrics)):  # union of both key sets, in sorted order
+        item = {"name": metric_name}  # name is always present; the other keys are optional
+        source_value = source_metrics.get(metric_name)
+        predicted_value = predicted_metrics.get(metric_name)
+        if source_value is not None:
+            item["source_value"] = source_value
+        if predicted_value is not None:
+            item["predicted_value"] = predicted_value
+        if source_value not in (None, 0) and predicted_value is not None:  # ratio needs both values and a nonzero source
+            item["ratio_predicted_over_source"] = predicted_value / source_value
+        comparisons.append(item)
+    return comparisons
+
+
 reader = csv.DictReader(cleaned_lines(prediction_csv))
 if not reader.fieldnames:
     raise SystemExit(f"prediction CSV has no header: {prediction_csv}")
@@ -518,6 +570,7 @@ source_gpus = []
 target_gpus = []
 total_seconds = 0.0
 source_times_ns, source_time_column = source_time_by_row(input_csv)
+source_metrics_rows = source_metrics_by_row(input_csv)
 total_source_seconds = 0.0
 source_time_count = 0
 
@@ -537,6 +590,7 @@ for idx, row in enumerate(reader, start=1):
     seconds = predicted_ns / 1e9
     total_seconds += seconds
     source_ns = source_times_ns[idx - 1] if idx - 1 < len(source_times_ns) else None
+    source_metrics = source_metrics_rows[idx - 1] if idx - 1 < len(source_metrics_rows) else {}
     source_seconds = source_ns / 1e9 if source_ns is not None else None
     if source_seconds is not None:
         total_source_seconds += source_seconds
@@ -565,6 +619,11 @@ for idx, row in enumerate(reader, start=1):
         kernel["target_gpu"] = target_gpu
     if metrics:
         kernel["metrics"] = metrics
+    if source_metrics:
+        kernel["source_metrics"] = source_metrics
+    comparisons = metric_comparisons(source_metrics, metrics)
+    if comparisons:
+        kernel["metric_comparisons"] = comparisons
     kernels.append(kernel)
 
 summary_metrics = {
@@ -666,6 +725,7 @@ _bk_gpu_lightgbm_run_predictor() {
   prediction_csv_abs=$(_bk_gpu_lightgbm_abs_path "$prediction_csv")
   prediction_log_abs=$(_bk_gpu_lightgbm_abs_path "$prediction_log")
 
+  echo "Running PerfTools LightGBM_model/1.0 for ${section_name}: ${source_gpu}->${target_gpu}" >&2
   if ! (
     cd "$model_dir"
     "$python_bin" AI_model/run_inference.py \
diff --git a/scripts/estimation/section_packages/gpu_kernel_mlp_v15.sh b/scripts/estimation/section_packages/gpu_kernel_mlp_v15.sh
index 9021f75..64ad3fb 100644
--- a/scripts/estimation/section_packages/gpu_kernel_mlp_v15.sh
+++ b/scripts/estimation/section_packages/gpu_kernel_mlp_v15.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-# gpu_kernel_mlp_v15.sh - Section package for the PerfTools MLP_NN/v1.5 GPU estimator.
+# gpu_kernel_mlp_v15.sh - Section package and shared implementation for
+# PerfTools MLP_NN GPU estimators.
 
 bk_section_package_metadata_gpu_kernel_mlp_v15() {
   cat <<'EOF'
@@ -143,7 +144,7 @@ _bk_gpu_mlp_ensure_perftools_root() {
 
     mkdir -p "$(dirname "$root")"
     if [[ ! -d "$root/.git" ]]; then
-      echo "Fetching PerfTools for gpu_kernel_mlp_v15: ${repo} (${ref})" >&2
+      echo "Fetching PerfTools for ${BK_GPU_MLP_PACKAGE_NAME:-gpu_kernel_mlp_v15}: ${repo} (${ref})" >&2
       git clone --depth 1 "$repo" "$root" >&2 || {
         printf '%s\n' "$root"
         return 0
@@ -161,13 +162,15 @@ _bk_gpu_mlp_ensure_perftools_root() {
 
 _bk_gpu_mlp_predictor() {
   local root="$1"
+  local version_dir="${BK_GPU_MLP_VERSION_DIR:-v1.5}"  # MLP_NN subdirectory; falls back to v1.5 when the variable is unset or empty
+  local predictor_script="${BK_GPU_MLP_PREDICT_SCRIPT:-predict_v15.py}"  # script file name inside that subdirectory
 
   if [[ -z "$root" ]]; then
     printf '%s\n' ""
     return 0
   fi
 
-  printf '%s\n' "${root}/MLP_NN/v1.5/predict_v15.py"
+  printf '%s\n' "${root}/MLP_NN/${version_dir}/${predictor_script}"  # print the predictor path; existence is not checked here
 }
 
 _bk_gpu_mlp_python_exists() {
@@ -346,6 +349,7 @@ bk_section_package_check_applicability_gpu_kernel_mlp_v15() {
   local root
   local predictor
   local python_bin="${BK_GPU_MLP_PYTHON:-$(_bk_gpu_mlp_default_python)}"
+  local predictor_rel="MLP_NN/${BK_GPU_MLP_VERSION_DIR:-v1.5}/${BK_GPU_MLP_PREDICT_SCRIPT:-predict_v15.py}"
   local missing=()
 
   if [[ "$item_kind" != "section" ]]; then
@@ -387,7 +391,7 @@ EOF
       missing+=('"BK_GPU_MLP_PERFTOOLS_ROOT"')
     fi
     if [[ -z "$predictor" || ! -f "$predictor" ]]; then
-      missing+=('"PerfTools MLP_NN/v1.5/predict_v15.py"')
+      missing+=("\"PerfTools predictor:${predictor_rel}\"")
     fi
   fi
 
@@ -490,6 +494,58 @@ def source_time_by_row(path):
     return [as_number(row.get(time_column)) for row in rows], time_column
 
 
+def source_metric_candidates(metric_name):  # Ordered list of source-CSV column names to try for one predicted metric name.
+    candidates = [metric_name]  # the exact name is always tried first
+    if metric_name.startswith("O-"):
+        candidates.append(metric_name[2:])  # same name without the leading "O-"
+    if metric_name.startswith("brk_"):
+        candidates.append("breakdown_" + metric_name[4:])  # brk_X -> breakdown_X
+    if metric_name.startswith("breakdown_"):
+        candidates.append("brk_" + metric_name[len("breakdown_"):])  # breakdown_X -> brk_X
+    return list(dict.fromkeys(candidates))  # remove duplicates while keeping first-seen order
+
+
+def source_metrics_by_row(path):  # Return one dict per CSV row of source metric values, keyed by predicted metric name.
+    if not path:  # no path given: nothing to compare against
+        return []
+    candidate = Path(path)
+    if not candidate.is_file():  # a missing file is tolerated and yields no source metrics
+        return []
+
+    rows, fieldnames = read_csv_rows(path)  # read_csv_rows is defined elsewhere in this script
+    if not fieldnames:  # CSV without a header
+        return []
+
+    source_rows = []
+    for row in rows:  # one output dict per CSV row, in file order
+        source_metrics = {}
+        for metric_name in metric_columns:  # metric_columns is a module-level name defined outside this function
+            for source_name in source_metric_candidates(metric_name):
+                if source_name in fieldnames:
+                    value = as_number(row.get(source_name))
+                    if value is not None:  # a None from as_number falls through to the next candidate column
+                        source_metrics[metric_name] = value
+                        break  # first usable candidate wins
+        source_rows.append(source_metrics)  # appended even when empty, so list indices match CSV row order
+    return source_rows
+
+
+def metric_comparisons(source_metrics, predicted_metrics):  # Build one comparison record per metric name found in either dict.
+    comparisons = []
+    for metric_name in sorted(set(source_metrics) | set(predicted_metrics)):  # union of both key sets, in sorted order
+        item = {"name": metric_name}  # name is always present; the other keys are optional
+        source_value = source_metrics.get(metric_name)
+        predicted_value = predicted_metrics.get(metric_name)
+        if source_value is not None:
+            item["source_value"] = source_value
+        if predicted_value is not None:
+            item["predicted_value"] = predicted_value
+        if source_value not in (None, 0) and predicted_value is not None:  # ratio needs both values and a nonzero source
+            item["ratio_predicted_over_source"] = predicted_value / source_value
+        comparisons.append(item)
+    return comparisons
+
+
 reader = csv.DictReader(cleaned_lines(prediction_csv))
 if not reader.fieldnames:
     raise SystemExit(f"prediction CSV has no header: {prediction_csv}")
@@ -506,6 +562,7 @@ source_gpus = []
 target_gpus = []
 total_seconds = 0.0
 source_times_ns, source_time_column = source_time_by_row(input_csv)
+source_metrics_rows = source_metrics_by_row(input_csv)
 total_source_seconds = 0.0
 source_time_count = 0
 nonpositive_prediction_count = 0
@@ -528,6 +585,7 @@ for idx, row in enumerate(reader, start=1):
     seconds = predicted_ns / 1e9
     total_seconds += seconds
     source_ns = source_times_ns[idx - 1] if idx - 1 < len(source_times_ns) else None
+    source_metrics = source_metrics_rows[idx - 1] if idx - 1 < len(source_metrics_rows) else {}
     source_seconds = source_ns / 1e9 if source_ns is not None else None
     if source_seconds is not None:
         total_source_seconds += source_seconds
@@ -556,6 +614,11 @@ for idx, row in enumerate(reader, start=1):
         kernel["target_gpu"] = target_gpu
     if metrics:
         kernel["metrics"] = metrics
+    if source_metrics:
+        kernel["source_metrics"] = source_metrics
+    comparisons = metric_comparisons(source_metrics, metrics)
+    if comparisons:
+        kernel["metric_comparisons"] = comparisons
     kernels.append(kernel)
 
 summary_metrics = {
@@ -572,7 +635,7 @@ if nonpositive_prediction_count:
         "severity": "warning",
         "reason": "nonpositive_predicted_execution_time",
         "message": (
-            "PerfTools MLP_NN/v1.5 returned non-positive predicted execution "
+            f"PerfTools MLP_NN/{model_version} returned non-positive predicted execution "
             "time for one or more kernel rows. Check target GPU selection and "
             "required NCU feature coverage."
         ),
@@ -601,7 +664,7 @@ print(json.dumps({
     },
     "model": {
         "type": "cross_gpu_kernel_prediction_model",
-        "name": "PerfTools MLP_NN/v1.5",
+        "name": "PerfTools MLP_NN/" + model_version,
         "version": model_version,
         "repository": "https://github.com/masaaki-kondo/PerfTools",
     },
@@ -645,7 +708,10 @@ _bk_gpu_mlp_run_predictor() {
   local root
   local input_csv
   local ncu_archive
-  local output_dir="${BK_GPU_MLP_OUTPUT_DIR:-results/estimation_artifacts/gpu_kernel_mlp_v15}"
+  local package_name="${BK_GPU_MLP_PACKAGE_NAME:-gpu_kernel_mlp_v15}"
+  local version_dir="${BK_GPU_MLP_VERSION_DIR:-v1.5}"
+  local predictor_script="${BK_GPU_MLP_PREDICT_SCRIPT:-predict_v15.py}"
+  local output_dir="${BK_GPU_MLP_OUTPUT_DIR:-results/estimation_artifacts/${package_name}}"
   local prediction_csv
   local prediction_log
   local input_csv_abs
@@ -672,18 +738,18 @@ _bk_gpu_mlp_run_predictor() {
 
   if ! (
     cd "$root"
-    "$python_bin" MLP_NN/v1.5/predict_v15.py \
+    "$python_bin" "MLP_NN/${version_dir}/${predictor_script}" \
       --csv "$input_csv_abs" \
       --row "${BK_GPU_MLP_ROW:-all}" \
       --out "$prediction_csv_abs" \
       --log "$prediction_log_abs"
   ) >/dev/null; then
-    echo "ERROR: PerfTools MLP_NN/v1.5 inference failed" >&2
+    echo "ERROR: PerfTools MLP_NN/${version_dir} inference failed" >&2
     return 1
   fi
 
   if [[ ! -s "$prediction_csv_abs" ]]; then
-    echo "ERROR: PerfTools MLP_NN/v1.5 did not create prediction CSV: ${prediction_csv_abs}" >&2
+    echo "ERROR: PerfTools MLP_NN/${version_dir} did not create prediction CSV: ${prediction_csv_abs}" >&2
     return 1
   fi
 
@@ -702,8 +768,9 @@ bk_section_package_transform_gpu_kernel_mlp_v15() {
   local prediction_log=""
   local run_outputs
   local parsed_json
-  local package_name="gpu_kernel_mlp_v15"
+  local package_name="${BK_GPU_MLP_PACKAGE_NAME:-gpu_kernel_mlp_v15}"
   local model_version="${BK_GPU_MLP_MODEL_VERSION:-v1.5}"
+  local scaling_method="${BK_GPU_MLP_SCALING_METHOD:-gpu-kernel-mlp-${model_version}}"
   local selector_kind=""
   local selector_value=""
   local selector
@@ -727,6 +794,7 @@ bk_section_package_transform_gpu_kernel_mlp_v15() {
     --arg prediction_log "$prediction_log" \
     --arg selector_kind "$selector_kind" \
     --arg selector_value "$selector_value" \
+    --arg scaling_method "$scaling_method" \
     --argjson parsed "$parsed_json" '
     def selector_matches($kind; $value):
       if $kind == "" or $value == "" then true
@@ -767,7 +835,7 @@ bk_section_package_transform_gpu_kernel_mlp_v15() {
           end
         ),
         bench_time: $source_section_time,
-        scaling_method: (if $can_identity_fallback then "identity" else "gpu-kernel-mlp-v1.5" end),
+        scaling_method: (if $can_identity_fallback then "identity" else $scaling_method end),
         estimation_package: (if $can_identity_fallback then "identity" else $parsed.estimation_package end),
         requested_estimation_package: (if $can_identity_fallback then $parsed.estimation_package else (.requested_estimation_package // $parsed.estimation_package) end),
         fallback_used: (if $can_identity_fallback then "identity" else null end),
diff --git a/scripts/estimation/section_packages/gpu_kernel_mlp_v21.sh b/scripts/estimation/section_packages/gpu_kernel_mlp_v21.sh
new file mode 100644
index 0000000..faeda5c
--- /dev/null
+++ b/scripts/estimation/section_packages/gpu_kernel_mlp_v21.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# gpu_kernel_mlp_v21.sh - Thin package wrapper for PerfTools MLP_NN/v2.1.
+
+bk_section_package_metadata_gpu_kernel_mlp_v21() {  # Print the static JSON metadata document for the gpu_kernel_mlp_v21 package on stdout.
+  cat <<'EOF'  # quoted delimiter: the JSON below is emitted literally, with no shell expansion
+{
+  "name": "gpu_kernel_mlp_v21",
+  "fallback_target": "identity",
+  "source_system_scope": {
+    "kind": "benchmark_system",
+    "accepted_values": ["any"]
+  },
+  "target_system_scope": {
+    "accepted_values": ["any"]
+  },
+  "item_kind_scope": ["section"],
+  "required_result_fields": ["name", "app-side GPU section time as time or bench_time"],
+  "required_artifact_kinds": [
+    "PerfTools MLP_NN/v2.1 prepared input CSV",
+    "precomputed prediction CSV",
+    "or BenchKit padata archive with Nsight Compute raw CSV"
+  ],
+  "acquisition_mode": "external",
+  "output_fields": [
+    "time",
+    "bench_time",
+    "scaling_method",
+    "metrics",
+    "package_applicability"
+  ]
+}
+EOF
+}
+
+bk_section_package_check_applicability_gpu_kernel_mlp_v21() (  # Applicability check for MLP_NN/v2.1; the ( ) body runs in a subshell, so the exports below do not persist in the caller.
+  export BK_GPU_MLP_PACKAGE_NAME="gpu_kernel_mlp_v21"  # package name read by the shared gpu_kernel_mlp_v15.sh code
+  export BK_GPU_MLP_VERSION_DIR="v2.1"  # selects the MLP_NN/v2.1 directory under the PerfTools root
+  export BK_GPU_MLP_PREDICT_SCRIPT="predict_v21.py"  # predictor script file name inside that directory
+  export BK_GPU_MLP_MODEL_VERSION="v2.1"  # same value as in the transform wrapper of this file
+  export BK_GPU_MLP_SCALING_METHOD="gpu-kernel-mlp-v2.1"  # same value as in the transform wrapper of this file
+  bk_section_package_check_applicability_gpu_kernel_mlp_v15 "$@"  # delegate with all arguments; this file neither defines nor sources that function, so it must already be loaded
+)
+
+bk_section_package_transform_gpu_kernel_mlp_v21() (  # Transform for MLP_NN/v2.1; the ( ) body runs in a subshell, so the exports below do not persist in the caller.
+  export BK_GPU_MLP_PACKAGE_NAME="gpu_kernel_mlp_v21"  # package name read by the shared gpu_kernel_mlp_v15.sh code
+  export BK_GPU_MLP_VERSION_DIR="v2.1"  # selects the MLP_NN/v2.1 directory under the PerfTools root
+  export BK_GPU_MLP_PREDICT_SCRIPT="predict_v21.py"  # predictor script file name inside that directory
+  export BK_GPU_MLP_MODEL_VERSION="v2.1"  # version label used by the shared transform
+  export BK_GPU_MLP_SCALING_METHOD="gpu-kernel-mlp-v2.1"  # scaling_method value for results that do not fall back to identity
+  bk_section_package_transform_gpu_kernel_mlp_v15 "$@"  # delegate with all arguments; this file neither defines nor sources that function, so it must already be loaded
+)
diff --git a/scripts/estimation/section_packages/gpu_kernel_mlp_v40.sh b/scripts/estimation/section_packages/gpu_kernel_mlp_v40.sh
new file mode 100644
index 0000000..09cf87d
--- /dev/null
+++ b/scripts/estimation/section_packages/gpu_kernel_mlp_v40.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# gpu_kernel_mlp_v40.sh - Thin package wrapper for PerfTools MLP_NN/v4.0.
+
+bk_section_package_metadata_gpu_kernel_mlp_v40() {  # Print the static JSON metadata document for the gpu_kernel_mlp_v40 package on stdout.
+  cat <<'EOF'  # quoted delimiter: the JSON below is emitted literally, with no shell expansion
+{
+  "name": "gpu_kernel_mlp_v40",
+  "fallback_target": "identity",
+  "source_system_scope": {
+    "kind": "benchmark_system",
+    "accepted_values": ["any"]
+  },
+  "target_system_scope": {
+    "accepted_values": ["any"]
+  },
+  "item_kind_scope": ["section"],
+  "required_result_fields": ["name", "app-side GPU section time as time or bench_time"],
+  "required_artifact_kinds": [
+    "PerfTools MLP_NN/v4.0 prepared input CSV",
+    "precomputed prediction CSV",
+    "or BenchKit padata archive with Nsight Compute raw CSV"
+  ],
+  "acquisition_mode": "external",
+  "output_fields": [
+    "time",
+    "bench_time",
+    "scaling_method",
+    "metrics",
+    "package_applicability"
+  ]
+}
+EOF
+}
+
+bk_section_package_check_applicability_gpu_kernel_mlp_v40() (  # Applicability check for MLP_NN/v4.0; the ( ) body runs in a subshell, so the exports below do not persist in the caller.
+  export BK_GPU_MLP_PACKAGE_NAME="gpu_kernel_mlp_v40"  # package name read by the shared gpu_kernel_mlp_v15.sh code
+  export BK_GPU_MLP_VERSION_DIR="v4.0"  # selects the MLP_NN/v4.0 directory under the PerfTools root
+  export BK_GPU_MLP_PREDICT_SCRIPT="predict_v40.py"  # predictor script file name inside that directory
+  export BK_GPU_MLP_MODEL_VERSION="v4.0"  # same value as in the transform wrapper of this file
+  export BK_GPU_MLP_SCALING_METHOD="gpu-kernel-mlp-v4.0"  # same value as in the transform wrapper of this file
+  bk_section_package_check_applicability_gpu_kernel_mlp_v15 "$@"  # delegate with all arguments; this file neither defines nor sources that function, so it must already be loaded
+)
+
+bk_section_package_transform_gpu_kernel_mlp_v40() (  # Transform for MLP_NN/v4.0; the ( ) body runs in a subshell, so the exports below do not persist in the caller.
+  export BK_GPU_MLP_PACKAGE_NAME="gpu_kernel_mlp_v40"  # package name read by the shared gpu_kernel_mlp_v15.sh code
+  export BK_GPU_MLP_VERSION_DIR="v4.0"  # selects the MLP_NN/v4.0 directory under the PerfTools root
+  export BK_GPU_MLP_PREDICT_SCRIPT="predict_v40.py"  # predictor script file name inside that directory
+  export BK_GPU_MLP_MODEL_VERSION="v4.0"  # version label used by the shared transform
+  export BK_GPU_MLP_SCALING_METHOD="gpu-kernel-mlp-v4.0"  # scaling_method value for results that do not fall back to identity
+  bk_section_package_transform_gpu_kernel_mlp_v15 "$@"  # delegate with all arguments; this file neither defines nor sources that function, so it must already be loaded
+)
diff --git a/scripts/estimation/section_packages/gpu_kernel_mlp_v41.sh b/scripts/estimation/section_packages/gpu_kernel_mlp_v41.sh
new file mode 100644
index 0000000..9628c86
--- /dev/null
+++ b/scripts/estimation/section_packages/gpu_kernel_mlp_v41.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# gpu_kernel_mlp_v41.sh - Thin package wrapper for PerfTools MLP_NN/v4.1.
+
+bk_section_package_metadata_gpu_kernel_mlp_v41() {  # Print the static JSON metadata document for the gpu_kernel_mlp_v41 package on stdout.
+  cat <<'EOF'  # quoted delimiter: the JSON below is emitted literally, with no shell expansion
+{
+  "name": "gpu_kernel_mlp_v41",
+  "fallback_target": "identity",
+  "source_system_scope": {
+    "kind": "benchmark_system",
+    "accepted_values": ["any"]
+  },
+  "target_system_scope": {
+    "accepted_values": ["any"]
+  },
+  "item_kind_scope": ["section"],
+  "required_result_fields": ["name", "app-side GPU section time as time or bench_time"],
+  "required_artifact_kinds": [
+    "PerfTools MLP_NN/v4.1 prepared input CSV",
+    "precomputed prediction CSV",
+    "or BenchKit padata archive with Nsight Compute raw CSV"
+  ],
+  "acquisition_mode": "external",
+  "output_fields": [
+    "time",
+    "bench_time",
+    "scaling_method",
+    "metrics",
+    "package_applicability"
+  ]
+}
+EOF
+}
+
+bk_section_package_check_applicability_gpu_kernel_mlp_v41() (  # Applicability check for MLP_NN/v4.1; the ( ) body runs in a subshell, so the exports below do not persist in the caller.
+  export BK_GPU_MLP_PACKAGE_NAME="gpu_kernel_mlp_v41"  # package name read by the shared gpu_kernel_mlp_v15.sh code
+  export BK_GPU_MLP_VERSION_DIR="v4.1"  # selects the MLP_NN/v4.1 directory under the PerfTools root
+  export BK_GPU_MLP_PREDICT_SCRIPT="predict_v41.py"  # predictor script file name inside that directory
+  export BK_GPU_MLP_MODEL_VERSION="v4.1"  # same value as in the transform wrapper of this file
+  export BK_GPU_MLP_SCALING_METHOD="gpu-kernel-mlp-v4.1"  # same value as in the transform wrapper of this file
+  bk_section_package_check_applicability_gpu_kernel_mlp_v15 "$@"  # delegate with all arguments; this file neither defines nor sources that function, so it must already be loaded
+)
+
+bk_section_package_transform_gpu_kernel_mlp_v41() (  # Transform for MLP_NN/v4.1; the ( ) body runs in a subshell, so the exports below do not persist in the caller.
+  export BK_GPU_MLP_PACKAGE_NAME="gpu_kernel_mlp_v41"  # package name read by the shared gpu_kernel_mlp_v15.sh code
+  export BK_GPU_MLP_VERSION_DIR="v4.1"  # selects the MLP_NN/v4.1 directory under the PerfTools root
+  export BK_GPU_MLP_PREDICT_SCRIPT="predict_v41.py"  # predictor script file name inside that directory
+  export BK_GPU_MLP_MODEL_VERSION="v4.1"  # version label used by the shared transform
+  export BK_GPU_MLP_SCALING_METHOD="gpu-kernel-mlp-v4.1"  # scaling_method value for results that do not fall back to identity
+  bk_section_package_transform_gpu_kernel_mlp_v15 "$@"  # delegate with all arguments; this file neither defines nor sources that function, so it must already be loaded
+)
diff --git a/scripts/tests/test_estimation_gpu_kernel_ensemble_average.sh b/scripts/tests/test_estimation_gpu_kernel_ensemble_average.sh
index 5534e83..244aa6a 100644
--- a/scripts/tests/test_estimation_gpu_kernel_ensemble_average.sh
+++ b/scripts/tests/test_estimation_gpu_kernel_ensemble_average.sh
@@ -56,8 +56,8 @@ kern_a,H100,A100,0,30
 EOF
 
 cat > "${TMP_DIR}/source_input_single.csv" <<'EOF'
-Kernel Name,Duration [ns]
-kern_a,1000
+Kernel Name,Duration [ns],Memory Throughput [%],Achieved Occupancy
+kern_a,1000,25,10
 EOF
 
 cat > "${TMP_DIR}/lightgbm_pred_mixed.csv" <<'EOF'
@@ -72,9 +72,9 @@ kern_b,H100,A100,5000,20
 EOF
 
 cat > "${TMP_DIR}/source_input_mixed.csv" <<'EOF'
-Kernel Name,Duration [ns]
-kern_a,1000
-kern_b,2000
+Kernel Name,Duration [ns],Memory Throughput [%],Achieved Occupancy
+kern_a,1000,25,10
+kern_b,2000,40,20
 EOF
 
 cat > "${TMP_DIR}/breakdown.json" <<'EOF'
@@ -129,10 +129,39 @@ if ! echo "$transformed_single" | jq -e '
   .sections[0].estimation_package == "gpu_kernel_ensemble_average" and
   near(.sections[0].time; 20) and
   .sections[0].scaling_method == "gpu-kernel-ensemble-average" and
+  .sections[0].bench_time == 10 and
   .sections[0].metrics.aggregation == "single-kernel-package-ratio-mean" and
   .sections[0].metrics.candidate_count == 2 and
   .sections[0].metrics.applicable_candidate_count == 2 and
   .sections[0].metrics.candidate_packages == ["gpu_kernel_lightgbm_v10", "gpu_kernel_mlp_v15"] and
+  (.sections[0].metrics.package_summaries | length == 2) and
+  .sections[0].metrics.package_summaries[0].estimation_package == "gpu_kernel_lightgbm_v10" and
+  .sections[0].metrics.package_summaries[0].source_section_time == 10 and
+  near(.sections[0].metrics.package_summaries[0].projected_section_time; 10) and
+  near(.sections[0].metrics.package_summaries[0].time_ratio_predicted_over_source; 1) and
+  .sections[0].metrics.package_summaries[0].source_gpus == ["H100"] and
+  .sections[0].metrics.package_summaries[0].target_gpus == ["A100"] and
+  .sections[0].metrics.package_summaries[0].ncu_sample.kernel_count == 1 and
+  .sections[0].metrics.package_summaries[0].ncu_sample.source_time_ns == 1000 and
+  .sections[0].metrics.package_summaries[0].ncu_sample.predicted_time_ns == 1000 and
+  .sections[0].metrics.package_summaries[1].estimation_package == "gpu_kernel_mlp_v15" and
+  near(.sections[0].metrics.package_summaries[1].projected_section_time; 30) and
+  near(.sections[0].metrics.package_summaries[1].time_ratio_predicted_over_source; 3) and
+  (.sections[0].metrics.kernel_summaries | length == 1) and
+  .sections[0].metrics.kernel_summaries[0].name == "kern_a" and
+  (.sections[0].metrics.kernel_summaries[0].package_summaries | length == 2) and
+  .sections[0].metrics.kernel_summaries[0].package_summaries[0].estimation_package == "gpu_kernel_lightgbm_v10" and
+  .sections[0].metrics.kernel_summaries[0].package_summaries[0].sample_count == 1 and
+  .sections[0].metrics.kernel_summaries[0].package_summaries[0].source_gpus == ["H100"] and
+  .sections[0].metrics.kernel_summaries[0].package_summaries[0].target_gpus == ["A100"] and
+  .sections[0].metrics.kernel_summaries[0].package_summaries[0].source_time_ns_total == 1000 and
+  .sections[0].metrics.kernel_summaries[0].package_summaries[0].predicted_time_ns_total == 1000 and
+  near(.sections[0].metrics.kernel_summaries[0].package_summaries[0].mean_time_ratio_predicted_over_source; 1) and
+  (.sections[0].metrics.kernel_summaries[0].package_summaries[0].metric_comparisons | length >= 2) and
+  (.sections[0].metrics.kernel_summaries[0].package_summaries[0].metric_comparisons | map(select(.name == "O-Memory Throughput [%]" and .source_value_mean == 25 and .predicted_value_mean == 50 and .ratio_predicted_over_source_mean == 2)) | length == 1) and
+  .sections[0].metrics.kernel_summaries[0].package_summaries[1].estimation_package == "gpu_kernel_mlp_v15" and
+  near(.sections[0].metrics.kernel_summaries[0].package_summaries[1].mean_time_ratio_predicted_over_source; 3) and
+  (.sections[0].metrics.kernel_summaries[0].package_summaries[1].metric_comparisons | map(select(.name == "Memory Throughput [%]" and .source_value_mean == 25 and .predicted_value_mean == 30 and .ratio_predicted_over_source_mean == 1.2)) | length == 1) and
   near(.sections[0].metrics.mean_time_ratio_predicted_over_source; 2) and
   .sections[0].metrics.unique_kernel_count == 1 and
   .sections[0].metrics.kernel_names == ["kern_a"] and
diff --git a/scripts/tests/test_estimation_gpu_kernel_mlp_v15.sh b/scripts/tests/test_estimation_gpu_kernel_mlp_v15.sh
index 82da30b..781f176 100644
--- a/scripts/tests/test_estimation_gpu_kernel_mlp_v15.sh
+++ b/scripts/tests/test_estimation_gpu_kernel_mlp_v15.sh
@@ -133,6 +133,16 @@ if args.log:
         handle.write("fake predictor called\n")
 PY
 
+for version_script in \
+  "v2.1 predict_v21.py" \
+  "v4.0 predict_v40.py" \
+  "v4.1 predict_v41.py"; do  # each item is "<version dir> <predictor script name>"
+  read -r version_dir script_name <<< "$version_script"  # split the pair on whitespace
+  mkdir -p "${FAKE_PERFTOOLS}/MLP_NN/${version_dir}"
+  cp "${FAKE_PERFTOOLS}/MLP_NN/v1.5/predict_v15.py" \
+    "${FAKE_PERFTOOLS}/MLP_NN/${version_dir}/${script_name}"  # reuse the fake v1.5 predictor under the versioned name
+done
+
 cat > "${TMP_DIR}/input.csv" <<'EOF'
 kernel_name,src_gpu,tgt_gpu,Execution Time
 probe_kernel,A100,H100,2000000
@@ -191,4 +201,53 @@ echo "$transformed_from_input" | jq -e '
 test -f "${TMP_DIR}/mlp_outputs/unknown_gpu_kernel_region_local_pred.csv"
 test -f "${TMP_DIR}/mlp_outputs/unknown_gpu_kernel_region_local.log"
 
+unset BK_GPU_MLP_OUTPUT_DIR  # drop any existing value; every iteration below exports its own
+for package_version in \
+  "gpu_kernel_mlp_v21 v2.1" \
+  "gpu_kernel_mlp_v40 v4.0" \
+  "gpu_kernel_mlp_v41 v4.1"; do  # each item is "<package name> <version label>"
+  read -r package_name version_label <<< "$package_version"  # split the pair on whitespace
+  cat > "${TMP_DIR}/breakdown_${package_name}.json" <<EOF  # unquoted delimiter: package_name and TMP_DIR are expanded into the JSON
+{
+  "sections": [
+    {
+      "name": "gpu_kernel_region",
+      "bench_time": 0.011,
+      "estimation_package": "${package_name}",
+      "artifacts": [
+        {"path": "${TMP_DIR}/input.csv"}
+      ]
+    }
+  ],
+  "overlaps": []
+}
+EOF
+
+  pushd "${REPO_DIR}" >/dev/null  # run the transform from the repository root
+  export BK_GPU_MLP_ARTIFACT_MODE="input"
+  export BK_GPU_MLP_PERFTOOLS_ROOT="${FAKE_PERFTOOLS}"  # point the package at the fake PerfTools tree
+  export BK_GPU_MLP_OUTPUT_DIR="${TMP_DIR}/${package_name}_outputs"  # per-package output directory, checked at the end of the iteration
+  transformed_family=$(bk_top_level_transform_breakdown "$(cat "${TMP_DIR}/breakdown_${package_name}.json")" "1" "1" "1" "identity" "identity")
+  popd >/dev/null
+
+  echo "$transformed_family" | jq -e \
+    --arg package_name "$package_name" \
+    --arg version_label "$version_label" '
+    (.sections | length == 1) and
+    .sections[0].name == "gpu_kernel_region" and
+    .sections[0].time == 0.022 and
+    .sections[0].bench_time == 0.011 and
+    .sections[0].scaling_method == ("gpu-kernel-mlp-" + $version_label) and
+    .sections[0].estimation_package == $package_name and
+    .sections[0].model.name == ("PerfTools MLP_NN/" + $version_label) and
+    .sections[0].model.version == $version_label and
+    .sections[0].metrics.kernel_count == 1 and
+    .sections[0].metrics.total_source_time_ns == 2000000 and
+    .sections[0].metrics.total_predicted_time_ns == 4000000
+  ' >/dev/null  # jq -e exits non-zero when the expression result is false or null
+
+  test -f "${TMP_DIR}/${package_name}_outputs/unknown_gpu_kernel_region_local_pred.csv"  # prediction CSV must exist in the per-package directory
+  test -f "${TMP_DIR}/${package_name}_outputs/unknown_gpu_kernel_region_local.log"  # predictor log must exist there as well
+done
+
 echo "gpu_kernel_mlp_v15 section estimation test passed"
diff --git a/scripts/tests/test_genesis_gpu_mlp_estimation.sh b/scripts/tests/test_genesis_gpu_mlp_estimation.sh
index baf200c..e8b994a 100644
--- a/scripts/tests/test_genesis_gpu_mlp_estimation.sh
+++ b/scripts/tests/test_genesis_gpu_mlp_estimation.sh
@@ -20,6 +20,7 @@ source programs/genesis/estimate.sh
 test "${BK_ESTIMATION_BASELINE_EXP}" = "p8"
 test "${BK_ESTIMATION_BASELINE_SYSTEM}" = "Fugaku"
 test "${BK_ESTIMATION_FUTURE_SYSTEM}" = "FugakuNEXT"
+test "${BK_GPU_KERNEL_ENSEMBLE_PACKAGES}" = "gpu_kernel_lightgbm_v10,gpu_kernel_mlp_v15,gpu_kernel_mlp_v21,gpu_kernel_mlp_v40,gpu_kernel_mlp_v41"  # the ensemble package list must equal this exact comma-separated string, order included
 
 cat > results/no_breakdown_input.json <<'EOF'
 {