diff --git a/docs/guides/add-estimation-package.md b/docs/guides/add-estimation-package.md index 3f75220..0babb67 100644 --- a/docs/guides/add-estimation-package.md +++ b/docs/guides/add-estimation-package.md @@ -45,6 +45,9 @@ - `overlap_max_basic.sh` - `gpu_kernel_lightgbm_v10.sh` - `gpu_kernel_mlp_v15.sh` + - `gpu_kernel_mlp_v21.sh` + - `gpu_kernel_mlp_v40.sh` + - `gpu_kernel_mlp_v41.sh` ## 3. top-level package の責務 @@ -75,6 +78,18 @@ GPU kernel 単位の外部推定ツールは、通常は section package とし - `gpu_kernel_mlp_v15` - PerfTools `MLP_NN/v1.5` - 主な依存: numpy/pandas/torch +- `gpu_kernel_mlp_v21` + - PerfTools `MLP_NN/v2.1` + - v1.5 NN と analytical anchor を組み合わせた hybrid/reference 系 + - 主な依存: numpy/pandas/torch +- `gpu_kernel_mlp_v40` + - PerfTools `MLP_NN/v4.0` + - no-ET pure NN 系 + - 主な依存: numpy/pandas/torch +- `gpu_kernel_mlp_v41` + - PerfTools `MLP_NN/v4.1` + - v4.0 に single-axis trend 対応を加えた NN 系 + - 主な依存: numpy/pandas/torch - `gpu_kernel_lightgbm_v10` - PerfTools `LightGBM_model/1.0` - 主な依存: numpy/pandas/lightgbm/pyyaml と `libgomp` diff --git a/programs/genesis/README.md b/programs/genesis/README.md index 0312566..a66a188 100644 --- a/programs/genesis/README.md +++ b/programs/genesis/README.md @@ -124,13 +124,19 @@ Single-package selection: ```bash BK_GENESIS_GPU_SECTION_PACKAGE=gpu_kernel_mlp_v15 # or +BK_GENESIS_GPU_SECTION_PACKAGE=gpu_kernel_mlp_v21 +# or +BK_GENESIS_GPU_SECTION_PACKAGE=gpu_kernel_mlp_v40 +# or +BK_GENESIS_GPU_SECTION_PACKAGE=gpu_kernel_mlp_v41 +# or BK_GENESIS_GPU_SECTION_PACKAGE=gpu_kernel_lightgbm_v10 ``` Multiple-package comparison: ```bash -BK_GENESIS_GPU_SECTION_PACKAGES=gpu_kernel_lightgbm_v10,gpu_kernel_mlp_v15 +BK_GENESIS_GPU_SECTION_PACKAGES=gpu_kernel_lightgbm_v10,gpu_kernel_mlp_v15,gpu_kernel_mlp_v21,gpu_kernel_mlp_v40,gpu_kernel_mlp_v41 ``` When multiple packages are selected, the app wrapper asks for diff --git a/programs/genesis/estimate.sh b/programs/genesis/estimate.sh index 2fffbd7..1bd4d7c 100644 --- a/programs/genesis/estimate.sh +++ 
b/programs/genesis/estimate.sh @@ -9,7 +9,7 @@ genesis_gpu_section_packages() { elif [[ -n "${BK_GENESIS_GPU_SECTION_PACKAGE:-}" ]]; then raw="$BK_GENESIS_GPU_SECTION_PACKAGE" else - raw="gpu_kernel_lightgbm_v10,gpu_kernel_mlp_v15" + raw="gpu_kernel_lightgbm_v10,gpu_kernel_mlp_v15,gpu_kernel_mlp_v21,gpu_kernel_mlp_v40,gpu_kernel_mlp_v41" fi printf '%s\n' "$raw" | @@ -340,7 +340,6 @@ genesis_run_single_estimate() { "${BK_ESTIMATION_CURRENT_TARGET_NODES:-1}" \ "${BK_ESTIMATION_CURRENT_PACKAGE:-weakscaling}" est_current_fom="${est_current_bench_fom:-$est_current_fom}" - est_current_fom_breakdown="" if [[ "$synthetic_breakdown" -eq 1 ]]; then genesis_mark_gpu_section_time_missing diff --git a/result_server/templates/_estimated_breakdown_card.html b/result_server/templates/_estimated_breakdown_card.html index c63b97d..ba39f2a 100644 --- a/result_server/templates/_estimated_breakdown_card.html +++ b/result_server/templates/_estimated_breakdown_card.html @@ -10,13 +10,92 @@ {% endif %} {%- endmacro %} +{% macro render_kernel_package_comparisons(item) -%} + {% set kernel_summaries = item.get('metrics', {}).get('kernel_summaries', []) %} + {% if kernel_summaries %} +
| Package | +Samples | +Source Mean (ns) | +Predicted Mean (ns) | +Ratio | +Source GPU | +Target GPU | +||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| {{ package.get('estimation_package', 'N/A') }} | +{{ package.get('sample_count', 'N/A') }} | +{{ package.get('source_time_ns_mean', 'N/A') }} | +{{ package.get('predicted_time_ns_mean', 'N/A') }} | +{{ package.get('mean_time_ratio_predicted_over_source', 'N/A') }} | +{{ package.get('source_gpus', []) | join(', ') }} | +{{ package.get('target_gpus', []) | join(', ') }} | +||||||||||
|
+
+
+ {{ package.get('estimation_package', 'N/A') }} metrics
+
|
+ ||||||||||||||||
| {{ first_column_label }} | Time | Package | Scaling | Fallback | Applicability | ||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| {{ first_column_label }} | +Bench Time | +Estimated Time | +Package | +Scaling | +Fallback | +Applicability | +|||||||||||||||||||
| {{ first_value }} | +{{ item.get('bench_time', item.get('time', 'N/A')) }} | {{ item.get('time', 'N/A') }} | {{ item.get('estimation_package', 'N/A') }} | {{ item.get('scaling_method', 'N/A') }} | @@ -34,7 +114,7 @@|||||||||||||||||||||
| + |
Candidate estimates; mean time is used for FOM composition.
Applicability Summary{{ render_json_block("Confidence", confidence_json) }}
+
System Comparison+
+
+
diff --git a/result_server/tests/test_estimated_detail_template.py b/result_server/tests/test_estimated_detail_template.py
index c57c0e1..ce1e461 100644
--- a/result_server/tests/test_estimated_detail_template.py
+++ b/result_server/tests/test_estimated_detail_template.py
@@ -73,7 +73,7 @@
"fom": 0.944,
"target_nodes": "1024",
"scaling_method": "weakscaling",
- "benchmark": {"system": "Fugaku", "fom": 0.386, "nodes": "1"},
+ "benchmark": {"system": "Fugaku", "fom": 0.386, "nodes": "1", "numproc_node": "4"},
"model": {"name": "weakscaling-current", "type": "intra_system_scaling_model"},
"fom_breakdown": {
"sections": [
@@ -109,7 +109,7 @@
"fom": 9.054,
"target_nodes": "256",
"scaling_method": "instrumented-app-sections-dummy",
- "benchmark": {"system": "MiyabiG", "fom": 5.712, "nodes": "1"},
+ "benchmark": {"system": "MiyabiG", "fom": 5.712, "nodes": "1", "numproc_node": "8"},
"model": {"name": "instrumented-app-sections-future-projection", "type": "cross_system_projection_model"},
"fom_breakdown": {
"sections": [
@@ -140,6 +140,51 @@
"metrics": {"time_ratio_predicted_over_source": 2.6666666667},
},
],
+ "metrics": {
+ "kernel_summaries": [
+ {
+ "name": "kern_build_pairlist",
+ "package_summaries": [
+ {
+ "estimation_package": "gpu_kernel_lightgbm_v10",
+ "sample_count": 1,
+ "source_time_ns_mean": 310816,
+ "predicted_time_ns_mean": 294442.45,
+ "mean_time_ratio_predicted_over_source": 0.9473,
+ "source_gpus": ["H100"],
+ "target_gpus": ["GB200"],
+ "metric_comparisons": [
+ {
+ "name": "O-Memory Throughput [%]",
+ "sample_count": 1,
+ "source_value_mean": 52.12,
+ "predicted_value_mean": 49.10,
+ "ratio_predicted_over_source_mean": 0.9421,
+ }
+ ],
+ },
+ {
+ "estimation_package": "gpu_kernel_mlp_v15",
+ "sample_count": 5,
+ "source_time_ns_mean": 159104,
+ "predicted_time_ns_mean": 70311.2,
+ "mean_time_ratio_predicted_over_source": 0.4423,
+ "source_gpus": ["H100"],
+ "target_gpus": ["GB200"],
+ "metric_comparisons": [
+ {
+ "name": "Memory Throughput [%]",
+ "sample_count": 5,
+ "source_value_mean": 52.55,
+ "predicted_value_mean": 41.34,
+ "ratio_predicted_over_source_mean": 0.7866,
+ }
+ ],
+ },
+ ],
+ }
+ ]
+ },
}
],
"overlaps": [],
@@ -201,8 +246,10 @@ def test_estimated_detail_template_renders_sections(app):
assert "Applicability Summary" in html
assert "Package Resolution" in html
assert "Re-Estimation Context" in html
+ assert "System Comparison" in html
assert "Current System" in html
assert "Future System" in html
+ assert "Benchmark Processes/Node" in html
assert "Estimate succeeded, but part of the breakdown used fallback handling." in html
assert "required action: collect-section-specific-package-inputs" in html
assert "weakscaling" in html
@@ -230,6 +277,15 @@ def test_estimated_detail_template_renders_sections(app):
assert "overlap_package_unsupported:half" in html
assert "Candidate estimates" in html
assert "Time Ratio" in html
+ assert "Bench Time" in html
+ assert "Estimated Time" in html
assert "gpu_kernel_ensemble_average" in html
assert "gpu_kernel_lightgbm_v10" in html
assert "gpu_kernel_mlp_v15" in html
+ assert "Kernel package comparison" in html
+ assert "kern_build_pairlist" in html
+ assert "Source Mean (ns)" in html
+ assert "Predicted Mean (ns)" in html
+ assert "O-Memory Throughput [%]" in html
+ assert "Memory Throughput [%]" in html
+ assert "GB200" in html
diff --git a/result_server/utils/estimated_detail_view.py b/result_server/utils/estimated_detail_view.py
index 5755c2f..52e4084 100644
--- a/result_server/utils/estimated_detail_view.py
+++ b/result_server/utils/estimated_detail_view.py
@@ -20,6 +20,7 @@ def build_estimated_detail_context(result):
"reestimation_rows": _build_reestimation_rows(reestimation),
"current_rows": _build_system_rows(current),
"future_rows": _build_system_rows(future),
+ "system_comparison_rows": _build_system_comparison_rows(current, future),
"measurement_json": result.get("measurement", {}),
"confidence_json": result.get("confidence", {}),
"assumptions_json": result.get("assumptions", {}),
@@ -63,13 +64,19 @@ def _build_package_rows(estimate_meta, applicability):
current_package = estimate_meta.get("current_package", {})
future_package = estimate_meta.get("future_package", {})
rows = build_labeled_value_rows([
- ("Top-Level Requested", estimate_meta.get("requested_estimation_package", "N/A")),
- ("Top-Level Applied", estimate_meta.get("estimation_package", "N/A")),
+ ("Top-Level Package", _format_package_resolution(
+ estimate_meta.get("requested_estimation_package", "N/A"),
+ estimate_meta.get("estimation_package", "N/A"),
+ )),
("Top-Level Fallback", applicability.get("fallback_used", "none")),
- ("Current Requested", current_package.get("requested_estimation_package", "N/A")),
- ("Current Applied", current_package.get("estimation_package", "N/A")),
- ("Future Requested", future_package.get("requested_estimation_package", "N/A")),
- ("Future Applied", future_package.get("estimation_package", "N/A")),
+ ("Current Package", _format_package_resolution(
+ current_package.get("requested_estimation_package", "N/A"),
+ current_package.get("estimation_package", "N/A"),
+ )),
+ ("Future Package", _format_package_resolution(
+ future_package.get("requested_estimation_package", "N/A"),
+ future_package.get("estimation_package", "N/A"),
+ )),
])
_append_list_row(rows, "Missing Inputs", applicability.get("missing_inputs", []))
@@ -78,6 +85,12 @@ def _build_package_rows(estimate_meta, applicability):
return rows
+def _format_package_resolution(requested, applied):  # Render one package cell from the requested and the applied package name.
+ if requested == applied:  # Both names are equal, so the name is shown once.
+ return applied
+ return f"{applied} (requested: {requested})"  # Names differ: lead with the applied package and keep the requested one visible.
+
+
def _build_system_rows(system_data):
benchmark = system_data.get("benchmark", {})
breakdown = system_data.get("fom_breakdown", {})
@@ -97,6 +110,40 @@ def _build_system_rows(system_data):
])
+def _build_system_comparison_rows(current, future):  # Pair the current and future system rows by label into side-by-side comparison rows.
+ current_rows = _build_comparison_system_rows(current)
+ future_rows = _build_comparison_system_rows(future)
+ future_by_label = {row["label"]: row for row in future_rows}  # Index the future rows by label for direct lookup.
+ rows = []
+ for current_row in current_rows:  # Row order and the set of labels are taken from the current system rows.
+ label = current_row["label"]
+ future_row = future_by_label.get(label, {})  # A label absent on the future side yields the N/A and empty-class defaults below.
+ rows.append({
+ "label": label,
+ "current": current_row.get("value", "N/A"),
+ "future": future_row.get("value", "N/A"),
+ "current_class": current_row.get("value_class", ""),  # presumably a CSS class set by build_labeled_value_rows - TODO confirm
+ "future_class": future_row.get("value_class", ""),
+ })
+ return rows
+
+
+def _build_comparison_system_rows(system_data):  # Build the labeled rows describing one system for the comparison table.
+ benchmark = system_data.get("benchmark", {})  # Missing sub-dicts default to empty so every lookup below falls back to N/A or zero.
+ breakdown = system_data.get("fom_breakdown", {})
+ return build_labeled_value_rows([
+ ("System", system_data.get("system", "N/A")),
+ ("FOM", format_numeric_value(system_data.get("fom", "N/A"))),
+ ("Target Nodes", system_data.get("target_nodes", "N/A")),
+ ("Benchmark System", benchmark.get("system", "N/A")),
+ ("Benchmark FOM", format_numeric_value(benchmark.get("fom", "N/A"))),
+ ("Benchmark Nodes", benchmark.get("nodes", "N/A")),
+ ("Benchmark Processes/Node", benchmark.get("numproc_node", "N/A")),
+ ("Sections", len(breakdown.get("sections", []))),  # Only the number of entries is shown, not the entries themselves.
+ ("Overlaps", len(breakdown.get("overlaps", []))),
+ ])
+
+
def _build_reestimation_rows(reestimation):
if not reestimation:
return []
diff --git a/scripts/estimation/common.sh b/scripts/estimation/common.sh
index 157c23b..7ec9158 100644
--- a/scripts/estimation/common.sh
+++ b/scripts/estimation/common.sh
@@ -282,6 +282,8 @@ bk_estimation_run_recorded_current_with_weakscaling() {
local current_package="${4:-${BK_ESTIMATION_CURRENT_PACKAGE:-weakscaling}}"
local current_model_version=""
local baseline_breakdown=""
+ local current_breakdown_total=""
+ local current_breakdown_factor=""
bk_estimation_load_package "$current_package"
current_model_version="${BK_ESTIMATION_PACKAGE_VERSION:-0.1}"
@@ -305,6 +307,16 @@ bk_estimation_run_recorded_current_with_weakscaling() {
"1" \
"identity" \
"identity")
+ if [[ -n "$est_current_fom_breakdown" && "$est_current_fom_breakdown" != "null" && -n "${est_current_bench_fom:-}" ]]; then
+ current_breakdown_total=$(bk_top_level_breakdown_total_time "$est_current_fom_breakdown")
+ if [[ -n "$current_breakdown_total" && "$current_breakdown_total" != "0" && "$current_breakdown_total" != "null" ]]; then
+ current_breakdown_factor=$(awk -v target="$est_current_bench_fom" -v source="$current_breakdown_total" 'BEGIN {printf "%.12f", target / source}')
+ est_current_fom_breakdown=$(bk_top_level_scale_breakdown_times \
+ "$est_current_fom_breakdown" \
+ "$current_breakdown_factor" \
+ "$current_package")
+ fi
+ fi
est_current_fom=$(bk_top_level_breakdown_total_time "$est_current_fom_breakdown")
if declare -F bk_estimation_package_build_recorded_current_model_json >/dev/null 2>&1; then
est_current_model_json=$(bk_estimation_package_build_recorded_current_model_json "$baseline_system" "$current_model_version")
diff --git a/scripts/estimation/packages/instrumented_app_sections_dummy.sh b/scripts/estimation/packages/instrumented_app_sections_dummy.sh
index ecbe9bd..a59a263 100644
--- a/scripts/estimation/packages/instrumented_app_sections_dummy.sh
+++ b/scripts/estimation/packages/instrumented_app_sections_dummy.sh
@@ -34,6 +34,9 @@ bk_estimation_package_metadata() {
"gpu_kernel_ensemble_average",
"gpu_kernel_lightgbm_v10",
"gpu_kernel_mlp_v15",
+ "gpu_kernel_mlp_v21",
+ "gpu_kernel_mlp_v40",
+ "gpu_kernel_mlp_v41",
"logp"
],
"supported_overlap_packages": [
diff --git a/scripts/estimation/prepare_gpu_mlp_ncu_input.py b/scripts/estimation/prepare_gpu_mlp_ncu_input.py
index 6178022..8c73442 100644
--- a/scripts/estimation/prepare_gpu_mlp_ncu_input.py
+++ b/scripts/estimation/prepare_gpu_mlp_ncu_input.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python3
-"""Prepare a PerfTools MLP_NN/v1.5 input CSV from an Nsight Compute archive.
+"""Prepare a PerfTools MLP_NN input CSV from an Nsight Compute archive.
This is a small compatibility bridge for BenchKit. It converts the wide
Nsight Compute raw CSV exported from ``profile.ncu-rep`` into the CSV layout
@@ -354,7 +354,7 @@ def finalize_prepared_input(
"TPC.TriageCompute.sm__inst_executed_realtime.avg.per_cycle_active",
).reset_index(drop=True)
if "Executed Ipc Active [inst/cycle]" in df.columns:
- df["Executed Ipc Active [inst/cycle]"] = ipc.iloc[: len(df)].to_numpy()
+ df["Executed Ipc Active [inst/cycle]"] = ipc.reindex(df.index).to_numpy()
mean_ipc = df["Executed Ipc Active [inst/cycle]"].mean()
df["Executed Ipc Active [inst/cycle]"] = df[
"Executed Ipc Active [inst/cycle]"
@@ -411,7 +411,8 @@ def main() -> None:
allowed_nan=ALLOWED_NAN_COLUMNS | set(args.allow_nan),
target_gpu=args.target_gpu,
)
- print(f"wrote {out_csv}: {kernel_count} kernels")
+ final_count = len(pd.read_csv(out_csv))
+ print(f"wrote {out_csv}: {final_count} kernels")
finally:
if work_dir_owned and not args.keep_work:
shutil.rmtree(work_dir, ignore_errors=True)
diff --git a/scripts/estimation/section_packages/gpu_kernel_ensemble_average.sh b/scripts/estimation/section_packages/gpu_kernel_ensemble_average.sh
index 1be6f3d..d837c54 100644
--- a/scripts/estimation/section_packages/gpu_kernel_ensemble_average.sh
+++ b/scripts/estimation/section_packages/gpu_kernel_ensemble_average.sh
@@ -165,7 +165,10 @@ bk_section_package_transform_gpu_kernel_ensemble_average() {
target_gpu: ($kernel.target_gpu // null),
estimation_package: ($candidate.estimation_package // ""),
predicted_time_ns: ($kernel.predicted_time_ns // null),
- time_ratio_predicted_over_source: $time_ratio
+ time_ratio_predicted_over_source: $time_ratio,
+ source_metrics: ($kernel.source_metrics // {}),
+ predicted_metrics: ($kernel.metrics // {}),
+ metric_comparisons: ($kernel.metric_comparisons // [])
}
)
)
@@ -177,6 +180,35 @@ bk_section_package_transform_gpu_kernel_ensemble_average() {
| (blocking_candidates | length) as $blocking_count
| ($candidates | length) as $candidate_count
| ($usable | map(candidate_time_ratio) | map(select(. != null and . > 0))) as $usable_ratios
+ | (.bench_time // .time // null) as $app_section_time
+ | (
+ $candidates
+ | map(
+ . as $candidate
+ | (candidate_time_ratio) as $ratio
+ | {
+ estimation_package: ($candidate.estimation_package // ""),
+ scaling_method: ($candidate.scaling_method // ""),
+ applicability_status: ($candidate.package_applicability.status // ""),
+ source_section_time: $app_section_time,
+ projected_section_time: ($candidate.time // null),
+ time_ratio_predicted_over_source: $ratio,
+ source_gpus: ($candidate.metrics.source_gpus // []),
+ target_gpus: ($candidate.metrics.target_gpus // []),
+ kernel_count: ($candidate.metrics.kernel_count // (($candidate.metrics.matched_kernels // $candidate.metrics.kernels // []) | length)),
+ unique_kernel_count: (($candidate.metrics.kernel_names // (($candidate.metrics.matched_kernels // $candidate.metrics.kernels // []) | map(.name // "") | unique)) | length),
+ kernel_names: ($candidate.metrics.kernel_names // (($candidate.metrics.matched_kernels // $candidate.metrics.kernels // []) | map(.name // "") | unique)),
+ ncu_sample: {
+ kernel_count: ($candidate.metrics.kernel_count // (($candidate.metrics.matched_kernels // $candidate.metrics.kernels // []) | length)),
+ source_time: ($candidate.metrics.total_source_time // (if ($candidate.metrics.total_source_time_ns // null) != null then ($candidate.metrics.total_source_time_ns / 1000000000) else null end)),
+ source_time_ns: ($candidate.metrics.total_source_time_ns // null),
+ predicted_time: ($candidate.metrics.sample_predicted_time // (if ($candidate.metrics.total_predicted_time_ns // null) != null then ($candidate.metrics.total_predicted_time_ns / 1000000000) else null end)),
+ predicted_time_ns: ($candidate.metrics.total_predicted_time_ns // null)
+ },
+ artifacts: ($candidate.artifacts // [])
+ }
+ )
+ ) as $package_summaries
| ($usable | candidate_kernel_records) as $kernel_records
| ($kernel_records | map(.name) | unique | sort) as $kernel_names
| ($kernel_names | length) as $unique_kernel_count
@@ -207,14 +239,69 @@ bk_section_package_transform_gpu_kernel_ensemble_average() {
}
)
) as $kernel_means
+ | (
+ $kernel_records
+ | sort_by(.name)
+ | group_by(.name)
+ | map(
+ . as $kernel_group
+ | {
+ name: $kernel_group[0].name,
+ package_summaries: (
+ $kernel_group
+ | sort_by(.estimation_package)
+ | group_by(.estimation_package)
+ | map(
+ . as $package_group
+ | ($package_group | map(.source_time_ns) | map(select(. != null))) as $source_times_ns
+ | ($package_group | map(.predicted_time_ns) | map(select(. != null))) as $predicted_times_ns
+ | ($package_group | map(.time_ratio_predicted_over_source) | map(select(. != null))) as $ratios
+ | (
+ $package_group
+ | map(.metric_comparisons // [])
+ | add // []
+ | sort_by(.name)
+ | group_by(.name)
+ | map(
+ . as $metric_group
+ | ($metric_group | map(.source_value // null) | map(select(. != null))) as $source_values
+ | ($metric_group | map(.predicted_value // null) | map(select(. != null))) as $predicted_values
+ | ($metric_group | map(.ratio_predicted_over_source // null) | map(select(. != null))) as $metric_ratios
+ | {
+ name: $metric_group[0].name,
+ sample_count: ($metric_group | length),
+ source_value_mean: (if ($source_values | length) > 0 then (($source_values | add) / ($source_values | length)) else null end),
+ predicted_value_mean: (if ($predicted_values | length) > 0 then (($predicted_values | add) / ($predicted_values | length)) else null end),
+ ratio_predicted_over_source_mean: (if ($metric_ratios | length) > 0 then (($metric_ratios | add) / ($metric_ratios | length)) else null end),
+ samples: $metric_group
+ }
+ )
+ ) as $metric_comparisons
+ | {
+ estimation_package: $package_group[0].estimation_package,
+ sample_count: ($package_group | length),
+ source_gpus: ($package_group | map(.source_gpu // empty) | unique | sort),
+ target_gpus: ($package_group | map(.target_gpu // empty) | unique | sort),
+ source_time_ns_total: (if ($source_times_ns | length) > 0 then ($source_times_ns | add) else null end),
+ source_time_ns_mean: (if ($source_times_ns | length) > 0 then (($source_times_ns | add) / ($source_times_ns | length)) else null end),
+ predicted_time_ns_total: (if ($predicted_times_ns | length) > 0 then ($predicted_times_ns | add) else null end),
+ predicted_time_ns_mean: (if ($predicted_times_ns | length) > 0 then (($predicted_times_ns | add) / ($predicted_times_ns | length)) else null end),
+ mean_time_ratio_predicted_over_source: (if ($ratios | length) > 0 then (($ratios | add) / ($ratios | length)) else null end),
+ metric_comparisons: $metric_comparisons
+ }
+ )
+ )
+ }
+ )
+ ) as $kernel_summaries
| (if ($usable_ratios | length) > 0 then (($usable_ratios | add) / ($usable_ratios | length)) else null end) as $mean_ratio
- | (.bench_time // .time // null) as $app_section_time
| ($blocking_count == 0 and $usable_count > 0 and $unique_kernel_count == 1 and $mean_ratio != null and $app_section_time != null) as $can_project_section
| (if $can_project_section then ($app_section_time * $mean_ratio) else $app_section_time end) as $output_time
| .
+ {
estimation_package: (if $can_project_section then "gpu_kernel_ensemble_average" else "identity" end),
requested_estimation_package: (.requested_estimation_package // "gpu_kernel_ensemble_average"),
+ bench_time: $app_section_time,
time: $output_time,
scaling_method: (if $can_project_section then "gpu-kernel-ensemble-average" else "identity" end),
package_applicability: {
@@ -262,9 +349,11 @@ bk_section_package_transform_gpu_kernel_ensemble_average() {
time_ratio_predicted_over_source: candidate_time_ratio,
applicability_status: (.package_applicability.status // "")
})),
+ package_summaries: $package_summaries,
kernel_count: ($kernel_records | length),
unique_kernel_count: $unique_kernel_count,
kernel_names: $kernel_names,
+ kernel_summaries: $kernel_summaries,
kernel_candidate_ratios: $kernel_means,
app_gpu_section_time: $app_section_time,
mean_time: (if $can_project_section then $output_time else null end),
diff --git a/scripts/estimation/section_packages/gpu_kernel_lightgbm_v10.sh b/scripts/estimation/section_packages/gpu_kernel_lightgbm_v10.sh
index 8f95c65..7d9fedb 100644
--- a/scripts/estimation/section_packages/gpu_kernel_lightgbm_v10.sh
+++ b/scripts/estimation/section_packages/gpu_kernel_lightgbm_v10.sh
@@ -502,6 +502,58 @@ def source_time_by_row(path):
return [as_number(row.get(time_column)) for row in rows], time_column
+def source_metric_candidates(metric_name):  # List the column names that may hold the source value for metric_name, exact name first.
+ candidates = [metric_name]
+ if metric_name.startswith("O-"):  # Also accept the name without its two-character O- prefix.
+ candidates.append(metric_name[2:])
+ if metric_name.startswith("brk_"):  # Accept the long breakdown_ spelling for a brk_ name.
+ candidates.append("breakdown_" + metric_name[4:])
+ if metric_name.startswith("breakdown_"):  # And the short brk_ spelling for a breakdown_ name.
+ candidates.append("brk_" + metric_name[len("breakdown_"):])
+ return list(dict.fromkeys(candidates))  # dict.fromkeys removes duplicates while keeping first-seen order.
+
+
+def source_metrics_by_row(path):  # Read the input CSV at path and return a list with one dict of source metric values per row.
+ if not path:  # No input CSV given: report no source metrics.
+ return []
+ candidate = Path(path)
+ if not candidate.is_file():  # A path that is not an existing file is handled like no path at all.
+ return []
+
+ rows, fieldnames = read_csv_rows(path)
+ if not fieldnames:  # No column names were read, so nothing can be matched.
+ return []
+
+ source_rows = []
+ for row in rows:
+ source_metrics = {}
+ for metric_name in metric_columns:  # metric_columns is defined outside this function.
+ for source_name in source_metric_candidates(metric_name):  # Try each accepted spelling of the column name in order.
+ if source_name in fieldnames:
+ value = as_number(row.get(source_name))
+ if value is not None:
+ source_metrics[metric_name] = value  # Keyed by the metric name being looked up, not by the matched column name.
+ break  # NOTE(review): indentation is lost in this view - confirm which if this break belongs to.
+ source_rows.append(source_metrics)  # presumably one entry per CSV row, appended even when empty - confirm nesting
+ return source_rows
+
+
+def metric_comparisons(source_metrics, predicted_metrics):  # Build one comparison entry per metric name found in either mapping.
+ comparisons = []
+ for metric_name in sorted(set(source_metrics) | set(predicted_metrics)):  # Union of both key sets, sorted for a stable output order.
+ item = {"name": metric_name}
+ source_value = source_metrics.get(metric_name)
+ predicted_value = predicted_metrics.get(metric_name)
+ if source_value is not None:  # Keys are added only for values that exist, so a one-sided metric yields a partial entry.
+ item["source_value"] = source_value
+ if predicted_value is not None:
+ item["predicted_value"] = predicted_value
+ if source_value not in (None, 0) and predicted_value is not None:  # The ratio is left out when the source value is missing or zero.
+ item["ratio_predicted_over_source"] = predicted_value / source_value
+ comparisons.append(item)
+ return comparisons
+
+
reader = csv.DictReader(cleaned_lines(prediction_csv))
if not reader.fieldnames:
raise SystemExit(f"prediction CSV has no header: {prediction_csv}")
@@ -518,6 +570,7 @@ source_gpus = []
target_gpus = []
total_seconds = 0.0
source_times_ns, source_time_column = source_time_by_row(input_csv)
+source_metrics_rows = source_metrics_by_row(input_csv)
total_source_seconds = 0.0
source_time_count = 0
@@ -537,6 +590,7 @@ for idx, row in enumerate(reader, start=1):
seconds = predicted_ns / 1e9
total_seconds += seconds
source_ns = source_times_ns[idx - 1] if idx - 1 < len(source_times_ns) else None
+ source_metrics = source_metrics_rows[idx - 1] if idx - 1 < len(source_metrics_rows) else {}
source_seconds = source_ns / 1e9 if source_ns is not None else None
if source_seconds is not None:
total_source_seconds += source_seconds
@@ -565,6 +619,11 @@ for idx, row in enumerate(reader, start=1):
kernel["target_gpu"] = target_gpu
if metrics:
kernel["metrics"] = metrics
+ if source_metrics:
+ kernel["source_metrics"] = source_metrics
+ comparisons = metric_comparisons(source_metrics, metrics)
+ if comparisons:
+ kernel["metric_comparisons"] = comparisons
kernels.append(kernel)
summary_metrics = {
@@ -666,6 +725,7 @@ _bk_gpu_lightgbm_run_predictor() {
prediction_csv_abs=$(_bk_gpu_lightgbm_abs_path "$prediction_csv")
prediction_log_abs=$(_bk_gpu_lightgbm_abs_path "$prediction_log")
+ echo "Running PerfTools LightGBM_model/1.0 for ${section_name}: ${source_gpu}->${target_gpu}" >&2
if ! (
cd "$model_dir"
"$python_bin" AI_model/run_inference.py \
diff --git a/scripts/estimation/section_packages/gpu_kernel_mlp_v15.sh b/scripts/estimation/section_packages/gpu_kernel_mlp_v15.sh
index 9021f75..64ad3fb 100644
--- a/scripts/estimation/section_packages/gpu_kernel_mlp_v15.sh
+++ b/scripts/estimation/section_packages/gpu_kernel_mlp_v15.sh
@@ -1,5 +1,6 @@
#!/bin/bash
-# gpu_kernel_mlp_v15.sh - Section package for the PerfTools MLP_NN/v1.5 GPU estimator.
+# gpu_kernel_mlp_v15.sh - Section package and shared implementation for
+# PerfTools MLP_NN GPU estimators.
bk_section_package_metadata_gpu_kernel_mlp_v15() {
cat <<'EOF'
@@ -143,7 +144,7 @@ _bk_gpu_mlp_ensure_perftools_root() {
mkdir -p "$(dirname "$root")"
if [[ ! -d "$root/.git" ]]; then
- echo "Fetching PerfTools for gpu_kernel_mlp_v15: ${repo} (${ref})" >&2
+ echo "Fetching PerfTools for ${BK_GPU_MLP_PACKAGE_NAME:-gpu_kernel_mlp_v15}: ${repo} (${ref})" >&2
git clone --depth 1 "$repo" "$root" >&2 || {
printf '%s\n' "$root"
return 0
@@ -161,13 +162,15 @@ _bk_gpu_mlp_ensure_perftools_root() {
_bk_gpu_mlp_predictor() {
local root="$1"
+ local version_dir="${BK_GPU_MLP_VERSION_DIR:-v1.5}"
+ local predictor_script="${BK_GPU_MLP_PREDICT_SCRIPT:-predict_v15.py}"
if [[ -z "$root" ]]; then
printf '%s\n' ""
return 0
fi
- printf '%s\n' "${root}/MLP_NN/v1.5/predict_v15.py"
+ printf '%s\n' "${root}/MLP_NN/${version_dir}/${predictor_script}"
}
_bk_gpu_mlp_python_exists() {
@@ -346,6 +349,7 @@ bk_section_package_check_applicability_gpu_kernel_mlp_v15() {
local root
local predictor
local python_bin="${BK_GPU_MLP_PYTHON:-$(_bk_gpu_mlp_default_python)}"
+ local predictor_rel="MLP_NN/${BK_GPU_MLP_VERSION_DIR:-v1.5}/${BK_GPU_MLP_PREDICT_SCRIPT:-predict_v15.py}"
local missing=()
if [[ "$item_kind" != "section" ]]; then
@@ -387,7 +391,7 @@ EOF
missing+=('"BK_GPU_MLP_PERFTOOLS_ROOT"')
fi
if [[ -z "$predictor" || ! -f "$predictor" ]]; then
- missing+=('"PerfTools MLP_NN/v1.5/predict_v15.py"')
+ missing+=("\"PerfTools predictor:${predictor_rel}\"")
fi
fi
@@ -490,6 +494,58 @@ def source_time_by_row(path):
return [as_number(row.get(time_column)) for row in rows], time_column
+def source_metric_candidates(metric_name):  # List the column names that may hold the source value for metric_name, exact name first.
+ candidates = [metric_name]
+ if metric_name.startswith("O-"):  # Also accept the name without its two-character O- prefix.
+ candidates.append(metric_name[2:])
+ if metric_name.startswith("brk_"):  # Accept the long breakdown_ spelling for a brk_ name.
+ candidates.append("breakdown_" + metric_name[4:])
+ if metric_name.startswith("breakdown_"):  # And the short brk_ spelling for a breakdown_ name.
+ candidates.append("brk_" + metric_name[len("breakdown_"):])
+ return list(dict.fromkeys(candidates))  # dict.fromkeys removes duplicates while keeping first-seen order.
+
+
+def source_metrics_by_row(path):  # Read the input CSV at path and return a list with one dict of source metric values per row.
+ if not path:  # No input CSV given: report no source metrics.
+ return []
+ candidate = Path(path)
+ if not candidate.is_file():  # A path that is not an existing file is handled like no path at all.
+ return []
+
+ rows, fieldnames = read_csv_rows(path)
+ if not fieldnames:  # No column names were read, so nothing can be matched.
+ return []
+
+ source_rows = []
+ for row in rows:
+ source_metrics = {}
+ for metric_name in metric_columns:  # metric_columns is defined outside this function.
+ for source_name in source_metric_candidates(metric_name):  # Try each accepted spelling of the column name in order.
+ if source_name in fieldnames:
+ value = as_number(row.get(source_name))
+ if value is not None:
+ source_metrics[metric_name] = value  # Keyed by the metric name being looked up, not by the matched column name.
+ break  # NOTE(review): indentation is lost in this view - confirm which if this break belongs to.
+ source_rows.append(source_metrics)  # presumably one entry per CSV row, appended even when empty - confirm nesting
+ return source_rows
+
+
+def metric_comparisons(source_metrics, predicted_metrics):  # Build one comparison entry per metric name found in either mapping.
+ comparisons = []
+ for metric_name in sorted(set(source_metrics) | set(predicted_metrics)):  # Union of both key sets, sorted for a stable output order.
+ item = {"name": metric_name}
+ source_value = source_metrics.get(metric_name)
+ predicted_value = predicted_metrics.get(metric_name)
+ if source_value is not None:  # Keys are added only for values that exist, so a one-sided metric yields a partial entry.
+ item["source_value"] = source_value
+ if predicted_value is not None:
+ item["predicted_value"] = predicted_value
+ if source_value not in (None, 0) and predicted_value is not None:  # The ratio is left out when the source value is missing or zero.
+ item["ratio_predicted_over_source"] = predicted_value / source_value
+ comparisons.append(item)
+ return comparisons
+
+
reader = csv.DictReader(cleaned_lines(prediction_csv))
if not reader.fieldnames:
raise SystemExit(f"prediction CSV has no header: {prediction_csv}")
@@ -506,6 +562,7 @@ source_gpus = []
target_gpus = []
total_seconds = 0.0
source_times_ns, source_time_column = source_time_by_row(input_csv)
+source_metrics_rows = source_metrics_by_row(input_csv)
total_source_seconds = 0.0
source_time_count = 0
nonpositive_prediction_count = 0
@@ -528,6 +585,7 @@ for idx, row in enumerate(reader, start=1):
seconds = predicted_ns / 1e9
total_seconds += seconds
source_ns = source_times_ns[idx - 1] if idx - 1 < len(source_times_ns) else None
+ source_metrics = source_metrics_rows[idx - 1] if idx - 1 < len(source_metrics_rows) else {}
source_seconds = source_ns / 1e9 if source_ns is not None else None
if source_seconds is not None:
total_source_seconds += source_seconds
@@ -556,6 +614,11 @@ for idx, row in enumerate(reader, start=1):
kernel["target_gpu"] = target_gpu
if metrics:
kernel["metrics"] = metrics
+ if source_metrics:
+ kernel["source_metrics"] = source_metrics
+ comparisons = metric_comparisons(source_metrics, metrics)
+ if comparisons:
+ kernel["metric_comparisons"] = comparisons
kernels.append(kernel)
summary_metrics = {
@@ -572,7 +635,7 @@ if nonpositive_prediction_count:
"severity": "warning",
"reason": "nonpositive_predicted_execution_time",
"message": (
- "PerfTools MLP_NN/v1.5 returned non-positive predicted execution "
+ f"PerfTools MLP_NN/{model_version} returned non-positive predicted execution "
"time for one or more kernel rows. Check target GPU selection and "
"required NCU feature coverage."
),
@@ -601,7 +664,7 @@ print(json.dumps({
},
"model": {
"type": "cross_gpu_kernel_prediction_model",
- "name": "PerfTools MLP_NN/v1.5",
+ "name": "PerfTools MLP_NN/" + model_version,
"version": model_version,
"repository": "https://github.com/masaaki-kondo/PerfTools",
},
@@ -645,7 +708,10 @@ _bk_gpu_mlp_run_predictor() {
local root
local input_csv
local ncu_archive
- local output_dir="${BK_GPU_MLP_OUTPUT_DIR:-results/estimation_artifacts/gpu_kernel_mlp_v15}"
+ local package_name="${BK_GPU_MLP_PACKAGE_NAME:-gpu_kernel_mlp_v15}"
+ local version_dir="${BK_GPU_MLP_VERSION_DIR:-v1.5}"
+ local predictor_script="${BK_GPU_MLP_PREDICT_SCRIPT:-predict_v15.py}"
+ local output_dir="${BK_GPU_MLP_OUTPUT_DIR:-results/estimation_artifacts/${package_name}}"
local prediction_csv
local prediction_log
local input_csv_abs
@@ -672,18 +738,18 @@ _bk_gpu_mlp_run_predictor() {
if ! (
cd "$root"
- "$python_bin" MLP_NN/v1.5/predict_v15.py \
+ "$python_bin" "MLP_NN/${version_dir}/${predictor_script}" \
--csv "$input_csv_abs" \
--row "${BK_GPU_MLP_ROW:-all}" \
--out "$prediction_csv_abs" \
--log "$prediction_log_abs"
) >/dev/null; then
- echo "ERROR: PerfTools MLP_NN/v1.5 inference failed" >&2
+ echo "ERROR: PerfTools MLP_NN/${version_dir} inference failed" >&2
return 1
fi
if [[ ! -s "$prediction_csv_abs" ]]; then
- echo "ERROR: PerfTools MLP_NN/v1.5 did not create prediction CSV: ${prediction_csv_abs}" >&2
+ echo "ERROR: PerfTools MLP_NN/${version_dir} did not create prediction CSV: ${prediction_csv_abs}" >&2
return 1
fi
@@ -702,8 +768,9 @@ bk_section_package_transform_gpu_kernel_mlp_v15() {
local prediction_log=""
local run_outputs
local parsed_json
- local package_name="gpu_kernel_mlp_v15"
+ local package_name="${BK_GPU_MLP_PACKAGE_NAME:-gpu_kernel_mlp_v15}"
local model_version="${BK_GPU_MLP_MODEL_VERSION:-v1.5}"
+ local scaling_method="${BK_GPU_MLP_SCALING_METHOD:-gpu-kernel-mlp-${model_version}}"
local selector_kind=""
local selector_value=""
local selector
@@ -727,6 +794,7 @@ bk_section_package_transform_gpu_kernel_mlp_v15() {
--arg prediction_log "$prediction_log" \
--arg selector_kind "$selector_kind" \
--arg selector_value "$selector_value" \
+ --arg scaling_method "$scaling_method" \
--argjson parsed "$parsed_json" '
def selector_matches($kind; $value):
if $kind == "" or $value == "" then true
@@ -767,7 +835,7 @@ bk_section_package_transform_gpu_kernel_mlp_v15() {
end
),
bench_time: $source_section_time,
- scaling_method: (if $can_identity_fallback then "identity" else "gpu-kernel-mlp-v1.5" end),
+ scaling_method: (if $can_identity_fallback then "identity" else $scaling_method end),
estimation_package: (if $can_identity_fallback then "identity" else $parsed.estimation_package end),
requested_estimation_package: (if $can_identity_fallback then $parsed.estimation_package else (.requested_estimation_package // $parsed.estimation_package) end),
fallback_used: (if $can_identity_fallback then "identity" else null end),
diff --git a/scripts/estimation/section_packages/gpu_kernel_mlp_v21.sh b/scripts/estimation/section_packages/gpu_kernel_mlp_v21.sh
new file mode 100644
index 0000000..faeda5c
--- /dev/null
+++ b/scripts/estimation/section_packages/gpu_kernel_mlp_v21.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# gpu_kernel_mlp_v21.sh - Thin package wrapper for PerfTools MLP_NN/v2.1.
+
+bk_section_package_metadata_gpu_kernel_mlp_v21() {  # Print the static JSON metadata descriptor of package gpu_kernel_mlp_v21 on stdout; the quoted 'EOF' heredoc is emitted verbatim (no expansion).
+ cat <<'EOF'
+{
+ "name": "gpu_kernel_mlp_v21",
+ "fallback_target": "identity",
+ "source_system_scope": {
+ "kind": "benchmark_system",
+ "accepted_values": ["any"]
+ },
+ "target_system_scope": {
+ "accepted_values": ["any"]
+ },
+ "item_kind_scope": ["section"],
+ "required_result_fields": ["name", "app-side GPU section time as time or bench_time"],
+ "required_artifact_kinds": [
+ "PerfTools MLP_NN/v2.1 prepared input CSV",
+ "precomputed prediction CSV",
+ "or BenchKit padata archive with Nsight Compute raw CSV"
+ ],
+ "acquisition_mode": "external",
+ "output_fields": [
+ "time",
+ "bench_time",
+ "scaling_method",
+ "metrics",
+ "package_applicability"
+ ]
+}
+EOF
+}
+
+bk_section_package_check_applicability_gpu_kernel_mlp_v21() (  # Applicability check for v2.1: delegates to the v1.5 implementation. The ( ) body runs in a subshell, so the exports below do not leak into the caller.
+ export BK_GPU_MLP_PACKAGE_NAME="gpu_kernel_mlp_v21"  # package name; the shared MLP code falls back to gpu_kernel_mlp_v15 when this is unset
+ export BK_GPU_MLP_VERSION_DIR="v2.1"  # directory component in MLP_NN/<dir>/<script> used by the shared predictor runner
+ export BK_GPU_MLP_PREDICT_SCRIPT="predict_v21.py"  # script file name in that same predictor path
+ export BK_GPU_MLP_MODEL_VERSION="v2.1"  # model version label; the shared MLP code defaults it to v1.5
+ export BK_GPU_MLP_SCALING_METHOD="gpu-kernel-mlp-v2.1"  # scaling_method label reported by the shared transform when no identity fallback applies
+ bk_section_package_check_applicability_gpu_kernel_mlp_v15 "$@"  # forward all arguments unchanged; the subshell's status is that of this call
+)
+
+bk_section_package_transform_gpu_kernel_mlp_v21() (  # Transform entry point for v2.1: delegates to the v1.5 transform. The ( ) body runs in a subshell, so the exports below do not leak into the caller.
+ export BK_GPU_MLP_PACKAGE_NAME="gpu_kernel_mlp_v21"  # package name; the shared MLP code falls back to gpu_kernel_mlp_v15 when this is unset
+ export BK_GPU_MLP_VERSION_DIR="v2.1"  # directory component in MLP_NN/<dir>/<script> used by the shared predictor runner
+ export BK_GPU_MLP_PREDICT_SCRIPT="predict_v21.py"  # script file name in that same predictor path
+ export BK_GPU_MLP_MODEL_VERSION="v2.1"  # model version label; the v1.5 transform defaults it to v1.5
+ export BK_GPU_MLP_SCALING_METHOD="gpu-kernel-mlp-v2.1"  # scaling_method label the v1.5 transform reports when no identity fallback applies
+ bk_section_package_transform_gpu_kernel_mlp_v15 "$@"  # forward all arguments unchanged; the subshell's status is that of this call
+)
diff --git a/scripts/estimation/section_packages/gpu_kernel_mlp_v40.sh b/scripts/estimation/section_packages/gpu_kernel_mlp_v40.sh
new file mode 100644
index 0000000..09cf87d
--- /dev/null
+++ b/scripts/estimation/section_packages/gpu_kernel_mlp_v40.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# gpu_kernel_mlp_v40.sh - Thin package wrapper for PerfTools MLP_NN/v4.0.
+
+bk_section_package_metadata_gpu_kernel_mlp_v40() {  # Print the static JSON metadata descriptor of package gpu_kernel_mlp_v40 on stdout; the quoted 'EOF' heredoc is emitted verbatim (no expansion).
+ cat <<'EOF'
+{
+ "name": "gpu_kernel_mlp_v40",
+ "fallback_target": "identity",
+ "source_system_scope": {
+ "kind": "benchmark_system",
+ "accepted_values": ["any"]
+ },
+ "target_system_scope": {
+ "accepted_values": ["any"]
+ },
+ "item_kind_scope": ["section"],
+ "required_result_fields": ["name", "app-side GPU section time as time or bench_time"],
+ "required_artifact_kinds": [
+ "PerfTools MLP_NN/v4.0 prepared input CSV",
+ "precomputed prediction CSV",
+ "or BenchKit padata archive with Nsight Compute raw CSV"
+ ],
+ "acquisition_mode": "external",
+ "output_fields": [
+ "time",
+ "bench_time",
+ "scaling_method",
+ "metrics",
+ "package_applicability"
+ ]
+}
+EOF
+}
+
+bk_section_package_check_applicability_gpu_kernel_mlp_v40() (  # Applicability check for v4.0: delegates to the v1.5 implementation. The ( ) body runs in a subshell, so the exports below do not leak into the caller.
+ export BK_GPU_MLP_PACKAGE_NAME="gpu_kernel_mlp_v40"  # package name; the shared MLP code falls back to gpu_kernel_mlp_v15 when this is unset
+ export BK_GPU_MLP_VERSION_DIR="v4.0"  # directory component in MLP_NN/<dir>/<script> used by the shared predictor runner
+ export BK_GPU_MLP_PREDICT_SCRIPT="predict_v40.py"  # script file name in that same predictor path
+ export BK_GPU_MLP_MODEL_VERSION="v4.0"  # model version label; the shared MLP code defaults it to v1.5
+ export BK_GPU_MLP_SCALING_METHOD="gpu-kernel-mlp-v4.0"  # scaling_method label reported by the shared transform when no identity fallback applies
+ bk_section_package_check_applicability_gpu_kernel_mlp_v15 "$@"  # forward all arguments unchanged; the subshell's status is that of this call
+)
+
+bk_section_package_transform_gpu_kernel_mlp_v40() (  # Transform entry point for v4.0: delegates to the v1.5 transform. The ( ) body runs in a subshell, so the exports below do not leak into the caller.
+ export BK_GPU_MLP_PACKAGE_NAME="gpu_kernel_mlp_v40"  # package name; the shared MLP code falls back to gpu_kernel_mlp_v15 when this is unset
+ export BK_GPU_MLP_VERSION_DIR="v4.0"  # directory component in MLP_NN/<dir>/<script> used by the shared predictor runner
+ export BK_GPU_MLP_PREDICT_SCRIPT="predict_v40.py"  # script file name in that same predictor path
+ export BK_GPU_MLP_MODEL_VERSION="v4.0"  # model version label; the v1.5 transform defaults it to v1.5
+ export BK_GPU_MLP_SCALING_METHOD="gpu-kernel-mlp-v4.0"  # scaling_method label the v1.5 transform reports when no identity fallback applies
+ bk_section_package_transform_gpu_kernel_mlp_v15 "$@"  # forward all arguments unchanged; the subshell's status is that of this call
+)
diff --git a/scripts/estimation/section_packages/gpu_kernel_mlp_v41.sh b/scripts/estimation/section_packages/gpu_kernel_mlp_v41.sh
new file mode 100644
index 0000000..9628c86
--- /dev/null
+++ b/scripts/estimation/section_packages/gpu_kernel_mlp_v41.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# gpu_kernel_mlp_v41.sh - Thin package wrapper for PerfTools MLP_NN/v4.1.
+
+bk_section_package_metadata_gpu_kernel_mlp_v41() {  # Print the static JSON metadata descriptor of package gpu_kernel_mlp_v41 on stdout; the quoted 'EOF' heredoc is emitted verbatim (no expansion).
+ cat <<'EOF'
+{
+ "name": "gpu_kernel_mlp_v41",
+ "fallback_target": "identity",
+ "source_system_scope": {
+ "kind": "benchmark_system",
+ "accepted_values": ["any"]
+ },
+ "target_system_scope": {
+ "accepted_values": ["any"]
+ },
+ "item_kind_scope": ["section"],
+ "required_result_fields": ["name", "app-side GPU section time as time or bench_time"],
+ "required_artifact_kinds": [
+ "PerfTools MLP_NN/v4.1 prepared input CSV",
+ "precomputed prediction CSV",
+ "or BenchKit padata archive with Nsight Compute raw CSV"
+ ],
+ "acquisition_mode": "external",
+ "output_fields": [
+ "time",
+ "bench_time",
+ "scaling_method",
+ "metrics",
+ "package_applicability"
+ ]
+}
+EOF
+}
+
+bk_section_package_check_applicability_gpu_kernel_mlp_v41() (  # Applicability check for v4.1: delegates to the v1.5 implementation. The ( ) body runs in a subshell, so the exports below do not leak into the caller.
+ export BK_GPU_MLP_PACKAGE_NAME="gpu_kernel_mlp_v41"  # package name; the shared MLP code falls back to gpu_kernel_mlp_v15 when this is unset
+ export BK_GPU_MLP_VERSION_DIR="v4.1"  # directory component in MLP_NN/<dir>/<script> used by the shared predictor runner
+ export BK_GPU_MLP_PREDICT_SCRIPT="predict_v41.py"  # script file name in that same predictor path
+ export BK_GPU_MLP_MODEL_VERSION="v4.1"  # model version label; the shared MLP code defaults it to v1.5
+ export BK_GPU_MLP_SCALING_METHOD="gpu-kernel-mlp-v4.1"  # scaling_method label reported by the shared transform when no identity fallback applies
+ bk_section_package_check_applicability_gpu_kernel_mlp_v15 "$@"  # forward all arguments unchanged; the subshell's status is that of this call
+)
+
+bk_section_package_transform_gpu_kernel_mlp_v41() (  # Transform entry point for v4.1: delegates to the v1.5 transform. The ( ) body runs in a subshell, so the exports below do not leak into the caller.
+ export BK_GPU_MLP_PACKAGE_NAME="gpu_kernel_mlp_v41"  # package name; the shared MLP code falls back to gpu_kernel_mlp_v15 when this is unset
+ export BK_GPU_MLP_VERSION_DIR="v4.1"  # directory component in MLP_NN/<dir>/<script> used by the shared predictor runner
+ export BK_GPU_MLP_PREDICT_SCRIPT="predict_v41.py"  # script file name in that same predictor path
+ export BK_GPU_MLP_MODEL_VERSION="v4.1"  # model version label; the v1.5 transform defaults it to v1.5
+ export BK_GPU_MLP_SCALING_METHOD="gpu-kernel-mlp-v4.1"  # scaling_method label the v1.5 transform reports when no identity fallback applies
+ bk_section_package_transform_gpu_kernel_mlp_v15 "$@"  # forward all arguments unchanged; the subshell's status is that of this call
+)
diff --git a/scripts/tests/test_estimation_gpu_kernel_ensemble_average.sh b/scripts/tests/test_estimation_gpu_kernel_ensemble_average.sh
index 5534e83..244aa6a 100644
--- a/scripts/tests/test_estimation_gpu_kernel_ensemble_average.sh
+++ b/scripts/tests/test_estimation_gpu_kernel_ensemble_average.sh
@@ -56,8 +56,8 @@ kern_a,H100,A100,0,30
EOF
cat > "${TMP_DIR}/source_input_single.csv" <<'EOF'
-Kernel Name,Duration [ns]
-kern_a,1000
+Kernel Name,Duration [ns],Memory Throughput [%],Achieved Occupancy
+kern_a,1000,25,10
EOF
cat > "${TMP_DIR}/lightgbm_pred_mixed.csv" <<'EOF'
@@ -72,9 +72,9 @@ kern_b,H100,A100,5000,20
EOF
cat > "${TMP_DIR}/source_input_mixed.csv" <<'EOF'
-Kernel Name,Duration [ns]
-kern_a,1000
-kern_b,2000
+Kernel Name,Duration [ns],Memory Throughput [%],Achieved Occupancy
+kern_a,1000,25,10
+kern_b,2000,40,20
EOF
cat > "${TMP_DIR}/breakdown.json" <<'EOF'
@@ -129,10 +129,39 @@ if ! echo "$transformed_single" | jq -e '
.sections[0].estimation_package == "gpu_kernel_ensemble_average" and
near(.sections[0].time; 20) and
.sections[0].scaling_method == "gpu-kernel-ensemble-average" and
+ .sections[0].bench_time == 10 and
.sections[0].metrics.aggregation == "single-kernel-package-ratio-mean" and
.sections[0].metrics.candidate_count == 2 and
.sections[0].metrics.applicable_candidate_count == 2 and
.sections[0].metrics.candidate_packages == ["gpu_kernel_lightgbm_v10", "gpu_kernel_mlp_v15"] and
+ (.sections[0].metrics.package_summaries | length == 2) and
+ .sections[0].metrics.package_summaries[0].estimation_package == "gpu_kernel_lightgbm_v10" and
+ .sections[0].metrics.package_summaries[0].source_section_time == 10 and
+ near(.sections[0].metrics.package_summaries[0].projected_section_time; 10) and
+ near(.sections[0].metrics.package_summaries[0].time_ratio_predicted_over_source; 1) and
+ .sections[0].metrics.package_summaries[0].source_gpus == ["H100"] and
+ .sections[0].metrics.package_summaries[0].target_gpus == ["A100"] and
+ .sections[0].metrics.package_summaries[0].ncu_sample.kernel_count == 1 and
+ .sections[0].metrics.package_summaries[0].ncu_sample.source_time_ns == 1000 and
+ .sections[0].metrics.package_summaries[0].ncu_sample.predicted_time_ns == 1000 and
+ .sections[0].metrics.package_summaries[1].estimation_package == "gpu_kernel_mlp_v15" and
+ near(.sections[0].metrics.package_summaries[1].projected_section_time; 30) and
+ near(.sections[0].metrics.package_summaries[1].time_ratio_predicted_over_source; 3) and
+ (.sections[0].metrics.kernel_summaries | length == 1) and
+ .sections[0].metrics.kernel_summaries[0].name == "kern_a" and
+ (.sections[0].metrics.kernel_summaries[0].package_summaries | length == 2) and
+ .sections[0].metrics.kernel_summaries[0].package_summaries[0].estimation_package == "gpu_kernel_lightgbm_v10" and
+ .sections[0].metrics.kernel_summaries[0].package_summaries[0].sample_count == 1 and
+ .sections[0].metrics.kernel_summaries[0].package_summaries[0].source_gpus == ["H100"] and
+ .sections[0].metrics.kernel_summaries[0].package_summaries[0].target_gpus == ["A100"] and
+ .sections[0].metrics.kernel_summaries[0].package_summaries[0].source_time_ns_total == 1000 and
+ .sections[0].metrics.kernel_summaries[0].package_summaries[0].predicted_time_ns_total == 1000 and
+ near(.sections[0].metrics.kernel_summaries[0].package_summaries[0].mean_time_ratio_predicted_over_source; 1) and
+ (.sections[0].metrics.kernel_summaries[0].package_summaries[0].metric_comparisons | length >= 2) and
+ (.sections[0].metrics.kernel_summaries[0].package_summaries[0].metric_comparisons | map(select(.name == "O-Memory Throughput [%]" and .source_value_mean == 25 and .predicted_value_mean == 50 and .ratio_predicted_over_source_mean == 2)) | length == 1) and
+ .sections[0].metrics.kernel_summaries[0].package_summaries[1].estimation_package == "gpu_kernel_mlp_v15" and
+ near(.sections[0].metrics.kernel_summaries[0].package_summaries[1].mean_time_ratio_predicted_over_source; 3) and
+ (.sections[0].metrics.kernel_summaries[0].package_summaries[1].metric_comparisons | map(select(.name == "Memory Throughput [%]" and .source_value_mean == 25 and .predicted_value_mean == 30 and .ratio_predicted_over_source_mean == 1.2)) | length == 1) and
near(.sections[0].metrics.mean_time_ratio_predicted_over_source; 2) and
.sections[0].metrics.unique_kernel_count == 1 and
.sections[0].metrics.kernel_names == ["kern_a"] and
diff --git a/scripts/tests/test_estimation_gpu_kernel_mlp_v15.sh b/scripts/tests/test_estimation_gpu_kernel_mlp_v15.sh
index 82da30b..781f176 100644
--- a/scripts/tests/test_estimation_gpu_kernel_mlp_v15.sh
+++ b/scripts/tests/test_estimation_gpu_kernel_mlp_v15.sh
@@ -133,6 +133,16 @@ if args.log:
handle.write("fake predictor called\n")
PY
+for version_script in \
+ "v2.1 predict_v21.py" \
+ "v4.0 predict_v40.py" \
+ "v4.1 predict_v41.py"; do  # each entry is "<MLP_NN version dir> <predictor script name>"
+ read -r version_dir script_name <<< "$version_script"  # split the pair on whitespace into its two fields
+ mkdir -p "${FAKE_PERFTOOLS}/MLP_NN/${version_dir}"  # create the per-version directory in the fake PerfTools tree
+ cp "${FAKE_PERFTOOLS}/MLP_NN/v1.5/predict_v15.py" \
+ "${FAKE_PERFTOOLS}/MLP_NN/${version_dir}/${script_name}"  # reuse the fake v1.5 predictor under the version-specific script name
+done
+
cat > "${TMP_DIR}/input.csv" <<'EOF'
kernel_name,src_gpu,tgt_gpu,Execution Time
probe_kernel,A100,H100,2000000
@@ -191,4 +201,53 @@ echo "$transformed_from_input" | jq -e '
test -f "${TMP_DIR}/mlp_outputs/unknown_gpu_kernel_region_local_pred.csv"
test -f "${TMP_DIR}/mlp_outputs/unknown_gpu_kernel_region_local.log"
+unset BK_GPU_MLP_OUTPUT_DIR
+for package_version in \
+ "gpu_kernel_mlp_v21 v2.1" \
+ "gpu_kernel_mlp_v40 v4.0" \
+ "gpu_kernel_mlp_v41 v4.1"; do
+ read -r package_name version_label <<< "$package_version"
+ cat > "${TMP_DIR}/breakdown_${package_name}.json" < | ||||||||||||||||||||||||