diff --git a/docs/guides/add-estimation-package.md b/docs/guides/add-estimation-package.md index 3f75220..0babb67 100644 --- a/docs/guides/add-estimation-package.md +++ b/docs/guides/add-estimation-package.md @@ -45,6 +45,9 @@ - `overlap_max_basic.sh` - `gpu_kernel_lightgbm_v10.sh` - `gpu_kernel_mlp_v15.sh` + - `gpu_kernel_mlp_v21.sh` + - `gpu_kernel_mlp_v40.sh` + - `gpu_kernel_mlp_v41.sh` ## 3. top-level package の責務 @@ -75,6 +78,18 @@ GPU kernel 単位の外部推定ツールは、通常は section package とし - `gpu_kernel_mlp_v15` - PerfTools `MLP_NN/v1.5` - 主な依存: numpy/pandas/torch +- `gpu_kernel_mlp_v21` + - PerfTools `MLP_NN/v2.1` + - v1.5 NN と analytical anchor を組み合わせた hybrid/reference 系 + - 主な依存: numpy/pandas/torch +- `gpu_kernel_mlp_v40` + - PerfTools `MLP_NN/v4.0` + - no-ET pure NN 系 + - 主な依存: numpy/pandas/torch +- `gpu_kernel_mlp_v41` + - PerfTools `MLP_NN/v4.1` + - v4.0 に single-axis trend 対応を加えた NN 系 + - 主な依存: numpy/pandas/torch - `gpu_kernel_lightgbm_v10` - PerfTools `LightGBM_model/1.0` - 主な依存: numpy/pandas/lightgbm/pyyaml と `libgomp` diff --git a/programs/genesis/README.md b/programs/genesis/README.md index 0312566..a66a188 100644 --- a/programs/genesis/README.md +++ b/programs/genesis/README.md @@ -124,13 +124,19 @@ Single-package selection: ```bash BK_GENESIS_GPU_SECTION_PACKAGE=gpu_kernel_mlp_v15 # or +BK_GENESIS_GPU_SECTION_PACKAGE=gpu_kernel_mlp_v21 +# or +BK_GENESIS_GPU_SECTION_PACKAGE=gpu_kernel_mlp_v40 +# or +BK_GENESIS_GPU_SECTION_PACKAGE=gpu_kernel_mlp_v41 +# or BK_GENESIS_GPU_SECTION_PACKAGE=gpu_kernel_lightgbm_v10 ``` Multiple-package comparison: ```bash -BK_GENESIS_GPU_SECTION_PACKAGES=gpu_kernel_lightgbm_v10,gpu_kernel_mlp_v15 +BK_GENESIS_GPU_SECTION_PACKAGES=gpu_kernel_lightgbm_v10,gpu_kernel_mlp_v15,gpu_kernel_mlp_v21,gpu_kernel_mlp_v40,gpu_kernel_mlp_v41 ``` When multiple packages are selected, the app wrapper asks for diff --git a/programs/genesis/estimate.sh b/programs/genesis/estimate.sh index 2fffbd7..1bd4d7c 100644 --- a/programs/genesis/estimate.sh +++ b/programs/genesis/estimate.sh @@ -9,7 +9,7 @@ genesis_gpu_section_packages() { elif [[ -n "${BK_GENESIS_GPU_SECTION_PACKAGE:-}" ]]; then raw="$BK_GENESIS_GPU_SECTION_PACKAGE" else - raw="gpu_kernel_lightgbm_v10,gpu_kernel_mlp_v15" + raw="gpu_kernel_lightgbm_v10,gpu_kernel_mlp_v15,gpu_kernel_mlp_v21,gpu_kernel_mlp_v40,gpu_kernel_mlp_v41" fi printf '%s\n' "$raw" | @@ -340,7 +340,6 @@ genesis_run_single_estimate() { "${BK_ESTIMATION_CURRENT_TARGET_NODES:-1}" \ "${BK_ESTIMATION_CURRENT_PACKAGE:-weakscaling}" est_current_fom="${est_current_bench_fom:-$est_current_fom}" - est_current_fom_breakdown="" if [[ "$synthetic_breakdown" -eq 1 ]]; then genesis_mark_gpu_section_time_missing diff --git a/result_server/templates/_estimated_breakdown_card.html b/result_server/templates/_estimated_breakdown_card.html index c63b97d..ba39f2a 100644 --- a/result_server/templates/_estimated_breakdown_card.html +++ b/result_server/templates/_estimated_breakdown_card.html @@ -10,13 +10,92 @@ {% endif %} {%- endmacro %} +{% macro render_kernel_package_comparisons(item) -%} + {% set kernel_summaries = item.get('metrics', {}).get('kernel_summaries', []) %} + {% if kernel_summaries %} +
+
Kernel package comparison; metrics are shown as reported by each package.
+ {% for kernel in kernel_summaries %} +
+
{{ kernel.get('name', 'N/A') }}
+ + + + + + + + + + + + + + {% for package in kernel.get('package_summaries', []) %} + + + + + + + + + + {% if package.get('metric_comparisons') %} + + + + {% endif %} + {% endfor %} + +
PackageSamplesSource Mean (ns)Predicted Mean (ns)RatioSource GPUTarget GPU
{{ package.get('estimation_package', 'N/A') }}{{ package.get('sample_count', 'N/A') }}{{ package.get('source_time_ns_mean', 'N/A') }}{{ package.get('predicted_time_ns_mean', 'N/A') }}{{ package.get('mean_time_ratio_predicted_over_source', 'N/A') }}{{ package.get('source_gpus', []) | join(', ') }}{{ package.get('target_gpus', []) | join(', ') }}
+
+
{{ package.get('estimation_package', 'N/A') }} metrics
+ + + + + + + + + + + + {% for metric in package.get('metric_comparisons', []) %} + + + + + + + + {% endfor %} + +
MetricSamplesSource MeanPredicted MeanRatio
{{ metric.get('name', 'N/A') }}{{ metric.get('sample_count', 'N/A') }}{{ metric.get('source_value_mean', 'N/A') }}{{ metric.get('predicted_value_mean', 'N/A') }}{{ metric.get('ratio_predicted_over_source_mean', 'N/A') }}
+
+
+
+ {% endfor %} +
+ {% endif %} +{%- endmacro %} + {% macro render_breakdown_table(heading, items, first_column_label, first_column_key, join_list_values=False) -%} {% if items %}

{{ heading }}

- + + + + + + + + + {% for item in items %} @@ -26,6 +105,7 @@

{{ heading }}

{% endif %} + @@ -34,7 +114,7 @@

{{ heading }}

{% if item.get('candidate_estimates') %} -
{{ first_column_label }}TimePackageScalingFallbackApplicability
{{ first_column_label }}Bench TimeEstimated TimePackageScalingFallbackApplicability
{{ first_value }}{{ item.get('bench_time', item.get('time', 'N/A')) }} {{ item.get('time', 'N/A') }} {{ item.get('estimation_package', 'N/A') }} {{ item.get('scaling_method', 'N/A') }}
+
Candidate estimates; mean time is used for FOM composition.
@@ -57,6 +137,13 @@

{{ heading }}

{% endif %} + {% if item.get('metrics', {}).get('kernel_summaries') %} + + + + {% endif %} {% endfor %}
+ {{ render_kernel_package_comparisons(item) }} +
diff --git a/result_server/templates/estimated_detail.html b/result_server/templates/estimated_detail.html index 2e26b4c..68c0835 100644 --- a/result_server/templates/estimated_detail.html +++ b/result_server/templates/estimated_detail.html @@ -14,6 +14,28 @@ .detail-table { margin-bottom: 20px; } .detail-table th { text-align: left; min-width: 200px; background-color: #eef6f8; } .detail-table td { min-width: 260px; white-space: normal; } + .system-comparison-table { + width: 100%; + table-layout: fixed; + margin-bottom: 20px; + } + .system-comparison-table th { + text-align: left; + background-color: #eef6f8; + } + .system-comparison-table th, + .system-comparison-table td { + padding: 8px 10px; + border: 1px solid #d8e3e8; + white-space: normal; + overflow-wrap: anywhere; + } + .system-comparison-table th:first-child, + .system-comparison-table td:first-child { + width: 26%; + font-weight: 700; + color: #102a43; + } .applicability-summary { margin-bottom: 20px; padding: 16px 18px; @@ -93,6 +115,63 @@ padding: 5px 7px; border: 1px solid #e2e8f0; } + .kernel-comparisons { + margin-top: 10px; + padding: 10px 12px; + border-radius: 10px; + background: #f8fafc; + border: 1px solid #e2e8f0; + } + .kernel-comparisons-title { + margin-bottom: 8px; + color: #475569; + font-size: 12px; + font-weight: 600; + } + .kernel-comparison { + margin-top: 10px; + padding-top: 10px; + border-top: 1px solid #e2e8f0; + } + .kernel-comparison:first-of-type { + margin-top: 0; + padding-top: 0; + border-top: 0; + } + .kernel-name { + margin-bottom: 6px; + font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; + font-size: 12px; + color: #334155; + overflow-wrap: anywhere; + } + .kernel-package-table, + .kernel-metrics-table { + width: 100%; + border-collapse: collapse; + font-size: 12px; + } + .kernel-package-table th, + .kernel-package-table td, + .kernel-metrics-table th, + .kernel-metrics-table td { + padding: 5px 7px; + border: 1px solid #e2e8f0; + vertical-align: top; + } + .kernel-package-table th, + .kernel-metrics-table th { + background: #eef6f8; + } + .kernel-metrics { + margin-top: 8px; + } + .kernel-metrics-summary { + margin-bottom: 5px; + color: #475569; + font-size: 12px; + font-weight: 600; + } .empty-note { color: #6b7280; font-size: 13px; } @@ -121,9 +200,29 @@

Applicability Summary

{{ render_json_block("Confidence", confidence_json) }}
- {{ render_titled_key_value_table("Current System", current_rows, "detail-table", "detail-table-wrap", "detail-card") }} - - {{ render_titled_key_value_table("Future System", future_rows, "detail-table", "detail-table-wrap", "detail-card") }} +
+

System Comparison

+
+ + + + + + + + + + {% for row in system_comparison_rows %} + + + {{ row.current }} + {{ row.future }} + + {% endfor %} + +
ItemCurrent SystemFuture System
{{ row.label }}
+
+
diff --git a/result_server/tests/test_estimated_detail_template.py b/result_server/tests/test_estimated_detail_template.py index c57c0e1..ce1e461 100644 --- a/result_server/tests/test_estimated_detail_template.py +++ b/result_server/tests/test_estimated_detail_template.py @@ -73,7 +73,7 @@ "fom": 0.944, "target_nodes": "1024", "scaling_method": "weakscaling", - "benchmark": {"system": "Fugaku", "fom": 0.386, "nodes": "1"}, + "benchmark": {"system": "Fugaku", "fom": 0.386, "nodes": "1", "numproc_node": "4"}, "model": {"name": "weakscaling-current", "type": "intra_system_scaling_model"}, "fom_breakdown": { "sections": [ @@ -109,7 +109,7 @@ "fom": 9.054, "target_nodes": "256", "scaling_method": "instrumented-app-sections-dummy", - "benchmark": {"system": "MiyabiG", "fom": 5.712, "nodes": "1"}, + "benchmark": {"system": "MiyabiG", "fom": 5.712, "nodes": "1", "numproc_node": "8"}, "model": {"name": "instrumented-app-sections-future-projection", "type": "cross_system_projection_model"}, "fom_breakdown": { "sections": [ @@ -140,6 +140,51 @@ "metrics": {"time_ratio_predicted_over_source": 2.6666666667}, }, ], + "metrics": { + "kernel_summaries": [ + { + "name": "kern_build_pairlist", + "package_summaries": [ + { + "estimation_package": "gpu_kernel_lightgbm_v10", + "sample_count": 1, + "source_time_ns_mean": 310816, + "predicted_time_ns_mean": 294442.45, + "mean_time_ratio_predicted_over_source": 0.9473, + "source_gpus": ["H100"], + "target_gpus": ["GB200"], + "metric_comparisons": [ + { + "name": "O-Memory Throughput [%]", + "sample_count": 1, + "source_value_mean": 52.12, + "predicted_value_mean": 49.10, + "ratio_predicted_over_source_mean": 0.9421, + } + ], + }, + { + "estimation_package": "gpu_kernel_mlp_v15", + "sample_count": 5, + "source_time_ns_mean": 159104, + "predicted_time_ns_mean": 70311.2, + "mean_time_ratio_predicted_over_source": 0.4423, + "source_gpus": ["H100"], + "target_gpus": ["GB200"], + "metric_comparisons": [ + { + "name": "Memory Throughput [%]", + "sample_count": 5, + "source_value_mean": 52.55, + "predicted_value_mean": 41.34, + "ratio_predicted_over_source_mean": 0.7866, + } + ], + }, + ], + } + ] + }, } ], "overlaps": [], @@ -201,8 +246,10 @@ def test_estimated_detail_template_renders_sections(app): assert "Applicability Summary" in html assert "Package Resolution" in html assert "Re-Estimation Context" in html + assert "System Comparison" in html assert "Current System" in html assert "Future System" in html + assert "Benchmark Processes/Node" in html assert "Estimate succeeded, but part of the breakdown used fallback handling." in html assert "required action: collect-section-specific-package-inputs" in html assert "weakscaling" in html @@ -230,6 +277,15 @@ def test_estimated_detail_template_renders_sections(app): assert "overlap_package_unsupported:half" in html assert "Candidate estimates" in html assert "Time Ratio" in html + assert "Bench Time" in html + assert "Estimated Time" in html assert "gpu_kernel_ensemble_average" in html assert "gpu_kernel_lightgbm_v10" in html assert "gpu_kernel_mlp_v15" in html + assert "Kernel package comparison" in html + assert "kern_build_pairlist" in html + assert "Source Mean (ns)" in html + assert "Predicted Mean (ns)" in html + assert "O-Memory Throughput [%]" in html + assert "Memory Throughput [%]" in html + assert "GB200" in html diff --git a/result_server/utils/estimated_detail_view.py b/result_server/utils/estimated_detail_view.py index 5755c2f..52e4084 100644 --- a/result_server/utils/estimated_detail_view.py +++ b/result_server/utils/estimated_detail_view.py @@ -20,6 +20,7 @@ def build_estimated_detail_context(result): "reestimation_rows": _build_reestimation_rows(reestimation), "current_rows": _build_system_rows(current), "future_rows": _build_system_rows(future), + "system_comparison_rows": _build_system_comparison_rows(current, future), "measurement_json": result.get("measurement", {}), "confidence_json": result.get("confidence", {}), "assumptions_json": result.get("assumptions", {}), @@ -63,13 +64,19 @@ def _build_package_rows(estimate_meta, applicability): current_package = estimate_meta.get("current_package", {}) future_package = estimate_meta.get("future_package", {}) rows = build_labeled_value_rows([ - ("Top-Level Requested", estimate_meta.get("requested_estimation_package", "N/A")), - ("Top-Level Applied", estimate_meta.get("estimation_package", "N/A")), + ("Top-Level Package", _format_package_resolution( + estimate_meta.get("requested_estimation_package", "N/A"), + estimate_meta.get("estimation_package", "N/A"), + )), ("Top-Level Fallback", applicability.get("fallback_used", "none")), - ("Current Requested", current_package.get("requested_estimation_package", "N/A")), - ("Current Applied", current_package.get("estimation_package", "N/A")), - ("Future Requested", future_package.get("requested_estimation_package", "N/A")), - ("Future Applied", future_package.get("estimation_package", "N/A")), + ("Current Package", _format_package_resolution( + current_package.get("requested_estimation_package", "N/A"), + current_package.get("estimation_package", "N/A"), + )), + ("Future Package", _format_package_resolution( + future_package.get("requested_estimation_package", "N/A"), + future_package.get("estimation_package", "N/A"), + )), ]) _append_list_row(rows, "Missing Inputs", applicability.get("missing_inputs", [])) @@ -78,6 +85,12 @@ def _build_package_rows(estimate_meta, applicability): return rows +def _format_package_resolution(requested, applied): + if requested == applied: + return applied + return f"{applied} (requested: {requested})" + + def _build_system_rows(system_data): benchmark = system_data.get("benchmark", {}) breakdown = system_data.get("fom_breakdown", {}) @@ -97,6 +110,40 @@ def _build_system_rows(system_data): ]) +def _build_system_comparison_rows(current, future): + current_rows = _build_comparison_system_rows(current) + future_rows = _build_comparison_system_rows(future) + future_by_label = {row["label"]: row for row in future_rows} + rows = [] + for current_row in current_rows: + label = current_row["label"] + future_row = future_by_label.get(label, {}) + rows.append({ + "label": label, + "current": current_row.get("value", "N/A"), + "future": future_row.get("value", "N/A"), + "current_class": current_row.get("value_class", ""), + "future_class": future_row.get("value_class", ""), + }) + return rows + + +def _build_comparison_system_rows(system_data): + benchmark = system_data.get("benchmark", {}) + breakdown = system_data.get("fom_breakdown", {}) + return build_labeled_value_rows([ + ("System", system_data.get("system", "N/A")), + ("FOM", format_numeric_value(system_data.get("fom", "N/A"))), + ("Target Nodes", system_data.get("target_nodes", "N/A")), + ("Benchmark System", benchmark.get("system", "N/A")), + ("Benchmark FOM", format_numeric_value(benchmark.get("fom", "N/A"))), + ("Benchmark Nodes", benchmark.get("nodes", "N/A")), + ("Benchmark Processes/Node", benchmark.get("numproc_node", "N/A")), + ("Sections", len(breakdown.get("sections", []))), + ("Overlaps", len(breakdown.get("overlaps", []))), + ]) + + def _build_reestimation_rows(reestimation): if not reestimation: return [] diff --git a/scripts/estimation/common.sh b/scripts/estimation/common.sh index 157c23b..7ec9158 100644 --- a/scripts/estimation/common.sh +++ b/scripts/estimation/common.sh @@ -282,6 +282,8 @@ bk_estimation_run_recorded_current_with_weakscaling() { local current_package="${4:-${BK_ESTIMATION_CURRENT_PACKAGE:-weakscaling}}" local current_model_version="" local baseline_breakdown="" + local current_breakdown_total="" + local current_breakdown_factor="" bk_estimation_load_package "$current_package" current_model_version="${BK_ESTIMATION_PACKAGE_VERSION:-0.1}" @@ -305,6 +307,16 @@ bk_estimation_run_recorded_current_with_weakscaling() { "1" \ "identity" \ "identity") + if [[ -n "$est_current_fom_breakdown" && "$est_current_fom_breakdown" != "null" && -n "${est_current_bench_fom:-}" ]]; then + current_breakdown_total=$(bk_top_level_breakdown_total_time "$est_current_fom_breakdown") + if [[ -n "$current_breakdown_total" && "$current_breakdown_total" != "0" && "$current_breakdown_total" != "null" ]]; then + current_breakdown_factor=$(awk -v target="$est_current_bench_fom" -v source="$current_breakdown_total" 'BEGIN {printf "%.12f", target / source}') + est_current_fom_breakdown=$(bk_top_level_scale_breakdown_times \ + "$est_current_fom_breakdown" \ + "$current_breakdown_factor" \ + "$current_package") + fi + fi est_current_fom=$(bk_top_level_breakdown_total_time "$est_current_fom_breakdown") if declare -F bk_estimation_package_build_recorded_current_model_json >/dev/null 2>&1; then est_current_model_json=$(bk_estimation_package_build_recorded_current_model_json "$baseline_system" "$current_model_version") diff --git a/scripts/estimation/packages/instrumented_app_sections_dummy.sh b/scripts/estimation/packages/instrumented_app_sections_dummy.sh index ecbe9bd..a59a263 100644 --- a/scripts/estimation/packages/instrumented_app_sections_dummy.sh +++ b/scripts/estimation/packages/instrumented_app_sections_dummy.sh @@ -34,6 +34,9 @@ bk_estimation_package_metadata() { "gpu_kernel_ensemble_average", "gpu_kernel_lightgbm_v10", "gpu_kernel_mlp_v15", + "gpu_kernel_mlp_v21", + "gpu_kernel_mlp_v40", + "gpu_kernel_mlp_v41", "logp" ], "supported_overlap_packages": [ diff --git a/scripts/estimation/prepare_gpu_mlp_ncu_input.py b/scripts/estimation/prepare_gpu_mlp_ncu_input.py index 6178022..8c73442 100644 --- a/scripts/estimation/prepare_gpu_mlp_ncu_input.py +++ b/scripts/estimation/prepare_gpu_mlp_ncu_input.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Prepare a PerfTools MLP_NN/v1.5 input CSV from an Nsight Compute archive. +"""Prepare a PerfTools MLP_NN input CSV from an Nsight Compute archive. This is a small compatibility bridge for BenchKit. It converts the wide Nsight Compute raw CSV exported from ``profile.ncu-rep`` into the CSV layout @@ -354,7 +354,7 @@ def finalize_prepared_input( "TPC.TriageCompute.sm__inst_executed_realtime.avg.per_cycle_active", ).reset_index(drop=True) if "Executed Ipc Active [inst/cycle]" in df.columns: - df["Executed Ipc Active [inst/cycle]"] = ipc.iloc[: len(df)].to_numpy() + df["Executed Ipc Active [inst/cycle]"] = ipc.reindex(df.index).to_numpy() mean_ipc = df["Executed Ipc Active [inst/cycle]"].mean() df["Executed Ipc Active [inst/cycle]"] = df[ "Executed Ipc Active [inst/cycle]" @@ -411,7 +411,8 @@ def main() -> None: allowed_nan=ALLOWED_NAN_COLUMNS | set(args.allow_nan), target_gpu=args.target_gpu, ) - print(f"wrote {out_csv}: {kernel_count} kernels") + final_count = len(pd.read_csv(out_csv)) + print(f"wrote {out_csv}: {final_count} kernels") finally: if work_dir_owned and not args.keep_work: shutil.rmtree(work_dir, ignore_errors=True) diff --git a/scripts/estimation/section_packages/gpu_kernel_ensemble_average.sh b/scripts/estimation/section_packages/gpu_kernel_ensemble_average.sh index 1be6f3d..d837c54 100644 --- a/scripts/estimation/section_packages/gpu_kernel_ensemble_average.sh +++ b/scripts/estimation/section_packages/gpu_kernel_ensemble_average.sh @@ -165,7 +165,10 @@ bk_section_package_transform_gpu_kernel_ensemble_average() { target_gpu: ($kernel.target_gpu // null), estimation_package: ($candidate.estimation_package // ""), predicted_time_ns: ($kernel.predicted_time_ns // null), - time_ratio_predicted_over_source: $time_ratio + time_ratio_predicted_over_source: $time_ratio, + source_metrics: ($kernel.source_metrics // {}), + predicted_metrics: ($kernel.metrics // {}), + metric_comparisons: ($kernel.metric_comparisons // []) } ) ) @@ -177,6 +180,35 @@ bk_section_package_transform_gpu_kernel_ensemble_average() { | (blocking_candidates | length) as $blocking_count | ($candidates | length) as $candidate_count | ($usable | map(candidate_time_ratio) | map(select(. != null and . > 0))) as $usable_ratios + | (.bench_time // .time // null) as $app_section_time + | ( + $candidates + | map( + . as $candidate + | (candidate_time_ratio) as $ratio + | { + estimation_package: ($candidate.estimation_package // ""), + scaling_method: ($candidate.scaling_method // ""), + applicability_status: ($candidate.package_applicability.status // ""), + source_section_time: $app_section_time, + projected_section_time: ($candidate.time // null), + time_ratio_predicted_over_source: $ratio, + source_gpus: ($candidate.metrics.source_gpus // []), + target_gpus: ($candidate.metrics.target_gpus // []), + kernel_count: ($candidate.metrics.kernel_count // (($candidate.metrics.matched_kernels // $candidate.metrics.kernels // []) | length)), + unique_kernel_count: (($candidate.metrics.kernel_names // (($candidate.metrics.matched_kernels // $candidate.metrics.kernels // []) | map(.name // "") | unique)) | length), + kernel_names: ($candidate.metrics.kernel_names // (($candidate.metrics.matched_kernels // $candidate.metrics.kernels // []) | map(.name // "") | unique)), + ncu_sample: { + kernel_count: ($candidate.metrics.kernel_count // (($candidate.metrics.matched_kernels // $candidate.metrics.kernels // []) | length)), + source_time: ($candidate.metrics.total_source_time // (if ($candidate.metrics.total_source_time_ns // null) != null then ($candidate.metrics.total_source_time_ns / 1000000000) else null end)), + source_time_ns: ($candidate.metrics.total_source_time_ns // null), + predicted_time: ($candidate.metrics.sample_predicted_time // (if ($candidate.metrics.total_predicted_time_ns // null) != null then ($candidate.metrics.total_predicted_time_ns / 1000000000) else null end)), + predicted_time_ns: ($candidate.metrics.total_predicted_time_ns // null) + }, + artifacts: ($candidate.artifacts // []) + } + ) + ) as $package_summaries | ($usable | candidate_kernel_records) as $kernel_records | ($kernel_records | map(.name) | unique | sort) as $kernel_names | ($kernel_names | length) as $unique_kernel_count @@ -207,14 +239,69 @@ bk_section_package_transform_gpu_kernel_ensemble_average() { } ) ) as $kernel_means + | ( + $kernel_records + | sort_by(.name) + | group_by(.name) + | map( + . as $kernel_group + | { + name: $kernel_group[0].name, + package_summaries: ( + $kernel_group + | sort_by(.estimation_package) + | group_by(.estimation_package) + | map( + . as $package_group + | ($package_group | map(.source_time_ns) | map(select(. != null))) as $source_times_ns + | ($package_group | map(.predicted_time_ns) | map(select(. != null))) as $predicted_times_ns + | ($package_group | map(.time_ratio_predicted_over_source) | map(select(. != null))) as $ratios + | ( + $package_group + | map(.metric_comparisons // []) + | add // [] + | sort_by(.name) + | group_by(.name) + | map( + . as $metric_group + | ($metric_group | map(.source_value // null) | map(select(. != null))) as $source_values + | ($metric_group | map(.predicted_value // null) | map(select(. != null))) as $predicted_values + | ($metric_group | map(.ratio_predicted_over_source // null) | map(select(. != null))) as $metric_ratios + | { + name: $metric_group[0].name, + sample_count: ($metric_group | length), + source_value_mean: (if ($source_values | length) > 0 then (($source_values | add) / ($source_values | length)) else null end), + predicted_value_mean: (if ($predicted_values | length) > 0 then (($predicted_values | add) / ($predicted_values | length)) else null end), + ratio_predicted_over_source_mean: (if ($metric_ratios | length) > 0 then (($metric_ratios | add) / ($metric_ratios | length)) else null end), + samples: $metric_group + } + ) + ) as $metric_comparisons + | { + estimation_package: $package_group[0].estimation_package, + sample_count: ($package_group | length), + source_gpus: ($package_group | map(.source_gpu // empty) | unique | sort), + target_gpus: ($package_group | map(.target_gpu // empty) | unique | sort), + source_time_ns_total: (if ($source_times_ns | length) > 0 then ($source_times_ns | add) else null end), + source_time_ns_mean: (if ($source_times_ns | length) > 0 then (($source_times_ns | add) / ($source_times_ns | length)) else null end), + predicted_time_ns_total: (if ($predicted_times_ns | length) > 0 then ($predicted_times_ns | add) else null end), + predicted_time_ns_mean: (if ($predicted_times_ns | length) > 0 then (($predicted_times_ns | add) / ($predicted_times_ns | length)) else null end), + mean_time_ratio_predicted_over_source: (if ($ratios | length) > 0 then (($ratios | add) / ($ratios | length)) else null end), + metric_comparisons: $metric_comparisons + } + ) + ) + } + ) + ) as $kernel_summaries | (if ($usable_ratios | length) > 0 then (($usable_ratios | add) / ($usable_ratios | length)) else null end) as $mean_ratio - | (.bench_time // .time // null) as $app_section_time | ($blocking_count == 0 and $usable_count > 0 and $unique_kernel_count == 1 and $mean_ratio != null and $app_section_time != null) as $can_project_section | (if $can_project_section then ($app_section_time * $mean_ratio) else $app_section_time end) as $output_time | . + { estimation_package: (if $can_project_section then "gpu_kernel_ensemble_average" else "identity" end), requested_estimation_package: (.requested_estimation_package // "gpu_kernel_ensemble_average"), + bench_time: $app_section_time, time: $output_time, scaling_method: (if $can_project_section then "gpu-kernel-ensemble-average" else "identity" end), package_applicability: { @@ -262,9 +349,11 @@ bk_section_package_transform_gpu_kernel_ensemble_average() { time_ratio_predicted_over_source: candidate_time_ratio, applicability_status: (.package_applicability.status // "") })), + package_summaries: $package_summaries, kernel_count: ($kernel_records | length), unique_kernel_count: $unique_kernel_count, kernel_names: $kernel_names, + kernel_summaries: $kernel_summaries, kernel_candidate_ratios: $kernel_means, app_gpu_section_time: $app_section_time, mean_time: (if $can_project_section then $output_time else null end), diff --git a/scripts/estimation/section_packages/gpu_kernel_lightgbm_v10.sh b/scripts/estimation/section_packages/gpu_kernel_lightgbm_v10.sh index 8f95c65..7d9fedb 100644 --- a/scripts/estimation/section_packages/gpu_kernel_lightgbm_v10.sh +++ b/scripts/estimation/section_packages/gpu_kernel_lightgbm_v10.sh @@ -502,6 +502,58 @@ def source_time_by_row(path): return [as_number(row.get(time_column)) for row in rows], time_column +def source_metric_candidates(metric_name): + candidates = [metric_name] + if metric_name.startswith("O-"): + candidates.append(metric_name[2:]) + if metric_name.startswith("brk_"): + candidates.append("breakdown_" + metric_name[4:]) + if metric_name.startswith("breakdown_"): + candidates.append("brk_" + metric_name[len("breakdown_"):]) + return list(dict.fromkeys(candidates)) + + +def source_metrics_by_row(path): + if not path: + return [] + candidate = Path(path) + if not candidate.is_file(): + return [] + + rows, fieldnames = read_csv_rows(path) + if not fieldnames: + return [] + + source_rows = [] + for row in rows: + source_metrics = {} + for metric_name in metric_columns: + for source_name in source_metric_candidates(metric_name): + if source_name in fieldnames: + value = as_number(row.get(source_name)) + if value is not None: + source_metrics[metric_name] = value + break + source_rows.append(source_metrics) + return source_rows + + +def metric_comparisons(source_metrics, predicted_metrics): + comparisons = [] + for metric_name in sorted(set(source_metrics) | set(predicted_metrics)): + item = {"name": metric_name} + source_value = source_metrics.get(metric_name) + predicted_value = predicted_metrics.get(metric_name) + if source_value is not None: + item["source_value"] = source_value + if predicted_value is not None: + item["predicted_value"] = predicted_value + if source_value not in (None, 0) and predicted_value is not None: + item["ratio_predicted_over_source"] = predicted_value / source_value + comparisons.append(item) + return comparisons + + reader = csv.DictReader(cleaned_lines(prediction_csv)) if not reader.fieldnames: raise SystemExit(f"prediction CSV has no header: {prediction_csv}") @@ -518,6 +570,7 @@ source_gpus = [] target_gpus = [] total_seconds = 0.0 source_times_ns, source_time_column = source_time_by_row(input_csv) +source_metrics_rows = source_metrics_by_row(input_csv) total_source_seconds = 0.0 source_time_count = 0 @@ -537,6 +590,7 @@ for idx, row in enumerate(reader, start=1): seconds = predicted_ns / 1e9 total_seconds += seconds source_ns = source_times_ns[idx - 1] if idx - 1 < len(source_times_ns) else None + source_metrics = source_metrics_rows[idx - 1] if idx - 1 < len(source_metrics_rows) else {} source_seconds = source_ns / 1e9 if source_ns is not None else None if source_seconds is not None: total_source_seconds += source_seconds @@ -565,6 +619,11 @@ for idx, row in enumerate(reader, start=1): kernel["target_gpu"] = target_gpu if metrics: kernel["metrics"] = metrics + if source_metrics: + kernel["source_metrics"] = source_metrics + comparisons = metric_comparisons(source_metrics, metrics) + if comparisons: + kernel["metric_comparisons"] = comparisons kernels.append(kernel) summary_metrics = { @@ -666,6 +725,7 @@ _bk_gpu_lightgbm_run_predictor() { prediction_csv_abs=$(_bk_gpu_lightgbm_abs_path "$prediction_csv") prediction_log_abs=$(_bk_gpu_lightgbm_abs_path "$prediction_log") + echo "Running PerfTools LightGBM_model/1.0 for ${section_name}: ${source_gpu}->${target_gpu}" >&2 if ! ( cd "$model_dir" "$python_bin" AI_model/run_inference.py \ diff --git a/scripts/estimation/section_packages/gpu_kernel_mlp_v15.sh b/scripts/estimation/section_packages/gpu_kernel_mlp_v15.sh index 9021f75..64ad3fb 100644 --- a/scripts/estimation/section_packages/gpu_kernel_mlp_v15.sh +++ b/scripts/estimation/section_packages/gpu_kernel_mlp_v15.sh @@ -1,5 +1,6 @@ #!/bin/bash -# gpu_kernel_mlp_v15.sh - Section package for the PerfTools MLP_NN/v1.5 GPU estimator. +# gpu_kernel_mlp_v15.sh - Section package and shared implementation for +# PerfTools MLP_NN GPU estimators. bk_section_package_metadata_gpu_kernel_mlp_v15() { cat <<'EOF' @@ -143,7 +144,7 @@ _bk_gpu_mlp_ensure_perftools_root() { mkdir -p "$(dirname "$root")" if [[ ! -d "$root/.git" ]]; then - echo "Fetching PerfTools for gpu_kernel_mlp_v15: ${repo} (${ref})" >&2 + echo "Fetching PerfTools for ${BK_GPU_MLP_PACKAGE_NAME:-gpu_kernel_mlp_v15}: ${repo} (${ref})" >&2 git clone --depth 1 "$repo" "$root" >&2 || { printf '%s\n' "$root" return 0 @@ -161,13 +162,15 @@ _bk_gpu_mlp_ensure_perftools_root() { _bk_gpu_mlp_predictor() { local root="$1" + local version_dir="${BK_GPU_MLP_VERSION_DIR:-v1.5}" + local predictor_script="${BK_GPU_MLP_PREDICT_SCRIPT:-predict_v15.py}" if [[ -z "$root" ]]; then printf '%s\n' "" return 0 fi - printf '%s\n' "${root}/MLP_NN/v1.5/predict_v15.py" + printf '%s\n' "${root}/MLP_NN/${version_dir}/${predictor_script}" } _bk_gpu_mlp_python_exists() { @@ -346,6 +349,7 @@ bk_section_package_check_applicability_gpu_kernel_mlp_v15() { local root local predictor local python_bin="${BK_GPU_MLP_PYTHON:-$(_bk_gpu_mlp_default_python)}" + local predictor_rel="MLP_NN/${BK_GPU_MLP_VERSION_DIR:-v1.5}/${BK_GPU_MLP_PREDICT_SCRIPT:-predict_v15.py}" local missing=() if [[ "$item_kind" != "section" ]]; then @@ -387,7 +391,7 @@ EOF missing+=('"BK_GPU_MLP_PERFTOOLS_ROOT"') fi if [[ -z "$predictor" || ! -f "$predictor" ]]; then - missing+=('"PerfTools MLP_NN/v1.5/predict_v15.py"') + missing+=("\"PerfTools predictor:${predictor_rel}\"") fi fi @@ -490,6 +494,58 @@ def source_time_by_row(path): return [as_number(row.get(time_column)) for row in rows], time_column +def source_metric_candidates(metric_name): + candidates = [metric_name] + if metric_name.startswith("O-"): + candidates.append(metric_name[2:]) + if metric_name.startswith("brk_"): + candidates.append("breakdown_" + metric_name[4:]) + if metric_name.startswith("breakdown_"): + candidates.append("brk_" + metric_name[len("breakdown_"):]) + return list(dict.fromkeys(candidates)) + + +def source_metrics_by_row(path): + if not path: + return [] + candidate = Path(path) + if not candidate.is_file(): + return [] + + rows, fieldnames = read_csv_rows(path) + if not fieldnames: + return [] + + source_rows = [] + for row in rows: + source_metrics = {} + for metric_name in metric_columns: + for source_name in source_metric_candidates(metric_name): + if source_name in fieldnames: + value = as_number(row.get(source_name)) + if value is not None: + source_metrics[metric_name] = value + break + source_rows.append(source_metrics) + return source_rows + + +def metric_comparisons(source_metrics, predicted_metrics): + comparisons = [] + for metric_name in sorted(set(source_metrics) | set(predicted_metrics)): + item = {"name": metric_name} + source_value = source_metrics.get(metric_name) + predicted_value = predicted_metrics.get(metric_name) + if source_value is not None: + item["source_value"] = source_value + if predicted_value is not None: + item["predicted_value"] = predicted_value + if source_value not in (None, 0) and predicted_value is not None: + item["ratio_predicted_over_source"] = predicted_value / source_value + comparisons.append(item) + return comparisons + + reader = csv.DictReader(cleaned_lines(prediction_csv)) if not reader.fieldnames: raise SystemExit(f"prediction CSV has no header: {prediction_csv}") @@ -506,6 +562,7 @@ source_gpus = [] target_gpus = [] total_seconds = 0.0 source_times_ns, source_time_column = source_time_by_row(input_csv) +source_metrics_rows = source_metrics_by_row(input_csv) total_source_seconds = 0.0 source_time_count = 0 nonpositive_prediction_count = 0 @@ -528,6 +585,7 @@ for idx, row in enumerate(reader, start=1): seconds = predicted_ns / 1e9 total_seconds += seconds source_ns = source_times_ns[idx - 1] if idx - 1 < len(source_times_ns) else None + source_metrics = source_metrics_rows[idx - 1] if idx - 1 < len(source_metrics_rows) else {} source_seconds = source_ns / 1e9 if source_ns is not None else None if source_seconds is not None: total_source_seconds += source_seconds @@ -556,6 +614,11 @@ for idx, row in enumerate(reader, start=1): kernel["target_gpu"] = target_gpu if metrics: kernel["metrics"] = metrics + if source_metrics: + kernel["source_metrics"] = source_metrics + comparisons = metric_comparisons(source_metrics, metrics) + if comparisons: + kernel["metric_comparisons"] = comparisons kernels.append(kernel) summary_metrics = { @@ -572,7 +635,7 @@ if nonpositive_prediction_count: "severity": "warning", "reason": "nonpositive_predicted_execution_time", "message": ( - "PerfTools MLP_NN/v1.5 returned non-positive predicted execution " + f"PerfTools MLP_NN/{model_version} returned non-positive predicted execution " "time for one or more kernel rows. Check target GPU selection and " "required NCU feature coverage." ), @@ -601,7 +664,7 @@ print(json.dumps({ }, "model": { "type": "cross_gpu_kernel_prediction_model", - "name": "PerfTools MLP_NN/v1.5", + "name": "PerfTools MLP_NN/" + model_version, "version": model_version, "repository": "https://github.com/masaaki-kondo/PerfTools", }, @@ -645,7 +708,10 @@ _bk_gpu_mlp_run_predictor() { local root local input_csv local ncu_archive - local output_dir="${BK_GPU_MLP_OUTPUT_DIR:-results/estimation_artifacts/gpu_kernel_mlp_v15}" + local package_name="${BK_GPU_MLP_PACKAGE_NAME:-gpu_kernel_mlp_v15}" + local version_dir="${BK_GPU_MLP_VERSION_DIR:-v1.5}" + local predictor_script="${BK_GPU_MLP_PREDICT_SCRIPT:-predict_v15.py}" + local output_dir="${BK_GPU_MLP_OUTPUT_DIR:-results/estimation_artifacts/${package_name}}" local prediction_csv local prediction_log local input_csv_abs @@ -672,18 +738,18 @@ _bk_gpu_mlp_run_predictor() { if ! ( cd "$root" - "$python_bin" MLP_NN/v1.5/predict_v15.py \ + "$python_bin" "MLP_NN/${version_dir}/${predictor_script}" \ --csv "$input_csv_abs" \ --row "${BK_GPU_MLP_ROW:-all}" \ --out "$prediction_csv_abs" \ --log "$prediction_log_abs" ) >/dev/null; then - echo "ERROR: PerfTools MLP_NN/v1.5 inference failed" >&2 + echo "ERROR: PerfTools MLP_NN/${version_dir} inference failed" >&2 return 1 fi if [[ ! -s "$prediction_csv_abs" ]]; then - echo "ERROR: PerfTools MLP_NN/v1.5 did not create prediction CSV: ${prediction_csv_abs}" >&2 + echo "ERROR: PerfTools MLP_NN/${version_dir} did not create prediction CSV: ${prediction_csv_abs}" >&2 return 1 fi @@ -702,8 +768,9 @@ bk_section_package_transform_gpu_kernel_mlp_v15() { local prediction_log="" local run_outputs local parsed_json - local package_name="gpu_kernel_mlp_v15" + local package_name="${BK_GPU_MLP_PACKAGE_NAME:-gpu_kernel_mlp_v15}" local model_version="${BK_GPU_MLP_MODEL_VERSION:-v1.5}" + local scaling_method="${BK_GPU_MLP_SCALING_METHOD:-gpu-kernel-mlp-${model_version}}" local selector_kind="" local selector_value="" local selector @@ -727,6 +794,7 @@ bk_section_package_transform_gpu_kernel_mlp_v15() { --arg prediction_log "$prediction_log" \ --arg selector_kind "$selector_kind" \ --arg selector_value "$selector_value" \ + --arg scaling_method "$scaling_method" \ --argjson parsed "$parsed_json" ' def selector_matches($kind; $value): if $kind == "" or $value == "" then true @@ -767,7 +835,7 @@ bk_section_package_transform_gpu_kernel_mlp_v15() { end ), bench_time: $source_section_time, - scaling_method: (if $can_identity_fallback then "identity" else "gpu-kernel-mlp-v1.5" end), + scaling_method: (if $can_identity_fallback then "identity" else $scaling_method end), estimation_package: (if $can_identity_fallback then "identity" else $parsed.estimation_package end), requested_estimation_package: (if $can_identity_fallback then $parsed.estimation_package else (.requested_estimation_package // $parsed.estimation_package) end), fallback_used: (if $can_identity_fallback then "identity" else null end), diff --git a/scripts/estimation/section_packages/gpu_kernel_mlp_v21.sh b/scripts/estimation/section_packages/gpu_kernel_mlp_v21.sh new file mode 100644 index 0000000..faeda5c --- /dev/null +++ b/scripts/estimation/section_packages/gpu_kernel_mlp_v21.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# gpu_kernel_mlp_v21.sh - Thin package wrapper for PerfTools MLP_NN/v2.1. + +bk_section_package_metadata_gpu_kernel_mlp_v21() { + cat <<'EOF' +{ + "name": "gpu_kernel_mlp_v21", + "fallback_target": "identity", + "source_system_scope": { + "kind": "benchmark_system", + "accepted_values": ["any"] + }, + "target_system_scope": { + "accepted_values": ["any"] + }, + "item_kind_scope": ["section"], + "required_result_fields": ["name", "app-side GPU section time as time or bench_time"], + "required_artifact_kinds": [ + "PerfTools MLP_NN/v2.1 prepared input CSV", + "precomputed prediction CSV", + "or BenchKit padata archive with Nsight Compute raw CSV" + ], + "acquisition_mode": "external", + "output_fields": [ + "time", + "bench_time", + "scaling_method", + "metrics", + "package_applicability" + ] +} +EOF +} + +bk_section_package_check_applicability_gpu_kernel_mlp_v21() ( + export BK_GPU_MLP_PACKAGE_NAME="gpu_kernel_mlp_v21" + export BK_GPU_MLP_VERSION_DIR="v2.1" + export BK_GPU_MLP_PREDICT_SCRIPT="predict_v21.py" + export BK_GPU_MLP_MODEL_VERSION="v2.1" + export BK_GPU_MLP_SCALING_METHOD="gpu-kernel-mlp-v2.1" + bk_section_package_check_applicability_gpu_kernel_mlp_v15 "$@" +) + +bk_section_package_transform_gpu_kernel_mlp_v21() ( + export BK_GPU_MLP_PACKAGE_NAME="gpu_kernel_mlp_v21" + export BK_GPU_MLP_VERSION_DIR="v2.1" + export BK_GPU_MLP_PREDICT_SCRIPT="predict_v21.py" + export BK_GPU_MLP_MODEL_VERSION="v2.1" + export BK_GPU_MLP_SCALING_METHOD="gpu-kernel-mlp-v2.1" + bk_section_package_transform_gpu_kernel_mlp_v15 "$@" +) diff --git a/scripts/estimation/section_packages/gpu_kernel_mlp_v40.sh b/scripts/estimation/section_packages/gpu_kernel_mlp_v40.sh new file mode 100644 index 0000000..09cf87d --- /dev/null +++ b/scripts/estimation/section_packages/gpu_kernel_mlp_v40.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# gpu_kernel_mlp_v40.sh - Thin package wrapper for PerfTools MLP_NN/v4.0. + +bk_section_package_metadata_gpu_kernel_mlp_v40() { + cat <<'EOF' +{ + "name": "gpu_kernel_mlp_v40", + "fallback_target": "identity", + "source_system_scope": { + "kind": "benchmark_system", + "accepted_values": ["any"] + }, + "target_system_scope": { + "accepted_values": ["any"] + }, + "item_kind_scope": ["section"], + "required_result_fields": ["name", "app-side GPU section time as time or bench_time"], + "required_artifact_kinds": [ + "PerfTools MLP_NN/v4.0 prepared input CSV", + "precomputed prediction CSV", + "or BenchKit padata archive with Nsight Compute raw CSV" + ], + "acquisition_mode": "external", + "output_fields": [ + "time", + "bench_time", + "scaling_method", + "metrics", + "package_applicability" + ] +} +EOF +} + +bk_section_package_check_applicability_gpu_kernel_mlp_v40() ( + export BK_GPU_MLP_PACKAGE_NAME="gpu_kernel_mlp_v40" + export BK_GPU_MLP_VERSION_DIR="v4.0" + export BK_GPU_MLP_PREDICT_SCRIPT="predict_v40.py" + export BK_GPU_MLP_MODEL_VERSION="v4.0" + export BK_GPU_MLP_SCALING_METHOD="gpu-kernel-mlp-v4.0" + bk_section_package_check_applicability_gpu_kernel_mlp_v15 "$@" +) + +bk_section_package_transform_gpu_kernel_mlp_v40() ( + export BK_GPU_MLP_PACKAGE_NAME="gpu_kernel_mlp_v40" + export BK_GPU_MLP_VERSION_DIR="v4.0" + export BK_GPU_MLP_PREDICT_SCRIPT="predict_v40.py" + export BK_GPU_MLP_MODEL_VERSION="v4.0" + export BK_GPU_MLP_SCALING_METHOD="gpu-kernel-mlp-v4.0" + bk_section_package_transform_gpu_kernel_mlp_v15 "$@" +) diff --git a/scripts/estimation/section_packages/gpu_kernel_mlp_v41.sh b/scripts/estimation/section_packages/gpu_kernel_mlp_v41.sh new file mode 100644 index 0000000..9628c86 --- /dev/null +++ b/scripts/estimation/section_packages/gpu_kernel_mlp_v41.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# gpu_kernel_mlp_v41.sh - Thin package wrapper for PerfTools MLP_NN/v4.1. + +bk_section_package_metadata_gpu_kernel_mlp_v41() { + cat <<'EOF' +{ + "name": "gpu_kernel_mlp_v41", + "fallback_target": "identity", + "source_system_scope": { + "kind": "benchmark_system", + "accepted_values": ["any"] + }, + "target_system_scope": { + "accepted_values": ["any"] + }, + "item_kind_scope": ["section"], + "required_result_fields": ["name", "app-side GPU section time as time or bench_time"], + "required_artifact_kinds": [ + "PerfTools MLP_NN/v4.1 prepared input CSV", + "precomputed prediction CSV", + "or BenchKit padata archive with Nsight Compute raw CSV" + ], + "acquisition_mode": "external", + "output_fields": [ + "time", + "bench_time", + "scaling_method", + "metrics", + "package_applicability" + ] +} +EOF +} + +bk_section_package_check_applicability_gpu_kernel_mlp_v41() ( + export BK_GPU_MLP_PACKAGE_NAME="gpu_kernel_mlp_v41" + export BK_GPU_MLP_VERSION_DIR="v4.1" + export BK_GPU_MLP_PREDICT_SCRIPT="predict_v41.py" + export BK_GPU_MLP_MODEL_VERSION="v4.1" + export BK_GPU_MLP_SCALING_METHOD="gpu-kernel-mlp-v4.1" + bk_section_package_check_applicability_gpu_kernel_mlp_v15 "$@" +) + +bk_section_package_transform_gpu_kernel_mlp_v41() ( + export BK_GPU_MLP_PACKAGE_NAME="gpu_kernel_mlp_v41" + export BK_GPU_MLP_VERSION_DIR="v4.1" + export BK_GPU_MLP_PREDICT_SCRIPT="predict_v41.py" + export BK_GPU_MLP_MODEL_VERSION="v4.1" + export BK_GPU_MLP_SCALING_METHOD="gpu-kernel-mlp-v4.1" + bk_section_package_transform_gpu_kernel_mlp_v15 "$@" +) diff --git a/scripts/tests/test_estimation_gpu_kernel_ensemble_average.sh b/scripts/tests/test_estimation_gpu_kernel_ensemble_average.sh index 5534e83..244aa6a 100644 --- a/scripts/tests/test_estimation_gpu_kernel_ensemble_average.sh +++ b/scripts/tests/test_estimation_gpu_kernel_ensemble_average.sh @@ -56,8 +56,8 @@ kern_a,H100,A100,0,30 EOF cat > "${TMP_DIR}/source_input_single.csv" <<'EOF' -Kernel Name,Duration [ns] -kern_a,1000 +Kernel Name,Duration [ns],Memory Throughput [%],Achieved Occupancy +kern_a,1000,25,10 EOF cat > "${TMP_DIR}/lightgbm_pred_mixed.csv" <<'EOF' @@ -72,9 +72,9 @@ kern_b,H100,A100,5000,20 EOF cat > "${TMP_DIR}/source_input_mixed.csv" <<'EOF' -Kernel Name,Duration [ns] -kern_a,1000 -kern_b,2000 +Kernel Name,Duration [ns],Memory Throughput [%],Achieved Occupancy +kern_a,1000,25,10 +kern_b,2000,40,20 EOF cat > "${TMP_DIR}/breakdown.json" <<'EOF' @@ -129,10 +129,39 @@ if ! echo "$transformed_single" | jq -e ' .sections[0].estimation_package == "gpu_kernel_ensemble_average" and near(.sections[0].time; 20) and .sections[0].scaling_method == "gpu-kernel-ensemble-average" and + .sections[0].bench_time == 10 and .sections[0].metrics.aggregation == "single-kernel-package-ratio-mean" and .sections[0].metrics.candidate_count == 2 and .sections[0].metrics.applicable_candidate_count == 2 and .sections[0].metrics.candidate_packages == ["gpu_kernel_lightgbm_v10", "gpu_kernel_mlp_v15"] and + (.sections[0].metrics.package_summaries | length == 2) and + .sections[0].metrics.package_summaries[0].estimation_package == "gpu_kernel_lightgbm_v10" and + .sections[0].metrics.package_summaries[0].source_section_time == 10 and + near(.sections[0].metrics.package_summaries[0].projected_section_time; 10) and + near(.sections[0].metrics.package_summaries[0].time_ratio_predicted_over_source; 1) and + .sections[0].metrics.package_summaries[0].source_gpus == ["H100"] and + .sections[0].metrics.package_summaries[0].target_gpus == ["A100"] and + .sections[0].metrics.package_summaries[0].ncu_sample.kernel_count == 1 and + .sections[0].metrics.package_summaries[0].ncu_sample.source_time_ns == 1000 and + .sections[0].metrics.package_summaries[0].ncu_sample.predicted_time_ns == 1000 and + .sections[0].metrics.package_summaries[1].estimation_package == "gpu_kernel_mlp_v15" and + near(.sections[0].metrics.package_summaries[1].projected_section_time; 30) and + near(.sections[0].metrics.package_summaries[1].time_ratio_predicted_over_source; 3) and + (.sections[0].metrics.kernel_summaries | length == 1) and + .sections[0].metrics.kernel_summaries[0].name == "kern_a" and + (.sections[0].metrics.kernel_summaries[0].package_summaries | length == 2) and + .sections[0].metrics.kernel_summaries[0].package_summaries[0].estimation_package == "gpu_kernel_lightgbm_v10" and + .sections[0].metrics.kernel_summaries[0].package_summaries[0].sample_count == 1 and + .sections[0].metrics.kernel_summaries[0].package_summaries[0].source_gpus == ["H100"] and + .sections[0].metrics.kernel_summaries[0].package_summaries[0].target_gpus == ["A100"] and + .sections[0].metrics.kernel_summaries[0].package_summaries[0].source_time_ns_total == 1000 and + .sections[0].metrics.kernel_summaries[0].package_summaries[0].predicted_time_ns_total == 1000 and + near(.sections[0].metrics.kernel_summaries[0].package_summaries[0].mean_time_ratio_predicted_over_source; 1) and + (.sections[0].metrics.kernel_summaries[0].package_summaries[0].metric_comparisons | length >= 2) and + (.sections[0].metrics.kernel_summaries[0].package_summaries[0].metric_comparisons | map(select(.name == "O-Memory Throughput [%]" and .source_value_mean == 25 and .predicted_value_mean == 50 and .ratio_predicted_over_source_mean == 2)) | length == 1) and + .sections[0].metrics.kernel_summaries[0].package_summaries[1].estimation_package == "gpu_kernel_mlp_v15" and + near(.sections[0].metrics.kernel_summaries[0].package_summaries[1].mean_time_ratio_predicted_over_source; 3) and + (.sections[0].metrics.kernel_summaries[0].package_summaries[1].metric_comparisons | map(select(.name == "Memory Throughput [%]" and .source_value_mean == 25 and .predicted_value_mean == 30 and .ratio_predicted_over_source_mean == 1.2)) | length == 1) and near(.sections[0].metrics.mean_time_ratio_predicted_over_source; 2) and .sections[0].metrics.unique_kernel_count == 1 and .sections[0].metrics.kernel_names == ["kern_a"] and diff --git a/scripts/tests/test_estimation_gpu_kernel_mlp_v15.sh b/scripts/tests/test_estimation_gpu_kernel_mlp_v15.sh index 82da30b..781f176 100644 --- a/scripts/tests/test_estimation_gpu_kernel_mlp_v15.sh +++ b/scripts/tests/test_estimation_gpu_kernel_mlp_v15.sh @@ -133,6 +133,16 @@ if args.log: handle.write("fake predictor called\n") PY +for version_script in \ + "v2.1 predict_v21.py" \ + "v4.0 predict_v40.py" \ + "v4.1 predict_v41.py"; do + read -r version_dir script_name <<< "$version_script" + mkdir -p "${FAKE_PERFTOOLS}/MLP_NN/${version_dir}" + cp "${FAKE_PERFTOOLS}/MLP_NN/v1.5/predict_v15.py" \ + "${FAKE_PERFTOOLS}/MLP_NN/${version_dir}/${script_name}" +done + cat > "${TMP_DIR}/input.csv" <<'EOF' kernel_name,src_gpu,tgt_gpu,Execution Time probe_kernel,A100,H100,2000000 @@ -191,4 +201,53 @@ echo "$transformed_from_input" | jq -e ' test -f "${TMP_DIR}/mlp_outputs/unknown_gpu_kernel_region_local_pred.csv" test -f "${TMP_DIR}/mlp_outputs/unknown_gpu_kernel_region_local.log" +unset BK_GPU_MLP_OUTPUT_DIR +for package_version in \ + "gpu_kernel_mlp_v21 v2.1" \ + "gpu_kernel_mlp_v40 v4.0" \ + "gpu_kernel_mlp_v41 v4.1"; do + read -r package_name version_label <<< "$package_version" + cat > "${TMP_DIR}/breakdown_${package_name}.json" </dev/null + export BK_GPU_MLP_ARTIFACT_MODE="input" + export BK_GPU_MLP_PERFTOOLS_ROOT="${FAKE_PERFTOOLS}" + export BK_GPU_MLP_OUTPUT_DIR="${TMP_DIR}/${package_name}_outputs" + transformed_family=$(bk_top_level_transform_breakdown "$(cat "${TMP_DIR}/breakdown_${package_name}.json")" "1" "1" "1" "identity" "identity") + popd >/dev/null + + echo "$transformed_family" | jq -e \ + --arg package_name "$package_name" \ + --arg version_label "$version_label" ' + (.sections | length == 1) and + .sections[0].name == "gpu_kernel_region" and + .sections[0].time == 0.022 and + .sections[0].bench_time == 0.011 and + .sections[0].scaling_method == ("gpu-kernel-mlp-" + $version_label) and + .sections[0].estimation_package == $package_name and + .sections[0].model.name == ("PerfTools MLP_NN/" + $version_label) and + .sections[0].model.version == $version_label and + .sections[0].metrics.kernel_count == 1 and + .sections[0].metrics.total_source_time_ns == 2000000 and + .sections[0].metrics.total_predicted_time_ns == 4000000 + ' >/dev/null + + test -f "${TMP_DIR}/${package_name}_outputs/unknown_gpu_kernel_region_local_pred.csv" + test -f "${TMP_DIR}/${package_name}_outputs/unknown_gpu_kernel_region_local.log" +done + echo "gpu_kernel_mlp_v15 section estimation test passed" diff --git a/scripts/tests/test_genesis_gpu_mlp_estimation.sh b/scripts/tests/test_genesis_gpu_mlp_estimation.sh index baf200c..e8b994a 100644 --- a/scripts/tests/test_genesis_gpu_mlp_estimation.sh +++ b/scripts/tests/test_genesis_gpu_mlp_estimation.sh @@ -20,6 +20,7 @@ source programs/genesis/estimate.sh test "${BK_ESTIMATION_BASELINE_EXP}" = "p8" test "${BK_ESTIMATION_BASELINE_SYSTEM}" = "Fugaku" test "${BK_ESTIMATION_FUTURE_SYSTEM}" = "FugakuNEXT" +test "${BK_GPU_KERNEL_ENSEMBLE_PACKAGES}" = "gpu_kernel_lightgbm_v10,gpu_kernel_mlp_v15,gpu_kernel_mlp_v21,gpu_kernel_mlp_v40,gpu_kernel_mlp_v41" cat > results/no_breakdown_input.json <<'EOF' {