From 66044e6936d97d1170a2b5580f1e6ed83b9a8af0 Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Tue, 9 Jun 2026 09:20:59 +0200 Subject: [PATCH] fix: cap cpan_random_tester at 90 minutes and clean up our JVMs Default --max-runtime to 5400s so chatty jcpan installs cannot run for many hours. On timeout, kill only perlonjava JVMs tracked as descendants of the forked jcpan child so unrelated user jperl processes are untouched. Generated with [Cursor](https://cursor.com) Co-Authored-By: Cursor --- dev/tools/cpan_random_tester.pl | 116 ++++++++++++++++++++++++++------ 1 file changed, 96 insertions(+), 20 deletions(-) diff --git a/dev/tools/cpan_random_tester.pl b/dev/tools/cpan_random_tester.pl index 53db5499d..2208bb1b6 100644 --- a/dev/tools/cpan_random_tester.pl +++ b/dev/tools/cpan_random_tester.pl @@ -22,6 +22,7 @@ # perl dev/tools/cpan_random_tester.pl --modules list.txt # Test modules from file # perl dev/tools/cpan_random_tester.pl --report-only # Regenerate .md from .dat # perl dev/tools/cpan_random_tester.pl --timeout 120 # 2 min soft timeout +# perl dev/tools/cpan_random_tester.pl --max-runtime 0 # disable 90 min hard cap # perl dev/tools/cpan_random_tester.pl --jobs 8 # Parallelize CPAN test files # perl dev/tools/cpan_random_tester.pl --install # Install mode (deps stay) # @@ -62,12 +63,14 @@ my $fail_dat = File::Spec->catfile($report_dir, 'cpan-compatibility-fail.dat'); my $skip_dat = File::Spec->catfile($report_dir, 'cpan-compatibility-skip.dat'); my $log_dir = '/tmp/cpan_random_logs'; -my $KILL_AFTER = 10; # seconds between SIGTERM and SIGKILL (used by run_with_timeout) +my $KILL_AFTER = 10; # seconds between SIGTERM and SIGKILL (used by run_with_timeout) +my $DEFAULT_MAX_RUNTIME = 5400; # 90 minutes — hard cap per target (install or test) # jcpan -t soft timeouts (seconds): distribution root module -> timeout. # Overrides --timeout for that target only (heavy test suites). +# Hard --max-runtime still applies regardless of soft timeout or activity. my %MODULE_TIMEOUT_SECONDS = ( - 'DBIx::Class' => 3600, + 'DBIx::Class' => 3600, 'Image::ExifTool' => 3600, ); @@ -78,9 +81,9 @@ # CLI options # ────────────────────────────────────────────────────────────────────── my $count = 10; -my $timeout = 2400; # soft wall-clock timeout; progress can extend it -my $activity_grace = 600; # after soft timeout, allow this many idle seconds -my $max_runtime = 0; # 0 = no hard cap beyond activity timeout +my $timeout = 2400; # soft wall-clock timeout; progress can extend it +my $activity_grace = 600; # after soft timeout, allow this many idle seconds +my $max_runtime = $DEFAULT_MAX_RUNTIME; # hard cap per target module my $progress_interval = 60; my $jcpan_jobs = 1; # passed through as `jcpan --jobs N` my $report_only = 0; @@ -118,8 +121,11 @@ sub effective_timeout_for { my ($module) = @_; - my $secs = $MODULE_TIMEOUT_SECONDS{$module}; - defined $secs ? $secs : $timeout; + my $secs = $MODULE_TIMEOUT_SECONDS{$module} // $timeout; + if ($max_runtime && $secs > $max_runtime) { + $secs = $max_runtime; + } + return $secs; } # ────────────────────────────────────────────────────────────────────── @@ -250,9 +256,8 @@ sub effective_timeout_for { @selected = @pool[0 .. $count - 1]; } -printf "\nTesting %d randomly selected modules (soft timeout: %ds, activity grace: %ds, jcpan jobs: %d, commit: %s):\n", - scalar @selected, $timeout, $activity_grace, $jcpan_jobs, $git_commit; -printf "Hard max runtime: %ds\n", $max_runtime if $max_runtime; +printf "\nTesting %d randomly selected modules (soft timeout: %ds, activity grace: %ds, max runtime: %ds, jcpan jobs: %d, commit: %s):\n", + scalar @selected, $timeout, $activity_grace, $max_runtime, $jcpan_jobs, $git_commit; if (%MODULE_TIMEOUT_SECONDS) { print "Per-module soft timeouts: ", join(', ', map { "$_=${MODULE_TIMEOUT_SECONDS{$_}}s" } sort keys %MODULE_TIMEOUT_SECONDS), @@ -772,17 +777,20 @@ sub command_arg_label { } # Run a command with a progress-aware timeout. The per-module timeout is a -# soft wall clock: after it expires, output activity can keep the run alive. -# Once the child has been idle for --activity-grace seconds after that soft -# timeout, the whole process group is killed so no jperl/java child survives. +# soft wall clock: after it expires, output activity can keep the run alive +# until --max-runtime (default 90 minutes) or --activity-grace idle seconds. +# Once timed out, the process group is killed and any perlonjava JVMs that +# escaped into a different group but are still descendants of our jcpan child +# are mopped up (user-started jperl processes elsewhere are untouched). # Returns ($output, $timed_out, $timeout_error). sub run_with_timeout { my ($cmd, $secs) = @_; my @cmd = ref($cmd) eq 'ARRAY' ? @$cmd : ('/bin/sh', '-c', $cmd); - my $output = ''; - my $timed_out = 0; - my $timeout_error = ''; + my $output = ''; + my $timed_out = 0; + my $timeout_error = ''; + my %run_descendants = (); pipe(my $pipe, my $writer) or do { warn "pipe failed: $!\n"; @@ -834,6 +842,7 @@ sub run_with_timeout { $max_runtime, $now - $last_output ); $term_sent_at = $now; + note_run_descendants($pid, \%run_descendants); terminate_process_group($pid, 'TERM'); } elsif ($now >= $soft_deadline && ($now - $last_output) >= $activity_grace) { $timed_out = 1; @@ -842,12 +851,14 @@ sub run_with_timeout { $secs, $now - $last_output ); $term_sent_at = $now; + note_run_descendants($pid, \%run_descendants); terminate_process_group($pid, 'TERM'); } } if ($timed_out && !$child_done && !$kill_sent && $term_sent_at && (time() - $term_sent_at) >= $KILL_AFTER) { + note_run_descendants($pid, \%run_descendants); terminate_process_group($pid, 'KILL'); $kill_sent = 1; } @@ -899,6 +910,11 @@ sub run_with_timeout { close $pipe; # always close to avoid FD leak waitpid($pid, WNOHANG) unless $child_done; + if ($timed_out) { + note_run_descendants($pid, \%run_descendants); + cleanup_run_perlonjava_jvms(\%run_descendants); + } + return ($output // '', $timed_out, $timeout_error); } @@ -912,6 +928,63 @@ sub terminate_process_group { kill $signal, $pid; } +# Build pid => ppid map from ps (one snapshot per call). +sub read_ppid_map { + my %ppid; + open my $ps, '-|', 'ps', '-axo', 'pid=,ppid=' or return %ppid; + while (<$ps>) { + my ($p, $pp) = split; + next unless defined $p && defined $pp; + $ppid{$p} = $pp; + } + close $ps; + return %ppid; +} + +# Record every descendant of $root (including $root) into $seen. +sub note_run_descendants { + my ($root, $seen) = @_; + return unless $root; + + my %ppid = read_ppid_map(); + my @frontier = ($root); + $seen->{$root} = 1; + + while (@frontier) { + my @next; + for my $parent (@frontier) { + for my $p (keys %ppid) { + next unless ($ppid{$p} // -1) == $parent; + next if $seen->{$p}; + $seen->{$p} = 1; + push @next, $p; + } + } + @frontier = @next; + } +} + +sub is_perlonjava_java_pid { + my ($pid) = @_; + return 0 unless $pid && kill 0, $pid; + open my $ps, '-|', 'ps', '-p', $pid, '-o', 'command=' or return 0; + my $cmd = <$ps>; + close $ps; + return 0 unless defined $cmd; + return $cmd =~ /perlonjava.*\.jar|org\.perlonjava\.app\.cli\.Main/ ? 1 : 0; +} + +# Parallel prove --jobs workers can land outside the jcpan process group, so +# SIGKILL on the group may leave JVM descendants behind. Only kill JVMs we +# tracked as descendants of our forked jcpan child — not unrelated user jperl. +sub cleanup_run_perlonjava_jvms { + my ($descendants) = @_; + for my $pid (sort { $a <=> $b } keys %$descendants) { + next unless is_perlonjava_java_pid($pid); + kill 9, $pid; + } +} + sub print_progress_line { my ($start, $last_output, $soft_secs) = @_; my $now = time(); @@ -1155,8 +1228,9 @@ sub print_usage { --activity-grace N After --timeout has elapsed, kill the target if it produces no output for this many seconds (default: 600). - --max-runtime N Optional hard cap per target module in seconds (default: 0, - disabled). Useful for chatty tests that never finish. + --max-runtime N Hard cap per target module in seconds (default: 5400, + 90 minutes). Stops chatty installs/tests that never finish. + Pass 0 to disable the hard cap. --progress-interval N Print a progress heartbeat every N seconds while a target is still running (default: 60; 0 disables). @@ -1178,8 +1252,10 @@ sub print_usage { - Dependencies discovered during a run are recorded too (PASS/FAIL). - A few heavy targets (e.g. DBIx::Class) have a higher per-module timeout in the script. - Long targets are not killed merely for crossing --timeout if their output - is still active; they time out only after --activity-grace seconds without - output, or after --max-runtime if one is set. + is still active; they time out after --activity-grace seconds without + output, or unconditionally at --max-runtime (default 90 minutes). + - On timeout, stray perlonjava JVMs that are still descendants of the + forked jcpan child are cleaned up; unrelated user jperl runs are left alone. - --jobs parallelizes test files within a single jcpan run. This script keeps target modules sequential to avoid CPAN build/install directory contention. - If a previously-failed module now passes (e.g., its deps got