From 930b75b950bb499acd73282498b900bea87ce17b Mon Sep 17 00:00:00 2001 From: Jinghao-coding Date: Wed, 17 Jun 2026 23:12:00 +0800 Subject: [PATCH] fix(cli): align job management workflow with admin split --- cli/cmd/job.go | 1095 ++++++++++++++++--- cli/docs/COMMANDS.md | 138 ++- cli/internal/api/job.go | 407 ++++++- cli/internal/api/paths.go | 1 + cli/internal/i18n/catalog_job.go | 186 ++++ cli/internal/i18n/i18n.go | 1 + cli/skills/crater-cli-admin-job/SKILL.md | 52 + cli/skills/crater-cli-job/SKILL.md | 74 ++ cli/test/snapshots/job/job_test.go | 68 ++ cli/test/snapshots/read/read_matrix_test.go | 7 +- cli/testdata/snapshots/job/job.en.txtar | 167 +++ cli/testdata/snapshots/job/job.zh-CN.txtar | 167 +++ 12 files changed, 2163 insertions(+), 200 deletions(-) create mode 100644 cli/internal/i18n/catalog_job.go create mode 100644 cli/skills/crater-cli-admin-job/SKILL.md create mode 100644 cli/skills/crater-cli-job/SKILL.md create mode 100644 cli/test/snapshots/job/job_test.go create mode 100644 cli/testdata/snapshots/job/job.en.txtar create mode 100644 cli/testdata/snapshots/job/job.zh-CN.txtar diff --git a/cli/cmd/job.go b/cli/cmd/job.go index 33eec708..adcffe47 100644 --- a/cli/cmd/job.go +++ b/cli/cmd/job.go @@ -1,12 +1,16 @@ package cmd import ( + "encoding/json" "fmt" "os" "slices" + "strconv" "strings" + "time" "github.com/raids-lab/crater/cli/internal/api" + "github.com/raids-lab/crater/cli/internal/clierror" "github.com/raids-lab/crater/cli/internal/completion" "github.com/raids-lab/crater/cli/internal/i18n" "github.com/raids-lab/crater/cli/internal/output" @@ -14,20 +18,26 @@ import ( "github.com/spf13/cobra" ) +const ( + scheduleBackfill = 0 + scheduleNormal = 1 +) + var ( jobStatuses = []string{ - "Prequeue", "Pending", "Running", "Completed", "Failed", "Terminated", "Deleted", "Freed", "Cancelled", + "Prequeue", "Pending", "Aborting", "Aborted", "Running", "Restarting", + "Completing", "Completed", "Terminating", "Terminated", "Failed", + "Deleted", "Freed", "Cancelled", } jobTypes = []string{ "jupyter", "webide", "custom", "pytorch", "tensorflow", "kuberay", "deepspeed", "openmpi", } - interactiveJobTypes = []string{"jupyter", "webide"} ) var jobCmd = &cobra.Command{ Use: "job", - Short: "View jobs", - Long: "View Volcano job lists and job details from the active Crater platform.", + Short: "Manage jobs", + Long: "List, inspect, create, stop, and snapshot Volcano jobs on the active Crater platform.", RunE: func(cmd *cobra.Command, args []string) error { if len(args) > 0 { return errUnknownSubcommand(cmd, args[0]) @@ -36,46 +46,86 @@ var jobCmd = &cobra.Command{ }, } -var jobLsCmd = &cobra.Command{ - Use: "ls", - Short: "List jobs", - Args: noArgs, - RunE: runJobLs, -} +var jobLsCmd = &cobra.Command{Use: "ls", Short: "List jobs", Args: noArgs, RunE: runJobLs} +var jobGetCmd = &cobra.Command{Use: "get ", Short: "Get a job", Args: exactArgs(1, "job-name"), RunE: runJobGet} +var jobPodsCmd = &cobra.Command{Use: "pods ", Short: "List pods for a job", Args: exactArgs(1, "job-name"), RunE: runJobPods} +var jobEventsCmd = &cobra.Command{Use: "events ", Short: "List events for a job", Args: exactArgs(1, "job-name"), RunE: runJobEvents} +var jobYAMLCmd = &cobra.Command{Use: "yaml ", Short: "Show job YAML", Args: exactArgs(1, "job-name"), RunE: runJobYAML} +var jobTemplateCmd = &cobra.Command{Use: "template ", Short: "Show job template JSON", Args: exactArgs(1, "job-name"), RunE: runJobTemplate} +var jobTokenCmd = &cobra.Command{Use: "token ", Short: "Get Jupyter token", Args: exactArgs(1, "job-name"), RunE: runJobToken} +var jobSecretCmd = &cobra.Command{Use: "secret ", Short: "Get WebIDE secret", Args: exactArgs(1, "job-name"), RunE: runJobSecret} +var jobSSHCmd = &cobra.Command{Use: "ssh ", Short: "Open SSH for a running job", Args: exactArgs(1, "job-name"), RunE: runJobSSH} +var jobSnapshotCmd = &cobra.Command{Use: "snapshot ", Short: "Create a job image snapshot", Args: exactArgs(1, "job-name"), RunE: runJobSnapshot} +var jobAlertCmd = &cobra.Command{Use: "alert ", Short: "Toggle job alert state", Args: exactArgs(1, "job-name"), RunE: runJobAlert} +var jobDeleteCmd = &cobra.Command{Use: "delete ", Short: "Stop or delete a job", Args: exactArgs(1, "job-name"), RunE: runJobDelete} -var jobGetCmd = &cobra.Command{ - Use: "get ", - Short: "Get a job", - Args: exactArgs(1, "job-name"), - RunE: runJobGet, -} +var jobCreateCmd = &cobra.Command{Use: "create", Short: "Create jobs"} +var jobCreateJupyterCmd = &cobra.Command{Use: "jupyter", Short: "Create a Jupyter job", Args: noArgs, RunE: runJobCreateJupyter} +var jobCreateWebIDECmd = &cobra.Command{Use: "webide", Short: "Create a WebIDE job", Args: noArgs, RunE: runJobCreateWebIDE} +var jobCreateCustomCmd = &cobra.Command{Use: "custom", Short: "Create a custom single-node job", Args: noArgs, RunE: runJobCreateCustom} +var jobCreateTensorflowCmd = &cobra.Command{Use: "tensorflow", Short: "Create a TensorFlow distributed job from JSON", Args: noArgs, RunE: runJobCreateTensorflow} +var jobCreatePytorchCmd = &cobra.Command{Use: "pytorch", Short: "Create a PyTorch distributed job from JSON", Args: noArgs, RunE: runJobCreatePytorch} -var jobPodsCmd = &cobra.Command{ - Use: "pods ", - Short: "List pods for a job", - Args: exactArgs(1, "job-name"), - RunE: runJobPods, -} +var adminJobCmd = &cobra.Command{Use: "job", Short: "Admin job operations"} +var adminJobLsCmd = &cobra.Command{Use: "ls", Short: "List jobs", Args: noArgs, RunE: runAdminJobLs} +var adminJobDeleteCmd = &cobra.Command{Use: "delete ", Short: "Delete a job", Args: exactArgs(1, "job-name"), RunE: runAdminJobDelete} +var jobAdminLockCmd = &cobra.Command{Use: "lock ", Short: "Lock a job cleanup window", Args: exactArgs(1, "job-name"), RunE: runJobAdminLock} +var jobAdminUnlockCmd = &cobra.Command{Use: "unlock ", Short: "Unlock a job cleanup window", Args: exactArgs(1, "job-name"), RunE: runJobAdminUnlock} +var jobAdminKeepCmd = &cobra.Command{Use: "keep ", Short: "Toggle keep-when-low-usage", Args: exactArgs(1, "job-name"), RunE: runJobAdminKeep} +var jobAdminCleanCmd = &cobra.Command{Use: "clean", Short: "Run admin job cleanup actions"} +var jobAdminCleanWaitingJupyterCmd = &cobra.Command{Use: "waiting-jupyter", Short: "Cancel waiting Jupyter jobs", Args: noArgs, RunE: runJobAdminCleanWaitingJupyter} +var jobAdminCleanWaitingCustomCmd = &cobra.Command{Use: "waiting-custom", Short: "Cancel waiting custom jobs", Args: noArgs, RunE: runJobAdminCleanWaitingCustom} +var jobAdminCleanLongRunningCmd = &cobra.Command{Use: "long-running", Short: "Clean long-running jobs", Args: noArgs, RunE: runJobAdminCleanLongRunning} +var jobAdminCleanLowGPUCmd = &cobra.Command{Use: "low-gpu", Short: "Clean low GPU usage jobs", Args: noArgs, RunE: runJobAdminCleanLowGPU} -var jobEventsCmd = &cobra.Command{ - Use: "events ", - Short: "List events for a job", - Args: exactArgs(1, "job-name"), - RunE: runJobEvents, +func missingIssue(field string, labelKey string) usageIssue { + return usageIssue{ + Code: errorcodes.ErrMissingRequiredFlag, + Message: i18n.T("err_missing_required", i18n.T(labelKey), field), + Field: field, + } } -var jobYAMLCmd = &cobra.Command{ - Use: "yaml ", - Short: "Show job YAML", - Args: exactArgs(1, "job-name"), - RunE: runJobYAML, +func invalidIssue(field string, message string) usageIssue { + return usageIssue{Code: errorcodes.ErrInvalidFlagValue, Message: message, Field: field} } func runJobLs(cmd *cobra.Command, _ []string) error { - opts, err := readJobListOptions(cmd) + opts, err := readJobListOptions(cmd, false) + if err != nil { + return err + } + return listJobs(cmd, opts) +} + +func runAdminJobLs(cmd *cobra.Command, _ []string) error { + opts, err := readJobListOptions(cmd, true) if err != nil { return err } + return listJobs(cmd, opts) +} + +func readJobListOptions(cmd *cobra.Command, admin bool) (api.JobListOptions, error) { + all, _ := cmd.Flags().GetBool("all") + username, _ := cmd.Flags().GetString("user") + username = strings.TrimSpace(username) + days, _ := cmd.Flags().GetInt("days") + if days < -1 { + return api.JobListOptions{}, errUsageFromIssues([]usageIssue{invalidIssue("days", i18n.T("err_invalid_job_days"))}) + } + if err := validateJobListFilters(cmd); err != nil { + return api.JobListOptions{}, err + } + return api.JobListOptions{ + All: all, + Admin: admin, + Username: username, + Days: days, + }, nil +} + +func listJobs(cmd *cobra.Command, opts api.JobListOptions) error { client, err := activeAPIClient() if err != nil { return err @@ -84,13 +134,14 @@ func runJobLs(cmd *cobra.Command, _ []string) error { if err != nil { return cliErrFromAPI(err) } - jobs = filterJobs(cmd, jobs) + filtered, err := filterJobs(cmd, jobs) + if err != nil { + return err + } if outputJSON { - return output.WriteSuccessJSON(os.Stdout, output.SuccessEnvelope(map[string]interface{}{ - "jobs": jobs, - })) + return output.WriteSuccessJSON(os.Stdout, output.SuccessEnvelope(map[string]interface{}{"jobs": filtered})) } - printJobTable(jobs) + printJobTable(filtered) return nil } @@ -108,9 +159,7 @@ func runJobGet(_ *cobra.Command, args []string) error { return cliErrFromAPI(err) } if outputJSON { - return output.WriteSuccessJSON(os.Stdout, output.SuccessEnvelope(map[string]interface{}{ - "job": job, - })) + return output.WriteSuccessJSON(os.Stdout, output.SuccessEnvelope(map[string]interface{}{"job": job})) } printJobDetail(job) return nil @@ -130,11 +179,9 @@ func runJobPods(_ *cobra.Command, args []string) error { return cliErrFromAPI(err) } if outputJSON { - return output.WriteSuccessJSON(os.Stdout, output.SuccessEnvelope(map[string]interface{}{ - "pods": pods, - })) + return output.WriteSuccessJSON(os.Stdout, output.SuccessEnvelope(map[string]interface{}{"pods": pods})) } - printJobPodTable(pods) + printPodTable(pods) return nil } @@ -152,13 +199,9 @@ func runJobEvents(_ *cobra.Command, args []string) error { return cliErrFromAPI(err) } if outputJSON { - return output.WriteSuccessJSON(os.Stdout, output.SuccessEnvelope(map[string]interface{}{ - "events": events, - })) - } - for _, event := range events { - fmt.Printf("%v\n", event) + return output.WriteSuccessJSON(os.Stdout, output.SuccessEnvelope(map[string]interface{}{"events": events})) } + printEvents(events) return nil } @@ -171,82 +214,653 @@ func runJobYAML(_ *cobra.Command, args []string) error { if err != nil { return err } - yaml, err := client.GetJobYAML(name) + yamlText, err := client.GetJobYAML(name) if err != nil { return cliErrFromAPI(err) } if outputJSON { - return output.WriteSuccessJSON(os.Stdout, output.SuccessEnvelope(map[string]interface{}{ - "yaml": yaml, - })) + return output.WriteSuccessJSON(os.Stdout, output.SuccessEnvelope(map[string]interface{}{"yaml": yamlText})) } - fmt.Print(yaml) - if yaml != "" && !strings.HasSuffix(yaml, "\n") { + fmt.Print(yamlText) + if !strings.HasSuffix(yamlText, "\n") { fmt.Println() } return nil } -func readJobListOptions(cmd *cobra.Command) (api.JobListOptions, error) { - all, _ := cmd.Flags().GetBool("all") - username, _ := cmd.Flags().GetString("user") - username = strings.TrimSpace(username) +func runJobTemplate(_ *cobra.Command, args []string) error { + name, err := requiredArg(args, "job_label_name", "name") + if err != nil { + return err + } + client, err := activeAPIClient() + if err != nil { + return err + } + template, err := client.GetJobTemplate(name) + if err != nil { + return cliErrFromAPI(err) + } + if outputJSON { + return output.WriteSuccessJSON(os.Stdout, output.SuccessEnvelope(map[string]interface{}{"template": template})) + } + fmt.Println(template) + return nil +} + +func runJobToken(_ *cobra.Command, args []string) error { + name, err := requiredArg(args, "job_label_name", "name") + if err != nil { + return err + } + client, err := activeAPIClient() + if err != nil { + return err + } + token, err := client.GetJupyterToken(name) + if err != nil { + return cliErrFromAPI(err) + } + return writeToken(token) +} + +func runJobSecret(_ *cobra.Command, args []string) error { + name, err := requiredArg(args, "job_label_name", "name") + if err != nil { + return err + } + client, err := activeAPIClient() + if err != nil { + return err + } + token, err := client.GetWebIDESecret(name) + if err != nil { + return cliErrFromAPI(err) + } + return writeToken(token) +} + +func runJobSSH(_ *cobra.Command, args []string) error { + name, err := requiredArg(args, "job_label_name", "name") + if err != nil { + return err + } + client, err := activeAPIClient() + if err != nil { + return err + } + ssh, err := client.OpenJobSSH(name) + if err != nil { + return cliErrFromAPI(err) + } + if outputJSON { + return output.WriteSuccessJSON(os.Stdout, output.SuccessEnvelope(map[string]interface{}{"ssh": ssh})) + } + fmt.Printf("%s:%s\n", ssh.IP, ssh.Port) + return nil +} + +func runJobSnapshot(_ *cobra.Command, args []string) error { + return runJobMessage(args, func(client *api.Client, name string) (string, error) { + return client.SnapshotJob(name) + }) +} + +func runJobAlert(_ *cobra.Command, args []string) error { + return runJobMessage(args, func(client *api.Client, name string) (string, error) { + return client.ToggleJobAlert(name) + }) +} + +func runJobDelete(cmd *cobra.Command, args []string) error { + return runJobMessage(args, func(client *api.Client, name string) (string, error) { + return client.DeleteJob(name) + }) +} + +func runAdminJobDelete(cmd *cobra.Command, args []string) error { + return runJobMessage(args, func(client *api.Client, name string) (string, error) { + return client.AdminDeleteJob(name) + }) +} + +func runJobMessage(args []string, call func(*api.Client, string) (string, error)) error { + name, err := requiredArg(args, "job_label_name", "name") + if err != nil { + return err + } + client, err := activeAPIClient() + if err != nil { + return err + } + msg, err := call(client, name) + return writeMessage(msg, err) +} + +func runJobCreateJupyter(cmd *cobra.Command, _ []string) error { + req, err := collectInteractiveCreate(cmd) + if err != nil { + return err + } + client, err := activeAPIClient() + if err != nil { + return err + } + data, err := client.CreateJupyterJob(req) + return writeCreateResult(data, err) +} + +func runJobCreateWebIDE(cmd *cobra.Command, _ []string) error { + req, err := collectInteractiveCreate(cmd) + if err != nil { + return err + } + client, err := activeAPIClient() + if err != nil { + return err + } + data, err := client.CreateWebIDEJob(req) + return writeCreateResult(data, err) +} + +func runJobCreateCustom(cmd *cobra.Command, _ []string) error { + file, _ := cmd.Flags().GetString("file") + var req api.CreateTrainingJobRequest + var err error + if strings.TrimSpace(file) != "" { + err = readJSONFile(file, &req) + } else { + req, err = collectCustomCreate(cmd) + } + if err != nil { + return err + } + if err := validateTrainingRequest(req); err != nil { + return err + } + client, err := activeAPIClient() + if err != nil { + return err + } + data, err := client.CreateTrainingJob(req) + return writeCreateResult(data, err) +} + +func runJobCreateTensorflow(cmd *cobra.Command, _ []string) error { + var req api.CreateDistributedJobRequest + if err := readJSONFlag(cmd, &req); err != nil { + return err + } + if err := validateDistributedRequest(req); err != nil { + return err + } + client, err := activeAPIClient() + if err != nil { + return err + } + data, err := client.CreateTensorflowJob(req) + return writeCreateResult(data, err) +} + +func runJobCreatePytorch(cmd *cobra.Command, _ []string) error { + var req api.CreateDistributedJobRequest + if err := readJSONFlag(cmd, &req); err != nil { + return err + } + if err := validateDistributedRequest(req); err != nil { + return err + } + client, err := activeAPIClient() + if err != nil { + return err + } + data, err := client.CreatePytorchJob(req) + return writeCreateResult(data, err) +} + +func runJobAdminLock(cmd *cobra.Command, args []string) error { + name, err := requiredArg(args, "job_label_name", "name") + if err != nil { + return err + } + permanent, _ := cmd.Flags().GetBool("permanent") days, _ := cmd.Flags().GetInt("days") - status, _ := cmd.Flags().GetString("status") - jobType, _ := cmd.Flags().GetString("type") - status = strings.TrimSpace(status) - jobType = strings.TrimSpace(jobType) - interactive, _ := cmd.Flags().GetBool("interactive") - batch, _ := cmd.Flags().GetBool("batch") + hours, _ := cmd.Flags().GetInt("hours") + minutes, _ := cmd.Flags().GetInt("minutes") + if days < 0 || hours < 0 || minutes < 0 { + return errUsageFromIssues([]usageIssue{invalidIssue("duration", i18n.T("err_invalid_non_negative_int", "duration"))}) + } + if !permanent && days == 0 && hours == 0 && minutes == 0 { + return errUsageFromIssues([]usageIssue{invalidIssue("duration", i18n.T("err_value_required_when_flag_enabled", "duration", "permanent=false"))}) + } + client, err := activeAPIClient() + if err != nil { + return err + } + msg, err := client.LockJob(api.LockJobRequest{Name: name, IsPermanent: permanent, Days: days, Hours: hours, Minutes: minutes}) + return writeMessage(msg, err) +} - var issues []usageIssue - if days < -1 { - issues = append(issues, usageIssue{ - Code: errorcodes.ErrInvalidFlagValue, - Message: i18n.T("err_invalid_days"), - Field: "days", - }) +func runJobAdminUnlock(_ *cobra.Command, args []string) error { + return runJobMessage(args, func(client *api.Client, name string) (string, error) { + return client.UnlockJob(name) + }) +} + +func runJobAdminKeep(_ *cobra.Command, args []string) error { + return runJobMessage(args, func(client *api.Client, name string) (string, error) { + return client.ToggleJobKeep(name) + }) +} + +func runJobAdminCleanWaitingJupyter(cmd *cobra.Command, _ []string) error { + wait, err := waitMinutesFlag(cmd) + if err != nil { + return err } - if status != "" && !slices.Contains(jobStatuses, status) { - issues = append(issues, usageIssue{ - Code: errorcodes.ErrInvalidFlagValue, - Message: i18n.T("err_invalid_job_status", status), - Field: "status", - }) + client, err := activeAPIClient() + if err != nil { + return err } - if jobType != "" && !slices.Contains(jobTypes, jobType) { - issues = append(issues, usageIssue{ - Code: errorcodes.ErrInvalidFlagValue, - Message: i18n.T("err_invalid_job_type", jobType), - Field: "type", - }) + res, err := client.CleanWaitingJupyter(wait) + return writeCleanupResult(res, err) +} + +func runJobAdminCleanWaitingCustom(cmd *cobra.Command, _ []string) error { + wait, err := waitMinutesFlag(cmd) + if err != nil { + return err } - if interactive && batch { - issues = append(issues, usageIssue{ - Code: errorcodes.ErrInvalidFlagValue, - Message: i18n.T("err_job_interactive_batch_conflict"), - Field: "interactive", - }) + client, err := activeAPIClient() + if err != nil { + return err + } + res, err := client.CleanWaitingCustom(wait) + return writeCleanupResult(res, err) +} + +func runJobAdminCleanLongRunning(cmd *cobra.Command, _ []string) error { + batchDays, _ := cmd.Flags().GetInt("batch-days") + interactiveDays, _ := cmd.Flags().GetInt("interactive-days") + if batchDays < 0 || interactiveDays < 0 { + return errUsageFromIssues([]usageIssue{invalidIssue("days", i18n.T("err_invalid_non_negative_int", "days"))}) + } + req := api.CleanLongTimeRequest{} + if batchDays > 0 { + req.BatchDays = &batchDays + } + if interactiveDays > 0 { + req.InteractiveDays = &interactiveDays + } + client, err := activeAPIClient() + if err != nil { + return err + } + res, err := client.CleanLongRunning(req) + return writeCleanupResult(res, err) +} + +func runJobAdminCleanLowGPU(cmd *cobra.Command, _ []string) error { + timeRange, _ := cmd.Flags().GetInt("time-range") + waitTime, _ := cmd.Flags().GetInt("wait-time") + util, _ := cmd.Flags().GetInt("util") + issues := []usageIssue{} + if timeRange <= 0 { + issues = append(issues, invalidIssue("time-range", i18n.T("err_invalid_positive_int", "time-range"))) + } + if waitTime < 0 || util < 0 { + issues = append(issues, invalidIssue("threshold", i18n.T("err_invalid_non_negative_int", "threshold"))) } if len(issues) > 0 { - return api.JobListOptions{}, errUsageFromIssues(issues) + return errUsageFromIssues(issues) } - return api.JobListOptions{ - All: all, - Username: username, - Days: days, - }, nil + req := api.CleanLowGPUUsageRequest{TimeRange: timeRange} + if waitTime > 0 { + req.WaitTime = &waitTime + } + if util > 0 { + req.Util = &util + } + client, err := activeAPIClient() + if err != nil { + return err + } + res, err := client.CleanLowGPUUsage(req) + return writeCleanupResult(res, err) +} + +func collectInteractiveCreate(cmd *cobra.Command) (api.CreateInteractiveJobRequest, error) { + file, _ := cmd.Flags().GetString("file") + if strings.TrimSpace(file) != "" { + var req api.CreateInteractiveJobRequest + return req, readJSONFile(file, &req) + } + common, resource, image, err := collectBasicCreate(cmd) + if err != nil { + return api.CreateInteractiveJobRequest{}, err + } + req := api.CreateInteractiveJobRequest{JobCommonRequest: common, Resource: resource, Image: image} + return req, validateInteractiveRequest(req) +} + +func collectCustomCreate(cmd *cobra.Command) (api.CreateTrainingJobRequest, error) { + common, resource, image, err := collectBasicCreate(cmd) + if err != nil { + return api.CreateTrainingJobRequest{}, err + } + workingDir, _ := cmd.Flags().GetString("working-dir") + command, _ := cmd.Flags().GetString("command") + shell, _ := cmd.Flags().GetString("shell") + req := api.CreateTrainingJobRequest{ + CreateInteractiveJobRequest: api.CreateInteractiveJobRequest{JobCommonRequest: common, Resource: resource, Image: image}, + WorkingDir: workingDir, + } + if strings.TrimSpace(command) != "" { + req.Command = &command + } + if strings.TrimSpace(shell) != "" { + req.Shell = &shell + } + return req, validateTrainingRequest(req) +} + +func collectBasicCreate(cmd *cobra.Command) (api.JobCommonRequest, api.ResourceList, api.ImageBaseInfo, error) { + name, _ := cmd.Flags().GetString("name") + imageLink, _ := cmd.Flags().GetString("image") + archs, _ := cmd.Flags().GetStringSlice("arch") + cpu, _ := cmd.Flags().GetFloat64("cpu") + memory, _ := cmd.Flags().GetString("memory") + gpu, _ := cmd.Flags().GetInt("gpu") + gpuResource, _ := cmd.Flags().GetString("gpu-resource") + template, _ := cmd.Flags().GetString("template") + alert, _ := cmd.Flags().GetBool("alert") + cpuPinning, _ := cmd.Flags().GetBool("cpu-pinning") + schedule, _ := cmd.Flags().GetString("schedule") + + issues := []usageIssue{} + if strings.TrimSpace(name) == "" { + issues = append(issues, missingIssue("name", "job_label_display_name")) + } + if strings.TrimSpace(imageLink) == "" { + issues = append(issues, missingIssue("image", "job_label_image")) + } + if cpu < 0 { + issues = append(issues, invalidIssue("cpu", i18n.T("err_invalid_non_negative_float", "cpu"))) + } + if strings.TrimSpace(memory) == "" { + issues = append(issues, missingIssue("memory", "job_label_memory")) + } else if strings.HasPrefix(strings.TrimSpace(memory), "-") { + issues = append(issues, invalidIssue("memory", i18n.T("err_invalid_non_negative_int", "memory"))) + } + if gpu < 0 { + issues = append(issues, invalidIssue("gpu", i18n.T("err_invalid_non_negative_int", "gpu"))) + } + if gpu > 0 && strings.TrimSpace(gpuResource) == "" { + issues = append(issues, missingIssue("gpu-resource", "job_label_gpu_resource")) + } + scheduleValue, err := parseScheduleType(schedule) + if err != nil { + issues = append(issues, invalidIssue("schedule", err.Error())) + } + if len(issues) > 0 { + return api.JobCommonRequest{}, nil, api.ImageBaseInfo{}, errUsageFromIssues(issues) + } + + envs, err := parseEnvFlags(cmd) + if err != nil { + return api.JobCommonRequest{}, nil, api.ImageBaseInfo{}, err + } + volumes, err := parseVolumeFlags(cmd) + if err != nil { + return api.JobCommonRequest{}, nil, api.ImageBaseInfo{}, err + } + datasets, err := parseDatasetFlags(cmd) + if err != nil { + return api.JobCommonRequest{}, nil, api.ImageBaseInfo{}, err + } + selectors, err := parseSelectorFlags(cmd) + if err != nil { + return api.JobCommonRequest{}, nil, api.ImageBaseInfo{}, err + } + forwards, err := parseForwardFlags(cmd) + if err != nil { + return api.JobCommonRequest{}, nil, api.ImageBaseInfo{}, err + } + + resources := api.ResourceList{ + "cpu": strconv.FormatFloat(cpu, 'f', -1, 64), + "memory": memory, + } + if gpu > 0 { + resources[gpuResource] = strconv.Itoa(gpu) + } + common := api.JobCommonRequest{ + Name: name, + VolumeMounts: volumes, + DatasetMounts: datasets, + Envs: envs, + Selectors: selectors, + Template: template, + AlertEnabled: alert, + CpuPinningEnabled: cpuPinning, + Forwards: forwards, + ScheduleType: scheduleValue, + } + return common, resources, api.ImageBaseInfo{ImageLink: imageLink, Archs: archs}, nil +} + +func parseScheduleType(raw string) (*int, error) { + raw = strings.TrimSpace(strings.ToLower(raw)) + if raw == "" { + return nil, nil + } + switch raw { + case "normal", "1": + v := scheduleNormal + return &v, nil + case "backfill", "0": + v := scheduleBackfill + return &v, nil + default: + return nil, fmt.Errorf("%s", i18n.T("err_invalid_job_schedule", raw)) + } +} + +func parseEnvFlags(cmd *cobra.Command) ([]api.EnvVar, error) { + values, _ := cmd.Flags().GetStringArray("env") + out := make([]api.EnvVar, 0, len(values)) + for _, value := range values { + key, val, ok := strings.Cut(value, "=") + if !ok || strings.TrimSpace(key) == "" { + return nil, errUsageFromIssues([]usageIssue{invalidIssue("env", i18n.T("err_invalid_enum", "env", value))}) + } + out = append(out, api.EnvVar{Name: strings.TrimSpace(key), Value: val}) + } + return out, nil +} + +func parseVolumeFlags(cmd *cobra.Command) ([]api.VolumeMount, error) { + values, _ := cmd.Flags().GetStringArray("volume") + out := make([]api.VolumeMount, 0, len(values)) + for _, value := range values { + subPath, mountPath, ok := strings.Cut(value, ":") + if !ok || strings.TrimSpace(mountPath) == "" { + return nil, errUsageFromIssues([]usageIssue{invalidIssue("volume", i18n.T("err_invalid_enum", "volume", value))}) + } + out = append(out, api.VolumeMount{SubPath: strings.TrimSpace(subPath), MountPath: strings.TrimSpace(mountPath)}) + } + return out, nil +} + +func parseDatasetFlags(cmd *cobra.Command) ([]api.DatasetMount, error) { + values, _ := cmd.Flags().GetStringArray("dataset") + out := make([]api.DatasetMount, 0, len(values)) + for _, value := range values { + rawID, mountPath, ok := strings.Cut(value, ":") + id, parseErr := strconv.ParseUint(strings.TrimSpace(rawID), 10, 0) + if !ok || parseErr != nil || strings.TrimSpace(mountPath) == "" { + return nil, errUsageFromIssues([]usageIssue{invalidIssue("dataset", i18n.T("err_invalid_enum", "dataset", value))}) + } + out = append(out, api.DatasetMount{DatasetID: uint(id), MountPath: strings.TrimSpace(mountPath)}) + } + return out, nil +} + +func parseSelectorFlags(cmd *cobra.Command) ([]api.NodeSelectorRequirement, error) { + values, _ := cmd.Flags().GetStringArray("selector") + out := make([]api.NodeSelectorRequirement, 0, len(values)) + for _, value := range values { + key, rest, ok := strings.Cut(value, "=") + op, rawValues, ok2 := strings.Cut(rest, ":") + if !ok || !ok2 || strings.TrimSpace(key) == "" || strings.TrimSpace(op) == "" { + return nil, errUsageFromIssues([]usageIssue{invalidIssue("selector", i18n.T("err_invalid_enum", "selector", value))}) + } + selector := api.NodeSelectorRequirement{Key: strings.TrimSpace(key), Operator: strings.TrimSpace(op)} + if strings.TrimSpace(rawValues) != "" { + selector.Values = splitCSV(rawValues) + } + out = append(out, selector) + } + return out, nil +} + +func parseForwardFlags(cmd *cobra.Command) ([]api.Forward, error) { + values, _ := cmd.Flags().GetStringArray("forward") + out := make([]api.Forward, 0, len(values)) + for _, value := range values { + parts := strings.Split(value, ":") + if len(parts) < 2 || len(parts) > 3 { + return nil, errUsageFromIssues([]usageIssue{invalidIssue("forward", i18n.T("err_invalid_enum", "forward", value))}) + } + port, err := strconv.Atoi(strings.TrimSpace(parts[1])) + if err != nil || port <= 0 { + return nil, errUsageFromIssues([]usageIssue{invalidIssue("forward", i18n.T("err_invalid_positive_int", "forward port"))}) + } + forward := api.Forward{Name: strings.TrimSpace(parts[0]), Port: port} + if len(parts) == 3 { + forward.Type = strings.TrimSpace(parts[2]) + } + out = append(out, forward) + } + return out, nil +} + +func readJSONFlag(cmd *cobra.Command, dst interface{}) error { + file, _ := cmd.Flags().GetString("file") + if strings.TrimSpace(file) == "" { + return errUsageFromIssues([]usageIssue{missingIssue("file", "job_label_file")}) + } + return readJSONFile(file, dst) +} + +func readJSONFile(path string, dst interface{}) error { + data, err := os.ReadFile(path) + if err != nil { + return &clierror.Error{Category: errorcodes.CategorySystem, Code: errorcodes.ErrCommandExecution, Message: i18n.T("err_read_file", path, err.Error())} + } + if err := json.Unmarshal(data, dst); err != nil { + return &clierror.Error{Category: errorcodes.CategoryUsage, Code: errorcodes.ErrInvalidFlagValue, Message: i18n.T("err_unmarshal_file", path, err.Error())} + } + return nil +} + +func validateInteractiveRequest(req api.CreateInteractiveJobRequest) error { + return validateBasicRequest(req.JobCommonRequest, req.Resource, req.Image) +} + +func validateTrainingRequest(req api.CreateTrainingJobRequest) error { + issues := validateBasicIssues(req.JobCommonRequest, req.Resource, req.Image) + if strings.TrimSpace(req.WorkingDir) == "" { + issues = append(issues, missingIssue("working-dir", "job_label_working_dir")) + } + if len(issues) > 0 { + return errUsageFromIssues(issues) + } + return nil +} + +func validateDistributedRequest(req api.CreateDistributedJobRequest) error { + issues := validateCommonIssues(req.JobCommonRequest) + if len(req.Tasks) == 0 { + issues = append(issues, missingIssue("tasks", "job_label_tasks")) + } + for i, task := range req.Tasks { + prefix := fmt.Sprintf("tasks[%d]", i) + if strings.TrimSpace(task.Name) == "" { + issues = append(issues, missingIssue(prefix+".name", "job_label_task_name")) + } + if task.Replicas <= 0 { + issues = append(issues, invalidIssue(prefix+".replicas", i18n.T("err_invalid_positive_int", prefix+".replicas"))) + } + issues = append(issues, validateResourceIssues(prefix+".resource", task.Resource)...) + if strings.TrimSpace(task.Image.ImageLink) == "" { + issues = append(issues, missingIssue(prefix+".image", "job_label_image")) + } + } + if len(issues) > 0 { + return errUsageFromIssues(issues) + } + return nil +} + +func validateBasicRequest(common api.JobCommonRequest, resource api.ResourceList, image api.ImageBaseInfo) error { + issues := validateBasicIssues(common, resource, image) + if len(issues) > 0 { + return errUsageFromIssues(issues) + } + return nil +} + +func validateBasicIssues(common api.JobCommonRequest, resource api.ResourceList, image api.ImageBaseInfo) []usageIssue { + issues := validateCommonIssues(common) + issues = append(issues, validateResourceIssues("resource", resource)...) + if strings.TrimSpace(image.ImageLink) == "" { + issues = append(issues, missingIssue("image", "job_label_image")) + } + return issues } -func filterJobs(cmd *cobra.Command, jobs []api.JobInfo) []api.JobInfo { +func validateCommonIssues(common api.JobCommonRequest) []usageIssue { + if strings.TrimSpace(common.Name) == "" { + return []usageIssue{missingIssue("name", "job_label_display_name")} + } + return nil +} + +func validateResourceIssues(field string, resources api.ResourceList) []usageIssue { + issues := []usageIssue{} + if len(resources) == 0 { + return append(issues, missingIssue(field, "job_label_resources")) + } + for key, value := range resources { + if strings.HasPrefix(strings.TrimSpace(value), "-") { + issues = append(issues, invalidIssue(field+"."+key, i18n.T("err_invalid_non_negative_int", field+"."+key))) + } + } + return issues +} + +func filterJobs(cmd *cobra.Command, jobs []api.JobInfo) ([]api.JobInfo, error) { + if err := validateJobListFilters(cmd); err != nil { + return nil, err + } status, _ := cmd.Flags().GetString("status") jobType, _ := cmd.Flags().GetString("type") - nodeName, _ := cmd.Flags().GetString("node") + node, _ := cmd.Flags().GetString("node") + owner, _ := cmd.Flags().GetString("owner") interactive, _ := cmd.Flags().GetBool("interactive") batch, _ := cmd.Flags().GetBool("batch") - status = strings.TrimSpace(status) - jobType = strings.TrimSpace(jobType) - nodeName = strings.TrimSpace(nodeName) + from, _ := cmd.Flags().GetString("from") + to, _ := cmd.Flags().GetString("to") + fromTime, err := parseOptionalTime(from) + if err != nil { + return nil, err + } + toTime, err := parseOptionalTime(to) + if err != nil { + return nil, err + } + out := jobs[:0] for _, job := range jobs { if status != "" && job.Status != status { @@ -255,29 +869,171 @@ func filterJobs(cmd *cobra.Command, jobs []api.JobInfo) []api.JobInfo { if jobType != "" && job.JobType != jobType { continue } - if nodeName != "" && !slices.Contains(job.Nodes, nodeName) { + if node != "" && !slices.Contains(job.Nodes, node) { continue } - isInteractive := slices.Contains(interactiveJobTypes, job.JobType) + if owner != "" && job.UserInfo.Username != owner && job.Owner != owner { + continue + } + isInteractive := job.JobType == "jupyter" || job.JobType == "webide" if interactive && !isInteractive { continue } if batch && isInteractive { continue } + createdAt := job.CreatedAt + if fromTime != nil && !createdAt.IsZero() && createdAt.Before(*fromTime) { + continue + } + if toTime != nil && !createdAt.IsZero() && createdAt.After(*toTime) { + continue + } out = append(out, job) } + return out, nil +} + +func validateJobListFilters(cmd *cobra.Command) error { + status, _ := cmd.Flags().GetString("status") + jobType, _ := cmd.Flags().GetString("type") + interactive, _ := cmd.Flags().GetBool("interactive") + batch, _ := cmd.Flags().GetBool("batch") + from, _ := cmd.Flags().GetString("from") + to, _ := cmd.Flags().GetString("to") + status = strings.TrimSpace(status) + jobType = strings.TrimSpace(jobType) + if status != "" && !slices.Contains(jobStatuses, status) { + return errUsageFromIssues([]usageIssue{invalidIssue("status", i18n.T("err_invalid_job_status", status))}) + } + if jobType != "" && !slices.Contains(jobTypes, jobType) { + return errUsageFromIssues([]usageIssue{invalidIssue("type", i18n.T("err_invalid_job_type", jobType))}) + } + if interactive && batch { + return errUsageFromIssues([]usageIssue{invalidIssue("interactive", i18n.T("err_job_interactive_batch_conflict"))}) + } + if _, err := parseOptionalTime(from); err != nil { + return err + } + if _, err := parseOptionalTime(to); err != nil { + return err + } + return nil +} + +func parseOptionalTime(value string) (*time.Time, error) { + value = strings.TrimSpace(value) + if value == "" { + return nil, nil + } + parsed, err := parseAPITime(value) + if err != nil { + return nil, errUsageFromIssues([]usageIssue{invalidIssue("time", i18n.T("err_invalid_enum", "time", value))}) + } + return &parsed, nil +} + +func parseAPITime(value string) (time.Time, error) { + value = strings.TrimSpace(value) + if value == "" || strings.HasPrefix(value, "0001-") { + return time.Time{}, nil + } + formats := []string{time.RFC3339Nano, time.RFC3339, "2006-01-02", "2006-01-02 15:04:05"} + var last error + for _, layout := range formats { + parsed, err := time.Parse(layout, value) + if err == nil { + return parsed, nil + } + last = err + } + return time.Time{}, last +} + +func formatAPITime(value time.Time) string { + if value.IsZero() { + return "-" + } + return value.Format(time.RFC3339) +} + +func waitMinutesFlag(cmd *cobra.Command) (int, error) { + wait, _ := cmd.Flags().GetInt("wait-minutes") + if wait <= 0 { + return 0, errUsageFromIssues([]usageIssue{invalidIssue("wait-minutes", i18n.T("err_invalid_positive_int", "wait-minutes"))}) + } + return wait, nil +} + +func splitCSV(value string) []string { + parts := strings.Split(value, ",") + out := make([]string, 0, len(parts)) + for _, part := range parts { + part = strings.TrimSpace(part) + if part != "" { + out = append(out, part) + } + } return out } +func writeMessage(msg string, err error) error { + if err != nil { + return cliErrFromAPI(err) + } + if outputJSON { + return output.WriteSuccessJSON(os.Stdout, output.SuccessEnvelope(map[string]interface{}{"message": msg})) + } + fmt.Println(msg) + return nil +} + +func writeCreateResult(data map[string]interface{}, err error) error { + if err != nil { + return cliErrFromAPI(err) + } + if outputJSON { + return output.WriteSuccessJSON(os.Stdout, output.SuccessEnvelope(map[string]interface{}{"job": data})) + } + fmt.Println(i18n.T("job_create_submitted")) + if metadata, ok := data["metadata"].(map[string]interface{}); ok { + if name, ok := metadata["name"].(string); ok { + fmt.Printf("%s: %s\n", "JobName", name) + } + } + return nil +} + +func writeCleanupResult(res *api.CleanupResult, err error) error { + if err != nil { + return cliErrFromAPI(err) + } + if outputJSON { + return output.WriteSuccessJSON(os.Stdout, output.SuccessEnvelope(map[string]interface{}{"cleanup": res})) + } + fmt.Printf("%s: %s\n", "Reminded", strings.Join(res.Reminded, ",")) + fmt.Printf("%s: %s\n", "Deleted", strings.Join(res.Deleted, ",")) + return nil +} + +func writeToken(token *api.JobToken) error { + if outputJSON { + return output.WriteSuccessJSON(os.Stdout, output.SuccessEnvelope(map[string]interface{}{"token": token})) + } + fmt.Printf("%s: %s\n", "URL", token.FullURL) + fmt.Printf("%s: %s\n", "Token", token.Token) + fmt.Printf("%s: %s\n", "Pod", token.PodName) + return nil +} + func printJobTable(jobs []api.JobInfo) { fmt.Printf("%s %s %s %s %s %s %s\n", i18n.PadRight(i18n.T("table_name"), 24), - i18n.PadRight("JOB_NAME", 34), + i18n.PadRight("JobName", 34), i18n.PadRight(i18n.T("table_type"), 12), i18n.PadRight(i18n.T("table_status"), 14), - i18n.PadRight(i18n.T("table_queue"), 18), - i18n.PadRight(i18n.T("table_nodes"), 24), + i18n.PadRight(i18n.T("table_owner"), 16), + i18n.PadRight(i18n.T("table_nodes"), 22), i18n.PadRight(i18n.T("table_resources"), 24)) for _, job := range jobs { fmt.Printf("%s %s %s %s %s %s %s\n", @@ -285,8 +1041,8 @@ func printJobTable(jobs []api.JobInfo) { i18n.PadRight(job.JobName, 34), i18n.PadRight(job.JobType, 12), i18n.PadRight(job.Status, 14), - i18n.PadRight(job.Queue, 18), - i18n.PadRight(strings.Join(job.Nodes, ","), 24), + i18n.PadRight(job.UserInfo.Username, 16), + i18n.PadRight(strings.Join(job.Nodes, ","), 22), i18n.PadRight(formatResources(job.Resources), 24)) } } @@ -299,58 +1055,137 @@ func printJobDetail(job *api.JobDetail) { fmt.Printf("%s: %s\n", "JobName", job.JobName) fmt.Printf("%s: %s\n", i18n.T("table_type"), job.JobType) fmt.Printf("%s: %s\n", i18n.T("table_status"), job.Status) + fmt.Printf("%s: %s\n", i18n.T("table_owner"), job.Username) fmt.Printf("%s: %s\n", i18n.T("table_queue"), job.Queue) - fmt.Printf("%s: %s\n", i18n.T("table_owner"), job.UserInfo.Nickname) fmt.Printf("%s: %s\n", i18n.T("table_resources"), formatResources(job.Resources)) + fmt.Printf("%s: %s\n", i18n.T("table_created_at"), formatAPITime(job.CreatedAt)) + fmt.Printf("%s: %s\n", i18n.T("table_started_at"), formatAPITime(job.StartedAt)) + fmt.Printf("%s: %s\n", i18n.T("table_completed_at"), formatAPITime(job.CompletedAt)) } -func printJobPodTable(pods []api.PodDetail) { +func printPodTable(pods []api.PodDetail) { fmt.Printf("%s %s %s %s %s %s\n", - i18n.PadRight(i18n.T("table_name"), 36), - i18n.PadRight(i18n.T("table_namespace"), 22), + i18n.PadRight(i18n.T("table_name"), 32), + i18n.PadRight(i18n.T("table_namespace"), 18), i18n.PadRight(i18n.T("table_node"), 24), i18n.PadRight("IP", 16), - i18n.PadRight(i18n.T("table_phase"), 14), + i18n.PadRight(i18n.T("table_status"), 12), i18n.PadRight(i18n.T("table_resources"), 24)) for _, pod := range pods { fmt.Printf("%s %s %s %s %s %s\n", - i18n.PadRight(pod.Name, 36), - i18n.PadRight(pod.Namespace, 22), - i18n.PadRight(pod.NodeName, 24), - i18n.PadRight(emptyDash(pod.IP), 16), - i18n.PadRight(pod.Phase, 14), + i18n.PadRight(pod.Name, 32), + i18n.PadRight(pod.Namespace, 18), + i18n.PadRight(emptyDash(pod.NodeName), 24), + i18n.PadRight(pod.IP, 16), + i18n.PadRight(pod.Phase, 12), i18n.PadRight(formatResources(pod.Resource), 24)) } } +func printEvents(events []map[string]interface{}) { + fmt.Printf("%s %s %s\n", i18n.PadRight("Type", 10), i18n.PadRight("Reason", 24), "Message") + for _, event := range events { + fmt.Printf("%s %s %s\n", + i18n.PadRight(fmt.Sprint(event["type"]), 10), + i18n.PadRight(fmt.Sprint(event["reason"]), 24), + fmt.Sprint(event["message"])) + } +} + func formatResources(resources api.ResourceList) string { if len(resources) == 0 { return "-" } - parts := make([]string, 0, len(resources)) - for k, v := range resources { - parts = append(parts, k+"="+v) + keys := make([]string, 0, len(resources)) + for key := range resources { + keys = append(keys, key) + } + slices.Sort(keys) + parts := make([]string, 0, len(keys)) + for _, key := range keys { + parts = append(parts, key+"="+resources[key]) } - slices.Sort(parts) return strings.Join(parts, ",") } +func addCreateCommonFlags(cmd *cobra.Command) { + cmd.Flags().String("file", "", "Read exact JSON request body from file") + cmd.Flags().String("name", "", "Display name") + cmd.Flags().String("image", "", "Image link") + cmd.Flags().StringSlice("arch", nil, "Image architecture, repeatable or comma-separated") + cmd.Flags().Float64("cpu", 1, "CPU request") + cmd.Flags().String("memory", "", "Memory request, for example 8Gi") + cmd.Flags().Int("gpu", 0, "GPU count") + cmd.Flags().String("gpu-resource", "nvidia.com/gpu", "GPU resource name") + cmd.Flags().String("template", "", "Template name or JSON") + cmd.Flags().Bool("alert", false, "Enable alert") + cmd.Flags().Bool("cpu-pinning", false, "Enable CPU pinning") + cmd.Flags().String("schedule", "", "Schedule type: normal or backfill") + cmd.Flags().StringArray("env", nil, "Environment variable KEY=VALUE, repeatable") + cmd.Flags().StringArray("volume", nil, "Workspace mount subPath:mountPath, repeatable") + cmd.Flags().StringArray("dataset", nil, "Dataset mount id:mountPath, repeatable") + cmd.Flags().StringArray("selector", nil, "Node selector key=Operator:value1,value2, repeatable") + cmd.Flags().StringArray("forward", nil, "Forward name:port[:type], repeatable") +} + func init() { jobLsCmd.Flags().Bool("all", false, "List all jobs visible to this account") - jobLsCmd.Flags().String("user", "", "List jobs for a specific username") + jobLsCmd.Flags().String("user", "", "List jobs for a username") jobLsCmd.Flags().Int("days", 0, "Look back days for --all or --user; -1 means all") jobLsCmd.Flags().String("status", "", "Filter by job status") jobLsCmd.Flags().String("type", "", "Filter by job type") jobLsCmd.Flags().String("node", "", "Filter by node name") + jobLsCmd.Flags().String("owner", "", "Filter by owner username or display name") + jobLsCmd.Flags().String("from", "", "Filter createdAt from time, RFC3339 or YYYY-MM-DD") + jobLsCmd.Flags().String("to", "", "Filter createdAt until time, RFC3339 or YYYY-MM-DD") jobLsCmd.Flags().Bool("interactive", false, "Only show interactive jobs") jobLsCmd.Flags().Bool("batch", false, "Only show batch jobs") + + adminJobLsCmd.Flags().String("user", "", "List jobs for a username") + adminJobLsCmd.Flags().Int("days", 0, "Look back days; -1 means all") + adminJobLsCmd.Flags().String("status", "", "Filter by job status") + adminJobLsCmd.Flags().String("type", "", "Filter by job type") + adminJobLsCmd.Flags().String("node", "", "Filter by node name") + adminJobLsCmd.Flags().String("owner", "", "Filter by owner username or display name") + adminJobLsCmd.Flags().String("from", "", "Filter createdAt from time, RFC3339 or YYYY-MM-DD") + adminJobLsCmd.Flags().String("to", "", "Filter createdAt until time, RFC3339 or YYYY-MM-DD") + adminJobLsCmd.Flags().Bool("interactive", false, "Only show interactive jobs") + adminJobLsCmd.Flags().Bool("batch", false, "Only show batch jobs") + + addCreateCommonFlags(jobCreateJupyterCmd) + addCreateCommonFlags(jobCreateWebIDECmd) + addCreateCommonFlags(jobCreateCustomCmd) + jobCreateCustomCmd.Flags().String("working-dir", "/workspace", "Working directory") + jobCreateCustomCmd.Flags().String("command", "", "Command to run") + jobCreateCustomCmd.Flags().String("shell", "sh", "Shell for --command") + jobCreateTensorflowCmd.Flags().String("file", "", "Read exact JSON request body from file") + jobCreatePytorchCmd.Flags().String("file", "", "Read exact JSON request body from file") + + jobAdminLockCmd.Flags().Bool("permanent", false, "Lock permanently") + jobAdminLockCmd.Flags().Int("days", 0, "Lock days") + jobAdminLockCmd.Flags().Int("hours", 0, "Lock hours") + jobAdminLockCmd.Flags().Int("minutes", 0, "Lock minutes") + jobAdminCleanWaitingJupyterCmd.Flags().Int("wait-minutes", 0, "Waiting minutes threshold") + jobAdminCleanWaitingCustomCmd.Flags().Int("wait-minutes", 0, "Waiting minutes threshold") + jobAdminCleanLongRunningCmd.Flags().Int("batch-days", 0, "Batch job running days threshold") + jobAdminCleanLongRunningCmd.Flags().Int("interactive-days", 0, "Interactive job running days threshold") + jobAdminCleanLowGPUCmd.Flags().Int("time-range", 0, "GPU usage lookback range") + jobAdminCleanLowGPUCmd.Flags().Int("wait-time", 0, "Wait time before cleanup") + jobAdminCleanLowGPUCmd.Flags().Int("util", 0, "GPU utilization threshold") + completion.RegisterFlagValue([]string{"job", "ls"}, "status", staticValueCompleter(jobStatuses, nil)) completion.RegisterFlagValue([]string{"job", "ls"}, "type", staticValueCompleter(jobTypes, nil)) + completion.RegisterFlagValue([]string{"admin", "job", "ls"}, "status", staticValueCompleter(jobStatuses, nil)) + completion.RegisterFlagValue([]string{"admin", "job", "ls"}, "type", staticValueCompleter(jobTypes, nil)) + scheduleValues := []string{"normal", "backfill"} + for _, path := range [][]string{{"job", "create", "jupyter"}, {"job", "create", "webide"}, {"job", "create", "custom"}} { + completion.RegisterFlagValue(path, "schedule", staticValueCompleter(scheduleValues, nil)) + } - jobCmd.AddCommand(jobLsCmd) - jobCmd.AddCommand(jobGetCmd) - jobCmd.AddCommand(jobPodsCmd) - jobCmd.AddCommand(jobEventsCmd) - jobCmd.AddCommand(jobYAMLCmd) + jobCreateCmd.AddCommand(jobCreateJupyterCmd, jobCreateWebIDECmd, jobCreateCustomCmd, jobCreateTensorflowCmd, jobCreatePytorchCmd) + jobAdminCleanCmd.AddCommand(jobAdminCleanWaitingJupyterCmd, jobAdminCleanWaitingCustomCmd, jobAdminCleanLongRunningCmd, jobAdminCleanLowGPUCmd) + adminJobCmd.AddCommand(adminJobLsCmd, adminJobDeleteCmd, jobAdminLockCmd, jobAdminUnlockCmd, jobAdminKeepCmd, jobAdminCleanCmd) + jobCmd.AddCommand(jobLsCmd, jobGetCmd, jobPodsCmd, jobEventsCmd, jobYAMLCmd, jobTemplateCmd, jobTokenCmd, jobSecretCmd, jobSSHCmd, jobSnapshotCmd, jobAlertCmd, jobDeleteCmd, jobCreateCmd) + adminCmd.AddCommand(adminJobCmd) rootCmd.AddCommand(jobCmd) } diff --git a/cli/docs/COMMANDS.md b/cli/docs/COMMANDS.md index a03053e7..a6b18ffe 100644 --- a/cli/docs/COMMANDS.md +++ b/cli/docs/COMMANDS.md @@ -347,7 +347,7 @@ ## 6. 作业模块 (job) -本模块提供 Volcano 作业的只读查询能力。第一版仅使用 `/api/v1/vcjobs` 族接口;`aijobs` / `spjobs` 后续独立扩展。 +本模块覆盖前端作业页面的通用 Volcano 作业能力:列表、详情、Pods、事件、YAML、模板、Jupyter/WebIDE 访问凭据、SSH、快照、告警、删除/停止、基础创建,以及管理员锁定/保留/清理操作。所有访问平台的命令都使用当前 `auth` active context 的 token。管理员能力统一放在 `crater admin job ...` 下,不使用普通命令加 `--admin`。 ### `crater job ls` - **描述**: 列出当前账号可见的作业。 @@ -357,8 +357,10 @@ - `--user` (string): 调用 `/api/v1/vcjobs/user/{username}`,列出指定用户且位于 `--days` 回看窗口内的作业。 - `--days` (int): 与 `--all` 或 `--user` 配合使用的回溯天数;`-1` 表示不按时间过滤。小于 `-1` 的值返回 `usage_error`。 - `--status` (string): 本地过滤作业状态。 - - `--type` (string): 本地过滤作业类型。 + - `--type` (string): 本地过滤作业类型:`jupyter | webide | custom | pytorch | tensorflow | kuberay | deepspeed | openmpi`。 - `--node` (string): 本地过滤运行在指定节点上的作业。 + - `--owner` (string): 本地按用户名或显示名筛选。 + - `--from` / `--to` (string): 本地按 `createdAt` 时间范围筛选,支持 RFC3339 或 `YYYY-MM-DD`。 - `--interactive` (bool): 本地过滤交互式作业(`jupyter` / `webide`)。 - `--batch` (bool): 本地过滤非交互式作业。 - **处理逻辑**: @@ -369,44 +371,132 @@ - **`--json` 的 `data`**:`jobs`(数组,元素与平台作业摘要响应一致,过滤后返回)。 - **状态**: [x] Completed -### `crater job get ` -- **描述**: 查看单个作业详情。 +### `crater job get|pods|events|yaml|template ` +- **描述**: + - `get`: 查看单个作业详情。 + - `pods`: 查看指定作业的 Pod 列表。 + - `events`: 查看指定作业的 Kubernetes 事件。 + - `yaml`: 查看指定作业的 YAML。 + - `template`: 输出作业模板内容。 - **位置参数**: - `` (positional, required): 平台作业名,对应前端详情页路径中的作业名。 - **处理逻辑**: - - 调用 `/api/v1/vcjobs/{name}/detail`。 + - `get` 调用 `/api/v1/vcjobs/{name}/detail`。 + - `pods` 调用 `/api/v1/vcjobs/{name}/pods`。 + - `events` 调用 `/api/v1/vcjobs/{name}/event`。 + - `yaml` 调用 `/api/v1/vcjobs/{name}/yaml`,默认模式直接输出 YAML 字符串到 stdout。 + - `template` 调用 `/api/v1/vcjobs/{name}/template`。 - 缺少 `` 时返回 `usage_error`。 -- **`--json` 的 `data`**:`job`(对象,平台作业详情响应)。 + - 默认模式以表格展示 Pod 名称、命名空间、节点、IP、阶段和资源。 +- **`--json` 的 `data`**: + - `get`: `job` + - `pods`: `pods` + - `events`: `events` + - `yaml`: `yaml` + - `template`: `template` - **状态**: [x] Completed -### `crater job pods ` -- **描述**: 查看指定作业的 Pod 列表。 +### `crater job token|secret|ssh|snapshot|alert|delete ` +- **描述**: + - `token`: 获取运行中 Jupyter 作业的 URL 与 token。 + - `secret`: 获取运行中 WebIDE 作业的 URL 与密码。 + - `ssh`: 为运行中作业开启 SSH,并输出 `host:port`。 + - `snapshot`: 为 Jupyter 或 Custom 作业创建镜像快照。 + - `alert`: 切换作业告警状态。 + - `delete`: 停止或删除自己的作业。 - **位置参数**: - `` (positional, required): 平台作业名。 -- **处理逻辑**: - - 调用 `/api/v1/vcjobs/{name}/pods`。 - - 默认模式以表格展示 Pod 名称、命名空间、节点、IP、阶段和资源。 -- **`--json` 的 `data`**:`pods`(数组,平台作业 Pod 响应)。 +- **`--json` 的 `data`**: + - `token` / `secret`: `token` + - `ssh`: `ssh` + - `snapshot` / `alert` / `delete`: `message` +- **状态**: [x] Completed + +### `crater job create jupyter|webide` +- **描述**: 创建交互式作业。支持 flags 构造常用请求,也支持 `--file` 传入完整 JSON 请求体。 +- **位置参数**: 无;如果提供任何位置参数,返回 `usage_error`。 +- **选项**: + - `--file` (string): 从文件读取完整 JSON 请求体;提供后忽略其它创建 flags。 + - `--name` (string, required without `--file`): 显示名称。 + - `--image` (string, required without `--file`): 镜像地址。 + - `--arch` (stringSlice): 镜像架构。 + - `--cpu` (float, default `1`): CPU 请求量,必须大于等于 0。 + - `--memory` (string, required without `--file`): 内存请求量,不能为负数。 + - `--gpu` (int, default `0`): GPU 数量,必须大于等于 0。 + - `--gpu-resource` (string, default `nvidia.com/gpu`): GPU 资源名;`--gpu > 0` 时必填。 + - `--schedule` (string): `normal | backfill`。 + - `--env` (stringArray): `KEY=VALUE`,可重复。 + - `--volume` (stringArray): `subPath:mountPath`,可重复。 + - `--dataset` (stringArray): `datasetID:mountPath`,可重复。 + - `--selector` (stringArray): `key=Operator:value1,value2`,可重复。 + - `--forward` (stringArray): `name:port[:type]`,可重复。 + - `--template`、`--alert`、`--cpu-pinning`。 +- **本地校验**: CPU、Memory、GPU 不允许负数;缺少 `name`、`image`、`memory` 会在发起请求前聚合报错。 +- **`--json` 的 `data`**:`job`(后端返回的 Volcano Job 对象)。 +- **状态**: [x] Completed + +### `crater job create custom` +- **描述**: 创建单机自定义训练作业。支持 flags 构造常用请求,也支持 `--file` 传入完整 JSON 请求体。 +- **位置参数**: 无;如果提供任何位置参数,返回 `usage_error`。 +- **额外选项**: + - `--working-dir` (string, default `/workspace`): 工作目录。 + - `--command` (string): 容器中执行的命令。 + - `--shell` (string, default `sh`): `--command` 使用的 shell。 +- **其余选项与校验**: 同 `jupyter|webide`。 +- **`--json` 的 `data`**:`job`。 - **状态**: [x] Completed -### `crater job events ` -- **描述**: 查看指定作业的 Kubernetes 事件。 +### `crater job create tensorflow|pytorch --file ` +- **描述**: 创建 TensorFlow 或 PyTorch 分布式作业。由于后端 `tasks[]` 结构较复杂,CLI 要求通过 `--file` 传入与前端/后端 DTO 对齐的完整 JSON 请求体。 +- **位置参数**: 无;如果提供任何位置参数,返回 `usage_error`。 +- **本地校验**: + - `name` 必填。 + - `tasks` 至少一个。 + - 每个 task 的 `name`、`image.imageLink` 必填。 + - 每个 task 的 `replicas` 必须大于 0。 + - 每个 task 的资源值不能为负数。 +- **`--json` 的 `data`**:`job`。 +- **状态**: [x] Completed + +### `crater admin job ls` +- **描述**: 使用 `/api/v1/admin/vcjobs` 或 `/api/v1/admin/vcjobs/user/{username}` 列出管理员可见作业。 +- **位置参数**: 无;如果提供任何位置参数,返回 `usage_error`。 +- **选项**: + - `--user` (string): 列出指定用户作业。 + - `--days` (int): 回看天数;`-1` 表示全部,默认 `0`。 + - `--status`、`--type`、`--node`、`--owner`、`--from`、`--to`、`--interactive`、`--batch`: 与 `crater job ls` 相同,均为本地筛选。 +- **`--json` 的 `data`**:`jobs`(数组)。 +- **状态**: [x] Completed + +### `crater admin job delete ` +- **描述**: 使用 `/api/v1/admin/vcjobs/{name}` 管理员删除作业。 - **位置参数**: - `` (positional, required): 平台作业名。 -- **处理逻辑**: - - 调用 `/api/v1/vcjobs/{name}/event`。 - - 默认模式逐行打印事件对象摘要;脚本化使用建议加 `--json`。 -- **`--json` 的 `data`**:`events`(数组,平台事件响应)。 +- **`--json` 的 `data`**:`message`。 - **状态**: [x] Completed -### `crater job yaml ` -- **描述**: 查看指定作业的 YAML。 +### `crater admin job lock|unlock|keep ` +- **描述**: + - `lock`: 设置清理锁定时长或永久锁定。 + - `unlock`: 清除清理锁定。 + - `keep`: 切换低 GPU 利用率清理时的保留状态。 - **位置参数**: - `` (positional, required): 平台作业名。 -- **处理逻辑**: - - 调用 `/api/v1/vcjobs/{name}/yaml`。 - - 默认模式直接输出 YAML 字符串到 stdout,不添加表格或装饰文本。 -- **`--json` 的 `data`**:`yaml`(字符串)。 +- **`lock` 选项**: + - `--permanent` (bool): 永久锁定。 + - `--days` / `--hours` / `--minutes` (int): 锁定时长;非永久锁定时至少一个大于 0。 +- **`--json` 的 `data`**:`message`。 +- **状态**: [x] Completed + +### `crater admin job clean ...` +- **描述**: 管理员作业清理操作,对应前端 admin jobs 的清理接口。 +- **位置参数**: 无;如果提供任何位置参数,返回 `usage_error`。 +- **子命令**: + - `waiting-jupyter --wait-minutes N`: 取消等待超过阈值的 Jupyter 作业。 + - `waiting-custom --wait-minutes N`: 取消等待超过阈值的 Custom 作业。 + - `long-running [--batch-days N] [--interactive-days N]`: 清理长时间运行作业。 + - `low-gpu --time-range N [--wait-time N] [--util N]`: 清理低 GPU 利用率作业。 +- **`--json` 的 `data`**:`cleanup`(含 `reminded` 与 `deleted`)。 - **状态**: [x] Completed --- diff --git a/cli/internal/api/job.go b/cli/internal/api/job.go index 614c317f..adee9347 100644 --- a/cli/internal/api/job.go +++ b/cli/internal/api/job.go @@ -2,6 +2,7 @@ package api import ( "fmt" + "net/url" "time" ) @@ -9,12 +10,33 @@ type JobClient interface { ListJobs(opts JobListOptions) ([]JobInfo, error) GetJob(name string) (*JobDetail, error) GetJobPods(name string) ([]PodDetail, error) - GetJobEvents(name string) ([]interface{}, error) + GetJobEvents(name string) ([]map[string]interface{}, error) GetJobYAML(name string) (string, error) + GetJobTemplate(name string) (string, error) + GetJupyterToken(name string) (*JobToken, error) + GetWebIDESecret(name string) (*JobToken, error) + OpenJobSSH(name string) (*SSHInfo, error) + SnapshotJob(name string) (string, error) + ToggleJobAlert(name string) (string, error) + DeleteJob(name string) (string, error) + AdminDeleteJob(name string) (string, error) + CreateJupyterJob(req CreateInteractiveJobRequest) (map[string]interface{}, error) + CreateWebIDEJob(req CreateInteractiveJobRequest) (map[string]interface{}, error) + CreateTrainingJob(req CreateTrainingJobRequest) (map[string]interface{}, error) + CreateTensorflowJob(req CreateDistributedJobRequest) (map[string]interface{}, error) + CreatePytorchJob(req CreateDistributedJobRequest) (map[string]interface{}, error) + ToggleJobKeep(name string) (string, error) + LockJob(req LockJobRequest) (string, error) + UnlockJob(name string) (string, error) + CleanWaitingJupyter(waitMinutes int) (*CleanupResult, error) + CleanWaitingCustom(waitMinutes int) (*CleanupResult, error) + CleanLongRunning(req CleanLongTimeRequest) (*CleanupResult, error) + CleanLowGPUUsage(req CleanLowGPUUsageRequest) (*CleanupResult, error) } type JobListOptions struct { All bool + Admin bool Username string Days int } @@ -24,6 +46,38 @@ type UserInfo struct { Nickname string `json:"nickname"` } +type ImageBaseInfo struct { + ImageLink string `json:"imageLink"` + Archs []string `json:"archs,omitempty"` +} + +type VolumeMount struct { + SubPath string `json:"subPath"` + MountPath string `json:"mountPath"` +} + +type DatasetMount struct { + DatasetID uint `json:"datasetID"` + MountPath string `json:"mountPath"` +} + +type EnvVar struct { + Name string `json:"name"` + Value string `json:"value"` +} + +type NodeSelectorRequirement struct { + Key string `json:"key"` + Operator string `json:"operator"` + Values []string `json:"values,omitempty"` +} + +type Forward struct { + Name string `json:"name"` + Port int `json:"port"` + Type string `json:"type,omitempty"` +} + type JobInfo struct { Name string `json:"name"` JobName string `json:"jobName"` @@ -45,25 +99,27 @@ type JobInfo struct { } type JobDetail struct { - Name string `json:"name"` - Namespace string `json:"namespace"` - Username string `json:"username"` - Nickname string `json:"nickname"` - UserInfo UserInfo `json:"userInfo"` - JobName string `json:"jobName"` - JobType string `json:"jobType"` - ScheduleType int `json:"scheduleType"` - Retry string `json:"retry"` - Queue string `json:"queue"` - Status string `json:"status"` - Resources ResourceList `json:"resources"` - CreatedAt time.Time `json:"createdAt"` - StartedAt time.Time `json:"startedAt"` - CompletedAt time.Time `json:"completedAt"` - ProfileData interface{} `json:"profileData,omitempty"` - ScheduleData interface{} `json:"scheduleData,omitempty"` - Events interface{} `json:"events,omitempty"` - TerminatedInfo interface{} `json:"terminatedStates,omitempty"` + Name string `json:"name"` + Namespace string `json:"namespace"` + Username string `json:"username"` + Nickname string `json:"nickname"` + UserInfo UserInfo `json:"userInfo"` + JobName string `json:"jobName"` + JobType string `json:"jobType"` + ScheduleType int `json:"scheduleType"` + WaitingToleranceSeconds *int64 `json:"waitingToleranceSeconds,omitempty"` + Queue string `json:"queue"` + Resources ResourceList `json:"resources"` + Status string `json:"status"` + Retry string `json:"retry"` + ProfileData map[string]interface{} `json:"profileData,omitempty"` + ScheduleData map[string]interface{} `json:"scheduleData,omitempty"` + Events []map[string]interface{} `json:"events,omitempty"` + TerminatedStates []map[string]interface{} `json:"terminatedStates,omitempty"` + TerminatedInfo interface{} `json:"terminatedStates,omitempty"` + CreatedAt time.Time `json:"createdAt"` + StartedAt time.Time `json:"startedAt"` + CompletedAt time.Time `json:"completedAt"` } type PodDetail struct { @@ -76,25 +132,118 @@ type PodDetail struct { Phase string `json:"phase"` } +type JobToken struct { + BaseURL string `json:"baseURL"` + FullURL string `json:"fullURL"` + Token string `json:"token"` + PodName string `json:"podName"` + Namespace string `json:"namespace"` +} + +type SSHInfo struct { + IP string `json:"ip"` + Port string `json:"port"` +} + +type JobCommonRequest struct { + Name string `json:"name"` + VolumeMounts []VolumeMount `json:"volumeMounts,omitempty"` + DatasetMounts []DatasetMount `json:"datasetMounts,omitempty"` + Envs []EnvVar `json:"envs,omitempty"` + Selectors []NodeSelectorRequirement `json:"selectors,omitempty"` + Template string `json:"template,omitempty"` + AlertEnabled bool `json:"alertEnabled"` + CpuPinningEnabled bool `json:"cpuPinningEnabled,omitempty"` + Forwards []Forward `json:"forwards,omitempty"` + ScheduleType *int `json:"scheduleType,omitempty"` +} + +type CreateInteractiveJobRequest struct { + JobCommonRequest + Resource ResourceList `json:"resource"` + Image ImageBaseInfo `json:"image"` +} + +type CreateTrainingJobRequest struct { + CreateInteractiveJobRequest + Shell *string `json:"shell,omitempty"` + Command *string `json:"command,omitempty"` + WorkingDir string `json:"workingDir"` +} + +type PortRequest struct { + Name string `json:"name"` + Port int32 `json:"port"` +} + +type TaskRequest struct { + Name string `json:"name"` + Replicas int32 `json:"replicas"` + Resource ResourceList `json:"resource"` + Image ImageBaseInfo `json:"image"` + Shell *string `json:"shell,omitempty"` + Command *string `json:"command,omitempty"` + WorkingDir *string `json:"workingDir,omitempty"` + Ports []PortRequest `json:"ports,omitempty"` +} + +type CreateDistributedJobRequest struct { + JobCommonRequest + Tasks []TaskRequest `json:"tasks"` +} + +type LockJobRequest struct { + Name string `json:"name"` + IsPermanent bool `json:"isPermanent"` + Days int `json:"days"` + Hours int `json:"hours"` + Minutes int `json:"minutes"` +} + +type CleanupResult struct { + Reminded []string `json:"reminded"` + Deleted []string `json:"deleted"` +} + +type CleanLongTimeRequest struct { + BatchDays *int `json:"batchDays,omitempty"` + InteractiveDays *int `json:"interactiveDays,omitempty"` +} + +type CleanLowGPUUsageRequest struct { + TimeRange int `json:"timeRange"` + WaitTime *int `json:"waitTime,omitempty"` + Util *int `json:"util,omitempty"` +} + func (c *Client) ListJobs(opts JobListOptions) ([]JobInfo, error) { - path := VCJobListPath - var result Response[[]JobInfo] - req := c.httpClient.R(). - SetSuccessResult(&result). - SetErrorResult(&result) + prefix := VCJobsPrefix + if opts.Admin { + prefix = AdminVCJobsPrefix + } + path := prefix + query := url.Values{} switch { case opts.Username != "": - path = fmt.Sprintf("%s/user/%s", VCJobListPath, opts.Username) + path += "/user/" + url.PathEscape(opts.Username) + if opts.Days != 0 { + query.Set("days", fmt.Sprintf("%d", opts.Days)) + } + case opts.All && !opts.Admin: + path += "/all" if opts.Days != 0 { - req.SetQueryParam("days", fmt.Sprintf("%d", opts.Days)) + query.Set("days", fmt.Sprintf("%d", opts.Days)) } - case opts.All: - path = VCJobListPath + "/all" + case opts.Admin: if opts.Days != 0 { - req.SetQueryParam("days", fmt.Sprintf("%d", opts.Days)) + query.Set("days", fmt.Sprintf("%d", opts.Days)) } } - resp, err := req.Get(path) + if encoded := query.Encode(); encoded != "" { + path += "?" + encoded + } + var result Response[[]JobInfo] + resp, err := c.httpClient.R().SetSuccessResult(&result).SetErrorResult(&result).Get(path) if err != nil { return nil, &NetworkError{Cause: err} } @@ -106,10 +255,7 @@ func (c *Client) ListJobs(opts JobListOptions) ([]JobInfo, error) { func (c *Client) GetJob(name string) (*JobDetail, error) { var result Response[JobDetail] - resp, err := c.httpClient.R(). - SetSuccessResult(&result). - SetErrorResult(&result). - Get(VCJobListPath + "/" + name + "/detail") + resp, err := c.httpClient.R().SetSuccessResult(&result).SetErrorResult(&result).Get(VCJobsPrefix + "/" + url.PathEscape(name) + "/detail") if err != nil { return nil, &NetworkError{Cause: err} } @@ -121,10 +267,127 @@ func (c *Client) GetJob(name string) (*JobDetail, error) { func (c *Client) GetJobPods(name string) ([]PodDetail, error) { var result Response[[]PodDetail] + resp, err := c.httpClient.R().SetSuccessResult(&result).SetErrorResult(&result).Get(VCJobsPrefix + "/" + url.PathEscape(name) + "/pods") + if err != nil { + return nil, &NetworkError{Cause: err} + } + if err := errorFromResponse(resp, result.Code, result.Message); err != nil { + return nil, err + } + return result.Data, nil +} + +func (c *Client) GetJobEvents(name string) ([]map[string]interface{}, error) { + var result Response[[]map[string]interface{}] + resp, err := c.httpClient.R().SetSuccessResult(&result).SetErrorResult(&result).Get(VCJobsPrefix + "/" + url.PathEscape(name) + "/event") + if err != nil { + return nil, &NetworkError{Cause: err} + } + if err := errorFromResponse(resp, result.Code, result.Message); err != nil { + return nil, err + } + return result.Data, nil +} + +func (c *Client) GetJobYAML(name string) (string, error) { + var result Response[string] + resp, err := c.httpClient.R().SetSuccessResult(&result).SetErrorResult(&result).Get(VCJobsPrefix + "/" + url.PathEscape(name) + "/yaml") + if err != nil { + return "", &NetworkError{Cause: err} + } + if err := errorFromResponse(resp, result.Code, result.Message); err != nil { + return "", err + } + return result.Data, nil +} + +func (c *Client) GetJobTemplate(name string) (string, error) { + var result Response[string] + resp, err := c.httpClient.R().SetSuccessResult(&result).SetErrorResult(&result).Get(VCJobsPrefix + "/" + url.PathEscape(name) + "/template") + if err != nil { + return "", &NetworkError{Cause: err} + } + if err := errorFromResponse(resp, result.Code, result.Message); err != nil { + return "", err + } + return result.Data, nil +} + +func (c *Client) GetJupyterToken(name string) (*JobToken, error) { + return c.getJobToken(name, "token") +} + +func (c *Client) GetWebIDESecret(name string) (*JobToken, error) { + return c.getJobToken(name, "secret") +} + +func (c *Client) getJobToken(name string, suffix string) (*JobToken, error) { + var result Response[JobToken] + resp, err := c.httpClient.R().SetSuccessResult(&result).SetErrorResult(&result).Get(VCJobsPrefix + "/" + url.PathEscape(name) + "/" + suffix) + if err != nil { + return nil, &NetworkError{Cause: err} + } + if err := errorFromResponse(resp, result.Code, result.Message); err != nil { + return nil, err + } + return &result.Data, nil +} + +func (c *Client) OpenJobSSH(name string) (*SSHInfo, error) { + var result Response[SSHInfo] + resp, err := c.httpClient.R().SetSuccessResult(&result).SetErrorResult(&result).Post(VCJobsPrefix + "/" + url.PathEscape(name) + "/ssh") + if err != nil { + return nil, &NetworkError{Cause: err} + } + if err := errorFromResponse(resp, result.Code, result.Message); err != nil { + return nil, err + } + return &result.Data, nil +} + +func (c *Client) SnapshotJob(name string) (string, error) { + return c.messagePost(VCJobsPrefix+"/"+url.PathEscape(name)+"/snapshot", nil) +} + +func (c *Client) ToggleJobAlert(name string) (string, error) { + return c.messagePut(VCJobsPrefix+"/"+url.PathEscape(name)+"/alert", nil) +} + +func (c *Client) DeleteJob(name string) (string, error) { + return c.messageDelete(VCJobsPrefix + "/" + url.PathEscape(name)) +} + +func (c *Client) AdminDeleteJob(name string) (string, error) { + return c.messageDelete(AdminVCJobsPrefix + "/" + url.PathEscape(name)) +} + +func (c *Client) CreateJupyterJob(req CreateInteractiveJobRequest) (map[string]interface{}, error) { + return c.createJob("jupyter", req) +} + +func (c *Client) CreateWebIDEJob(req CreateInteractiveJobRequest) (map[string]interface{}, error) { + return c.createJob("webide", req) +} + +func (c *Client) CreateTrainingJob(req CreateTrainingJobRequest) (map[string]interface{}, error) { + return c.createJob("training", req) +} + +func (c *Client) CreateTensorflowJob(req CreateDistributedJobRequest) (map[string]interface{}, error) { + return c.createJob("tensorflow", req) +} + +func (c *Client) CreatePytorchJob(req CreateDistributedJobRequest) (map[string]interface{}, error) { + return c.createJob("pytorch", req) +} + +func (c *Client) createJob(kind string, body interface{}) (map[string]interface{}, error) { + var result Response[map[string]interface{}] resp, err := c.httpClient.R(). + SetBody(body). SetSuccessResult(&result). SetErrorResult(&result). - Get(VCJobListPath + "/" + name + "/pods") + Post(VCJobsPrefix + "/" + kind) if err != nil { return nil, &NetworkError{Cause: err} } @@ -134,27 +397,83 @@ func (c *Client) GetJobPods(name string) ([]PodDetail, error) { return result.Data, nil } -func (c *Client) GetJobEvents(name string) ([]interface{}, error) { - var result Response[[]interface{}] +func (c *Client) ToggleJobKeep(name string) (string, error) { + return c.messagePut(AdminOperationsPfx+"/keep/"+url.PathEscape(name), nil) +} + +func (c *Client) LockJob(req LockJobRequest) (string, error) { + return c.messagePut(AdminOperationsPfx+"/add/locktime", req) +} + +func (c *Client) UnlockJob(name string) (string, error) { + return c.messagePut(AdminOperationsPfx+"/clear/locktime", map[string]string{"name": name}) +} + +func (c *Client) CleanWaitingJupyter(waitMinutes int) (*CleanupResult, error) { + return c.cleanup(AdminOperationsPfx+"/clean/clean-waiting-jupyter-job", map[string]interface{}{ + "waitMinitues": waitMinutes, + "jobTypes": []string{"jupyter"}, + }) +} + +func (c *Client) CleanWaitingCustom(waitMinutes int) (*CleanupResult, error) { + return c.cleanup(AdminOperationsPfx+"/clean/clean-waiting-custom-job", map[string]interface{}{ + "waitMinitues": waitMinutes, + "jobTypes": []string{"custom"}, + }) +} + +func (c *Client) CleanLongRunning(req CleanLongTimeRequest) (*CleanupResult, error) { + return c.cleanup(AdminOperationsPfx+"/clean/clean-long-running-job", req) +} + +func (c *Client) CleanLowGPUUsage(req CleanLowGPUUsageRequest) (*CleanupResult, error) { + return c.cleanup(AdminOperationsPfx+"/clean/clean-low-gpu-usage-job", req) +} + +func (c *Client) cleanup(path string, body interface{}) (*CleanupResult, error) { + var result Response[CleanupResult] resp, err := c.httpClient.R(). + SetBody(body). SetSuccessResult(&result). SetErrorResult(&result). - Get(VCJobListPath + "/" + name + "/event") + Post(path) if err != nil { return nil, &NetworkError{Cause: err} } if err := errorFromResponse(resp, result.Code, result.Message); err != nil { return nil, err } + return &result.Data, nil +} + +func (c *Client) messagePost(path string, body interface{}) (string, error) { + var result Response[string] + resp, err := c.httpClient.R().SetBody(body).SetSuccessResult(&result).SetErrorResult(&result).Post(path) + if err != nil { + return "", &NetworkError{Cause: err} + } + if err := errorFromResponse(resp, result.Code, result.Message); err != nil { + return "", err + } return result.Data, nil } -func (c *Client) GetJobYAML(name string) (string, error) { +func (c *Client) messagePut(path string, body interface{}) (string, error) { var result Response[string] - resp, err := c.httpClient.R(). - SetSuccessResult(&result). - SetErrorResult(&result). - Get(VCJobListPath + "/" + name + "/yaml") + resp, err := c.httpClient.R().SetBody(body).SetSuccessResult(&result).SetErrorResult(&result).Put(path) + if err != nil { + return "", &NetworkError{Cause: err} + } + if err := errorFromResponse(resp, result.Code, result.Message); err != nil { + return "", err + } + return result.Data, nil +} + +func (c *Client) messageDelete(path string) (string, error) { + var result Response[string] + resp, err := c.httpClient.R().SetSuccessResult(&result).SetErrorResult(&result).Delete(path) if err != nil { return "", &NetworkError{Cause: err} } diff --git a/cli/internal/api/paths.go b/cli/internal/api/paths.go index 83246587..30287857 100644 --- a/cli/internal/api/paths.go +++ b/cli/internal/api/paths.go @@ -32,6 +32,7 @@ const ( AIJobsPrefix = "/api/v1/aijobs" SPJobsPrefix = "/api/v1/spjobs" VCJobsPrefix = "/api/v1/vcjobs" + AdminVCJobsPrefix = "/api/v1/admin/vcjobs" ) // AuthLoginPath 为登录接口路径(含模块前缀)。 diff --git a/cli/internal/i18n/catalog_job.go b/cli/internal/i18n/catalog_job.go new file mode 100644 index 00000000..9c5c2eb2 --- /dev/null +++ b/cli/internal/i18n/catalog_job.go @@ -0,0 +1,186 @@ +package i18n + +var catalogJob = map[Language]map[string]string{ + En: { + "flag_alert": "Enable job alert", + "flag_batch-days": "Batch job running days threshold", + "flag_command": "Command to run", + "flag_cpu-pinning": "Enable CPU pinning", + "flag_dataset": "Dataset mount id:mountPath, repeatable", + "flag_env": "Environment variable KEY=VALUE, repeatable", + "flag_file": "Read exact JSON request body from file", + "flag_forward": "Forward name:port[:type], repeatable", + "flag_from": "Filter createdAt from time, RFC3339 or YYYY-MM-DD", + "flag_gpu-resource": "GPU resource name", + "flag_hours": "Lock hours", + "flag_image": "Image link", + "flag_interactive-days": "Interactive job running days threshold", + "flag_memory": "Memory request, for example 8Gi", + "flag_minutes": "Lock minutes", + "flag_permanent": "Lock permanently", + "flag_schedule": "Schedule type: normal or backfill", + "flag_selector": "Node selector key=Operator:value1,value2, repeatable", + "flag_shell": "Shell for --command", + "flag_template": "Template name or JSON", + "flag_time-range": "GPU usage lookback range", + "flag_to": "Filter createdAt until time, RFC3339 or YYYY-MM-DD", + "flag_util": "GPU utilization threshold", + "flag_volume": "Workspace mount subPath:mountPath, repeatable", + "flag_wait-minutes": "Waiting minutes threshold", + "flag_wait-time": "Wait time before cleanup", + "flag_working-dir": "Working directory", + + "job_admin_clean_long-running_long": "Run the administrator long-running job cleanup action.", + "job_admin_clean_long-running_short": "Clean long-running jobs", + "job_admin_clean_low-gpu_long": "Run the administrator low GPU usage cleanup action.", + "job_admin_clean_low-gpu_short": "Clean low GPU usage jobs", + "job_admin_clean_long": "Run administrator job cleanup actions.", + "job_admin_clean_short": "Run admin job cleanup actions", + "job_admin_clean_waiting-custom_long": "Cancel waiting custom jobs after a wait threshold.", + "job_admin_clean_waiting-custom_short": "Cancel waiting custom jobs", + "job_admin_clean_waiting-jupyter_long": "Cancel waiting Jupyter jobs after a wait threshold.", + "job_admin_clean_waiting-jupyter_short": "Cancel waiting Jupyter jobs", + "job_admin_keep_long": "Toggle whether a job is kept when low-resource cleanup runs.", + "job_admin_keep_short": "Toggle keep-when-low-usage", + "job_admin_lock_long": "Lock a job to prevent cleanup for a duration or permanently.", + "job_admin_lock_short": "Lock a job cleanup window", + "job_admin_long": "Run administrator-only job operations.", + "job_admin_short": "Admin job operations", + "job_admin_unlock_long": "Clear a job cleanup lock.", + "job_admin_unlock_short": "Unlock a job cleanup window", + "job_alert_long": "Toggle email alert state for a job.", + "job_alert_short": "Toggle job alert state", + "job_create_custom_long": "Create a custom single-node training job from flags or a JSON file.", + "job_create_custom_short": "Create a custom single-node job", + "job_create_jupyter_long": "Create a Jupyter interactive job from flags or a JSON file.", + "job_create_jupyter_short": "Create a Jupyter job", + "job_create_long": "Create interactive, single-node, or distributed jobs.", + "job_create_pytorch_long": "Create a PyTorch distributed job from an exact JSON request file.", + "job_create_pytorch_short": "Create a PyTorch distributed job from JSON", + "job_create_short": "Create jobs", + "job_create_submitted": "job submitted", + "job_create_tensorflow_long": "Create a TensorFlow distributed job from an exact JSON request file.", + "job_create_tensorflow_short": "Create a TensorFlow distributed job from JSON", + "job_create_webide_long": "Create a WebIDE interactive job from flags or a JSON file.", + "job_create_webide_short": "Create a WebIDE job", + "job_delete_long": "Stop or delete one of your jobs.", + "job_delete_short": "Stop or delete a job", + "job_events_long": "List Kubernetes events related to a job.", + "job_label_display_name": "display name", + "job_label_file": "file", + "job_label_gpu_resource": "GPU resource", + "job_label_image": "image", + "job_label_memory": "memory", + "job_label_resources": "resources", + "job_label_task_name": "task name", + "job_label_tasks": "tasks", + "job_label_working_dir": "working directory", + "job_secret_long": "Get WebIDE URL and password for a running WebIDE job.", + "job_secret_short": "Get WebIDE secret", + "job_snapshot_long": "Create an image snapshot for a Jupyter or custom job.", + "job_snapshot_short": "Create a job image snapshot", + "job_ssh_long": "Open SSH access for a running job and print host and port.", + "job_ssh_short": "Open SSH for a running job", + "job_template_long": "Show the saved job template body.", + "job_template_short": "Show job template JSON", + "job_token_long": "Get Jupyter URL and token for a running Jupyter job.", + "job_token_short": "Get Jupyter token", + + "err_invalid_job_days": "days must be -1 or greater", + + "table_completed_at": "CompletedAt", + "table_created_at": "CreatedAt", + "table_started_at": "StartedAt", + }, + ZhCN: { + "flag_alert": "开启作业告警", + "flag_batch-days": "批处理作业运行天数阈值", + "flag_command": "要运行的命令", + "flag_cpu-pinning": "开启 CPU 绑核", + "flag_dataset": "数据集挂载 id:mountPath,可重复", + "flag_env": "环境变量 KEY=VALUE,可重复", + "flag_file": "从文件读取完整 JSON 请求体", + "flag_forward": "端口转发 name:port[:type],可重复", + "flag_from": "按 createdAt 起始时间筛选,RFC3339 或 YYYY-MM-DD", + "flag_gpu-resource": "GPU 资源名称", + "flag_hours": "锁定小时数", + "flag_image": "镜像地址", + "flag_interactive-days": "交互式作业运行天数阈值", + "flag_memory": "内存请求量,例如 8Gi", + "flag_minutes": "锁定分钟数", + "flag_permanent": "永久锁定", + "flag_schedule": "调度类型:normal 或 backfill", + "flag_selector": "节点选择器 key=Operator:value1,value2,可重复", + "flag_shell": "--command 使用的 shell", + "flag_template": "模板名称或 JSON", + "flag_time-range": "GPU 使用率回看范围", + "flag_to": "按 createdAt 截止时间筛选,RFC3339 或 YYYY-MM-DD", + "flag_util": "GPU 利用率阈值", + "flag_volume": "工作区挂载 subPath:mountPath,可重复", + "flag_wait-minutes": "等待分钟数阈值", + "flag_wait-time": "清理前等待时间", + "flag_working-dir": "工作目录", + + "job_admin_clean_long-running_long": "执行管理员长时间运行作业清理。", + "job_admin_clean_long-running_short": "清理长时间运行作业", + "job_admin_clean_low-gpu_long": "执行管理员低 GPU 利用率作业清理。", + "job_admin_clean_low-gpu_short": "清理低 GPU 利用率作业", + "job_admin_clean_long": "执行管理员作业清理操作。", + "job_admin_clean_short": "执行管理员作业清理", + "job_admin_clean_waiting-custom_long": "按等待阈值取消等待中的自定义作业。", + "job_admin_clean_waiting-custom_short": "取消等待中的自定义作业", + "job_admin_clean_waiting-jupyter_long": "按等待阈值取消等待中的 Jupyter 作业。", + "job_admin_clean_waiting-jupyter_short": "取消等待中的 Jupyter 作业", + "job_admin_keep_long": "切换作业在低资源清理时是否保留。", + "job_admin_keep_short": "切换低利用率清理保留状态", + "job_admin_lock_long": "按时长或永久锁定作业,避免被清理。", + "job_admin_lock_short": "锁定作业清理窗口", + "job_admin_long": "执行仅管理员可用的作业操作。", + "job_admin_short": "管理员作业操作", + "job_admin_unlock_long": "清除作业清理锁定。", + "job_admin_unlock_short": "解锁作业清理窗口", + "job_alert_long": "切换作业邮件告警状态。", + "job_alert_short": "切换作业告警状态", + "job_create_custom_long": "通过 flags 或 JSON 文件创建自定义单机训练作业。", + "job_create_custom_short": "创建自定义单机作业", + "job_create_jupyter_long": "通过 flags 或 JSON 文件创建 Jupyter 交互式作业。", + "job_create_jupyter_short": "创建 Jupyter 作业", + "job_create_long": "创建交互式、单机或分布式作业。", + "job_create_pytorch_long": "从完整 JSON 请求文件创建 PyTorch 分布式作业。", + "job_create_pytorch_short": "从 JSON 创建 PyTorch 分布式作业", + "job_create_short": "创建作业", + "job_create_submitted": "作业已提交", + "job_create_tensorflow_long": "从完整 JSON 请求文件创建 TensorFlow 分布式作业。", + "job_create_tensorflow_short": "从 JSON 创建 TensorFlow 分布式作业", + "job_create_webide_long": "通过 flags 或 JSON 文件创建 WebIDE 交互式作业。", + "job_create_webide_short": "创建 WebIDE 作业", + "job_delete_long": "停止或删除自己的作业。", + "job_delete_short": "停止或删除作业", + "job_events_long": "列出作业相关 Kubernetes 事件。", + "job_label_display_name": "显示名称", + "job_label_file": "文件", + "job_label_gpu_resource": "GPU 资源", + "job_label_image": "镜像", + "job_label_memory": "内存", + "job_label_resources": "资源", + "job_label_task_name": "任务名称", + "job_label_tasks": "任务列表", + "job_label_working_dir": "工作目录", + "job_secret_long": "获取运行中 WebIDE 作业的 URL 和密码。", + "job_secret_short": "获取 WebIDE 密钥", + "job_snapshot_long": "为 Jupyter 或自定义作业创建镜像快照。", + "job_snapshot_short": "创建作业镜像快照", + "job_ssh_long": "为运行中作业开启 SSH 并输出主机与端口。", + "job_ssh_short": "开启作业 SSH", + "job_template_long": "显示保存的作业模板内容。", + "job_template_short": "显示作业模板 JSON", + "job_token_long": "获取运行中 Jupyter 作业的 URL 和 token。", + "job_token_short": "获取 Jupyter token", + + "err_invalid_job_days": "days 必须大于等于 -1", + + "table_completed_at": "完成时间", + "table_created_at": "创建时间", + "table_started_at": "开始时间", + }, +} diff --git a/cli/internal/i18n/i18n.go b/cli/internal/i18n/i18n.go index 3f4ef85c..a7857f23 100644 --- a/cli/internal/i18n/i18n.go +++ b/cli/internal/i18n/i18n.go @@ -25,6 +25,7 @@ var translations = mergeCatalogs( catalogDownload, catalogRead, catalogErrors, + catalogJob, ) func mergeCatalogs(catalogs ...map[Language]map[string]string) map[Language]map[string]string { diff --git a/cli/skills/crater-cli-admin-job/SKILL.md b/cli/skills/crater-cli-admin-job/SKILL.md new file mode 100644 index 00000000..2ee63c8b --- /dev/null +++ b/cli/skills/crater-cli-admin-job/SKILL.md @@ -0,0 +1,52 @@ +--- +name: crater-cli-admin-job +version: 0.1.0 +description: Use Crater CLI admin job commands to list, delete, lock, keep, and clean jobs. +--- + +# Crater CLI Admin Job + +Use this skill when the user asks for administrator job operations from the CLI. Administrator commands use the `crater admin job ...` prefix. Do not use `--admin` on ordinary `crater job ...` commands. + +First apply the shared Crater CLI rules from `crater-cli-shared`: prefer `--json --no-interactive` for automation, treat stderr JSON as the error contract, and confirm exact `jobName` values before destructive operations. + +## Command Map + +- List admin-visible jobs: `crater admin job ls` +- Delete a job as admin: `crater admin job delete ` +- Lock cleanup: `crater admin job lock [--permanent | --days N | --hours N | --minutes N]` +- Unlock cleanup: `crater admin job unlock ` +- Toggle low-usage keep state: `crater admin job keep ` +- Cleanup jobs: `crater admin job clean waiting-jupyter|waiting-custom|long-running|low-gpu ...` + +## Safe Defaults + +Use `crater admin job ls --json --no-interactive` before destructive actions to confirm the exact backend `jobName`. User-facing display names are not always accepted by job APIs. + +Do not pass negative durations or cleanup thresholds. `lock` requires `--permanent` or at least one positive duration field. + +## Common Workflows + +List a user's jobs as admin: + +```bash +crater admin job ls --user alice --json --no-interactive +``` + +Delete a job as admin: + +```bash +crater admin job delete jpt-alice-abcde --json --no-interactive +``` + +Lock a job for cleanup protection: + +```bash +crater admin job lock jpt-alice-abcde --days 1 --json --no-interactive +``` + +Clean low GPU usage jobs: + +```bash +crater admin job clean low-gpu --time-range 3600 --wait-time 600 --util 10 --json --no-interactive +``` diff --git a/cli/skills/crater-cli-job/SKILL.md b/cli/skills/crater-cli-job/SKILL.md new file mode 100644 index 00000000..8397ef1d --- /dev/null +++ b/cli/skills/crater-cli-job/SKILL.md @@ -0,0 +1,74 @@ +--- +name: crater-cli-job +version: 0.1.0 +description: Use Crater CLI job commands to list, inspect, create, stop, and snapshot jobs. +--- + +# Crater CLI Job + +Use this skill when the user asks to operate Crater jobs from the CLI: list jobs, inspect details, view pods/events/YAML/templates, get Jupyter/WebIDE access, open SSH, create jobs, stop/delete jobs, or snapshot jobs. + +First apply the shared Crater CLI rules from `crater-cli-shared`: prefer `--json --no-interactive` for automation, treat stderr JSON as the error contract, and never expose sensitive tokens unless the user explicitly needs them. + +## Command Map + +- List jobs: `crater job ls` +- Detail surfaces: `crater job get|pods|events|yaml|template ` +- Access helpers: `crater job token `, `crater job secret `, `crater job ssh ` +- Lifecycle helpers: `crater job snapshot `, `crater job alert `, `crater job delete ` +- Create interactive jobs: `crater job create jupyter|webide ...` +- Create custom jobs: `crater job create custom ...` +- Create distributed jobs: `crater job create tensorflow|pytorch --file request.json` + +## Safe Defaults + +Use `crater job ls --json --no-interactive` before destructive actions to confirm the exact `jobName`. User-facing display names are not always accepted by job APIs. + +For create commands, validate resource values before calling the platform. CPU, memory, GPU counts, task replicas, and cleanup thresholds must not be negative. The CLI also performs local validation, but agents should still avoid constructing invalid requests. + +For Jupyter/WebIDE access commands, the returned token or password is sensitive. Prefer JSON only when the next tool needs structured fields, and avoid echoing secrets into logs or issue bodies. + +## Common Workflows + +List running GPU jobs for a user: + +```bash +crater job ls --user alice --status Running --json --no-interactive +``` + +Inspect a job: + +```bash +crater job get jpt-alice-abcde --json --no-interactive +crater job pods jpt-alice-abcde --json --no-interactive +crater job events jpt-alice-abcde --json --no-interactive +``` + +Create a Jupyter job: + +```bash +crater job create jupyter \ + --name experiment-notebook \ + --image harbor.example/project/jupyter:latest \ + --cpu 4 \ + --memory 16Gi \ + --gpu 1 \ + --gpu-resource nvidia.com/gpu \ + --json --no-interactive +``` + +Create a distributed PyTorch job from an exact backend-compatible request: + +```bash +crater job create pytorch --file pytorch-job.json --json --no-interactive +``` + +Stop or delete a job: + +```bash +crater job delete jpt-alice-abcde --json --no-interactive +``` + +## Notes + +`crater job create tensorflow|pytorch` intentionally uses `--file` because the backend accepts a nested `tasks[]` request. Keep the JSON aligned with the frontend DTO fields: `name`, `tasks`, `resource`, `image.imageLink`, `volumeMounts`, `envs`, `selectors`, `alertEnabled`, `template`, and optional scheduling fields. diff --git a/cli/test/snapshots/job/job_test.go b/cli/test/snapshots/job/job_test.go new file mode 100644 index 00000000..f439a652 --- /dev/null +++ b/cli/test/snapshots/job/job_test.go @@ -0,0 +1,68 @@ +package job_test + +import ( + "os" + "testing" + + "github.com/raids-lab/crater/cli/internal/snaptest" +) + +const goldenStemJob = "job" + +func runJobCases(t *testing.T, bin string, baseEnv []string, cases []snaptest.Case) []*snaptest.Result { + t.Helper() + out := make([]*snaptest.Result, len(cases)) + for i := range cases { + r, err := snaptest.Run(bin, baseEnv, cases[i].Args) + if err != nil { + t.Fatalf("case %s: %v", cases[i].ID, err) + } + out[i] = r + } + return out +} + +func jobCases() []snaptest.Case { + return []snaptest.Case{ + {ID: "01-unknown-nojson", Args: []string{"job", "wat", "--no-interactive"}}, + {ID: "02-unknown-json", Args: []string{"job", "wat", "--no-interactive", "--json"}}, + {ID: "03-get-missing-name-nojson", Args: []string{"job", "get", "--no-interactive"}}, + {ID: "04-get-missing-name-json", Args: []string{"job", "get", "--no-interactive", "--json"}}, + {ID: "05-ls-invalid-days-nojson", Args: []string{"job", "ls", "--no-interactive", "--days", "-2"}}, + {ID: "06-ls-invalid-status-json", Args: []string{"job", "ls", "--no-interactive", "--json", "--status", "bad"}}, + {ID: "07-create-jupyter-multi-usage-nojson", Args: []string{"job", "create", "jupyter", "--no-interactive", "--name", "demo", "--cpu", "-1", "--memory", "-2Gi", "--gpu", "-1"}}, + {ID: "08-create-jupyter-multi-usage-json", Args: []string{"job", "create", "jupyter", "--no-interactive", "--json", "--name", "demo", "--cpu", "-1", "--memory", "-2Gi", "--gpu", "-1"}}, + {ID: "09-create-custom-missing-working-dir-json", Args: []string{"job", "create", "custom", "--no-interactive", "--json", "--name", "demo", "--image", "example/image:tag", "--memory", "2Gi", "--working-dir", ""}}, + {ID: "10-admin-lock-missing-duration-nojson", Args: []string{"admin", "job", "lock", "job-123", "--no-interactive"}}, + {ID: "11-admin-clean-low-gpu-invalid-json", Args: []string{"admin", "job", "clean", "low-gpu", "--no-interactive", "--json", "--time-range", "0", "--wait-time", "-1"}}, + {ID: "12-ls-network-timeout-json", Args: []string{"job", "ls", "--no-interactive", "--json"}}, + } +} + +func TestJobSnapshotsEN(t *testing.T) { + path := snaptest.GoldenFileT(t, "job", goldenStemJob, "en") + home := t.TempDir() + baseEnv := snaptest.EnvMinimal(home, "en") + baseEnv = append(baseEnv, "CRATER_TEST_SANDBOX_HTTP=timeout") + bin := snaptest.CraterExecutable(t) + cases := jobCases() + results := runJobCases(t, bin, baseEnv, cases) + update := os.Getenv("UPDATE_SNAPSHOTS") == "1" || os.Getenv("UPDATE_SNAPSHOTS") == "true" + if err := snaptest.MatchOrUpdateGolden(path, "en", cases, results, update); err != nil { + t.Fatal(err) + } +} + +func TestJobSnapshotsZhCN(t *testing.T) { + path := snaptest.GoldenFileT(t, "job", goldenStemJob, "zh-CN") + home := t.TempDir() + baseEnv := snaptest.EnvMinimal(home, "zh-CN") + baseEnv = append(baseEnv, "CRATER_TEST_SANDBOX_HTTP=timeout") + bin := snaptest.CraterExecutable(t) + cases := jobCases() + results := runJobCases(t, bin, baseEnv, cases) + update := os.Getenv("UPDATE_SNAPSHOTS") == "1" || os.Getenv("UPDATE_SNAPSHOTS") == "true" + if err := snaptest.MatchOrUpdateGolden(path, "zh-CN", cases, results, update); err != nil { + t.Fatal(err) + } +} diff --git a/cli/test/snapshots/read/read_matrix_test.go b/cli/test/snapshots/read/read_matrix_test.go index 91cb82ab..60f064a5 100644 --- a/cli/test/snapshots/read/read_matrix_test.go +++ b/cli/test/snapshots/read/read_matrix_test.go @@ -14,7 +14,8 @@ func TestReadCommandMatrix(t *testing.T) { commands := [][]string{ {"node", "ls"}, {"node", "get"}, {"node", "pods"}, {"node", "gpu"}, - {"job", "ls"}, {"job", "get"}, {"job", "pods"}, {"job", "events"}, {"job", "yaml"}, + {"job", "ls"}, {"job", "get"}, {"job", "pods"}, {"job", "events"}, {"job", "yaml"}, {"job", "template"}, {"job", "token"}, {"job", "secret"}, {"job", "ssh"}, {"job", "snapshot"}, {"job", "alert"}, {"job", "delete"}, + {"job", "create", "jupyter"}, {"job", "create", "webide"}, {"job", "create", "custom"}, {"job", "create", "tensorflow"}, {"job", "create", "pytorch"}, {"image", "ls"}, {"account", "ls"}, {"account", "get"}, {"account", "members"}, {"account", "users-out"}, {"account", "billing", "config"}, {"account", "billing", "members"}, {"resource", "ls"}, {"resource", "networks"}, {"resource", "vgpu"}, {"resource", "prices"}, @@ -33,6 +34,8 @@ func TestReadCommandMatrix(t *testing.T) { {"admin", "dataset", "ls"}, {"admin", "model-download", "ls"}, {"admin", "billing", "status"}, {"admin", "billing", "jobs"}, + {"admin", "job", "ls"}, {"admin", "job", "delete"}, {"admin", "job", "lock"}, {"admin", "job", "unlock"}, {"admin", "job", "keep"}, + {"admin", "job", "clean", "waiting-jupyter"}, {"admin", "job", "clean", "waiting-custom"}, {"admin", "job", "clean", "long-running"}, {"admin", "job", "clean", "low-gpu"}, {"admin", "order", "ls"}, {"admin", "order", "get"}, {"admin", "user", "ls"}, {"admin", "user", "billing", "summary"}, {"admin", "user", "billing", "accounts"}, } @@ -57,7 +60,7 @@ func TestReadCommandMatrix(t *testing.T) { {"admin", "system-config", "llm"}, {"admin", "system-config", "gpu-analysis"}, {"admin", "system-config", "prequeue"}, {"admin", "queue-quotas"}, {"admin", "gpu-analyses"}, {"admin", "operation-logs"}, {"admin", "cronjobs"}, {"admin", "whitelist"}, {"admin", "account", "ls"}, {"admin", "dataset", "ls"}, {"admin", "model-download", "ls"}, - {"admin", "billing", "status"}, {"admin", "billing", "jobs"}, {"admin", "order", "ls"}, {"admin", "user", "ls"}, {"admin", "user", "billing", "summary"}, + {"admin", "billing", "status"}, {"admin", "billing", "jobs"}, {"admin", "job", "ls"}, {"admin", "order", "ls"}, {"admin", "user", "ls"}, {"admin", "user", "billing", "summary"}, } env404 := append(baseEnv, "CRATER_TEST_SANDBOX_HTTP=error404") for _, command := range apiCases { diff --git a/cli/testdata/snapshots/job/job.en.txtar b/cli/testdata/snapshots/job/job.en.txtar new file mode 100644 index 00000000..97ea0a2d --- /dev/null +++ b/cli/testdata/snapshots/job/job.en.txtar @@ -0,0 +1,167 @@ +# Crater CLI snapshot bundle (txtar). Regenerate: make snapshot-update (or UPDATE_SNAPSHOTS=1 go test ./test/snapshots/...) +-- en/01-unknown-nojson/argv -- +crater job wat --no-interactive +-- en/01-unknown-nojson/exit -- +2 +-- en/01-unknown-nojson/stdout -- +-- en/01-unknown-nojson/stderr -- +Error: + unknown command "wat" for "crater job" + + Did you mean this? + get + + Run "crater job --help" for usage. +-- en/02-unknown-json/argv -- +crater job wat --no-interactive --json +-- en/02-unknown-json/exit -- +2 +-- en/02-unknown-json/stdout -- +-- en/02-unknown-json/stderr -- +{ + "category": "usage_error", + "code": "ERR_UNKNOWN_COMMAND", + "message": "unknown command \"wat\" for \"crater job\"\n\nDid you mean this?\n\tget\n\nRun \"crater job --help\" for usage." +} +-- en/03-get-missing-name-nojson/argv -- +crater job get --no-interactive +-- en/03-get-missing-name-nojson/exit -- +2 +-- en/03-get-missing-name-nojson/stdout -- +-- en/03-get-missing-name-nojson/stderr -- +Error: + job name is required () +-- en/04-get-missing-name-json/argv -- +crater job get --no-interactive --json +-- en/04-get-missing-name-json/exit -- +2 +-- en/04-get-missing-name-json/stdout -- +-- en/04-get-missing-name-json/stderr -- +{ + "category": "usage_error", + "code": "ERR_MISSING_REQUIRED_FLAG", + "message": "job name is required (\u003cname\u003e)" +} +-- en/05-ls-invalid-days-nojson/argv -- +crater job ls --no-interactive --days -2 +-- en/05-ls-invalid-days-nojson/exit -- +2 +-- en/05-ls-invalid-days-nojson/stdout -- +-- en/05-ls-invalid-days-nojson/stderr -- +Error: + days must be -1 or greater +-- en/06-ls-invalid-status-json/argv -- +crater job ls --no-interactive --json --status bad +-- en/06-ls-invalid-status-json/exit -- +2 +-- en/06-ls-invalid-status-json/stdout -- +-- en/06-ls-invalid-status-json/stderr -- +{ + "category": "usage_error", + "code": "ERR_INVALID_FLAG_VALUE", + "message": "invalid job status: bad" +} +-- en/07-create-jupyter-multi-usage-nojson/argv -- +crater job create jupyter --no-interactive --name demo --cpu -1 --memory -2Gi --gpu -1 +-- en/07-create-jupyter-multi-usage-nojson/exit -- +2 +-- en/07-create-jupyter-multi-usage-nojson/stdout -- +-- en/07-create-jupyter-multi-usage-nojson/stderr -- +Error: + image is required (--image) + err_invalid_non_negative_float + err_invalid_non_negative_int + err_invalid_non_negative_int +-- en/08-create-jupyter-multi-usage-json/argv -- +crater job create jupyter --no-interactive --json --name demo --cpu -1 --memory -2Gi --gpu -1 +-- en/08-create-jupyter-multi-usage-json/exit -- +2 +-- en/08-create-jupyter-multi-usage-json/stdout -- +-- en/08-create-jupyter-multi-usage-json/stderr -- +{ + "category": "usage_error", + "code": "ERR_INVALID_FLAG_VALUE", + "message": "image is required (--image)\nerr_invalid_non_negative_float\nerr_invalid_non_negative_int\nerr_invalid_non_negative_int", + "context": { + "issues": [ + { + "code": "ERR_MISSING_REQUIRED_FLAG", + "field": "image", + "message": "image is required (--image)" + }, + { + "code": "ERR_INVALID_FLAG_VALUE", + "field": "cpu", + "message": "err_invalid_non_negative_float" + }, + { + "code": "ERR_INVALID_FLAG_VALUE", + "field": "memory", + "message": "err_invalid_non_negative_int" + }, + { + "code": "ERR_INVALID_FLAG_VALUE", + "field": "gpu", + "message": "err_invalid_non_negative_int" + } + ] + } +} +-- en/09-create-custom-missing-working-dir-json/argv -- +crater job create custom --no-interactive --json --name demo --image example/image:tag --memory 2Gi --working-dir +-- en/09-create-custom-missing-working-dir-json/exit -- +2 +-- en/09-create-custom-missing-working-dir-json/stdout -- +-- en/09-create-custom-missing-working-dir-json/stderr -- +{ + "category": "usage_error", + "code": "ERR_MISSING_REQUIRED_FLAG", + "message": "working directory is required (--working-dir)" +} +-- en/10-admin-lock-missing-duration-nojson/argv -- +crater admin job lock job-123 --no-interactive +-- en/10-admin-lock-missing-duration-nojson/exit -- +2 +-- en/10-admin-lock-missing-duration-nojson/stdout -- +-- en/10-admin-lock-missing-duration-nojson/stderr -- +Error: + err_value_required_when_flag_enabled +-- en/11-admin-clean-low-gpu-invalid-json/argv -- +crater admin job clean low-gpu --no-interactive --json --time-range 0 --wait-time -1 +-- en/11-admin-clean-low-gpu-invalid-json/exit -- +2 +-- en/11-admin-clean-low-gpu-invalid-json/stdout -- +-- en/11-admin-clean-low-gpu-invalid-json/stderr -- +{ + "category": "usage_error", + "code": "ERR_INVALID_FLAG_VALUE", + "message": "err_invalid_positive_int\nerr_invalid_non_negative_int", + "context": { + "issues": [ + { + "code": "ERR_INVALID_FLAG_VALUE", + "field": "time-range", + "message": "err_invalid_positive_int" + }, + { + "code": "ERR_INVALID_FLAG_VALUE", + "field": "threshold", + "message": "err_invalid_non_negative_int" + } + ] + } +} +-- en/12-ls-network-timeout-json/argv -- +crater job ls --no-interactive --json +-- en/12-ls-network-timeout-json/exit -- +4 +-- en/12-ls-network-timeout-json/stdout -- +-- en/12-ls-network-timeout-json/stderr -- +{ + "category": "api_error", + "code": "ERR_NETWORK_FAILURE", + "message": "could not reach server: Get \"https://example.invalid/api/v1/vcjobs\": context deadline exceeded", + "context": { + "msg": "network error: Get \"https://example.invalid/api/v1/vcjobs\": context deadline exceeded" + } +} diff --git a/cli/testdata/snapshots/job/job.zh-CN.txtar b/cli/testdata/snapshots/job/job.zh-CN.txtar new file mode 100644 index 00000000..87b1849a --- /dev/null +++ b/cli/testdata/snapshots/job/job.zh-CN.txtar @@ -0,0 +1,167 @@ +# Crater CLI snapshot bundle (txtar). Regenerate: make snapshot-update (or UPDATE_SNAPSHOTS=1 go test ./test/snapshots/...) +-- zh-CN/01-unknown-nojson/argv -- +crater job wat --no-interactive +-- zh-CN/01-unknown-nojson/exit -- +2 +-- zh-CN/01-unknown-nojson/stdout -- +-- zh-CN/01-unknown-nojson/stderr -- +Error: + unknown command "wat" for "crater job" + + Did you mean this? + get + + Run "crater job --help" for usage. +-- zh-CN/02-unknown-json/argv -- +crater job wat --no-interactive --json +-- zh-CN/02-unknown-json/exit -- +2 +-- zh-CN/02-unknown-json/stdout -- +-- zh-CN/02-unknown-json/stderr -- +{ + "category": "usage_error", + "code": "ERR_UNKNOWN_COMMAND", + "message": "unknown command \"wat\" for \"crater job\"\n\nDid you mean this?\n\tget\n\nRun \"crater job --help\" for usage." +} +-- zh-CN/03-get-missing-name-nojson/argv -- +crater job get --no-interactive +-- zh-CN/03-get-missing-name-nojson/exit -- +2 +-- zh-CN/03-get-missing-name-nojson/stdout -- +-- zh-CN/03-get-missing-name-nojson/stderr -- +Error: + 缺少必要参数:作业名称 () +-- zh-CN/04-get-missing-name-json/argv -- +crater job get --no-interactive --json +-- zh-CN/04-get-missing-name-json/exit -- +2 +-- zh-CN/04-get-missing-name-json/stdout -- +-- zh-CN/04-get-missing-name-json/stderr -- +{ + "category": "usage_error", + "code": "ERR_MISSING_REQUIRED_FLAG", + "message": "缺少必要参数:作业名称 (\u003cname\u003e)" +} +-- zh-CN/05-ls-invalid-days-nojson/argv -- +crater job ls --no-interactive --days -2 +-- zh-CN/05-ls-invalid-days-nojson/exit -- +2 +-- zh-CN/05-ls-invalid-days-nojson/stdout -- +-- zh-CN/05-ls-invalid-days-nojson/stderr -- +Error: + days 必须大于等于 -1 +-- zh-CN/06-ls-invalid-status-json/argv -- +crater job ls --no-interactive --json --status bad +-- zh-CN/06-ls-invalid-status-json/exit -- +2 +-- zh-CN/06-ls-invalid-status-json/stdout -- +-- zh-CN/06-ls-invalid-status-json/stderr -- +{ + "category": "usage_error", + "code": "ERR_INVALID_FLAG_VALUE", + "message": "无效的作业状态:bad" +} +-- zh-CN/07-create-jupyter-multi-usage-nojson/argv -- +crater job create jupyter --no-interactive --name demo --cpu -1 --memory -2Gi --gpu -1 +-- zh-CN/07-create-jupyter-multi-usage-nojson/exit -- +2 +-- zh-CN/07-create-jupyter-multi-usage-nojson/stdout -- +-- zh-CN/07-create-jupyter-multi-usage-nojson/stderr -- +Error: + 缺少必要参数:镜像 (--image) + err_invalid_non_negative_float + err_invalid_non_negative_int + err_invalid_non_negative_int +-- zh-CN/08-create-jupyter-multi-usage-json/argv -- +crater job create jupyter --no-interactive --json --name demo --cpu -1 --memory -2Gi --gpu -1 +-- zh-CN/08-create-jupyter-multi-usage-json/exit -- +2 +-- zh-CN/08-create-jupyter-multi-usage-json/stdout -- +-- zh-CN/08-create-jupyter-multi-usage-json/stderr -- +{ + "category": "usage_error", + "code": "ERR_INVALID_FLAG_VALUE", + "message": "缺少必要参数:镜像 (--image)\nerr_invalid_non_negative_float\nerr_invalid_non_negative_int\nerr_invalid_non_negative_int", + "context": { + "issues": [ + { + "code": "ERR_MISSING_REQUIRED_FLAG", + "field": "image", + "message": "缺少必要参数:镜像 (--image)" + }, + { + "code": "ERR_INVALID_FLAG_VALUE", + "field": "cpu", + "message": "err_invalid_non_negative_float" + }, + { + "code": "ERR_INVALID_FLAG_VALUE", + "field": "memory", + "message": "err_invalid_non_negative_int" + }, + { + "code": "ERR_INVALID_FLAG_VALUE", + "field": "gpu", + "message": "err_invalid_non_negative_int" + } + ] + } +} +-- zh-CN/09-create-custom-missing-working-dir-json/argv -- +crater job create custom --no-interactive --json --name demo --image example/image:tag --memory 2Gi --working-dir +-- zh-CN/09-create-custom-missing-working-dir-json/exit -- +2 +-- zh-CN/09-create-custom-missing-working-dir-json/stdout -- +-- zh-CN/09-create-custom-missing-working-dir-json/stderr -- +{ + "category": "usage_error", + "code": "ERR_MISSING_REQUIRED_FLAG", + "message": "缺少必要参数:工作目录 (--working-dir)" +} +-- zh-CN/10-admin-lock-missing-duration-nojson/argv -- +crater admin job lock job-123 --no-interactive +-- zh-CN/10-admin-lock-missing-duration-nojson/exit -- +2 +-- zh-CN/10-admin-lock-missing-duration-nojson/stdout -- +-- zh-CN/10-admin-lock-missing-duration-nojson/stderr -- +Error: + err_value_required_when_flag_enabled +-- zh-CN/11-admin-clean-low-gpu-invalid-json/argv -- +crater admin job clean low-gpu --no-interactive --json --time-range 0 --wait-time -1 +-- zh-CN/11-admin-clean-low-gpu-invalid-json/exit -- +2 +-- zh-CN/11-admin-clean-low-gpu-invalid-json/stdout -- +-- zh-CN/11-admin-clean-low-gpu-invalid-json/stderr -- +{ + "category": "usage_error", + "code": "ERR_INVALID_FLAG_VALUE", + "message": "err_invalid_positive_int\nerr_invalid_non_negative_int", + "context": { + "issues": [ + { + "code": "ERR_INVALID_FLAG_VALUE", + "field": "time-range", + "message": "err_invalid_positive_int" + }, + { + "code": "ERR_INVALID_FLAG_VALUE", + "field": "threshold", + "message": "err_invalid_non_negative_int" + } + ] + } +} +-- zh-CN/12-ls-network-timeout-json/argv -- +crater job ls --no-interactive --json +-- zh-CN/12-ls-network-timeout-json/exit -- +4 +-- zh-CN/12-ls-network-timeout-json/stdout -- +-- zh-CN/12-ls-network-timeout-json/stderr -- +{ + "category": "api_error", + "code": "ERR_NETWORK_FAILURE", + "message": "无法连接服务器:Get \"https://example.invalid/api/v1/vcjobs\": context deadline exceeded", + "context": { + "msg": "network error: Get \"https://example.invalid/api/v1/vcjobs\": context deadline exceeded" + } +}