分布式训练相关
更新时间:2025-05-28
查询训练任务列表
使用以下代码可以查询训练任务列表。
Go
1// import "github.com/baidubce/bce-sdk-go/services/aihc"
2// import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
3ak, sk, endpoint := ak_test, sk_test, endpoint_test
4client, _ := aihc.NewClient(ak, sk, endpoint)
5req := &v1.OpenAPIJobListRequest{
6ResourcePoolID: RESOURCE_POOL_ID,
7PageNo: 1,
8PageSize: 3,
9}
10result, err := client.ListJobs(req)
11
12if err != nil {
13panic(err)
14}
15jsonBytes, _ := json.Marshal(result)
16fmt.Println(string(jsonBytes))
注意:
- 请根据接口文档填写具体的访问参数,接口文档参见「查询训练任务列表」。
创建训练任务
使用以下代码可以创建训练任务。
Go
1// import "github.com/baidubce/bce-sdk-go/services/aihc"
2// import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
3ak, sk, endpoint := ak_test, sk_test, endpoint_test
4resourcePoolID := RESOURCE_POOL_ID
5
6jobConfig := &v1.OpenAPIJobCreateRequest{
7Name: AIJobName,
8JobSpec: v1.OpenAPIAIJobSpec{
9Command: `echo "hello sdk"; sleep infinity`,
10Replicas: 1,
11Image: ImageID,
12Resources: []v1.OpenAPIResource{
13{
14Name: "cpu",
15Quantity: 1,
16},
17},
18EnableRDMA: false,
19},
20EnableBccl: false,
21}
22client, _ := aihc.NewClient(ak, sk, endpoint)
23result, err := client.CreateJob(jobConfig, resourcePoolID)
24
25if err != nil {
26panic(err)
27}
28
29jsonBytes, _ := json.Marshal(result)
30fmt.Println(string(jsonBytes))
注意:
- 请根据接口文档填写具体的访问参数,接口文档参见「创建训练任务」。
查询训练任务详情
使用以下代码可以查询训练任务详情。
Go
1// import "github.com/baidubce/bce-sdk-go/services/aihc"
2// import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
3ak, sk, endpoint := ak_test, sk_test, endpoint_test
4resourcePoolID, JobID := RESOURCE_POOL_ID, AIJobID
5
6client, _ := aihc.NewClient(ak, sk, endpoint)
7result, err := client.GetJob(JobID, resourcePoolID)
8
9if err != nil {
10panic(err)
11}
12
13jsonBytes, _ := json.Marshal(result)
14fmt.Println(string(jsonBytes))
注意:
- 请根据接口文档填写具体的访问参数,接口文档参见「查询训练任务详情」。
更新训练任务
使用以下代码可以更新训练任务。
Go
1// import "github.com/baidubce/bce-sdk-go/services/aihc"
2// import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
3ak, sk, endpoint := ak_test, sk_test, endpoint_test
4resourcePoolID := RESOURCE_POOL_ID
5jobID := AIJobID
6
7jobConfig := &v1.OpenAPIJobUpdateRequest{
8Priority: "high",
9}
10client, _ := aihc.NewClient(ak, sk, endpoint)
11result, err := client.UpdateJob(jobConfig, jobID, resourcePoolID)
12
13if err != nil {
14panic(err)
15}
16jsonBytes, _ := json.Marshal(result)
17fmt.Println(string(jsonBytes))
注意:
- 请根据接口文档填写具体的访问参数,接口文档参见「更新训练任务」。
停止训练任务
使用以下代码可以停止训练任务。
Go
1// import "github.com/baidubce/bce-sdk-go/services/aihc"
2// import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
3ak, sk, endpoint := ak_test, sk_test, endpoint_test
4resourcePoolID := RESOURCE_POOL_ID
5jobID := AIJobID
6
7client, _ := aihc.NewClient(ak, sk, endpoint)
8result, err := client.StopJob(jobID, resourcePoolID)
9log.Infof("stop job result: %v", result)
10if err != nil {
11panic(err)
12}
13jsonBytes, _ := json.Marshal(result)
14fmt.Println(string(jsonBytes))
注意:
- 请根据接口文档填写具体的访问参数,接口文档参见「停止训练任务」。
删除训练任务
使用以下代码可以删除训练任务。
Go
1// import "github.com/baidubce/bce-sdk-go/services/aihc"
2// import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
3ak, sk, endpoint := ak_test, sk_test, endpoint_test
4resourcePoolID, JobID := RESOURCE_POOL_ID, AIJobID
5
6client, _ := aihc.NewClient(ak, sk, endpoint)
7result, err := client.DeleteJob(JobID, resourcePoolID)
8
9if err != nil {
10panic(err)
11}
12
13jsonBytes, _ := json.Marshal(result)
14fmt.Println(string(jsonBytes))
注意:
- 请根据接口文档填写具体的访问参数,接口文档参见「删除训练任务」。
查询训练任务事件
使用以下代码可以查询训练任务事件。
Go
1// import "github.com/baidubce/bce-sdk-go/services/aihc"
2// import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
3ak, sk, endpoint := ak_test, sk_test, endpoint_test
4
5req := &v1.GetJobEventsRequest{
6Namespace: "",
7JobFramework: "PyTorchJob",
8StartTime: "",
9EndTime: "",
10JobID: AIJobID,
11ResourcePoolID: RESOURCE_POOL_ID,
12}
13
14client, _ := aihc.NewClient(ak, sk, endpoint)
15result, err := client.GetTaskEvent(req)
16
17if err != nil {
18panic(err)
19}
20jsonBytes, _ := json.Marshal(result)
21fmt.Println(string(jsonBytes))
注意:
- 请根据接口文档填写具体的访问参数,接口文档参见「查询训练任务事件」。
查询训练任务日志
使用以下代码可以查询训练任务日志。
Go
1// import "github.com/baidubce/bce-sdk-go/services/aihc"
2// import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
3ak, sk, endpoint := ak_test, sk_test, endpoint_test
4
5req := &v1.GetPodLogsRequest{
6JobID: AIJobID,
7ResourcePoolID: RESOURCE_POOL_ID,
8PodName: PodName,
9Namespace: "default",
10StartTime: "",
11EndTime: "",
12MaxLines: "",
13Container: "",
14Chunk: "",
15}
16
17client, _ := aihc.NewClient(ak, sk, endpoint)
18result, err := client.GetPodLogs(req)
19
20if err != nil {
21panic(err)
22}
23jsonBytes, _ := json.Marshal(result)
24fmt.Println(string(jsonBytes))
注意:
- 请根据接口文档填写具体的访问参数,接口文档参见「查询训练任务日志」。
查询训练任务Pod事件
使用以下代码可以查询训练任务Pod事件。
Go
1// import "github.com/baidubce/bce-sdk-go/services/aihc"
2// import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
3ak, sk, endpoint := ak_test, sk_test, endpoint_test
4req := &v1.GetPodEventsRequest{
5JobID: AIJobID,
6ResourcePoolID: RESOURCE_POOL_ID,
7Namespace: "",
8JobFramework: "PyTorchJob",
9StartTime: "",
10EndTime: "",
11PodName: PodName,
12}
13
14client, _ := aihc.NewClient(ak, sk, endpoint)
15result, err := client.GetPodEvents(req)
16
17if err != nil {
18panic(err)
19}
20jsonBytes, _ := json.Marshal(result)
21fmt.Println(string(jsonBytes))
注意:
- 请根据接口文档填写具体的访问参数,接口文档参见「查询训练任务Pod事件」。
查询训练任务监控
使用以下代码可以查询训练任务监控。
Go
1// import "github.com/baidubce/bce-sdk-go/services/aihc"
2// import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
3ak, sk, endpoint := ak_test, sk_test, endpoint_test
4req := &v1.GetTaskMetricsRequest{
5StartTime: "",
6ResourcePoolID: RESOURCE_POOL_ID,
7EndTime: "",
8TimeStep: "",
9MetricType: MetricType,
10JobID: AIJobID,
11Namespace: "",
12RateInterval: "",
13}
14
15client, _ := aihc.NewClient(ak, sk, endpoint)
16result, err := client.GetTaskMetrics(req)
17
18if err != nil {
19panic(err)
20}
21jsonBytes, _ := json.Marshal(result)
22fmt.Println(string(jsonBytes))
注意:
- 请根据接口文档填写具体的访问参数,接口文档参见「查询训练任务监控」。
查询训练任务所在节点列表
使用以下代码可以查询训练任务所在节点列表。
Go
1// import "github.com/baidubce/bce-sdk-go/services/aihc"
2// import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
3ak, sk, endpoint := ak_test, sk_test, endpoint_test
4resourcePoolID := RESOURCE_POOL_ID
5jobID := AIJobID
6namespace := ""
7
8client, _ := aihc.NewClient(ak, sk, endpoint)
9result, err := client.GetJobNodesList(jobID, resourcePoolID, namespace)
10
11if err != nil {
12panic(err)
13}
14jsonBytes, _ := json.Marshal(result)
15fmt.Println(string(jsonBytes))
注意:
- 请根据接口文档填写具体的访问参数,接口文档参见「查询训练任务所在节点列表」。
获取训练任务WebTerminal地址
使用以下代码可以获取训练任务WebTerminal地址。
Go
1// import "github.com/baidubce/bce-sdk-go/services/aihc"
2// import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
3ak, sk, endpoint := ak_test, sk_test, endpoint_test
4
5req := &v1.GetWebShellURLRequest{
6JobID: AIJobID,
7ResourcePoolID: RESOURCE_POOL_ID,
8PodName: PodName,
9Namespace: "",
10PingTimeoutSecond: "",
11HandshakeTimeoutSecond: "",
12}
13
14client, _ := aihc.NewClient(ak, sk, endpoint)
15result, err := client.GetWebSSHUrl(req)
16
17if err != nil {
18panic(err)
19}
20jsonBytes, _ := json.Marshal(result)
21fmt.Println(string(jsonBytes))
注意:
- 请根据接口文档填写具体的访问参数,接口文档参见「获取训练任务WebTerminal地址」。