查询训练任务详情
更新时间:2025-05-23
描述
获取一个训练任务的详细信息。
请求结构
Bash
GET /api/v1/aijobs/{jobId}?resourcePoolId={resourcePoolId}
Host: aihc.bj.baidubce.com
Authorization: authorization string
Content-Type: application/json
请求头域
除公共头域外,无其它特殊头域。
请求参数
参数名称 | 类型 | 是否必须 | 参数位置 | 说明 |
---|---|---|---|---|
resourcePoolId | String | 是 | Query 参数 | 标识资源池的唯一标识符 |
jobId | String | 是 | Path 参数 | 训练任务ID |
返回头域
除公共头域外,无其它特殊头域。
返回参数
参数名称 | 类型 | 说明 |
---|---|---|
requestId | String | 请求ID |
result | JobInfoResult | 成功请求时的返回结果 |
返回示例
JSON
{
  "result": {
    "jobId": "pytorchjob-19d38d07-3e04-49ef-8428-d792881fc5fa",
    "name": "job-test-3",
    "resourcePoolId": "cce-6zwnp4zf",
    "command": "python -m torch.distributed.run /workspace/examples/imagenet.py --arch=resnet18 --epochs=100 --batch-size=32 --workers=0 /workspace/data/tiny-imagenet-200",
    "createdAt": "2024-07-16T17:20:04Z",
    "finishedAt": "",
    "datasources": [],
    "enableFaultTolerance": true,
    "labels": [
      {
        "key": "aaaaa",
        "value": "bbbb"
      },
      {
        "key": "aijob.cce.baidubce.com/create-from-aihcp-api",
        "value": "true"
      },
      {
        "key": "aijob.cce.baidubce.com/openapi-jobid",
        "value": "pytorchjob-19d38d07-3e04-49ef-8428-d792881fc5fa"
      }
    ],
    "priority": "normal",
    "queue": "default",
    "status": "Running",
    "image": "registry.baidubce.com/cce-ai-native/cy-pytorch-mnist:etcd",
    "resources": [
      {
        "name": "cpu",
        "quantity": 1
      }
    ],
    "enableRDMA": false,
    "queueingSequence": 1,
    "podList": {
      "listMeta": {
        "totalItems": 1
      },
      "pods": [
        {
          "PodIP": "10.11.3.106",
          "nodeName": "192.168.12.46",
          "objectMeta": {
            "annotations": {
              "aijob.cce.baidubce.com/fault-tolerance-enabled": "true",
              "aijob.cce.baidubce.com/openapi-jobid": "pytorchjob-19d38d07-3e04-49ef-8428-d792881fc5fa",
              "aijob.cce.baidubce.com/raw-request": "{\"name\":\"job-test-3\",\"namespace\":\"default\",\"queue\":\"default\",\"priority\":\"normal\",\"oversell\":false,\"faultTolerance\":true,\"command\":\"python -m torch.distributed.run /workspace/examples/imagenet.py --arch=resnet18 --epochs=100 --batch-size=32 --workers=0 /workspace/data/tiny-imagenet-200\",\"datasources\":[],\"jobFramework\":\"PyTorchJob\",\"jobDistributed\":false,\"jobSpec\":{\"Master\":{\"replicas\":1,\"restartPolicy\":\"Never\",\"image\":\"registry.baidubce.com/cce-ai-native/cy-pytorch-mnist:etcd\",\"tag\":\"\",\"resource\":{\"cpu\":1},\"env\":{\"AIHC_JOB_NAME\":\"job-test-3\",\"AIHC_TENSORBOARD_LOG_PATH\":\"\",\"LOGLEVEL\":\"DEBUG\",\"NCCL_DEBUG\":\"INFO\"},\"command\":\"python -m torch.distributed.run /workspace/examples/imagenet.py --arch=resnet18 --epochs=100 --batch-size=32 --workers=0 /workspace/data/tiny-imagenet-200\",\"args\":\"\",\"postStart\":\"\",\"preStop\":\"\"}},\"imagePullSecrets\":null,\"imagePullSecretUsername\":\"\",\"imagePullSecretPassword\":\"\",\"labels\":{\"aaaaa\":\"bbbb\",\"aijob.cce.baidubce.com/create-from-aihcp-api\":\"true\"},\"annotations\":null,\"nodeSelector\":null,\"autoCreatePVC\":true,\"hostNetwork\":false,\"isCopyJob\":false,\"sourceJobName\":\"\",\"workloadType\":\"PyTorchJob\",\"pfsId\":\"\"}",
              "cce-workload-kind": "PyTorchJob",
              "cce-workload-name": "job-test-3",
              "prometheus.io/path": "/metrics",
              "prometheus.io/port": "9101",
              "prometheus.io/scrape": "true",
              "scheduling.k8s.io/group-name": "job-test-3",
              "scheduling.k8s.io/job-enable-oversell": "false",
              "volcano.sh/task-spec": "master"
            },
            "creationTimestamp": "2024-07-16T17:20:04Z",
            "labels": {
              "aaaaa": "bbbb",
              "aijob.cce.baidubce.com/create-from-aihcp-api": "true",
              "aijob.cce.baidubce.com/openapi-jobid": "pytorchjob-19d38d07-3e04-49ef-8428-d792881fc5fa",
              "training.kubeflow.org/job-name": "job-test-3",
              "training.kubeflow.org/job-role": "master",
              "training.kubeflow.org/operator-name": "pytorchjob-controller",
              "training.kubeflow.org/replica-index": "0",
              "training.kubeflow.org/replica-type": "master"
            },
            "name": "job-test-3-master-0",
            "namespace": "default",
            "ownerReferences": [
              {
                "apiVersion": "kubeflow.org/v1",
                "kind": "PyTorchJob",
                "name": "job-test-3",
                "uid": "b3212c83-ca27-4989-a346-bed704eba7eb",
                "controller": true,
                "blockOwnerDeletion": true
              }
            ]
          },
          "podStatus": {
            "podPhase": "Running",
            "status": "Running"
          },
          "replicaType": "master",
          "restartCount": 0,
          "envs": [
            {
              "name": "LOGLEVEL",
              "value": "DEBUG"
            },
            {
              "name": "AIHC_JOB_NAME",
              "value": "job-test-3"
            },
            {
              "name": "NCCL_DEBUG",
              "value": "INFO"
            },
            {
              "name": "AIHC_TENSORBOARD_LOG_PATH",
              "value": ""
            },
            {
              "name": "BCCL_BUS_BW_CALCULATE_MODE",
              "value": "Agg"
            },
            {
              "name": "BCCL_PROFILING_FILE",
              "value": "/var/logs/bccl/busbw.cal.%h.%p"
            },
            {
              "name": "BCCL_UNIX_SOCKET_PATH",
              "value": "/var/logs/bccl"
            },
            {
              "name": "BCCL_TRACE_HANG_SIGNAL",
              "value": "10"
            },
            {
              "name": "PYTHONUNBUFFERED",
              "value": "0"
            },
            {
              "name": "MASTER_PORT",
              "value": "23456"
            },
            {
              "name": "MASTER_ADDR",
              "value": "job-test-3-master-0"
            },
            {
              "name": "WORLD_SIZE",
              "value": "1"
            },
            {
              "name": "RANK",
              "value": "0"
            },
            {
              "name": "NVIDIA_VISIBLE_DEVICES",
              "value": "void"
            }
          ]
        }
      ]
    }
  },
  "requestId": "4a516705-9c97-4e32-9473-c783ec85bec4"
}