工作流的定义包含三部分:
如下是一个工作流配置 YAML 的基本结构,其中:
- inputs:定义了工作流的全局输入参数;
- taskTemplates:定义子任务(节点)的配置,详细参数说明参考「自定义训练配置」;
- tasks:编排了工作流中任务节点的执行顺序与依赖关系,并定义了每个任务节点的输入值。

version: v1
kind: PipelineTemplate
inputs:
  ...
taskTemplates:
  ...
tasks:
  ...
...
taskTemplates:
  - name: hello_world_temp
    type: CustomTask
    spec:
      # Entrypoint command
      Entrypoint: echo "Hello World!"
      # Training framework; supported values:
      # TensorFlowPS / PyTorchDDP / MPI / BytePS / Custom
      Framework: Custom
      # Image URL: use this field for Volcengine or public-registry images;
      # ImageUrl takes precedence over the Image field
      ImageUrl: vemlp-cn-beijing.cr.volces.com/preset-images/python:3.7-ubuntu20.04
      # Resource queue name
      ResourceQueueName: <Your Resource Queue Name>
      # Instance configuration
      TaskRoleSpecs:
        - Flavor: ml.g1ie.large  # instance flavor
          RoleName: worker  # role name
          RoleReplicas: 1  # number of replicas for this role
...
...
tasks:
  # One node that instantiates the hello_world_temp template
  - name: hello_world  # node name
    taskTemplateName: hello_world_temp  # name of the referenced task template
...
# Minimal pipeline: one task template and one task that references it.
version: v1
kind: PipelineTemplate

# Templates for the workflow's nodes (sub-tasks)
taskTemplates:
  - name: hello_world_temp
    type: CustomTask
    spec:
      # Entrypoint command
      Entrypoint: echo "Hello World!"
      # Training framework; supported values:
      # TensorFlowPS / PyTorchDDP / MPI / BytePS / Custom
      Framework: Custom
      # Image URL: use this field for Volcengine or public-registry images;
      # ImageUrl takes precedence over the Image field
      ImageUrl: vemlp-cn-beijing.cr.volces.com/preset-images/python:3.7-ubuntu20.04
      # Resource queue name
      ResourceQueueName: <Your Resource Queue Name>
      # Instance configuration
      TaskRoleSpecs:
        - Flavor: ml.g1ie.large  # instance flavor
          RoleName: worker  # role name
          RoleReplicas: 1  # number of replicas for this role

tasks:
  - name: hello_world  # node name
    taskTemplateName: hello_world_temp  # name of the referenced task template
...
taskTemplates:
  - name: demo
    type: CustomTask
    spec:
      Entrypoint: echo "Hello Pipeline!"
      # Training framework; supported values:
      # TensorFlowPS / PyTorchDDP / MPI / BytePS / Custom
      Framework: Custom
      # Image URL: use this field for Volcengine or public-registry images;
      # ImageUrl takes precedence over the Image field
      ImageUrl: vemlp-cn-beijing.cr.volces.com/preset-images/python:3.7-ubuntu20.04
      # Resource queue name
      ResourceQueueName: ldy-test-user0
      # Instance configuration
      TaskRoleSpecs:
        # A '#' only starts a comment when preceded by whitespace; without the
        # separating spaces the comment text would become part of the Flavor value.
        - Flavor: ml.g1ie.large  # instance flavor
          RoleName: worker  # role name
          RoleReplicas: 1  # number of replicas for this role
...
...
tasks:
  - name: task_a
    taskTemplateName: demo  # reuse the task configuration defined by the demo template
  - name: task_b
    taskTemplateName: demo
    # Upstream dependencies of this task
    dependencies:
      - task_a  # runs after task_a has finished
...
# Two-node pipeline: task_b starts only after task_a has finished.
version: v1
kind: PipelineTemplate

# Templates for the workflow's nodes (sub-tasks)
taskTemplates:
  - name: demo
    type: CustomTask
    spec:
      Entrypoint: echo "Hello Pipeline!"
      # Training framework; supported values:
      # TensorFlowPS / PyTorchDDP / MPI / BytePS / Custom
      Framework: Custom
      # Image URL: use this field for Volcengine or public-registry images;
      # ImageUrl takes precedence over the Image field
      ImageUrl: vemlp-cn-beijing.cr.volces.com/preset-images/python:3.7-ubuntu20.04
      # Resource queue name
      ResourceQueueName: ldy-test-user0
      # Instance configuration
      TaskRoleSpecs:
        # A '#' only starts a comment when preceded by whitespace; without the
        # separating spaces the comment text would become part of the Flavor value.
        - Flavor: ml.g1ie.large  # instance flavor
          RoleName: worker  # role name
          RoleReplicas: 1  # number of replicas for this role

tasks:
  - name: task_a  # node name
    taskTemplateName: demo  # name of the referenced task template
  - name: task_b
    taskTemplateName: demo
    # Upstream dependencies of this task
    dependencies:
      - task_a  # runs after task_a has finished
...
# Input parameters of the pipeline
inputs:
  - name: image_url  # parameter name
    type: string  # parameter type
    hint: 镜像地址  # parameter description
    # Default value of the parameter
    defaultValue: vemlp-cn-beijing.cr.volces.com/preset-images/python:3.7-ubuntu20.04
  - name: command
    type: string
    hint: 入口命令
...
...
taskTemplates:
  ...
    # Input parameters of the node (sub-task) template
    inputs:
      - name: resource_queue_name  # parameter name
        type: string  # parameter type
        hint: 队列名称  # parameter description
        defaultValue: <Your Resource Queue Name>  # parameter default value
      - name: image_url
        type: string
        hint: 镜像地址
      - name: command
        type: string
        hint: 入口命令
    spec:
      ...
      # Each '{{inputs.<name>}}' references the template input with that name
      Entrypoint: '{{inputs.command}}'
      ImageUrl: '{{inputs.image_url}}'
      ResourceQueueName: '{{inputs.resource_queue_name}}'
      ...
    ...
...
...
tasks:
  - name: task_a
    taskTemplateName: task_demo
    inputs:
      - name: image_url
        # References the pipeline-level input named image_url
        value: '{{pipeline.inputs.image_url}}'
      - name: command
        value: '{{pipeline.inputs.command}}'
      # Inputs not listed here fall back to the default defined by the
      # template, e.g. resource_queue_name
  - name: task_b
    taskTemplateName: task_demo
    dependencies:
      - task_a
    inputs:
      - name: image_url
        value: '{{pipeline.inputs.image_url}}'
      - name: command
        # Overrides the template input named command with a constant
        value: echo "task b!"
      # Overrides the default of the template input named resource_queue_name
      - name: resource_queue_name
        value: 全流量0414
...
# Parameterised pipeline: pipeline-level inputs are passed to two tasks that
# share the task_demo template.
version: v1
kind: PipelineTemplate

# Input parameters of the pipeline
inputs:
  - name: image_url  # parameter name
    type: string  # parameter type
    hint: 镜像地址  # parameter description
    # Default value of the parameter
    defaultValue: vemlp-cn-beijing.cr.volces.com/preset-images/python:3.7-ubuntu20.04
  - name: command
    type: string
    hint: 入口命令

taskTemplates:
  - name: task_demo
    type: CustomTask
    # Input parameters of the node (sub-task) template
    inputs:
      - name: resource_queue_name  # parameter name
        type: string  # parameter type
        hint: 队列名称  # parameter description
        defaultValue: <Your Resource Queue Name>  # parameter default value
      - name: image_url
        type: string
        hint: 镜像地址
      - name: command
        type: string
        hint: 入口命令
    spec:
      # Each '{{inputs.<name>}}' references the template input with that name
      Entrypoint: '{{inputs.command}}'
      Framework: Custom
      ImageUrl: '{{inputs.image_url}}'
      ResourceQueueName: '{{inputs.resource_queue_name}}'
      TaskRoleSpecs:
        - Flavor: ml.g1ie.large
          RoleName: worker
          RoleReplicas: 1

tasks:
  - name: task_a
    taskTemplateName: task_demo
    inputs:
      - name: image_url
        # References the pipeline-level input named image_url
        value: '{{pipeline.inputs.image_url}}'
      - name: command
        value: '{{pipeline.inputs.command}}'
      # Inputs not listed here fall back to the default defined by the
      # template, e.g. resource_queue_name
  - name: task_b
    taskTemplateName: task_demo
    dependencies:
      - task_a
    inputs:
      - name: image_url
        value: '{{pipeline.inputs.image_url}}'
      - name: command
        # Overrides the template input named command with a constant
        value: echo "task b!"
      # Overrides the default of the template input named resource_queue_name
      - name: resource_queue_name
        value: 全流量0414
...
# Templates for the workflow's nodes (sub-tasks)
taskTemplates:
  - name: upload_code
    type: CustomTask
    spec:
      # Local path of the code to upload.
      # - Directory ending with '/': everything inside the directory is
      #   uploaded to RemoteMountCodePath.
      # - Directory not ending with '/': the directory itself and everything
      #   inside it are uploaded to RemoteMountCodePath.
      UserCodePath: samples/pipeline/code/single_with_upload_code/
      # Alternatively, reuse a previously uploaded code snapshot:
      # UserCodePath: tos://ml-platform-auto-created-required-2100000050-cn-guilin-boe/static/customtask/manifest/snapshot_jqsgdgcsc69nvzx8.manifest
      # Destination path of the upload; it is used as the code mount path
      # inside the container
      RemoteMountCodePath: "/root/code"
      ...
    ...
...
# Pipeline with a single task that uploads local code and runs it.
version: v1
kind: PipelineTemplate

# Templates for the workflow's nodes (sub-tasks)
taskTemplates:
  - name: upload_code
    type: CustomTask
    spec:
      # Local path of the code to upload.
      # - Directory ending with '/': everything inside the directory is
      #   uploaded to RemoteMountCodePath.
      # - Directory not ending with '/': the directory itself and everything
      #   inside it are uploaded to RemoteMountCodePath.
      UserCodePath: run.sh
      # Alternatively, reuse a previously uploaded code snapshot:
      # UserCodePath: tos://ml-platform-auto-created-required-2100000050-cn-guilin-boe/static/customtask/manifest/snapshot_jqsgdgcsc69nvzx8.manifest
      # Destination path of the upload; it is used as the code mount path
      # inside the container
      RemoteMountCodePath: "/root/code"
      # Visibility scope; supported values: Public / Queue / Private
      AccessType: Public
      # Visible-user list; uncomment the lines below to configure it.
      # Note: the list has no effect when the scope is Private or Public.
      # AccessUsers:
      #   - "username_a"
      #   - "username_b"
      # Entrypoint command
      Entrypoint: cd /root/code && bash run.sh
      # Training framework; supported values:
      # TensorFlowPS / PyTorchDDP / MPI / BytePS / Custom
      Framework: Custom
      # Image URL: use this field for Volcengine or public-registry images;
      # ImageUrl takes precedence over the Image field
      ImageUrl: vemlp-cn-beijing.cr.volces.com/preset-images/python:3.7-ubuntu20.04
      # Resource queue name
      ResourceQueueName: <Your Resource Queue Name>
      # Instance configuration
      TaskRoleSpecs:
        - Flavor: ml.c1.large  # instance flavor
          RoleName: worker  # role name
          RoleReplicas: 1  # number of replicas for this role

tasks:
  - name: upload_code  # node name
    taskTemplateName: upload_code  # name of the referenced task template
# Serial DAG: each task depends on the one before it.
version: v1
kind: PipelineTemplate

# Templates for the workflow's nodes (sub-tasks)
taskTemplates:
  - name: demo
    type: CustomTask
    spec: ...

tasks:
  - name: task_a
    taskTemplateName: demo
  - name: task_b
    taskTemplateName: demo
    dependencies:
      - task_a
  - name: task_c
    taskTemplateName: demo
    dependencies:
      - task_b

# +---------+
# | task_a  |
# +----+----+
#      |
# +----v----+
# | task_b  |
# +----+----+
#      |
# +----v----+
# | task_c  |
# +---------+
# Parallel DAG: no task declares dependencies.
version: v1
kind: PipelineTemplate

# Templates for the workflow's nodes (sub-tasks)
taskTemplates:
  - name: demo
    type: CustomTask
    spec: ...

tasks:
  - name: task_a
    taskTemplateName: demo
  - name: task_b
    taskTemplateName: demo
  - name: task_c
    taskTemplateName: demo

# +--------+   +--------+   +--------+
# | task_a |   | task_b |   | task_c |
# +--------+   +--------+   +--------+
# Diamond DAG: task_b and task_c both depend on task_a; task_d depends on both.
version: v1
kind: PipelineTemplate

# Templates for the workflow's nodes (sub-tasks)
taskTemplates:
  - name: demo
    type: CustomTask
    spec: ...

tasks:
  - name: task_a
    taskTemplateName: demo
  - name: task_b
    taskTemplateName: demo
    dependencies:
      - task_a
  - name: task_c
    taskTemplateName: demo
    dependencies:
      - task_a
  - name: task_d
    taskTemplateName: demo
    dependencies:
      - task_b
      - task_c

#        +--------+
#        | task_a |
#        +--------+
#         /      \
# +--------+    +--------+
# | task_b |    | task_c |
# +--------+    +--------+
#         \      /
#        +--------+
#        | task_d |
#        +--------+
工作流支持三种 loop 语法:
withSequence:遍历一组数字,可通过 {{item}} 来引用当前值。
withItems:获取一个列表并对其中每一项执行一次任务。
  - 列表项可以是单个值,可通过 {{item}} 来引用;
  - 列表项也可以是一个 JSON 对象,可通过 {{item.key}} 来引用其中的字段。
withParam 读取并遍历一个 JSON 数组,支持从工作流中的其他步骤动态生成。
...
tasks:
  # Loops over a sequence of numbers; {{item}} refers to the current number
  - name: hello-world-x5
    taskTemplateName: print-message
    inputs:
      - name: message
        value: '{\"hello world\":{{item}}}'
    withSequence:
      count: 5
...
# withSequence example: a looped task followed by a task that reads its output.
version: v1
kind: PipelineTemplate

# Templates for the workflow's nodes (sub-tasks)
taskTemplates:
  - name: print-message
    type: CustomTask
    inputs:
      - name: message
        type: string
    outputs:
      # The output value is read from the file /out
      - name: message
        type: string
        valueFrom:
          file: /out
    spec:
      # Entrypoint command: echoes the message and also writes it to /out
      Entrypoint: |
        echo {{inputs.message}} | tee /out
      # Training framework; supported values:
      # TensorFlowPS / PyTorchDDP / MPI / BytePS / Custom
      Framework: Custom
      # Image URL: use this field for Volcengine or public-registry images;
      # ImageUrl takes precedence over the Image field
      ImageUrl: vemlp-cn-beijing.cr.volces.com/preset-images/python:3.7
      # Resource queue name
      ResourceQueueName: <Your Resource Queue Name>
      # Instance configuration
      TaskRoleSpecs:
        - Flavor: ml.c3i.large
          RoleName: worker
          RoleReplicas: 1

tasks:
  - name: hello-world-x5  # node name
    taskTemplateName: print-message  # name of the referenced task template
    inputs:
      - name: message
        value: '{\"hello world\":{{item}}}'
    withSequence:
      count: 5
  - name: aggregate-output
    taskTemplateName: print-message
    dependencies:
      - hello-world-x5
    inputs:
      # Passes on the message output of the hello-world-x5 task
      - name: message
        value: "{{tasks.hello-world-x5.outputs.message}}"
...
tasks:
  # Loops over the withItems list; each entry is a JSON object whose fields
  # are referenced as {{item.<key>}}
  - name: loop-with-items
    taskTemplateName: print-message
    inputs:
      - name: message
        value: '{\"image\":\"{{item.image}}\",\"tag\":\"{{item.tag}}\"}'
    withItems:
      - '{ "image": "debian", "tag": "9.1" }'  # item set 1
      - '{ "image": "debian", "tag": "8.9" }'  # item set 2
      - '{ "image": "alpine", "tag": "3.6" }'  # item set 3
      - '{ "image": "ubuntu", "tag": "17.10" }'  # item set 4
...
# withItems example: a task looped over a fixed list, followed by a task that
# reads its output.
version: v1
kind: PipelineTemplate

taskTemplates:
  - name: print-message
    type: CustomTask
    inputs:
      - name: message
        type: string
    outputs:
      # The output value is read from the file /out
      - name: message
        type: string
        valueFrom:
          file: /out
    spec:
      # Echoes the message and also writes it to /out
      Entrypoint: |
        echo {{inputs.message}} | tee /out
      Framework: Custom
      ImageUrl: vemlp-cn-beijing.cr.volces.com/preset-images/python:3.7
      ResourceQueueName: <Your Resource Queue Name>
      TaskRoleSpecs:
        - Flavor: ml.c3i.large
          RoleName: worker
          RoleReplicas: 1

tasks:
  # Each withItems entry is a JSON object whose fields are referenced as
  # {{item.<key>}}
  - name: loop-with-items
    taskTemplateName: print-message
    inputs:
      - name: message
        value: '{\"image\":\"{{item.image}}\",\"tag\":\"{{item.tag}}\"}'
    withItems:
      - '{ "image": "debian", "tag": "9.1" }'  # item set 1
      - '{ "image": "debian", "tag": "8.9" }'  # item set 2
      - '{ "image": "alpine", "tag": "3.6" }'  # item set 3
      - '{ "image": "ubuntu", "tag": "17.10" }'  # item set 4
  - name: aggregate-output
    taskTemplateName: print-message
    dependencies:
      - loop-with-items
    inputs:
      # Passes on the message output of the loop-with-items task
      - name: message
        value: "{{tasks.loop-with-items.outputs.message}}"
...
tasks:
  - name: generate
    taskTemplateName: gen-number-list
  # Loops over the number-list output of the generate task; {{item}} refers
  # to the current element
  - name: loop-with-param
    taskTemplateName: sleep-then-print-message
    dependencies:
      - generate
    inputs:
      - name: number
        value: "{{item}}"
      - name: message
        value: '{\"number\":{{item}}}'
    withParam: "{{tasks.generate.outputs.number-list}}"
...
# withParam example: one task writes a JSON array, a second task loops over
# it, and a third task reads the loop's output.
version: v1
kind: PipelineTemplate

taskTemplates:
  - name: gen-number-list
    type: CustomTask
    outputs:
      # The output value is read from the file /out
      - name: number-list
        type: string
        valueFrom:
          file: /out
    spec:
      # Writes a JSON array of numbers to /out
      Entrypoint: |
        echo "[5,6,7,8,9]" > /out
      Framework: Custom
      ImageUrl: vemlp-cn-beijing.cr.volces.com/preset-images/python:3.7
      ResourceQueueName: <Your Resource Queue Name>
      TaskRoleSpecs:
        - Flavor: ml.c3i.large
          RoleName: worker
          RoleReplicas: 1
  - name: sleep-then-print-message
    type: CustomTask
    inputs:
      - name: number
        type: int
      - name: message
        type: string
    outputs:
      - name: message
        type: string
        valueFrom:
          file: /out
    spec:
      # Echoes the message and also writes it to /out
      Entrypoint: |
        echo {{inputs.message}} | tee /out
      Framework: Custom
      ImageUrl: vemlp-cn-beijing.cr.volces.com/preset-images/python:3.7
      ResourceQueueName: <Your Resource Queue Name>
      TaskRoleSpecs:
        - Flavor: ml.c3i.large
          RoleName: worker
          RoleReplicas: 1

tasks:
  - name: generate
    taskTemplateName: gen-number-list
  # Loops over the number-list output of the generate task; {{item}} refers
  # to the current element
  - name: loop-with-param
    taskTemplateName: sleep-then-print-message
    dependencies:
      - generate
    inputs:
      - name: number
        value: "{{item}}"
      - name: message
        value: '{\"number\":{{item}}}'
    withParam: "{{tasks.generate.outputs.number-list}}"
  - name: aggregate-output
    taskTemplateName: sleep-then-print-message
    dependencies:
      - loop-with-param
    inputs:
      - name: number
        value: 0
      # Passes on the message output of the loop-with-param task
      - name: message
        value: "{{tasks.loop-with-param.outputs.message}}"