Skip to content

Commit

Permalink
[PyTorch][Training][EC2][SageMaker] PyTorch 2.6.0 Currency Release (#4556)
Browse files Browse the repository at this point in the history

* change test-ecr-scan image data storage

* add init files

* revert ecr change

* fix torch link

* add --no-build-isolation for TE

* skip OSS compliance to build image for OSS test

* add 2.6 ec2 test

* add 2.6 ec2 tests

* build with OSS, enable telemetry test, add venv in ec2 test

* add allowlist

* remove conda from ec2 tests

* change setup file conda

* skip dgl test

* enable all ec2 tests

* modify te tests

* build SM image and modify tests

* fix some tests

* not build

* add fastai comment

* rebuild sm without fastai and change telemetry tests

* fix sm local tests

* fix skip dict and run all tests

* rebuilt sm with cuda compat and run all tests

* build ec2 image with cuda patch

* rebuild sm image and run all tests

* build 2.5 to verify telemetry tests

* add smppy and build

* run all tests

* revert toml

* address comments and build ec2

* build sm image

* run all tests

* fix te tests

* ec2 images run all tests

* backward compatible test with 2.5

* disable autopatch

* do build

* build autopatch 2.5

* fix autopatch

* revert toml

* add reminder about telemetry

---------

Co-authored-by: Yadan Wei <[email protected]>
  • Loading branch information
Yadan-Wei and Yadan Wei authored Mar 6, 2025
1 parent a184c91 commit 355d7f9
Show file tree
Hide file tree
Showing 17 changed files with 1,281 additions and 32 deletions.
72 changes: 72 additions & 0 deletions pytorch/training/buildspec-2-6-ec2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Build specification for the PyTorch 2.6.0 training images targeting EC2
# (x86): one CPU image and one GPU (cu126) image.
#
# Top-level scalars are anchored so later stanzas can reuse them via aliases.
# NOTE(review): `!join` is a custom tag, not core YAML; it presumably
# concatenates its list items into one string when the consuming build
# tooling loads this file — confirm against the loader.
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK pytorch
version: &VERSION 2.6.0
short_version: &SHORT_VERSION "2.6"
arch_type: x86
# autopatch_build: "True"

# Repository settings shared by every image; merged into each image entry
# below through `<<: *TRAINING_REPOSITORY`.
repository_info:
  training_repository: &TRAINING_REPOSITORY
    image_type: &TRAINING_IMAGE_TYPE training
    root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
    repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

# Build-context artifacts as source -> target pairs; merged into each image's
# `context` through `<<: *TRAINING_CONTEXT`.
context:
  training_context: &TRAINING_CONTEXT
    start_cuda_compat:
      source: docker/build_artifacts/start_cuda_compat.sh
      target: start_cuda_compat.sh
    dockerd_entrypoint:
      source: docker/build_artifacts/dockerd_entrypoint.sh
      target: dockerd_entrypoint.sh
    changehostname:
      source: docker/build_artifacts/changehostname.c
      target: changehostname.c
    start_with_right_hostname:
      source: docker/build_artifacts/start_with_right_hostname.sh
      target: start_with_right_hostname.sh
    example_mnist_file:
      source: docker/build_artifacts/mnist.py
      target: mnist.py
    deep_learning_container:
      source: ../../src/deep_learning_container.py
      target: deep_learning_container.py

# Image definitions. Anchors such as &DEVICE_TYPE are re-declared per image;
# an alias resolves to the nearest preceding anchor of that name, so the
# ordering of keys inside each entry matters.
images:
  BuildEC2CPUPTTrainPy3DockerImage:
    <<: *TRAINING_REPOSITORY
    build: &PYTORCH_CPU_TRAINING_PY3 false
    image_size_baseline: 6500
    device_type: &DEVICE_TYPE cpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py312
    os_version: &OS_VERSION ubuntu22.04
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
    # NOTE(review): the example override below says py311, while
    # tag_python_version above is py312 — confirm before uncommenting.
    # build_tag_override: "beta:2.6.0-cpu-py311-ubuntu22.04-ec2"
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
    target: ec2
    context:
      <<: *TRAINING_CONTEXT
  BuildEC2GPUPTTrainPy3cu126DockerImage:
    <<: *TRAINING_REPOSITORY
    build: &PYTORCH_GPU_TRAINING_PY3 false
    image_size_baseline: 19700
    device_type: &DEVICE_TYPE gpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py312
    cuda_version: &CUDA_VERSION cu126
    os_version: &OS_VERSION ubuntu22.04
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
    # NOTE(review): the example override below says py311/cu121, while the
    # anchors above are py312/cu126 — confirm before uncommenting.
    # build_tag_override: "beta:2.6.0-gpu-py311-cu121-ubuntu22.04-ec2"
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
      *DEVICE_TYPE ]
    target: ec2
    context:
      <<: *TRAINING_CONTEXT
72 changes: 72 additions & 0 deletions pytorch/training/buildspec-2-6-sm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Build specification for the PyTorch 2.6.0 training images targeting
# SageMaker (x86): one CPU image and one GPU (cu126) image.
#
# Top-level scalars are anchored so later stanzas can reuse them via aliases.
# NOTE(review): `!join` is a custom tag, not core YAML; it presumably
# concatenates its list items into one string when the consuming build
# tooling loads this file — confirm against the loader.
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK pytorch
version: &VERSION 2.6.0
short_version: &SHORT_VERSION "2.6"
arch_type: x86
# autopatch_build: "True"

# Repository settings shared by every image; merged into each image entry
# below through `<<: *TRAINING_REPOSITORY`.
repository_info:
  training_repository: &TRAINING_REPOSITORY
    image_type: &TRAINING_IMAGE_TYPE training
    root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
    repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

# Build-context artifacts as source -> target pairs; merged into each image's
# `context` through `<<: *TRAINING_CONTEXT`.
context:
  training_context: &TRAINING_CONTEXT
    start_cuda_compat:
      source: docker/build_artifacts/start_cuda_compat.sh
      target: start_cuda_compat.sh
    dockerd_entrypoint:
      source: docker/build_artifacts/dockerd_entrypoint.sh
      target: dockerd_entrypoint.sh
    changehostname:
      source: docker/build_artifacts/changehostname.c
      target: changehostname.c
    start_with_right_hostname:
      source: docker/build_artifacts/start_with_right_hostname.sh
      target: start_with_right_hostname.sh
    example_mnist_file:
      source: docker/build_artifacts/mnist.py
      target: mnist.py
    deep_learning_container:
      source: ../../src/deep_learning_container.py
      target: deep_learning_container.py

# Image definitions. Anchors such as &DEVICE_TYPE are re-declared per image;
# an alias resolves to the nearest preceding anchor of that name, so the
# ordering of keys inside each entry matters.
images:
  BuildSageMakerCPUPTTrainPy3DockerImage:
    <<: *TRAINING_REPOSITORY
    build: &PYTORCH_CPU_TRAINING_PY3 false
    image_size_baseline: 6200
    device_type: &DEVICE_TYPE cpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py312
    os_version: &OS_VERSION ubuntu22.04
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    # NOTE(review): the example override below says py311, while
    # tag_python_version above is py312 — confirm before uncommenting.
    # build_tag_override: "beta:2.6.0-cpu-py311-ubuntu22.04-sagemaker"
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
    target: sagemaker
    context:
      <<: *TRAINING_CONTEXT
  BuildSageMakerGPUPTTrainPy3DockerImage:
    <<: *TRAINING_REPOSITORY
    build: &PYTORCH_GPU_TRAINING_PY3 false
    image_size_baseline: 21500
    device_type: &DEVICE_TYPE gpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py312
    cuda_version: &CUDA_VERSION cu126
    os_version: &OS_VERSION ubuntu22.04
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    # NOTE(review): the example override below says py311/cu124, while the
    # anchors above are py312/cu126 — confirm before uncommenting.
    # build_tag_override: "beta:2.6.0-gpu-py311-cu124-ubuntu22.04-sagemaker"
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
      *DEVICE_TYPE ]
    target: sagemaker
    context:
      <<: *TRAINING_CONTEXT
2 changes: 1 addition & 1 deletion pytorch/training/buildspec.yml
Original file line number Diff line number Diff line change
@@ -1 +1 @@
# Old side of the one-line diff (removed by this commit): pointer to the 2.5
# SageMaker buildspec.
buildspec_pointer: buildspec-2-5-sm.yml
# New side of the one-line diff (added by this commit): pointer to the 2.6
# SageMaker buildspec.
buildspec_pointer: buildspec-2-6-sm.yml
Loading

0 comments on commit 355d7f9

Please sign in to comment.