-
Notifications
You must be signed in to change notification settings - Fork 483
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[PyTorch][Training][EC2][SageMaker] PyTorch 2.6.0 Currency Release (#4556)
* change test-ecr-scan image data storage * add init files * revert ecr change * fix torch link * add --no-build-isolation for TE * skip OSS compliance to build image for OSS test * add 2.6 ec2 test * add 2.6 ec2 tests * build with OSS, enable telemetry test, add venv in ec2 test * add allowlist * remove conda from ec2 tests * change setup file conda * skip dgl test * enable all ec2 tests * modify te tests * build SM image and modify tests * fix some tests * not build * add fastai comment * rebuild sm without fastai and change telemetry tests * fix sm local tests * fix skip dict and run all tests * rebuilt sm with cuda compat and run all tests * build ec2 image with cuda patch * rebuild sm image and run all tests * build 2.5 to verify telemetry tests * add smppy and build * run all tests * revert toml * address comments and build ec2 * build sm image * run all tests * fix te tests * ec2 images run all tests * backward compatible test with 2.5 * disable autopatch * do build * build autopatch 2.5 * fix autopatch * revert toml * add remind of telemetry --------- Co-authored-by: Yadan Wei <[email protected]>
- Loading branch information
Showing
17 changed files
with
1,281 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment> | ||
prod_account_id: &PROD_ACCOUNT_ID 763104351884 | ||
region: &REGION <set-$REGION-in-environment> | ||
framework: &FRAMEWORK pytorch | ||
version: &VERSION 2.6.0 | ||
short_version: &SHORT_VERSION "2.6" | ||
arch_type: x86 | ||
# autopatch_build: "True" | ||
|
||
repository_info: | ||
training_repository: &TRAINING_REPOSITORY | ||
image_type: &TRAINING_IMAGE_TYPE training | ||
root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ] | ||
repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] | ||
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] | ||
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] | ||
release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] | ||
|
||
context: | ||
training_context: &TRAINING_CONTEXT | ||
start_cuda_compat: | ||
source: docker/build_artifacts/start_cuda_compat.sh | ||
target: start_cuda_compat.sh | ||
dockerd_entrypoint: | ||
source: docker/build_artifacts/dockerd_entrypoint.sh | ||
target: dockerd_entrypoint.sh | ||
changehostname: | ||
source: docker/build_artifacts/changehostname.c | ||
target: changehostname.c | ||
start_with_right_hostname: | ||
source: docker/build_artifacts/start_with_right_hostname.sh | ||
target: start_with_right_hostname.sh | ||
example_mnist_file: | ||
source: docker/build_artifacts/mnist.py | ||
target: mnist.py | ||
deep_learning_container: | ||
source: ../../src/deep_learning_container.py | ||
target: deep_learning_container.py | ||
|
||
images: | ||
BuildEC2CPUPTTrainPy3DockerImage: | ||
<<: *TRAINING_REPOSITORY | ||
build: &PYTORCH_CPU_TRAINING_PY3 false | ||
image_size_baseline: 6500 | ||
device_type: &DEVICE_TYPE cpu | ||
python_version: &DOCKER_PYTHON_VERSION py3 | ||
tag_python_version: &TAG_PYTHON_VERSION py312 | ||
os_version: &OS_VERSION ubuntu22.04 | ||
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] | ||
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] | ||
    # build_tag_override: "beta:2.6.0-cpu-py312-ubuntu22.04-ec2" | ||
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] | ||
target: ec2 | ||
context: | ||
<<: *TRAINING_CONTEXT | ||
BuildEC2GPUPTTrainPy3cu126DockerImage: | ||
<<: *TRAINING_REPOSITORY | ||
build: &PYTORCH_GPU_TRAINING_PY3 false | ||
image_size_baseline: 19700 | ||
device_type: &DEVICE_TYPE gpu | ||
python_version: &DOCKER_PYTHON_VERSION py3 | ||
tag_python_version: &TAG_PYTHON_VERSION py312 | ||
cuda_version: &CUDA_VERSION cu126 | ||
os_version: &OS_VERSION ubuntu22.04 | ||
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] | ||
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] | ||
    # build_tag_override: "beta:2.6.0-gpu-py312-cu126-ubuntu22.04-ec2" | ||
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., | ||
*DEVICE_TYPE ] | ||
target: ec2 | ||
context: | ||
<<: *TRAINING_CONTEXT |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment> | ||
prod_account_id: &PROD_ACCOUNT_ID 763104351884 | ||
region: &REGION <set-$REGION-in-environment> | ||
framework: &FRAMEWORK pytorch | ||
version: &VERSION 2.6.0 | ||
short_version: &SHORT_VERSION "2.6" | ||
arch_type: x86 | ||
# autopatch_build: "True" | ||
|
||
repository_info: | ||
training_repository: &TRAINING_REPOSITORY | ||
image_type: &TRAINING_IMAGE_TYPE training | ||
root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ] | ||
repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] | ||
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] | ||
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] | ||
release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] | ||
|
||
context: | ||
training_context: &TRAINING_CONTEXT | ||
start_cuda_compat: | ||
source: docker/build_artifacts/start_cuda_compat.sh | ||
target: start_cuda_compat.sh | ||
dockerd_entrypoint: | ||
source: docker/build_artifacts/dockerd_entrypoint.sh | ||
target: dockerd_entrypoint.sh | ||
changehostname: | ||
source: docker/build_artifacts/changehostname.c | ||
target: changehostname.c | ||
start_with_right_hostname: | ||
source: docker/build_artifacts/start_with_right_hostname.sh | ||
target: start_with_right_hostname.sh | ||
example_mnist_file: | ||
source: docker/build_artifacts/mnist.py | ||
target: mnist.py | ||
deep_learning_container: | ||
source: ../../src/deep_learning_container.py | ||
target: deep_learning_container.py | ||
|
||
images: | ||
BuildSageMakerCPUPTTrainPy3DockerImage: | ||
<<: *TRAINING_REPOSITORY | ||
build: &PYTORCH_CPU_TRAINING_PY3 false | ||
image_size_baseline: 6200 | ||
device_type: &DEVICE_TYPE cpu | ||
python_version: &DOCKER_PYTHON_VERSION py3 | ||
tag_python_version: &TAG_PYTHON_VERSION py312 | ||
os_version: &OS_VERSION ubuntu22.04 | ||
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] | ||
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] | ||
    # build_tag_override: "beta:2.6.0-cpu-py312-ubuntu22.04-sagemaker" | ||
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] | ||
target: sagemaker | ||
context: | ||
<<: *TRAINING_CONTEXT | ||
BuildSageMakerGPUPTTrainPy3DockerImage: | ||
<<: *TRAINING_REPOSITORY | ||
build: &PYTORCH_GPU_TRAINING_PY3 false | ||
image_size_baseline: 21500 | ||
device_type: &DEVICE_TYPE gpu | ||
python_version: &DOCKER_PYTHON_VERSION py3 | ||
tag_python_version: &TAG_PYTHON_VERSION py312 | ||
cuda_version: &CUDA_VERSION cu126 | ||
os_version: &OS_VERSION ubuntu22.04 | ||
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] | ||
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] | ||
    # build_tag_override: "beta:2.6.0-gpu-py312-cu126-ubuntu22.04-sagemaker" | ||
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., | ||
*DEVICE_TYPE ] | ||
target: sagemaker | ||
context: | ||
<<: *TRAINING_CONTEXT |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
buildspec_pointer: buildspec-2-5-sm.yml | ||
buildspec_pointer: buildspec-2-6-sm.yml |
Oops, something went wrong.