From 9209f1522d2407820d3ec6a7ddcc78b7ccb076c6 Mon Sep 17 00:00:00 2001
From: Andy Linfoot <78757007+andy-neuma@users.noreply.github.com>
Date: Thu, 22 Feb 2024 22:43:40 -0500
Subject: [PATCH] additional updates to "bump-to-v0.3.2" (#39)

SUMMARY
* update `TORCH_CUDA_ARCH_LIST` to match `magic_wand`
* update "test vllm" action to run tests serially
* add helper script to find *.py tests, run them serially, and output JUnit formatted xml

TEST
working through changes manually on debug instance

---------

Co-authored-by: andy-neuma
---
 .github/actions/nm-build-vllm/action.yml |  2 -
 .github/actions/nm-set-env/action.yml    | 13 +++--
 .github/actions/nm-test-vllm/action.yml  | 12 ++---
 .github/pull_request_template.md         |  6 +++
 .github/scripts/run-tests                | 66 ++++++++++++++++++++++++
 .github/workflows/build-test.yml         | 24 ++++++---
 .github/workflows/remote-push.yml        |  5 +-
 7 files changed, 106 insertions(+), 22 deletions(-)
 create mode 100644 .github/pull_request_template.md
 create mode 100755 .github/scripts/run-tests

diff --git a/.github/actions/nm-build-vllm/action.yml b/.github/actions/nm-build-vllm/action.yml
index 780c2f99de3c6..5218078ba1704 100644
--- a/.github/actions/nm-build-vllm/action.yml
+++ b/.github/actions/nm-build-vllm/action.yml
@@ -19,8 +19,6 @@ runs:
   steps:
   - id: build
     run: |
-      # TODO: this is a hack ... fix it later
-      # pyenv hardcoded ... python version hardcoded ...
       COMMIT=${{ github.sha }}
       VENV="${{ inputs.venv }}-${COMMIT:0:7}"
       source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate
diff --git a/.github/actions/nm-set-env/action.yml b/.github/actions/nm-set-env/action.yml
index d5b108d97ba4a..863354f35dd0b 100644
--- a/.github/actions/nm-set-env/action.yml
+++ b/.github/actions/nm-set-env/action.yml
@@ -1,15 +1,20 @@
 name: set neuralmagic env
 description: 'sets environment variables for neuralmagic'
 inputs:
-  hf_home:
+  hf_token:
     description: 'Hugging Face home'
     required: true
+  Gi_per_thread:
+    description: 'requested GiB to reserve per thread'
+    required: true
 runs:
   using: composite
   steps:
   - run: |
-      echo "HF_HOME=${HF_HOME_TOKEN}" >> $GITHUB_ENV
-      echo "TORCH_CUDA_ARCH_LIST=8.0+PTX" >> $GITHUB_ENV
+      echo "HF_TOKEN=${HF_TOKEN_SECRET}" >> $GITHUB_ENV
+      NUM_THREADS=$(./.github/scripts/determine-threading -G ${{ inputs.Gi_per_thread }})
+      echo "MAX_JOBS=${NUM_THREADS}" >> $GITHUB_ENV
+      echo "VLLM_INSTALL_PUNICA_KERNELS=1" >> $GITHUB_ENV
       echo "PYENV_ROOT=/usr/local/apps/pyenv" >> $GITHUB_ENV
       echo "XDG_CONFIG_HOME=/usr/local/apps" >> $GITHUB_ENV
       WHOAMI=$(whoami)
@@ -17,5 +22,5 @@ runs:
       echo "LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64::/usr/local/cuda-12.1/lib64:" >> $GITHUB_ENV
       echo "PROJECT_ID=12" >> $GITHUB_ENV
     env:
-      HF_HOME_TOKEN: ${{ inputs.hf_home }}
+      HF_TOKEN_SECRET: ${{ inputs.hf_token }}
     shell: bash
diff --git a/.github/actions/nm-test-vllm/action.yml b/.github/actions/nm-test-vllm/action.yml
index 27dae15df0332..7d05450e4e1c2 100644
--- a/.github/actions/nm-test-vllm/action.yml
+++ b/.github/actions/nm-test-vllm/action.yml
@@ -4,8 +4,8 @@ inputs:
   test_directory:
     description: 'test directory, path is relative to neuralmagic-vllm'
     required: true
-  test_xml:
-    description: 'filename for xml test results'
+  test_results:
+    description: 'top-level directory for xml test results'
     required: true
   python:
     description: 'python version, e.g. 3.10.12'
@@ -22,15 +22,15 @@ runs:
   steps:
   - id: test
     run: |
-      SUCCESS=0
-      # TODO: this is a hack ... fix it later
-      # pyenv hardcoded ... python version hardcoded ...
       COMMIT=${{ github.sha }}
       VENV="${{ inputs.venv }}-${COMMIT:0:7}"
       source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate
       pip3 install --index-url http://192.168.201.226:8080/ --trusted-host 192.168.201.226 magic-wand
       pip3 install -r requirements-dev.txt
-      pytest --junitxml=${{ inputs.test_xml }} ${{ inputs.test_directory }} || SUCCESS=$?
+      # run tests via runner script (serially)
+      SUCCESS=0
+      ./.github/scripts/run-tests -t ${{ inputs.test_directory }} -r ${{ inputs.test_results }} || SUCCESS=$?
+      echo "was this a SUCCESS? ${SUCCESS}"
       echo "status=${SUCCESS}" >> "$GITHUB_OUTPUT"
       exit ${SUCCESS}
     shell: bash
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 0000000000000..e871931956390
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,6 @@
+SUMMARY:
+"please provide a brief summary"
+
+TEST PLAN:
+"please outline how the changes were tested"
+
diff --git a/.github/scripts/run-tests b/.github/scripts/run-tests
new file mode 100755
index 0000000000000..2c5aeb1d9826e
--- /dev/null
+++ b/.github/scripts/run-tests
@@ -0,0 +1,66 @@
+#!/bin/bash -e
+
+# simple helper script to manage concurrency while running tests
+
+usage() {
+    echo "Usage: ${0} "
+    echo
+    echo " -t - test directory, i.e. location of *.py test files. (default 'tests/')"
+    echo " -r - desired results base directory. xml results will mirror provided tests directory structure. (default 'test-results/')"
+    echo " -h - this list of options"
+    echo
+    echo "note: all paths are relative to 'neuralmagic-vllm' root"
+    echo
+    exit 1
+}
+
+TEST_DIR=tests
+RESULTS_DIR=test-results
+
+while getopts "ht:r:" OPT; do
+    case "${OPT}" in
+        h)
+            usage
+            ;;
+        t)
+            TEST_DIR="${OPTARG}"
+            ;;
+        r)
+            RESULTS_DIR="${OPTARG}"
+            ;;
+    esac
+done
+
+# check if variables are valid
+if [ -z "${RESULTS_DIR}" ]; then
+    echo "please set desired results base directory"
+    usage
+fi
+
+if [ -z "${TEST_DIR}" ]; then
+    echo "please set test directory"
+    usage
+fi
+
+if [ ! -d "${TEST_DIR}" ]; then
+    echo "specified test directory, '${TEST_DIR}' does not exist ..."
+    usage
+fi
+
+# run tests serially (mapfile keeps each found path intact: no word splitting, no globbing)
+mapfile -t TESTS_TO_RUN < <(find "${TEST_DIR}" -not -name "__init__.py" -name "*.py")
+SUCCESS=0
+for TEST in "${TESTS_TO_RUN[@]}"
+do
+    LOCAL_SUCCESS=0
+    RELATIVE_TEST="${TEST#"${TEST_DIR}"}"  # path below TEST_DIR; literal match, so '/' and '.' are safe
+    RESULT_XML="${RESULTS_DIR%/}/${RELATIVE_TEST#/}"  # mirror that path under RESULTS_DIR
+    pytest --junitxml="${RESULT_XML%.py}.xml" "${TEST}" || LOCAL_SUCCESS=$?  # swap only the trailing .py
+    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
+done
+
+if [ "${SUCCESS}" -eq "0" ]; then
+    exit 0
+else
+    exit 1
+fi
diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml
index 26a9b5cb89bcd..7d571b50adf14 100644
--- a/.github/workflows/build-test.yml
+++ b/.github/workflows/build-test.yml
@@ -15,6 +15,10 @@ on:
         description: "git commit hash or branch name"
         type: string
         required: true
+      Gi_per_thread:
+        description: 'requested GiB to reserve per thread'
+        type: string
+        required: true
       python:
         description: "python version, e.g. 3.10.12"
         type: string
@@ -35,6 +39,10 @@ on:
         description: "git commit hash or branch name"
         type: string
         required: true
+      Gi_per_thread:
+        description: 'requested GiB to reserve per thread'
+        type: string
+        required: true
       python:
         description: "python version, e.g. 3.10.12"
         type: string
@@ -61,7 +69,8 @@ jobs:
         id: setenv
         uses: ./.github/actions/nm-set-env/
         with:
-          hf_home: ${{ secrets.NM_HF_HOME }}
+          hf_token: ${{ secrets.NM_HF_TOKEN }}
+          Gi_per_thread: ${{ inputs.Gi_per_thread }}
 
       - name: set python
         id: set_python
@@ -88,7 +97,7 @@ jobs:
         id: build
         uses: ./.github/actions/nm-build-vllm/
         with:
-          Gi_per_thread: 1
+          Gi_per_thread: ${{ inputs.Gi_per_thread }}
           python: ${{ inputs.python }}
           venv: TEST
 
@@ -97,7 +106,7 @@ jobs:
         uses: ./.github/actions/nm-test-vllm/
         with:
           test_directory: tests
-          test_xml: test-results/all_tests.xml
+          test_results: test-results
           python: ${{ inputs.python }}
           venv: TEST
 
@@ -134,12 +143,13 @@ jobs:
           TEST_STATUS: ${{ steps.test.outputs.status }}
         run: |
           echo "checkout status: ${CHECKOUT}"
-          if [[ "${CHECKOUT}" != *"success"* ]]; then exit 1; fi
-          if [ ${LINT_STATUS} -ne 0 ]; then exit 1; fi
-          if [ ${BUILD_STATUS} -ne 0 ]; then exit 1; fi
+          echo "lint status: ${LINT_STATUS}"
           echo "build status: ${BUILD_STATUS}"
-          if [ ${TEST_STATUS} -ne 0 ]; then exit 1; fi
           echo "test status: ${TEST_STATUS}"
+          if [[ "${CHECKOUT}" != *"success"* ]]; then exit 1; fi
+          if [ -z "${LINT_STATUS}" ] || [ "${LINT_STATUS}" -ne "0" ]; then exit 1; fi
+          if [ -z "${BUILD_STATUS}" ] || [ "${BUILD_STATUS}" -ne "0" ]; then exit 1; fi
+          if [ -z "${TEST_STATUS}" ] || [ "${TEST_STATUS}" -ne "0" ]; then exit 1; fi
 
       - name: complete testmo run
         uses: ./.github/actions/nm-testmo-run-complete/
diff --git a/.github/workflows/remote-push.yml b/.github/workflows/remote-push.yml
index c10b386ceb23e..800db24fde970 100644
--- a/.github/workflows/remote-push.yml
+++ b/.github/workflows/remote-push.yml
@@ -13,8 +13,6 @@ jobs:
   # TODO: expand python matrix later, once CI system has
   # matured.
 
-  # TODO: adjust timeout after we get a bit more experience.
-  # making it 60 is a bit permissive.
 
   # TODO: enable this later
   AWS-AVX2-32G-A10G-24G:
@@ -24,7 +22,8 @@ jobs:
     uses: ./.github/workflows/build-test.yml
     with:
       label: aws-avx2-32G-a10g-24G
-      timeout: 60
+      timeout: 180
       gitref: '${{ github.ref }}'
+      Gi_per_thread: 4
       python: ${{ matrix.python }}
     secrets: inherit