From 4ddd6d98ad1082b67dfc2948ea5c99fea8bcc700 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Tue, 22 Oct 2024 14:04:28 -0700 Subject: [PATCH 01/10] Add new schema for GPU images --- .../cloud-init/artifacts/components.json | 22 +++++++++++++++++++ schemas/components.cue | 9 +++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/parts/linux/cloud-init/artifacts/components.json b/parts/linux/cloud-init/artifacts/components.json index 1d883b08d7e..8ad3e78b245 100644 --- a/parts/linux/cloud-init/artifacts/components.json +++ b/parts/linux/cloud-init/artifacts/components.json @@ -558,6 +558,28 @@ ] } ], + + "GPUContainerImages": [ + { + "downloadURL": "mcr.microsoft.com/aks/aks-gpu-grid", + "multiArchVersionsV2": [ + { + "renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-cuda", + "latestVersion": "550.90.12-20241021233454" + } + ] + }, + { + "downloadURL": "mcr.microsoft.com/aks/aks-gpu-grid", + "multiArchVersionsV2": [ + { + "renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-grid", + "latestVersion": "535.161.08-20241021235607" + } + ] + } + ], + "Packages": [ { "name": "oras", diff --git a/schemas/components.cue b/schemas/components.cue index 920df60e51b..1df7b34a744 100644 --- a/schemas/components.cue +++ b/schemas/components.cue @@ -15,7 +15,13 @@ package components multiArchVersionsV2: [...#VersionV2] } +#GPUContainerImage: { + downloadURL: string + multiArchVersionsV2: [...#VersionV2] +} + #Images: [...#ContainerImage] +#GPUContainerImage: [...#GPUContainerImage] #Packages: [...#Package] #VersionV2: { k8sVersion?: string @@ -67,7 +73,8 @@ package components #Components: { ContainerImages: #Images - Packages: #Packages + Packages: #Packages + GPUContainerImages: #GPUContainerImage } #Components \ No newline at end of file From cb2439d0475e547addd8048dac11c20e994fccaa Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Wed, 23 Oct 2024 00:47:26 -0700 Subject: [PATCH 02/10] update 
schema and version --- parts/linux/cloud-init/artifacts/components.json | 4 ++-- schemas/components.cue | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/components.json b/parts/linux/cloud-init/artifacts/components.json index 8ad3e78b245..fb0b5e50a8c 100644 --- a/parts/linux/cloud-init/artifacts/components.json +++ b/parts/linux/cloud-init/artifacts/components.json @@ -561,7 +561,7 @@ "GPUContainerImages": [ { - "downloadURL": "mcr.microsoft.com/aks/aks-gpu-grid", + "downloadURL": "mcr.microsoft.com/aks/aks-gpu-cuda:*", "multiArchVersionsV2": [ { "renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-cuda", @@ -570,7 +570,7 @@ ] }, { - "downloadURL": "mcr.microsoft.com/aks/aks-gpu-grid", + "downloadURL": "mcr.microsoft.com/aks/aks-gpu-grid:*", "multiArchVersionsV2": [ { "renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-grid", diff --git a/schemas/components.cue b/schemas/components.cue index 1df7b34a744..11cb518cbc6 100644 --- a/schemas/components.cue +++ b/schemas/components.cue @@ -21,7 +21,7 @@ package components } #Images: [...#ContainerImage] -#GPUContainerImage: [...#GPUContainerImage] +#GPUContainerImages: [...#GPUContainerImage] #Packages: [...#Package] #VersionV2: { k8sVersion?: string From 6d11f6aaf4856e9cbb756e6ac0109e9141255eb7 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Wed, 23 Oct 2024 01:04:57 -0700 Subject: [PATCH 03/10] update schema --- parts/linux/cloud-init/artifacts/components.json | 8 ++++++++ schemas/components.cue | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/components.json b/parts/linux/cloud-init/artifacts/components.json index fb0b5e50a8c..4fe9bc44ee9 100644 --- a/parts/linux/cloud-init/artifacts/components.json +++ b/parts/linux/cloud-init/artifacts/components.json @@ -563,6 +563,10 @@ { "downloadURL": "mcr.microsoft.com/aks/aks-gpu-cuda:*", "multiArchVersionsV2": [ + { + 
"renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-cuda", + "latestVersion": "550.90.12-20241021233454" + }, { "renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-cuda", "latestVersion": "550.90.12-20241021233454" @@ -572,6 +576,10 @@ { "downloadURL": "mcr.microsoft.com/aks/aks-gpu-grid:*", "multiArchVersionsV2": [ + { + "renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-grid", + "latestVersion": "535.161.08-20241021235607" + }, { "renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-grid", "latestVersion": "535.161.08-20241021235607" diff --git a/schemas/components.cue b/schemas/components.cue index 11cb518cbc6..5b0ef7e5da3 100644 --- a/schemas/components.cue +++ b/schemas/components.cue @@ -21,7 +21,7 @@ package components } #Images: [...#ContainerImage] -#GPUContainerImages: [...#GPUContainerImage] +#GPUImages: [...#GPUContainerImage] #Packages: [...#Package] #VersionV2: { k8sVersion?: string @@ -74,7 +74,7 @@ package components #Components: { ContainerImages: #Images Packages: #Packages - GPUContainerImages: #GPUContainerImage + GPUContainerImages?: #GPUImages } #Components \ No newline at end of file From efaf51e02f528e8419402817c465b7560389d6f7 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Wed, 23 Oct 2024 01:05:56 -0700 Subject: [PATCH 04/10] remove repeated version --- parts/linux/cloud-init/artifacts/components.json | 8 -------- 1 file changed, 8 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/components.json b/parts/linux/cloud-init/artifacts/components.json index 4fe9bc44ee9..fb0b5e50a8c 100644 --- a/parts/linux/cloud-init/artifacts/components.json +++ b/parts/linux/cloud-init/artifacts/components.json @@ -563,10 +563,6 @@ { "downloadURL": "mcr.microsoft.com/aks/aks-gpu-cuda:*", "multiArchVersionsV2": [ - { - "renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-cuda", - "latestVersion": "550.90.12-20241021233454" - }, { "renovateTag": 
"registry=https://mcr.microsoft.com, name=aks/aks-gpu-cuda", "latestVersion": "550.90.12-20241021233454" @@ -576,10 +572,6 @@ { "downloadURL": "mcr.microsoft.com/aks/aks-gpu-grid:*", "multiArchVersionsV2": [ - { - "renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-grid", - "latestVersion": "535.161.08-20241021235607" - }, { "renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-grid", "latestVersion": "535.161.08-20241021235607" From 2ced72e77a9215ed422511e81802b8ebba2c6c99 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Wed, 23 Oct 2024 11:11:51 -0700 Subject: [PATCH 05/10] Consume GPU version from components.json --- vhdbuilder/packer/install-dependencies.sh | 37 +++++++++++++++++++---- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 1ca79c8e85c..3898aa2c5f0 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -351,14 +351,37 @@ INSTALLED_RUNC_VERSION=$(runc --version | head -n1 | sed 's/runc version //') echo " - runc version ${INSTALLED_RUNC_VERSION}" >> ${VHD_LOGS_FILEPATH} capture_benchmark "${SCRIPT_NAME}_artifact_streaming_download" -if [[ $OS == $UBUNTU_OS_NAME && $(isARM64) != 1 ]]; then # no ARM64 SKU with GPU now +GPUContainerImages=$(jq ".GPUContainerImages" $COMPONENTS_FILEPATH | jq -c '.[]') + +NVIDIA_DRIVER_IMAGE="" +NVIDIA_DRIVER_IMAGE_TAG="" + +if [[ $OS == $UBUNTU_OS_NAME && $(isARM64) != 1 ]]; then # No ARM64 SKU with GPU now gpu_action="copy" - NVIDIA_DRIVER_IMAGE_SHA="20241008175307" - export NVIDIA_DRIVER_IMAGE_TAG="550.90.12-${NVIDIA_DRIVER_IMAGE_SHA}" - NVIDIA_DRIVER_IMAGE="mcr.microsoft.com/aks/aks-gpu-cuda" + + while IFS= read -r imageToBePulled; do + downloadURL=$(echo "${imageToBePulled}" | jq -r '.downloadURL') + imageName=$(echo "$downloadURL" | sed 's/:.*$//') + + if [[ "$imageName" == "mcr.microsoft.com/aks/aks-gpu-cuda" ]]; 
then + latestVersion=$(echo "${imageToBePulled}" | jq -r '.multiArchVersionsV2[0].latestVersion') + NVIDIA_DRIVER_IMAGE="$imageName" + NVIDIA_DRIVER_IMAGE_TAG="$latestVersion" + break # Exit the loop once we find the image + fi + done <<< "$GPUContainerImages" + + # Check if the NVIDIA_DRIVER_IMAGE and NVIDIA_DRIVER_IMAGE_TAG were found + if [[ -z "$NVIDIA_DRIVER_IMAGE" || -z "$NVIDIA_DRIVER_IMAGE_TAG" ]]; then + echo "Error: Unable to find aks-gpu-cuda image in components.json" + exit 1 + fi mkdir -p /opt/{actions,gpu} - ctr -n k8s.io image pull $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG + + ctr -n k8s.io image pull "$NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG" + + # Check for the "fullgpu" feature flag if grep -q "fullgpu" <<< "$FEATURE_FLAGS"; then bash -c "$CTR_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuinstall /entrypoint.sh install" ret=$? @@ -368,11 +391,13 @@ if [[ $OS == $UBUNTU_OS_NAME && $(isARM64) != 1 ]]; then # no ARM64 SKU with GP fi fi - cat << EOF >> ${VHD_LOGS_FILEPATH} + cat << EOF >> ${VHD_LOGS_FILEPATH} - nvidia-driver=${NVIDIA_DRIVER_IMAGE_TAG} EOF + fi + ls -ltr /opt/gpu/* >> ${VHD_LOGS_FILEPATH} installBpftrace From a28eab2d482658cdc0019d5f643f0de2c3f34212 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Thu, 24 Oct 2024 11:54:12 -0700 Subject: [PATCH 06/10] New GPUContainerImage schema with os, arch and cache info --- .../cloud-init/artifacts/components.json | 10 +- schemas/components.cue | 8 ++ vhdbuilder/packer/install-dependencies.sh | 115 +++++++++++++----- 3 files changed, 100 insertions(+), 33 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/components.json b/parts/linux/cloud-init/artifacts/components.json index fb0b5e50a8c..85c78c61ff0 100644 --- a/parts/linux/cloud-init/artifacts/components.json +++ b/parts/linux/cloud-init/artifacts/components.json @@ -567,6 +567,13 @@ "renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-cuda", "latestVersion": 
"550.90.12-20241021233454" } + ], + "cached": true, + "osSelectors":[ + { + "os": "ubuntu", + "arch": "amd64" + } ] }, { @@ -576,7 +583,8 @@ "renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-grid", "latestVersion": "535.161.08-20241021235607" } - ] + ], + "cached": false } ], diff --git a/schemas/components.cue b/schemas/components.cue index 5b0ef7e5da3..1a6d4280fca 100644 --- a/schemas/components.cue +++ b/schemas/components.cue @@ -9,6 +9,12 @@ package components previousLatestVersion?: #ContainerImagePrefetchOptimization } +#OSSelector: { + os: string + arch: string +} + + #ContainerImage: { downloadURL: string amd64OnlyVersions: [...string] @@ -18,6 +24,8 @@ package components #GPUContainerImage: { downloadURL: string multiArchVersionsV2: [...#VersionV2] + cached: bool + osSelectors?: [...#OSSelector] } #Images: [...#ContainerImage] diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 3898aa2c5f0..b1914bf7752 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -351,50 +351,101 @@ INSTALLED_RUNC_VERSION=$(runc --version | head -n1 | sed 's/runc version //') echo " - runc version ${INSTALLED_RUNC_VERSION}" >> ${VHD_LOGS_FILEPATH} capture_benchmark "${SCRIPT_NAME}_artifact_streaming_download" -GPUContainerImages=$(jq ".GPUContainerImages" $COMPONENTS_FILEPATH | jq -c '.[]') +gpu_action="" +declare -A pulled_gpu_images -NVIDIA_DRIVER_IMAGE="" -NVIDIA_DRIVER_IMAGE_TAG="" +# Loop over each GPUContainerImage +while IFS= read -r gpuImageToBePulled; do + # Extract 'cached' field and convert it to lowercase + cached=$(echo "${gpuImageToBePulled}" | jq -r '.cached' | tr '[:upper:]' '[:lower:]') -if [[ $OS == $UBUNTU_OS_NAME && $(isARM64) != 1 ]]; then # No ARM64 SKU with GPU now - gpu_action="copy" + if [[ "$cached" != "true" ]]; then + # Skip images that are not meant to be cached + continue + fi - while IFS= read -r imageToBePulled; do - 
downloadURL=$(echo "${imageToBePulled}" | jq -r '.downloadURL') - imageName=$(echo "$downloadURL" | sed 's/:.*$//') + # Extract 'osSelectors' if present + osSelectors=$(echo "${gpuImageToBePulled}" | jq -r '.osSelectors // empty') - if [[ "$imageName" == "mcr.microsoft.com/aks/aks-gpu-cuda" ]]; then - latestVersion=$(echo "${imageToBePulled}" | jq -r '.multiArchVersionsV2[0].latestVersion') - NVIDIA_DRIVER_IMAGE="$imageName" - NVIDIA_DRIVER_IMAGE_TAG="$latestVersion" - break # Exit the loop once we find the image - fi - done <<< "$GPUContainerImages" + shouldPull=0 # Default to not pull - # Check if the NVIDIA_DRIVER_IMAGE and NVIDIA_DRIVER_IMAGE_TAG were found - if [[ -z "$NVIDIA_DRIVER_IMAGE" || -z "$NVIDIA_DRIVER_IMAGE_TAG" ]]; then - echo "Error: Unable to find aks-gpu-cuda image in components.json" - exit 1 + if [[ -n "$osSelectors" ]]; then + # osSelectors is provided; check if current OS and arch match any entry + while IFS= read -r selector; do + os=$(echo "$selector" | jq -r '.os') + arch=$(echo "$selector" | jq -r '.arch') + + if [[ "$os" == "$CURRENT_OS" ]]; then + if [[ "$arch" == "$CPU_ARCH" ]]; then + shouldPull=1 + break # Found a matching selector + fi + fi + done <<< "$(echo "$osSelectors" | jq -c '.[]')" + else + # No osSelectors provided; decide whether to pull + # Assuming we pull the image if no osSelectors are specified + shouldPull=1 fi - mkdir -p /opt/{actions,gpu} + if [[ "$shouldPull" == "1" ]]; then + # Extract image details + downloadURL=$(echo "${gpuImageToBePulled}" | jq -r '.downloadURL') + imageName=$(echo "$downloadURL" | sed 's/:.*$//') - ctr -n k8s.io image pull "$NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG" + # Get the latestVersion + latestVersion=$(echo "${gpuImageToBePulled}" | jq -r '.multiArchVersionsV2[0].latestVersion') - # Check for the "fullgpu" feature flag - if grep -q "fullgpu" <<< "$FEATURE_FLAGS"; then - bash -c "$CTR_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuinstall /entrypoint.sh install" - 
ret=$? - if [[ "$ret" != "0" ]]; then - echo "Failed to install GPU driver, exiting..." - exit $ret + if [[ -z "$latestVersion" || "$latestVersion" == "null" ]]; then + echo "Error: latestVersion not found for $imageName" + exit 1 fi - fi - cat << EOF >> ${VHD_LOGS_FILEPATH} - - nvidia-driver=${NVIDIA_DRIVER_IMAGE_TAG} -EOF + fullImage="$imageName:$latestVersion" + + # Pull the image + echo "Pulling image: $fullImage" + ctr -n k8s.io image pull "$fullImage" + if [[ $? -ne 0 ]]; then + echo "Failed to pull image: $fullImage" + exit 1 + fi + # Record the pulled image + pulled_gpu_images["$imageName"]="$latestVersion" + + # Set gpu_action if pulling the aks-gpu-cuda image + if [[ "$imageName" == "mcr.microsoft.com/aks/aks-gpu-cuda" ]]; then + gpu_action="copy" + + # Create necessary directories + mkdir -p /opt/{actions,gpu} + + # Check for the "fullgpu" feature flag + if grep -q "fullgpu" <<< "$FEATURE_FLAGS"; then + echo "Installing GPU driver from image: $fullImage" + bash -c "$CTR_GPU_INSTALL_CMD $fullImage gpuinstall /entrypoint.sh install" + ret=$? + if [[ "$ret" != "0" ]]; then + echo "Failed to install GPU driver, exiting..." + exit $ret + fi + fi + fi + else + echo "Skipping image $imageName due to osSelector constraints or cached=false." + fi +done <<< "$GPUContainerImages" + +# Log the pulled images +if [[ "${#pulled_gpu_images[@]}" -gt 0 ]]; then + echo "Logging pulled GPU images to $VHD_LOGS_FILEPATH" + for imageName in "${!pulled_gpu_images[@]}"; do + imageVersion=${pulled_gpu_images[$imageName]} + echo " - $imageName=$imageVersion" >> "$VHD_LOGS_FILEPATH" + done +else + echo "No GPU images were pulled." 
fi From 40762367095b572d5eb78141b6d7c84d102a0fbe Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Thu, 24 Oct 2024 12:06:03 -0700 Subject: [PATCH 07/10] Add Readme --- .github/README-RENOVATE.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/README-RENOVATE.md b/.github/README-RENOVATE.md index e4b35ecc89a..c4a5bd50547 100644 --- a/.github/README-RENOVATE.md +++ b/.github/README-RENOVATE.md @@ -20,6 +20,7 @@ - [Okay, I just have 5 minutes. Please just tell me how to onboard a new package/container now to Renovate.json for auto-update.](#okay-i-just-have-5-minutes-please-just-tell-me-how-to-onboard-a-new-packagecontainer-now-to-renovatejson-for-auto-update) - [What is the responsibility of a PR assignee?](#what-is-the-responsibility-of-a-pr-assignee) - [What components are onboarded to Renovate for auto-update and what are not yet?](#what-components-are-onboarded-to-renovate-for-auto-update-and-what-are-not-yet) + - [Special considerations for GPU Container Images](#special-considerations-for-gpu-container-images) # TL;DR This readme is mainly describing how the renovate.json is constructed and the reasoning behind. If you are adding a new component to be cached in VHD, please refer to this [Readme-components](../parts/linux/cloud-init/artifacts/README-COMPONENTS.md) for tutorial. If you are onboarding a newly added component to Renovate automatic updates, you can jump to the [Hands-on guide and FAQ](#hands-on-guide-and-faq). @@ -348,4 +349,8 @@ As of 9/18/2024, - PMC hosted packages, namely `runc` and `containerd`, are onboarded for auto-update. - Acs-mirror hosted packages/binaries, namely `cni-plugins`, `azure-cni`, `cri-tools`, `kubernetes-binaries` and `azure-acr-credential-provider`, are NOT onboarded for auto-update yet. There are plans to move the acs-mirror hosted packages to MCR OCI which will be downloaded by Oras. 
We will wait for this transition to be completed to understand the details how to manage them. -For the most up-to-date information, please refer to the actual configuration file `components.json`. \ No newline at end of file +For the most up-to-date information, please refer to the actual configuration file `components.json`. + +## Special considerations for GPU Container Images + +AKS-GPU container images are built for Ubuntu from the config and scripts in the aks-gpu repo. They contain GPU drivers and their relevant dependencies. `GPUContainerImages` is a dedicated section of `components.json` for these images only; each entry carries additional config (`cached`, `osSelectors`) that decides whether the image is cached in the VHD and, if so, for which OS and architecture. Do not add images to that list unless they are specific to AKS GPU config management. \ No newline at end of file From 034864bfbff1eed99e7854c1a248fa72b6483ac8 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Thu, 24 Oct 2024 12:19:27 -0700 Subject: [PATCH 08/10] Use updateMultiArchVersions --- vhdbuilder/packer/install-dependencies.sh | 84 ++++++++++++----------- 1 file changed, 45 insertions(+), 39 deletions(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index b1914bf7752..2b4956301ab 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -375,11 +375,10 @@ while IFS= read -r gpuImageToBePulled; do os=$(echo "$selector" | jq -r '.os') arch=$(echo "$selector" | jq -r '.arch') - if [[ "$os" == "$CURRENT_OS" ]]; then - if [[ "$arch" == "$CPU_ARCH" ]]; then - shouldPull=1 - break # Found a matching selector - fi + # Check OS and arch in one line, and remove "Any" cases + if [[ "$os" == "$CURRENT_OS" && "$arch" == "$CPU_ARCH" ]]; then + shouldPull=1 + break # Found a matching selector fi done <<< "$(echo "$osSelectors" | jq -c '.[]')" else @@ -393,45 +392,52 @@ while IFS= read -r gpuImageToBePulled; do downloadURL=$(echo 
"${gpuImageToBePulled}" | jq -r '.downloadURL') imageName=$(echo "$downloadURL" | sed 's/:.*$//') - # Get the latestVersion - latestVersion=$(echo "${gpuImageToBePulled}" | jq -r '.multiArchVersionsV2[0].latestVersion') - - if [[ -z "$latestVersion" || "$latestVersion" == "null" ]]; then - echo "Error: latestVersion not found for $imageName" - exit 1 - fi - - fullImage="$imageName:$latestVersion" + # Get the versions using updateMultiArchVersions + MULTI_ARCH_VERSIONS=() + updateMultiArchVersions "${gpuImageToBePulled}" - # Pull the image - echo "Pulling image: $fullImage" - ctr -n k8s.io image pull "$fullImage" - if [[ $? -ne 0 ]]; then - echo "Failed to pull image: $fullImage" + if [[ ${#MULTI_ARCH_VERSIONS[@]} -eq 0 ]]; then + echo "Error: No versions found for $imageName" exit 1 fi - # Record the pulled image - pulled_gpu_images["$imageName"]="$latestVersion" + for latestVersion in "${MULTI_ARCH_VERSIONS[@]}"; do + fullImage="$imageName:$latestVersion" - # Set gpu_action if pulling the aks-gpu-cuda image - if [[ "$imageName" == "mcr.microsoft.com/aks/aks-gpu-cuda" ]]; then - gpu_action="copy" - - # Create necessary directories - mkdir -p /opt/{actions,gpu} + # Pull the image + echo "Pulling image: $fullImage" + ctr -n k8s.io image pull "$fullImage" + if [[ $? -ne 0 ]]; then + echo "Failed to pull image: $fullImage" + exit 1 + fi - # Check for the "fullgpu" feature flag - if grep -q "fullgpu" <<< "$FEATURE_FLAGS"; then - echo "Installing GPU driver from image: $fullImage" - bash -c "$CTR_GPU_INSTALL_CMD $fullImage gpuinstall /entrypoint.sh install" - ret=$? - if [[ "$ret" != "0" ]]; then - echo "Failed to install GPU driver, exiting..." 
- exit $ret + # Record the pulled image + pulled_gpu_images+=("$fullImage") + + # Set gpu_action if pulling the aks-gpu-cuda image + if [[ "$imageName" == "mcr.microsoft.com/aks/aks-gpu-cuda" ]]; then + gpu_action="copy" + + # Create necessary directories + mkdir -p /opt/{actions,gpu} + + # Run gpuinstall only once + if [[ "$gpu_install_done" -eq 0 ]]; then + # Check for the "fullgpu" feature flag + if grep -q "fullgpu" <<< "$FEATURE_FLAGS"; then + echo "Installing GPU driver from image: $fullImage" + bash -c "$CTR_GPU_INSTALL_CMD $fullImage gpuinstall /entrypoint.sh install" + ret=$? + if [[ "$ret" != "0" ]]; then + echo "Failed to install GPU driver, exiting..." + exit $ret + fi + fi + gpu_install_done=1 fi fi - fi + done else echo "Skipping image $imageName due to osSelector constraints or cached=false." fi @@ -440,15 +446,15 @@ done <<< "$GPUContainerImages" # Log the pulled images if [[ "${#pulled_gpu_images[@]}" -gt 0 ]]; then echo "Logging pulled GPU images to $VHD_LOGS_FILEPATH" - for imageName in "${!pulled_gpu_images[@]}"; do - imageVersion=${pulled_gpu_images[$imageName]} - echo " - $imageName=$imageVersion" >> "$VHD_LOGS_FILEPATH" + for image in "${pulled_gpu_images[@]}"; do + echo " - $image" >> "$VHD_LOGS_FILEPATH" done else echo "No GPU images were pulled." 
fi + ls -ltr /opt/gpu/* >> ${VHD_LOGS_FILEPATH} installBpftrace From b3c5c4167ec9222a2630d855856a1ee3e8c10611 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Thu, 24 Oct 2024 12:29:13 -0700 Subject: [PATCH 09/10] indent --- parts/linux/cloud-init/artifacts/components.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parts/linux/cloud-init/artifacts/components.json b/parts/linux/cloud-init/artifacts/components.json index 85c78c61ff0..2932b1c528e 100644 --- a/parts/linux/cloud-init/artifacts/components.json +++ b/parts/linux/cloud-init/artifacts/components.json @@ -583,7 +583,7 @@ "renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-grid", "latestVersion": "535.161.08-20241021235607" } - ], + ], "cached": false } ], From 60d3638b4f9eb21da7e0b5a7d05a347e685f56de Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Thu, 24 Oct 2024 13:08:11 -0700 Subject: [PATCH 10/10] Constrain to one gpuImageVersion per GPUContainerImage --- schemas/components.cue | 2 +- vhdbuilder/packer/install-dependencies.sh | 80 +++++++++++------------ 2 files changed, 40 insertions(+), 42 deletions(-) diff --git a/schemas/components.cue b/schemas/components.cue index 1a6d4280fca..7ea4c02e7f2 100644 --- a/schemas/components.cue +++ b/schemas/components.cue @@ -23,7 +23,7 @@ package components #GPUContainerImage: { downloadURL: string - multiArchVersionsV2: [...#VersionV2] + gpuImageVersion: #VersionV2 cached: bool osSelectors?: [...#OSSelector] } diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 2b4956301ab..7c645cedda1 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -352,7 +352,8 @@ echo " - runc version ${INSTALLED_RUNC_VERSION}" >> ${VHD_LOGS_FILEPATH} capture_benchmark "${SCRIPT_NAME}_artifact_streaming_download" gpu_action="" -declare -A pulled_gpu_images +gpu_install_done=0 +pulled_gpu_images=() # Loop over each 
GPUContainerImage while IFS= read -r gpuImageToBePulled; do @@ -375,7 +376,6 @@ while IFS= read -r gpuImageToBePulled; do os=$(echo "$selector" | jq -r '.os') arch=$(echo "$selector" | jq -r '.arch') - # Check OS and arch in one line, and remove "Any" cases if [[ "$os" == "$CURRENT_OS" && "$arch" == "$CPU_ARCH" ]]; then shouldPull=1 break # Found a matching selector @@ -383,8 +383,8 @@ while IFS= read -r gpuImageToBePulled; do done <<< "$(echo "$osSelectors" | jq -c '.[]')" else # No osSelectors provided; decide whether to pull - # Assuming we pull the image if no osSelectors are specified - shouldPull=1 + # Assuming we do not pull the image if no osSelectors are specified + shouldPull=0 fi if [[ "$shouldPull" == "1" ]]; then @@ -392,52 +392,49 @@ while IFS= read -r gpuImageToBePulled; do downloadURL=$(echo "${gpuImageToBePulled}" | jq -r '.downloadURL') imageName=$(echo "$downloadURL" | sed 's/:.*$//') - # Get the versions using updateMultiArchVersions - MULTI_ARCH_VERSIONS=() - updateMultiArchVersions "${gpuImageToBePulled}" + # Extract the version from gpuImageVersion + latestVersion=$(echo "${gpuImageToBePulled}" | jq -r '.gpuImageVersion.latestVersion') - if [[ ${#MULTI_ARCH_VERSIONS[@]} -eq 0 ]]; then - echo "Error: No versions found for $imageName" + if [[ -z "$latestVersion" || "$latestVersion" == "null" ]]; then + echo "Error: latestVersion not found for $imageName" exit 1 fi - for latestVersion in "${MULTI_ARCH_VERSIONS[@]}"; do - fullImage="$imageName:$latestVersion" + fullImage="$imageName:$latestVersion" - # Pull the image - echo "Pulling image: $fullImage" - ctr -n k8s.io image pull "$fullImage" - if [[ $? -ne 0 ]]; then - echo "Failed to pull image: $fullImage" - exit 1 - fi + # Pull the image + echo "Pulling image: $fullImage" + ctr -n k8s.io image pull "$fullImage" + if [[ $? 
-ne 0 ]]; then + echo "Failed to pull image: $fullImage" + exit 1 + fi - # Record the pulled image - pulled_gpu_images+=("$fullImage") - - # Set gpu_action if pulling the aks-gpu-cuda image - if [[ "$imageName" == "mcr.microsoft.com/aks/aks-gpu-cuda" ]]; then - gpu_action="copy" - - # Create necessary directories - mkdir -p /opt/{actions,gpu} - - # Run gpuinstall only once - if [[ "$gpu_install_done" -eq 0 ]]; then - # Check for the "fullgpu" feature flag - if grep -q "fullgpu" <<< "$FEATURE_FLAGS"; then - echo "Installing GPU driver from image: $fullImage" - bash -c "$CTR_GPU_INSTALL_CMD $fullImage gpuinstall /entrypoint.sh install" - ret=$? - if [[ "$ret" != "0" ]]; then - echo "Failed to install GPU driver, exiting..." - exit $ret - fi + # Record the pulled image + pulled_gpu_images+=("$fullImage") + + # Set gpu_action if pulling the aks-gpu-cuda image + if [[ "$imageName" == "mcr.microsoft.com/aks/aks-gpu-cuda" ]]; then + gpu_action="copy" + + # Create necessary directories + mkdir -p /opt/{actions,gpu} + + # Run gpuinstall only once + if [[ "$gpu_install_done" -eq 0 ]]; then + # Check for the "fullgpu" feature flag + if grep -q "fullgpu" <<< "$FEATURE_FLAGS"; then + echo "Installing GPU driver from image: $fullImage" + bash -c "$CTR_GPU_INSTALL_CMD $fullImage gpuinstall /entrypoint.sh install" + ret=$? + if [[ "$ret" != "0" ]]; then + echo "Failed to install GPU driver, exiting..." + exit $ret fi - gpu_install_done=1 fi + gpu_install_done=1 fi - done + fi else echo "Skipping image $imageName due to osSelector constraints or cached=false." fi @@ -455,6 +452,7 @@ fi + ls -ltr /opt/gpu/* >> ${VHD_LOGS_FILEPATH} installBpftrace