diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 00000000..d5ed99ef --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,98 @@ +############################################################################### +# Copyright (c) 2022-23, Lawrence Livermore National Security, LLC and RADIUSS +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: (MIT) +############################################################################### + +# DESCRIPTION: +############################################################################### +# General GitLab pipelines configurations for supercomputers and Linux clusters +# at Lawrence Livermore National Laboratory (LLNL). +# This entire pipeline is LLNL-specific. +# +# Important note: This file is a template provided by llnl/radiuss-shared-ci. +# It remains to set variable values, change the reference to the +# radiuss-shared-ci repo, and opt in or out of optional features. The project can then extend it with +# additional stages. +# +# In addition, each project should copy over and complete: +# - .gitlab/custom-jobs-and-variables.yml +# - .gitlab/subscribed-pipelines.yml +# +# The jobs should be specified in a file local to the project, +# - .gitlab/jobs/${CI_MACHINE}.yml +# or generated (see LLNL/Umpire for an example). +############################################################################### + +# We define the following GitLab pipeline variables: +variables: +##### LC GITLAB CONFIGURATION +# Use an LLNL service user to run CI. This prevents pipelines from running +# as an actual user. + LLNL_SERVICE_USER: "" +# Use the service user workspace. Solves permission issues, stores everything +# at the same location whoever triggers a pipeline. + CUSTOM_CI_BUILDS_DIR: "/usr/workspace/AMS/gitlab-runner" +# Tells GitLab to recursively update the submodules when cloning the project. 
+# GIT_SUBMODULE_STRATEGY: recursive + +##### PROJECT VARIABLES +# We build the projects in the CI clone directory. +# NOTE: scripts/gitlab/ci-build-test.sh (the JOB_CMD below) does not read BUILD_ROOT. +# TODO: add a clean-up mechanism. + BUILD_ROOT: ${CI_PROJECT_DIR} + +##### SHARED_CI CONFIGURATION +# Required information about the GitHub repository + GITHUB_PROJECT_NAME: "AMS" + GITHUB_PROJECT_ORG: "LLNL" +# Set the build-and-test command. +# Nested variables are allowed and useful to customize the job command. We +# prevent variable expansion so that you can define them at job level. + JOB_CMD: + value: "scripts/gitlab/ci-build-test.sh" + expand: false +# Override the pattern describing branches that will skip the "draft PR filter +# test". Add protected branches here. See default value in +# preliminary-ignore-draft-pr.yml. +# ALWAYS_RUN_PATTERN: "" + +# We organize the build-and-test stage with sub-pipelines. Each sub-pipeline +# corresponds to a test batch on a given machine. + +# High level stages +stages: + - prerequisites + - build-and-test + +# Template for jobs triggering a build-and-test sub-pipeline: +.build-and-test: + stage: build-and-test + trigger: + include: + - local: '.gitlab/custom-jobs-and-variables.yml' + - project: 'radiuss/radiuss-shared-ci' + ref: 'v2024.07.0' + file: 'pipelines/${CI_MACHINE}.yml' + # Add your jobs: + # you can use a local file + - local: '.gitlab/jobs/${CI_MACHINE}.yml' + # or a file generated in the previous steps + # - artifact: '${CI_MACHINE}-jobs.yml' + # job: 'generate-job-file' + # (See Umpire CI setup for an example). 
+ strategy: depend + forward: + pipeline_variables: true + +include: + # Sets ID tokens for every job using `default:` + - project: 'lc-templates/id_tokens' + file: 'id_tokens.yml' + # [Optional] preliminary checks run before the actual CI tests + - project: 'radiuss/radiuss-shared-ci' + ref: 'v2024.07.0' + file: 'utilities/preliminary-ignore-draft-pr.yml' + # Pipelines the project subscribes to + - local: '.gitlab/subscribed-pipelines.yml' diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml new file mode 100644 index 00000000..5af33aee --- /dev/null +++ b/.gitlab/custom-jobs-and-variables.yml @@ -0,0 +1,62 @@ +############################################################################### +# Copyright (c) 2022-23, Lawrence Livermore National Security, LLC and RADIUSS +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: (MIT) +############################################################################### + +# We define the following GitLab pipeline variables: +variables: +# In some pipelines we create only one allocation shared among jobs in +# order to save time and resources. This allocation has to be uniquely +# named so that we are sure to retrieve it and avoid collisions. + ALLOC_NAME: ${CI_PROJECT_NAME}_ci_${CI_PIPELINE_ID} + +# Ruby +# Arguments for the top-level (shared) allocation + RUBY_SHARED_ALLOC: "--mpi=none --exclusive --reservation=ci --time=20 --nodes=1" +# Arguments for the job-level allocation + RUBY_JOB_ALLOC: "--mpi=none --reservation=ci --nodes=1" +# Add variables that should apply to all the jobs on a machine: +# RUBY_MY_VAR: "..." + +# Poodle +# Arguments for the top-level (shared) allocation + POODLE_SHARED_ALLOC: "--exclusive --partition=pdebug --time=10 --nodes=1" +# Arguments for the job-level allocation + POODLE_JOB_ALLOC: "--nodes=1" +# Add variables that should apply to all the jobs on a machine: +# POODLE_MY_VAR: "..." 
+ +# Corona +# Arguments for the top-level (shared) allocation +# OPTIONAL: "-o per-resource.count=2" allows 2 jobs to run on each node. + CORONA_SHARED_ALLOC: "--exclusive --time-limit=15m --nodes=1" +# Arguments for the job-level allocation + CORONA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" +# Add variables that should apply to all the jobs on a machine: +# CORONA_MY_VAR: "..." + +# Tioga +# Arguments for the top-level (shared) allocation +# OPTIONAL: "-o per-resource.count=2" allows 2 jobs to run on each node. + TIOGA_SHARED_ALLOC: "--queue=pci --exclusive --time-limit=15m --nodes=1" +# Arguments for the job-level allocation + TIOGA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" +# Add variables that should apply to all the jobs on a machine: +# TIOGA_MY_VAR: "..." + +# Lassen uses a different job scheduler (Spectrum LSF) that does not allow +# pre-allocation the same way Slurm does. Arguments for the job-level allocation: + LASSEN_JOB_ALLOC: "1 -W 30 -q pci" +# Add variables that should apply to all the jobs on a machine: +# LASSEN_MY_VAR: "..." + + +# Configuration shared by build and test jobs specific to this project. +# Not all configuration can be shared. Here projects can fine-tune the +# CI behavior. +# See Umpire for an example (export junit test reports). +.custom_job: + variables: + JOB_TEMPLATE_CANNOT_BE_EMPTY: "True" diff --git a/.gitlab/jobs/lassen.yml b/.gitlab/jobs/lassen.yml new file mode 100644 index 00000000..c2bc5564 --- /dev/null +++ b/.gitlab/jobs/lassen.yml @@ -0,0 +1,49 @@ +############################################################################### +# Copyright (c) 2022-23, Lawrence Livermore National Security, LLC and RADIUSS +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: (MIT) +############################################################################### + +# We require projects to define their job command using a variable (JOB_CMD). 
+# In customization/gitlab-ci.yml, we encourage defining this variable as +# non-expandable, so that projects can use nested variables to configure the job +# command. The caveat is that the reproducer here cannot capture the +# definition of these variables in a generic fashion. By overriding the +# following section, projects can specify the variables to define in the +# reproducer to exactly reproduce the CI build. +.lassen_reproducer_vars: + script: + - export WITH_CUDA="On" + +# With GitLab CI, included files cannot be empty. +# TODO: remove when you have at least one job defined. +variables: + INCLUDED_FILE_CANNOT_BE_EMPTY: "True" + +############### +# Explanations: +############### +# RADIUSS Shared CI provides a pipeline for each machine, where a template job +# is provided. Each of your jobs must extend this template to be added to the +# list of jobs running on the associated machine. +# +# The job template then expects you to define the "JOB_CMD" variable with the +# one line command used to trigger the build and test of your project. +# +# We suggest that you set your command in such a way that you can then +# customize it per job with variables. E.g.: +# "./path/to/my_ci_script ${A_VARIABLE}" + +## Adding jobs defined by the project. +## Note: placing the extends section first allows you to override part of the +## shared implementation if needed (and if you know what you are doing). +#<job_name>: +# extends: .job_on_lassen +# variables: +# <VARIABLE_NAME>: "<value>" + +build-run-lassen: + extends: .job_on_lassen + variables: {} # empty map, not null: a null value would exclude the variables inherited through extends + diff --git a/.gitlab/jobs/ruby.yml b/.gitlab/jobs/ruby.yml new file mode 100644 index 00000000..802e491e --- /dev/null +++ b/.gitlab/jobs/ruby.yml @@ -0,0 +1,49 @@ +############################################################################### +# Copyright (c) 2022-23, Lawrence Livermore National Security, LLC and RADIUSS +# project contributors. See the COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (MIT) +############################################################################### + +# We require projects to define their job command using a variable (JOB_CMD). +# In customization/gitlab-ci.yml, we encourage defining this variable as +# non-expandable, so that projects can use nested variables to configure the job +# command. The caveat is that the reproducer here cannot capture the +# definition of these variables in a generic fashion. By overriding the +# following section, projects can specify the variables to define in the +# reproducer to exactly reproduce the CI build. +.ruby_reproducer_vars: + script: + - export WITH_CUDA="Off" + +# With GitLab CI, included files cannot be empty. +# TODO: remove when you have at least one job defined. +variables: + INCLUDED_FILE_CANNOT_BE_EMPTY: "True" + +############### +# Explanations: +############### +# RADIUSS Shared CI provides a pipeline for each machine, where a template job +# is provided. Each of your jobs must extend this template to be added to the +# list of jobs running on the associated machine. +# +# The job template then expects you to define the "JOB_CMD" variable with the +# one line command used to trigger the build and test of your project. +# +# We suggest that you set your command in such a way that you can then +# customize it per job with variables. E.g.: +# "./path/to/my_ci_script ${A_VARIABLE}" + +## Adding jobs defined by the project. +## Note: placing the extends section first allows you to override part of the +## shared implementation if needed (and if you know what you are doing). 
+#<job_name>: +# extends: .job_on_ruby +# variables: +# <VARIABLE_NAME>: "<value>" + +build-run-ruby: + extends: .job_on_ruby + variables: {} # empty map, not null: a null value would exclude the variables inherited through extends + diff --git a/.gitlab/subscribed-pipelines.yml b/.gitlab/subscribed-pipelines.yml new file mode 100644 index 00000000..c0f0cc42 --- /dev/null +++ b/.gitlab/subscribed-pipelines.yml @@ -0,0 +1,91 @@ +############################################################################### +# Copyright (c) 2022-23, Lawrence Livermore National Security, LLC and RADIUSS +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: (MIT) +############################################################################### + +# Template job that fails the pipeline (and reports a failed GitHub commit status) when a machine has no node up. +# Expects CI_MACHINE to be set to the machine name. +.machine-check: + stage: prerequisites + tags: [shell, oslic] + variables: + GIT_STRATEGY: none + script: + - | + if [[ $(jq '.[env.CI_MACHINE].total_nodes_up' /usr/global/tools/lorenz/data/loginnodeStatus) == 0 ]] + then + echo -e "\e[31mNo node available on ${CI_MACHINE}\e[0m" + curl --url "https://api.github.com/repos/${GITHUB_PROJECT_ORG}/${GITHUB_PROJECT_NAME}/statuses/${CI_COMMIT_SHA}" \ + --header 'Content-Type: application/json' \ + --header "authorization: Bearer ${GITHUB_TOKEN}" \ + --data "{ \"state\": \"failure\", \"target_url\": \"${CI_PIPELINE_URL}\", \"description\": \"GitLab ${CI_MACHINE} down\", \"context\": \"ci/gitlab/${CI_MACHINE}\" }" + exit 1 + fi + +### +# Trigger a build-and-test pipeline for a machine. +# Comment out the jobs for machines you don’t need. 
+### + +# RUBY +ruby-up-check: + variables: + CI_MACHINE: "ruby" + extends: [.machine-check] + +ruby-build-and-test: + variables: + CI_MACHINE: "ruby" + needs: [ruby-up-check] + extends: [.build-and-test] + +## POODLE +#poodle-up-check: +# variables: +# CI_MACHINE: "poodle" +# extends: [.machine-check] +# +#poodle-build-and-test: +# variables: +# CI_MACHINE: "poodle" +# needs: [poodle-up-check] +# extends: [.build-and-test] +# +## CORONA +#corona-up-check: +# variables: +# CI_MACHINE: "corona" +# extends: [.machine-check] +# +#corona-build-and-test: +# variables: +# CI_MACHINE: "corona" +# needs: [corona-up-check] +# extends: [.build-and-test] +# +## TIOGA +#tioga-up-check: +# variables: +# CI_MACHINE: "tioga" +# extends: [.machine-check] +# +#tioga-build-and-test: +# variables: +# CI_MACHINE: "tioga" +# needs: [tioga-up-check] +# extends: [.build-and-test] + +# LASSEN +lassen-up-check: + variables: + CI_MACHINE: "lassen" + extends: [.machine-check] + +lassen-build-and-test: + variables: + CI_MACHINE: "lassen" + needs: [lassen-up-check] + extends: [.build-and-test] + diff --git a/scripts/gitlab/ci-build-test.sh b/scripts/gitlab/ci-build-test.sh new file mode 100755 index 00000000..83224762 --- /dev/null +++ b/scripts/gitlab/ci-build-test.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +source scripts/gitlab/setup-env.sh + +export CTEST_OUTPUT_ON_FAILURE=1 +# WITH_CUDA is not set here; the per-machine job file (.gitlab/jobs/${CI_MACHINE}.yml) exports it. 
+ +build_and_test() { + WITH_TORCH=${1} + WITH_FAISS=${2} + WITH_HDF5=${3} + WITH_MPI=${4} + WITH_CALIPER=${5:-On} # optional 5th argument; defaults to On so 4-argument calls build as before + + echo "*******************************************************************************************" + echo "Build configuration" \ + "WITH_TORCH ${WITH_TORCH}" \ + "WITH_FAISS ${WITH_FAISS}" \ + "WITH_HDF5 ${WITH_HDF5}" \ + "WITH_MPI ${WITH_MPI}" \ + "WITH_CUDA ${WITH_CUDA}" "WITH_CALIPER ${WITH_CALIPER}" + echo "*******************************************************************************************" + + rm -rf build + mkdir build + pushd build + + cmake \ + -DBUILD_SHARED_LIBS=On \ + -DCMAKE_PREFIX_PATH=$INSTALL_DIR \ + -DWITH_CALIPER=${WITH_CALIPER} \ + -DWITH_HDF5=${WITH_HDF5} \ + -DWITH_EXAMPLES=On \ + -DAMS_HDF5_DIR=$AMS_HDF5_PATH \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DCMAKE_BUILD_TYPE=Release \ + -DCUDA_ARCH=$AMS_CUDA_ARCH \ + -DWITH_CUDA=${WITH_CUDA} \ + -DUMPIRE_DIR=$AMS_UMPIRE_PATH \ + -DMFEM_DIR=$AMS_MFEM_PATH \ + -DWITH_FAISS=${WITH_FAISS} \ + -DWITH_MPI=${WITH_MPI} \ + -DWITH_TORCH=${WITH_TORCH} \ + -DWITH_TESTS=On \ + -DTorch_DIR=$AMS_TORCH_PATH \ + -DFAISS_DIR=$AMS_FAISS_PATH \ + -DWITH_AMS_DEBUG=On \ + -DWITH_WORKFLOW=On \ + -DWITH_ADIAK=On \ + ${CI_PROJECT_DIR} || { echo "CMake failed"; exit 1; } + + make -j || { echo "Building failed"; exit 1; } + make test || { echo "Tests failed"; exit 1; } + popd + + rm -rf build +} + +# Usage: build_and_test WITH_TORCH WITH_FAISS WITH_HDF5 WITH_MPI [WITH_CALIPER, default On] +build_and_test "On" "On" "On" "On" +build_and_test "On" "On" "On" "Off" +build_and_test "Off" "On" "On" "On" +build_and_test "Off" "Off" "On" "On" +build_and_test "Off" "Off" "Off" "On" + diff --git a/scripts/gitlab/setup-env.sh b/scripts/gitlab/setup-env.sh new file mode 100644 index 00000000..805f4e4d --- /dev/null +++ b/scripts/gitlab/setup-env.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +host=$(hostname) +host=${host//[0-9]/} + +SPACK_ENV_PATH="0.20" + +if [[ "$SYS_TYPE" == "blueos_3_ppc64le_ib_p9" ]]; then + ## load relevant modules on lassen + module load gcc/8.3.1 + module load cmake/3.23.1 
+ echo "Loading mpi and cuda" + if [[ "${SPACK_ENV_PATH}" == "0.20" ]]; then + module load cuda/11.6.1 + elif [[ "${SPACK_ENV_PATH}" == "0.18" ]]; then + module load cuda/11.4.1 + fi + module load spectrum-mpi + CUDA_ARCH=70 +elif [[ "${SYS_TYPE}" == "toss_3_x86_64_ib" ]]; then + module load mvapich2/2.3 + if [[ "${SPACK_ENV_PATH}" == "0.20" ]]; then + module load cuda/11.6.1 + elif [[ "${SPACK_ENV_PATH}" == "0.18" ]]; then + module load cuda/11.4.1 + fi + CUDA_ARCH=60 +fi + +# On ruby the default StdEnv is Intel-based; loading gcc avoids cmake warnings on mpi. +if [[ "${host}" == "ruby" ]]; then + module load gcc/11.2.1 +fi + +## Make the spack command available in this shell +source /usr/workspace/AMS/ams-spack-environments/${SPACK_ENV_PATH}/spack/share/spack/setup-env.sh + +## Activate the spack environment matching this host +spack env activate /usr/workspace/AMS/ams-spack-environments/${SPACK_ENV_PATH}/${host} + +## Export the dependency install prefixes (cmake currently needs these) +export AMS_MFEM_PATH=$(spack location -i mfem) +export AMS_TORCH_PATH=$(spack location -i py-torch) +export AMS_FAISS_PATH=$(spack location -i faiss) +export AMS_UMPIRE_PATH=$(spack location -i umpire) +export AMS_HIREDIS_PATH=$(spack location -i hiredis) +export AMS_REDIS_PLUS_PLUS_PATH=$(spack location -i redis-plus-plus) +export AMS_HDF5_PATH=$(spack location -i hdf5) +export AMS_CUDA_ARCH=${CUDA_ARCH} + +echo "AMS_MFEM_PATH = ${AMS_MFEM_PATH}" +echo "AMS_TORCH_PATH = ${AMS_TORCH_PATH}" +echo "AMS_FAISS_PATH = ${AMS_FAISS_PATH}" +echo "AMS_UMPIRE_PATH = ${AMS_UMPIRE_PATH}" +echo "AMS_CUDA_ARCH = ${AMS_CUDA_ARCH}" +echo "AMS_HIREDIS_PATH = ${AMS_HIREDIS_PATH}" +echo "AMS_REDIS_PLUS_PLUS_PATH = ${AMS_REDIS_PLUS_PLUS_PATH}" +echo "AMS_HDF5_PATH = ${AMS_HDF5_PATH}" + +export AMS_TORCH_PATH=$(echo ${AMS_TORCH_PATH}/lib/python3.*/site-packages/torch/share/cmake/Torch) + +echo "(for cmake) AMS_TORCH_PATH = ${AMS_TORCH_PATH}" + diff --git a/tests/AMSlib/CMakeLists.txt b/tests/AMSlib/CMakeLists.txt index 73135c89..a3a9c339 100644 --- a/tests/AMSlib/CMakeLists.txt +++ b/tests/AMSlib/CMakeLists.txt 
@@ -214,13 +214,14 @@ if(WITH_FAISS) BUILD_TEST(ams_hdcache_test test_hdcache.cpp) ADDTEST(ams_hdcache_test AMSHDCacheMeanPolicyDouble ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "double" 1 10 4.0 4 5) - # The max case fails on DEVICE. We should be aware about this when adding support for CI for GPUs - ADDTEST(ams_hdcache_test AMSHDCacheMaxPolicyDouble ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "double" 2 10 4.0 4 5) - ADDTEST(ams_hdcache_test AMSHDCacheMeanPolicySingle ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "single" 1 10 4.0 4 5) - # The max case fails on DEVICE. We should be aware about this when adding support for CI for GPUs + ADDTEST(ams_hdcache_test AMSHDCacheMaxPolicyDouble ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "double" 2 10 4.0 4 5) ADDTEST(ams_hdcache_test AMSHDCacheMaxPolicySingle ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "single" 2 10 4.0 4 5) + # The max-policy tests fail on DEVICE, so their ::DEVICE variants are disabled when WITH_CUDA is on. + if (WITH_CUDA) + set_tests_properties(AMSHDCacheMaxPolicySingle::DEVICE AMSHDCacheMaxPolicyDouble::DEVICE PROPERTIES DISABLED TRUE) + endif() if(WITH_TORCH) if (WITH_EXAMPLES)