Skip to content

Commit

Permalink
Cleanup unused code and variables, fix wcoss2 issue starting ecflow
Browse files Browse the repository at this point in the history
  • Loading branch information
BrianCurtis-NOAA committed Apr 30, 2024
1 parent 04bbc15 commit d7fcb15
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 89 deletions.
75 changes: 13 additions & 62 deletions tests/rt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -686,17 +686,13 @@ case ${MACHINE_ID} in
module load ecflow/5.6.0.13
fi
module load intel/19.1.3.304 python/3.8.6
if [[ "${ECFLOW:-false}" == true ]] ; then
# ECF_ROOT=${ECF_ROOT:-}
# ECFLOW_START="${ECF_ROOT}/scripts/server_check.sh"
# ECFLOW_STOP="${ECF_ROOT}/bin/ecflow_stop.sh"
export ECF_OUTPUTDIR="${PATHRT}/ecf_outputdir"
export ECF_COMDIR="${PATHRT}/ecf_comdir"
rm -rf "${ECF_OUTPUTDIR}" "${ECF_COMDIR}"
mkdir -p "${ECF_OUTPUTDIR}"
mkdir -p "${ECF_COMDIR}"
# export ECFLOW_START ECFLOW_STOP
fi
#if [[ "${ECFLOW:-false}" == true ]] ; then
#export ECF_OUTPUTDIR="${PATHRT}/ecf_outputdir"
#export ECF_COMDIR="${PATHRT}/ecf_comdir"
#rm -rf "${ECF_OUTPUTDIR}" "${ECF_COMDIR}"
#mkdir -p "${ECF_OUTPUTDIR}"
#mkdir -p "${ECF_COMDIR}"
#fi
export colonifnco=":output" # hack

DISKNM="/lfs/h2/emc/nems/noscrub/emc.nems/RT"
Expand All @@ -716,9 +712,6 @@ case ${MACHINE_ID} in
if [[ "${ROCOTO:-false}" == true ]] ; then
module use /ncrc/proj/epic/rocoto/modulefiles
module load rocoto
# ROCOTORUN=$(command -v rocotorun)
# ROCOTOSTAT=$(command -v rocotostat)
# ROCOTOCOMPLETE=$(command -v rocotocomplete)
ROCOTO_SCHEDULER="slurm"
fi

Expand All @@ -731,8 +724,6 @@ case ${MACHINE_ID} in
module load gcc/12.2.0
if [[ "${ECFLOW:-false}" == true ]] ; then
module load ecflow/5.8.4
# ECFLOW_START=/ncrc/proj/epic/spack-stack/ecflow-5.8.4/bin/ecflow_start.sh
# ECFLOW_STOP=/ncrc/proj/epic/spack-stack/ecflow-5.8.4/bin/ecflow_stop.sh
ECF_HOST=$(hostname)
ECF_PORT=$(( $(id -u) + 1500 ))
export ECF_PORT ECF_HOST
Expand All @@ -753,17 +744,11 @@ case ${MACHINE_ID} in
set -x
if [[ "${ROCOTO:-false}" == true ]] ; then
module load rocoto
# ROCOTORUN=$(command -v rocotorun)
# ROCOTOSTAT=$(command -v rocotostat)
# ROCOTOCOMPLETE=$(command -v rocotocomplete)
ROCOTO_SCHEDULER=slurm
fi

if [[ "${ECFLOW:-false}" == true ]] ; then
module load ecflow/5.11.4
# ECFLOW_START="$(command -v ecflow_start.sh)"
# ECFLOW_STOP="$(command -v ecflow_stop.sh)"
# export ECFLOW_START ECFLOW_STOP
fi

QUEUE="batch"
Expand All @@ -786,17 +771,12 @@ case ${MACHINE_ID} in

if [[ "${ROCOTO:-false}" == true ]] ; then
module load contrib rocoto
# ROCOTORUN=$(command -v rocotorun)
# ROCOTOSTAT=$(command -v rocotostat)
# ROCOTOCOMPLETE=$(command -v rocotocomplete)
ROCOTO_SCHEDULER="slurm"
fi

module use /work/noaa/epic/role-epic/spack-stack/orion/modulefiles
if [[ "${ECFLOW:-false}" == true ]] ; then
module load ecflow/5.8.4
# ECFLOW_START="/work/noaa/epic/role-epic/spack-stack/orion/ecflow-5.8.4/bin/ecflow_start.sh"
# ECFLOW_STOP="/work/noaa/epic/role-epic/spack-stack/orion/ecflow-5.8.4/bin/ecflow_stop.sh"
ECF_HOST=$(hostname)
ECF_PORT="$(( $(id -u) + 1500 ))"
export ECF_PORT ECF_HOST
Expand All @@ -817,17 +797,12 @@ case ${MACHINE_ID} in
set -x
if [[ "${ROCOTO:-false}" == true ]] ; then
module load contrib rocoto
# ROCOTORUN=$(command -v rocotorun)
# ROCOTOSTAT=$(command -v rocotostat)
# ROCOTOCOMPLETE=$(command -v rocotocomplete)
ROCOTO_SCHEDULER="slurm"
fi

module use /work/noaa/epic/role-epic/spack-stack/hercules/modulefiles
if [[ "${ECFLOW:-false}" == true ]] ; then
module load ecflow/5.8.4
# ECFLOW_START="/work/noaa/epic/role-epic/spack-stack/hercules/ecflow-5.8.4/bin/ecflow_start.sh"
# ECFLOW_STOP="/work/noaa/epic/role-epic/spack-stack/hercules/ecflow-5.8.4/bin/ecflow_stop.sh"
ECF_HOST=$(hostname)
ECF_PORT="$(( $(id -u) + 1500 ))"
export ECF_PORT ECF_HOST
Expand Down Expand Up @@ -857,17 +832,11 @@ case ${MACHINE_ID} in

if [[ "${ROCOTO:-false}" == true ]] ; then
module load rocoto
# ROCOTORUN=$(command -v rocotorun)
# ROCOTOSTAT=$(command -v rocotostat)
# ROCOTOCOMPLETE=$(command -v rocotocomplete)
ROCOTO_SCHEDULER="slurm"
fi

if [[ "${ECFLOW:-false}" == true ]] ; then
module load ecflow/5.11.4
# ECFLOW_START=/apps/ecflow/5.11.4/bin/ecflow_start.sh
# ECFLOW_STOP=/apps/ecflow/5.11.4/bin/ecflow_stop.sh
# export ECFLOW_START ECFLOW_STOP
fi

module use /mnt/lfs4/HFIP/hfv3gfs/role.epic/spack-stack/spack-stack-1.5.0/envs/unified-env-rocky8/install/modulefiles/Core
Expand All @@ -889,9 +858,6 @@ case ${MACHINE_ID} in
set -x
if [[ "${ROCOTO:-false}" == true ]] ; then
module load rocoto/1.3.2
# ROCOTORUN=$(command -v rocotorun)
# ROCOTOSTAT=$(command -v rocotostat)
# ROCOTOCOMPLETE=$(command -v rocotocomplete)
ROCOTO_SCHEDULER=slurm
fi
if [[ "${ECFLOW:-false}" == true ]] ; then
Expand All @@ -902,8 +868,6 @@ case ${MACHINE_ID} in
module use /data/prod/jedi/spack-stack/modulefiles
if [[ "${ECFLOW:-false}" == true ]] ; then
module load ecflow/5.8.4
# ECFLOW_START="/data/prod/jedi/spack-stack/ecflow-5.8.4/bin/ecflow_start.sh"
# ECFLOW_STOP="/data/prod/jedi/spack-stack/ecflow-5.8.4/bin/ecflow_stop.sh"
ECF_HOST=$(hostname)
ECF_PORT="$(( $(id -u) + 1500 ))"
export ECF_PORT ECF_HOST
Expand Down Expand Up @@ -937,8 +901,6 @@ case ${MACHINE_ID} in
module load stack-python/3.10.8
# export PYTHONPATH=/glade/p/ral/jntp/tools/miniconda3/4.8.3/envs/ufs-weather-model/lib/python3.8/site-packages:/glade/p/ral/jntp/tools/miniconda3/4.8.3/lib/python3.8/site-packages
if [[ "${ECFLOW:-false}" == true ]] ; then
# ECFLOW_START=/glade/work/epicufsrt/contrib/spack-stack/derecho/ecflow-5.8.4/bin/ecflow_start.sh
# ECFLOW_STOP=/glade/work/epicufsrt/contrib/spack-stack/derecho/ecflow-5.8.4/bin/ecflow_stop.sh
ECF_HOST=$(hostname)
ECF_PORT=$(( $(id -u) + 1500 ))
export ECF_PORT ECF_HOST
Expand All @@ -957,9 +919,6 @@ case ${MACHINE_ID} in


if [[ "${ROCOTO:-false}" == true ]] ; then
# ROCOTORUN=$(command -v rocotorun)
# ROCOTOSTAT=$(command -v rocotostat)
# ROCOTOCOMPLETE=$(command -v rocotocomplete)
ROCOTO_SCHEDULER="pbspro"
fi
;;
Expand Down Expand Up @@ -1006,9 +965,6 @@ case ${MACHINE_ID} in

if [[ "${ROCOTO:-false}" == true ]] ; then
module load rocoto/1.3.3
# ROCOTORUN=$(command -v rocotorun)
# ROCOTOSTAT=$(command -v rocotostat)
# ROCOTOCOMPLETE=$(command -v rocotocomplete)
ROCOTO_SCHEDULER=slurm
fi

Expand Down Expand Up @@ -1156,25 +1112,20 @@ fi
if [[ ${ECFLOW} == true ]]; then
echo "Verifying ECFLOW support..."
case ${MACHINE_ID} in
wcoss2|acorn)
ECFLOW_START="$(command -v server_check.sh)"
ECFLOW_STOP="$(command -v ecflow_stop.sh)"
;;
expanse|stampede|noaacloud)
die "ECFLOW not supported on this machine, please do not use '-e'."
;;
*)
ECFLOW_START="$(command -v ecflow_start.sh)"
ECFLOW_STOP="$(command -v ecflow_stop.sh)"
;;
esac
export ECFLOW_START ECFLOW_STOP
export ECFLOW_START

export ECF_OUTPUTDIR="${PATHRT}/ecf_outputdir"
export ECF_COMDIR="${PATHRT}/ecf_comdir"
rm -rf "${ECF_OUTPUTDIR}" "${ECF_COMDIR}"
mkdir -p "${ECF_OUTPUTDIR}"
mkdir -p "${ECF_COMDIR}"
#export ECF_OUTPUTDIR="${PATHRT}/ecf_outputdir"
#export ECF_COMDIR="${PATHRT}/ecf_comdir"
#rm -rf "${ECF_OUTPUTDIR}" "${ECF_COMDIR}"
#mkdir -p "${ECF_OUTPUTDIR}"
#mkdir -p "${ECF_COMDIR}"
# Default maximum number of compile and run jobs
MAX_BUILDS=10 #Max build jobs
MAX_JOBS=30 #Max test/run jobs
Expand Down
48 changes: 21 additions & 27 deletions tests/rt_utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -658,13 +658,15 @@ ecflow_run() {
fi

# Start the ecflow_server
echo "rt_utils.sh: Checking status of the ecflow_server..."
set +e
ecflow_client --ping --host="${ECF_HOST}" --port="${ECF_PORT}"
not_running=$?
set -e

if [[ ${not_running} -eq 1 ]]; then
echo "ecflow_server is NOT running on ${ECF_HOST}:${ECF_PORT}"
echo "rt_utils.sh: ecflow_server is not running on ${ECF_HOST}:${ECF_PORT}"
echo "rt_utils.sh: attempting to start ecflow_server..."

if [[ ${MACHINE_ID} == wcoss2 || ${MACHINE_ID} == acorn ]]; then
#shellcheck disable=SC2029
Expand All @@ -675,34 +677,34 @@ ecflow_run() {
else
${ECFLOW_START} -p "${ECF_PORT}" -d "${RUNDIR_ROOT}/ecflow_server"
fi
echo "Since this script is starting the ecflow_server, we will stop it at the end"
export STOP_ECFLOW_AT_END=true
# Try pinging ecflow server now, and erroring out if not there.
set +e
ecflow_client --ping --host="${ECF_HOST}" --port="${ECF_PORT}"
not_running=$?
set -e

if [[ ${not_running} -eq 1 ]]; then
echo "ERROR: Failure to start ecflow, exiting..."
echo "rt_utils.sh: ERROR -- Failure to start ecflow. Exiting..."
exit 1
fi
else
echo "ecflow_server is already running on ${ECF_HOST}:${ECF_PORT}"
echo "rt_utils.sh: Confirmed: ecflow_server is running on ${ECF_HOST}:${ECF_PORT}"
fi

ECFLOW_RUNNING=true
echo "rt_utils.sh: Starting ECFLOW tasks..."
set +e
ecflow_client --load="${ECFLOW_RUN}/${ECFLOW_SUITE}.def" --host="${ECF_HOST}" --port="${ECF_PORT}"
ecflow_client --begin="${ECFLOW_SUITE}" --host="${ECF_HOST}" --port="${ECF_PORT}"
ecflow_client --restart --host="${ECF_HOST}" --port="${ECF_PORT}"
set -e
sleep 10

active_tasks=1
sleep 10
max_active_tasks=$( ecflow_client --get_state "/${ECFLOW_SUITE}" )
max_active_tasks=$( grep "task " <<< "${max_active_tasks}" )
max_active_tasks=$( grep -cP 'state:active|state:submitted|state:queued' <<< "${max_active_tasks}" )
echo "rt_utils.sh: Total number of tasks processed -- ${max_active_tasks}"
while [[ "${active_tasks}" -ne 0 ]]
do
sleep 10 & wait $!
Expand All @@ -716,46 +718,38 @@ ecflow_run() {
done

sleep 65 # wait one ECF_INTERVAL plus 5 seconds
echo "rt_utils.sh: ECFLOW tasks completed, cleaning up suite"
set +e
ecflow_client --delete=force yes "/${ECFLOW_SUITE}"
set -e
sleep 5
}

# ecflow_kill: suspend, kill, and force-delete the ecFlow suite "/${ECFLOW_SUITE}".
# Reads: ECFLOW_RUNNING (treated as "false" when unset), ECFLOW_SUITE.
# Returns early, after the first log line, unless ECFLOW_RUNNING is "true".
# NOTE(review): this listing is a diff view whose +/- markers were lost, so the
# guard / suspend / kill / delete sequence appears twice below. Presumably one
# copy is the pre-commit text and the other the post-commit text -- confirm
# against the real rt_utils.sh before relying on it.
ecflow_kill() {
echo "rt_utils.sh: Killing ECFLOW Workflow..."
# Nothing to clean up unless ECFLOW_RUNNING was set to true.
[[ ${ECFLOW_RUNNING:-false} == true ]] || return
# errexit is turned off around the client calls; presumably so that a failing
# ecflow_client command does not abort the cleanup -- TODO confirm intent.
set +e
ecflow_client --suspend "/${ECFLOW_SUITE}"
ecflow_client --kill "/${ECFLOW_SUITE}"
# Fixed 20 s pause between the kill request and the forced delete.
sleep 20
ecflow_client --delete=force yes "/${ECFLOW_SUITE}"
set -e
# Second copy of the same sequence (see NOTE(review) in the header).
[[ ${ECFLOW_RUNNING:-false} == true ]] || return
echo "rt_utils.sh: Deleting ECFLOW suite: ${ECFLOW_SUITE}"
set +e
ecflow_client --suspend "/${ECFLOW_SUITE}"
ecflow_client --kill "/${ECFLOW_SUITE}"
sleep 20
ecflow_client --delete=force yes "/${ECFLOW_SUITE}"
set -e
}

# ecflow_stop: decide whether the ecflow_server should be shut down, and do so.
# Reads: ECFLOW_RUNNING (treated as "false" when unset), STOP_ECFLOW_AT_END,
#        MACHINE_ID, ECF_HOST, ECF_PORT, ECFLOW_STOP.
# Writes: SUITES (not declared local, so it is visible to the caller's shell).
# Returns early, after the first log line, unless ECFLOW_RUNNING is "true".
# NOTE(review): this listing is a diff view whose +/- markers were lost. Two
# shutdown paths are shown back to back (an ecflow_client halt/terminate path
# and an ${ECFLOW_STOP} script path); presumably only one of them exists in the
# committed file -- confirm against the real rt_utils.sh.
ecflow_stop() {
echo "rt_utils.sh: Stopping ECFLOW Workflow..."
[[ ${ECFLOW_RUNNING:-false} == true ]] || return
echo "rt_utils.sh: Checking whether to stop ecflow_server..."
# errexit is off until the final "set -e", so failures of the commands below
# (including grep finding no match) do not abort the script.
set +e
# Fetch the server's definition output, then keep only lines beginning with
# "suite"; an empty result is taken to mean no suites are loaded.
SUITES=$( ecflow_client --get )
SUITES=$( grep "^suite" <<< "${SUITES}" )
echo "SUITES=${SUITES}"
if [[ -z "${SUITES}" ]]; then
echo "rt_utils.sh: No other suites running, stopping ecflow_server"
# Shutdown via the client: halt, checkpoint, then terminate the server.
ecflow_client --halt=yes
ecflow_client --check_pt
ecflow_client --terminate=yes
fi
# NOTE(review): STOP_ECFLOW_AT_END is expanded with no default; if the script
# runs with "set -u" and the variable is unset this would error -- verify.
if [[ ${STOP_ECFLOW_AT_END} == true ]]; then
echo "rt_utils.sh: Stopping ECFLOW Server..."
case ${MACHINE_ID} in
wcoss2|acorn|hera|jet)
# On these machines the stop script is run on ECF_HOST through a login shell
# over ssh; ECFLOW_STOP and ECF_PORT are expanded locally before ssh runs,
# which is what SC2029 warns about and why it is disabled here.
#shellcheck disable=SC2029
ssh "${ECF_HOST}" "bash -l -c \"${ECFLOW_STOP} -p ${ECF_PORT}\""
;;
*)
# Everywhere else the stop script is invoked directly on the current host.
${ECFLOW_STOP} -p "${ECF_PORT}"
;;
esac
else
# NOTE(review): this message talks about suites, but as shown it is the else
# branch of the STOP_ECFLOW_AT_END test -- likely a diff-merge artifact.
echo "rt_utils.sh: Potential suites running, NOT stopping ecflow_server..."
fi
set -e
}

0 comments on commit d7fcb15

Please sign in to comment.