From ca6fa95e3ebe4b54b94d0ed7eebd8250b8d32a73 Mon Sep 17 00:00:00 2001 From: Shawn Dawson Date: Mon, 28 Aug 2023 14:09:22 -0700 Subject: [PATCH] Feature/dawson/atslite1 args (#150) * First iteration of running on same node under flux * Typo on option fixed * Updating to test --level and --filter options with atslite1 * Further Updating to test --level and --filter options with atslite1 * Fix issue reported by Ben Liu regarding lrun and old_default and mpibind --------- Co-authored-by: MishaZakharchanka --- ats/atsMachines/fluxScheduled.py | 17 +++++++++++++ ats/atsMachines/lsf_asq.py | 10 ++++---- ats/tools/atslite1.py | 7 ++++-- ats/tools/atslite3.py | 7 ++++-- test/HelloATS/READ.ME | 41 ++++++++++---------------------- test/HelloATS/create_test_ats.py | 26 ++++++++++++++++---- 6 files changed, 65 insertions(+), 43 deletions(-) diff --git a/ats/atsMachines/fluxScheduled.py b/ats/atsMachines/fluxScheduled.py index b4abe4e..13aaf91 100755 --- a/ats/atsMachines/fluxScheduled.py +++ b/ats/atsMachines/fluxScheduled.py @@ -62,6 +62,10 @@ def init(self): self.coresPerGPU = 0 self.coresPerNode = int(self.numCores / self.numNodes) + # Strings used to determine which node a user wants the test to run + # Used with same_node var + self.node_list = [] + # Maintain for backwards compatability with projects # Allow user to over-ride the coresPerNode # Other schedulers call this npMax, but for flux we are calling this coresPerNode @@ -263,6 +267,19 @@ def calculateCommandList(self, test): ret.append(f"-t{max_time}") + + + import pprint + + same_node = test.options.get('same_node', None) + if same_node is not None: + if same_node not in self.node_list: + self.node_list.append(same_node) + pprint.pprint(self.node_list) + print(f"This is the node that we are trying to run on:{self.node_list.index(same_node) % self.numNodes}") + ret.append(f"--requires=-rank:{self.node_list.index(same_node) % self.numNodes}") + + """ Need to set -n{np} and -c{test.cpus_per_task}. 
But we also need to account for accessing GPUS using flux. In testing flux outside of ATS it is evident that one needs to increase the -c option diff --git a/ats/atsMachines/lsf_asq.py b/ats/atsMachines/lsf_asq.py index 97c6ab3..7c5976a 100644 --- a/ats/atsMachines/lsf_asq.py +++ b/ats/atsMachines/lsf_asq.py @@ -445,7 +445,7 @@ def calculateCommandList(self, test): "--env", str_omp_proc_bind, "-N", str(int(test.num_nodes)), "-n", str(np) - ] + str_lrun_jsrun_args.split() + str_mpibind + commandList + ] + str_lrun_jsrun_args.split() + [ str_mpibind ] + commandList else : return ["lrun", str_smpi, @@ -453,7 +453,7 @@ def calculateCommandList(self, test): "--env", str_omp_num_threads, "--env", str_omp_proc_bind, "-n", str(np) - ] + str_lrun_jsrun_args.split() + str_mpibind + commandList + ] + str_lrun_jsrun_args.split() + [ str_mpibind ] + commandList else: if ( test.num_nodes > 0) : return ["lrun", @@ -546,7 +546,7 @@ def calculateCommandList(self, test): if test.jsrun_bind == "unset": if self.mpibind: - str_lrun_jsrun_args = str_lrun_jsrun_args + " -b none " + str_mpibind + str_lrun_jsrun_args = str_lrun_jsrun_args + " -b none " + str_mpibind else: if self.old_defaults: str_lrun_jsrun_args = str_lrun_jsrun_args + " -b rs " @@ -554,7 +554,7 @@ def calculateCommandList(self, test): str_lrun_jsrun_args = str_lrun_jsrun_args + " -b rs " else: if self.mpibind: - str_lrun_jsrun_args = str_lrun_jsrun_args + " -b " + test.jsrun_bind + " " + str_mpibind + str_lrun_jsrun_args = str_lrun_jsrun_args + " -b " + test.jsrun_bind + " " + str_mpibind else: str_lrun_jsrun_args = str_lrun_jsrun_args + " -b " + test.jsrun_bind @@ -594,7 +594,7 @@ def calculateCommandList(self, test): if str_lrun_jsrun_args == "unset": str_lrun_jsrun_args = str_mpibind else: - str_lrun_jsrun_args = str_lrun_jsrun_args + " " + str_mpibind + str_lrun_jsrun_args = str_lrun_jsrun_args + " " + str_mpibind cpu_per_rs = np * test.cpus_per_task diff --git a/ats/tools/atslite1.py 
b/ats/tools/atslite1.py index ada970e..fb45908 100755 --- a/ats/tools/atslite1.py +++ b/ats/tools/atslite1.py @@ -30,10 +30,13 @@ def main(): clean_found = False exclusive_found = False nosub_found = False + level_found = False for index, arg in enumerate(sys.argv): - #print arg - if (arg.find('=') >= 0): + # print("SAD DEBUG index=%i arg=%s" % (index, arg)) + if (arg.startswith('level=')): + level_found = True + elif (arg.find('=') >= 0): (key, val) = arg.split('=',1) sys.argv[index] = key + '="' + val + '"' elif (arg.find('exclusive') >= 0): diff --git a/ats/tools/atslite3.py b/ats/tools/atslite3.py index 5e926ae..6b9abeb 100755 --- a/ats/tools/atslite3.py +++ b/ats/tools/atslite3.py @@ -28,9 +28,12 @@ def main(): clean_found = False exclusive_found = False nosub_found = False + level_found = False + for index, arg in enumerate(sys.argv): - #print arg - if (arg.find('=') >= 0): + if (arg.startswith('level=')): + level_found = True + elif (arg.find('=') >= 0): (key, val) = arg.split('=',1) sys.argv[index] = key + '="' + val + '"' elif (arg.find('exclusive') >= 0): diff --git a/test/HelloATS/READ.ME b/test/HelloATS/READ.ME index 3de4ffe..d685278 100644 --- a/test/HelloATS/READ.ME +++ b/test/HelloATS/READ.ME @@ -30,19 +30,24 @@ It may also present some which are HALTED if errors are detected in slurm or mpi init by ATS. -------------------------------------------------------------------------------- -Toss 3 (rzgenie, etc.). 
Only use Slurm +Toss 4 Testing on slurm based toss4 machines such as rzwhippet -------------------------------------------------------------------------------- export PATH=${PATH}:/usr/gapps/ats/scripts - module load python/3.8.2 - // Modify this line to be the ats install you are testing + module load python/3.9.12 export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH - mpicc hello_ats.c <- build the code - ./create_test_ats.py <- create the ats test file - atslite1 test.ats <- test using slurm + mpicc hello_ats.c <- build the code + ./create_test_ats.py <- create the ats test file + atslite1 test.ats <- test using slurm + export -n MACHINE_TYPE + + # POODLE TEST LINE (or any other system without cross node MPI) + salloc -N1 -p pdebug --exclusive + atslite1 test.ats + exit - The end of the run should include: + ATS SUMMARY May 16, 2023 14:20:16 FAILED: 10 a(a.out_1), a(a.out_3), a(a.out_9), a(a.out_11), ats_check_log#7, ats_check_log#8, a(a.out_17), a(a.out_19), a(a.out_25), a(a.out_27) PASSED: 18 SKIPPED: 8 @@ -71,28 +76,6 @@ Toss 4 Cray rzvernal with rocm 5.5 PASSED: 18 SKIPPED: 8 --------------------------------------------------------------------------------- -Toss 4 Testing on slurm based toss4 machines such as rzwhippet --------------------------------------------------------------------------------- - export PATH=${PATH}:/usr/gapps/ats/scripts - module load python/3.9.12 - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH - - mpicc hello_ats.c <- build the code - ./create_test_ats.py <- create the ats test file - atslite1 test.ats <- test using slurm - export -n MACHINE_TYPE - - # POODLE TEST LINE (or any other system without cross node MPI) - salloc -N1 -p pdebug --exclusive - atslite1 test.ats - exit - - - ATS SUMMARY May 16, 2023 14:20:16 - FAILED: 10 a(a.out_1), a(a.out_3), a(a.out_9), a(a.out_11), ats_check_log#7, ats_check_log#8, a(a.out_17), a(a.out_19), a(a.out_25), a(a.out_27) - PASSED: 18 - SKIPPED: 8 
-------------------------------------------------------------------------------- Blueos (rzansel) Uses LSF diff --git a/test/HelloATS/create_test_ats.py b/test/HelloATS/create_test_ats.py index 5b57244..f75f22f 100755 --- a/test/HelloATS/create_test_ats.py +++ b/test/HelloATS/create_test_ats.py @@ -27,30 +27,46 @@ def get_test_lines_generator(): # Duplicate items in nprocs: [1, 2, ..., 64] --> [1, 1, 2, 2, ..., 64, 64] nprocs = sorted(2 * [1, 2, 3, 4, 5, 6, 7, 8, 16 ]) - test_line = "t%d=test (executable='./a.out', clas='%s', " \ + test_line = "t%d=test (executable='./a.out', level=20, clas='%s', " \ "label='a.out_%d', np=%d, sandbox=False)\n" return (test_line % (test_num, arg_, test_num, num_proc) for test_num, arg_, num_proc in zip(range(1, 44, 2), clas, nprocs)) - def get_testif_lines_generator(): """Returns a generator containing testifs (conditional tests).""" testif_line = "t%d=testif(t%d, executable = my_checker, " \ - "clas = t%d.outname, nosrun=True)\n" + "level=20, clas = t%d.outname, nosrun=True)\n" return (testif_line % (testif_num, testif_num - 1, testif_num - 1) for testif_num in range(2, 45, 2)) +def get_test_lines_generator_level_10(): + """Returns a generator containing independent tests.""" + clas = itertools.cycle(['', 'arg1 arg2 arg3']) + nprocs = sorted(2 * [1, 2, 3, 4 ]) + labels = ('the', 'cat', 'in', 'hat', 'chased', 'big', 'red', 'fox') + + test_line = "test(executable='./a.out', level=10, clas='%s', " \ + "label='%s', np=%d, nt=1)\n" + return (test_line % (arg_, label, num_proc) + for arg_, label, num_proc in zip(clas, labels, nprocs) ) + + if __name__ == "__main__": TEST_ATS = "test.ats" - FILE_HEADER = get_file_header() - TEST_LINES = get_test_lines_generator() + FILE_HEADER = get_file_header() + TEST_LINES = get_test_lines_generator() TESTIF_LINES = get_testif_lines_generator() + TEST10_LINES = get_test_lines_generator_level_10() with open(TEST_ATS, 'w') as ofp: ofp.write(FILE_HEADER) + for test, testif in zip(TEST_LINES, 
TESTIF_LINES): ofp.write(test) ofp.write(testif) + for test in TEST10_LINES: + ofp.write(test) + print(f"Most Excellent! Created ats test file {TEST_ATS}\n")