Provide tests for SIG/HPC slurm packages

This MR provides a very, very basic test suite for the Slurm packages
built by the HPC SIG. It checks the following:

- Necessary packages for a single-node Slurm instance install
  successfully from the SIG/HPC repository
- A job can be scheduled and executed to completion
- A job can be scheduled and then cancelled
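For reference, the interactive workflow the suite automates looks roughly like this (a sketch only; the real assertions live in `tests/slurm.pm` below):

```
sinfo                # single idle node in the debug partition
sbatch job.sh        # submit a batch job; it should appear in squeue
squeue               # ...and drop out of the queue once its sleep finishes
sbatch job.sh        # submit a second job
scancel <jobid>      # cancel it and confirm it has left the queue
```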

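To load the updated templates and schedule the new test suites against the Rocky 8.8 and 9.2 DVD ISOs: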
```
./fifloader.py --clean --load templates.fif.json
openqa-cli api -X POST isos ISO=Rocky-8.8-x86_64-dvd.iso ARCH=x86_64 DISTRI=rocky FLAVOR=dvd-iso VERSION=8.8 CURRREL=8 BUILD=-$(date +%Y%d%m).0-slurm-8.8 TEST=slurm22,slurm23
openqa-cli api -X POST isos ISO=Rocky-9.2-x86_64-dvd.iso ARCH=x86_64 DISTRI=rocky FLAVOR=dvd-iso VERSION=9.2 CURRREL=9 BUILD=-$(date +%Y%d%m).0-slurm-9.2 TEST=slurm22,slurm23
```
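The date-based BUILD tag only needs to distinguish scheduling runs; CURRREL is what the test module reads to apply release-specific steps (the PowerTools repository is only enabled on Rocky 8).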

- [x] My code follows the style guidelines of this project
- [x] I have performed a self-review of my own code
- [x] I have commented my code, particularly in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
- [x] My changes generate no new warnings
- [x] Any dependent changes have been merged and published in downstream modules
Commit 8becb62887 (parent f4c33a4f21) by Al Bowles, 2023-07-25 15:04:07 -05:00
2 changed files with 100 additions and 5 deletions

templates.fif.json

@@ -1418,6 +1418,36 @@
"WORKER_CLASS": "tap" "WORKER_CLASS": "tap"
} }
}, },
"slurm22": {
"profiles": {
"rocky-dvd-iso-aarch64-*-aarch64": 10,
"rocky-dvd-iso-x86_64-*-64bit": 10
},
"settings": {
"BOOTFROM": "c",
"HDD_1": "disk_%FLAVOR%_%MACHINE%.qcow2",
"POSTINSTALL": "slurm",
"START_AFTER_TEST": "%DEPLOY_UPLOAD_TEST%",
"ROOT_PASSWORD": "weakpassword",
"USER_LOGIN": "false",
"SLURM_VERSION": "22"
}
},
"slurm23": {
"profiles": {
"rocky-dvd-iso-aarch64-*-aarch64": 10,
"rocky-dvd-iso-x86_64-*-64bit": 10
},
"settings": {
"BOOTFROM": "c",
"HDD_1": "disk_%FLAVOR%_%MACHINE%.qcow2",
"POSTINSTALL": "slurm",
"START_AFTER_TEST": "%DEPLOY_UPLOAD_TEST%",
"ROOT_PASSWORD": "weakpassword",
"USER_LOGIN": "false",
"SLURM_VERSION": "23"
}
},
"support_server": { "support_server": {
"profiles": { "profiles": {
"rocky-dvd-iso-aarch64-*-aarch64": 10, "rocky-dvd-iso-aarch64-*-aarch64": 10,

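The two new suites differ only in SLURM_VERSION, which tests/slurm.pm reads via get_var to pick which Slurm packages to install; assuming the usual fif conventions, POSTINSTALL=slurm is what loads the new test module once the START_AFTER_TEST deployment job has finished.
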
tests/slurm.pm (new file)

@@ -0,0 +1,65 @@
use base "installedtest";
use strict;
use testapi;
use utils;

sub slurm_setup {
    # install HPC repository
    my $version = get_var("SLURM_VERSION");
    assert_script_run "dnf -y install rocky-release-hpc", 120;
    # Set up munge
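    # (munge is the shared-key authentication service Slurm's daemons use to
    # talk to each other)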
assert_script_run "dnf -y install munge", 120;
assert_script_run "dd if=/dev/urandom bs=1 count=1024 >/etc/munge/munge.key";
assert_script_run "chmod 400 /etc/munge/munge.key";
assert_script_run "chown munge.munge /etc/munge/munge.key";
assert_script_run "systemctl enable --now munge.service";
# install slurm
if (get_var("CURRREL") eq '8') {
assert_script_run "dnf config-manager --set-enabled powertools";
}
assert_script_run "dnf install -y slurm$version-slurmdbd slurm$version-slurmrestd slurm$version-slurmctld slurm$version-slurmd";
# Since this is a single node system, we don't have to modify the conf files. We will for larger multi-node tests.
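    # Purely illustrative sketch of the kind of slurm.conf entries a multi-node
    # variant would need (hostnames and node counts here are made up):
    #   SlurmctldHost=head-node
    #   NodeName=compute[01-02] CPUs=4 State=UNKNOWN
    #   PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP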
    # start services
    assert_script_run "systemctl enable --now slurmctld slurmdbd slurmrestd slurmd";
}

sub run {
    my $self = shift;
    # do all the install stuff
    slurm_setup();
    # if everything is configured right, sinfo should show the following output
    # $ sinfo
    # PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
    # debug*       up   infinite      1   idle localhost
    validate_script_output "sinfo", sub { m/debug.*localhost/ };
    # write a boring job script
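    # job.sh, once written, reads:
    #   #!/bin/bash
    #   #SBATCH --job-name=antarctica_time
    #   sleep 120
    #   TZ=NZST date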
assert_script_run "echo '#!/bin/bash\n#SBATCH --job-name=antarctica_time\nsleep 120\nTZ=NZST date' > job.sh";
## schedule a job and run it to completion
assert_script_run "sbatch job.sh";
validate_script_output "squeue", sub { m/antar/ };
sleep 121;
# after 121 seconds, job should have completed and no longer exist in the queue
validate_script_output "squeue", sub { $_ !~ m/antar/ };
## cancel a job
assert_script_run "sbatch job.sh";
validate_script_output "squeue", sub { m/antar/ };
assert_script_run "scancel 2";
# job should no longer be in the queue
validate_script_output "squeue", sub { $_ !~ m/antar/ };
}

sub test_flags {
    return {fatal => 1};
}
1;
# vim: set sw=4 et: