lol

slurm service: add tests

+80
+80
nixos/tests/slurm.nix
# NixOS VM test for the Slurm service.
#
# Boots one machine named "control" plus three machines built from the same
# compute-node configuration, installs an identical munge key on all of them,
# restarts the Slurm daemons and finally checks that a job submitted from
# "control" is spread over three distinct hosts.
import ./make-test.nix ({ pkgs, ... }:
let
  # Hard-coded key: it only needs to be identical on every machine of the test.
  mungekey = "mungeverryweakkeybuteasytointegratoinatest";

  # Slurm settings shared verbatim by every machine in the cluster.
  slurmconfig = {
    client.enable = true;
    controlMachine = "control";
    nodeName = ''
      control
      NodeName=node[1-3] CPUs=1 State=UNKNOWN
    '';
    partitionName = "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP";
  };
in {
  name = "slurm";

  nodes =
    let
      # Configuration used for node1..node3.
      computeNode =
        { config, pkgs, ... }:
        {
          # TODO: the slurmd and slurmctld ports should be configurable and
          # automatically allowed by the firewall.
          networking.firewall.enable = false;
          services.munge.enable = true;
          services.slurm = slurmconfig;
        };
    in {
      # Same as a compute node, with the Slurm server additionally enabled.
      control =
        { config, pkgs, ... }:
        {
          networking.firewall.enable = false;
          services.munge.enable = true;
          services.slurm = {
            server.enable = true;
          } // slurmconfig;
        };
      node1 = computeNode;
      node2 = computeNode;
      node3 = computeNode;
    };

  testScript =
    ''
      startAll;

      # Set up authentication across the cluster: every machine gets the same
      # munge key, then munged is restarted so that it picks the key up.
      foreach my $node (($control,$node1,$node2,$node3))
      {
        $node->waitForUnit("default.target");

        # -p keeps this step from failing if the directory already exists.
        $node->succeed("mkdir -p /etc/munge");
        $node->succeed("echo '${mungekey}' > /etc/munge/munge.key");
        $node->succeed("chmod 0400 /etc/munge/munge.key");
        $node->succeed("systemctl restart munged");
      }

      # The Slurm daemons are restarted below because they have probably
      # failed during boot, when the munge key was not yet available.

      subtest "can_start_slurmctld", sub {
        $control->succeed("systemctl restart slurmctld");
        $control->waitForUnit("slurmctld.service");
      };

      subtest "can_start_slurmd", sub {
        foreach my $node (($control,$node1,$node2,$node3))
        {
          $node->succeed("systemctl restart slurmd.service");
          $node->waitForUnit("slurmd");
        }
      };

      # Test that the cluster works and can distribute jobs.

      subtest "run_distributed_command", sub {
        # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
        # The output must contain the 3 different names.
        $control->succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq");
      };
    '';
})