tangled
alpha
login
or
join now
pyrox.dev
/
nixpkgs
0
fork
atom
lol
0
fork
atom
overview
issues
pulls
pipelines
slurm service: add tests
Lancelot SIX
10 years ago
4994f0f7
ca4c3547
+80
1 changed file
expand all
collapse all
unified
split
nixos
tests
slurm.nix
+80
nixos/tests/slurm.nix
···
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
···
1
+
import ./make-test.nix ({ pkgs, ... }:
2
+
let mungekey = "mungeverryweakkeybuteasytointegratoinatest";
3
+
slurmconfig = {
4
+
client.enable = true;
5
+
controlMachine = "control";
6
+
nodeName = ''
7
+
control
8
+
NodeName=node[1-3] CPUs=1 State=UNKNOWN
9
+
'';
10
+
partitionName = "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP";
11
+
};
12
+
in {
13
+
name = "slurm";
14
+
15
+
nodes =
16
+
let
17
+
computeNode =
18
+
{ config, pkgs, ...}:
19
+
{
20
+
# TODO slurmd port and slurmctld port should be configuration options and
21
+
# automatically allowed by the firewall.
22
+
networking.firewall.enable = false;
23
+
services.munge.enable = true;
24
+
services.slurm = slurmconfig;
25
+
};
26
+
in {
27
+
control =
28
+
{ config, pkgs, ...}:
29
+
{
30
+
networking.firewall.enable = false;
31
+
services.munge.enable = true;
32
+
services.slurm = {
33
+
server.enable = true;
34
+
} // slurmconfig;
35
+
};
36
+
node1 = computeNode;
37
+
node2 = computeNode;
38
+
node3 = computeNode;
39
+
};
40
+
41
+
testScript =
42
+
''
43
+
startAll;
44
+
45
+
# Set up authentication across the cluster
46
+
foreach my $node (($control,$node1,$node2,$node3))
47
+
{
48
+
$node->waitForUnit("default.target");
49
+
50
+
$node->succeed("mkdir /etc/munge");
51
+
$node->succeed("echo '${mungekey}' > /etc/munge/munge.key");
52
+
$node->succeed("chmod 0400 /etc/munge/munge.key");
53
+
$node->succeed("systemctl restart munged");
54
+
}
55
+
56
+
# Restart the services since they have probably failed due to the munge init
57
+
# failure
58
+
59
+
subtest "can_start_slurmctld", sub {
60
+
$control->succeed("systemctl restart slurmctld");
61
+
$control->waitForUnit("slurmctld.service");
62
+
};
63
+
64
+
subtest "can_start_slurmd", sub {
65
+
foreach my $node (($control,$node1,$node2,$node3))
66
+
{
67
+
$node->succeed("systemctl restart slurmd.service");
68
+
$node->waitForUnit("slurmd");
69
+
}
70
+
};
71
+
72
+
# Test that the cluster works and can distribute jobs.
73
+
74
+
subtest "run_distributed_command", sub {
75
+
# Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
76
+
# The output must contain the 3 different names
77
+
$control->succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq");
78
+
};
79
+
'';
80
+
})