nixpkgs mirror (for testing) github.com/NixOS/nixpkgs
nix
at devShellTools-shell 169 lines 4.9 kB view raw
# NixOS VM test for Slurm: one controller (slurmctld), one accounting
# daemon host (slurmdbd + MariaDB), one submit host, and three compute
# nodes (slurmd). Verifies job distribution with `srun` and accounting
# via `sacct`, plus an MPI (PMIx) smoke test.
{ lib, pkgs, ... }:
let
  # Shared Slurm configuration imported by every node that participates
  # in the cluster (controller, submit host, compute nodes).
  slurmconfig = {
    services.slurm = {
      controlMachine = "control";
      nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];
      partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
      extraConfig = ''
        AccountingStorageHost=dbd
        AccountingStorageType=accounting_storage/slurmdbd
      '';
    };
    environment.systemPackages = [ mpitest ];
    # TODO slurmd port and slurmctld port should be configurations and
    # automatically allowed by the firewall.
    networking.firewall.enable = false;
    # Munge authentication key shared by all nodes. Deliberately weak:
    # this only has to be identical across the test VMs.
    systemd.tmpfiles.rules = [
      "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
    ];
  };

  # Minimal MPI "hello world" used to check that PMIx-launched jobs run
  # across the whole partition (rank 0 additionally prints the world size,
  # which the test script greps for).
  mpitest =
    let
      mpitestC = pkgs.writeText "mpitest.c" ''
        #include <stdio.h>
        #include <stdlib.h>
        #include <mpi.h>

        int
        main (int argc, char *argv[])
        {
          int rank, size, length;
          char name[512];

          MPI_Init (&argc, &argv);
          MPI_Comm_rank (MPI_COMM_WORLD, &rank);
          MPI_Comm_size (MPI_COMM_WORLD, &size);
          MPI_Get_processor_name (name, &length);

          if ( rank == 0 ) printf("size=%d\n", size);

          printf ("%s: hello world from process %d of %d\n", name, rank, size);

          MPI_Finalize ();

          return EXIT_SUCCESS;
        }
      '';
    in
    pkgs.runCommand "mpitest" { } ''
      mkdir -p $out/bin
      ${lib.getDev pkgs.mpi}/bin/mpicc ${mpitestC} -o $out/bin/mpitest
    '';
in
{
  name = "slurm";

  meta.maintainers = [ lib.maintainers.markuskowa ];

  nodes =
    let
      # Template for the three identical compute nodes; each runs slurmd.
      computeNode =
        { ... }:
        {
          imports = [ slurmconfig ];
          services.slurm = {
            client.enable = true;
          };
        };
    in
    {

      # Cluster controller running slurmctld.
      control =
        { ... }:
        {
          imports = [ slurmconfig ];
          services.slurm = {
            server.enable = true;
          };
        };

      # Submit host: only the Slurm user tools, no daemon.
      submit =
        { ... }:
        {
          imports = [ slurmconfig ];
          services.slurm = {
            enableStools = true;
          };
        };

      # Accounting host: slurmdbd backed by a local MariaDB instance.
      # Intentionally does NOT import slurmconfig (it is not part of the
      # compute cluster) but needs the same munge key and an open firewall.
      dbd =
        { pkgs, ... }:
        let
          passFile = pkgs.writeText "dbdpassword" "password123";
        in
        {
          networking.firewall.enable = false;
          systemd.tmpfiles.rules = [
            "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
          ];
          services.slurm.dbdserver = {
            enable = true;
            storagePassFile = "${passFile}";
          };
          services.mysql = {
            enable = true;
            package = pkgs.mariadb;
            # The slurm DB user needs a password (storagePassFile above),
            # which ensureUsers cannot set, hence the explicit init script.
            initialScript = pkgs.writeText "mysql-init.sql" ''
              CREATE USER 'slurm'@'localhost' IDENTIFIED BY 'password123';
              GRANT ALL PRIVILEGES ON slurm_acct_db.* TO 'slurm'@'localhost';
            '';
            ensureDatabases = [ "slurm_acct_db" ];
            ensureUsers = [
              {
                ensurePermissions = {
                  "slurm_acct_db.*" = "ALL PRIVILEGES";
                };
                name = "slurm";
              }
            ];
          };
        };

      node1 = computeNode;
      node2 = computeNode;
      node3 = computeNode;
    };

  testScript = ''
    start_all()

    # Make sure DBD is up after DB initialzation
    with subtest("can_start_slurmdbd"):
        dbd.succeed("systemctl restart slurmdbd")
        dbd.wait_for_unit("slurmdbd.service")
        dbd.wait_for_open_port(6819)

    # there needs to be an entry for the current
    # cluster in the database before slurmctld is restarted
    with subtest("add_account"):
        control.succeed("sacctmgr -i add cluster default")
        # check for cluster entry
        control.succeed("sacctmgr list cluster | awk '{ print $1 }' | grep default")

    with subtest("can_start_slurmctld"):
        control.succeed("systemctl restart slurmctld")
        control.wait_for_unit("slurmctld.service")

    with subtest("can_start_slurmd"):
        for node in [node1, node2, node3]:
            node.succeed("systemctl restart slurmd.service")
            node.wait_for_unit("slurmd")

    # Test that the cluster works and can distribute jobs;

    with subtest("run_distributed_command"):
        # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
        # The output must contain the 3 different names
        submit.succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq")

    with subtest("check_slurm_dbd"):
        # find the srun job from above in the database
        control.succeed("sleep 5")
        control.succeed("sacct | grep hostname")

    with subtest("run_PMIx_mpitest"):
        submit.succeed("srun -N 3 --mpi=pmix mpitest | grep size=3")
  '';
}