From 09642fe41802c5406a9f13684a56ecae360f061c Mon Sep 17 00:00:00 2001 From: Narjit Chadha Date: Mon, 11 May 2020 08:02:46 -0500 Subject: [PATCH 01/36] - lustre-ipoib - This is a created implementation of Lustre using IP over infiniband (IPoIB) Changes to files to enable Infiniband functionality: lfsmaster.sh lfsoss.sh lfsclient.sh lfsrepo.sh Addition for correct drives placement of OSSes : instaldrives.sh *installdrives.sh takes about 15 minutes to run so please either remote this entity, or wait it out. --- examples/lustre_ipoib/config.json | 223 ++++++++++++++++++ examples/lustre_ipoib/readme.md | 37 +++ .../lustre_ipoib/scripts/installdrives.sh | 33 +++ examples/lustre_ipoib/scripts/lfsclient.sh | 44 ++++ examples/lustre_ipoib/scripts/lfsmaster.sh | 32 +++ examples/lustre_ipoib/scripts/lfsoss.sh | 38 +++ examples/lustre_ipoib/scripts/lfspkgs.sh | 11 + 7 files changed, 418 insertions(+) create mode 100644 examples/lustre_ipoib/config.json create mode 100644 examples/lustre_ipoib/readme.md create mode 100755 examples/lustre_ipoib/scripts/installdrives.sh create mode 100755 examples/lustre_ipoib/scripts/lfsclient.sh create mode 100755 examples/lustre_ipoib/scripts/lfsmaster.sh create mode 100755 examples/lustre_ipoib/scripts/lfsoss.sh create mode 100755 examples/lustre_ipoib/scripts/lfspkgs.sh diff --git a/examples/lustre_ipoib/config.json b/examples/lustre_ipoib/config.json new file mode 100644 index 000000000..8de923c03 --- /dev/null +++ b/examples/lustre_ipoib/config.json @@ -0,0 +1,223 @@ +{ + "location": "southcentralus", + "resource_group": "variables.resource_group", + "install_from": "headnode", + "admin_user": "hpcadmin", + "vnet": { + "name": "hpcvnet", + "address_prefix": "10.2.0.0/20", + "subnets": { + "compute": "10.2.0.0/22", + "storage": "10.2.4.0/24" + } + }, + "variables": { + "resource_group": "", + "image": "OpenLogic:CentOS-HPC:7.6:latest", + "lustreimage": "OpenLogic:CentOS-HPC:7.6:latest", + "drivenum": 4, + "ossnum": 4, + "low_priority": 
true, + "storage_account": "", + "storage_key": "sakey.{{variables.storage_account}}", + "storage_container": "", + "log_analytics_lfs_name": "", + "la_resourcegroup": "", + "la_name": "", + "log_analytics_workspace": "laworkspace.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "log_analytics_key": "lakey.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "lustre_version": "2.10", + "lustre_mount": "/lustre" + }, + "resources": { + "headnode": { + "type": "vm", + "vm_type": "Standard_HC44rs", + "accelerated_networking": false, + "public_ip": true, + "image": "variables.image", + "subnet": "compute", + "tags": [ + "disable-selinux", + "cndefault", + "lfsrepo", + "lfsclient", + "lfsazimport", + "localuser", + "pbsserver", + "loginnode", + "nfsserver" + ] + }, + "lustre": { + "type": "vmss", + "vm_type": "Standard_HC44rs", + "instances": "9", + "accelerated_networking": false, + "image": "variables.lustreimage", + "subnet": "storage", + "tags": [ + "cndefault", + "lustre[0:5]", + "osses[1:5]", + "lfsrepo", + "lfsclient[5:9]", + "localuser", + "pbsclient[5:9]", + "nfsclient", + "disable-selinux", + "lfsloganalytics" + ] + } + }, + "install": [ + { + "script": "disable-selinux.sh", + "tag": "disable-selinux", + "sudo": true + }, + { + "script": "cndefault.sh", + "tag": "cndefault", + "sudo": true + }, + { + "script": "nfsserver.sh", + "tag": "nfsserver", + "sudo": true + }, + { + "script": "nfsclient.sh", + "args": [ + "$( Note: The HC nodes are used for the cluster, although this node type may be easily changed by use of the vm_type variable for lustre inside config.json. 
+ +The configuration file requires the following variables to be set: + +| Variable | Description | +|-------------------------|----------------------------------------------| +| resource_group | The resource group for the project | +| storage_account | The storage account for HSM | +| storage_key | The storage key for HSM | +| storage_container | The container to use for HSM | +| log_analytics_lfs_name | The lustre filesystem name for Log Analytics | +| la_resourcegroup | The resource group for Log Analytics | +| la_name | The Log Analytics Workspace name | + +> Note: you can remove log analytics and/or HSM from the config file if not required. + +> Note: Key Vault should be used for the keys to keep them out of the config files. diff --git a/examples/lustre_ipoib/scripts/installdrives.sh b/examples/lustre_ipoib/scripts/installdrives.sh new file mode 100755 index 000000000..221e2349e --- /dev/null +++ b/examples/lustre_ipoib/scripts/installdrives.sh @@ -0,0 +1,33 @@ +#!/bin/bash +groupname=$1 +vmlist=$2 +ossnum=$3 +drivenum=$4 + +#create the drives first before attaching to vmss +drivecount=$(($drivenum*$ossnum)) + +for ((num=1; num<=$drivecount; num++)); do + az disk create -g $groupname -n "lustredrive$num" --size-gb 1024 & +done + +sleep 60 # to ensure all drives are made + +#Now use the created drives +index=0 +lustrecnt=1 + +idlisttmp=$(az vmss list-instances --resource-group $groupname --name lustre |grep providers/Microsoft.Compute/virtualMachineScaleSets/lustre/virtualMachines | awk -F "virtualMachines/" '{print $2}' | sed '/networkInterfaces/d'| sed 's/["].*$//') + +idlist=($idlisttmp) + +for vmname in ${vmlist[@]}; do + ((index++)) + if [ $index -gt 0 ] ; then + for ((diskid=1; diskid<=$drivenum; diskid++)); do + az vmss disk attach --vmss-name lustre --disk lustredrive${lustrecnt} --sku Premium_LRS --instance-id ${idlist[$index]} --resource-group $groupname + ((lustrecnt++)) + done + fi +done + diff --git a/examples/lustre_ipoib/scripts/lfsclient.sh 
b/examples/lustre_ipoib/scripts/lfsclient.sh new file mode 100755 index 000000000..4e30d37fa --- /dev/null +++ b/examples/lustre_ipoib/scripts/lfsclient.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# arg: $1 = lfsserver +# arg: $2 = mount point (default: /lustre) +master=$1 +lfs_mount=${2:-/lustre} +mkdir ~/.ssh + +cp -r /share/home/hpcuser/.ssh ~/ + +capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") +masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) + +if rpm -q lustre; then + + # if the server packages are installed only the client kmod is needed + # for 2.10 and nothing extra is needed for 2.12 + if [ "$lustre_version" = "2.10" ]; then + + if ! rpm -q kmod-lustre-client; then + yum -y install kmod-lustre-client + fi + + fi + +else + + # install the client RPMs if not already installed + if ! rpm -q lustre-client kmod-lustre-client; then + yum -y install lustre-client kmod-lustre-client + fi + weak-modules --add-kernel $(uname -r) + +fi +#Include the correct infiniband options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab +mount -a +chmod 777 $lfs_mount diff --git a/examples/lustre_ipoib/scripts/lfsmaster.sh b/examples/lustre_ipoib/scripts/lfsmaster.sh new file mode 100755 index 000000000..dce36a159 --- /dev/null +++ b/examples/lustre_ipoib/scripts/lfsmaster.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# arg: $1 = device (e.g. 
L=/dev/sdb Lv2=/dev/nvme0n1) +device=$1 + +# this will only install MDS on first node in a scaleset +echo "pssh_nodenum is $PSSH_NODENUM" + +cp -r /share/home/hpcuser/.ssh ~/ + +#Include the correct ipoib options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab + mount -a + + # set up hsm + lctl set_param -P mdt.*-MDT0000.hsm_control=enabled + lctl set_param -P mdt.*-MDT0000.hsm.default_archive_id=1 + lctl set_param mdt.*-MDT0000.hsm.max_requests=128 + + # allow any user and group ids to write + lctl set_param mdt.*-MDT0000.identity_upcall=NONE + +fi diff --git a/examples/lustre_ipoib/scripts/lfsoss.sh b/examples/lustre_ipoib/scripts/lfsoss.sh new file mode 100755 index 000000000..0b9b060a5 --- /dev/null +++ b/examples/lustre_ipoib/scripts/lfsoss.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# arg: $1 = lfsmaster +# arg: $2 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) +master=$1 +device=$2 + +cp -r /share/home/hpcuser/.ssh ~/ + +index=$(($PSSH_NODENUM + 1)) +myuser="hpcuser" + +capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") +masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) + +if [ "$PSSH_NODENUM" != "0" ]; then + + mkfs.lustre \ + --fsname=LustreFS \ + --backfstype=ldiskfs \ + --reformat \ + --ost \ + --mgsnode="${masterib}" \ + --index=$index \ + --mountfsoptions="errors=remount-ro" \ + $device +#Include the correct ipoib options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab +mount -a +fi diff --git a/examples/lustre_ipoib/scripts/lfspkgs.sh b/examples/lustre_ipoib/scripts/lfspkgs.sh new file mode 100755 index 000000000..3120d3ba6 --- /dev/null +++ b/examples/lustre_ipoib/scripts/lfspkgs.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +yum -y install lustre kmod-lustre-osd-ldiskfs lustre-osd-ldiskfs-mount lustre-resource-agents e2fsprogs || exit 1 + +sed -i 's/ResourceDisk\.Format=y/ResourceDisk.Format=n/g' /etc/waagent.conf + +systemctl restart waagent + +weak-modules --add-kernel --no-initramfs + +umount /mnt/resource From 
92ac2c00c52284593d56fb41ba23849bc56a770c Mon Sep 17 00:00:00 2001 From: Narjit Chadha Date: Fri, 22 May 2020 08:31:16 -0500 Subject: [PATCH 02/36] - lustre_ipoib_nvmedrives - This is a created implementation of Lustre using IP over infiniband (IPoIB) using the existing 700GB NVMe drives in the H series nodes Changes to files to enable Infiniband functionality: lfsmaster.sh lfsoss.sh lfsclient.sh lfsrepo.sh --- examples/lustre_ipoib_nvmedrives/config.json | 204 ++++++++++++++++++ examples/lustre_ipoib_nvmedrives/readme.md | 35 +++ .../scripts/lfsclient.sh | 44 ++++ .../scripts/lfsmaster.sh | 32 +++ .../lustre_ipoib_nvmedrives/scripts/lfsoss.sh | 38 ++++ .../scripts/lfspkgs.sh | 11 + .../scripts/waitforreboot.sh | 2 + 7 files changed, 366 insertions(+) create mode 100644 examples/lustre_ipoib_nvmedrives/config.json create mode 100644 examples/lustre_ipoib_nvmedrives/readme.md create mode 100755 examples/lustre_ipoib_nvmedrives/scripts/lfsclient.sh create mode 100755 examples/lustre_ipoib_nvmedrives/scripts/lfsmaster.sh create mode 100755 examples/lustre_ipoib_nvmedrives/scripts/lfsoss.sh create mode 100755 examples/lustre_ipoib_nvmedrives/scripts/lfspkgs.sh create mode 100755 examples/lustre_ipoib_nvmedrives/scripts/waitforreboot.sh diff --git a/examples/lustre_ipoib_nvmedrives/config.json b/examples/lustre_ipoib_nvmedrives/config.json new file mode 100644 index 000000000..5800a1dcc --- /dev/null +++ b/examples/lustre_ipoib_nvmedrives/config.json @@ -0,0 +1,204 @@ +{ + "location": "southcentralus", + "resource_group": "variables.resource_group", + "install_from": "headnode", + "admin_user": "hpcadmin", + "vnet": { + "name": "hpcvnet", + "address_prefix": "10.2.0.0/20", + "subnets": { + "compute": "10.2.0.0/22", + "storage": "10.2.4.0/24" + } + }, + "variables": { + "resource_group": "", + "image": "OpenLogic:CentOS-HPC:7.6:latest", + "lustreimage": "OpenLogic:CentOS-HPC:7.6:latest", + "drivenum": 4, + "ossnum": 4, + "low_priority": true, + "storage_account": "", + 
"storage_key": "sakey.{{variables.storage_account}}", + "storage_container": "", + "log_analytics_lfs_name": "", + "la_resourcegroup": "", + "la_name": "", + "log_analytics_workspace": "laworkspace.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "log_analytics_key": "lakey.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "lustre_version": "2.10", + "lustre_mount": "/lustre" + }, + "resources": { + "headnode": { + "type": "vm", + "vm_type": "Standard_HC44rs", + "accelerated_networking": false, + "public_ip": true, + "image": "variables.image", + "subnet": "compute", + "tags": [ + "disable-selinux", + "cndefault", + "lfsrepo", + "lfsclient", + "lfsazimport", + "localuser", + "pbsserver", + "loginnode", + "nfsserver" + ] + }, + "lustre": { + "type": "vmss", + "vm_type": "Standard_HC44rs", + "instances": "9", + "accelerated_networking": false, + "image": "variables.lustreimage", + "subnet": "storage", + "tags": [ + "cndefault", + "lustre[0:5]", + "osses[1:5]", + "lfsrepo", + "lfsclient[5:9]", + "localuser", + "pbsclient[5:9]", + "nfsclient", + "disable-selinux", + "lfsloganalytics" + ] + } + }, + "install": [ + { + "script": "disable-selinux.sh", + "tag": "disable-selinux", + "sudo": true + }, + { + "script": "cndefault.sh", + "tag": "cndefault", + "sudo": true + }, + { + "script": "nfsserver.sh", + "tag": "nfsserver", + "sudo": true + }, + { + "script": "nfsclient.sh", + "args": [ + "$( Note: The HC nodes are used for the cluster, although this node type may be easily changed by use of the vm_type variable for lustre inside config.json. 
+ +The configuration file requires the following variables to be set: + +| Variable | Description | +|-------------------------|----------------------------------------------| +| resource_group | The resource group for the project | +| storage_account | The storage account for HSM | +| storage_key | The storage key for HSM | +| storage_container | The container to use for HSM | +| log_analytics_lfs_name | The lustre filesystem name for Log Analytics | +| la_resourcegroup | The resource group for Log Analytics | +| la_name | The Log Analytics Workspace name | + +> Note: you can remove log analytics and/or HSM from the config file if not required. + +> Note: Key Vault should be used for the keys to keep them out of the config files. diff --git a/examples/lustre_ipoib_nvmedrives/scripts/lfsclient.sh b/examples/lustre_ipoib_nvmedrives/scripts/lfsclient.sh new file mode 100755 index 000000000..4e30d37fa --- /dev/null +++ b/examples/lustre_ipoib_nvmedrives/scripts/lfsclient.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# arg: $1 = lfsserver +# arg: $2 = mount point (default: /lustre) +master=$1 +lfs_mount=${2:-/lustre} +mkdir ~/.ssh + +cp -r /share/home/hpcuser/.ssh ~/ + +capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") +masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) + +if rpm -q lustre; then + + # if the server packages are installed only the client kmod is needed + # for 2.10 and nothing extra is needed for 2.12 + if [ "$lustre_version" = "2.10" ]; then + + if ! rpm -q kmod-lustre-client; then + yum -y install kmod-lustre-client + fi + + fi + +else + + # install the client RPMs if not already installed + if ! 
rpm -q lustre-client kmod-lustre-client; then + yum -y install lustre-client kmod-lustre-client + fi + weak-modules --add-kernel $(uname -r) + +fi +#Include the correct infiniband options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab +mount -a +chmod 777 $lfs_mount diff --git a/examples/lustre_ipoib_nvmedrives/scripts/lfsmaster.sh b/examples/lustre_ipoib_nvmedrives/scripts/lfsmaster.sh new file mode 100755 index 000000000..dce36a159 --- /dev/null +++ b/examples/lustre_ipoib_nvmedrives/scripts/lfsmaster.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# arg: $1 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) +device=$1 + +# this will only install MDS on first node in a scaleset +echo "pssh_nodenum is $PSSH_NODENUM" + +cp -r /share/home/hpcuser/.ssh ~/ + +#Include the correct ipoib options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab + mount -a + + # set up hsm + lctl set_param -P mdt.*-MDT0000.hsm_control=enabled + lctl set_param -P mdt.*-MDT0000.hsm.default_archive_id=1 + lctl set_param mdt.*-MDT0000.hsm.max_requests=128 + + # allow any user and group ids to write + lctl set_param mdt.*-MDT0000.identity_upcall=NONE + +fi diff --git a/examples/lustre_ipoib_nvmedrives/scripts/lfsoss.sh b/examples/lustre_ipoib_nvmedrives/scripts/lfsoss.sh new file mode 100755 index 000000000..0b9b060a5 --- /dev/null +++ b/examples/lustre_ipoib_nvmedrives/scripts/lfsoss.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# arg: $1 = lfsmaster +# arg: $2 = device (e.g. 
L=/dev/sdb Lv2=/dev/nvme0n1) +master=$1 +device=$2 + +cp -r /share/home/hpcuser/.ssh ~/ + +index=$(($PSSH_NODENUM + 1)) +myuser="hpcuser" + +capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") +masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) + +if [ "$PSSH_NODENUM" != "0" ]; then + + mkfs.lustre \ + --fsname=LustreFS \ + --backfstype=ldiskfs \ + --reformat \ + --ost \ + --mgsnode="${masterib}" \ + --index=$index \ + --mountfsoptions="errors=remount-ro" \ + $device +#Include the correct ipoib options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab +mount -a +fi diff --git a/examples/lustre_ipoib_nvmedrives/scripts/lfspkgs.sh b/examples/lustre_ipoib_nvmedrives/scripts/lfspkgs.sh new file mode 100755 index 000000000..3120d3ba6 --- /dev/null +++ b/examples/lustre_ipoib_nvmedrives/scripts/lfspkgs.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +yum -y install lustre kmod-lustre-osd-ldiskfs lustre-osd-ldiskfs-mount lustre-resource-agents e2fsprogs || exit 1 + +sed -i 's/ResourceDisk\.Format=y/ResourceDisk.Format=n/g' /etc/waagent.conf + +systemctl restart waagent + +weak-modules --add-kernel --no-initramfs + +umount /mnt/resource diff --git a/examples/lustre_ipoib_nvmedrives/scripts/waitforreboot.sh b/examples/lustre_ipoib_nvmedrives/scripts/waitforreboot.sh new file mode 100755 index 000000000..753167b8f --- /dev/null +++ b/examples/lustre_ipoib_nvmedrives/scripts/waitforreboot.sh @@ -0,0 +1,2 @@ +#!/bin/bash +sleep 60 #enough time for node reboot to continue process From cf36ae9deab2c731e89013f480db933c7064632b Mon Sep 17 00:00:00 2001 From: Narjit Chadha Date: Thu, 4 Jun 2020 11:38:23 -0500 Subject: [PATCH 03/36] - lustre-ipoib - This is a created implementation of Lustre using ip over infiniband (IPoIB) - lustre-rdma - This is a created implementation of Lustre using native Remote Direct Memory Access (RDMA) Changes to files to enable Infiniband functionality: lfsmaster.sh lfsoss.sh lfsclient.sh lfsrepo.sh lfspkgs.sh Addition for the installation 
of new OFED : installOFED.sh Addition for correct Lustre kernel : lustreinstall1.sh Lustre packages : lustreinstall2.sh Addition for rebooting of Lustre MDS/OSS: rebootlustre.sh Addition for pause after MDS/OSS reboot : waitforreboot.sh --- examples/lustre_rdma_nvmedrives/config.json | 235 ++++++++++++++++++ examples/lustre_rdma_nvmedrives/readme.md | 37 +++ .../scripts/installOFED.sh | 4 + .../scripts/lfsclient.sh | 47 ++++ .../scripts/lfsmaster.sh | 33 +++ .../lustre_rdma_nvmedrives/scripts/lfsoss.sh | 32 +++ .../lustre_rdma_nvmedrives/scripts/lfsrepo.sh | 27 ++ .../scripts/lustreinstall1.sh | 8 + .../scripts/lustreinstall2.sh | 10 + .../scripts/lustrenetwork.sh | 9 + .../scripts/rebootlustre.sh | 20 ++ .../scripts/waitforreboot.sh | 2 + examples/lustre_rdma_nvmedrives/writeup | 17 ++ 13 files changed, 481 insertions(+) create mode 100644 examples/lustre_rdma_nvmedrives/config.json create mode 100644 examples/lustre_rdma_nvmedrives/readme.md create mode 100755 examples/lustre_rdma_nvmedrives/scripts/installOFED.sh create mode 100755 examples/lustre_rdma_nvmedrives/scripts/lfsclient.sh create mode 100755 examples/lustre_rdma_nvmedrives/scripts/lfsmaster.sh create mode 100755 examples/lustre_rdma_nvmedrives/scripts/lfsoss.sh create mode 100755 examples/lustre_rdma_nvmedrives/scripts/lfsrepo.sh create mode 100755 examples/lustre_rdma_nvmedrives/scripts/lustreinstall1.sh create mode 100755 examples/lustre_rdma_nvmedrives/scripts/lustreinstall2.sh create mode 100755 examples/lustre_rdma_nvmedrives/scripts/lustrenetwork.sh create mode 100755 examples/lustre_rdma_nvmedrives/scripts/rebootlustre.sh create mode 100755 examples/lustre_rdma_nvmedrives/scripts/waitforreboot.sh create mode 100644 examples/lustre_rdma_nvmedrives/writeup diff --git a/examples/lustre_rdma_nvmedrives/config.json b/examples/lustre_rdma_nvmedrives/config.json new file mode 100644 index 000000000..8d2afed3e --- /dev/null +++ b/examples/lustre_rdma_nvmedrives/config.json @@ -0,0 +1,235 @@ +{ + 
"location": "southcentralus", + "resource_group": "variables.resource_group", + "install_from": "headnode", + "admin_user": "hpcadmin", + "vnet": { + "name": "hpcvnet", + "address_prefix": "10.2.0.0/20", + "subnets": { + "compute": "10.2.0.0/22", + "storage": "10.2.4.0/24" + } + }, + "variables": { + "resource_group": "", + "image": "OpenLogic:CentOS:7.6:latest", + "lustreimage": "OpenLogic:CentOS:7.6:latest", + "drivenum": 4, + "ossnum": 4, + "low_priority": true, + "storage_account": "", + "storage_key": "sakey.{{variables.storage_account}}", + "storage_container": "", + "log_analytics_lfs_name": "", + "la_resourcegroup": "", + "la_name": "", + "log_analytics_workspace": "laworkspace.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "log_analytics_key": "lakey.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "lustre_version": "2.10", + "lustre_mount": "/lustre" + }, + "resources": { + "headnode": { + "type": "vm", + "vm_type": "Standard_HB60rs", + "accelerated_networking": false, + "public_ip": true, + "image": "variables.image", + "subnet": "compute", + "tags": [ + "disable-selinux", + "cndefault", + "lfsrepo", + "rebootlustre", + "lfsclient", + "lfsazimport", + "localuser", + "pbsserver", + "allnodes", + "loginnode", + "nfsserver" + ] + }, + "lustre": { + "type": "vmss", + "vm_type": "Standard_HB120rs_v2", + "instances": "9", + "accelerated_networking": false, + "image": "variables.lustreimage", + "subnet": "storage", + "tags": [ + "cndefault", + "lustre[0:5]", + "osses[1:5]", + "lfsrepo", + "lfsclient[5:9]", + "localuser", + "pbsclient[5:9]", + "nfsclient", + "allnodes", + "disable-selinux", + "lfsloganalytics" + ] + } + }, + "install": [ + { + "script": "disable-selinux.sh", + "tag": "disable-selinux", + "sudo": true + }, + { + "script": "cndefault.sh", + "tag": "cndefault", + "sudo": true + }, + { + "script": "nfsserver.sh", + "tag": "nfsserver", + "sudo": true + }, + { + "script": "nfsclient.sh", + "args": [ + "$( Note: The HC nodes are used 
for the cluster, although this node type may be easily changed by use of the vm_type variable for lustre inside config.json. + +The configuration file requires the following variables to be set: + +| Variable | Description | +|-------------------------|----------------------------------------------| +| resource_group | The resource group for the project | +| storage_account | The storage account for HSM | +| storage_key | The storage key for HSM | +| storage_container | The container to use for HSM | +| log_analytics_lfs_name | The lustre filesystem name for Log Analytics | +| la_resourcegroup | The resource group for Log Analytics | +| la_name | The Log Analytics Workspace name | + +> Note: you can remove log analytics and/or HSM from the config file if not required. + +> Note: Key Vault should be used for the keys to keep them out of the config files. diff --git a/examples/lustre_rdma_nvmedrives/scripts/installOFED.sh b/examples/lustre_rdma_nvmedrives/scripts/installOFED.sh new file mode 100755 index 000000000..c267519fc --- /dev/null +++ b/examples/lustre_rdma_nvmedrives/scripts/installOFED.sh @@ -0,0 +1,4 @@ +#!/bin/bash +yum -y groupinstall --skip-broken "Infiniband Support" 2>/dev/null +echo "done installing Infiniband" +exit 0 diff --git a/examples/lustre_rdma_nvmedrives/scripts/lfsclient.sh b/examples/lustre_rdma_nvmedrives/scripts/lfsclient.sh new file mode 100755 index 000000000..0a3f302fc --- /dev/null +++ b/examples/lustre_rdma_nvmedrives/scripts/lfsclient.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# arg: $1 = lfsserver +# arg: $2 = mount point (default: /lustre) +master=$1 +lfs_mount=${2:-/lustre} +mkdir ~/.ssh + +cp -r /share/home/hpcuser/.ssh ~/ + +#Include the correct rdma options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab +mount -a +chmod 777 $lfs_mount diff --git a/examples/lustre_rdma_nvmedrives/scripts/lfsmaster.sh b/examples/lustre_rdma_nvmedrives/scripts/lfsmaster.sh new file mode 100755 index 000000000..1869a1f71 --- /dev/null +++ 
b/examples/lustre_rdma_nvmedrives/scripts/lfsmaster.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# arg: $1 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) +device=$1 + +# this will only install MDS on first node in a scaleset +echo "pssh_nodenum is $PSSH_NODENUM" + +cp -r /share/home/hpcuser/.ssh /root/ + +#Include the correct rdma options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab + mount -a + + # set up hsm + lctl set_param -P mdt.*-MDT0000.hsm_control=enabled + lctl set_param -P mdt.*-MDT0000.hsm.default_archive_id=1 + lctl set_param mdt.*-MDT0000.hsm.max_requests=128 + + # allow any user and group ids to write + lctl set_param mdt.*-MDT0000.identity_upcall=NONE + +fi + diff --git a/examples/lustre_rdma_nvmedrives/scripts/lfsoss.sh b/examples/lustre_rdma_nvmedrives/scripts/lfsoss.sh new file mode 100755 index 000000000..ada2bb8c7 --- /dev/null +++ b/examples/lustre_rdma_nvmedrives/scripts/lfsoss.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# arg: $1 = lfsmaster +# arg: $2 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) +master=$1 +device=$2 + +cp -r /share/home/hpcuser/.ssh /root/ + +index=$(($PSSH_NODENUM + 1)) +myuser="hpcuser" + +capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") +masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) + +if [ "$PSSH_NODENUM" != "0" ]; then + lnetctl net add --net o2ib --if ib0 #double check + mkfs.lustre \ + --fsname=LustreFS \ + --backfstype=ldiskfs \ + --reformat \ + --ost \ + --mgsnode="${masterib}@o2ib" \ + --index=$index \ + --mountfsoptions="errors=remount-ro" \ + $device + + +mkdir /mnt/oss +echo "$device /mnt/oss lustre noatime,nodiratime,nobarrier 0 2" >> /etc/fstab +mount -a +fi diff --git a/examples/lustre_rdma_nvmedrives/scripts/lfsrepo.sh b/examples/lustre_rdma_nvmedrives/scripts/lfsrepo.sh new file mode 100755 index 000000000..db1eeb165 --- /dev/null +++ b/examples/lustre_rdma_nvmedrives/scripts/lfsrepo.sh @@ -0,0 +1,27 @@ +#!/bin/bash +lustre_version=${1-2.10} + +cat << EOF 
>/etc/yum.repos.d/LustrePack.repo +[lustreserver] +name=lustreserver +baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/server/ +enabled=1 +gpgcheck=0 + +[e2fs] +name=e2fs +baseurl=https://downloads.whamcloud.com/public/e2fsprogs/latest/el7/ +enabled=1 +gpgcheck=0 + +[lustreclient] +name=lustreclient +baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/client/ +enabled=1 +gpgcheck=0 +EOF + +#Include the correct rdma options +#cat >/etc/modprobe.d/lustre.conf</dev/null + diff --git a/examples/lustre_rdma_nvmedrives/scripts/lustreinstall2.sh b/examples/lustre_rdma_nvmedrives/scripts/lustreinstall2.sh new file mode 100755 index 000000000..60f3e759e --- /dev/null +++ b/examples/lustre_rdma_nvmedrives/scripts/lustreinstall2.sh @@ -0,0 +1,10 @@ +#!/bin/bash +yum -y --nogpgcheck --enablerepo=lustreserver install kmod-lustre kmod-lustre-osd-ldiskfs lustre-osd-ldiskfs-mount lustre lustre-resource-agents +modprobe -v lustre + +sed -i 's/ResourceDisk\.Format=y/ResourceDisk.Format=n/g' /etc/waagent.conf +sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf + +weak-modules --add-kernel --no-initramfs +systemctl enable lustre +umount /mnt/resource diff --git a/examples/lustre_rdma_nvmedrives/scripts/lustrenetwork.sh b/examples/lustre_rdma_nvmedrives/scripts/lustrenetwork.sh new file mode 100755 index 000000000..f95d33864 --- /dev/null +++ b/examples/lustre_rdma_nvmedrives/scripts/lustrenetwork.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf +service waagent restart +service rdma start +modprobe lnet +lctl network configure +lnetctl net add --net o2ib --if ib0 #need this to come up every time +sleep 5 diff --git a/examples/lustre_rdma_nvmedrives/scripts/rebootlustre.sh b/examples/lustre_rdma_nvmedrives/scripts/rebootlustre.sh new file mode 100755 index 000000000..dd486c3e0 --- /dev/null +++ 
b/examples/lustre_rdma_nvmedrives/scripts/rebootlustre.sh @@ -0,0 +1,20 @@ +#!/bin/bash +groupname=$1 +vmlist=$2 +ossnum=$3 + +totalcount=$(($ossnum+2)) +index=0 + +#prep headnode +cp -r /share/home/hpcuser/.ssh /root/ + +#needs to be done sequentially +for vmname in ${vmlist[@]}; do + if [ $index -lt $totalcount ] ; then + echo "Rebooting $vmname" + ssh hpcuser@${vmname} "sudo reboot 2>/dev/null; exit 2>/dev/null" 2>/dev/null + fi +done +exit 0 # to ensure no errors are thrown + diff --git a/examples/lustre_rdma_nvmedrives/scripts/waitforreboot.sh b/examples/lustre_rdma_nvmedrives/scripts/waitforreboot.sh new file mode 100755 index 000000000..73411ca61 --- /dev/null +++ b/examples/lustre_rdma_nvmedrives/scripts/waitforreboot.sh @@ -0,0 +1,2 @@ +#!/bin/bash +sleep 180 #enough time for node reboot to continue process diff --git a/examples/lustre_rdma_nvmedrives/writeup b/examples/lustre_rdma_nvmedrives/writeup new file mode 100644 index 000000000..809ec71d3 --- /dev/null +++ b/examples/lustre_rdma_nvmedrives/writeup @@ -0,0 +1,17 @@ +- lustre-ipoib - This is a created implementation of Lustre using ip over infiniband (IPoIB) +- lustre-rdma - This is a created implementation of Lustre using native Remote Direct Memory Access (RDMA) + +Changes to files to enable Infiniband functionality: +lfsmaster.sh +lfsoss.sh +lfsclient.sh +lfsrepo.sh +lfspkgs.sh + +Addition for the installation of new OFED : installOFED.sh + +Addition for correct Lustre kernel : lustreinstall1.sh +Lustre packages : lustreinstall2.sh + +Addition for rebooting of Lustre MDS/OSS: rebootlustre.sh +Addition for pause after MDS/OSS reboot : waitforreboot.sh From 35d4839e0126b47ea25bc4294dd9bbe42101dbfa Mon Sep 17 00:00:00 2001 From: chadnar2 <52789065+chadnar2@users.noreply.github.com> Date: Sat, 27 Jun 2020 16:39:06 -0500 Subject: [PATCH 04/36] Update config.json --- examples/lustre_rdma_nvmedrives/config.json | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/lustre_rdma_nvmedrives/config.json 
b/examples/lustre_rdma_nvmedrives/config.json index 8d2afed3e..841daab77 100644 --- a/examples/lustre_rdma_nvmedrives/config.json +++ b/examples/lustre_rdma_nvmedrives/config.json @@ -15,7 +15,6 @@ "resource_group": "", "image": "OpenLogic:CentOS:7.6:latest", "lustreimage": "OpenLogic:CentOS:7.6:latest", - "drivenum": 4, "ossnum": 4, "low_priority": true, "storage_account": "", From fe4684eb39941383e35acff104ba9f372e6ab459 Mon Sep 17 00:00:00 2001 From: Narjit Chadha Date: Mon, 29 Jun 2020 07:54:05 -0500 Subject: [PATCH 05/36] changes to config.json rebootlustre.sh to remove extra unnecessary lines --- examples/lustre_ipoib_nvmedrives/config.json | 10 +++++----- examples/lustre_ipoib_nvmedrives/readme.md | 2 +- examples/lustre_rdma_nvmedrives/config.json | 1 - .../lustre_rdma_nvmedrives/scripts/rebootlustre.sh | 5 ++--- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/examples/lustre_ipoib_nvmedrives/config.json b/examples/lustre_ipoib_nvmedrives/config.json index 5800a1dcc..8583e738b 100644 --- a/examples/lustre_ipoib_nvmedrives/config.json +++ b/examples/lustre_ipoib_nvmedrives/config.json @@ -32,7 +32,7 @@ "resources": { "headnode": { "type": "vm", - "vm_type": "Standard_HC44rs", + "vm_type": "Standard_HB60rs", "accelerated_networking": false, "public_ip": true, "image": "variables.image", @@ -51,7 +51,7 @@ }, "lustre": { "type": "vmss", - "vm_type": "Standard_HC44rs", + "vm_type": "Standard_HB120rs_v2", "instances": "9", "accelerated_networking": false, "image": "variables.lustreimage", @@ -127,7 +127,7 @@ "script": "lfsoss.sh", "args": [ "$(head -n1 hostlists/tags/lustre)", - "/dev/sdb" + "/dev/nvme0n1" ], "tag": "lustre", "sudo": true @@ -186,7 +186,7 @@ "pbspro_19.1.1.centos7/pbspro-server-19.1.1-0.x86_64.rpm" ], "tag": "pbsserver", - "sudo": false + "sudo": true }, { "script": "pbsclient.sh", @@ -197,7 +197,7 @@ "pbspro_19.1.1.centos7/pbspro-execution-19.1.1-0.x86_64.rpm" ], "tag": "lfsclient", - "sudo": false + "sudo": true } ] diff --git 
a/examples/lustre_ipoib_nvmedrives/readme.md b/examples/lustre_ipoib_nvmedrives/readme.md index ea323662b..4dbc248f4 100644 --- a/examples/lustre_ipoib_nvmedrives/readme.md +++ b/examples/lustre_ipoib_nvmedrives/readme.md @@ -2,7 +2,7 @@ Visualisation: [config.json](https://azurehpc.azureedge.net/?o=https://raw.githubusercontent.com/Azure/azurehpc/master/examples/lustre_Infiniband/config.json) -This is a deployment of Lustre using the available infiniband network. This solution has been designed to work with either IP over infiniband or true Remote Direct Memory Access(RDMA), although only the IPoIB version has been developed thus far. This particular deployment will use NVMe drives for the OSSes and MDSes. +This is a deployment of Lustre using the available infiniband network. This solution has been designed to work with either IP over infiniband or true Remote Direct Memory Access(RDMA). This particular deployment will use NVMe drives for the OSSes and MDSes. This deployment will only function using the Python based AzureHPC (not the BASH libexec). 
diff --git a/examples/lustre_rdma_nvmedrives/config.json b/examples/lustre_rdma_nvmedrives/config.json index 8d2afed3e..5a2eb4c1f 100644 --- a/examples/lustre_rdma_nvmedrives/config.json +++ b/examples/lustre_rdma_nvmedrives/config.json @@ -123,7 +123,6 @@ "tag": "rebootlustre", "sudo": true, "args": [ - "variables.resource_group", "$( Date: Mon, 29 Jun 2020 08:02:26 -0500 Subject: [PATCH 06/36] changes to lustre_rdma_nvmedrives config.json and rebootlustre.sh to remove unnecessary lines --- examples/lustre_rdma_nvmedrives/scripts/rebootlustre.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/lustre_rdma_nvmedrives/scripts/rebootlustre.sh b/examples/lustre_rdma_nvmedrives/scripts/rebootlustre.sh index 9d1bf38c7..6135ef8c3 100755 --- a/examples/lustre_rdma_nvmedrives/scripts/rebootlustre.sh +++ b/examples/lustre_rdma_nvmedrives/scripts/rebootlustre.sh @@ -1,3 +1,4 @@ +#reboot lustre nodes #!/bin/bash vmlist=$1 ossnum=$2 From 541a2542b8901866e89cf2789a1f368867f63113 Mon Sep 17 00:00:00 2001 From: Narjit Chadha Date: Mon, 29 Jun 2020 08:02:48 -0500 Subject: [PATCH 07/36] changes to lustre_rdma_nvmedrives config.json and rebootlustre.sh to remove unnecessary lines --- examples/lustre_rdma_nvmedrives/config.json | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/lustre_rdma_nvmedrives/config.json b/examples/lustre_rdma_nvmedrives/config.json index 5a2eb4c1f..e92d24112 100644 --- a/examples/lustre_rdma_nvmedrives/config.json +++ b/examples/lustre_rdma_nvmedrives/config.json @@ -232,3 +232,4 @@ } ] } + From 81ad86a5c0b82c37ef5c73d5bd7aa7a08b4cada0 Mon Sep 17 00:00:00 2001 From: Narjit Chadha Date: Mon, 29 Jun 2020 08:12:19 -0500 Subject: [PATCH 08/36] Modification to config.json and rebootlustre.sh to remove unnecessary lines --- examples/lustre_rdma_nvmedrives/config.json | 1 - examples/lustre_rdma_nvmedrives/scripts/rebootlustre.sh | 1 - 2 files changed, 2 deletions(-) diff --git a/examples/lustre_rdma_nvmedrives/config.json 
b/examples/lustre_rdma_nvmedrives/config.json index 9e5699dae..c4a5b77dc 100644 --- a/examples/lustre_rdma_nvmedrives/config.json +++ b/examples/lustre_rdma_nvmedrives/config.json @@ -231,4 +231,3 @@ } ] } - diff --git a/examples/lustre_rdma_nvmedrives/scripts/rebootlustre.sh b/examples/lustre_rdma_nvmedrives/scripts/rebootlustre.sh index 6135ef8c3..9d1bf38c7 100755 --- a/examples/lustre_rdma_nvmedrives/scripts/rebootlustre.sh +++ b/examples/lustre_rdma_nvmedrives/scripts/rebootlustre.sh @@ -1,4 +1,3 @@ -#reboot lustre nodes #!/bin/bash vmlist=$1 ossnum=$2 From f37b167dac9a8c27150d9d636b5dfe5f3f00dc4e Mon Sep 17 00:00:00 2001 From: Narjit Chadha Date: Tue, 30 Jun 2020 10:44:38 -0500 Subject: [PATCH 09/36] Modifications to config.json files to clean up location variable. Addition of lustre_rdma_avs --- examples/lustre_ipoib/config.json | 3 +- examples/lustre_ipoib_nvmedrives/config.json | 3 +- .../azhpc_install_config/hostlists/compute | 2 + .../azhpc_install_config/hostlists/headnode | 1 + .../azhpc_install_config/hostlists/lfsmaster | 1 + .../azhpc_install_config/hostlists/linux | 6 + .../azhpc_install_config/hostlists/lustre | 2 + .../hostlists/tags/cndefault | 6 + .../hostlists/tags/disable-selinux | 6 + .../hostlists/tags/lfsazimport | 1 + .../hostlists/tags/lfsclient | 3 + .../hostlists/tags/lfsloganalytics | 3 + .../hostlists/tags/lfsmaster | 1 + .../hostlists/tags/lfsrepo | 6 + .../hostlists/tags/localuser | 6 + .../hostlists/tags/loginnode | 1 + .../hostlists/tags/lustre | 3 + .../hostlists/tags/nfsclient | 5 + .../hostlists/tags/nfsserver | 1 + .../hostlists/tags/ossnode | 2 + .../hostlists/tags/pbsclient | 2 + .../hostlists/tags/pbsserver | 1 + .../hostlists/tags/rebootlustre | 1 + .../azhpc_install_config/hpcadmin_id_rsa | 27 + .../azhpc_install_config/hpcadmin_id_rsa.pub | 1 + .../install/00_install_node_setup.sh | 48 ++ .../install/01_disable-selinux.sh | 18 + .../install/02_cndefault.sh | 18 + .../install/03_nfsserver.sh | 18 + 
.../install/04_nfsclient.sh | 18 + .../install/05_localuser.sh | 18 + .../install/06_lfsrepo.sh | 18 + .../install/07_lustreinstall1.sh | 18 + .../install/08_rebootlustre.sh | 18 + .../install/09_waitforreboot.sh | 7 + .../install/10_installOFED.sh | 18 + .../install/11_lustreinstall2.sh | 18 + .../install/12_lustrenetwork.sh | 18 + .../install/13_lfsmaster.sh | 18 + .../azhpc_install_config/install/14_lfsoss.sh | 18 + .../azhpc_install_config/install/15_lfshsm.sh | 18 + .../install/16_lfsclient.sh | 18 + .../install/17_lfsimport.sh | 18 + .../install/18_lfsloganalytics.sh | 18 + .../install/19_pbsdownload.sh | 18 + .../install/20_pbsserver.sh | 19 + .../install/21_pbsclient.sh | 19 + .../azhpc_install_config/scripts/cndefault.sh | 23 + .../scripts/disable-selinux.sh | 6 + .../scripts/installOFED.sh | 4 + .../azhpc_install_config/scripts/lfsclient.sh | 48 ++ .../azhpc_install_config/scripts/lfshsm.sh | 95 +++ .../azhpc_install_config/scripts/lfsimport.sh | 31 + .../scripts/lfsloganalytics.sh | 31 + .../azhpc_install_config/scripts/lfsmaster.sh | 31 + .../azhpc_install_config/scripts/lfsoss.sh | 30 + .../azhpc_install_config/scripts/lfsrepo.sh | 27 + .../azhpc_install_config/scripts/localuser.sh | 40 ++ .../scripts/lustreinstall1.sh | 8 + .../scripts/lustreinstall2.sh | 10 + .../scripts/lustrenetwork.sh | 9 + .../azhpc_install_config/scripts/nfsclient.sh | 34 + .../azhpc_install_config/scripts/nfsserver.sh | 212 +++++++ .../azhpc_install_config/scripts/pbsclient.sh | 22 + .../scripts/pbsdownload.sh | 9 + .../azhpc_install_config/scripts/pbsserver.sh | 19 + .../scripts/rebootlustre.sh | 16 + .../scripts/waitforreboot.sh | 2 + examples/lustre_rdma_avs/config.json | 270 ++++++++ examples/lustre_rdma_avs/deploy_config.json | 583 ++++++++++++++++++ examples/lustre_rdma_avs/hpcadmin_id_rsa | 27 + examples/lustre_rdma_avs/hpcadmin_id_rsa.pub | 1 + examples/lustre_rdma_avs/readme.md | 37 ++ .../lustre_rdma_avs/scripts/installOFED.sh | 4 + 
examples/lustre_rdma_avs/scripts/lfsclient.sh | 48 ++ examples/lustre_rdma_avs/scripts/lfsmaster.sh | 31 + examples/lustre_rdma_avs/scripts/lfsoss.sh | 30 + examples/lustre_rdma_avs/scripts/lfsrepo.sh | 27 + .../lustre_rdma_avs/scripts/lustreinstall1.sh | 8 + .../lustre_rdma_avs/scripts/lustreinstall2.sh | 10 + .../lustre_rdma_avs/scripts/lustrenetwork.sh | 9 + examples/lustre_rdma_avs/scripts/oldreboot | 20 + .../lustre_rdma_avs/scripts/rebootlustre.sh | 16 + .../lustre_rdma_avs/scripts/removeMOFED.sh | 6 + .../lustre_rdma_avs/scripts/waitforreboot.sh | 2 + examples/lustre_rdma_avs/writeup | 20 + examples/lustre_rdma_avs/writeuplustreipoib | 11 + examples/lustre_rdma_nvmedrives/config.json | 3 +- 88 files changed, 2378 insertions(+), 3 deletions(-) create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/compute create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/headnode create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/lfsmaster create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/linux create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/lustre create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/cndefault create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/disable-selinux create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsazimport create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsclient create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsloganalytics create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsmaster create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsrepo create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/localuser create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/loginnode 
create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lustre create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsclient create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsserver create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/ossnode create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsclient create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsserver create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/rebootlustre create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa create mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa.pub create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/00_install_node_setup.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/01_disable-selinux.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/02_cndefault.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/03_nfsserver.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/04_nfsclient.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/05_localuser.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/06_lfsrepo.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/07_lustreinstall1.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/08_rebootlustre.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/09_waitforreboot.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/10_installOFED.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/11_lustreinstall2.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/12_lustrenetwork.sh create mode 
100755 examples/lustre_rdma_avs/azhpc_install_config/install/13_lfsmaster.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/14_lfsoss.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/15_lfshsm.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/16_lfsclient.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/17_lfsimport.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/18_lfsloganalytics.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/19_pbsdownload.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/20_pbsserver.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/21_pbsclient.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/cndefault.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/disable-selinux.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/installOFED.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsclient.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/lfshsm.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsimport.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsloganalytics.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsmaster.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsoss.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsrepo.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/localuser.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall1.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall2.sh create mode 100755 
examples/lustre_rdma_avs/azhpc_install_config/scripts/lustrenetwork.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/nfsclient.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/nfsserver.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsclient.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsdownload.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsserver.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/rebootlustre.sh create mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/waitforreboot.sh create mode 100644 examples/lustre_rdma_avs/config.json create mode 100644 examples/lustre_rdma_avs/deploy_config.json create mode 100644 examples/lustre_rdma_avs/hpcadmin_id_rsa create mode 100644 examples/lustre_rdma_avs/hpcadmin_id_rsa.pub create mode 100644 examples/lustre_rdma_avs/readme.md create mode 100755 examples/lustre_rdma_avs/scripts/installOFED.sh create mode 100755 examples/lustre_rdma_avs/scripts/lfsclient.sh create mode 100755 examples/lustre_rdma_avs/scripts/lfsmaster.sh create mode 100755 examples/lustre_rdma_avs/scripts/lfsoss.sh create mode 100755 examples/lustre_rdma_avs/scripts/lfsrepo.sh create mode 100755 examples/lustre_rdma_avs/scripts/lustreinstall1.sh create mode 100755 examples/lustre_rdma_avs/scripts/lustreinstall2.sh create mode 100755 examples/lustre_rdma_avs/scripts/lustrenetwork.sh create mode 100755 examples/lustre_rdma_avs/scripts/oldreboot create mode 100755 examples/lustre_rdma_avs/scripts/rebootlustre.sh create mode 100755 examples/lustre_rdma_avs/scripts/removeMOFED.sh create mode 100755 examples/lustre_rdma_avs/scripts/waitforreboot.sh create mode 100644 examples/lustre_rdma_avs/writeup create mode 100644 examples/lustre_rdma_avs/writeuplustreipoib diff --git a/examples/lustre_ipoib/config.json b/examples/lustre_ipoib/config.json index 
8de923c03..0127dc390 100644 --- a/examples/lustre_ipoib/config.json +++ b/examples/lustre_ipoib/config.json @@ -1,5 +1,5 @@ { - "location": "southcentralus", + "location": "variables.location", "resource_group": "variables.resource_group", "install_from": "headnode", "admin_user": "hpcadmin", @@ -12,6 +12,7 @@ } }, "variables": { + "location": "", "resource_group": "", "image": "OpenLogic:CentOS-HPC:7.6:latest", "lustreimage": "OpenLogic:CentOS-HPC:7.6:latest", diff --git a/examples/lustre_ipoib_nvmedrives/config.json b/examples/lustre_ipoib_nvmedrives/config.json index 8583e738b..4a7bd988b 100644 --- a/examples/lustre_ipoib_nvmedrives/config.json +++ b/examples/lustre_ipoib_nvmedrives/config.json @@ -1,5 +1,5 @@ { - "location": "southcentralus", + "location": "variables.location", "resource_group": "variables.resource_group", "install_from": "headnode", "admin_user": "hpcadmin", @@ -12,6 +12,7 @@ } }, "variables": { + "location" : "", "resource_group": "", "image": "OpenLogic:CentOS-HPC:7.6:latest", "lustreimage": "OpenLogic:CentOS-HPC:7.6:latest", diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/compute b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/compute new file mode 100644 index 000000000..232110d4a --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/compute @@ -0,0 +1,2 @@ +compute0001 +compute0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/headnode b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/headnode new file mode 100644 index 000000000..1a9798066 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/headnode @@ -0,0 +1 @@ +headnode diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/lfsmaster b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/lfsmaster new file mode 100644 index 000000000..a47bf87fe --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/lfsmaster @@ -0,0 +1 @@ +lfsmaster diff 
--git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/linux b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/linux new file mode 100644 index 000000000..337053fb6 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/linux @@ -0,0 +1,6 @@ +headnode +compute0001 +compute0002 +lfsmaster +lustre0001 +lustre0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/lustre b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/lustre new file mode 100644 index 000000000..b8f9b2061 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/lustre @@ -0,0 +1,2 @@ +lustre0001 +lustre0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/cndefault b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/cndefault new file mode 100644 index 000000000..337053fb6 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/cndefault @@ -0,0 +1,6 @@ +headnode +compute0001 +compute0002 +lfsmaster +lustre0001 +lustre0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/disable-selinux b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/disable-selinux new file mode 100644 index 000000000..337053fb6 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/disable-selinux @@ -0,0 +1,6 @@ +headnode +compute0001 +compute0002 +lfsmaster +lustre0001 +lustre0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsazimport b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsazimport new file mode 100644 index 000000000..1a9798066 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsazimport @@ -0,0 +1 @@ +headnode diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsclient b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsclient new file mode 100644 index 000000000..8af893f49 --- /dev/null +++ 
b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsclient @@ -0,0 +1,3 @@ +headnode +compute0001 +compute0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsloganalytics b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsloganalytics new file mode 100644 index 000000000..6453c2e60 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsloganalytics @@ -0,0 +1,3 @@ +lfsmaster +lustre0001 +lustre0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsmaster b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsmaster new file mode 100644 index 000000000..a47bf87fe --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsmaster @@ -0,0 +1 @@ +lfsmaster diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsrepo b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsrepo new file mode 100644 index 000000000..337053fb6 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsrepo @@ -0,0 +1,6 @@ +headnode +compute0001 +compute0002 +lfsmaster +lustre0001 +lustre0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/localuser b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/localuser new file mode 100644 index 000000000..337053fb6 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/localuser @@ -0,0 +1,6 @@ +headnode +compute0001 +compute0002 +lfsmaster +lustre0001 +lustre0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/loginnode b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/loginnode new file mode 100644 index 000000000..1a9798066 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/loginnode @@ -0,0 +1 @@ +headnode diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lustre 
b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lustre new file mode 100644 index 000000000..6453c2e60 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lustre @@ -0,0 +1,3 @@ +lfsmaster +lustre0001 +lustre0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsclient b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsclient new file mode 100644 index 000000000..748d1c5dc --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsclient @@ -0,0 +1,5 @@ +compute0001 +compute0002 +lfsmaster +lustre0001 +lustre0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsserver b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsserver new file mode 100644 index 000000000..1a9798066 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsserver @@ -0,0 +1 @@ +headnode diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/ossnode b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/ossnode new file mode 100644 index 000000000..b8f9b2061 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/ossnode @@ -0,0 +1,2 @@ +lustre0001 +lustre0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsclient b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsclient new file mode 100644 index 000000000..232110d4a --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsclient @@ -0,0 +1,2 @@ +compute0001 +compute0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsserver b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsserver new file mode 100644 index 000000000..1a9798066 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsserver @@ -0,0 +1 @@ +headnode diff --git 
a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/rebootlustre b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/rebootlustre new file mode 100644 index 000000000..1a9798066 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/rebootlustre @@ -0,0 +1 @@ +headnode diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa b/examples/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa new file mode 100644 index 000000000..7846d2b39 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa @@ -0,0 +1,27 @@ +-----BEGIN RSA PRIVATE KEY----- +MIIEpAIBAAKCAQEA1JCjVUGcKCYN3RCERznjr7e1Chsf+DG30uluSXk3I/6nesto +5gLGfKiTjeHWvX5tqFITAA84r140AgsIcUHEpaWwk06QIVUTj6kDHbubP0i1V2EY +2sa6cm6hPQmsFIiOK578BLuv/Zda/arVJ1dq1q+1t0tt84TTCrsROszNw8t9Kc3Z +Gn2SY7F52Z8nttmN7OEsfUtg6K6f/5IwbJb7U8b/0jF6yWDpzrmqN33BJfrZ1VWs +jswhblxJZ0juAU/oAB0xtOzqM2vwUZy9FmcfRPo/U1gM4DUG1h37oWWkoQLhgURu +p3Lztqq8msXXsnk3ZnIkMWWNJ429fN2ui751QQIDAQABAoIBAGYQfRy2wDBW9Vks +UReSKE17PCZ6F8Oou8c95oLI/Tz/TZOcj+XBd2Tr3M3HnsCmMCkeH5lrtaAe74H7 +ojYfijivcjWJB5O5sgbM9H4WUtj0JH6sVK7XtTa1AB66wjGpz/oKAKCVLk/pmPss +R+T4CIjFHc/BHC5NnLgOUpuVM0fLUUUF8NmIvT6K0P4j7GZx12d1TDkqo+/rd1ku +EOuCjl8Q4bTO0qtJEXy2dmn38m6QGNS765j8gQ21wWY+Q7EX4JaJ+oO2ZgGuyYul +Cu+AFlCR4SkOok0DN6RG4KQ7Sly57HrZWwLI46FXmjiJqE/7wNvMwuHdUmnVbkoY +v04fxAECgYEA8ii6KMsPIxnMSCBpsRoFSOcPdSyoFyhMCCuiR9liCGRG4wz4u1i6 +ZFal1+d/rX6qxCTIZxvU8zn54Qsrr+44zV++4+Sd/nhrc+qWOxGggAscbYNG3w2g +GTGinERFPRs5iGmdJ0n+uy/TSPe5t0qH85AdKcU47mfrNb3Q08rEfxECgYEA4Lbj +zkCUa4UN6CP36FtOUNtpnrn7dxfDNpcS8CTt/oob2OifOUGhgPbCio5at7zE8cH0 +hWrUWFPDfBRliGdG/ZzdmIOaC0MU9eQG4JxkblgYccKpcYsTq45NDyhQJ0lbBjRG +Sp42HOnvZ8p0m9przrnQF22Bvr5E+VF1wVk18zECgYEA7pI9RS84jIZAAdcdCYPv +LPGnAvOp7paewXXrfQmnUUkppUsESd6SU4QiA2FpIk4mgvMSFLMQy0eU7KeKtNrn +Tz5C3FZBaZDNm/fDZhJpo3xO13179wh/cBK8d2OzKw6FUeVrFGgL8/KcH8kfSHq/ +EbAraxmIiygKTHnjIKUljWECgYAQxhYjIzbw/7GWDnlG4unppzcvHfrjXOa5gHVt 
+b5REV9LUUijwgTGpCsJizVWAOZsJ4Mx72QmYvkftTyh1EiB+deMkq04oYQ2DfU32 +HjZw9ip882bqjtMdDzY5V20EQbmFsQk+MKkhZ2Tzfm1N5PP/LmeWGBqDPnivk6ES +mbIpQQKBgQDqnc9KivmjPIHz2BpJh8icWkdvZ2WzycI3Sly6Suh0E6Q+epMTXUm3 +21TIEkkAlBYXkHs0ZhL7l7jzv5yYSGB8ZNDzk+UquE5OuxMwWsd3trqyJ3LMj9C5 +hV6JTHqNSw8xubCES0oRgJkcCedoQ0qxMwypnJarWPh/LSVCu3BZ2A== +-----END RSA PRIVATE KEY----- diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa.pub b/examples/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa.pub new file mode 100644 index 000000000..20776c3a0 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa.pub @@ -0,0 +1 @@ +ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDUkKNVQZwoJg3dEIRHOeOvt7UKGx/4MbfS6W5JeTcj/qd6y2jmAsZ8qJON4da9fm2oUhMADzivXjQCCwhxQcSlpbCTTpAhVROPqQMdu5s/SLVXYRjaxrpybqE9CawUiI4rnvwEu6/9l1r9qtUnV2rWr7W3S23zhNMKuxE6zM3Dy30pzdkafZJjsXnZnye22Y3s4Sx9S2Dorp//kjBslvtTxv/SMXrJYOnOuao3fcEl+tnVVayOzCFuXElnSO4BT+gAHTG07Ooza/BRnL0WZx9E+j9TWAzgNQbWHfuhZaShAuGBRG6ncvO2qryaxdeyeTdmciQxZY0njb183a6LvnVB diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/00_install_node_setup.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/00_install_node_setup.sh new file mode 100755 index 000000000..d5e1850c6 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/00_install_node_setup.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=linux + +if [ ! 
-f "hostlists/$tag" ]; then + echo "no hostlist ($tag), exiting" + exit 0 +fi + +# wait for DNS to update for all hostnames +for h in $(/dev/null 2>&1; do + echo "Waiting for host - $h (sleeping for 5 seconds)" + sleep 5 + done +done + +if [ "$1" != "" ]; then + tag=tags/$1 +else + sudo yum install -y epel-release > install/00_install_node_setup.log 2>&1 + sudo yum install -y pssh nc >> install/00_install_node_setup.log 2>&1 + + # setting up keys + cat < ~/.ssh/config + Host * + StrictHostKeyChecking no + UserKnownHostsFile /dev/null + LogLevel ERROR +EOF + cp hpcadmin_id_rsa.pub ~/.ssh/id_rsa.pub + cp hpcadmin_id_rsa ~/.ssh/id_rsa + chmod 600 ~/.ssh/id_rsa + chmod 644 ~/.ssh/config + chmod 644 ~/.ssh/id_rsa.pub + +fi + +pssh -p 50 -t 0 -i -h hostlists/$tag 'rpm -q rsync || sudo yum install -y rsync' >> install/00_install_node_setup.log 2>&1 + +prsync -p 50 -a -h hostlists/$tag ~/azhpc_install_config ~ >> install/00_install_node_setup.log 2>&1 +prsync -p 50 -a -h hostlists/$tag ~/.ssh ~ >> install/00_install_node_setup.log 2>&1 + +pssh -p 50 -t 0 -i -h hostlists/$tag 'echo "AcceptEnv PSSH_NODENUM PSSH_HOST" | sudo tee -a /etc/ssh/sshd_config' >> install/00_install_node_setup.log 2>&1 +pssh -p 50 -t 0 -i -h hostlists/$tag 'sudo systemctl restart sshd' >> install/00_install_node_setup.log 2>&1 +pssh -p 50 -t 0 -i -h hostlists/$tag "echo 'Defaults env_keep += \"PSSH_NODENUM PSSH_HOST\"' | sudo tee -a /etc/sudoers" >> install/00_install_node_setup.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/01_disable-selinux.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/01_disable-selinux.sh new file mode 100755 index 000000000..aff9f6abd --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/01_disable-selinux.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-disable-selinux} + +if [ ! 
-f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/disable-selinux.sh" >> install/01_disable-selinux.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/02_cndefault.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/02_cndefault.sh new file mode 100755 index 000000000..89df21b38 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/02_cndefault.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-cndefault} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/cndefault.sh" >> install/02_cndefault.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/03_nfsserver.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/03_nfsserver.sh new file mode 100755 index 000000000..9fe8fc049 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/03_nfsserver.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-nfsserver} + +if [ ! 
-f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/nfsserver.sh" >> install/03_nfsserver.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/04_nfsclient.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/04_nfsclient.sh new file mode 100755 index 000000000..3ef1d7dd2 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/04_nfsclient.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-nfsclient} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/nfsclient.sh '$(> install/04_nfsclient.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/05_localuser.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/05_localuser.sh new file mode 100755 index 000000000..547517af7 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/05_localuser.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-localuser} + +if [ ! 
-f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/localuser.sh '$(> install/05_localuser.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/06_lfsrepo.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/06_lfsrepo.sh new file mode 100755 index 000000000..c51d1a7bf --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/06_lfsrepo.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-lfsrepo} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsrepo.sh '2.10'" >> install/06_lfsrepo.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/07_lustreinstall1.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/07_lustreinstall1.sh new file mode 100755 index 000000000..9c9e725e1 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/07_lustreinstall1.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-lustre} + +if [ ! 
-f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lustreinstall1.sh" >> install/07_lustreinstall1.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/08_rebootlustre.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/08_rebootlustre.sh new file mode 100755 index 000000000..cafe2dccc --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/08_rebootlustre.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-rebootlustre} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/rebootlustre.sh '$(> install/08_rebootlustre.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/09_waitforreboot.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/09_waitforreboot.sh new file mode 100755 index 000000000..e7f2585c1 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/09_waitforreboot.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." 
+ +scripts/waitforreboot.sh >> install/09_waitforreboot.log 2>&1 + diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/10_installOFED.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/10_installOFED.sh new file mode 100755 index 000000000..0a9d5144c --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/10_installOFED.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-lustre} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/installOFED.sh" >> install/10_installOFED.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/11_lustreinstall2.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/11_lustreinstall2.sh new file mode 100755 index 000000000..415de3119 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/11_lustreinstall2.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-lustre} + +if [ ! 
-f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lustreinstall2.sh" >> install/11_lustreinstall2.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/12_lustrenetwork.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/12_lustrenetwork.sh new file mode 100755 index 000000000..210bc389e --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/12_lustrenetwork.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-lustre} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lustrenetwork.sh" >> install/12_lustrenetwork.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/13_lfsmaster.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/13_lfsmaster.sh new file mode 100755 index 000000000..5dead31c8 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/13_lfsmaster.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-lfsmaster} + +if [ ! 
-f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsmaster.sh '/dev/sdb'" >> install/13_lfsmaster.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/14_lfsoss.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/14_lfsoss.sh new file mode 100755 index 000000000..0b2f013ae --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/14_lfsoss.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-ossnode} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsoss.sh '$(head -n1 hostlists/tags/lfsmaster)' '/dev/nvme0n1'" >> install/14_lfsoss.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/15_lfshsm.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/15_lfshsm.sh new file mode 100755 index 000000000..479abe10e --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/15_lfshsm.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-lustre} + +if [ ! 
-f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfshsm.sh '$(head -n1 hostlists/tags/lustre)' 'lustretesting' 'TXOO/DhcJHGjjcNQ58f9SGCRF3RUuz3/UHaE70KbDAHhIkd38Ic5YXVlFcdxuytgk8pDg0sp5J9lCdOWr++sXA==' 'hsm' '2.10'" >> install/15_lfshsm.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/16_lfsclient.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/16_lfsclient.sh new file mode 100755 index 000000000..e6e74eb5c --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/16_lfsclient.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-lfsclient} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsclient.sh '$(head -n1 hostlists/tags/lfsmaster)' '/lustre'" >> install/16_lfsclient.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/17_lfsimport.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/17_lfsimport.sh new file mode 100755 index 000000000..c23853cd8 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/17_lfsimport.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-lfsazimport} + +if [ ! 
-f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsimport.sh 'lustretesting' 'TXOO/DhcJHGjjcNQ58f9SGCRF3RUuz3/UHaE70KbDAHhIkd38Ic5YXVlFcdxuytgk8pDg0sp5J9lCdOWr++sXA==' 'hsm' '/lustre' '2.10'" >> install/17_lfsimport.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/18_lfsloganalytics.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/18_lfsloganalytics.sh new file mode 100755 index 000000000..d2a6ff976 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/18_lfsloganalytics.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-lfsloganalytics} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsloganalytics.sh 'lfs' 'eb2e4150-e0fa-494d-8f60-291e27820eff' '0iKHSuo3C36gwxYYZSBIIVB8g5l7A1qztuF77oVwZlFV9iKqke/Jajc+qVLkt1SB7LNimpeb3Q++qerMtnZvuw=='" >> install/18_lfsloganalytics.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/19_pbsdownload.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/19_pbsdownload.sh new file mode 100755 index 000000000..9731feb81 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/19_pbsdownload.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-loginnode} + +if [ ! 
-f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; scripts/pbsdownload.sh" >> install/19_pbsdownload.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/20_pbsserver.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/20_pbsserver.sh new file mode 100755 index 000000000..0a2c0cf2d --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/20_pbsserver.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-pbsserver} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pscp.pssh -p 50 -h hostlists/tags/$tag pbspro_19.1.1.centos7/pbspro-server-19.1.1-0.x86_64.rpm $(pwd) >> install/20_pbsserver.log 2>&1 +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/pbsserver.sh" >> install/20_pbsserver.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/21_pbsclient.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/21_pbsclient.sh new file mode 100755 index 000000000..1c354d17f --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/install/21_pbsclient.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-pbsclient} + +if [ ! 
-f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pscp.pssh -p 50 -h hostlists/tags/$tag pbspro_19.1.1.centos7/pbspro-execution-19.1.1-0.x86_64.rpm $(pwd) >> install/21_pbsclient.log 2>&1 +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/pbsclient.sh '$(> install/21_pbsclient.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/cndefault.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/cndefault.sh new file mode 100755 index 000000000..303ebac1b --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/scripts/cndefault.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Script to be run on all compute nodes +if ! rpm -q epel-release; then + yum -y install epel-release +fi + +yum -y install git jq htop + +# change access to resource so that temp jobs can be written there +chmod 777 /mnt/resource + +# If running on Cycle +# - enable METADATA access +# - remove Jetpack convergence +# - Disable Fail2Ban service +# - Fix PBS limits +if [ -e $CYCLECLOUD_HOME/bin/jetpack ]; then + DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + $DIR/azhpc4cycle.sh enable_metada_access + $DIR/azhpc4cycle.sh disable_jetpack_converge + $DIR/azhpc4cycle.sh disable_fail2ban + $DIR/azhpc4cycle.sh fix_pbs_limits +fi diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/disable-selinux.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/disable-selinux.sh new file mode 100755 index 000000000..00c87bbf2 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/scripts/disable-selinux.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +# set to permissive for now (until reboot) +setenforce 0 +# prep to have selinux disabled after reboot +sed -i 's/SELINUX=.*$/SELINUX=disabled/g' /etc/selinux/config diff --git 
a/examples/lustre_rdma_avs/azhpc_install_config/scripts/installOFED.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/installOFED.sh new file mode 100755 index 000000000..c267519fc --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/scripts/installOFED.sh @@ -0,0 +1,4 @@ +#!/bin/bash +yum -y groupinstall --skip-broken "Infiniband Support" 2>/dev/null +echo "done installing Infiniband" +exit 0 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsclient.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsclient.sh new file mode 100755 index 000000000..26603bebd --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsclient.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# arg: $1 = lfsserver +# arg: $2 = mount point (default: /lustre) +master=$1 +lfs_mount=${2:-/lustre} +mkdir ~/.ssh + +cp -r /share/home/hpcuser/.ssh ~/ + +#Include the correct rdma options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab +mount -a +chmod 777 $lfs_mount diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfshsm.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfshsm.sh new file mode 100755 index 000000000..0af1fc5e2 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfshsm.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +# arg: $1 = lfsserver +# arg: $2 = storage account +# arg: $3 = storage key +# arg: $4 = storage container +# arg: $5 = lustre version (default 2.10) +master=$1 +storage_account=$2 +storage_key=$3 +storage_container=$4 +lustre_version=${5-2.10} + +# remove the patch version +ndots=${lustre_version//[^.]} +if [ "${#ndots}" = "2" ]; then + lustre_version=${lustre_version%.*} +fi + +# adding kernel module for lustre client +if [ "$lustre_version" = "2.10" ]; then + yum install -y kmod-lustre-client + weak-modules --add-kernel $(uname -r) +fi + +if ! 
rpm -q lemur-azure-hsm-agent lemur-azure-data-movers; then + yum -y install \ + https://azurehpc.azureedge.net/rpms/lemur-azure-hsm-agent-1.0.0-lustre_${lustre_version}.x86_64.rpm \ + https://azurehpc.azureedge.net/rpms/lemur-azure-data-movers-1.0.0-lustre_${lustre_version}.x86_64.rpm +fi + +mkdir -p /var/run/lhsmd +chmod 755 /var/run/lhsmd + +mkdir -p /etc/lhsmd +chmod 755 /etc/lhsmd + +cat </etc/lhsmd/agent +# Lustre NID and filesystem name for the front end filesystem, the agent will mount this +client_device="${master}@tcp:/LustreFS" + +# Do you want to use S3 and POSIX, in this example we use POSIX +enabled_plugins=["lhsm-plugin-az"] + +## Directory to look for the plugins +plugin_dir="/usr/libexec/lhsmd" + +# TBD, I used 16 +handler_count=16 + +# TBD +snapshots { + enabled = false +} +EOF +chmod 600 /etc/lhsmd/agent + +cat </etc/lhsmd/lhsm-plugin-az +az_storage_account = "$storage_account" +az_storage_key = "$storage_key" + +num_threads = 32 + +# +# One or more archive definition is required. 
+# +archive "az-blob" { + id = 1 # Must be unique to this endpoint + container = "$storage_container" # Container used for this archive + prefix = "" # Optional prefix + num_threads = 32 +} +EOF +chmod 600 /etc/lhsmd/lhsm-plugin-az + +cat </etc/systemd/system/lhsmd.service +[Unit] +Description=The lhsmd server +After=syslog.target network.target remote-fs.target nss-lookup.target + +[Service] +Type=simple +PIDFile=/run/lhsmd.pid +ExecStartPre=/bin/mkdir -p /var/run/lhsmd +ExecStart=/sbin/lhsmd -config /etc/lhsmd/agent +Restart=always + +[Install] +WantedBy=multi-user.target +EOF +chmod 600 /etc/systemd/system/lhsmd.service + +systemctl daemon-reload +systemctl enable lhsmd +systemctl start lhsmd diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsimport.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsimport.sh new file mode 100755 index 000000000..fd9fad30b --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsimport.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# arg: $1 = storage account +# arg: $2 = storage key +# arg: $3 = storage container +# arg: $3 = lfs mount +# arg: $4 = lustre mount (default=/lustre) +# arg: $5 = lustre version (default=2.10) +storage_account=$1 +storage_key=$2 +storage_container=$3 +lfs_mount=${4:-/lustre} +lustre_version=${5-2.10} + +# remove the patch version +ndots=${lustre_version//[^.]} +if [ "${#ndots}" = "2" ]; then + lustre_version=${lustre_version%.*} +fi + +if ! 
rpm -q lemur-azure-hsm-agent lemur-azure-data-movers; then + yum -y install \ + https://azurehpc.azureedge.net/rpms/lemur-azure-hsm-agent-1.0.0-lustre_${lustre_version}.x86_64.rpm \ + https://azurehpc.azureedge.net/rpms/lemur-azure-data-movers-1.0.0-lustre_${lustre_version}.x86_64.rpm +fi + +cd $lfs_mount +export STORAGE_ACCOUNT=$storage_account +export STORAGE_KEY=$storage_key +/sbin/azure-import ${storage_container} + diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsloganalytics.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsloganalytics.sh new file mode 100755 index 000000000..ce6b43f3d --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsloganalytics.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# arg: $1 = name +# arg: $2 = log analytics workspace id +# arg: $3 = log analytics key + +name=$1 +log_analytics_workspace_id=$2 +log_analytics_key=$3 + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +sed "s#__FS_NAME__#${name}#g;s#__LOG_ANALYTICS_WORKSPACE_ID__#${log_analytics_workspace_id}#g;s#__LOG_ANALYTICS_KEY__#${log_analytics_key}#g" $DIR/lfsloganalyticsd.sh.in >/usr/bin/lfsloganalyticsd.sh + +chmod +x /usr/bin/lfsloganalyticsd.sh + +cat </lib/systemd/system/lfsloganalytics.service +[Unit] +Description=Lustre logging service to Log Analytics. + +[Service] +Type=simple +ExecStart=/bin/bash /usr/bin/lfsloganalyticsd.sh +Restart=always + +[Install] +WantedBy=multi-user.target +EOF + +systemctl enable lfsloganalytics +systemctl start lfsloganalytics \ No newline at end of file diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsmaster.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsmaster.sh new file mode 100755 index 000000000..d2dcdb02e --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsmaster.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# arg: $1 = device (e.g. 
L=/dev/sdb Lv2=/dev/nvme0n1) +device=$1 + +# this will only install MDS on first node in a scaleset +echo "pssh_nodenum is $PSSH_NODENUM" + +cp -r /share/home/hpcuser/.ssh /root/ + +#Include the correct rdma options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab + mount -a + + # set up hsm + lctl set_param -P mdt.*-MDT0000.hsm_control=enabled + lctl set_param -P mdt.*-MDT0000.hsm.default_archive_id=1 + lctl set_param mdt.*-MDT0000.hsm.max_requests=128 + + # allow any user and group ids to write + lctl set_param mdt.*-MDT0000.identity_upcall=NONE + + diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsoss.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsoss.sh new file mode 100755 index 000000000..8f39aac68 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsoss.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# arg: $1 = lfsmaster +# arg: $2 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) +master=$1 +device=$2 + +cp -r /share/home/hpcuser/.ssh /root/ + +index=$(($PSSH_NODENUM + 1)) +myuser="hpcuser" + +capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") +masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) + + lnetctl net add --net o2ib --if ib0 #double check + mkfs.lustre \ + --fsname=LustreFS \ + --backfstype=ldiskfs \ + --reformat \ + --ost \ + --mgsnode="${masterib}@o2ib" \ + --index=$index \ + --mountfsoptions="errors=remount-ro" \ + $device + + +mkdir /mnt/oss +echo "$device /mnt/oss lustre noatime,nodiratime,nobarrier 0 2" >> /etc/fstab +mount -a diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsrepo.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsrepo.sh new file mode 100755 index 000000000..db1eeb165 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsrepo.sh @@ -0,0 +1,27 @@ +#!/bin/bash +lustre_version=${1-2.10} + +cat << EOF >/etc/yum.repos.d/LustrePack.repo +[lustreserver] +name=lustreserver 
+baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/server/ +enabled=1 +gpgcheck=0 + +[e2fs] +name=e2fs +baseurl=https://downloads.whamcloud.com/public/e2fsprogs/latest/el7/ +enabled=1 +gpgcheck=0 + +[lustreclient] +name=lustreclient +baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/client/ +enabled=1 +gpgcheck=0 +EOF + +#Include the correct rdma options +#cat >/etc/modprobe.d/lustre.conf<$home_root/$new_user/.ssh/config +Host * + StrictHostKeyChecking no + UserKnownHostsFile /dev/null + LogLevel ERROR +EOF + ssh-keygen -f $home_root/$new_user/.ssh/id_rsa -t rsa -N '' + # add admin user public key (the only user in /home) + cat /home/*/.ssh/id_rsa.pub >$home_root/$new_user/.ssh/authorized_keys + cat $home_root/$new_user/.ssh/id_rsa.pub >>$home_root/$new_user/.ssh/authorized_keys + chown $new_user:$new_user $home_root/$new_user/.ssh + chown $new_user:$new_user $home_root/$new_user/.ssh/* + chmod 700 $home_root/$new_user/.ssh + chmod 600 $home_root/$new_user/.ssh/id_rsa + chmod 644 $home_root/$new_user/.ssh/id_rsa.pub + chmod 644 $home_root/$new_user/.ssh/config + chmod 644 $home_root/$new_user/.ssh/authorized_keys +fi +echo "$new_user ALL=(ALL) NOPASSWD: ALL" | tee -a /etc/sudoers diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall1.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall1.sh new file mode 100755 index 000000000..c052001a0 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall1.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# jump the gun here to ensure passwordless ssh as root between all lustre nodes to faciltate node reboot +cp -r /share/home/hpcuser/.ssh ~/ + +yum -y --nogpgcheck --disablerepo=* --enablerepo=e2fs install e2fsprogs + +yum -y --nogpgcheck --disablerepo=base,extras,updates --enablerepo=lustreserver install kernel kernel-devel kernel-headers kernel-tools kernel-tools-libs 2>/dev/null + 
diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall2.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall2.sh new file mode 100755 index 000000000..60f3e759e --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall2.sh @@ -0,0 +1,10 @@ +#!/bin/bash +yum -y --nogpgcheck --enablerepo=lustreserver install kmod-lustre kmod-lustre-osd-ldiskfs lustre-osd-ldiskfs-mount lustre lustre-resource-agents +modprobe -v lustre + +sed -i 's/ResourceDisk\.Format=y/ResourceDisk.Format=n/g' /etc/waagent.conf +sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf + +weak-modules --add-kernel --no-initramfs +systemctl enable lustre +umount /mnt/resource diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lustrenetwork.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lustrenetwork.sh new file mode 100755 index 000000000..f95d33864 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lustrenetwork.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf +service waagent restart +service rdma start +modprobe lnet +lctl network configure +lnetctl net add --net o2ib --if ib0 #need this to come up every time +sleep 5 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/nfsclient.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/nfsclient.sh new file mode 100755 index 000000000..678bac4dd --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/scripts/nfsclient.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# arg: $1 = nfsserver +nfs_server=$1 +nfs_share=${2-/share} +if [ -z "$nfs_server" ]; then + echo "The nfs_server is required" + exit 1 +fi + +yum install -y nfs-utils + +mkdir -p /scratch +mkdir -p /apps +mkdir -p /data +mkdir -p /share/home +mount $nfs_server:$nfs_share/apps /apps +mount $nfs_server:$nfs_share/data /data +mount $nfs_server:$nfs_share/home /share/home + +chmod 777 /scratch + 
+cat << EOF >> /etc/fstab +$nfs_server:$nfs_share/home /share/home nfs defaults 0 0 +$nfs_server:/mnt/resource/scratch /scratch nfs defaults 0 0 +$nfs_server:$nfs_share/apps /apps nfs defaults 0 0 +$nfs_server:$nfs_share/data /data nfs defaults 0 0 +EOF + +setsebool -P use_nfs_home_dirs 1 + +mount -a + +df diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/nfsserver.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/nfsserver.sh new file mode 100755 index 000000000..14d53a4c0 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/scripts/nfsserver.sh @@ -0,0 +1,212 @@ +#!/bin/bash +if [[ $(id -u) -ne 0 ]] ; then + echo "Must be run as root" + exit 1 +fi + +# Disable requiretty to allow run sudo within scripts +sed -i -e 's/Defaults requiretty.*/ #Defaults requiretty/g' /etc/sudoers + +yum -y install epel-release +yum -y install nfs-utils nfs-utils-lib + +# Shares +NFS_MOUNT_POINT=/share +NFS_APPS=$NFS_MOUNT_POINT/apps +NFS_DATA=$NFS_MOUNT_POINT/data +NFS_HOME=$NFS_MOUNT_POINT/home +NFS_SCRATCH=/mnt/resource/scratch + +# Partitions all data disks attached to the VM +# +setup_data_disks() +{ + mountPoint="$1" + filesystem="$2" + devices="$3" + raidDevice="$4" + createdPartitions="" + numdevices=`echo $devices | wc -w` + if [ $numdevices -gt 1 ] + then + # Loop through and partition disks until not found + for disk in $devices; do + fdisk -l /dev/$disk || break + fdisk /dev/$disk << EOF +n +p +1 + + +t +fd +w +EOF + createdPartitions="$createdPartitions /dev/${disk}1" + done + else + disk=$(echo $devices | tr -d [:space:]) + echo "Warning: Only a single device to partition, $disk" + fdisk -l /dev/$disk || break + fdisk /dev/$disk << EOF +n +p +1 + + +w +EOF + createdPartitions="$createdPartitions /dev/${disk}1" + fi + + sleep 10 + + # Create RAID-0 volume + if [ -n "$createdPartitions" ]; then + devices=`echo $createdPartitions | wc -w` + if [ $numdevices -gt 1 ] + then + mdadm --create /dev/$raidDevice --level 0 --raid-devices 
$devices $createdPartitions + sleep 10 + + mdadm /dev/$raidDevice + else + echo "Warning: mdadm is not called, we have one partition named, ${disk}1 for mountpoint, $mountPoint" + raidDevice=${disk}1 + fi + + if [ "$filesystem" == "xfs" ]; then + mkfs -t $filesystem /dev/$raidDevice + export xfsuuid="UUID=`blkid |grep dev/$raidDevice |cut -d " " -f 2 |cut -c 7-42`" +# echo "$xfsuuid $mountPoint $filesystem rw,noatime,attr2,inode64,nobarrier,sunit=1024,swidth=4096,nofail 0 2" >> /etc/fstab + echo "$xfsuuid $mountPoint $filesystem rw,noatime,attr2,inode64,nobarrier,nofail 0 2" >> /etc/fstab + else + mkfs.ext4 -i 2048 -I 512 -J size=400 -Odir_index,filetype /dev/$raidDevice + sleep 5 + tune2fs -o user_xattr /dev/$raidDevice + export ext4uuid="UUID=`blkid |grep dev/$raidDevice |cut -d " " -f 2 |cut -c 7-42`" + echo "$ext4uuid $mountPoint $filesystem noatime,nodiratime,nobarrier,nofail 0 2" >> /etc/fstab + fi + + sleep 10 + mount -a + fi +} + +setup_single_disk() +{ + mountPoint="$1" + filesystem="$2" + device="$3" + + fdisk -l /dev/$device || break + fdisk /dev/$device << EOF +n +p +1 + + +p +w +EOF + + if [ "$filesystem" == "xfs" ]; then + mkfs -t $filesystem /dev/$device + echo "/dev/$device $mountPoint $filesystem rw,noatime,attr2,inode64,nobarrier,nofail 0 2" >> /etc/fstab + else + mkfs.ext4 -F -i 2048 -I 512 -J size=400 -Odir_index,filetype /dev/$device + sleep 5 + tune2fs -o user_xattr /dev/$device + echo "/dev/$device $mountPoint $filesystem noatime,nodiratime,nobarrier,nofail 0 2" >> /etc/fstab + fi + + sleep 10 + + mount /dev/$device $mountPoint +} + +setup_disks() +{ + # Dump the current disk config for debugging + fdisk -l + + # Dump the scsi config + lsscsi + + # Get the root/OS disk so we know which device it uses and can ignore it later + rootDevice=`mount | grep "on / type" | awk '{print $1}' | sed 's/[0-9]//g'` + + # Get the TMP disk so we know which device and can ignore it later + tmpDevice=`mount | grep "on /mnt/resource type" | awk '{print $1}' | 
sed 's/[0-9]//g'` + + # Get the data disk sizes from fdisk, we ignore the disks above + dataDiskSize=`fdisk -l | grep '^Disk /dev/' | grep -v $rootDevice | grep -v $tmpDevice | awk '{print $3}' | sort -n -r | tail -1` + + # Compute number of disks + nbDisks=`fdisk -l | grep '^Disk /dev/' | grep -v $rootDevice | grep -v $tmpDevice | wc -l` + echo "nbDisks=$nbDisks" + + dataDevices="`fdisk -l | grep '^Disk /dev/' | grep $dataDiskSize | awk '{print $2}' | awk -F: '{print $1}' | sort | head -$nbDisks | tr '\n' ' ' | sed 's|/dev/||g'`" + + mkdir -p $NFS_MOUNT_POINT + + + if [ "$nbDisks" -eq "1" ]; then + setup_single_disk $NFS_MOUNT_POINT "ext4" "$dataDevices" + elif [ "$nbDisks" -gt "1" ]; then + setup_data_disks $NFS_MOUNT_POINT "xfs" "$dataDevices" "md10" + fi + + mkdir -p $NFS_APPS + mkdir -p $NFS_DATA + mkdir -p $NFS_HOME + mkdir -p $NFS_SCRATCH + chmod 777 $NFS_APPS + chmod 777 $NFS_DATA + chmod 777 $NFS_HOME + chmod 777 $NFS_SCRATCH + + ln -s $NFS_SCRATCH /scratch + + echo "$NFS_APPS *(rw,sync,no_root_squash)" >> /etc/exports + echo "$NFS_DATA *(rw,sync,no_root_squash)" >> /etc/exports + echo "$NFS_HOME *(rw,sync,no_root_squash)" >> /etc/exports + echo "$NFS_SCRATCH *(rw,sync,no_root_squash)" >> /etc/exports + + exportfs + exportfs -a + exportfs +} + +tune_nfs() +{ + cores=$(grep processor /proc/cpuinfo | wc -l) + nfs_proc=$(($cores * 4)) + replace="s/#RPCNFSDCOUNT=16/RPCNFSDCOUNT=$nfs_proc/g" + sed -i -e "$replace" /etc/sysconfig/nfs + + grep RPCNFSDCOUNT /etc/sysconfig/nfs +} + +systemctl enable rpcbind +systemctl enable nfs-server +systemctl enable nfs-lock +systemctl enable nfs-idmap +systemctl enable nfs + +systemctl start rpcbind +systemctl start nfs-server +systemctl start nfs-lock +systemctl start nfs-idmap +systemctl start nfs + +setup_disks +tune_nfs +systemctl restart nfs-server + +ln -s /share/apps /apps +ln -s /share/data /data + +df + + diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsclient.sh 
b/examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsclient.sh new file mode 100755 index 000000000..fd037df76 --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsclient.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -e +# arg: $1 = pbs_server +pbs_server=$1 + +if [ "$(rpm -qa pbspro-execution)" = "" ];then + yum install -y pbspro-execution-19.1.1-0.x86_64.rpm + + sed -i "s/CHANGE_THIS_TO_PBS_PRO_SERVER_HOSTNAME/${pbs_server}/g" /etc/pbs.conf + sed -i "s/CHANGE_THIS_TO_PBS_PRO_SERVER_HOSTNAME/${pbs_server}/g" /var/spool/pbs/mom_priv/config + sed -i "s/^if /#if /g" /opt/pbs/lib/init.d/limits.pbs_mom + sed -i "s/^fi/#fi /g" /opt/pbs/lib/init.d/limits.pbs_mom + systemctl enable pbs + systemctl start pbs + + # Retrieve the VMSS name to be used as the pool name for multiple VMSS support + poolName=$(curl -s -H Metadata:true "http://169.254.169.254/metadata/instance?api-version=2018-10-01" | jq -r '.compute.vmScaleSetName') + /opt/pbs/bin/qmgr -c "c n $(hostname) resources_available.pool_name='$poolName'" + +else + echo "PBS client was already installed" +fi diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsdownload.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsdownload.sh new file mode 100755 index 000000000..b4317516b --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsdownload.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +filename=pbspro_19.1.1.centos7.zip + +if [ ! 
-f "$filename" ];then + wget -q https://github.com/PBSPro/pbspro/releases/download/v19.1.1/$filename + unzip $filename +fi + diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsserver.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsserver.sh new file mode 100755 index 000000000..14ee54d1a --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsserver.sh @@ -0,0 +1,19 @@ +#!/bin/bash +set -e +admin_user=$(whoami) + +if [ "$(rpm -qa pbspro-server)" = "" ];then + yum install -y pbspro-server-19.1.1-0.x86_64.rpm + systemctl enable pbs + systemctl start pbs + /opt/pbs/bin/qmgr -c "s s managers += ${admin_user}@*" + /opt/pbs/bin/qmgr -c 's s flatuid=t' + /opt/pbs/bin/qmgr -c 's s job_history_enable=t' + /opt/pbs/bin/qmgr -c 'c r pool_name type=string,flag=h' + + # Update the sched_config file to schedule jobs that request pool_name + sed -i "s/^resources: \"ncpus,/resources: \"ncpus, pool_name,/g" /var/spool/pbs/sched_priv/sched_config + systemctl restart pbs +else + echo "PBSPro already installed" +fi diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/rebootlustre.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/rebootlustre.sh new file mode 100755 index 000000000..2d33c180b --- /dev/null +++ b/examples/lustre_rdma_avs/azhpc_install_config/scripts/rebootlustre.sh @@ -0,0 +1,16 @@ +#!/bin/bash +vmlist=$1 +osscount=$2 +totalcount=$((osscount+2)) +index=0 +#prep headnode +cp -r /share/home/hpcuser/.ssh /root/ +echo "vmlist is ${vmlist[@]}" + +#needs to be done sequentially +for vmname in ${vmlist[@]}; do + echo "Rebooting $vmname" + ssh hpcuser@${vmname} "sudo reboot 2>/dev/null; exit 2>/dev/null" 2>/dev/null + index=$((index+1)) +done +exit 0 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/waitforreboot.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/waitforreboot.sh new file mode 100755 index 000000000..73411ca61 --- /dev/null +++ 
b/examples/lustre_rdma_avs/azhpc_install_config/scripts/waitforreboot.sh @@ -0,0 +1,2 @@ +#!/bin/bash +sleep 180 #enough time for node reboot to continue process diff --git a/examples/lustre_rdma_avs/config.json b/examples/lustre_rdma_avs/config.json new file mode 100644 index 000000000..f366d979f --- /dev/null +++ b/examples/lustre_rdma_avs/config.json @@ -0,0 +1,270 @@ +{ + "location": "variables.location", + "resource_group": "variables.resource_group", + "install_from": "headnode", + "admin_user": "hpcadmin", + "vnet": { + "name": "hpcvnet", + "address_prefix": "10.2.0.0/20", + "subnets": { + "compute": "10.2.0.0/22", + "storage": "10.2.4.0/24" + } + }, + "variables": { + "location": "", + "resource_group": "", + "image": "OpenLogic:CentOS-HPC:7.6:latest", + "lustreimage": "OpenLogic:CentOS:7.6:latest", + "hpcimage": "OpenLogic:CentOS-HPC:7.6:latest", + "compute_instances": 2, + "lustre_instances": 2, + "low_priority": true, + "storage_account": "", + "storage_key": "sakey.{{variables.storage_account}}", + "storage_container": "", + "log_analytics_lfs_name": "", + "la_resourcegroup": "", + "la_name": "", + "log_analytics_workspace": "laworkspace.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "log_analytics_key": "lakey.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "lustre_version": "2.10", + "lustre_avset": "{{variables.resource_group}}avset", + "lustre_mount": "/lustre" + }, + "resources": { + "headnode": { + "type": "vm", + "vm_type": "Standard_HB60rs", + "accelerated_networking": false, + "public_ip": true, + "image": "variables.image", + "subnet": "compute", + "tags": [ + "disable-selinux", + "cndefault", + "lfsrepo", + "lfsclient", + "lfsazimport", + "localuser", + "pbsserver", + "loginnode", + "rebootlustre", + "nfsserver" + ] + }, + "compute": { + "type": "vm", + "vm_type": "Standard_HB120rs_v2", + "instances": "variables.compute_instances", + "availability_set": "variables.lustre_avset", + "accelerated_networking": false, + "image": 
"variables.hpcimage", + "subnet": "storage", + "tags": [ + "cndefault", + "lfsrepo", + "lfsclient", + "localuser", + "pbsclient", + "nfsclient", + "disable-selinux" + ] + }, + "lfsmaster": { + "type": "vm", + "vm_type": "Standard_HB120rs_v2", + "availability_set": "variables.lustre_avset", + "accelerated_networking": false, + "image": "variables.lustreimage", + "subnet": "storage", + "tags": [ + "cndefault", + "lustre", + "lfsmaster", + "lfsrepo", + "localuser", + "nfsclient", + "disable-selinux", + "lfsloganalytics" + ] + }, + "lustre": { + "type": "vm", + "vm_type": "Standard_HB120rs_v2", + "instances": "variables.lustre_instances", + "availability_set": "variables.lustre_avset", + "accelerated_networking": false, + "image": "variables.lustreimage", + "subnet": "storage", + "tags": [ + "cndefault", + "lfsrepo", + "localuser", + "nfsclient", + "lustre", + "ossnode", + "disable-selinux", + "lfsloganalytics" + ] + } + }, + "install": [ + { + "script": "disable-selinux.sh", + "tag": "disable-selinux", + "sudo": true + }, + { + "script": "cndefault.sh", + "tag": "cndefault", + "sudo": true + }, + { + "script": "nfsserver.sh", + "tag": "nfsserver", + "sudo": true + }, + { + "script": "nfsclient.sh", + "args": [ + "$( Note: The HC nodes are used for the cluster, although this node type may be easily changed by use of the vm_type variable for lustre inside config.json. 
+ +The configuration file requires the following variables to be set: + +| Variable | Description | +|-------------------------|----------------------------------------------| +| resource_group | The resource group for the project | +| storage_account | The storage account for HSM | +| storage_key | The storage key for HSM | +| storage_container | The container to use for HSM | +| log_analytics_lfs_name | The lustre filesystem name for Log Analytics | +| la_resourcegroup | The resource group for Log Analytics | +| la_name | The Log Analytics Workspace name | + +> Note: you can remove log anaytics and/or HSM from the config file if not required. + +> Note: Key Vault should be used for the keys to keep them out of the config files. diff --git a/examples/lustre_rdma_avs/scripts/installOFED.sh b/examples/lustre_rdma_avs/scripts/installOFED.sh new file mode 100755 index 000000000..c267519fc --- /dev/null +++ b/examples/lustre_rdma_avs/scripts/installOFED.sh @@ -0,0 +1,4 @@ +#!/bin/bash +yum -y groupinstall --skip-broken "Infiniband Support" 2>/dev/null +echo "done installing Infiniband" +exit 0 diff --git a/examples/lustre_rdma_avs/scripts/lfsclient.sh b/examples/lustre_rdma_avs/scripts/lfsclient.sh new file mode 100755 index 000000000..26603bebd --- /dev/null +++ b/examples/lustre_rdma_avs/scripts/lfsclient.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# arg: $1 = lfsserver +# arg: $2 = mount point (default: /lustre) +master=$1 +lfs_mount=${2:-/lustre} +mkdir ~/.ssh + +cp -r /share/home/hpcuser/.ssh ~/ + +#Include the correct rdma options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab +mount -a +chmod 777 $lfs_mount diff --git a/examples/lustre_rdma_avs/scripts/lfsmaster.sh b/examples/lustre_rdma_avs/scripts/lfsmaster.sh new file mode 100755 index 000000000..d2dcdb02e --- /dev/null +++ b/examples/lustre_rdma_avs/scripts/lfsmaster.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# arg: $1 = device (e.g. 
L=/dev/sdb Lv2=/dev/nvme0n1) +device=$1 + +# this will only install MDS on first node in a scaleset +echo "pssh_nodenum is $PSSH_NODENUM" + +cp -r /share/home/hpcuser/.ssh /root/ + +#Include the correct rdma options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab + mount -a + + # set up hsm + lctl set_param -P mdt.*-MDT0000.hsm_control=enabled + lctl set_param -P mdt.*-MDT0000.hsm.default_archive_id=1 + lctl set_param mdt.*-MDT0000.hsm.max_requests=128 + + # allow any user and group ids to write + lctl set_param mdt.*-MDT0000.identity_upcall=NONE + + diff --git a/examples/lustre_rdma_avs/scripts/lfsoss.sh b/examples/lustre_rdma_avs/scripts/lfsoss.sh new file mode 100755 index 000000000..8f39aac68 --- /dev/null +++ b/examples/lustre_rdma_avs/scripts/lfsoss.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# arg: $1 = lfsmaster +# arg: $2 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) +master=$1 +device=$2 + +cp -r /share/home/hpcuser/.ssh /root/ + +index=$(($PSSH_NODENUM + 1)) +myuser="hpcuser" + +capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") +masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) + + lnetctl net add --net o2ib --if ib0 #double check + mkfs.lustre \ + --fsname=LustreFS \ + --backfstype=ldiskfs \ + --reformat \ + --ost \ + --mgsnode="${masterib}@o2ib" \ + --index=$index \ + --mountfsoptions="errors=remount-ro" \ + $device + + +mkdir /mnt/oss +echo "$device /mnt/oss lustre noatime,nodiratime,nobarrier 0 2" >> /etc/fstab +mount -a diff --git a/examples/lustre_rdma_avs/scripts/lfsrepo.sh b/examples/lustre_rdma_avs/scripts/lfsrepo.sh new file mode 100755 index 000000000..db1eeb165 --- /dev/null +++ b/examples/lustre_rdma_avs/scripts/lfsrepo.sh @@ -0,0 +1,27 @@ +#!/bin/bash +lustre_version=${1-2.10} + +cat << EOF >/etc/yum.repos.d/LustrePack.repo +[lustreserver] +name=lustreserver +baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/server/ +enabled=1 +gpgcheck=0 + +[e2fs] +name=e2fs 
+baseurl=https://downloads.whamcloud.com/public/e2fsprogs/latest/el7/ +enabled=1 +gpgcheck=0 + +[lustreclient] +name=lustreclient +baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/client/ +enabled=1 +gpgcheck=0 +EOF + +#Include the correct rdma options +#cat >/etc/modprobe.d/lustre.conf</dev/null + diff --git a/examples/lustre_rdma_avs/scripts/lustreinstall2.sh b/examples/lustre_rdma_avs/scripts/lustreinstall2.sh new file mode 100755 index 000000000..60f3e759e --- /dev/null +++ b/examples/lustre_rdma_avs/scripts/lustreinstall2.sh @@ -0,0 +1,10 @@ +#!/bin/bash +yum -y --nogpgcheck --enablerepo=lustreserver install kmod-lustre kmod-lustre-osd-ldiskfs lustre-osd-ldiskfs-mount lustre lustre-resource-agents +modprobe -v lustre + +sed -i 's/ResourceDisk\.Format=y/ResourceDisk.Format=n/g' /etc/waagent.conf +sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf + +weak-modules --add-kernel --no-initramfs +systemctl enable lustre +umount /mnt/resource diff --git a/examples/lustre_rdma_avs/scripts/lustrenetwork.sh b/examples/lustre_rdma_avs/scripts/lustrenetwork.sh new file mode 100755 index 000000000..f95d33864 --- /dev/null +++ b/examples/lustre_rdma_avs/scripts/lustrenetwork.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf +service waagent restart +service rdma start +modprobe lnet +lctl network configure +lnetctl net add --net o2ib --if ib0 #need this to come up every time +sleep 5 diff --git a/examples/lustre_rdma_avs/scripts/oldreboot b/examples/lustre_rdma_avs/scripts/oldreboot new file mode 100755 index 000000000..ddc8d442e --- /dev/null +++ b/examples/lustre_rdma_avs/scripts/oldreboot @@ -0,0 +1,20 @@ +#!/bin/bash +vmlist=$1 +typenum=$2 + +totalcount=$(($typenum+2)) +index=0 + +#prep headnode +cp -r /share/home/hpcuser/.ssh /root/ + +echo "the first vm is ${vmlist[0]}" + +#needs to be done sequentially +for vmname in ${vmlist[@]}; do + if [ $index -lt $totalcount 
] ; then + echo "Rebooting $vmname" + ssh hpcuser@${vmname} "sudo reboot 2>/dev/null; exit 2>/dev/null" 2>/dev/null || : + fi +done +exit 0 diff --git a/examples/lustre_rdma_avs/scripts/rebootlustre.sh b/examples/lustre_rdma_avs/scripts/rebootlustre.sh new file mode 100755 index 000000000..2d33c180b --- /dev/null +++ b/examples/lustre_rdma_avs/scripts/rebootlustre.sh @@ -0,0 +1,16 @@ +#!/bin/bash +vmlist=$1 +osscount=$2 +totalcount=$((osscount+2)) +index=0 +#prep headnode +cp -r /share/home/hpcuser/.ssh /root/ +echo "vmlist is ${vmlist[@]}" + +#needs to be done sequentially +for vmname in ${vmlist[@]}; do + echo "Rebooting $vmname" + ssh hpcuser@${vmname} "sudo reboot 2>/dev/null; exit 2>/dev/null" 2>/dev/null + index=$((index+1)) +done +exit 0 diff --git a/examples/lustre_rdma_avs/scripts/removeMOFED.sh b/examples/lustre_rdma_avs/scripts/removeMOFED.sh new file mode 100755 index 000000000..d0e69d6d0 --- /dev/null +++ b/examples/lustre_rdma_avs/scripts/removeMOFED.sh @@ -0,0 +1,6 @@ +#!/bin/bash +#rpm -e neohost-backend neohost-sdk-1.5.0-102.x86_64 +yum -y remove neohost-backend neohost-sdk-1.5.0-102.x86_64 2>/dev/null +/usr/sbin/ofed_uninstall.sh +sleep 5 +exit 0 diff --git a/examples/lustre_rdma_avs/scripts/waitforreboot.sh b/examples/lustre_rdma_avs/scripts/waitforreboot.sh new file mode 100755 index 000000000..73411ca61 --- /dev/null +++ b/examples/lustre_rdma_avs/scripts/waitforreboot.sh @@ -0,0 +1,2 @@ +#!/bin/bash +sleep 180 #enough time for node reboot to continue process diff --git a/examples/lustre_rdma_avs/writeup b/examples/lustre_rdma_avs/writeup new file mode 100644 index 000000000..3eb58e3c6 --- /dev/null +++ b/examples/lustre_rdma_avs/writeup @@ -0,0 +1,20 @@ +- lustre-ipoib - This is a created implementation of Lustre using ip over infiniband (IPoIB) +- lustre-rdma - This is a created implementation of Lustre using native Remote Direct Memory Access (RDMA) + +Changes to files to enable Infiniband functionality: +lfsmaster.sh +lfsoss.sh 
+lfsclient.sh +lfsrepo.sh +lfspkgs.sh + +Addition for the installation of new Mellanox OFED (MOFED) for the Lustre kernel : installMOFED.sh + +Addition for correct drives placement of OSSes : installdrives.sh +*installdrives.sh takes about 15 minutes to run so please either remove this entity, or wait it out. + +Additions for correct Lustre kernel : +lustreinstall1.sh +lustreinstall2.sh + +Addition for pause after MDS/OSS reboot : waitforreboot.sh diff --git a/examples/lustre_rdma_avs/writeuplustreipoib b/examples/lustre_rdma_avs/writeuplustreipoib new file mode 100644 index 000000000..e0f6ad7fc --- /dev/null +++ b/examples/lustre_rdma_avs/writeuplustreipoib @@ -0,0 +1,11 @@ +- lustre-ipoib - This is a created implementation of Lustre using IP over infiniband (IPoIB) + +Changes to files to enable Infiniband functionality: +lfsmaster.sh +lfsoss.sh +lfsclient.sh +lfsrepo.sh + +Addition for correct drives placement of OSSes : installdrives.sh +*installdrives.sh takes about 15 minutes to run so please either remove this entity, or wait it out. 
+ diff --git a/examples/lustre_rdma_nvmedrives/config.json b/examples/lustre_rdma_nvmedrives/config.json index c4a5b77dc..7ebb5bf43 100644 --- a/examples/lustre_rdma_nvmedrives/config.json +++ b/examples/lustre_rdma_nvmedrives/config.json @@ -1,5 +1,5 @@ { - "location": "southcentralus", + "location": "variables.location", "resource_group": "variables.resource_group", "install_from": "headnode", "admin_user": "hpcadmin", @@ -12,6 +12,7 @@ } }, "variables": { + "location": "", "resource_group": "", "image": "OpenLogic:CentOS:7.6:latest", "lustreimage": "OpenLogic:CentOS:7.6:latest", From 01fc4b60ff89471c74b4437bf4f34e7b799099d9 Mon Sep 17 00:00:00 2001 From: chadnar2 <52789065+chadnar2@users.noreply.github.com> Date: Tue, 30 Jun 2020 10:50:37 -0500 Subject: [PATCH 10/36] Update readme.md --- examples/lustre_rdma_avs/readme.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/examples/lustre_rdma_avs/readme.md b/examples/lustre_rdma_avs/readme.md index b71ca377c..3b55df48c 100644 --- a/examples/lustre_rdma_avs/readme.md +++ b/examples/lustre_rdma_avs/readme.md @@ -1,10 +1,8 @@ -# Lustre Infiniband +# lustre_rdma_avs Visualisation: [config.json](https://azurehpc.azureedge.net/?o=https://raw.githubusercontent.com/Azure/azurehpc/master/examples/lustre_Infiniband/config.json) -This is a deployment of Lustre using the available infiniband network. This solution has been designed to work with either IP over infiniband or true Remote Direct Memory Access(RDMA) . The Object Storage Servers are designed to run a raid0 group using 1TB drives. This value can easily be changed inside installdrives.sh. - -Please note that installdrives.sh does take some time to complete due to it having to work with only part of a virtual machine scale set (VMSS). +This is a deployment of Lustre using the available infiniband network. This solution has been designed to work with true Remote Direct Memory Access(RDMA) . 
This deployment will only function using the Python based AzureHPC (not the BASH libexec). @@ -13,8 +11,8 @@ Resources: * Head node (headnode) * Compute nodes (compute) * Lustre - * Management/Meta-data server (lfsmds) - * Object storage servers (lfsoss) + * Management/Meta-data server (lfsmaster) + * Object storage servers (lustre) * Hierarchical storage management nodes (lfshsm) * Lustre client exporting with samba (lfssmb) @@ -24,6 +22,7 @@ The configuration file requires the following variables to be set: | Variable | Description | |-------------------------|----------------------------------------------| +| location | The location (Azure region) for the project | | resource_group | The resource group for the project | | storage_account | The storage account for HSM | | storage_key | The storage key for HSM | From db68123978b4f76bda6938f1b532af2248e139af Mon Sep 17 00:00:00 2001 From: chadnar2 <52789065+chadnar2@users.noreply.github.com> Date: Tue, 30 Jun 2020 11:07:06 -0500 Subject: [PATCH 11/36] Update readme.md --- examples/lustre_rdma_nvmedrives/readme.md | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/lustre_rdma_nvmedrives/readme.md b/examples/lustre_rdma_nvmedrives/readme.md index b71ca377c..366c0b265 100644 --- a/examples/lustre_rdma_nvmedrives/readme.md +++ b/examples/lustre_rdma_nvmedrives/readme.md @@ -24,6 +24,7 @@ The configuration file requires the following variables to be set: | Variable | Description | |-------------------------|----------------------------------------------| +| location | The location (Azure region) for the project | | resource_group | The resource group for the project | | storage_account | The storage account for HSM | | storage_key | The storage key for HSM | From b4e78661f532a1f79ed538ff8da8e4438bd4dab3 Mon Sep 17 00:00:00 2001 From: chadnar2 <52789065+chadnar2@users.noreply.github.com> Date: Tue, 30 Jun 2020 12:31:45 -0500 Subject: [PATCH 12/36] Delete removeMOFED.sh --- examples/lustre_rdma_avs/scripts/removeMOFED.sh | 6 
------ 1 file changed, 6 deletions(-) delete mode 100755 examples/lustre_rdma_avs/scripts/removeMOFED.sh diff --git a/examples/lustre_rdma_avs/scripts/removeMOFED.sh b/examples/lustre_rdma_avs/scripts/removeMOFED.sh deleted file mode 100755 index d0e69d6d0..000000000 --- a/examples/lustre_rdma_avs/scripts/removeMOFED.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -#rpm -e neohost-backend neohost-sdk-1.5.0-102.x86_64 -yum -y remove neohost-backend neohost-sdk-1.5.0-102.x86_64 2>/dev/null -/usr/sbin/ofed_uninstall.sh -sleep 5 -exit 0 From bcbcfa6d9c34428a4b9862bc2d7cbc31cfa555a3 Mon Sep 17 00:00:00 2001 From: chadnar2 <52789065+chadnar2@users.noreply.github.com> Date: Tue, 30 Jun 2020 12:31:56 -0500 Subject: [PATCH 13/36] Delete oldreboot --- examples/lustre_rdma_avs/scripts/oldreboot | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100755 examples/lustre_rdma_avs/scripts/oldreboot diff --git a/examples/lustre_rdma_avs/scripts/oldreboot b/examples/lustre_rdma_avs/scripts/oldreboot deleted file mode 100755 index ddc8d442e..000000000 --- a/examples/lustre_rdma_avs/scripts/oldreboot +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -vmlist=$1 -typenum=$2 - -totalcount=$(($typenum+2)) -index=0 - -#prep headnode -cp -r /share/home/hpcuser/.ssh /root/ - -echo "the first vm is ${vmlist[0]}" - -#needs to be done sequentially -for vmname in ${vmlist[@]}; do - if [ $index -lt $totalcount ] ; then - echo "Rebooting $vmname" - ssh hpcuser@${vmname} "sudo reboot 2>/dev/null; exit 2>/dev/null" 2>/dev/null || : - fi -done -exit 0 From b5fc4aa864f6629c4f5223b895999178bec67ba0 Mon Sep 17 00:00:00 2001 From: chadnar2 <52789065+chadnar2@users.noreply.github.com> Date: Wed, 1 Jul 2020 09:46:57 -0500 Subject: [PATCH 14/36] Delete deploy_config.json --- examples/lustre_rdma_avs/deploy_config.json | 583 -------------------- 1 file changed, 583 deletions(-) delete mode 100644 examples/lustre_rdma_avs/deploy_config.json diff --git a/examples/lustre_rdma_avs/deploy_config.json 
b/examples/lustre_rdma_avs/deploy_config.json deleted file mode 100644 index 5c935644c..000000000 --- a/examples/lustre_rdma_avs/deploy_config.json +++ /dev/null @@ -1,583 +0,0 @@ -{ - "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", - "contentVersion": "1.0.0.0", - "parameters": {}, - "variables": {}, - "resources": [ - { - "apiVersion": "2018-10-01", - "type": "Microsoft.Network/virtualNetworks", - "name": "hpcvnet", - "location": "southcentralus", - "properties": { - "addressSpace": { - "addressPrefixes": [ - "10.2.0.0/20" - ] - }, - "subnets": [ - { - "name": "compute", - "properties": { - "addressPrefix": "10.2.0.0/22" - } - }, - { - "name": "storage", - "properties": { - "addressPrefix": "10.2.4.0/24" - } - } - ] - } - }, - { - "type": "Microsoft.Network/publicIPAddresses", - "apiVersion": "2018-01-01", - "name": "headnode_pip", - "location": "southcentralus", - "dependsOn": [], - "tags": {}, - "properties": { - "dnsSettings": { - "domainNameLabel": "headnode27d5d3" - } - } - }, - { - "type": "Microsoft.Network/networkSecurityGroups", - "apiVersion": "2015-06-15", - "name": "headnode_nsg", - "location": "southcentralus", - "dependsOn": [], - "tags": {}, - "properties": { - "securityRules": [ - { - "name": "default-allow-ssh", - "properties": { - "protocol": "Tcp", - "sourcePortRange": "*", - "destinationPortRange": "22", - "sourceAddressPrefix": "*", - "destinationAddressPrefix": "*", - "access": "Allow", - "priority": 1000, - "direction": "Inbound" - } - } - ] - } - }, - { - "type": "Microsoft.Network/networkInterfaces", - "apiVersion": "2016-09-01", - "name": "headnode_nic", - "location": "southcentralus", - "dependsOn": [ - "Microsoft.Network/virtualNetworks/hpcvnet", - "Microsoft.Network/publicIpAddresses/headnode_pip", - "Microsoft.Network/networkSecurityGroups/headnode_nsg" - ], - "tags": {}, - "properties": { - "ipConfigurations": [ - { - "name": "headnode_ipconfig", - "properties": { - 
"privateIPAllocationMethod": "Dynamic", - "subnet": { - "id": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'hpcvnet', 'compute')]" - }, - "publicIPAddress": { - "id": "[resourceId('Microsoft.Network/publicIPAddresses', 'headnode_pip')]" - } - } - } - ], - "enableAcceleratedNetworking": false, - "networkSecurityGroup": { - "id": "[resourceId('Microsoft.Network/networkSecurityGroups', 'headnode_nsg')]" - } - } - }, - { - "type": "Microsoft.Compute/virtualMachines", - "apiVersion": "2019-07-01", - "name": "headnode", - "location": "southcentralus", - "dependsOn": [ - "Microsoft.Network/networkInterfaces/headnode_nic" - ], - "tags": {}, - "properties": { - "hardwareProfile": { - "vmSize": "Standard_HB60rs" - }, - "networkProfile": { - "networkInterfaces": [ - { - "id": "[resourceId('Microsoft.Network/networkInterfaces', 'headnode_nic')]" - } - ] - }, - "storageProfile": { - "osDisk": { - "name": "headnode_osdisk", - "createOption": "fromImage", - "caching": "ReadWrite", - "managedDisk": { - "storageAccountType": "Premium_LRS" - } - }, - "imageReference": { - "publisher": "OpenLogic", - "offer": "CentOS-HPC", - "sku": "7.6", - "version": "latest" - }, - "dataDisks": [] - }, - "osProfile": { - "computerName": "headnode", - "adminUsername": "hpcadmin", - "linuxConfiguration": { - "disablePasswordAuthentication": true, - "ssh": { - "publicKeys": [ - { - "keyData": "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDUkKNVQZwoJg3dEIRHOeOvt7UKGx/4MbfS6W5JeTcj/qd6y2jmAsZ8qJON4da9fm2oUhMADzivXjQCCwhxQcSlpbCTTpAhVROPqQMdu5s/SLVXYRjaxrpybqE9CawUiI4rnvwEu6/9l1r9qtUnV2rWr7W3S23zhNMKuxE6zM3Dy30pzdkafZJjsXnZnye22Y3s4Sx9S2Dorp//kjBslvtTxv/SMXrJYOnOuao3fcEl+tnVVayOzCFuXElnSO4BT+gAHTG07Ooza/BRnL0WZx9E+j9TWAzgNQbWHfuhZaShAuGBRG6ncvO2qryaxdeyeTdmciQxZY0njb183a6LvnVB", - "path": "/home/hpcadmin/.ssh/authorized_keys" - } - ] - } - } - } - } - }, - { - "name": "nc-testlustre2avset", - "type": "Microsoft.Compute/availabilitySets", - "apiVersion": "2018-10-01", - "location": "southcentralus", 
- "sku": { - "name": "Aligned" - }, - "properties": { - "platformUpdateDomainCount": 1, - "platformFaultDomainCount": 1 - } - }, - { - "type": "Microsoft.Network/networkInterfaces", - "apiVersion": "2016-09-01", - "name": "compute0001_nic", - "location": "southcentralus", - "dependsOn": [ - "Microsoft.Network/virtualNetworks/hpcvnet" - ], - "tags": {}, - "properties": { - "ipConfigurations": [ - { - "name": "compute0001_ipconfig", - "properties": { - "privateIPAllocationMethod": "Dynamic", - "subnet": { - "id": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'hpcvnet', 'storage')]" - } - } - } - ], - "enableAcceleratedNetworking": false - } - }, - { - "type": "Microsoft.Compute/virtualMachines", - "apiVersion": "2019-07-01", - "name": "compute0001", - "location": "southcentralus", - "dependsOn": [ - "Microsoft.Network/networkInterfaces/compute0001_nic", - "Microsoft.Compute/availabilitySets/nc-testlustre2avset" - ], - "tags": {}, - "properties": { - "hardwareProfile": { - "vmSize": "Standard_HB120rs_v2" - }, - "networkProfile": { - "networkInterfaces": [ - { - "id": "[resourceId('Microsoft.Network/networkInterfaces', 'compute0001_nic')]" - } - ] - }, - "storageProfile": { - "osDisk": { - "name": "compute0001_osdisk", - "createOption": "fromImage", - "caching": "ReadWrite", - "managedDisk": { - "storageAccountType": "Premium_LRS" - } - }, - "imageReference": { - "publisher": "OpenLogic", - "offer": "CentOS-HPC", - "sku": "7.6", - "version": "latest" - }, - "dataDisks": [] - }, - "osProfile": { - "computerName": "compute0001", - "adminUsername": "hpcadmin", - "linuxConfiguration": { - "disablePasswordAuthentication": true, - "ssh": { - "publicKeys": [ - { - "keyData": "ssh-rsa 
AAAAB3NzaC1yc2EAAAADAQABAAABAQDUkKNVQZwoJg3dEIRHOeOvt7UKGx/4MbfS6W5JeTcj/qd6y2jmAsZ8qJON4da9fm2oUhMADzivXjQCCwhxQcSlpbCTTpAhVROPqQMdu5s/SLVXYRjaxrpybqE9CawUiI4rnvwEu6/9l1r9qtUnV2rWr7W3S23zhNMKuxE6zM3Dy30pzdkafZJjsXnZnye22Y3s4Sx9S2Dorp//kjBslvtTxv/SMXrJYOnOuao3fcEl+tnVVayOzCFuXElnSO4BT+gAHTG07Ooza/BRnL0WZx9E+j9TWAzgNQbWHfuhZaShAuGBRG6ncvO2qryaxdeyeTdmciQxZY0njb183a6LvnVB", - "path": "/home/hpcadmin/.ssh/authorized_keys" - } - ] - } - } - }, - "availabilitySet": { - "id": "[resourceId('Microsoft.Compute/availabilitySets','nc-testlustre2avset')]" - } - } - }, - { - "type": "Microsoft.Network/networkInterfaces", - "apiVersion": "2016-09-01", - "name": "compute0002_nic", - "location": "southcentralus", - "dependsOn": [ - "Microsoft.Network/virtualNetworks/hpcvnet" - ], - "tags": {}, - "properties": { - "ipConfigurations": [ - { - "name": "compute0002_ipconfig", - "properties": { - "privateIPAllocationMethod": "Dynamic", - "subnet": { - "id": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'hpcvnet', 'storage')]" - } - } - } - ], - "enableAcceleratedNetworking": false - } - }, - { - "type": "Microsoft.Compute/virtualMachines", - "apiVersion": "2019-07-01", - "name": "compute0002", - "location": "southcentralus", - "dependsOn": [ - "Microsoft.Network/networkInterfaces/compute0002_nic", - "Microsoft.Compute/availabilitySets/nc-testlustre2avset" - ], - "tags": {}, - "properties": { - "hardwareProfile": { - "vmSize": "Standard_HB120rs_v2" - }, - "networkProfile": { - "networkInterfaces": [ - { - "id": "[resourceId('Microsoft.Network/networkInterfaces', 'compute0002_nic')]" - } - ] - }, - "storageProfile": { - "osDisk": { - "name": "compute0002_osdisk", - "createOption": "fromImage", - "caching": "ReadWrite", - "managedDisk": { - "storageAccountType": "Premium_LRS" - } - }, - "imageReference": { - "publisher": "OpenLogic", - "offer": "CentOS-HPC", - "sku": "7.6", - "version": "latest" - }, - "dataDisks": [] - }, - "osProfile": { - "computerName": "compute0002", - 
"adminUsername": "hpcadmin", - "linuxConfiguration": { - "disablePasswordAuthentication": true, - "ssh": { - "publicKeys": [ - { - "keyData": "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDUkKNVQZwoJg3dEIRHOeOvt7UKGx/4MbfS6W5JeTcj/qd6y2jmAsZ8qJON4da9fm2oUhMADzivXjQCCwhxQcSlpbCTTpAhVROPqQMdu5s/SLVXYRjaxrpybqE9CawUiI4rnvwEu6/9l1r9qtUnV2rWr7W3S23zhNMKuxE6zM3Dy30pzdkafZJjsXnZnye22Y3s4Sx9S2Dorp//kjBslvtTxv/SMXrJYOnOuao3fcEl+tnVVayOzCFuXElnSO4BT+gAHTG07Ooza/BRnL0WZx9E+j9TWAzgNQbWHfuhZaShAuGBRG6ncvO2qryaxdeyeTdmciQxZY0njb183a6LvnVB", - "path": "/home/hpcadmin/.ssh/authorized_keys" - } - ] - } - } - }, - "availabilitySet": { - "id": "[resourceId('Microsoft.Compute/availabilitySets','nc-testlustre2avset')]" - } - } - }, - { - "type": "Microsoft.Network/networkInterfaces", - "apiVersion": "2016-09-01", - "name": "lfsmaster_nic", - "location": "southcentralus", - "dependsOn": [ - "Microsoft.Network/virtualNetworks/hpcvnet" - ], - "tags": {}, - "properties": { - "ipConfigurations": [ - { - "name": "lfsmaster_ipconfig", - "properties": { - "privateIPAllocationMethod": "Dynamic", - "subnet": { - "id": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'hpcvnet', 'storage')]" - } - } - } - ], - "enableAcceleratedNetworking": false - } - }, - { - "type": "Microsoft.Compute/virtualMachines", - "apiVersion": "2019-07-01", - "name": "lfsmaster", - "location": "southcentralus", - "dependsOn": [ - "Microsoft.Network/networkInterfaces/lfsmaster_nic", - "Microsoft.Compute/availabilitySets/nc-testlustre2avset" - ], - "tags": {}, - "properties": { - "hardwareProfile": { - "vmSize": "Standard_HB120rs_v2" - }, - "networkProfile": { - "networkInterfaces": [ - { - "id": "[resourceId('Microsoft.Network/networkInterfaces', 'lfsmaster_nic')]" - } - ] - }, - "storageProfile": { - "osDisk": { - "name": "lfsmaster_osdisk", - "createOption": "fromImage", - "caching": "ReadWrite", - "managedDisk": { - "storageAccountType": "Premium_LRS" - } - }, - "imageReference": { - "publisher": "OpenLogic", - 
"offer": "CentOS", - "sku": "7.6", - "version": "latest" - }, - "dataDisks": [] - }, - "osProfile": { - "computerName": "lfsmaster", - "adminUsername": "hpcadmin", - "linuxConfiguration": { - "disablePasswordAuthentication": true, - "ssh": { - "publicKeys": [ - { - "keyData": "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDUkKNVQZwoJg3dEIRHOeOvt7UKGx/4MbfS6W5JeTcj/qd6y2jmAsZ8qJON4da9fm2oUhMADzivXjQCCwhxQcSlpbCTTpAhVROPqQMdu5s/SLVXYRjaxrpybqE9CawUiI4rnvwEu6/9l1r9qtUnV2rWr7W3S23zhNMKuxE6zM3Dy30pzdkafZJjsXnZnye22Y3s4Sx9S2Dorp//kjBslvtTxv/SMXrJYOnOuao3fcEl+tnVVayOzCFuXElnSO4BT+gAHTG07Ooza/BRnL0WZx9E+j9TWAzgNQbWHfuhZaShAuGBRG6ncvO2qryaxdeyeTdmciQxZY0njb183a6LvnVB", - "path": "/home/hpcadmin/.ssh/authorized_keys" - } - ] - } - } - }, - "availabilitySet": { - "id": "[resourceId('Microsoft.Compute/availabilitySets','nc-testlustre2avset')]" - } - } - }, - { - "type": "Microsoft.Network/networkInterfaces", - "apiVersion": "2016-09-01", - "name": "lustre0001_nic", - "location": "southcentralus", - "dependsOn": [ - "Microsoft.Network/virtualNetworks/hpcvnet" - ], - "tags": {}, - "properties": { - "ipConfigurations": [ - { - "name": "lustre0001_ipconfig", - "properties": { - "privateIPAllocationMethod": "Dynamic", - "subnet": { - "id": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'hpcvnet', 'storage')]" - } - } - } - ], - "enableAcceleratedNetworking": false - } - }, - { - "type": "Microsoft.Compute/virtualMachines", - "apiVersion": "2019-07-01", - "name": "lustre0001", - "location": "southcentralus", - "dependsOn": [ - "Microsoft.Network/networkInterfaces/lustre0001_nic", - "Microsoft.Compute/availabilitySets/nc-testlustre2avset" - ], - "tags": {}, - "properties": { - "hardwareProfile": { - "vmSize": "Standard_HB120rs_v2" - }, - "networkProfile": { - "networkInterfaces": [ - { - "id": "[resourceId('Microsoft.Network/networkInterfaces', 'lustre0001_nic')]" - } - ] - }, - "storageProfile": { - "osDisk": { - "name": "lustre0001_osdisk", - "createOption": "fromImage", - 
"caching": "ReadWrite", - "managedDisk": { - "storageAccountType": "Premium_LRS" - } - }, - "imageReference": { - "publisher": "OpenLogic", - "offer": "CentOS", - "sku": "7.6", - "version": "latest" - }, - "dataDisks": [] - }, - "osProfile": { - "computerName": "lustre0001", - "adminUsername": "hpcadmin", - "linuxConfiguration": { - "disablePasswordAuthentication": true, - "ssh": { - "publicKeys": [ - { - "keyData": "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDUkKNVQZwoJg3dEIRHOeOvt7UKGx/4MbfS6W5JeTcj/qd6y2jmAsZ8qJON4da9fm2oUhMADzivXjQCCwhxQcSlpbCTTpAhVROPqQMdu5s/SLVXYRjaxrpybqE9CawUiI4rnvwEu6/9l1r9qtUnV2rWr7W3S23zhNMKuxE6zM3Dy30pzdkafZJjsXnZnye22Y3s4Sx9S2Dorp//kjBslvtTxv/SMXrJYOnOuao3fcEl+tnVVayOzCFuXElnSO4BT+gAHTG07Ooza/BRnL0WZx9E+j9TWAzgNQbWHfuhZaShAuGBRG6ncvO2qryaxdeyeTdmciQxZY0njb183a6LvnVB", - "path": "/home/hpcadmin/.ssh/authorized_keys" - } - ] - } - } - }, - "availabilitySet": { - "id": "[resourceId('Microsoft.Compute/availabilitySets','nc-testlustre2avset')]" - } - } - }, - { - "type": "Microsoft.Network/networkInterfaces", - "apiVersion": "2016-09-01", - "name": "lustre0002_nic", - "location": "southcentralus", - "dependsOn": [ - "Microsoft.Network/virtualNetworks/hpcvnet" - ], - "tags": {}, - "properties": { - "ipConfigurations": [ - { - "name": "lustre0002_ipconfig", - "properties": { - "privateIPAllocationMethod": "Dynamic", - "subnet": { - "id": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'hpcvnet', 'storage')]" - } - } - } - ], - "enableAcceleratedNetworking": false - } - }, - { - "type": "Microsoft.Compute/virtualMachines", - "apiVersion": "2019-07-01", - "name": "lustre0002", - "location": "southcentralus", - "dependsOn": [ - "Microsoft.Network/networkInterfaces/lustre0002_nic", - "Microsoft.Compute/availabilitySets/nc-testlustre2avset" - ], - "tags": {}, - "properties": { - "hardwareProfile": { - "vmSize": "Standard_HB120rs_v2" - }, - "networkProfile": { - "networkInterfaces": [ - { - "id": 
"[resourceId('Microsoft.Network/networkInterfaces', 'lustre0002_nic')]" - } - ] - }, - "storageProfile": { - "osDisk": { - "name": "lustre0002_osdisk", - "createOption": "fromImage", - "caching": "ReadWrite", - "managedDisk": { - "storageAccountType": "Premium_LRS" - } - }, - "imageReference": { - "publisher": "OpenLogic", - "offer": "CentOS", - "sku": "7.6", - "version": "latest" - }, - "dataDisks": [] - }, - "osProfile": { - "computerName": "lustre0002", - "adminUsername": "hpcadmin", - "linuxConfiguration": { - "disablePasswordAuthentication": true, - "ssh": { - "publicKeys": [ - { - "keyData": "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDUkKNVQZwoJg3dEIRHOeOvt7UKGx/4MbfS6W5JeTcj/qd6y2jmAsZ8qJON4da9fm2oUhMADzivXjQCCwhxQcSlpbCTTpAhVROPqQMdu5s/SLVXYRjaxrpybqE9CawUiI4rnvwEu6/9l1r9qtUnV2rWr7W3S23zhNMKuxE6zM3Dy30pzdkafZJjsXnZnye22Y3s4Sx9S2Dorp//kjBslvtTxv/SMXrJYOnOuao3fcEl+tnVVayOzCFuXElnSO4BT+gAHTG07Ooza/BRnL0WZx9E+j9TWAzgNQbWHfuhZaShAuGBRG6ncvO2qryaxdeyeTdmciQxZY0njb183a6LvnVB", - "path": "/home/hpcadmin/.ssh/authorized_keys" - } - ] - } - } - }, - "availabilitySet": { - "id": "[resourceId('Microsoft.Compute/availabilitySets','nc-testlustre2avset')]" - } - } - } - ], - "outputs": {} -} \ No newline at end of file From 5a1079d40c12f605879c7ae16beedd6a0f57a5f5 Mon Sep 17 00:00:00 2001 From: chadnar2 <52789065+chadnar2@users.noreply.github.com> Date: Wed, 1 Jul 2020 09:47:06 -0500 Subject: [PATCH 15/36] Delete hpcadmin_id_rsa --- examples/lustre_rdma_avs/hpcadmin_id_rsa | 27 ------------------------ 1 file changed, 27 deletions(-) delete mode 100644 examples/lustre_rdma_avs/hpcadmin_id_rsa diff --git a/examples/lustre_rdma_avs/hpcadmin_id_rsa b/examples/lustre_rdma_avs/hpcadmin_id_rsa deleted file mode 100644 index 7846d2b39..000000000 --- a/examples/lustre_rdma_avs/hpcadmin_id_rsa +++ /dev/null @@ -1,27 +0,0 @@ ------BEGIN RSA PRIVATE KEY----- -MIIEpAIBAAKCAQEA1JCjVUGcKCYN3RCERznjr7e1Chsf+DG30uluSXk3I/6nesto 
-5gLGfKiTjeHWvX5tqFITAA84r140AgsIcUHEpaWwk06QIVUTj6kDHbubP0i1V2EY -2sa6cm6hPQmsFIiOK578BLuv/Zda/arVJ1dq1q+1t0tt84TTCrsROszNw8t9Kc3Z -Gn2SY7F52Z8nttmN7OEsfUtg6K6f/5IwbJb7U8b/0jF6yWDpzrmqN33BJfrZ1VWs -jswhblxJZ0juAU/oAB0xtOzqM2vwUZy9FmcfRPo/U1gM4DUG1h37oWWkoQLhgURu -p3Lztqq8msXXsnk3ZnIkMWWNJ429fN2ui751QQIDAQABAoIBAGYQfRy2wDBW9Vks -UReSKE17PCZ6F8Oou8c95oLI/Tz/TZOcj+XBd2Tr3M3HnsCmMCkeH5lrtaAe74H7 -ojYfijivcjWJB5O5sgbM9H4WUtj0JH6sVK7XtTa1AB66wjGpz/oKAKCVLk/pmPss -R+T4CIjFHc/BHC5NnLgOUpuVM0fLUUUF8NmIvT6K0P4j7GZx12d1TDkqo+/rd1ku -EOuCjl8Q4bTO0qtJEXy2dmn38m6QGNS765j8gQ21wWY+Q7EX4JaJ+oO2ZgGuyYul -Cu+AFlCR4SkOok0DN6RG4KQ7Sly57HrZWwLI46FXmjiJqE/7wNvMwuHdUmnVbkoY -v04fxAECgYEA8ii6KMsPIxnMSCBpsRoFSOcPdSyoFyhMCCuiR9liCGRG4wz4u1i6 -ZFal1+d/rX6qxCTIZxvU8zn54Qsrr+44zV++4+Sd/nhrc+qWOxGggAscbYNG3w2g -GTGinERFPRs5iGmdJ0n+uy/TSPe5t0qH85AdKcU47mfrNb3Q08rEfxECgYEA4Lbj -zkCUa4UN6CP36FtOUNtpnrn7dxfDNpcS8CTt/oob2OifOUGhgPbCio5at7zE8cH0 -hWrUWFPDfBRliGdG/ZzdmIOaC0MU9eQG4JxkblgYccKpcYsTq45NDyhQJ0lbBjRG -Sp42HOnvZ8p0m9przrnQF22Bvr5E+VF1wVk18zECgYEA7pI9RS84jIZAAdcdCYPv -LPGnAvOp7paewXXrfQmnUUkppUsESd6SU4QiA2FpIk4mgvMSFLMQy0eU7KeKtNrn -Tz5C3FZBaZDNm/fDZhJpo3xO13179wh/cBK8d2OzKw6FUeVrFGgL8/KcH8kfSHq/ -EbAraxmIiygKTHnjIKUljWECgYAQxhYjIzbw/7GWDnlG4unppzcvHfrjXOa5gHVt -b5REV9LUUijwgTGpCsJizVWAOZsJ4Mx72QmYvkftTyh1EiB+deMkq04oYQ2DfU32 -HjZw9ip882bqjtMdDzY5V20EQbmFsQk+MKkhZ2Tzfm1N5PP/LmeWGBqDPnivk6ES -mbIpQQKBgQDqnc9KivmjPIHz2BpJh8icWkdvZ2WzycI3Sly6Suh0E6Q+epMTXUm3 -21TIEkkAlBYXkHs0ZhL7l7jzv5yYSGB8ZNDzk+UquE5OuxMwWsd3trqyJ3LMj9C5 -hV6JTHqNSw8xubCES0oRgJkcCedoQ0qxMwypnJarWPh/LSVCu3BZ2A== ------END RSA PRIVATE KEY----- From c9830a91c2fc0c1931c2ad9e8477bbff94cc24b1 Mon Sep 17 00:00:00 2001 From: chadnar2 <52789065+chadnar2@users.noreply.github.com> Date: Wed, 1 Jul 2020 09:47:15 -0500 Subject: [PATCH 16/36] Delete hpcadmin_id_rsa.pub --- examples/lustre_rdma_avs/hpcadmin_id_rsa.pub | 1 - 1 file changed, 1 deletion(-) delete mode 100644 examples/lustre_rdma_avs/hpcadmin_id_rsa.pub diff --git 
a/examples/lustre_rdma_avs/hpcadmin_id_rsa.pub b/examples/lustre_rdma_avs/hpcadmin_id_rsa.pub deleted file mode 100644 index 20776c3a0..000000000 --- a/examples/lustre_rdma_avs/hpcadmin_id_rsa.pub +++ /dev/null @@ -1 +0,0 @@ -ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDUkKNVQZwoJg3dEIRHOeOvt7UKGx/4MbfS6W5JeTcj/qd6y2jmAsZ8qJON4da9fm2oUhMADzivXjQCCwhxQcSlpbCTTpAhVROPqQMdu5s/SLVXYRjaxrpybqE9CawUiI4rnvwEu6/9l1r9qtUnV2rWr7W3S23zhNMKuxE6zM3Dy30pzdkafZJjsXnZnye22Y3s4Sx9S2Dorp//kjBslvtTxv/SMXrJYOnOuao3fcEl+tnVVayOzCFuXElnSO4BT+gAHTG07Ooza/BRnL0WZx9E+j9TWAzgNQbWHfuhZaShAuGBRG6ncvO2qryaxdeyeTdmciQxZY0njb183a6LvnVB From a995af9f73156dd7a9f200518765db601441e84e Mon Sep 17 00:00:00 2001 From: Narjit Chadha Date: Wed, 1 Jul 2020 15:04:24 -0500 Subject: [PATCH 17/36] modifications of lustre_rdma_avs --- examples/lustre_rdma_avs/config.json | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/lustre_rdma_avs/config.json b/examples/lustre_rdma_avs/config.json index f366d979f..13428e9a4 100644 --- a/examples/lustre_rdma_avs/config.json +++ b/examples/lustre_rdma_avs/config.json @@ -14,13 +14,13 @@ "variables": { "location": "", "resource_group": "", - "image": "OpenLogic:CentOS-HPC:7.6:latest", + "image": "OpenLogic:CentOS-HPC:7.6:7.6.201910250", "lustreimage": "OpenLogic:CentOS:7.6:latest", - "hpcimage": "OpenLogic:CentOS-HPC:7.6:latest", + "hpcimage": "OpenLogic:CentOS-HPC:7.6:7.6.201910250", "compute_instances": 2, "lustre_instances": 2, "low_priority": true, - "storage_account": "", + "storage_account": "", "storage_key": "sakey.{{variables.storage_account}}", "storage_container": "", "log_analytics_lfs_name": "", @@ -35,7 +35,7 @@ "resources": { "headnode": { "type": "vm", - "vm_type": "Standard_HB60rs", + "vm_type": "Standard_HB120rs_v2", "accelerated_networking": false, "public_ip": true, "image": "variables.image", @@ -165,7 +165,10 @@ }, { "type": "local_script", - "script": "waitforreboot.sh" + "script": "wait.sh", + "args": [ + "180" + ] }, 
{ "script": "installOFED.sh", From 616553cbacee045633dcc0348d42d23caddf1f23 Mon Sep 17 00:00:00 2001 From: chadnar2 <52789065+chadnar2@users.noreply.github.com> Date: Wed, 1 Jul 2020 15:10:25 -0500 Subject: [PATCH 18/36] Delete waitforreboot.sh --- examples/lustre_rdma_avs/scripts/waitforreboot.sh | 2 -- 1 file changed, 2 deletions(-) delete mode 100755 examples/lustre_rdma_avs/scripts/waitforreboot.sh diff --git a/examples/lustre_rdma_avs/scripts/waitforreboot.sh b/examples/lustre_rdma_avs/scripts/waitforreboot.sh deleted file mode 100755 index 73411ca61..000000000 --- a/examples/lustre_rdma_avs/scripts/waitforreboot.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -sleep 180 #enough time for node reboot to continue process From f2c31c1cb11df667a1cb2223b5f90a8a5e1f4472 Mon Sep 17 00:00:00 2001 From: chadnar2 <52789065+chadnar2@users.noreply.github.com> Date: Wed, 1 Jul 2020 15:11:33 -0500 Subject: [PATCH 19/36] Delete waitforreboot.sh --- examples/lustre_rdma_nvmedrives/scripts/waitforreboot.sh | 2 -- 1 file changed, 2 deletions(-) delete mode 100755 examples/lustre_rdma_nvmedrives/scripts/waitforreboot.sh diff --git a/examples/lustre_rdma_nvmedrives/scripts/waitforreboot.sh b/examples/lustre_rdma_nvmedrives/scripts/waitforreboot.sh deleted file mode 100755 index 73411ca61..000000000 --- a/examples/lustre_rdma_nvmedrives/scripts/waitforreboot.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -sleep 180 #enough time for node reboot to continue process From 9aba5d253a4a5a012d9d828c45d3110d9f5164df Mon Sep 17 00:00:00 2001 From: chadnar2 <52789065+chadnar2@users.noreply.github.com> Date: Wed, 1 Jul 2020 15:12:51 -0500 Subject: [PATCH 20/36] Update config.json --- examples/lustre_rdma_nvmedrives/config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/lustre_rdma_nvmedrives/config.json b/examples/lustre_rdma_nvmedrives/config.json index 7ebb5bf43..313d028ef 100644 --- a/examples/lustre_rdma_nvmedrives/config.json +++ 
b/examples/lustre_rdma_nvmedrives/config.json @@ -129,7 +129,7 @@ }, { "type": "local_script", - "script": "waitforreboot.sh" + "script": "wait.sh" }, { "script": "installOFED.sh", From fd8175430be9bf7c7f6019ab0cd21b747b676be0 Mon Sep 17 00:00:00 2001 From: Narjit Chadha Date: Wed, 1 Jul 2020 16:22:41 -0500 Subject: [PATCH 21/36] change of lustre_rdma_nvmedrives config.json to use existing wait.sh --- examples/lustre_rdma_nvmedrives/config.json | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/lustre_rdma_nvmedrives/config.json b/examples/lustre_rdma_nvmedrives/config.json index 7ebb5bf43..5bd41c739 100644 --- a/examples/lustre_rdma_nvmedrives/config.json +++ b/examples/lustre_rdma_nvmedrives/config.json @@ -129,7 +129,10 @@ }, { "type": "local_script", - "script": "waitforreboot.sh" + "script": "wait.sh", + "args": [ + "180" + ] }, { "script": "installOFED.sh", From 3ed6c06c9cb33728807ba7a2d7a6b2e15d034175 Mon Sep 17 00:00:00 2001 From: Narjit Chadha Date: Thu, 2 Jul 2020 12:19:00 -0500 Subject: [PATCH 22/36] Moving all lustre rdma/ipoib work to experimental folder for further review. 
--- experimental/lustre_ipoib/config.json | 224 ++++++++++++++ experimental/lustre_ipoib/readme.md | 37 +++ .../lustre_ipoib/scripts/installdrives.sh | 33 +++ .../lustre_ipoib/scripts/lfsclient.sh | 44 +++ .../lustre_ipoib/scripts/lfsmaster.sh | 32 ++ experimental/lustre_ipoib/scripts/lfsoss.sh | 38 +++ experimental/lustre_ipoib/scripts/lfspkgs.sh | 11 + .../lustre_ipoib_nvmedrives/config.json | 205 +++++++++++++ .../lustre_ipoib_nvmedrives/readme.md | 35 +++ .../scripts/lfsclient.sh | 44 +++ .../scripts/lfsmaster.sh | 32 ++ .../lustre_ipoib_nvmedrives/scripts/lfsoss.sh | 38 +++ .../scripts/lfspkgs.sh | 11 + .../scripts/waitforreboot.sh | 2 + .../azhpc_install_config/hostlists/compute | 2 + .../azhpc_install_config/hostlists/headnode | 1 + .../azhpc_install_config/hostlists/lfsmaster | 1 + .../azhpc_install_config/hostlists/linux | 6 + .../azhpc_install_config/hostlists/lustre | 2 + .../hostlists/tags/cndefault | 6 + .../hostlists/tags/disable-selinux | 6 + .../hostlists/tags/lfsazimport | 1 + .../hostlists/tags/lfsclient | 3 + .../hostlists/tags/lfsloganalytics | 3 + .../hostlists/tags/lfsmaster | 1 + .../hostlists/tags/lfsrepo | 6 + .../hostlists/tags/localuser | 6 + .../hostlists/tags/loginnode | 1 + .../hostlists/tags/lustre | 3 + .../hostlists/tags/nfsclient | 5 + .../hostlists/tags/nfsserver | 1 + .../hostlists/tags/ossnode | 2 + .../hostlists/tags/pbsclient | 2 + .../hostlists/tags/pbsserver | 1 + .../hostlists/tags/rebootlustre | 1 + .../azhpc_install_config/hpcadmin_id_rsa | 27 ++ .../azhpc_install_config/hpcadmin_id_rsa.pub | 1 + .../install/00_install_node_setup.sh | 48 +++ .../install/01_disable-selinux.sh | 18 ++ .../install/02_cndefault.sh | 18 ++ .../install/03_nfsserver.sh | 18 ++ .../install/04_nfsclient.sh | 18 ++ .../install/05_localuser.sh | 18 ++ .../install/06_lfsrepo.sh | 18 ++ .../install/07_lustreinstall1.sh | 18 ++ .../install/08_rebootlustre.sh | 18 ++ .../install/09_waitforreboot.sh | 7 + .../install/10_installOFED.sh | 18 ++ 
.../install/11_lustreinstall2.sh | 18 ++ .../install/12_lustrenetwork.sh | 18 ++ .../install/13_lfsmaster.sh | 18 ++ .../azhpc_install_config/install/14_lfsoss.sh | 18 ++ .../azhpc_install_config/install/15_lfshsm.sh | 18 ++ .../install/16_lfsclient.sh | 18 ++ .../install/17_lfsimport.sh | 18 ++ .../install/18_lfsloganalytics.sh | 18 ++ .../install/19_pbsdownload.sh | 18 ++ .../install/20_pbsserver.sh | 19 ++ .../install/21_pbsclient.sh | 19 ++ .../azhpc_install_config/scripts/cndefault.sh | 23 ++ .../scripts/disable-selinux.sh | 6 + .../scripts/installOFED.sh | 4 + .../azhpc_install_config/scripts/lfsclient.sh | 48 +++ .../azhpc_install_config/scripts/lfshsm.sh | 95 ++++++ .../azhpc_install_config/scripts/lfsimport.sh | 31 ++ .../scripts/lfsloganalytics.sh | 31 ++ .../azhpc_install_config/scripts/lfsmaster.sh | 31 ++ .../azhpc_install_config/scripts/lfsoss.sh | 30 ++ .../azhpc_install_config/scripts/lfsrepo.sh | 27 ++ .../azhpc_install_config/scripts/localuser.sh | 40 +++ .../scripts/lustreinstall1.sh | 8 + .../scripts/lustreinstall2.sh | 10 + .../scripts/lustrenetwork.sh | 9 + .../azhpc_install_config/scripts/nfsclient.sh | 34 +++ .../azhpc_install_config/scripts/nfsserver.sh | 212 +++++++++++++ .../azhpc_install_config/scripts/pbsclient.sh | 22 ++ .../scripts/pbsdownload.sh | 9 + .../azhpc_install_config/scripts/pbsserver.sh | 19 ++ .../scripts/rebootlustre.sh | 16 + .../scripts/waitforreboot.sh | 2 + experimental/lustre_rdma_avs/config.json | 278 ++++++++++++++++++ experimental/lustre_rdma_avs/readme.md | 36 +++ .../lustre_rdma_avs/scripts/installOFED.sh | 4 + .../lustre_rdma_avs/scripts/lfsclient.sh | 57 ++++ .../lustre_rdma_avs/scripts/lfsmaster.sh | 31 ++ .../lustre_rdma_avs/scripts/lfsoss.sh | 30 ++ .../lustre_rdma_avs/scripts/lfsrepo.sh | 27 ++ .../lustre_rdma_avs/scripts/lustreinstall1.sh | 8 + .../lustre_rdma_avs/scripts/lustreinstall2.sh | 10 + .../lustre_rdma_avs/scripts/lustrenetwork.sh | 9 + .../lustre_rdma_avs/scripts/rebootlustre.sh | 16 + 
experimental/lustre_rdma_avs/writeup | 20 ++ .../lustre_rdma_nvmedrives/config.json | 241 +++++++++++++++ experimental/lustre_rdma_nvmedrives/readme.md | 38 +++ .../scripts/installOFED.sh | 4 + .../scripts/lfsclient.sh | 47 +++ .../scripts/lfsmaster.sh | 33 +++ .../lustre_rdma_nvmedrives/scripts/lfsoss.sh | 32 ++ .../lustre_rdma_nvmedrives/scripts/lfsrepo.sh | 27 ++ .../scripts/lustreinstall1.sh | 8 + .../scripts/lustreinstall2.sh | 10 + .../scripts/lustrenetwork.sh | 9 + .../scripts/rebootlustre.sh | 19 ++ experimental/lustre_rdma_nvmedrives/writeup | 17 ++ 104 files changed, 3009 insertions(+) create mode 100644 experimental/lustre_ipoib/config.json create mode 100644 experimental/lustre_ipoib/readme.md create mode 100755 experimental/lustre_ipoib/scripts/installdrives.sh create mode 100755 experimental/lustre_ipoib/scripts/lfsclient.sh create mode 100755 experimental/lustre_ipoib/scripts/lfsmaster.sh create mode 100755 experimental/lustre_ipoib/scripts/lfsoss.sh create mode 100755 experimental/lustre_ipoib/scripts/lfspkgs.sh create mode 100644 experimental/lustre_ipoib_nvmedrives/config.json create mode 100644 experimental/lustre_ipoib_nvmedrives/readme.md create mode 100755 experimental/lustre_ipoib_nvmedrives/scripts/lfsclient.sh create mode 100755 experimental/lustre_ipoib_nvmedrives/scripts/lfsmaster.sh create mode 100755 experimental/lustre_ipoib_nvmedrives/scripts/lfsoss.sh create mode 100755 experimental/lustre_ipoib_nvmedrives/scripts/lfspkgs.sh create mode 100755 experimental/lustre_ipoib_nvmedrives/scripts/waitforreboot.sh create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/compute create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/headnode create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/lfsmaster create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/linux create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/lustre 
create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/cndefault create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/disable-selinux create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsazimport create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsclient create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsloganalytics create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsmaster create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsrepo create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/localuser create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/loginnode create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lustre create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsclient create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsserver create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/ossnode create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsclient create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsserver create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/rebootlustre create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa create mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa.pub create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/00_install_node_setup.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/01_disable-selinux.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/02_cndefault.sh create mode 
100755 experimental/lustre_rdma_avs/azhpc_install_config/install/03_nfsserver.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/04_nfsclient.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/05_localuser.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/06_lfsrepo.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/07_lustreinstall1.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/08_rebootlustre.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/09_waitforreboot.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/10_installOFED.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/11_lustreinstall2.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/12_lustrenetwork.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/13_lfsmaster.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/14_lfsoss.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/15_lfshsm.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/16_lfsclient.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/17_lfsimport.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/18_lfsloganalytics.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/19_pbsdownload.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/20_pbsserver.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/21_pbsclient.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/cndefault.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/disable-selinux.sh create mode 100755 
experimental/lustre_rdma_avs/azhpc_install_config/scripts/installOFED.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsclient.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfshsm.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsimport.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsloganalytics.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsmaster.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsoss.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsrepo.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/localuser.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall1.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall2.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustrenetwork.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/nfsclient.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/nfsserver.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsclient.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsdownload.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsserver.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/rebootlustre.sh create mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/waitforreboot.sh create mode 100644 experimental/lustre_rdma_avs/config.json create mode 100644 experimental/lustre_rdma_avs/readme.md create mode 100755 experimental/lustre_rdma_avs/scripts/installOFED.sh create mode 100755 experimental/lustre_rdma_avs/scripts/lfsclient.sh create mode 100755 
experimental/lustre_rdma_avs/scripts/lfsmaster.sh create mode 100755 experimental/lustre_rdma_avs/scripts/lfsoss.sh create mode 100755 experimental/lustre_rdma_avs/scripts/lfsrepo.sh create mode 100755 experimental/lustre_rdma_avs/scripts/lustreinstall1.sh create mode 100755 experimental/lustre_rdma_avs/scripts/lustreinstall2.sh create mode 100755 experimental/lustre_rdma_avs/scripts/lustrenetwork.sh create mode 100755 experimental/lustre_rdma_avs/scripts/rebootlustre.sh create mode 100644 experimental/lustre_rdma_avs/writeup create mode 100644 experimental/lustre_rdma_nvmedrives/config.json create mode 100644 experimental/lustre_rdma_nvmedrives/readme.md create mode 100755 experimental/lustre_rdma_nvmedrives/scripts/installOFED.sh create mode 100755 experimental/lustre_rdma_nvmedrives/scripts/lfsclient.sh create mode 100755 experimental/lustre_rdma_nvmedrives/scripts/lfsmaster.sh create mode 100755 experimental/lustre_rdma_nvmedrives/scripts/lfsoss.sh create mode 100755 experimental/lustre_rdma_nvmedrives/scripts/lfsrepo.sh create mode 100755 experimental/lustre_rdma_nvmedrives/scripts/lustreinstall1.sh create mode 100755 experimental/lustre_rdma_nvmedrives/scripts/lustreinstall2.sh create mode 100755 experimental/lustre_rdma_nvmedrives/scripts/lustrenetwork.sh create mode 100755 experimental/lustre_rdma_nvmedrives/scripts/rebootlustre.sh create mode 100644 experimental/lustre_rdma_nvmedrives/writeup diff --git a/experimental/lustre_ipoib/config.json b/experimental/lustre_ipoib/config.json new file mode 100644 index 000000000..0127dc390 --- /dev/null +++ b/experimental/lustre_ipoib/config.json @@ -0,0 +1,224 @@ +{ + "location": "variables.location", + "resource_group": "variables.resource_group", + "install_from": "headnode", + "admin_user": "hpcadmin", + "vnet": { + "name": "hpcvnet", + "address_prefix": "10.2.0.0/20", + "subnets": { + "compute": "10.2.0.0/22", + "storage": "10.2.4.0/24" + } + }, + "variables": { + "location": "", + "resource_group": "", + 
"image": "OpenLogic:CentOS-HPC:7.6:latest", + "lustreimage": "OpenLogic:CentOS-HPC:7.6:latest", + "drivenum": 4, + "ossnum": 4, + "low_priority": true, + "storage_account": "", + "storage_key": "sakey.{{variables.storage_account}}", + "storage_container": "", + "log_analytics_lfs_name": "", + "la_resourcegroup": "", + "la_name": "", + "log_analytics_workspace": "laworkspace.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "log_analytics_key": "lakey.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "lustre_version": "2.10", + "lustre_mount": "/lustre" + }, + "resources": { + "headnode": { + "type": "vm", + "vm_type": "Standard_HC44rs", + "accelerated_networking": false, + "public_ip": true, + "image": "variables.image", + "subnet": "compute", + "tags": [ + "disable-selinux", + "cndefault", + "lfsrepo", + "lfsclient", + "lfsazimport", + "localuser", + "pbsserver", + "loginnode", + "nfsserver" + ] + }, + "lustre": { + "type": "vmss", + "vm_type": "Standard_HC44rs", + "instances": "9", + "accelerated_networking": false, + "image": "variables.lustreimage", + "subnet": "storage", + "tags": [ + "cndefault", + "lustre[0:5]", + "osses[1:5]", + "lfsrepo", + "lfsclient[5:9]", + "localuser", + "pbsclient[5:9]", + "nfsclient", + "disable-selinux", + "lfsloganalytics" + ] + } + }, + "install": [ + { + "script": "disable-selinux.sh", + "tag": "disable-selinux", + "sudo": true + }, + { + "script": "cndefault.sh", + "tag": "cndefault", + "sudo": true + }, + { + "script": "nfsserver.sh", + "tag": "nfsserver", + "sudo": true + }, + { + "script": "nfsclient.sh", + "args": [ + "$( Note: The HC nodes are used for the cluster, although this node type may be easily changed by use of the vm_type variable for lustre inside config.json. 
+ +The configuration file requires the following variables to be set: + +| Variable | Description | +|-------------------------|----------------------------------------------| +| resource_group | The resource group for the project | +| storage_account | The storage account for HSM | +| storage_key | The storage key for HSM | +| storage_container | The container to use for HSM | +| log_analytics_lfs_name | The lustre filesystem name for Log Analytics | +| la_resourcegroup | The resource group for Log Analytics | +| la_name | The Log Analytics Workspace name | + +> Note: you can remove log anaytics and/or HSM from the config file if not required. + +> Note: Key Vault should be used for the keys to keep them out of the config files. diff --git a/experimental/lustre_ipoib/scripts/installdrives.sh b/experimental/lustre_ipoib/scripts/installdrives.sh new file mode 100755 index 000000000..221e2349e --- /dev/null +++ b/experimental/lustre_ipoib/scripts/installdrives.sh @@ -0,0 +1,33 @@ +#!/bin/bash +groupname=$1 +vmlist=$2 +ossnum=$3 +drivenum=$4 + +#create the drives first before attachint to vmss +drivecount=$(($drivenum*$ossnum)) + +for ((num=1; num<=$drivecount; num++)); do + az disk create -g $groupname -n "lustredrive$num" --size-gb 1024 & +done + +sleep 60 # to ensure all drives are made + +#Now use the created drives +index=0 +lustrecnt=1 + +idlisttmp=$(az vmss list-instances --resource-group $groupname --name lustre |grep providers/Microsoft.Compute/virtualMachineScaleSets/lustre/virtualMachines | awk -F "virtualMachines/" '{print $2}' | sed '/networkInterfaces/d'| sed 's/["].*$//') + +idlist=($idlisttmp) + +for vmname in ${vmlist[@]}; do + ((index++)) + if [ $index -gt 0 ] ; then + for ((diskid=1; diskid<=$drivenum; diskid++)); do + az vmss disk attach --vmss-name lustre --disk lustredrive${lustrecnt} --sku Premium_LRS --instance-id ${idlist[$index]} --resource-group $groupname + ((lustrecnt++)) + done + fi +done + diff --git 
a/experimental/lustre_ipoib/scripts/lfsclient.sh b/experimental/lustre_ipoib/scripts/lfsclient.sh new file mode 100755 index 000000000..4e30d37fa --- /dev/null +++ b/experimental/lustre_ipoib/scripts/lfsclient.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# arg: $1 = lfsserver +# arg: $2 = mount point (default: /lustre) +master=$1 +lfs_mount=${2:-/lustre} +mkdir ~/.ssh + +cp -r /share/home/hpcuser/.ssh ~/ + +capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") +masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) + +if rpm -q lustre; then + + # if the server packages are installed only the client kmod is needed + # for 2.10 and nothing extra is needed for 2.12 + if [ "$lustre_version" = "2.10" ]; then + + if ! rpm -q kmod-lustre-client; then + yum -y install kmod-lustre-client + fi + + fi + +else + + # install the client RPMs if not already installed + if ! rpm -q lustre-client kmod-lustre-client; then + yum -y install lustre-client kmod-lustre-client + fi + weak-modules --add-kernel $(uname -r) + +fi +#Include the correct infiniband options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab +mount -a +chmod 777 $lfs_mount diff --git a/experimental/lustre_ipoib/scripts/lfsmaster.sh b/experimental/lustre_ipoib/scripts/lfsmaster.sh new file mode 100755 index 000000000..dce36a159 --- /dev/null +++ b/experimental/lustre_ipoib/scripts/lfsmaster.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# arg: $1 = device (e.g. 
L=/dev/sdb Lv2=/dev/nvme0n1) +device=$1 + +# this will only install MDS on first node in a scaleset +echo "pssh_nodenum is $PSSH_NODENUM" + +cp -r /share/home/hpcuser/.ssh ~/ + +#Include the correct ipoib options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab + mount -a + + # set up hsm + lctl set_param -P mdt.*-MDT0000.hsm_control=enabled + lctl set_param -P mdt.*-MDT0000.hsm.default_archive_id=1 + lctl set_param mdt.*-MDT0000.hsm.max_requests=128 + + # allow any user and group ids to write + lctl set_param mdt.*-MDT0000.identity_upcall=NONE + +fi diff --git a/experimental/lustre_ipoib/scripts/lfsoss.sh b/experimental/lustre_ipoib/scripts/lfsoss.sh new file mode 100755 index 000000000..0b9b060a5 --- /dev/null +++ b/experimental/lustre_ipoib/scripts/lfsoss.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# arg: $1 = lfsmaster +# arg: $2 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) +master=$1 +device=$2 + +cp -r /share/home/hpcuser/.ssh ~/ + +index=$(($PSSH_NODENUM + 1)) +myuser="hpcuser" + +capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") +masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) + +if [ "$PSSH_NODENUM" != "0" ]; then + + mkfs.lustre \ + --fsname=LustreFS \ + --backfstype=ldiskfs \ + --reformat \ + --ost \ + --mgsnode="${masterib}" \ + --index=$index \ + --mountfsoptions="errors=remount-ro" \ + $device +#Include the correct ipoib options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab +mount -a +fi diff --git a/experimental/lustre_ipoib/scripts/lfspkgs.sh b/experimental/lustre_ipoib/scripts/lfspkgs.sh new file mode 100755 index 000000000..3120d3ba6 --- /dev/null +++ b/experimental/lustre_ipoib/scripts/lfspkgs.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +yum -y install lustre kmod-lustre-osd-ldiskfs lustre-osd-ldiskfs-mount lustre-resource-agents e2fsprogs || exit 1 + +sed -i 's/ResourceDisk\.Format=y/ResourceDisk.Format=n/g' /etc/waagent.conf + +systemctl restart waagent + +weak-modules --add-kernel --no-initramfs + +umount /mnt/resource diff 
--git a/experimental/lustre_ipoib_nvmedrives/config.json b/experimental/lustre_ipoib_nvmedrives/config.json new file mode 100644 index 000000000..4a7bd988b --- /dev/null +++ b/experimental/lustre_ipoib_nvmedrives/config.json @@ -0,0 +1,205 @@ +{ + "location": "variables.location", + "resource_group": "variables.resource_group", + "install_from": "headnode", + "admin_user": "hpcadmin", + "vnet": { + "name": "hpcvnet", + "address_prefix": "10.2.0.0/20", + "subnets": { + "compute": "10.2.0.0/22", + "storage": "10.2.4.0/24" + } + }, + "variables": { + "location" : "", + "resource_group": "", + "image": "OpenLogic:CentOS-HPC:7.6:latest", + "lustreimage": "OpenLogic:CentOS-HPC:7.6:latest", + "drivenum": 4, + "ossnum": 4, + "low_priority": true, + "storage_account": "", + "storage_key": "sakey.{{variables.storage_account}}", + "storage_container": "", + "log_analytics_lfs_name": "", + "la_resourcegroup": "", + "la_name": "", + "log_analytics_workspace": "laworkspace.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "log_analytics_key": "lakey.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "lustre_version": "2.10", + "lustre_mount": "/lustre" + }, + "resources": { + "headnode": { + "type": "vm", + "vm_type": "Standard_HB60rs", + "accelerated_networking": false, + "public_ip": true, + "image": "variables.image", + "subnet": "compute", + "tags": [ + "disable-selinux", + "cndefault", + "lfsrepo", + "lfsclient", + "lfsazimport", + "localuser", + "pbsserver", + "loginnode", + "nfsserver" + ] + }, + "lustre": { + "type": "vmss", + "vm_type": "Standard_HB120rs_v2", + "instances": "9", + "accelerated_networking": false, + "image": "variables.lustreimage", + "subnet": "storage", + "tags": [ + "cndefault", + "lustre[0:5]", + "osses[1:5]", + "lfsrepo", + "lfsclient[5:9]", + "localuser", + "pbsclient[5:9]", + "nfsclient", + "disable-selinux", + "lfsloganalytics" + ] + } + }, + "install": [ + { + "script": "disable-selinux.sh", + "tag": "disable-selinux", + "sudo": 
true + }, + { + "script": "cndefault.sh", + "tag": "cndefault", + "sudo": true + }, + { + "script": "nfsserver.sh", + "tag": "nfsserver", + "sudo": true + }, + { + "script": "nfsclient.sh", + "args": [ + "$( Note: The HC nodes are used for the cluster, although this node type may be easily changed by use of the vm_type variable for lustre inside config.json. + +The configuration file requires the following variables to be set: + +| Variable | Description | +|-------------------------|----------------------------------------------| +| resource_group | The resource group for the project | +| storage_account | The storage account for HSM | +| storage_key | The storage key for HSM | +| storage_container | The container to use for HSM | +| log_analytics_lfs_name | The lustre filesystem name for Log Analytics | +| la_resourcegroup | The resource group for Log Analytics | +| la_name | The Log Analytics Workspace name | + +> Note: you can remove log anaytics and/or HSM from the config file if not required. + +> Note: Key Vault should be used for the keys to keep them out of the config files. diff --git a/experimental/lustre_ipoib_nvmedrives/scripts/lfsclient.sh b/experimental/lustre_ipoib_nvmedrives/scripts/lfsclient.sh new file mode 100755 index 000000000..4e30d37fa --- /dev/null +++ b/experimental/lustre_ipoib_nvmedrives/scripts/lfsclient.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# arg: $1 = lfsserver +# arg: $2 = mount point (default: /lustre) +master=$1 +lfs_mount=${2:-/lustre} +mkdir ~/.ssh + +cp -r /share/home/hpcuser/.ssh ~/ + +capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") +masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) + +if rpm -q lustre; then + + # if the server packages are installed only the client kmod is needed + # for 2.10 and nothing extra is needed for 2.12 + if [ "$lustre_version" = "2.10" ]; then + + if ! 
rpm -q kmod-lustre-client; then + yum -y install kmod-lustre-client + fi + + fi + +else + + # install the client RPMs if not already installed + if ! rpm -q lustre-client kmod-lustre-client; then + yum -y install lustre-client kmod-lustre-client + fi + weak-modules --add-kernel $(uname -r) + +fi +#Include the correct infiniband options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab +mount -a +chmod 777 $lfs_mount diff --git a/experimental/lustre_ipoib_nvmedrives/scripts/lfsmaster.sh b/experimental/lustre_ipoib_nvmedrives/scripts/lfsmaster.sh new file mode 100755 index 000000000..dce36a159 --- /dev/null +++ b/experimental/lustre_ipoib_nvmedrives/scripts/lfsmaster.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# arg: $1 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) +device=$1 + +# this will only install MDS on first node in a scaleset +echo "pssh_nodenum is $PSSH_NODENUM" + +cp -r /share/home/hpcuser/.ssh ~/ + +#Include the correct ipoib options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab + mount -a + + # set up hsm + lctl set_param -P mdt.*-MDT0000.hsm_control=enabled + lctl set_param -P mdt.*-MDT0000.hsm.default_archive_id=1 + lctl set_param mdt.*-MDT0000.hsm.max_requests=128 + + # allow any user and group ids to write + lctl set_param mdt.*-MDT0000.identity_upcall=NONE + +fi diff --git a/experimental/lustre_ipoib_nvmedrives/scripts/lfsoss.sh b/experimental/lustre_ipoib_nvmedrives/scripts/lfsoss.sh new file mode 100755 index 000000000..0b9b060a5 --- /dev/null +++ b/experimental/lustre_ipoib_nvmedrives/scripts/lfsoss.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# arg: $1 = lfsmaster +# arg: $2 = device (e.g. 
L=/dev/sdb Lv2=/dev/nvme0n1) +master=$1 +device=$2 + +cp -r /share/home/hpcuser/.ssh ~/ + +index=$(($PSSH_NODENUM + 1)) +myuser="hpcuser" + +capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") +masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) + +if [ "$PSSH_NODENUM" != "0" ]; then + + mkfs.lustre \ + --fsname=LustreFS \ + --backfstype=ldiskfs \ + --reformat \ + --ost \ + --mgsnode="${masterib}" \ + --index=$index \ + --mountfsoptions="errors=remount-ro" \ + $device +#Include the correct ipoib options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab +mount -a +fi diff --git a/experimental/lustre_ipoib_nvmedrives/scripts/lfspkgs.sh b/experimental/lustre_ipoib_nvmedrives/scripts/lfspkgs.sh new file mode 100755 index 000000000..3120d3ba6 --- /dev/null +++ b/experimental/lustre_ipoib_nvmedrives/scripts/lfspkgs.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +yum -y install lustre kmod-lustre-osd-ldiskfs lustre-osd-ldiskfs-mount lustre-resource-agents e2fsprogs || exit 1 + +sed -i 's/ResourceDisk\.Format=y/ResourceDisk.Format=n/g' /etc/waagent.conf + +systemctl restart waagent + +weak-modules --add-kernel --no-initramfs + +umount /mnt/resource diff --git a/experimental/lustre_ipoib_nvmedrives/scripts/waitforreboot.sh b/experimental/lustre_ipoib_nvmedrives/scripts/waitforreboot.sh new file mode 100755 index 000000000..753167b8f --- /dev/null +++ b/experimental/lustre_ipoib_nvmedrives/scripts/waitforreboot.sh @@ -0,0 +1,2 @@ +#!/bin/bash +sleep 60 #enough time for node reboot to continue process diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/compute b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/compute new file mode 100644 index 000000000..232110d4a --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/compute @@ -0,0 +1,2 @@ +compute0001 +compute0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/headnode 
b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/headnode new file mode 100644 index 000000000..1a9798066 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/headnode @@ -0,0 +1 @@ +headnode diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/lfsmaster b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/lfsmaster new file mode 100644 index 000000000..a47bf87fe --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/lfsmaster @@ -0,0 +1 @@ +lfsmaster diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/linux b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/linux new file mode 100644 index 000000000..337053fb6 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/linux @@ -0,0 +1,6 @@ +headnode +compute0001 +compute0002 +lfsmaster +lustre0001 +lustre0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/lustre b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/lustre new file mode 100644 index 000000000..b8f9b2061 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/lustre @@ -0,0 +1,2 @@ +lustre0001 +lustre0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/cndefault b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/cndefault new file mode 100644 index 000000000..337053fb6 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/cndefault @@ -0,0 +1,6 @@ +headnode +compute0001 +compute0002 +lfsmaster +lustre0001 +lustre0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/disable-selinux b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/disable-selinux new file mode 100644 index 000000000..337053fb6 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/disable-selinux @@ -0,0 +1,6 @@ +headnode 
+compute0001 +compute0002 +lfsmaster +lustre0001 +lustre0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsazimport b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsazimport new file mode 100644 index 000000000..1a9798066 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsazimport @@ -0,0 +1 @@ +headnode diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsclient b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsclient new file mode 100644 index 000000000..8af893f49 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsclient @@ -0,0 +1,3 @@ +headnode +compute0001 +compute0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsloganalytics b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsloganalytics new file mode 100644 index 000000000..6453c2e60 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsloganalytics @@ -0,0 +1,3 @@ +lfsmaster +lustre0001 +lustre0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsmaster b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsmaster new file mode 100644 index 000000000..a47bf87fe --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsmaster @@ -0,0 +1 @@ +lfsmaster diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsrepo b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsrepo new file mode 100644 index 000000000..337053fb6 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsrepo @@ -0,0 +1,6 @@ +headnode +compute0001 +compute0002 +lfsmaster +lustre0001 +lustre0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/localuser 
b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/localuser new file mode 100644 index 000000000..337053fb6 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/localuser @@ -0,0 +1,6 @@ +headnode +compute0001 +compute0002 +lfsmaster +lustre0001 +lustre0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/loginnode b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/loginnode new file mode 100644 index 000000000..1a9798066 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/loginnode @@ -0,0 +1 @@ +headnode diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lustre b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lustre new file mode 100644 index 000000000..6453c2e60 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lustre @@ -0,0 +1,3 @@ +lfsmaster +lustre0001 +lustre0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsclient b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsclient new file mode 100644 index 000000000..748d1c5dc --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsclient @@ -0,0 +1,5 @@ +compute0001 +compute0002 +lfsmaster +lustre0001 +lustre0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsserver b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsserver new file mode 100644 index 000000000..1a9798066 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsserver @@ -0,0 +1 @@ +headnode diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/ossnode b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/ossnode new file mode 100644 index 000000000..b8f9b2061 --- /dev/null +++ 
b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/ossnode @@ -0,0 +1,2 @@ +lustre0001 +lustre0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsclient b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsclient new file mode 100644 index 000000000..232110d4a --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsclient @@ -0,0 +1,2 @@ +compute0001 +compute0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsserver b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsserver new file mode 100644 index 000000000..1a9798066 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsserver @@ -0,0 +1 @@ +headnode diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/rebootlustre b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/rebootlustre new file mode 100644 index 000000000..1a9798066 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/rebootlustre @@ -0,0 +1 @@ +headnode diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa b/experimental/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa new file mode 100644 index 000000000..7846d2b39 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa @@ -0,0 +1,27 @@ +-----BEGIN RSA PRIVATE KEY----- +MIIEpAIBAAKCAQEA1JCjVUGcKCYN3RCERznjr7e1Chsf+DG30uluSXk3I/6nesto +5gLGfKiTjeHWvX5tqFITAA84r140AgsIcUHEpaWwk06QIVUTj6kDHbubP0i1V2EY +2sa6cm6hPQmsFIiOK578BLuv/Zda/arVJ1dq1q+1t0tt84TTCrsROszNw8t9Kc3Z +Gn2SY7F52Z8nttmN7OEsfUtg6K6f/5IwbJb7U8b/0jF6yWDpzrmqN33BJfrZ1VWs +jswhblxJZ0juAU/oAB0xtOzqM2vwUZy9FmcfRPo/U1gM4DUG1h37oWWkoQLhgURu +p3Lztqq8msXXsnk3ZnIkMWWNJ429fN2ui751QQIDAQABAoIBAGYQfRy2wDBW9Vks +UReSKE17PCZ6F8Oou8c95oLI/Tz/TZOcj+XBd2Tr3M3HnsCmMCkeH5lrtaAe74H7 +ojYfijivcjWJB5O5sgbM9H4WUtj0JH6sVK7XtTa1AB66wjGpz/oKAKCVLk/pmPss 
+R+T4CIjFHc/BHC5NnLgOUpuVM0fLUUUF8NmIvT6K0P4j7GZx12d1TDkqo+/rd1ku +EOuCjl8Q4bTO0qtJEXy2dmn38m6QGNS765j8gQ21wWY+Q7EX4JaJ+oO2ZgGuyYul +Cu+AFlCR4SkOok0DN6RG4KQ7Sly57HrZWwLI46FXmjiJqE/7wNvMwuHdUmnVbkoY +v04fxAECgYEA8ii6KMsPIxnMSCBpsRoFSOcPdSyoFyhMCCuiR9liCGRG4wz4u1i6 +ZFal1+d/rX6qxCTIZxvU8zn54Qsrr+44zV++4+Sd/nhrc+qWOxGggAscbYNG3w2g +GTGinERFPRs5iGmdJ0n+uy/TSPe5t0qH85AdKcU47mfrNb3Q08rEfxECgYEA4Lbj +zkCUa4UN6CP36FtOUNtpnrn7dxfDNpcS8CTt/oob2OifOUGhgPbCio5at7zE8cH0 +hWrUWFPDfBRliGdG/ZzdmIOaC0MU9eQG4JxkblgYccKpcYsTq45NDyhQJ0lbBjRG +Sp42HOnvZ8p0m9przrnQF22Bvr5E+VF1wVk18zECgYEA7pI9RS84jIZAAdcdCYPv +LPGnAvOp7paewXXrfQmnUUkppUsESd6SU4QiA2FpIk4mgvMSFLMQy0eU7KeKtNrn +Tz5C3FZBaZDNm/fDZhJpo3xO13179wh/cBK8d2OzKw6FUeVrFGgL8/KcH8kfSHq/ +EbAraxmIiygKTHnjIKUljWECgYAQxhYjIzbw/7GWDnlG4unppzcvHfrjXOa5gHVt +b5REV9LUUijwgTGpCsJizVWAOZsJ4Mx72QmYvkftTyh1EiB+deMkq04oYQ2DfU32 +HjZw9ip882bqjtMdDzY5V20EQbmFsQk+MKkhZ2Tzfm1N5PP/LmeWGBqDPnivk6ES +mbIpQQKBgQDqnc9KivmjPIHz2BpJh8icWkdvZ2WzycI3Sly6Suh0E6Q+epMTXUm3 +21TIEkkAlBYXkHs0ZhL7l7jzv5yYSGB8ZNDzk+UquE5OuxMwWsd3trqyJ3LMj9C5 +hV6JTHqNSw8xubCES0oRgJkcCedoQ0qxMwypnJarWPh/LSVCu3BZ2A== +-----END RSA PRIVATE KEY----- diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa.pub b/experimental/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa.pub new file mode 100644 index 000000000..20776c3a0 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa.pub @@ -0,0 +1 @@ +ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDUkKNVQZwoJg3dEIRHOeOvt7UKGx/4MbfS6W5JeTcj/qd6y2jmAsZ8qJON4da9fm2oUhMADzivXjQCCwhxQcSlpbCTTpAhVROPqQMdu5s/SLVXYRjaxrpybqE9CawUiI4rnvwEu6/9l1r9qtUnV2rWr7W3S23zhNMKuxE6zM3Dy30pzdkafZJjsXnZnye22Y3s4Sx9S2Dorp//kjBslvtTxv/SMXrJYOnOuao3fcEl+tnVVayOzCFuXElnSO4BT+gAHTG07Ooza/BRnL0WZx9E+j9TWAzgNQbWHfuhZaShAuGBRG6ncvO2qryaxdeyeTdmciQxZY0njb183a6LvnVB diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/00_install_node_setup.sh 
b/experimental/lustre_rdma_avs/azhpc_install_config/install/00_install_node_setup.sh new file mode 100755 index 000000000..d5e1850c6 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/00_install_node_setup.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=linux + +if [ ! -f "hostlists/$tag" ]; then + echo "no hostlist ($tag), exiting" + exit 0 +fi + +# wait for DNS to update for all hostnames +for h in $(/dev/null 2>&1; do + echo "Waiting for host - $h (sleeping for 5 seconds)" + sleep 5 + done +done + +if [ "$1" != "" ]; then + tag=tags/$1 +else + sudo yum install -y epel-release > install/00_install_node_setup.log 2>&1 + sudo yum install -y pssh nc >> install/00_install_node_setup.log 2>&1 + + # setting up keys + cat < ~/.ssh/config + Host * + StrictHostKeyChecking no + UserKnownHostsFile /dev/null + LogLevel ERROR +EOF + cp hpcadmin_id_rsa.pub ~/.ssh/id_rsa.pub + cp hpcadmin_id_rsa ~/.ssh/id_rsa + chmod 600 ~/.ssh/id_rsa + chmod 644 ~/.ssh/config + chmod 644 ~/.ssh/id_rsa.pub + +fi + +pssh -p 50 -t 0 -i -h hostlists/$tag 'rpm -q rsync || sudo yum install -y rsync' >> install/00_install_node_setup.log 2>&1 + +prsync -p 50 -a -h hostlists/$tag ~/azhpc_install_config ~ >> install/00_install_node_setup.log 2>&1 +prsync -p 50 -a -h hostlists/$tag ~/.ssh ~ >> install/00_install_node_setup.log 2>&1 + +pssh -p 50 -t 0 -i -h hostlists/$tag 'echo "AcceptEnv PSSH_NODENUM PSSH_HOST" | sudo tee -a /etc/ssh/sshd_config' >> install/00_install_node_setup.log 2>&1 +pssh -p 50 -t 0 -i -h hostlists/$tag 'sudo systemctl restart sshd' >> install/00_install_node_setup.log 2>&1 +pssh -p 50 -t 0 -i -h hostlists/$tag "echo 'Defaults env_keep += \"PSSH_NODENUM PSSH_HOST\"' | sudo tee -a /etc/sudoers" >> install/00_install_node_setup.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/01_disable-selinux.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/01_disable-selinux.sh new file mode 
100755 index 000000000..aff9f6abd --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/01_disable-selinux.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-disable-selinux} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/disable-selinux.sh" >> install/01_disable-selinux.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/02_cndefault.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/02_cndefault.sh new file mode 100755 index 000000000..89df21b38 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/02_cndefault.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-cndefault} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/cndefault.sh" >> install/02_cndefault.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/03_nfsserver.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/03_nfsserver.sh new file mode 100755 index 000000000..9fe8fc049 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/03_nfsserver.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-nfsserver} + +if [ ! 
-f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/nfsserver.sh" >> install/03_nfsserver.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/04_nfsclient.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/04_nfsclient.sh new file mode 100755 index 000000000..3ef1d7dd2 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/04_nfsclient.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-nfsclient} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/nfsclient.sh '$(> install/04_nfsclient.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/05_localuser.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/05_localuser.sh new file mode 100755 index 000000000..547517af7 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/05_localuser.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-localuser} + +if [ ! 
-f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/localuser.sh '$(> install/05_localuser.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/06_lfsrepo.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/06_lfsrepo.sh new file mode 100755 index 000000000..c51d1a7bf --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/06_lfsrepo.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-lfsrepo} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsrepo.sh '2.10'" >> install/06_lfsrepo.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/07_lustreinstall1.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/07_lustreinstall1.sh new file mode 100755 index 000000000..9c9e725e1 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/07_lustreinstall1.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-lustre} + +if [ ! 
-f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lustreinstall1.sh" >> install/07_lustreinstall1.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/08_rebootlustre.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/08_rebootlustre.sh new file mode 100755 index 000000000..cafe2dccc --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/08_rebootlustre.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-rebootlustre} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/rebootlustre.sh '$(> install/08_rebootlustre.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/09_waitforreboot.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/09_waitforreboot.sh new file mode 100755 index 000000000..e7f2585c1 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/09_waitforreboot.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." 
+ +scripts/waitforreboot.sh >> install/09_waitforreboot.log 2>&1 + diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/10_installOFED.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/10_installOFED.sh new file mode 100755 index 000000000..0a9d5144c --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/10_installOFED.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-lustre} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/installOFED.sh" >> install/10_installOFED.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/11_lustreinstall2.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/11_lustreinstall2.sh new file mode 100755 index 000000000..415de3119 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/11_lustreinstall2.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-lustre} + +if [ ! 
-f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lustreinstall2.sh" >> install/11_lustreinstall2.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/12_lustrenetwork.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/12_lustrenetwork.sh new file mode 100755 index 000000000..210bc389e --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/12_lustrenetwork.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-lustre} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lustrenetwork.sh" >> install/12_lustrenetwork.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/13_lfsmaster.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/13_lfsmaster.sh new file mode 100755 index 000000000..5dead31c8 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/13_lfsmaster.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-lfsmaster} + +if [ ! 
-f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsmaster.sh '/dev/sdb'" >> install/13_lfsmaster.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/14_lfsoss.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/14_lfsoss.sh new file mode 100755 index 000000000..0b2f013ae --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/14_lfsoss.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-ossnode} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsoss.sh '$(head -n1 hostlists/tags/lfsmaster)' '/dev/nvme0n1'" >> install/14_lfsoss.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/15_lfshsm.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/15_lfshsm.sh new file mode 100755 index 000000000..479abe10e --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/15_lfshsm.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-lustre} + +if [ ! 
-f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfshsm.sh '$(head -n1 hostlists/tags/lustre)' 'lustretesting' 'TXOO/DhcJHGjjcNQ58f9SGCRF3RUuz3/UHaE70KbDAHhIkd38Ic5YXVlFcdxuytgk8pDg0sp5J9lCdOWr++sXA==' 'hsm' '2.10'" >> install/15_lfshsm.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/16_lfsclient.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/16_lfsclient.sh new file mode 100755 index 000000000..e6e74eb5c --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/16_lfsclient.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-lfsclient} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsclient.sh '$(head -n1 hostlists/tags/lfsmaster)' '/lustre'" >> install/16_lfsclient.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/17_lfsimport.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/17_lfsimport.sh new file mode 100755 index 000000000..c23853cd8 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/17_lfsimport.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-lfsazimport} + +if [ ! 
-f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsimport.sh 'lustretesting' 'TXOO/DhcJHGjjcNQ58f9SGCRF3RUuz3/UHaE70KbDAHhIkd38Ic5YXVlFcdxuytgk8pDg0sp5J9lCdOWr++sXA==' 'hsm' '/lustre' '2.10'" >> install/17_lfsimport.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/18_lfsloganalytics.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/18_lfsloganalytics.sh new file mode 100755 index 000000000..d2a6ff976 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/18_lfsloganalytics.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-lfsloganalytics} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsloganalytics.sh 'lfs' 'eb2e4150-e0fa-494d-8f60-291e27820eff' '0iKHSuo3C36gwxYYZSBIIVB8g5l7A1qztuF77oVwZlFV9iKqke/Jajc+qVLkt1SB7LNimpeb3Q++qerMtnZvuw=='" >> install/18_lfsloganalytics.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/19_pbsdownload.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/19_pbsdownload.sh new file mode 100755 index 000000000..9731feb81 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/19_pbsdownload.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-loginnode} + +if [ ! 
-f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; scripts/pbsdownload.sh" >> install/19_pbsdownload.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/20_pbsserver.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/20_pbsserver.sh new file mode 100755 index 000000000..0a2c0cf2d --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/20_pbsserver.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-pbsserver} + +if [ ! -f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pscp.pssh -p 50 -h hostlists/tags/$tag pbspro_19.1.1.centos7/pbspro-server-19.1.1-0.x86_64.rpm $(pwd) >> install/20_pbsserver.log 2>&1 +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/pbsserver.sh" >> install/20_pbsserver.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/21_pbsclient.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/21_pbsclient.sh new file mode 100755 index 000000000..1c354d17f --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/install/21_pbsclient.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# expecting to be in $tmp_dir +cd "$( dirname "${BASH_SOURCE[0]}" )/.." + +tag=${1:-pbsclient} + +if [ ! 
-f "hostlists/tags/$tag" ]; then + echo " Tag is not assigned to any resource (not running)" + exit 0 +fi + +if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then + echo " Tag does not contain any resources (not running)" + exit 0 +fi + +pscp.pssh -p 50 -h hostlists/tags/$tag pbspro_19.1.1.centos7/pbspro-execution-19.1.1-0.x86_64.rpm $(pwd) >> install/21_pbsclient.log 2>&1 +pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/pbsclient.sh '$(> install/21_pbsclient.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/cndefault.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/cndefault.sh new file mode 100755 index 000000000..303ebac1b --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/cndefault.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Script to be run on all compute nodes +if ! rpm -q epel-release; then + yum -y install epel-release +fi + +yum -y install git jq htop + +# change access to resource so that temp jobs can be written there +chmod 777 /mnt/resource + +# If running on Cycle +# - enable METADATA access +# - remove Jetpack convergence +# - Disable Fail2Ban service +# - Fix PBS limits +if [ -e $CYCLECLOUD_HOME/bin/jetpack ]; then + DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + $DIR/azhpc4cycle.sh enable_metada_access + $DIR/azhpc4cycle.sh disable_jetpack_converge + $DIR/azhpc4cycle.sh disable_fail2ban + $DIR/azhpc4cycle.sh fix_pbs_limits +fi diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/disable-selinux.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/disable-selinux.sh new file mode 100755 index 000000000..00c87bbf2 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/disable-selinux.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +# set to permissive for now (until reboot) +setenforce 0 +# prep to have selinux disabled after reboot +sed -i 's/SELINUX=.*$/SELINUX=disabled/g' /etc/selinux/config diff --git 
a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/installOFED.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/installOFED.sh new file mode 100755 index 000000000..c267519fc --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/installOFED.sh @@ -0,0 +1,4 @@ +#!/bin/bash +yum -y groupinstall --skip-broken "Infiniband Support" 2>/dev/null +echo "done installing Infiniband" +exit 0 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsclient.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsclient.sh new file mode 100755 index 000000000..26603bebd --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsclient.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# arg: $1 = lfsserver +# arg: $2 = mount point (default: /lustre) +master=$1 +lfs_mount=${2:-/lustre} +mkdir ~/.ssh + +cp -r /share/home/hpcuser/.ssh ~/ + +#Include the correct rdma options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab +mount -a +chmod 777 $lfs_mount diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfshsm.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfshsm.sh new file mode 100755 index 000000000..0af1fc5e2 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfshsm.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +# arg: $1 = lfsserver +# arg: $2 = storage account +# arg: $3 = storage key +# arg: $4 = storage container +# arg: $5 = lustre version (default 2.10) +master=$1 +storage_account=$2 +storage_key=$3 +storage_container=$4 +lustre_version=${5-2.10} + +# remove the patch version +ndots=${lustre_version//[^.]} +if [ "${#ndots}" = "2" ]; then + lustre_version=${lustre_version%.*} +fi + +# adding kernel module for lustre client +if [ "$lustre_version" = "2.10" ]; then + yum install -y kmod-lustre-client + weak-modules --add-kernel $(uname -r) +fi + +if ! 
rpm -q lemur-azure-hsm-agent lemur-azure-data-movers; then + yum -y install \ + https://azurehpc.azureedge.net/rpms/lemur-azure-hsm-agent-1.0.0-lustre_${lustre_version}.x86_64.rpm \ + https://azurehpc.azureedge.net/rpms/lemur-azure-data-movers-1.0.0-lustre_${lustre_version}.x86_64.rpm +fi + +mkdir -p /var/run/lhsmd +chmod 755 /var/run/lhsmd + +mkdir -p /etc/lhsmd +chmod 755 /etc/lhsmd + +cat </etc/lhsmd/agent +# Lustre NID and filesystem name for the front end filesystem, the agent will mount this +client_device="${master}@tcp:/LustreFS" + +# Do you want to use S3 and POSIX, in this example we use POSIX +enabled_plugins=["lhsm-plugin-az"] + +## Directory to look for the plugins +plugin_dir="/usr/libexec/lhsmd" + +# TBD, I used 16 +handler_count=16 + +# TBD +snapshots { + enabled = false +} +EOF +chmod 600 /etc/lhsmd/agent + +cat </etc/lhsmd/lhsm-plugin-az +az_storage_account = "$storage_account" +az_storage_key = "$storage_key" + +num_threads = 32 + +# +# One or more archive definition is required. 
+# +archive "az-blob" { + id = 1 # Must be unique to this endpoint + container = "$storage_container" # Container used for this archive + prefix = "" # Optional prefix + num_threads = 32 +} +EOF +chmod 600 /etc/lhsmd/lhsm-plugin-az + +cat </etc/systemd/system/lhsmd.service +[Unit] +Description=The lhsmd server +After=syslog.target network.target remote-fs.target nss-lookup.target + +[Service] +Type=simple +PIDFile=/run/lhsmd.pid +ExecStartPre=/bin/mkdir -p /var/run/lhsmd +ExecStart=/sbin/lhsmd -config /etc/lhsmd/agent +Restart=always + +[Install] +WantedBy=multi-user.target +EOF +chmod 600 /etc/systemd/system/lhsmd.service + +systemctl daemon-reload +systemctl enable lhsmd +systemctl start lhsmd diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsimport.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsimport.sh new file mode 100755 index 000000000..fd9fad30b --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsimport.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# arg: $1 = storage account +# arg: $2 = storage key +# arg: $3 = storage container +# arg: $3 = lfs mount +# arg: $4 = lustre mount (default=/lustre) +# arg: $5 = lustre version (default=2.10) +storage_account=$1 +storage_key=$2 +storage_container=$3 +lfs_mount=${4:-/lustre} +lustre_version=${5-2.10} + +# remove the patch version +ndots=${lustre_version//[^.]} +if [ "${#ndots}" = "2" ]; then + lustre_version=${lustre_version%.*} +fi + +if ! 
rpm -q lemur-azure-hsm-agent lemur-azure-data-movers; then + yum -y install \ + https://azurehpc.azureedge.net/rpms/lemur-azure-hsm-agent-1.0.0-lustre_${lustre_version}.x86_64.rpm \ + https://azurehpc.azureedge.net/rpms/lemur-azure-data-movers-1.0.0-lustre_${lustre_version}.x86_64.rpm +fi + +cd $lfs_mount +export STORAGE_ACCOUNT=$storage_account +export STORAGE_KEY=$storage_key +/sbin/azure-import ${storage_container} + diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsloganalytics.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsloganalytics.sh new file mode 100755 index 000000000..ce6b43f3d --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsloganalytics.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# arg: $1 = name +# arg: $2 = log analytics workspace id +# arg: $3 = log analytics key + +name=$1 +log_analytics_workspace_id=$2 +log_analytics_key=$3 + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +sed "s#__FS_NAME__#${name}#g;s#__LOG_ANALYTICS_WORKSPACE_ID__#${log_analytics_workspace_id}#g;s#__LOG_ANALYTICS_KEY__#${log_analytics_key}#g" $DIR/lfsloganalyticsd.sh.in >/usr/bin/lfsloganalyticsd.sh + +chmod +x /usr/bin/lfsloganalyticsd.sh + +cat </lib/systemd/system/lfsloganalytics.service +[Unit] +Description=Lustre logging service to Log Analytics. + +[Service] +Type=simple +ExecStart=/bin/bash /usr/bin/lfsloganalyticsd.sh +Restart=always + +[Install] +WantedBy=multi-user.target +EOF + +systemctl enable lfsloganalytics +systemctl start lfsloganalytics \ No newline at end of file diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsmaster.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsmaster.sh new file mode 100755 index 000000000..d2dcdb02e --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsmaster.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# arg: $1 = device (e.g. 
L=/dev/sdb Lv2=/dev/nvme0n1) +device=$1 + +# this will only install MDS on first node in a scaleset +echo "pssh_nodenum is $PSSH_NODENUM" + +cp -r /share/home/hpcuser/.ssh /root/ + +#Include the correct rdma options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab + mount -a + + # set up hsm + lctl set_param -P mdt.*-MDT0000.hsm_control=enabled + lctl set_param -P mdt.*-MDT0000.hsm.default_archive_id=1 + lctl set_param mdt.*-MDT0000.hsm.max_requests=128 + + # allow any user and group ids to write + lctl set_param mdt.*-MDT0000.identity_upcall=NONE + + diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsoss.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsoss.sh new file mode 100755 index 000000000..8f39aac68 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsoss.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# arg: $1 = lfsmaster +# arg: $2 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) +master=$1 +device=$2 + +cp -r /share/home/hpcuser/.ssh /root/ + +index=$(($PSSH_NODENUM + 1)) +myuser="hpcuser" + +capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") +masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) + + lnetctl net add --net o2ib --if ib0 #double check + mkfs.lustre \ + --fsname=LustreFS \ + --backfstype=ldiskfs \ + --reformat \ + --ost \ + --mgsnode="${masterib}@o2ib" \ + --index=$index \ + --mountfsoptions="errors=remount-ro" \ + $device + + +mkdir /mnt/oss +echo "$device /mnt/oss lustre noatime,nodiratime,nobarrier 0 2" >> /etc/fstab +mount -a diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsrepo.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsrepo.sh new file mode 100755 index 000000000..db1eeb165 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsrepo.sh @@ -0,0 +1,27 @@ +#!/bin/bash +lustre_version=${1-2.10} + +cat << EOF >/etc/yum.repos.d/LustrePack.repo +[lustreserver] +name=lustreserver 
+baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/server/ +enabled=1 +gpgcheck=0 + +[e2fs] +name=e2fs +baseurl=https://downloads.whamcloud.com/public/e2fsprogs/latest/el7/ +enabled=1 +gpgcheck=0 + +[lustreclient] +name=lustreclient +baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/client/ +enabled=1 +gpgcheck=0 +EOF + +#Include the correct rdma options +#cat >/etc/modprobe.d/lustre.conf<$home_root/$new_user/.ssh/config +Host * + StrictHostKeyChecking no + UserKnownHostsFile /dev/null + LogLevel ERROR +EOF + ssh-keygen -f $home_root/$new_user/.ssh/id_rsa -t rsa -N '' + # add admin user public key (the only user in /home) + cat /home/*/.ssh/id_rsa.pub >$home_root/$new_user/.ssh/authorized_keys + cat $home_root/$new_user/.ssh/id_rsa.pub >>$home_root/$new_user/.ssh/authorized_keys + chown $new_user:$new_user $home_root/$new_user/.ssh + chown $new_user:$new_user $home_root/$new_user/.ssh/* + chmod 700 $home_root/$new_user/.ssh + chmod 600 $home_root/$new_user/.ssh/id_rsa + chmod 644 $home_root/$new_user/.ssh/id_rsa.pub + chmod 644 $home_root/$new_user/.ssh/config + chmod 644 $home_root/$new_user/.ssh/authorized_keys +fi +echo "$new_user ALL=(ALL) NOPASSWD: ALL" | tee -a /etc/sudoers diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall1.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall1.sh new file mode 100755 index 000000000..c052001a0 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall1.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# jump the gun here to ensure passwordless ssh as root between all lustre nodes to faciltate node reboot +cp -r /share/home/hpcuser/.ssh ~/ + +yum -y --nogpgcheck --disablerepo=* --enablerepo=e2fs install e2fsprogs + +yum -y --nogpgcheck --disablerepo=base,extras,updates --enablerepo=lustreserver install kernel kernel-devel kernel-headers kernel-tools kernel-tools-libs 
2>/dev/null + diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall2.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall2.sh new file mode 100755 index 000000000..60f3e759e --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall2.sh @@ -0,0 +1,10 @@ +#!/bin/bash +yum -y --nogpgcheck --enablerepo=lustreserver install kmod-lustre kmod-lustre-osd-ldiskfs lustre-osd-ldiskfs-mount lustre lustre-resource-agents +modprobe -v lustre + +sed -i 's/ResourceDisk\.Format=y/ResourceDisk.Format=n/g' /etc/waagent.conf +sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf + +weak-modules --add-kernel --no-initramfs +systemctl enable lustre +umount /mnt/resource diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustrenetwork.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustrenetwork.sh new file mode 100755 index 000000000..f95d33864 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustrenetwork.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf +service waagent restart +service rdma start +modprobe lnet +lctl network configure +lnetctl net add --net o2ib --if ib0 #need this to come up every time +sleep 5 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/nfsclient.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/nfsclient.sh new file mode 100755 index 000000000..678bac4dd --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/nfsclient.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# arg: $1 = nfsserver +nfs_server=$1 +nfs_share=${2-/share} +if [ -z "$nfs_server" ]; then + echo "The nfs_server is required" + exit 1 +fi + +yum install -y nfs-utils + +mkdir -p /scratch +mkdir -p /apps +mkdir -p /data +mkdir -p /share/home +mount $nfs_server:$nfs_share/apps /apps +mount $nfs_server:$nfs_share/data /data +mount 
$nfs_server:$nfs_share/home /share/home + +chmod 777 /scratch + +cat << EOF >> /etc/fstab +$nfs_server:$nfs_share/home /share/home nfs defaults 0 0 +$nfs_server:/mnt/resource/scratch /scratch nfs defaults 0 0 +$nfs_server:$nfs_share/apps /apps nfs defaults 0 0 +$nfs_server:$nfs_share/data /data nfs defaults 0 0 +EOF + +setsebool -P use_nfs_home_dirs 1 + +mount -a + +df diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/nfsserver.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/nfsserver.sh new file mode 100755 index 000000000..14d53a4c0 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/nfsserver.sh @@ -0,0 +1,212 @@ +#!/bin/bash +if [[ $(id -u) -ne 0 ]] ; then + echo "Must be run as root" + exit 1 +fi + +# Disable requiretty to allow run sudo within scripts +sed -i -e 's/Defaults requiretty.*/ #Defaults requiretty/g' /etc/sudoers + +yum -y install epel-release +yum -y install nfs-utils nfs-utils-lib + +# Shares +NFS_MOUNT_POINT=/share +NFS_APPS=$NFS_MOUNT_POINT/apps +NFS_DATA=$NFS_MOUNT_POINT/data +NFS_HOME=$NFS_MOUNT_POINT/home +NFS_SCRATCH=/mnt/resource/scratch + +# Partitions all data disks attached to the VM +# +setup_data_disks() +{ + mountPoint="$1" + filesystem="$2" + devices="$3" + raidDevice="$4" + createdPartitions="" + numdevices=`echo $devices | wc -w` + if [ $numdevices -gt 1 ] + then + # Loop through and partition disks until not found + for disk in $devices; do + fdisk -l /dev/$disk || break + fdisk /dev/$disk << EOF +n +p +1 + + +t +fd +w +EOF + createdPartitions="$createdPartitions /dev/${disk}1" + done + else + disk=$(echo $devices | tr -d [:space:]) + echo "Warning: Only a single device to partition, $disk" + fdisk -l /dev/$disk || break + fdisk /dev/$disk << EOF +n +p +1 + + +w +EOF + createdPartitions="$createdPartitions /dev/${disk}1" + fi + + sleep 10 + + # Create RAID-0 volume + if [ -n "$createdPartitions" ]; then + devices=`echo $createdPartitions | wc -w` + if [ $numdevices -gt 
1 ] + then + mdadm --create /dev/$raidDevice --level 0 --raid-devices $devices $createdPartitions + sleep 10 + + mdadm /dev/$raidDevice + else + echo "Warning: mdadm is not called, we have one partition named, ${disk}1 for mountpoint, $mountPoint" + raidDevice=${disk}1 + fi + + if [ "$filesystem" == "xfs" ]; then + mkfs -t $filesystem /dev/$raidDevice + export xfsuuid="UUID=`blkid |grep dev/$raidDevice |cut -d " " -f 2 |cut -c 7-42`" +# echo "$xfsuuid $mountPoint $filesystem rw,noatime,attr2,inode64,nobarrier,sunit=1024,swidth=4096,nofail 0 2" >> /etc/fstab + echo "$xfsuuid $mountPoint $filesystem rw,noatime,attr2,inode64,nobarrier,nofail 0 2" >> /etc/fstab + else + mkfs.ext4 -i 2048 -I 512 -J size=400 -Odir_index,filetype /dev/$raidDevice + sleep 5 + tune2fs -o user_xattr /dev/$raidDevice + export ext4uuid="UUID=`blkid |grep dev/$raidDevice |cut -d " " -f 2 |cut -c 7-42`" + echo "$ext4uuid $mountPoint $filesystem noatime,nodiratime,nobarrier,nofail 0 2" >> /etc/fstab + fi + + sleep 10 + mount -a + fi +} + +setup_single_disk() +{ + mountPoint="$1" + filesystem="$2" + device="$3" + + fdisk -l /dev/$device || break + fdisk /dev/$device << EOF +n +p +1 + + +p +w +EOF + + if [ "$filesystem" == "xfs" ]; then + mkfs -t $filesystem /dev/$device + echo "/dev/$device $mountPoint $filesystem rw,noatime,attr2,inode64,nobarrier,nofail 0 2" >> /etc/fstab + else + mkfs.ext4 -F -i 2048 -I 512 -J size=400 -Odir_index,filetype /dev/$device + sleep 5 + tune2fs -o user_xattr /dev/$device + echo "/dev/$device $mountPoint $filesystem noatime,nodiratime,nobarrier,nofail 0 2" >> /etc/fstab + fi + + sleep 10 + + mount /dev/$device $mountPoint +} + +setup_disks() +{ + # Dump the current disk config for debugging + fdisk -l + + # Dump the scsi config + lsscsi + + # Get the root/OS disk so we know which device it uses and can ignore it later + rootDevice=`mount | grep "on / type" | awk '{print $1}' | sed 's/[0-9]//g'` + + # Get the TMP disk so we know which device and can ignore it later + 
tmpDevice=`mount | grep "on /mnt/resource type" | awk '{print $1}' | sed 's/[0-9]//g'` + + # Get the data disk sizes from fdisk, we ignore the disks above + dataDiskSize=`fdisk -l | grep '^Disk /dev/' | grep -v $rootDevice | grep -v $tmpDevice | awk '{print $3}' | sort -n -r | tail -1` + + # Compute number of disks + nbDisks=`fdisk -l | grep '^Disk /dev/' | grep -v $rootDevice | grep -v $tmpDevice | wc -l` + echo "nbDisks=$nbDisks" + + dataDevices="`fdisk -l | grep '^Disk /dev/' | grep $dataDiskSize | awk '{print $2}' | awk -F: '{print $1}' | sort | head -$nbDisks | tr '\n' ' ' | sed 's|/dev/||g'`" + + mkdir -p $NFS_MOUNT_POINT + + + if [ "$nbDisks" -eq "1" ]; then + setup_single_disk $NFS_MOUNT_POINT "ext4" "$dataDevices" + elif [ "$nbDisks" -gt "1" ]; then + setup_data_disks $NFS_MOUNT_POINT "xfs" "$dataDevices" "md10" + fi + + mkdir -p $NFS_APPS + mkdir -p $NFS_DATA + mkdir -p $NFS_HOME + mkdir -p $NFS_SCRATCH + chmod 777 $NFS_APPS + chmod 777 $NFS_DATA + chmod 777 $NFS_HOME + chmod 777 $NFS_SCRATCH + + ln -s $NFS_SCRATCH /scratch + + echo "$NFS_APPS *(rw,sync,no_root_squash)" >> /etc/exports + echo "$NFS_DATA *(rw,sync,no_root_squash)" >> /etc/exports + echo "$NFS_HOME *(rw,sync,no_root_squash)" >> /etc/exports + echo "$NFS_SCRATCH *(rw,sync,no_root_squash)" >> /etc/exports + + exportfs + exportfs -a + exportfs +} + +tune_nfs() +{ + cores=$(grep processor /proc/cpuinfo | wc -l) + nfs_proc=$(($cores * 4)) + replace="s/#RPCNFSDCOUNT=16/RPCNFSDCOUNT=$nfs_proc/g" + sed -i -e "$replace" /etc/sysconfig/nfs + + grep RPCNFSDCOUNT /etc/sysconfig/nfs +} + +systemctl enable rpcbind +systemctl enable nfs-server +systemctl enable nfs-lock +systemctl enable nfs-idmap +systemctl enable nfs + +systemctl start rpcbind +systemctl start nfs-server +systemctl start nfs-lock +systemctl start nfs-idmap +systemctl start nfs + +setup_disks +tune_nfs +systemctl restart nfs-server + +ln -s /share/apps /apps +ln -s /share/data /data + +df + + diff --git 
a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsclient.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsclient.sh new file mode 100755 index 000000000..fd037df76 --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsclient.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -e +# arg: $1 = pbs_server +pbs_server=$1 + +if [ "$(rpm -qa pbspro-execution)" = "" ];then + yum install -y pbspro-execution-19.1.1-0.x86_64.rpm + + sed -i "s/CHANGE_THIS_TO_PBS_PRO_SERVER_HOSTNAME/${pbs_server}/g" /etc/pbs.conf + sed -i "s/CHANGE_THIS_TO_PBS_PRO_SERVER_HOSTNAME/${pbs_server}/g" /var/spool/pbs/mom_priv/config + sed -i "s/^if /#if /g" /opt/pbs/lib/init.d/limits.pbs_mom + sed -i "s/^fi/#fi /g" /opt/pbs/lib/init.d/limits.pbs_mom + systemctl enable pbs + systemctl start pbs + + # Retrieve the VMSS name to be used as the pool name for multiple VMSS support + poolName=$(curl -s -H Metadata:true "http://169.254.169.254/metadata/instance?api-version=2018-10-01" | jq -r '.compute.vmScaleSetName') + /opt/pbs/bin/qmgr -c "c n $(hostname) resources_available.pool_name='$poolName'" + +else + echo "PBS client was already installed" +fi diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsdownload.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsdownload.sh new file mode 100755 index 000000000..b4317516b --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsdownload.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +filename=pbspro_19.1.1.centos7.zip + +if [ ! 
-f "$filename" ];then + wget -q https://github.com/PBSPro/pbspro/releases/download/v19.1.1/$filename + unzip $filename +fi + diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsserver.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsserver.sh new file mode 100755 index 000000000..14ee54d1a --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsserver.sh @@ -0,0 +1,19 @@ +#!/bin/bash +set -e +admin_user=$(whoami) + +if [ "$(rpm -qa pbspro-server)" = "" ];then + yum install -y pbspro-server-19.1.1-0.x86_64.rpm + systemctl enable pbs + systemctl start pbs + /opt/pbs/bin/qmgr -c "s s managers += ${admin_user}@*" + /opt/pbs/bin/qmgr -c 's s flatuid=t' + /opt/pbs/bin/qmgr -c 's s job_history_enable=t' + /opt/pbs/bin/qmgr -c 'c r pool_name type=string,flag=h' + + # Update the sched_config file to schedule jobs that request pool_name + sed -i "s/^resources: \"ncpus,/resources: \"ncpus, pool_name,/g" /var/spool/pbs/sched_priv/sched_config + systemctl restart pbs +else + echo "PBSPro already installed" +fi diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/rebootlustre.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/rebootlustre.sh new file mode 100755 index 000000000..2d33c180b --- /dev/null +++ b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/rebootlustre.sh @@ -0,0 +1,16 @@ +#!/bin/bash +vmlist=$1 +osscount=$2 +totalcount=$((osscount+2)) +index=0 +#prep headnode +cp -r /share/home/hpcuser/.ssh /root/ +echo "vmlist is ${vmlist[@]}" + +#needs to be done sequentially +for vmname in ${vmlist[@]}; do + echo "Rebooting $vmname" + ssh hpcuser@${vmname} "sudo reboot 2>/dev/null; exit 2>/dev/null" 2>/dev/null + index=$((index+1)) +done +exit 0 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/waitforreboot.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/waitforreboot.sh new file mode 100755 index 000000000..73411ca61 --- /dev/null +++ 
b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/waitforreboot.sh @@ -0,0 +1,2 @@ +#!/bin/bash +sleep 180 #enough time for node reboot to continue process diff --git a/experimental/lustre_rdma_avs/config.json b/experimental/lustre_rdma_avs/config.json new file mode 100644 index 000000000..d241c4631 --- /dev/null +++ b/experimental/lustre_rdma_avs/config.json @@ -0,0 +1,278 @@ +{ + "location": "variables.location", + "resource_group": "variables.resource_group", + "install_from": "headnode", + "admin_user": "hpcadmin", + "vnet": { + "name": "hpcvnet", + "address_prefix": "10.2.0.0/20", + "subnets": { + "compute": "10.2.0.0/22", + "storage": "10.2.4.0/24" + } + }, + "variables": { + "location": "", + "image": "OpenLogic:CentOS:7.6:latest", + "lustreimage": "OpenLogic:CentOS:7.6:latest", + "hpcimage": "OpenLogic:CentOS:7.6:latest", + "compute_instances": 2, + "lustre_instances": 2, + "low_priority": false, + "storage_account": "", + "storage_key": "sakey.{{variables.storage_account}}", + "storage_container": "", + "log_analytics_lfs_name": "", + "la_resourcegroup": "", + "la_name": "", + "log_analytics_workspace": "laworkspace.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "log_analytics_key": "lakey.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "lustre_version": "2.10", + "lustre_avset": "{{variables.resource_group}}avset", + "lustre_mount": "/lustre" + }, + "resources": { + "headnode": { + "type": "vm", + "vm_type": "Standard_HB120rs_v2", + "accelerated_networking": false, + "public_ip": true, + "image": "variables.image", + "subnet": "compute", + "tags": [ + "disable-selinux", + "cndefault", + "lfsrepo", + "lfsclient", + "lfsazimport", + "localuser", + "pbsserver", + "loginnode", + "rebootlustre", + "nfsserver", + "allnodes" + ] + }, + "compute": { + "type": "vm", + "vm_type": "Standard_HB120rs_v2", + "instances": "variables.compute_instances", + "availability_set": "variables.lustre_avset", + "low_priority": 
"variables.low_priority", + "accelerated_networking": false, + "image": "variables.hpcimage", + "subnet": "storage", + "tags": [ + "cndefault", + "lfsrepo", + "lfsclient", + "localuser", + "pbsclient", + "nfsclient", + "disable-selinux", + "allnodes" + ] + }, + "lfsmaster": { + "type": "vm", + "vm_type": "Standard_HB120rs_v2", + "availability_set": "variables.lustre_avset", + "accelerated_networking": false, + "image": "variables.lustreimage", + "subnet": "storage", + "tags": [ + "cndefault", + "lustre", + "lfsmaster", + "lfsrepo", + "localuser", + "nfsclient", + "disable-selinux", + "lfsloganalytics", + "allnodes" + ] + }, + "lustre": { + "type": "vm", + "vm_type": "Standard_HB120rs_v2", + "instances": "variables.lustre_instances", + "availability_set": "variables.lustre_avset", + "accelerated_networking": false, + "image": "variables.lustreimage", + "subnet": "storage", + "tags": [ + "cndefault", + "lfsrepo", + "localuser", + "nfsclient", + "lustre", + "ossnode", + "disable-selinux", + "lfsloganalytics", + "allnodes" + ] + } + }, + "install": [ + { + "script": "disable-selinux.sh", + "tag": "disable-selinux", + "sudo": true + }, + { + "script": "cndefault.sh", + "tag": "cndefault", + "sudo": true + }, + { + "script": "nfsserver.sh", + "tag": "nfsserver", + "sudo": true + }, + { + "script": "nfsclient.sh", + "args": [ + "$( Note: The HC nodes are used for the cluster, although this node type may be easily changed by use of the vm_type variable for lustre inside config.json. 
+ +The configuration file requires the following variables to be set: + +| Variable                | Description                                  | +|-------------------------|----------------------------------------------| +| location                | The location of the project                  | +| resource_group          | The resource group for the project           | +| storage_account         | The storage account for HSM                  | +| storage_key             | The storage key for HSM                      | +| storage_container       | The container to use for HSM                 | +| log_analytics_lfs_name  | The lustre filesystem name for Log Analytics | +| la_resourcegroup        | The resource group for Log Analytics         | +| la_name                 | The Log Analytics Workspace name             | + +> Note: you can remove log analytics and/or HSM from the config file if not required. + +> Note: Key Vault should be used for the keys to keep them out of the config files. diff --git a/experimental/lustre_rdma_avs/scripts/installOFED.sh b/experimental/lustre_rdma_avs/scripts/installOFED.sh new file mode 100755 index 000000000..c267519fc --- /dev/null +++ b/experimental/lustre_rdma_avs/scripts/installOFED.sh @@ -0,0 +1,4 @@ +#!/bin/bash +yum -y groupinstall --skip-broken "Infiniband Support" 2>/dev/null +echo "done installing Infiniband" +exit 0 diff --git a/experimental/lustre_rdma_avs/scripts/lfsclient.sh b/experimental/lustre_rdma_avs/scripts/lfsclient.sh new file mode 100755 index 000000000..5fc8f9a62 --- /dev/null +++ b/experimental/lustre_rdma_avs/scripts/lfsclient.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# arg: $1 = lfsserver +# arg: $2 = mount point (default: /lustre) +master=$1 +lfs_mount=${2:-/lustre} + +cp -r /share/home/hpcuser/.ssh /root/ + +#Include the correct rdma options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab +mount -a +chmod 777 $lfs_mount diff --git a/experimental/lustre_rdma_avs/scripts/lfsmaster.sh b/experimental/lustre_rdma_avs/scripts/lfsmaster.sh new file mode 100755 index 000000000..d2dcdb02e --- /dev/null +++ b/experimental/lustre_rdma_avs/scripts/lfsmaster.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# arg: $1 = device (e.g. 
L=/dev/sdb Lv2=/dev/nvme0n1) +device=$1 + +# this will only install MDS on first node in a scaleset +echo "pssh_nodenum is $PSSH_NODENUM" + +cp -r /share/home/hpcuser/.ssh /root/ + +#Include the correct rdma options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab + mount -a + + # set up hsm + lctl set_param -P mdt.*-MDT0000.hsm_control=enabled + lctl set_param -P mdt.*-MDT0000.hsm.default_archive_id=1 + lctl set_param mdt.*-MDT0000.hsm.max_requests=128 + + # allow any user and group ids to write + lctl set_param mdt.*-MDT0000.identity_upcall=NONE + + diff --git a/experimental/lustre_rdma_avs/scripts/lfsoss.sh b/experimental/lustre_rdma_avs/scripts/lfsoss.sh new file mode 100755 index 000000000..8f39aac68 --- /dev/null +++ b/experimental/lustre_rdma_avs/scripts/lfsoss.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# arg: $1 = lfsmaster +# arg: $2 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) +master=$1 +device=$2 + +cp -r /share/home/hpcuser/.ssh /root/ + +index=$(($PSSH_NODENUM + 1)) +myuser="hpcuser" + +capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") +masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) + + lnetctl net add --net o2ib --if ib0 #double check + mkfs.lustre \ + --fsname=LustreFS \ + --backfstype=ldiskfs \ + --reformat \ + --ost \ + --mgsnode="${masterib}@o2ib" \ + --index=$index \ + --mountfsoptions="errors=remount-ro" \ + $device + + +mkdir /mnt/oss +echo "$device /mnt/oss lustre noatime,nodiratime,nobarrier 0 2" >> /etc/fstab +mount -a diff --git a/experimental/lustre_rdma_avs/scripts/lfsrepo.sh b/experimental/lustre_rdma_avs/scripts/lfsrepo.sh new file mode 100755 index 000000000..db1eeb165 --- /dev/null +++ b/experimental/lustre_rdma_avs/scripts/lfsrepo.sh @@ -0,0 +1,27 @@ +#!/bin/bash +lustre_version=${1-2.10} + +cat << EOF >/etc/yum.repos.d/LustrePack.repo +[lustreserver] +name=lustreserver +baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/server/ +enabled=1 +gpgcheck=0 + +[e2fs] 
+name=e2fs +baseurl=https://downloads.whamcloud.com/public/e2fsprogs/latest/el7/ +enabled=1 +gpgcheck=0 + +[lustreclient] +name=lustreclient +baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/client/ +enabled=1 +gpgcheck=0 +EOF + +#Include the correct rdma options +#cat >/etc/modprobe.d/lustre.conf</dev/null + diff --git a/experimental/lustre_rdma_avs/scripts/lustreinstall2.sh b/experimental/lustre_rdma_avs/scripts/lustreinstall2.sh new file mode 100755 index 000000000..60f3e759e --- /dev/null +++ b/experimental/lustre_rdma_avs/scripts/lustreinstall2.sh @@ -0,0 +1,10 @@ +#!/bin/bash +yum -y --nogpgcheck --enablerepo=lustreserver install kmod-lustre kmod-lustre-osd-ldiskfs lustre-osd-ldiskfs-mount lustre lustre-resource-agents +modprobe -v lustre + +sed -i 's/ResourceDisk\.Format=y/ResourceDisk.Format=n/g' /etc/waagent.conf +sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf + +weak-modules --add-kernel --no-initramfs +systemctl enable lustre +umount /mnt/resource diff --git a/experimental/lustre_rdma_avs/scripts/lustrenetwork.sh b/experimental/lustre_rdma_avs/scripts/lustrenetwork.sh new file mode 100755 index 000000000..f95d33864 --- /dev/null +++ b/experimental/lustre_rdma_avs/scripts/lustrenetwork.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf +service waagent restart +service rdma start +modprobe lnet +lctl network configure +lnetctl net add --net o2ib --if ib0 #need this to come up every time +sleep 5 diff --git a/experimental/lustre_rdma_avs/scripts/rebootlustre.sh b/experimental/lustre_rdma_avs/scripts/rebootlustre.sh new file mode 100755 index 000000000..2d33c180b --- /dev/null +++ b/experimental/lustre_rdma_avs/scripts/rebootlustre.sh @@ -0,0 +1,16 @@ +#!/bin/bash +vmlist=$1 +osscount=$2 +totalcount=$((osscount+2)) +index=0 +#prep headnode +cp -r /share/home/hpcuser/.ssh /root/ +echo "vmlist is ${vmlist[@]}" + +#needs to be done sequentially +for 
vmname in ${vmlist[@]}; do + echo "Rebooting $vmname" + ssh hpcuser@${vmname} "sudo reboot 2>/dev/null; exit 2>/dev/null" 2>/dev/null + index=$((index+1)) +done +exit 0 diff --git a/experimental/lustre_rdma_avs/writeup b/experimental/lustre_rdma_avs/writeup new file mode 100644 index 000000000..3eb58e3c6 --- /dev/null +++ b/experimental/lustre_rdma_avs/writeup @@ -0,0 +1,20 @@ +- lustre-ipoib - This is a created implementation of Lustre using ip over infiniband (IPoIB) +- lustre-rdma - This is a created implementation of Lustre using native Remote Direct Memory Access (RDMA) + +Changes to files to enable Infiniband functionality: +lfsmaster.sh +lfsoss.sh +lfsclient.sh +lfsrepo.sh +lfspkgs.sh + +Addition for the installation of new Mellanox OFED (MOFED) for the Lustre kernel : installMOFED.sh + +Addition for correct drives placement of OSSes : installdrives.sh +*installdrives.sh takes about 15 minutes to run so please either remote this entity, or wait it out. + +Additions for correct Lustre kernel : +lustreinstall1.sh +lustreinstall2.sh + +Addition for pause after MDS/OSS reboot : waitforreboot.sh diff --git a/experimental/lustre_rdma_nvmedrives/config.json b/experimental/lustre_rdma_nvmedrives/config.json new file mode 100644 index 000000000..dd2c4d6a9 --- /dev/null +++ b/experimental/lustre_rdma_nvmedrives/config.json @@ -0,0 +1,241 @@ +{ + "location": "variables.location", + "resource_group": "variables.resource_group", + "install_from": "headnode", + "admin_user": "hpcadmin", + "vnet": { + "name": "hpcvnet", + "address_prefix": "10.2.0.0/20", + "subnets": { + "compute": "10.2.0.0/22", + "storage": "10.2.4.0/24" + } + }, + "variables": { + "location": "", + "resource_group": "", + "image": "OpenLogic:CentOS:7.6:latest", + "lustreimage": "OpenLogic:CentOS:7.6:latest", + "ossnum": 4, + "low_priority": true, + "storage_account": "", + "storage_key": "sakey.{{variables.storage_account}}", + "storage_container": "", + "log_analytics_lfs_name": "", + 
"la_resourcegroup": "", + "la_name": "", + "log_analytics_workspace": "laworkspace.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "log_analytics_key": "lakey.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "lustre_version": "2.10", + "lustre_mount": "/lustre" + }, + "resources": { + "headnode": { + "type": "vm", + "vm_type": "Standard_HB60rs", + "accelerated_networking": false, + "public_ip": true, + "image": "variables.image", + "subnet": "compute", + "tags": [ + "disable-selinux", + "cndefault", + "lfsrepo", + "rebootlustre", + "lfsclient", + "lfsazimport", + "localuser", + "pbsserver", + "allnodes", + "loginnode", + "nfsserver" + ] + }, + "lustre": { + "type": "vmss", + "vm_type": "Standard_HB120rs_v2", + "instances": "9", + "accelerated_networking": false, + "image": "variables.lustreimage", + "subnet": "storage", + "tags": [ + "cndefault", + "lustre[0:5]", + "osses[1:5]", + "lfsrepo", + "lfsclient[5:9]", + "localuser", + "pbsclient[5:9]", + "nfsclient", + "allnodes", + "disable-selinux", + "lfsloganalytics" + ] + } + }, + "install": [ + { + "script": "disable-selinux.sh", + "tag": "disable-selinux", + "sudo": true + }, + { + "script": "cndefault.sh", + "tag": "cndefault", + "sudo": true + }, + { + "script": "nfsserver.sh", + "tag": "nfsserver", + "sudo": true + }, + { + "script": "nfsclient.sh", + "args": [ + "$(>>>>>> 9aba5d253a4a5a012d9d828c45d3110d9f5164df + }, + { + "script": "installOFED.sh", + "tag": "allnodes", + "sudo": true + }, + { + "script": "lustreinstall2.sh", + "tag": "lustre", + "sudo": true + }, + { + "script": "lustrenetwork.sh", + "tag": "allnodes", + "sudo": true + }, + { + "script": "lfsmaster.sh", + "tag": "lustre", + "args": [ + "/dev/sdb" + ], + "sudo": true + }, + { + "script": "lfsoss.sh", + "args": [ + "$(head -n1 hostlists/tags/lustre)", + "/dev/nvme0n1" + ], + "tag": "lustre", + "sudo": true + }, + { + "script": "lfshsm.sh", + "args": [ + "$(head -n1 hostlists/tags/lustre)", + "variables.storage_account", + 
"variables.storage_key", + "variables.storage_container", + "variables.lustre_version" + ], + "tag": "lustre", + "sudo": true + }, + { + "script": "lfsclient.sh", + "args": [ + "$(head -n1 hostlists/tags/lustre)", + "variables.lustre_mount" + ], + "tag": "lfsclient", + "sudo": true + }, + { + "script": "lfsimport.sh", + "args": [ + "variables.storage_account", + "variables.storage_key", + "variables.storage_container", + "variables.lustre_mount", + "variables.lustre_version" + ], + "tag": "lfsazimport", + "sudo": true + }, + { + "script": "lfsloganalytics.sh", + "args": [ + "variables.log_analytics_lfs_name", + "variables.log_analytics_workspace", + "variables.log_analytics_key" + ], + "tag": "lfsloganalytics", + "sudo": true + }, + { + "script": "pbsdownload.sh", + "tag": "loginnode", + "sudo": false + }, + { + "script": "pbsserver.sh", + "copy": [ + "pbspro_19.1.1.centos7/pbspro-server-19.1.1-0.x86_64.rpm" + ], + "tag": "pbsserver", + "sudo": true + }, + { + "script": "pbsclient.sh", + "args": [ + "$( Note: The HC nodes are used for the cluster, although this node type may be easily changed by use of the vm_type variable for lustre inside config.json. + +The configuration file requires the following variables to be set: + +| Variable | Description | +|-------------------------|----------------------------------------------| +| location | The location (Azure region) for the project | +| resource_group | The resource group for the project | +| storage_account | The storage account for HSM | +| storage_key | The storage key for HSM | +| storage_container | The container to use for HSM | +| log_analytics_lfs_name | The lustre filesystem name for Log Analytics | +| la_resourcegroup | The resource group for Log Analytics | +| la_name | The Log Analytics Workspace name | + +> Note: you can remove log anaytics and/or HSM from the config file if not required. + +> Note: Key Vault should be used for the keys to keep them out of the config files. 
diff --git a/experimental/lustre_rdma_nvmedrives/scripts/installOFED.sh b/experimental/lustre_rdma_nvmedrives/scripts/installOFED.sh new file mode 100755 index 000000000..c267519fc --- /dev/null +++ b/experimental/lustre_rdma_nvmedrives/scripts/installOFED.sh @@ -0,0 +1,4 @@ +#!/bin/bash +yum -y groupinstall --skip-broken "Infiniband Support" 2>/dev/null +echo "done installing Infiniband" +exit 0 diff --git a/experimental/lustre_rdma_nvmedrives/scripts/lfsclient.sh b/experimental/lustre_rdma_nvmedrives/scripts/lfsclient.sh new file mode 100755 index 000000000..0a3f302fc --- /dev/null +++ b/experimental/lustre_rdma_nvmedrives/scripts/lfsclient.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# arg: $1 = lfsserver +# arg: $2 = mount point (default: /lustre) +master=$1 +lfs_mount=${2:-/lustre} +mkdir ~/.ssh + +cp -r /share/home/hpcuser/.ssh ~/ + +#Include the correct rdma options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab +mount -a +chmod 777 $lfs_mount diff --git a/experimental/lustre_rdma_nvmedrives/scripts/lfsmaster.sh b/experimental/lustre_rdma_nvmedrives/scripts/lfsmaster.sh new file mode 100755 index 000000000..1869a1f71 --- /dev/null +++ b/experimental/lustre_rdma_nvmedrives/scripts/lfsmaster.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# arg: $1 = device (e.g. 
L=/dev/sdb Lv2=/dev/nvme0n1) +device=$1 + +# this will only install MDS on first node in a scaleset +echo "pssh_nodenum is $PSSH_NODENUM" + +cp -r /share/home/hpcuser/.ssh /root/ + +#Include the correct rdma options +cat >/etc/modprobe.d/lustre.conf<> /etc/fstab + mount -a + + # set up hsm + lctl set_param -P mdt.*-MDT0000.hsm_control=enabled + lctl set_param -P mdt.*-MDT0000.hsm.default_archive_id=1 + lctl set_param mdt.*-MDT0000.hsm.max_requests=128 + + # allow any user and group ids to write + lctl set_param mdt.*-MDT0000.identity_upcall=NONE + +fi + diff --git a/experimental/lustre_rdma_nvmedrives/scripts/lfsoss.sh b/experimental/lustre_rdma_nvmedrives/scripts/lfsoss.sh new file mode 100755 index 000000000..ada2bb8c7 --- /dev/null +++ b/experimental/lustre_rdma_nvmedrives/scripts/lfsoss.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# arg: $1 = lfsmaster +# arg: $2 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) +master=$1 +device=$2 + +cp -r /share/home/hpcuser/.ssh /root/ + +index=$(($PSSH_NODENUM + 1)) +myuser="hpcuser" + +capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") +masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) + +if [ "$PSSH_NODENUM" != "0" ]; then + lnetctl net add --net o2ib --if ib0 #double check + mkfs.lustre \ + --fsname=LustreFS \ + --backfstype=ldiskfs \ + --reformat \ + --ost \ + --mgsnode="${masterib}@o2ib" \ + --index=$index \ + --mountfsoptions="errors=remount-ro" \ + $device + + +mkdir /mnt/oss +echo "$device /mnt/oss lustre noatime,nodiratime,nobarrier 0 2" >> /etc/fstab +mount -a +fi diff --git a/experimental/lustre_rdma_nvmedrives/scripts/lfsrepo.sh b/experimental/lustre_rdma_nvmedrives/scripts/lfsrepo.sh new file mode 100755 index 000000000..db1eeb165 --- /dev/null +++ b/experimental/lustre_rdma_nvmedrives/scripts/lfsrepo.sh @@ -0,0 +1,27 @@ +#!/bin/bash +lustre_version=${1-2.10} + +cat << EOF >/etc/yum.repos.d/LustrePack.repo +[lustreserver] +name=lustreserver 
+baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/server/ +enabled=1 +gpgcheck=0 + +[e2fs] +name=e2fs +baseurl=https://downloads.whamcloud.com/public/e2fsprogs/latest/el7/ +enabled=1 +gpgcheck=0 + +[lustreclient] +name=lustreclient +baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/client/ +enabled=1 +gpgcheck=0 +EOF + +#Include the correct rdma options +#cat >/etc/modprobe.d/lustre.conf</dev/null + diff --git a/experimental/lustre_rdma_nvmedrives/scripts/lustreinstall2.sh b/experimental/lustre_rdma_nvmedrives/scripts/lustreinstall2.sh new file mode 100755 index 000000000..60f3e759e --- /dev/null +++ b/experimental/lustre_rdma_nvmedrives/scripts/lustreinstall2.sh @@ -0,0 +1,10 @@ +#!/bin/bash +yum -y --nogpgcheck --enablerepo=lustreserver install kmod-lustre kmod-lustre-osd-ldiskfs lustre-osd-ldiskfs-mount lustre lustre-resource-agents +modprobe -v lustre + +sed -i 's/ResourceDisk\.Format=y/ResourceDisk.Format=n/g' /etc/waagent.conf +sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf + +weak-modules --add-kernel --no-initramfs +systemctl enable lustre +umount /mnt/resource diff --git a/experimental/lustre_rdma_nvmedrives/scripts/lustrenetwork.sh b/experimental/lustre_rdma_nvmedrives/scripts/lustrenetwork.sh new file mode 100755 index 000000000..f95d33864 --- /dev/null +++ b/experimental/lustre_rdma_nvmedrives/scripts/lustrenetwork.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf +service waagent restart +service rdma start +modprobe lnet +lctl network configure +lnetctl net add --net o2ib --if ib0 #need this to come up every time +sleep 5 diff --git a/experimental/lustre_rdma_nvmedrives/scripts/rebootlustre.sh b/experimental/lustre_rdma_nvmedrives/scripts/rebootlustre.sh new file mode 100755 index 000000000..9d1bf38c7 --- /dev/null +++ b/experimental/lustre_rdma_nvmedrives/scripts/rebootlustre.sh @@ -0,0 +1,19 @@ 
+#!/bin/bash +vmlist=$1 +ossnum=$2 + +totalcount=$(($ossnum+2)) +index=0 + +#prep headnode +cp -r /share/home/hpcuser/.ssh /root/ + +#needs to be done sequentially +for vmname in ${vmlist[@]}; do + if [ $index -lt $totalcount ] ; then + echo "Rebooting $vmname" + ssh hpcuser@${vmname} "sudo reboot 2>/dev/null; exit 2>/dev/null" 2>/dev/null + fi +done +exit 0 # to ensure no errors are thrown + diff --git a/experimental/lustre_rdma_nvmedrives/writeup b/experimental/lustre_rdma_nvmedrives/writeup new file mode 100644 index 000000000..809ec71d3 --- /dev/null +++ b/experimental/lustre_rdma_nvmedrives/writeup @@ -0,0 +1,17 @@ +- lustre-ipoib - This is a created implementation of Lustre using ip over infiniband (IPoIB) +- lustre-rdma - This is a created implementation of Lustre using native Remote Direct Memory Access (RDMA) + +Changes to files to enable Infiniband functionality: +lfsmaster.sh +lfsoss.sh +lfsclient.sh +lfsrepo.sh +lfspkgs.sh + +Addition for the installation of new OFED : installOFED.sh + +Addition for correct Lustre kernel : lustreinstall1.sh +Lustre packages : lustreinstall2.sh + +Addition for rebooting of Lustre MDS/OSS: rebootlustre.sh +Addition for pause after MDS/OSS reboot : waitforreboot.sh From f1bc9a24cd8974661b556f94914df75799b6567c Mon Sep 17 00:00:00 2001 From: Narjit Chadha Date: Thu, 2 Jul 2020 12:29:39 -0500 Subject: [PATCH 23/36] Remove duplicated directory for lustre_ipoib --- examples/lustre_ipoib/config.json | 224 ------------------ examples/lustre_ipoib/readme.md | 37 --- .../lustre_ipoib/scripts/installdrives.sh | 33 --- examples/lustre_ipoib/scripts/lfsclient.sh | 44 ---- examples/lustre_ipoib/scripts/lfsmaster.sh | 32 --- examples/lustre_ipoib/scripts/lfsoss.sh | 38 --- examples/lustre_ipoib/scripts/lfspkgs.sh | 11 - examples/lustre_ipoib_nvmedrives/config.json | 205 ---------------- examples/lustre_ipoib_nvmedrives/readme.md | 35 --- .../scripts/lfsclient.sh | 44 ---- .../scripts/lfsmaster.sh | 32 --- 
.../lustre_ipoib_nvmedrives/scripts/lfsoss.sh | 38 --- .../scripts/lfspkgs.sh | 11 - .../scripts/waitforreboot.sh | 2 - examples/lustre_rdma_avs/config.json | 65 ++--- examples/lustre_rdma_avs/scripts/lfsclient.sh | 13 +- examples/lustre_rdma_avs/writeuplustreipoib | 11 - 17 files changed, 46 insertions(+), 829 deletions(-) delete mode 100644 examples/lustre_ipoib/config.json delete mode 100644 examples/lustre_ipoib/readme.md delete mode 100755 examples/lustre_ipoib/scripts/installdrives.sh delete mode 100755 examples/lustre_ipoib/scripts/lfsclient.sh delete mode 100755 examples/lustre_ipoib/scripts/lfsmaster.sh delete mode 100755 examples/lustre_ipoib/scripts/lfsoss.sh delete mode 100755 examples/lustre_ipoib/scripts/lfspkgs.sh delete mode 100644 examples/lustre_ipoib_nvmedrives/config.json delete mode 100644 examples/lustre_ipoib_nvmedrives/readme.md delete mode 100755 examples/lustre_ipoib_nvmedrives/scripts/lfsclient.sh delete mode 100755 examples/lustre_ipoib_nvmedrives/scripts/lfsmaster.sh delete mode 100755 examples/lustre_ipoib_nvmedrives/scripts/lfsoss.sh delete mode 100755 examples/lustre_ipoib_nvmedrives/scripts/lfspkgs.sh delete mode 100755 examples/lustre_ipoib_nvmedrives/scripts/waitforreboot.sh delete mode 100644 examples/lustre_rdma_avs/writeuplustreipoib diff --git a/examples/lustre_ipoib/config.json b/examples/lustre_ipoib/config.json deleted file mode 100644 index 0127dc390..000000000 --- a/examples/lustre_ipoib/config.json +++ /dev/null @@ -1,224 +0,0 @@ -{ - "location": "variables.location", - "resource_group": "variables.resource_group", - "install_from": "headnode", - "admin_user": "hpcadmin", - "vnet": { - "name": "hpcvnet", - "address_prefix": "10.2.0.0/20", - "subnets": { - "compute": "10.2.0.0/22", - "storage": "10.2.4.0/24" - } - }, - "variables": { - "location": "", - "resource_group": "", - "image": "OpenLogic:CentOS-HPC:7.6:latest", - "lustreimage": "OpenLogic:CentOS-HPC:7.6:latest", - "drivenum": 4, - "ossnum": 4, - "low_priority": 
true, - "storage_account": "", - "storage_key": "sakey.{{variables.storage_account}}", - "storage_container": "", - "log_analytics_lfs_name": "", - "la_resourcegroup": "", - "la_name": "", - "log_analytics_workspace": "laworkspace.{{variables.la_resourcegroup}}.{{variables.la_name}}", - "log_analytics_key": "lakey.{{variables.la_resourcegroup}}.{{variables.la_name}}", - "lustre_version": "2.10", - "lustre_mount": "/lustre" - }, - "resources": { - "headnode": { - "type": "vm", - "vm_type": "Standard_HC44rs", - "accelerated_networking": false, - "public_ip": true, - "image": "variables.image", - "subnet": "compute", - "tags": [ - "disable-selinux", - "cndefault", - "lfsrepo", - "lfsclient", - "lfsazimport", - "localuser", - "pbsserver", - "loginnode", - "nfsserver" - ] - }, - "lustre": { - "type": "vmss", - "vm_type": "Standard_HC44rs", - "instances": "9", - "accelerated_networking": false, - "image": "variables.lustreimage", - "subnet": "storage", - "tags": [ - "cndefault", - "lustre[0:5]", - "osses[1:5]", - "lfsrepo", - "lfsclient[5:9]", - "localuser", - "pbsclient[5:9]", - "nfsclient", - "disable-selinux", - "lfsloganalytics" - ] - } - }, - "install": [ - { - "script": "disable-selinux.sh", - "tag": "disable-selinux", - "sudo": true - }, - { - "script": "cndefault.sh", - "tag": "cndefault", - "sudo": true - }, - { - "script": "nfsserver.sh", - "tag": "nfsserver", - "sudo": true - }, - { - "script": "nfsclient.sh", - "args": [ - "$( Note: The HC nodes are used for the cluster, although this node type may be easily changed by use of the vm_type variable for lustre inside config.json. 
- -The configuration file requires the following variables to be set: - -| Variable | Description | -|-------------------------|----------------------------------------------| -| resource_group | The resource group for the project | -| storage_account | The storage account for HSM | -| storage_key | The storage key for HSM | -| storage_container | The container to use for HSM | -| log_analytics_lfs_name | The lustre filesystem name for Log Analytics | -| la_resourcegroup | The resource group for Log Analytics | -| la_name | The Log Analytics Workspace name | - -> Note: you can remove log anaytics and/or HSM from the config file if not required. - -> Note: Key Vault should be used for the keys to keep them out of the config files. diff --git a/examples/lustre_ipoib/scripts/installdrives.sh b/examples/lustre_ipoib/scripts/installdrives.sh deleted file mode 100755 index 221e2349e..000000000 --- a/examples/lustre_ipoib/scripts/installdrives.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -groupname=$1 -vmlist=$2 -ossnum=$3 -drivenum=$4 - -#create the drives first before attachint to vmss -drivecount=$(($drivenum*$ossnum)) - -for ((num=1; num<=$drivecount; num++)); do - az disk create -g $groupname -n "lustredrive$num" --size-gb 1024 & -done - -sleep 60 # to ensure all drives are made - -#Now use the created drives -index=0 -lustrecnt=1 - -idlisttmp=$(az vmss list-instances --resource-group $groupname --name lustre |grep providers/Microsoft.Compute/virtualMachineScaleSets/lustre/virtualMachines | awk -F "virtualMachines/" '{print $2}' | sed '/networkInterfaces/d'| sed 's/["].*$//') - -idlist=($idlisttmp) - -for vmname in ${vmlist[@]}; do - ((index++)) - if [ $index -gt 0 ] ; then - for ((diskid=1; diskid<=$drivenum; diskid++)); do - az vmss disk attach --vmss-name lustre --disk lustredrive${lustrecnt} --sku Premium_LRS --instance-id ${idlist[$index]} --resource-group $groupname - ((lustrecnt++)) - done - fi -done - diff --git a/examples/lustre_ipoib/scripts/lfsclient.sh 
b/examples/lustre_ipoib/scripts/lfsclient.sh deleted file mode 100755 index 4e30d37fa..000000000 --- a/examples/lustre_ipoib/scripts/lfsclient.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -# arg: $1 = lfsserver -# arg: $2 = mount point (default: /lustre) -master=$1 -lfs_mount=${2:-/lustre} -mkdir ~/.ssh - -cp -r /share/home/hpcuser/.ssh ~/ - -capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") -masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) - -if rpm -q lustre; then - - # if the server packages are installed only the client kmod is needed - # for 2.10 and nothing extra is needed for 2.12 - if [ "$lustre_version" = "2.10" ]; then - - if ! rpm -q kmod-lustre-client; then - yum -y install kmod-lustre-client - fi - - fi - -else - - # install the client RPMs if not already installed - if ! rpm -q lustre-client kmod-lustre-client; then - yum -y install lustre-client kmod-lustre-client - fi - weak-modules --add-kernel $(uname -r) - -fi -#Include the correct infiniband options -cat >/etc/modprobe.d/lustre.conf<> /etc/fstab -mount -a -chmod 777 $lfs_mount diff --git a/examples/lustre_ipoib/scripts/lfsmaster.sh b/examples/lustre_ipoib/scripts/lfsmaster.sh deleted file mode 100755 index dce36a159..000000000 --- a/examples/lustre_ipoib/scripts/lfsmaster.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# arg: $1 = device (e.g. 
L=/dev/sdb Lv2=/dev/nvme0n1) -device=$1 - -# this will only install MDS on first node in a scaleset -echo "pssh_nodenum is $PSSH_NODENUM" - -cp -r /share/home/hpcuser/.ssh ~/ - -#Include the correct ipoib options -cat >/etc/modprobe.d/lustre.conf<> /etc/fstab - mount -a - - # set up hsm - lctl set_param -P mdt.*-MDT0000.hsm_control=enabled - lctl set_param -P mdt.*-MDT0000.hsm.default_archive_id=1 - lctl set_param mdt.*-MDT0000.hsm.max_requests=128 - - # allow any user and group ids to write - lctl set_param mdt.*-MDT0000.identity_upcall=NONE - -fi diff --git a/examples/lustre_ipoib/scripts/lfsoss.sh b/examples/lustre_ipoib/scripts/lfsoss.sh deleted file mode 100755 index 0b9b060a5..000000000 --- a/examples/lustre_ipoib/scripts/lfsoss.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash - -# arg: $1 = lfsmaster -# arg: $2 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) -master=$1 -device=$2 - -cp -r /share/home/hpcuser/.ssh ~/ - -index=$(($PSSH_NODENUM + 1)) -myuser="hpcuser" - -capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") -masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) - -if [ "$PSSH_NODENUM" != "0" ]; then - - mkfs.lustre \ - --fsname=LustreFS \ - --backfstype=ldiskfs \ - --reformat \ - --ost \ - --mgsnode="${masterib}" \ - --index=$index \ - --mountfsoptions="errors=remount-ro" \ - $device -#Include the correct ipoib options -cat >/etc/modprobe.d/lustre.conf<> /etc/fstab -mount -a -fi diff --git a/examples/lustre_ipoib/scripts/lfspkgs.sh b/examples/lustre_ipoib/scripts/lfspkgs.sh deleted file mode 100755 index 3120d3ba6..000000000 --- a/examples/lustre_ipoib/scripts/lfspkgs.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -yum -y install lustre kmod-lustre-osd-ldiskfs lustre-osd-ldiskfs-mount lustre-resource-agents e2fsprogs || exit 1 - -sed -i 's/ResourceDisk\.Format=y/ResourceDisk.Format=n/g' /etc/waagent.conf - -systemctl restart waagent - -weak-modules --add-kernel --no-initramfs - -umount /mnt/resource diff --git 
a/examples/lustre_ipoib_nvmedrives/config.json b/examples/lustre_ipoib_nvmedrives/config.json deleted file mode 100644 index 4a7bd988b..000000000 --- a/examples/lustre_ipoib_nvmedrives/config.json +++ /dev/null @@ -1,205 +0,0 @@ -{ - "location": "variables.location", - "resource_group": "variables.resource_group", - "install_from": "headnode", - "admin_user": "hpcadmin", - "vnet": { - "name": "hpcvnet", - "address_prefix": "10.2.0.0/20", - "subnets": { - "compute": "10.2.0.0/22", - "storage": "10.2.4.0/24" - } - }, - "variables": { - "location" : "", - "resource_group": "", - "image": "OpenLogic:CentOS-HPC:7.6:latest", - "lustreimage": "OpenLogic:CentOS-HPC:7.6:latest", - "drivenum": 4, - "ossnum": 4, - "low_priority": true, - "storage_account": "", - "storage_key": "sakey.{{variables.storage_account}}", - "storage_container": "", - "log_analytics_lfs_name": "", - "la_resourcegroup": "", - "la_name": "", - "log_analytics_workspace": "laworkspace.{{variables.la_resourcegroup}}.{{variables.la_name}}", - "log_analytics_key": "lakey.{{variables.la_resourcegroup}}.{{variables.la_name}}", - "lustre_version": "2.10", - "lustre_mount": "/lustre" - }, - "resources": { - "headnode": { - "type": "vm", - "vm_type": "Standard_HB60rs", - "accelerated_networking": false, - "public_ip": true, - "image": "variables.image", - "subnet": "compute", - "tags": [ - "disable-selinux", - "cndefault", - "lfsrepo", - "lfsclient", - "lfsazimport", - "localuser", - "pbsserver", - "loginnode", - "nfsserver" - ] - }, - "lustre": { - "type": "vmss", - "vm_type": "Standard_HB120rs_v2", - "instances": "9", - "accelerated_networking": false, - "image": "variables.lustreimage", - "subnet": "storage", - "tags": [ - "cndefault", - "lustre[0:5]", - "osses[1:5]", - "lfsrepo", - "lfsclient[5:9]", - "localuser", - "pbsclient[5:9]", - "nfsclient", - "disable-selinux", - "lfsloganalytics" - ] - } - }, - "install": [ - { - "script": "disable-selinux.sh", - "tag": "disable-selinux", - "sudo": true - }, - { - 
"script": "cndefault.sh", - "tag": "cndefault", - "sudo": true - }, - { - "script": "nfsserver.sh", - "tag": "nfsserver", - "sudo": true - }, - { - "script": "nfsclient.sh", - "args": [ - "$( Note: The HC nodes are used for the cluster, although this node type may be easily changed by use of the vm_type variable for lustre inside config.json. - -The configuration file requires the following variables to be set: - -| Variable | Description | -|-------------------------|----------------------------------------------| -| resource_group | The resource group for the project | -| storage_account | The storage account for HSM | -| storage_key | The storage key for HSM | -| storage_container | The container to use for HSM | -| log_analytics_lfs_name | The lustre filesystem name for Log Analytics | -| la_resourcegroup | The resource group for Log Analytics | -| la_name | The Log Analytics Workspace name | - -> Note: you can remove log anaytics and/or HSM from the config file if not required. - -> Note: Key Vault should be used for the keys to keep them out of the config files. diff --git a/examples/lustre_ipoib_nvmedrives/scripts/lfsclient.sh b/examples/lustre_ipoib_nvmedrives/scripts/lfsclient.sh deleted file mode 100755 index 4e30d37fa..000000000 --- a/examples/lustre_ipoib_nvmedrives/scripts/lfsclient.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -# arg: $1 = lfsserver -# arg: $2 = mount point (default: /lustre) -master=$1 -lfs_mount=${2:-/lustre} -mkdir ~/.ssh - -cp -r /share/home/hpcuser/.ssh ~/ - -capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") -masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) - -if rpm -q lustre; then - - # if the server packages are installed only the client kmod is needed - # for 2.10 and nothing extra is needed for 2.12 - if [ "$lustre_version" = "2.10" ]; then - - if ! 
rpm -q kmod-lustre-client; then - yum -y install kmod-lustre-client - fi - - fi - -else - - # install the client RPMs if not already installed - if ! rpm -q lustre-client kmod-lustre-client; then - yum -y install lustre-client kmod-lustre-client - fi - weak-modules --add-kernel $(uname -r) - -fi -#Include the correct infiniband options -cat >/etc/modprobe.d/lustre.conf<> /etc/fstab -mount -a -chmod 777 $lfs_mount diff --git a/examples/lustre_ipoib_nvmedrives/scripts/lfsmaster.sh b/examples/lustre_ipoib_nvmedrives/scripts/lfsmaster.sh deleted file mode 100755 index dce36a159..000000000 --- a/examples/lustre_ipoib_nvmedrives/scripts/lfsmaster.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# arg: $1 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) -device=$1 - -# this will only install MDS on first node in a scaleset -echo "pssh_nodenum is $PSSH_NODENUM" - -cp -r /share/home/hpcuser/.ssh ~/ - -#Include the correct ipoib options -cat >/etc/modprobe.d/lustre.conf<> /etc/fstab - mount -a - - # set up hsm - lctl set_param -P mdt.*-MDT0000.hsm_control=enabled - lctl set_param -P mdt.*-MDT0000.hsm.default_archive_id=1 - lctl set_param mdt.*-MDT0000.hsm.max_requests=128 - - # allow any user and group ids to write - lctl set_param mdt.*-MDT0000.identity_upcall=NONE - -fi diff --git a/examples/lustre_ipoib_nvmedrives/scripts/lfsoss.sh b/examples/lustre_ipoib_nvmedrives/scripts/lfsoss.sh deleted file mode 100755 index 0b9b060a5..000000000 --- a/examples/lustre_ipoib_nvmedrives/scripts/lfsoss.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash - -# arg: $1 = lfsmaster -# arg: $2 = device (e.g. 
L=/dev/sdb Lv2=/dev/nvme0n1) -master=$1 -device=$2 - -cp -r /share/home/hpcuser/.ssh ~/ - -index=$(($PSSH_NODENUM + 1)) -myuser="hpcuser" - -capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") -masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) - -if [ "$PSSH_NODENUM" != "0" ]; then - - mkfs.lustre \ - --fsname=LustreFS \ - --backfstype=ldiskfs \ - --reformat \ - --ost \ - --mgsnode="${masterib}" \ - --index=$index \ - --mountfsoptions="errors=remount-ro" \ - $device -#Include the correct ipoib options -cat >/etc/modprobe.d/lustre.conf<> /etc/fstab -mount -a -fi diff --git a/examples/lustre_ipoib_nvmedrives/scripts/lfspkgs.sh b/examples/lustre_ipoib_nvmedrives/scripts/lfspkgs.sh deleted file mode 100755 index 3120d3ba6..000000000 --- a/examples/lustre_ipoib_nvmedrives/scripts/lfspkgs.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -yum -y install lustre kmod-lustre-osd-ldiskfs lustre-osd-ldiskfs-mount lustre-resource-agents e2fsprogs || exit 1 - -sed -i 's/ResourceDisk\.Format=y/ResourceDisk.Format=n/g' /etc/waagent.conf - -systemctl restart waagent - -weak-modules --add-kernel --no-initramfs - -umount /mnt/resource diff --git a/examples/lustre_ipoib_nvmedrives/scripts/waitforreboot.sh b/examples/lustre_ipoib_nvmedrives/scripts/waitforreboot.sh deleted file mode 100755 index 753167b8f..000000000 --- a/examples/lustre_ipoib_nvmedrives/scripts/waitforreboot.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -sleep 60 #enough time for node reboot to continue process diff --git a/examples/lustre_rdma_avs/config.json b/examples/lustre_rdma_avs/config.json index 13428e9a4..d241c4631 100644 --- a/examples/lustre_rdma_avs/config.json +++ b/examples/lustre_rdma_avs/config.json @@ -12,14 +12,14 @@ } }, "variables": { - "location": "", + "location": "", - "image": "OpenLogic:CentOS-HPC:7.6:7.6.201910250", + "image": "OpenLogic:CentOS:7.6:latest", "lustreimage": "OpenLogic:CentOS:7.6:latest", - "hpcimage": "OpenLogic:CentOS-HPC:7.6:7.6.201910250", + 
"hpcimage": "OpenLogic:CentOS:7.6:latest", "compute_instances": 2, "lustre_instances": 2, - "low_priority": true, + "low_priority": false, "storage_account": "", "storage_key": "sakey.{{variables.storage_account}}", "storage_container": "", @@ -50,7 +50,8 @@ "pbsserver", "loginnode", "rebootlustre", - "nfsserver" + "nfsserver", + "allnodes" ] }, "compute": { @@ -58,6 +59,7 @@ "vm_type": "Standard_HB120rs_v2", "instances": "variables.compute_instances", "availability_set": "variables.lustre_avset", + "low_priority": "variables.low_priority", "accelerated_networking": false, "image": "variables.hpcimage", "subnet": "storage", @@ -68,7 +70,8 @@ "localuser", "pbsclient", "nfsclient", - "disable-selinux" + "disable-selinux", + "allnodes" ] }, "lfsmaster": { @@ -86,7 +89,8 @@ "localuser", "nfsclient", "disable-selinux", - "lfsloganalytics" + "lfsloganalytics", + "allnodes" ] }, "lustre": { @@ -105,7 +109,8 @@ "lustre", "ossnode", "disable-selinux", - "lfsloganalytics" + "lfsloganalytics", + "allnodes" ] } }, @@ -172,7 +177,7 @@ }, { "script": "installOFED.sh", - "tag": "lustre", + "tag": "allnodes", "sudo": true }, { @@ -202,27 +207,27 @@ "tag": "ossnode", "sudo": true }, - { - "script": "lfshsm.sh", - "args": [ - "$(head -n1 hostlists/tags/lustre)", - "variables.storage_account", - "variables.storage_key", - "variables.storage_container", - "variables.lustre_version" - ], - "tag": "lustre", - "sudo": true - }, - { - "script": "lfsclient.sh", - "args": [ - "$(head -n1 hostlists/tags/lfsmaster)", - "variables.lustre_mount" - ], - "tag": "lfsclient", - "sudo": true - }, + { + "script": "lfshsm.sh", + "args": [ + "$(head -n1 hostlists/tags/lustre)", + "variables.storage_account", + "variables.storage_key", + "variables.storage_container", + "variables.lustre_version" + ], + "tag": "lustre", + "sudo": true + }, + { + "script": "lfsclient.sh", + "args": [ + "$(head -n1 hostlists/tags/lfsmaster)", + "variables.lustre_mount" + ], + "tag": "lfsclient", + "sudo": true + }, { 
"script": "lfsimport.sh", "args": [ diff --git a/examples/lustre_rdma_avs/scripts/lfsclient.sh b/examples/lustre_rdma_avs/scripts/lfsclient.sh index 26603bebd..5fc8f9a62 100755 --- a/examples/lustre_rdma_avs/scripts/lfsclient.sh +++ b/examples/lustre_rdma_avs/scripts/lfsclient.sh @@ -4,9 +4,8 @@ # arg: $2 = mount point (default: /lustre) master=$1 lfs_mount=${2:-/lustre} -mkdir ~/.ssh -cp -r /share/home/hpcuser/.ssh ~/ +cp -r /share/home/hpcuser/.ssh /root/ #Include the correct rdma options cat >/etc/modprobe.d/lustre.conf<> /etc/fstab diff --git a/examples/lustre_rdma_avs/writeuplustreipoib b/examples/lustre_rdma_avs/writeuplustreipoib deleted file mode 100644 index e0f6ad7fc..000000000 --- a/examples/lustre_rdma_avs/writeuplustreipoib +++ /dev/null @@ -1,11 +0,0 @@ -- lustre-ipoib - This is a created implementation of Lustre using IP over infiniband (IPoIB) - -Changes to files to enable Infiniband functionality: -lfsmaster.sh -lfsoss.sh -lfsclient.sh -lfsrepo.sh - -Addition for correct drives placement of OSSes : instaldrives.sh -*installdrives.sh takes about 15 minutes to run so please either remote this entity, or wait it out. 
- From 9aa90d43545c090ca578678462b4716cea3c7102 Mon Sep 17 00:00:00 2001 From: Narjit Chadha Date: Thu, 2 Jul 2020 12:32:29 -0500 Subject: [PATCH 24/36] Remove lustre_rdma_nvmedrives from examples --- examples/lustre_rdma_nvmedrives/config.json | 241 ------------------ examples/lustre_rdma_nvmedrives/readme.md | 38 --- .../scripts/installOFED.sh | 4 - .../scripts/lfsclient.sh | 47 ---- .../scripts/lfsmaster.sh | 33 --- .../lustre_rdma_nvmedrives/scripts/lfsoss.sh | 32 --- .../lustre_rdma_nvmedrives/scripts/lfsrepo.sh | 27 -- .../scripts/lustreinstall1.sh | 8 - .../scripts/lustreinstall2.sh | 10 - .../scripts/lustrenetwork.sh | 9 - .../scripts/rebootlustre.sh | 19 -- examples/lustre_rdma_nvmedrives/writeup | 17 -- 12 files changed, 485 deletions(-) delete mode 100644 examples/lustre_rdma_nvmedrives/config.json delete mode 100644 examples/lustre_rdma_nvmedrives/readme.md delete mode 100755 examples/lustre_rdma_nvmedrives/scripts/installOFED.sh delete mode 100755 examples/lustre_rdma_nvmedrives/scripts/lfsclient.sh delete mode 100755 examples/lustre_rdma_nvmedrives/scripts/lfsmaster.sh delete mode 100755 examples/lustre_rdma_nvmedrives/scripts/lfsoss.sh delete mode 100755 examples/lustre_rdma_nvmedrives/scripts/lfsrepo.sh delete mode 100755 examples/lustre_rdma_nvmedrives/scripts/lustreinstall1.sh delete mode 100755 examples/lustre_rdma_nvmedrives/scripts/lustreinstall2.sh delete mode 100755 examples/lustre_rdma_nvmedrives/scripts/lustrenetwork.sh delete mode 100755 examples/lustre_rdma_nvmedrives/scripts/rebootlustre.sh delete mode 100644 examples/lustre_rdma_nvmedrives/writeup diff --git a/examples/lustre_rdma_nvmedrives/config.json b/examples/lustre_rdma_nvmedrives/config.json deleted file mode 100644 index dd2c4d6a9..000000000 --- a/examples/lustre_rdma_nvmedrives/config.json +++ /dev/null @@ -1,241 +0,0 @@ -{ - "location": "variables.location", - "resource_group": "variables.resource_group", - "install_from": "headnode", - "admin_user": "hpcadmin", - "vnet": { - 
"name": "hpcvnet", - "address_prefix": "10.2.0.0/20", - "subnets": { - "compute": "10.2.0.0/22", - "storage": "10.2.4.0/24" - } - }, - "variables": { - "location": "", - "resource_group": "", - "image": "OpenLogic:CentOS:7.6:latest", - "lustreimage": "OpenLogic:CentOS:7.6:latest", - "ossnum": 4, - "low_priority": true, - "storage_account": "", - "storage_key": "sakey.{{variables.storage_account}}", - "storage_container": "", - "log_analytics_lfs_name": "", - "la_resourcegroup": "", - "la_name": "", - "log_analytics_workspace": "laworkspace.{{variables.la_resourcegroup}}.{{variables.la_name}}", - "log_analytics_key": "lakey.{{variables.la_resourcegroup}}.{{variables.la_name}}", - "lustre_version": "2.10", - "lustre_mount": "/lustre" - }, - "resources": { - "headnode": { - "type": "vm", - "vm_type": "Standard_HB60rs", - "accelerated_networking": false, - "public_ip": true, - "image": "variables.image", - "subnet": "compute", - "tags": [ - "disable-selinux", - "cndefault", - "lfsrepo", - "rebootlustre", - "lfsclient", - "lfsazimport", - "localuser", - "pbsserver", - "allnodes", - "loginnode", - "nfsserver" - ] - }, - "lustre": { - "type": "vmss", - "vm_type": "Standard_HB120rs_v2", - "instances": "9", - "accelerated_networking": false, - "image": "variables.lustreimage", - "subnet": "storage", - "tags": [ - "cndefault", - "lustre[0:5]", - "osses[1:5]", - "lfsrepo", - "lfsclient[5:9]", - "localuser", - "pbsclient[5:9]", - "nfsclient", - "allnodes", - "disable-selinux", - "lfsloganalytics" - ] - } - }, - "install": [ - { - "script": "disable-selinux.sh", - "tag": "disable-selinux", - "sudo": true - }, - { - "script": "cndefault.sh", - "tag": "cndefault", - "sudo": true - }, - { - "script": "nfsserver.sh", - "tag": "nfsserver", - "sudo": true - }, - { - "script": "nfsclient.sh", - "args": [ - "$(>>>>>> 9aba5d253a4a5a012d9d828c45d3110d9f5164df - }, - { - "script": "installOFED.sh", - "tag": "allnodes", - "sudo": true - }, - { - "script": "lustreinstall2.sh", - "tag": 
"lustre", - "sudo": true - }, - { - "script": "lustrenetwork.sh", - "tag": "allnodes", - "sudo": true - }, - { - "script": "lfsmaster.sh", - "tag": "lustre", - "args": [ - "/dev/sdb" - ], - "sudo": true - }, - { - "script": "lfsoss.sh", - "args": [ - "$(head -n1 hostlists/tags/lustre)", - "/dev/nvme0n1" - ], - "tag": "lustre", - "sudo": true - }, - { - "script": "lfshsm.sh", - "args": [ - "$(head -n1 hostlists/tags/lustre)", - "variables.storage_account", - "variables.storage_key", - "variables.storage_container", - "variables.lustre_version" - ], - "tag": "lustre", - "sudo": true - }, - { - "script": "lfsclient.sh", - "args": [ - "$(head -n1 hostlists/tags/lustre)", - "variables.lustre_mount" - ], - "tag": "lfsclient", - "sudo": true - }, - { - "script": "lfsimport.sh", - "args": [ - "variables.storage_account", - "variables.storage_key", - "variables.storage_container", - "variables.lustre_mount", - "variables.lustre_version" - ], - "tag": "lfsazimport", - "sudo": true - }, - { - "script": "lfsloganalytics.sh", - "args": [ - "variables.log_analytics_lfs_name", - "variables.log_analytics_workspace", - "variables.log_analytics_key" - ], - "tag": "lfsloganalytics", - "sudo": true - }, - { - "script": "pbsdownload.sh", - "tag": "loginnode", - "sudo": false - }, - { - "script": "pbsserver.sh", - "copy": [ - "pbspro_19.1.1.centos7/pbspro-server-19.1.1-0.x86_64.rpm" - ], - "tag": "pbsserver", - "sudo": true - }, - { - "script": "pbsclient.sh", - "args": [ - "$( Note: The HC nodes are used for the cluster, although this node type may be easily changed by use of the vm_type variable for lustre inside config.json. 
- -The configuration file requires the following variables to be set: - -| Variable | Description | -|-------------------------|----------------------------------------------| -| location | The location (Azure region) for the project | -| resource_group | The resource group for the project | -| storage_account | The storage account for HSM | -| storage_key | The storage key for HSM | -| storage_container | The container to use for HSM | -| log_analytics_lfs_name | The lustre filesystem name for Log Analytics | -| la_resourcegroup | The resource group for Log Analytics | -| la_name | The Log Analytics Workspace name | - -> Note: you can remove log anaytics and/or HSM from the config file if not required. - -> Note: Key Vault should be used for the keys to keep them out of the config files. diff --git a/examples/lustre_rdma_nvmedrives/scripts/installOFED.sh b/examples/lustre_rdma_nvmedrives/scripts/installOFED.sh deleted file mode 100755 index c267519fc..000000000 --- a/examples/lustre_rdma_nvmedrives/scripts/installOFED.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -yum -y groupinstall --skip-broken "Infiniband Support" 2>/dev/null -echo "done installing Infiniband" -exit 0 diff --git a/examples/lustre_rdma_nvmedrives/scripts/lfsclient.sh b/examples/lustre_rdma_nvmedrives/scripts/lfsclient.sh deleted file mode 100755 index 0a3f302fc..000000000 --- a/examples/lustre_rdma_nvmedrives/scripts/lfsclient.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -# arg: $1 = lfsserver -# arg: $2 = mount point (default: /lustre) -master=$1 -lfs_mount=${2:-/lustre} -mkdir ~/.ssh - -cp -r /share/home/hpcuser/.ssh ~/ - -#Include the correct rdma options -cat >/etc/modprobe.d/lustre.conf<> /etc/fstab -mount -a -chmod 777 $lfs_mount diff --git a/examples/lustre_rdma_nvmedrives/scripts/lfsmaster.sh b/examples/lustre_rdma_nvmedrives/scripts/lfsmaster.sh deleted file mode 100755 index 1869a1f71..000000000 --- a/examples/lustre_rdma_nvmedrives/scripts/lfsmaster.sh +++ /dev/null @@ -1,33 +0,0 
@@ -#!/bin/bash - -# arg: $1 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) -device=$1 - -# this will only install MDS on first node in a scaleset -echo "pssh_nodenum is $PSSH_NODENUM" - -cp -r /share/home/hpcuser/.ssh /root/ - -#Include the correct rdma options -cat >/etc/modprobe.d/lustre.conf<> /etc/fstab - mount -a - - # set up hsm - lctl set_param -P mdt.*-MDT0000.hsm_control=enabled - lctl set_param -P mdt.*-MDT0000.hsm.default_archive_id=1 - lctl set_param mdt.*-MDT0000.hsm.max_requests=128 - - # allow any user and group ids to write - lctl set_param mdt.*-MDT0000.identity_upcall=NONE - -fi - diff --git a/examples/lustre_rdma_nvmedrives/scripts/lfsoss.sh b/examples/lustre_rdma_nvmedrives/scripts/lfsoss.sh deleted file mode 100755 index ada2bb8c7..000000000 --- a/examples/lustre_rdma_nvmedrives/scripts/lfsoss.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# arg: $1 = lfsmaster -# arg: $2 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) -master=$1 -device=$2 - -cp -r /share/home/hpcuser/.ssh /root/ - -index=$(($PSSH_NODENUM + 1)) -myuser="hpcuser" - -capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") -masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) - -if [ "$PSSH_NODENUM" != "0" ]; then - lnetctl net add --net o2ib --if ib0 #double check - mkfs.lustre \ - --fsname=LustreFS \ - --backfstype=ldiskfs \ - --reformat \ - --ost \ - --mgsnode="${masterib}@o2ib" \ - --index=$index \ - --mountfsoptions="errors=remount-ro" \ - $device - - -mkdir /mnt/oss -echo "$device /mnt/oss lustre noatime,nodiratime,nobarrier 0 2" >> /etc/fstab -mount -a -fi diff --git a/examples/lustre_rdma_nvmedrives/scripts/lfsrepo.sh b/examples/lustre_rdma_nvmedrives/scripts/lfsrepo.sh deleted file mode 100755 index db1eeb165..000000000 --- a/examples/lustre_rdma_nvmedrives/scripts/lfsrepo.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -lustre_version=${1-2.10} - -cat << EOF >/etc/yum.repos.d/LustrePack.repo -[lustreserver] -name=lustreserver 
-baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/server/ -enabled=1 -gpgcheck=0 - -[e2fs] -name=e2fs -baseurl=https://downloads.whamcloud.com/public/e2fsprogs/latest/el7/ -enabled=1 -gpgcheck=0 - -[lustreclient] -name=lustreclient -baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/client/ -enabled=1 -gpgcheck=0 -EOF - -#Include the correct rdma options -#cat >/etc/modprobe.d/lustre.conf</dev/null - diff --git a/examples/lustre_rdma_nvmedrives/scripts/lustreinstall2.sh b/examples/lustre_rdma_nvmedrives/scripts/lustreinstall2.sh deleted file mode 100755 index 60f3e759e..000000000 --- a/examples/lustre_rdma_nvmedrives/scripts/lustreinstall2.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -yum -y --nogpgcheck --enablerepo=lustreserver install kmod-lustre kmod-lustre-osd-ldiskfs lustre-osd-ldiskfs-mount lustre lustre-resource-agents -modprobe -v lustre - -sed -i 's/ResourceDisk\.Format=y/ResourceDisk.Format=n/g' /etc/waagent.conf -sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf - -weak-modules --add-kernel --no-initramfs -systemctl enable lustre -umount /mnt/resource diff --git a/examples/lustre_rdma_nvmedrives/scripts/lustrenetwork.sh b/examples/lustre_rdma_nvmedrives/scripts/lustrenetwork.sh deleted file mode 100755 index f95d33864..000000000 --- a/examples/lustre_rdma_nvmedrives/scripts/lustrenetwork.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf -service waagent restart -service rdma start -modprobe lnet -lctl network configure -lnetctl net add --net o2ib --if ib0 #need this to come up every time -sleep 5 diff --git a/examples/lustre_rdma_nvmedrives/scripts/rebootlustre.sh b/examples/lustre_rdma_nvmedrives/scripts/rebootlustre.sh deleted file mode 100755 index 9d1bf38c7..000000000 --- a/examples/lustre_rdma_nvmedrives/scripts/rebootlustre.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -vmlist=$1 
-ossnum=$2 - -totalcount=$(($ossnum+2)) -index=0 - -#prep headnode -cp -r /share/home/hpcuser/.ssh /root/ - -#needs to be done sequentially -for vmname in ${vmlist[@]}; do - if [ $index -lt $totalcount ] ; then - echo "Rebooting $vmname" - ssh hpcuser@${vmname} "sudo reboot 2>/dev/null; exit 2>/dev/null" 2>/dev/null - fi -done -exit 0 # to ensure no errors are thrown - diff --git a/examples/lustre_rdma_nvmedrives/writeup b/examples/lustre_rdma_nvmedrives/writeup deleted file mode 100644 index 809ec71d3..000000000 --- a/examples/lustre_rdma_nvmedrives/writeup +++ /dev/null @@ -1,17 +0,0 @@ -- lustre-ipoib - This is a created implementation of Lustre using ip over infiniband (IPoIB) -- lustre-rdma - This is a created implementation of Lustre using native Remote Direct Memory Access (RDMA) - -Changes to files to enable Infiniband functionality: -lfsmaster.sh -lfsoss.sh -lfsclient.sh -lfsrepo.sh -lfspkgs.sh - -Addition for the installation of new OFED : installOFED.sh - -Addition for correct Lustre kernel : lustreinstall1.sh -Lustre packages : lustreinstall2.sh - -Addition for rebooting of Lustre MDS/OSS: rebootlustre.sh -Addition for pause after MDS/OSS reboot : waitforreboot.sh From 103f3f78978044d2eb1888ed1a503fd329eaff07 Mon Sep 17 00:00:00 2001 From: Narjit Chadha Date: Sat, 4 Jul 2020 08:38:17 -0500 Subject: [PATCH 25/36] cleanup work for lustre_rdma_avs from examples folder --- .../azhpc_install_config/hostlists/compute | 2 - .../azhpc_install_config/hostlists/headnode | 1 - .../azhpc_install_config/hostlists/lfsmaster | 1 - .../azhpc_install_config/hostlists/linux | 6 - .../azhpc_install_config/hostlists/lustre | 2 - .../hostlists/tags/cndefault | 6 - .../hostlists/tags/disable-selinux | 6 - .../hostlists/tags/lfsazimport | 1 - .../hostlists/tags/lfsclient | 3 - .../hostlists/tags/lfsloganalytics | 3 - .../hostlists/tags/lfsmaster | 1 - .../hostlists/tags/lfsrepo | 6 - .../hostlists/tags/localuser | 6 - .../hostlists/tags/loginnode | 1 - 
.../hostlists/tags/lustre | 3 - .../hostlists/tags/nfsclient | 5 - .../hostlists/tags/nfsserver | 1 - .../hostlists/tags/ossnode | 2 - .../hostlists/tags/pbsclient | 2 - .../hostlists/tags/pbsserver | 1 - .../hostlists/tags/rebootlustre | 1 - .../azhpc_install_config/hpcadmin_id_rsa | 27 -- .../azhpc_install_config/hpcadmin_id_rsa.pub | 1 - .../install/00_install_node_setup.sh | 48 --- .../install/01_disable-selinux.sh | 18 -- .../install/02_cndefault.sh | 18 -- .../install/03_nfsserver.sh | 18 -- .../install/04_nfsclient.sh | 18 -- .../install/05_localuser.sh | 18 -- .../install/06_lfsrepo.sh | 18 -- .../install/07_lustreinstall1.sh | 18 -- .../install/08_rebootlustre.sh | 18 -- .../install/09_waitforreboot.sh | 7 - .../install/10_installOFED.sh | 18 -- .../install/11_lustreinstall2.sh | 18 -- .../install/12_lustrenetwork.sh | 18 -- .../install/13_lfsmaster.sh | 18 -- .../azhpc_install_config/install/14_lfsoss.sh | 18 -- .../azhpc_install_config/install/15_lfshsm.sh | 18 -- .../install/16_lfsclient.sh | 18 -- .../install/17_lfsimport.sh | 18 -- .../install/18_lfsloganalytics.sh | 18 -- .../install/19_pbsdownload.sh | 18 -- .../install/20_pbsserver.sh | 19 -- .../install/21_pbsclient.sh | 19 -- .../azhpc_install_config/scripts/cndefault.sh | 23 -- .../scripts/disable-selinux.sh | 6 - .../scripts/installOFED.sh | 4 - .../azhpc_install_config/scripts/lfsclient.sh | 48 --- .../azhpc_install_config/scripts/lfshsm.sh | 95 ------ .../azhpc_install_config/scripts/lfsimport.sh | 31 -- .../scripts/lfsloganalytics.sh | 31 -- .../azhpc_install_config/scripts/lfsmaster.sh | 31 -- .../azhpc_install_config/scripts/lfsoss.sh | 30 -- .../azhpc_install_config/scripts/lfsrepo.sh | 27 -- .../azhpc_install_config/scripts/localuser.sh | 40 --- .../scripts/lustreinstall1.sh | 8 - .../scripts/lustreinstall2.sh | 10 - .../scripts/lustrenetwork.sh | 9 - .../azhpc_install_config/scripts/nfsclient.sh | 34 --- .../azhpc_install_config/scripts/nfsserver.sh | 212 ------------- 
.../azhpc_install_config/scripts/pbsclient.sh | 22 -- .../scripts/pbsdownload.sh | 9 - .../azhpc_install_config/scripts/pbsserver.sh | 19 -- .../scripts/rebootlustre.sh | 16 - .../scripts/waitforreboot.sh | 2 - examples/lustre_rdma_avs/config.json | 278 ------------------ examples/lustre_rdma_avs/readme.md | 36 --- .../lustre_rdma_avs/scripts/installOFED.sh | 4 - examples/lustre_rdma_avs/scripts/lfsclient.sh | 57 ---- examples/lustre_rdma_avs/scripts/lfsmaster.sh | 31 -- examples/lustre_rdma_avs/scripts/lfsoss.sh | 30 -- examples/lustre_rdma_avs/scripts/lfsrepo.sh | 27 -- .../lustre_rdma_avs/scripts/lustreinstall1.sh | 8 - .../lustre_rdma_avs/scripts/lustreinstall2.sh | 10 - .../lustre_rdma_avs/scripts/lustrenetwork.sh | 9 - .../lustre_rdma_avs/scripts/rebootlustre.sh | 16 - examples/lustre_rdma_avs/writeup | 20 -- experimental/lustre_rdma_avs/writeup | 3 - 79 files changed, 1741 deletions(-) delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/compute delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/headnode delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/lfsmaster delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/linux delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/lustre delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/cndefault delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/disable-selinux delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsazimport delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsclient delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsloganalytics delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsmaster delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsrepo delete mode 100644 
examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/localuser delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/loginnode delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lustre delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsclient delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsserver delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/ossnode delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsclient delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsserver delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/rebootlustre delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa delete mode 100644 examples/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa.pub delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/00_install_node_setup.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/01_disable-selinux.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/02_cndefault.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/03_nfsserver.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/04_nfsclient.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/05_localuser.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/06_lfsrepo.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/07_lustreinstall1.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/08_rebootlustre.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/09_waitforreboot.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/10_installOFED.sh delete mode 100755 
examples/lustre_rdma_avs/azhpc_install_config/install/11_lustreinstall2.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/12_lustrenetwork.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/13_lfsmaster.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/14_lfsoss.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/15_lfshsm.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/16_lfsclient.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/17_lfsimport.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/18_lfsloganalytics.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/19_pbsdownload.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/20_pbsserver.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/install/21_pbsclient.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/cndefault.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/disable-selinux.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/installOFED.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsclient.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/lfshsm.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsimport.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsloganalytics.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsmaster.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsoss.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsrepo.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/localuser.sh delete mode 100755 
examples/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall1.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall2.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/lustrenetwork.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/nfsclient.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/nfsserver.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsclient.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsdownload.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsserver.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/rebootlustre.sh delete mode 100755 examples/lustre_rdma_avs/azhpc_install_config/scripts/waitforreboot.sh delete mode 100644 examples/lustre_rdma_avs/config.json delete mode 100644 examples/lustre_rdma_avs/readme.md delete mode 100755 examples/lustre_rdma_avs/scripts/installOFED.sh delete mode 100755 examples/lustre_rdma_avs/scripts/lfsclient.sh delete mode 100755 examples/lustre_rdma_avs/scripts/lfsmaster.sh delete mode 100755 examples/lustre_rdma_avs/scripts/lfsoss.sh delete mode 100755 examples/lustre_rdma_avs/scripts/lfsrepo.sh delete mode 100755 examples/lustre_rdma_avs/scripts/lustreinstall1.sh delete mode 100755 examples/lustre_rdma_avs/scripts/lustreinstall2.sh delete mode 100755 examples/lustre_rdma_avs/scripts/lustrenetwork.sh delete mode 100755 examples/lustre_rdma_avs/scripts/rebootlustre.sh delete mode 100644 examples/lustre_rdma_avs/writeup diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/compute b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/compute deleted file mode 100644 index 232110d4a..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/compute +++ /dev/null @@ -1,2 +0,0 @@ -compute0001 -compute0002 diff --git 
a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/headnode b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/headnode deleted file mode 100644 index 1a9798066..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/headnode +++ /dev/null @@ -1 +0,0 @@ -headnode diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/lfsmaster b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/lfsmaster deleted file mode 100644 index a47bf87fe..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/lfsmaster +++ /dev/null @@ -1 +0,0 @@ -lfsmaster diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/linux b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/linux deleted file mode 100644 index 337053fb6..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/linux +++ /dev/null @@ -1,6 +0,0 @@ -headnode -compute0001 -compute0002 -lfsmaster -lustre0001 -lustre0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/lustre b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/lustre deleted file mode 100644 index b8f9b2061..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/lustre +++ /dev/null @@ -1,2 +0,0 @@ -lustre0001 -lustre0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/cndefault b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/cndefault deleted file mode 100644 index 337053fb6..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/cndefault +++ /dev/null @@ -1,6 +0,0 @@ -headnode -compute0001 -compute0002 -lfsmaster -lustre0001 -lustre0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/disable-selinux b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/disable-selinux deleted file mode 100644 index 337053fb6..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/disable-selinux +++ /dev/null @@ -1,6 +0,0 
@@ -headnode -compute0001 -compute0002 -lfsmaster -lustre0001 -lustre0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsazimport b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsazimport deleted file mode 100644 index 1a9798066..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsazimport +++ /dev/null @@ -1 +0,0 @@ -headnode diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsclient b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsclient deleted file mode 100644 index 8af893f49..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsclient +++ /dev/null @@ -1,3 +0,0 @@ -headnode -compute0001 -compute0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsloganalytics b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsloganalytics deleted file mode 100644 index 6453c2e60..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsloganalytics +++ /dev/null @@ -1,3 +0,0 @@ -lfsmaster -lustre0001 -lustre0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsmaster b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsmaster deleted file mode 100644 index a47bf87fe..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsmaster +++ /dev/null @@ -1 +0,0 @@ -lfsmaster diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsrepo b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsrepo deleted file mode 100644 index 337053fb6..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsrepo +++ /dev/null @@ -1,6 +0,0 @@ -headnode -compute0001 -compute0002 -lfsmaster -lustre0001 -lustre0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/localuser b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/localuser deleted 
file mode 100644 index 337053fb6..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/localuser +++ /dev/null @@ -1,6 +0,0 @@ -headnode -compute0001 -compute0002 -lfsmaster -lustre0001 -lustre0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/loginnode b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/loginnode deleted file mode 100644 index 1a9798066..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/loginnode +++ /dev/null @@ -1 +0,0 @@ -headnode diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lustre b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lustre deleted file mode 100644 index 6453c2e60..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lustre +++ /dev/null @@ -1,3 +0,0 @@ -lfsmaster -lustre0001 -lustre0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsclient b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsclient deleted file mode 100644 index 748d1c5dc..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsclient +++ /dev/null @@ -1,5 +0,0 @@ -compute0001 -compute0002 -lfsmaster -lustre0001 -lustre0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsserver b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsserver deleted file mode 100644 index 1a9798066..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsserver +++ /dev/null @@ -1 +0,0 @@ -headnode diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/ossnode b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/ossnode deleted file mode 100644 index b8f9b2061..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/ossnode +++ /dev/null @@ -1,2 +0,0 @@ -lustre0001 -lustre0002 diff --git 
a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsclient b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsclient deleted file mode 100644 index 232110d4a..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsclient +++ /dev/null @@ -1,2 +0,0 @@ -compute0001 -compute0002 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsserver b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsserver deleted file mode 100644 index 1a9798066..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsserver +++ /dev/null @@ -1 +0,0 @@ -headnode diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/rebootlustre b/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/rebootlustre deleted file mode 100644 index 1a9798066..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hostlists/tags/rebootlustre +++ /dev/null @@ -1 +0,0 @@ -headnode diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa b/examples/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa deleted file mode 100644 index 7846d2b39..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa +++ /dev/null @@ -1,27 +0,0 @@ ------BEGIN RSA PRIVATE KEY----- -MIIEpAIBAAKCAQEA1JCjVUGcKCYN3RCERznjr7e1Chsf+DG30uluSXk3I/6nesto -5gLGfKiTjeHWvX5tqFITAA84r140AgsIcUHEpaWwk06QIVUTj6kDHbubP0i1V2EY -2sa6cm6hPQmsFIiOK578BLuv/Zda/arVJ1dq1q+1t0tt84TTCrsROszNw8t9Kc3Z -Gn2SY7F52Z8nttmN7OEsfUtg6K6f/5IwbJb7U8b/0jF6yWDpzrmqN33BJfrZ1VWs -jswhblxJZ0juAU/oAB0xtOzqM2vwUZy9FmcfRPo/U1gM4DUG1h37oWWkoQLhgURu -p3Lztqq8msXXsnk3ZnIkMWWNJ429fN2ui751QQIDAQABAoIBAGYQfRy2wDBW9Vks -UReSKE17PCZ6F8Oou8c95oLI/Tz/TZOcj+XBd2Tr3M3HnsCmMCkeH5lrtaAe74H7 -ojYfijivcjWJB5O5sgbM9H4WUtj0JH6sVK7XtTa1AB66wjGpz/oKAKCVLk/pmPss -R+T4CIjFHc/BHC5NnLgOUpuVM0fLUUUF8NmIvT6K0P4j7GZx12d1TDkqo+/rd1ku -EOuCjl8Q4bTO0qtJEXy2dmn38m6QGNS765j8gQ21wWY+Q7EX4JaJ+oO2ZgGuyYul 
-Cu+AFlCR4SkOok0DN6RG4KQ7Sly57HrZWwLI46FXmjiJqE/7wNvMwuHdUmnVbkoY -v04fxAECgYEA8ii6KMsPIxnMSCBpsRoFSOcPdSyoFyhMCCuiR9liCGRG4wz4u1i6 -ZFal1+d/rX6qxCTIZxvU8zn54Qsrr+44zV++4+Sd/nhrc+qWOxGggAscbYNG3w2g -GTGinERFPRs5iGmdJ0n+uy/TSPe5t0qH85AdKcU47mfrNb3Q08rEfxECgYEA4Lbj -zkCUa4UN6CP36FtOUNtpnrn7dxfDNpcS8CTt/oob2OifOUGhgPbCio5at7zE8cH0 -hWrUWFPDfBRliGdG/ZzdmIOaC0MU9eQG4JxkblgYccKpcYsTq45NDyhQJ0lbBjRG -Sp42HOnvZ8p0m9przrnQF22Bvr5E+VF1wVk18zECgYEA7pI9RS84jIZAAdcdCYPv -LPGnAvOp7paewXXrfQmnUUkppUsESd6SU4QiA2FpIk4mgvMSFLMQy0eU7KeKtNrn -Tz5C3FZBaZDNm/fDZhJpo3xO13179wh/cBK8d2OzKw6FUeVrFGgL8/KcH8kfSHq/ -EbAraxmIiygKTHnjIKUljWECgYAQxhYjIzbw/7GWDnlG4unppzcvHfrjXOa5gHVt -b5REV9LUUijwgTGpCsJizVWAOZsJ4Mx72QmYvkftTyh1EiB+deMkq04oYQ2DfU32 -HjZw9ip882bqjtMdDzY5V20EQbmFsQk+MKkhZ2Tzfm1N5PP/LmeWGBqDPnivk6ES -mbIpQQKBgQDqnc9KivmjPIHz2BpJh8icWkdvZ2WzycI3Sly6Suh0E6Q+epMTXUm3 -21TIEkkAlBYXkHs0ZhL7l7jzv5yYSGB8ZNDzk+UquE5OuxMwWsd3trqyJ3LMj9C5 -hV6JTHqNSw8xubCES0oRgJkcCedoQ0qxMwypnJarWPh/LSVCu3BZ2A== ------END RSA PRIVATE KEY----- diff --git a/examples/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa.pub b/examples/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa.pub deleted file mode 100644 index 20776c3a0..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa.pub +++ /dev/null @@ -1 +0,0 @@ -ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDUkKNVQZwoJg3dEIRHOeOvt7UKGx/4MbfS6W5JeTcj/qd6y2jmAsZ8qJON4da9fm2oUhMADzivXjQCCwhxQcSlpbCTTpAhVROPqQMdu5s/SLVXYRjaxrpybqE9CawUiI4rnvwEu6/9l1r9qtUnV2rWr7W3S23zhNMKuxE6zM3Dy30pzdkafZJjsXnZnye22Y3s4Sx9S2Dorp//kjBslvtTxv/SMXrJYOnOuao3fcEl+tnVVayOzCFuXElnSO4BT+gAHTG07Ooza/BRnL0WZx9E+j9TWAzgNQbWHfuhZaShAuGBRG6ncvO2qryaxdeyeTdmciQxZY0njb183a6LvnVB diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/00_install_node_setup.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/00_install_node_setup.sh deleted file mode 100755 index d5e1850c6..000000000 --- 
a/examples/lustre_rdma_avs/azhpc_install_config/install/00_install_node_setup.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=linux - -if [ ! -f "hostlists/$tag" ]; then - echo "no hostlist ($tag), exiting" - exit 0 -fi - -# wait for DNS to update for all hostnames -for h in $(/dev/null 2>&1; do - echo "Waiting for host - $h (sleeping for 5 seconds)" - sleep 5 - done -done - -if [ "$1" != "" ]; then - tag=tags/$1 -else - sudo yum install -y epel-release > install/00_install_node_setup.log 2>&1 - sudo yum install -y pssh nc >> install/00_install_node_setup.log 2>&1 - - # setting up keys - cat < ~/.ssh/config - Host * - StrictHostKeyChecking no - UserKnownHostsFile /dev/null - LogLevel ERROR -EOF - cp hpcadmin_id_rsa.pub ~/.ssh/id_rsa.pub - cp hpcadmin_id_rsa ~/.ssh/id_rsa - chmod 600 ~/.ssh/id_rsa - chmod 644 ~/.ssh/config - chmod 644 ~/.ssh/id_rsa.pub - -fi - -pssh -p 50 -t 0 -i -h hostlists/$tag 'rpm -q rsync || sudo yum install -y rsync' >> install/00_install_node_setup.log 2>&1 - -prsync -p 50 -a -h hostlists/$tag ~/azhpc_install_config ~ >> install/00_install_node_setup.log 2>&1 -prsync -p 50 -a -h hostlists/$tag ~/.ssh ~ >> install/00_install_node_setup.log 2>&1 - -pssh -p 50 -t 0 -i -h hostlists/$tag 'echo "AcceptEnv PSSH_NODENUM PSSH_HOST" | sudo tee -a /etc/ssh/sshd_config' >> install/00_install_node_setup.log 2>&1 -pssh -p 50 -t 0 -i -h hostlists/$tag 'sudo systemctl restart sshd' >> install/00_install_node_setup.log 2>&1 -pssh -p 50 -t 0 -i -h hostlists/$tag "echo 'Defaults env_keep += \"PSSH_NODENUM PSSH_HOST\"' | sudo tee -a /etc/sudoers" >> install/00_install_node_setup.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/01_disable-selinux.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/01_disable-selinux.sh deleted file mode 100755 index aff9f6abd..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/01_disable-selinux.sh +++ /dev/null @@ -1,18 +0,0 
@@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-disable-selinux} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/disable-selinux.sh" >> install/01_disable-selinux.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/02_cndefault.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/02_cndefault.sh deleted file mode 100755 index 89df21b38..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/02_cndefault.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-cndefault} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/cndefault.sh" >> install/02_cndefault.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/03_nfsserver.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/03_nfsserver.sh deleted file mode 100755 index 9fe8fc049..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/03_nfsserver.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-nfsserver} - -if [ ! 
-f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/nfsserver.sh" >> install/03_nfsserver.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/04_nfsclient.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/04_nfsclient.sh deleted file mode 100755 index 3ef1d7dd2..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/04_nfsclient.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-nfsclient} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/nfsclient.sh '$(> install/04_nfsclient.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/05_localuser.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/05_localuser.sh deleted file mode 100755 index 547517af7..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/05_localuser.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-localuser} - -if [ ! 
-f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/localuser.sh '$(> install/05_localuser.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/06_lfsrepo.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/06_lfsrepo.sh deleted file mode 100755 index c51d1a7bf..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/06_lfsrepo.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-lfsrepo} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsrepo.sh '2.10'" >> install/06_lfsrepo.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/07_lustreinstall1.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/07_lustreinstall1.sh deleted file mode 100755 index 9c9e725e1..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/07_lustreinstall1.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-lustre} - -if [ ! 
-f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lustreinstall1.sh" >> install/07_lustreinstall1.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/08_rebootlustre.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/08_rebootlustre.sh deleted file mode 100755 index cafe2dccc..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/08_rebootlustre.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-rebootlustre} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/rebootlustre.sh '$(> install/08_rebootlustre.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/09_waitforreboot.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/09_waitforreboot.sh deleted file mode 100755 index e7f2585c1..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/09_waitforreboot.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." 
- -scripts/waitforreboot.sh >> install/09_waitforreboot.log 2>&1 - diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/10_installOFED.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/10_installOFED.sh deleted file mode 100755 index 0a9d5144c..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/10_installOFED.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-lustre} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/installOFED.sh" >> install/10_installOFED.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/11_lustreinstall2.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/11_lustreinstall2.sh deleted file mode 100755 index 415de3119..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/11_lustreinstall2.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-lustre} - -if [ ! 
-f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lustreinstall2.sh" >> install/11_lustreinstall2.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/12_lustrenetwork.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/12_lustrenetwork.sh deleted file mode 100755 index 210bc389e..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/12_lustrenetwork.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-lustre} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lustrenetwork.sh" >> install/12_lustrenetwork.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/13_lfsmaster.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/13_lfsmaster.sh deleted file mode 100755 index 5dead31c8..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/13_lfsmaster.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-lfsmaster} - -if [ ! 
-f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsmaster.sh '/dev/sdb'" >> install/13_lfsmaster.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/14_lfsoss.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/14_lfsoss.sh deleted file mode 100755 index 0b2f013ae..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/14_lfsoss.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-ossnode} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsoss.sh '$(head -n1 hostlists/tags/lfsmaster)' '/dev/nvme0n1'" >> install/14_lfsoss.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/15_lfshsm.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/15_lfshsm.sh deleted file mode 100755 index 479abe10e..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/15_lfshsm.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-lustre} - -if [ ! 
-f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfshsm.sh '$(head -n1 hostlists/tags/lustre)' 'lustretesting' 'TXOO/DhcJHGjjcNQ58f9SGCRF3RUuz3/UHaE70KbDAHhIkd38Ic5YXVlFcdxuytgk8pDg0sp5J9lCdOWr++sXA==' 'hsm' '2.10'" >> install/15_lfshsm.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/16_lfsclient.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/16_lfsclient.sh deleted file mode 100755 index e6e74eb5c..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/16_lfsclient.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-lfsclient} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsclient.sh '$(head -n1 hostlists/tags/lfsmaster)' '/lustre'" >> install/16_lfsclient.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/17_lfsimport.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/17_lfsimport.sh deleted file mode 100755 index c23853cd8..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/17_lfsimport.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-lfsazimport} - -if [ ! 
-f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsimport.sh 'lustretesting' 'TXOO/DhcJHGjjcNQ58f9SGCRF3RUuz3/UHaE70KbDAHhIkd38Ic5YXVlFcdxuytgk8pDg0sp5J9lCdOWr++sXA==' 'hsm' '/lustre' '2.10'" >> install/17_lfsimport.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/18_lfsloganalytics.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/18_lfsloganalytics.sh deleted file mode 100755 index d2a6ff976..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/18_lfsloganalytics.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-lfsloganalytics} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsloganalytics.sh 'lfs' 'eb2e4150-e0fa-494d-8f60-291e27820eff' '0iKHSuo3C36gwxYYZSBIIVB8g5l7A1qztuF77oVwZlFV9iKqke/Jajc+qVLkt1SB7LNimpeb3Q++qerMtnZvuw=='" >> install/18_lfsloganalytics.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/19_pbsdownload.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/19_pbsdownload.sh deleted file mode 100755 index 9731feb81..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/19_pbsdownload.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-loginnode} - -if [ ! 
-f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; scripts/pbsdownload.sh" >> install/19_pbsdownload.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/20_pbsserver.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/20_pbsserver.sh deleted file mode 100755 index 0a2c0cf2d..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/20_pbsserver.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-pbsserver} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pscp.pssh -p 50 -h hostlists/tags/$tag pbspro_19.1.1.centos7/pbspro-server-19.1.1-0.x86_64.rpm $(pwd) >> install/20_pbsserver.log 2>&1 -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/pbsserver.sh" >> install/20_pbsserver.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/install/21_pbsclient.sh b/examples/lustre_rdma_avs/azhpc_install_config/install/21_pbsclient.sh deleted file mode 100755 index 1c354d17f..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/install/21_pbsclient.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-pbsclient} - -if [ ! 
-f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pscp.pssh -p 50 -h hostlists/tags/$tag pbspro_19.1.1.centos7/pbspro-execution-19.1.1-0.x86_64.rpm $(pwd) >> install/21_pbsclient.log 2>&1 -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/pbsclient.sh '$(> install/21_pbsclient.log 2>&1 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/cndefault.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/cndefault.sh deleted file mode 100755 index 303ebac1b..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/scripts/cndefault.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# Script to be run on all compute nodes -if ! rpm -q epel-release; then - yum -y install epel-release -fi - -yum -y install git jq htop - -# change access to resource so that temp jobs can be written there -chmod 777 /mnt/resource - -# If running on Cycle -# - enable METADATA access -# - remove Jetpack convergence -# - Disable Fail2Ban service -# - Fix PBS limits -if [ -e $CYCLECLOUD_HOME/bin/jetpack ]; then - DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - $DIR/azhpc4cycle.sh enable_metada_access - $DIR/azhpc4cycle.sh disable_jetpack_converge - $DIR/azhpc4cycle.sh disable_fail2ban - $DIR/azhpc4cycle.sh fix_pbs_limits -fi diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/disable-selinux.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/disable-selinux.sh deleted file mode 100755 index 00c87bbf2..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/scripts/disable-selinux.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -# set to permissive for now (until reboot) -setenforce 0 -# prep to have selinux disabled after reboot -sed -i 's/SELINUX=.*$/SELINUX=disabled/g' /etc/selinux/config diff --git 
a/examples/lustre_rdma_avs/azhpc_install_config/scripts/installOFED.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/installOFED.sh deleted file mode 100755 index c267519fc..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/scripts/installOFED.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -yum -y groupinstall --skip-broken "Infiniband Support" 2>/dev/null -echo "done installing Infiniband" -exit 0 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsclient.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsclient.sh deleted file mode 100755 index 26603bebd..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsclient.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -# arg: $1 = lfsserver -# arg: $2 = mount point (default: /lustre) -master=$1 -lfs_mount=${2:-/lustre} -mkdir ~/.ssh - -cp -r /share/home/hpcuser/.ssh ~/ - -#Include the correct rdma options -cat >/etc/modprobe.d/lustre.conf<> /etc/fstab -mount -a -chmod 777 $lfs_mount diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfshsm.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfshsm.sh deleted file mode 100755 index 0af1fc5e2..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfshsm.sh +++ /dev/null @@ -1,95 +0,0 @@ -#!/bin/bash - -# arg: $1 = lfsserver -# arg: $2 = storage account -# arg: $3 = storage key -# arg: $4 = storage container -# arg: $5 = lustre version (default 2.10) -master=$1 -storage_account=$2 -storage_key=$3 -storage_container=$4 -lustre_version=${5-2.10} - -# remove the patch version -ndots=${lustre_version//[^.]} -if [ "${#ndots}" = "2" ]; then - lustre_version=${lustre_version%.*} -fi - -# adding kernel module for lustre client -if [ "$lustre_version" = "2.10" ]; then - yum install -y kmod-lustre-client - weak-modules --add-kernel $(uname -r) -fi - -if ! 
rpm -q lemur-azure-hsm-agent lemur-azure-data-movers; then - yum -y install \ - https://azurehpc.azureedge.net/rpms/lemur-azure-hsm-agent-1.0.0-lustre_${lustre_version}.x86_64.rpm \ - https://azurehpc.azureedge.net/rpms/lemur-azure-data-movers-1.0.0-lustre_${lustre_version}.x86_64.rpm -fi - -mkdir -p /var/run/lhsmd -chmod 755 /var/run/lhsmd - -mkdir -p /etc/lhsmd -chmod 755 /etc/lhsmd - -cat </etc/lhsmd/agent -# Lustre NID and filesystem name for the front end filesystem, the agent will mount this -client_device="${master}@tcp:/LustreFS" - -# Do you want to use S3 and POSIX, in this example we use POSIX -enabled_plugins=["lhsm-plugin-az"] - -## Directory to look for the plugins -plugin_dir="/usr/libexec/lhsmd" - -# TBD, I used 16 -handler_count=16 - -# TBD -snapshots { - enabled = false -} -EOF -chmod 600 /etc/lhsmd/agent - -cat </etc/lhsmd/lhsm-plugin-az -az_storage_account = "$storage_account" -az_storage_key = "$storage_key" - -num_threads = 32 - -# -# One or more archive definition is required. 
-# -archive "az-blob" { - id = 1 # Must be unique to this endpoint - container = "$storage_container" # Container used for this archive - prefix = "" # Optional prefix - num_threads = 32 -} -EOF -chmod 600 /etc/lhsmd/lhsm-plugin-az - -cat </etc/systemd/system/lhsmd.service -[Unit] -Description=The lhsmd server -After=syslog.target network.target remote-fs.target nss-lookup.target - -[Service] -Type=simple -PIDFile=/run/lhsmd.pid -ExecStartPre=/bin/mkdir -p /var/run/lhsmd -ExecStart=/sbin/lhsmd -config /etc/lhsmd/agent -Restart=always - -[Install] -WantedBy=multi-user.target -EOF -chmod 600 /etc/systemd/system/lhsmd.service - -systemctl daemon-reload -systemctl enable lhsmd -systemctl start lhsmd diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsimport.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsimport.sh deleted file mode 100755 index fd9fad30b..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsimport.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -# arg: $1 = storage account -# arg: $2 = storage key -# arg: $3 = storage container -# arg: $3 = lfs mount -# arg: $4 = lustre mount (default=/lustre) -# arg: $5 = lustre version (default=2.10) -storage_account=$1 -storage_key=$2 -storage_container=$3 -lfs_mount=${4:-/lustre} -lustre_version=${5-2.10} - -# remove the patch version -ndots=${lustre_version//[^.]} -if [ "${#ndots}" = "2" ]; then - lustre_version=${lustre_version%.*} -fi - -if ! 
rpm -q lemur-azure-hsm-agent lemur-azure-data-movers; then - yum -y install \ - https://azurehpc.azureedge.net/rpms/lemur-azure-hsm-agent-1.0.0-lustre_${lustre_version}.x86_64.rpm \ - https://azurehpc.azureedge.net/rpms/lemur-azure-data-movers-1.0.0-lustre_${lustre_version}.x86_64.rpm -fi - -cd $lfs_mount -export STORAGE_ACCOUNT=$storage_account -export STORAGE_KEY=$storage_key -/sbin/azure-import ${storage_container} - diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsloganalytics.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsloganalytics.sh deleted file mode 100755 index ce6b43f3d..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsloganalytics.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -# arg: $1 = name -# arg: $2 = log analytics workspace id -# arg: $3 = log analytics key - -name=$1 -log_analytics_workspace_id=$2 -log_analytics_key=$3 - -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -sed "s#__FS_NAME__#${name}#g;s#__LOG_ANALYTICS_WORKSPACE_ID__#${log_analytics_workspace_id}#g;s#__LOG_ANALYTICS_KEY__#${log_analytics_key}#g" $DIR/lfsloganalyticsd.sh.in >/usr/bin/lfsloganalyticsd.sh - -chmod +x /usr/bin/lfsloganalyticsd.sh - -cat </lib/systemd/system/lfsloganalytics.service -[Unit] -Description=Lustre logging service to Log Analytics. - -[Service] -Type=simple -ExecStart=/bin/bash /usr/bin/lfsloganalyticsd.sh -Restart=always - -[Install] -WantedBy=multi-user.target -EOF - -systemctl enable lfsloganalytics -systemctl start lfsloganalytics \ No newline at end of file diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsmaster.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsmaster.sh deleted file mode 100755 index d2dcdb02e..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsmaster.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -# arg: $1 = device (e.g. 
L=/dev/sdb Lv2=/dev/nvme0n1) -device=$1 - -# this will only install MDS on first node in a scaleset -echo "pssh_nodenum is $PSSH_NODENUM" - -cp -r /share/home/hpcuser/.ssh /root/ - -#Include the correct rdma options -cat >/etc/modprobe.d/lustre.conf<> /etc/fstab - mount -a - - # set up hsm - lctl set_param -P mdt.*-MDT0000.hsm_control=enabled - lctl set_param -P mdt.*-MDT0000.hsm.default_archive_id=1 - lctl set_param mdt.*-MDT0000.hsm.max_requests=128 - - # allow any user and group ids to write - lctl set_param mdt.*-MDT0000.identity_upcall=NONE - - diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsoss.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsoss.sh deleted file mode 100755 index 8f39aac68..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsoss.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -# arg: $1 = lfsmaster -# arg: $2 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) -master=$1 -device=$2 - -cp -r /share/home/hpcuser/.ssh /root/ - -index=$(($PSSH_NODENUM + 1)) -myuser="hpcuser" - -capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") -masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) - - lnetctl net add --net o2ib --if ib0 #double check - mkfs.lustre \ - --fsname=LustreFS \ - --backfstype=ldiskfs \ - --reformat \ - --ost \ - --mgsnode="${masterib}@o2ib" \ - --index=$index \ - --mountfsoptions="errors=remount-ro" \ - $device - - -mkdir /mnt/oss -echo "$device /mnt/oss lustre noatime,nodiratime,nobarrier 0 2" >> /etc/fstab -mount -a diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsrepo.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsrepo.sh deleted file mode 100755 index db1eeb165..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lfsrepo.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -lustre_version=${1-2.10} - -cat << EOF >/etc/yum.repos.d/LustrePack.repo -[lustreserver] -name=lustreserver 
-baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/server/ -enabled=1 -gpgcheck=0 - -[e2fs] -name=e2fs -baseurl=https://downloads.whamcloud.com/public/e2fsprogs/latest/el7/ -enabled=1 -gpgcheck=0 - -[lustreclient] -name=lustreclient -baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/client/ -enabled=1 -gpgcheck=0 -EOF - -#Include the correct rdma options -#cat >/etc/modprobe.d/lustre.conf<$home_root/$new_user/.ssh/config -Host * - StrictHostKeyChecking no - UserKnownHostsFile /dev/null - LogLevel ERROR -EOF - ssh-keygen -f $home_root/$new_user/.ssh/id_rsa -t rsa -N '' - # add admin user public key (the only user in /home) - cat /home/*/.ssh/id_rsa.pub >$home_root/$new_user/.ssh/authorized_keys - cat $home_root/$new_user/.ssh/id_rsa.pub >>$home_root/$new_user/.ssh/authorized_keys - chown $new_user:$new_user $home_root/$new_user/.ssh - chown $new_user:$new_user $home_root/$new_user/.ssh/* - chmod 700 $home_root/$new_user/.ssh - chmod 600 $home_root/$new_user/.ssh/id_rsa - chmod 644 $home_root/$new_user/.ssh/id_rsa.pub - chmod 644 $home_root/$new_user/.ssh/config - chmod 644 $home_root/$new_user/.ssh/authorized_keys -fi -echo "$new_user ALL=(ALL) NOPASSWD: ALL" | tee -a /etc/sudoers diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall1.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall1.sh deleted file mode 100755 index c052001a0..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall1.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -# jump the gun here to ensure passwordless ssh as root between all lustre nodes to faciltate node reboot -cp -r /share/home/hpcuser/.ssh ~/ - -yum -y --nogpgcheck --disablerepo=* --enablerepo=e2fs install e2fsprogs - -yum -y --nogpgcheck --disablerepo=base,extras,updates --enablerepo=lustreserver install kernel kernel-devel kernel-headers kernel-tools kernel-tools-libs 2>/dev/null - 
diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall2.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall2.sh deleted file mode 100755 index 60f3e759e..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall2.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -yum -y --nogpgcheck --enablerepo=lustreserver install kmod-lustre kmod-lustre-osd-ldiskfs lustre-osd-ldiskfs-mount lustre lustre-resource-agents -modprobe -v lustre - -sed -i 's/ResourceDisk\.Format=y/ResourceDisk.Format=n/g' /etc/waagent.conf -sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf - -weak-modules --add-kernel --no-initramfs -systemctl enable lustre -umount /mnt/resource diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lustrenetwork.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/lustrenetwork.sh deleted file mode 100755 index f95d33864..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/scripts/lustrenetwork.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf -service waagent restart -service rdma start -modprobe lnet -lctl network configure -lnetctl net add --net o2ib --if ib0 #need this to come up every time -sleep 5 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/nfsclient.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/nfsclient.sh deleted file mode 100755 index 678bac4dd..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/scripts/nfsclient.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -# arg: $1 = nfsserver -nfs_server=$1 -nfs_share=${2-/share} -if [ -z "$nfs_server" ]; then - echo "The nfs_server is required" - exit 1 -fi - -yum install -y nfs-utils - -mkdir -p /scratch -mkdir -p /apps -mkdir -p /data -mkdir -p /share/home -mount $nfs_server:$nfs_share/apps /apps -mount $nfs_server:$nfs_share/data /data -mount $nfs_server:$nfs_share/home /share/home - -chmod 
777 /scratch - -cat << EOF >> /etc/fstab -$nfs_server:$nfs_share/home /share/home nfs defaults 0 0 -$nfs_server:/mnt/resource/scratch /scratch nfs defaults 0 0 -$nfs_server:$nfs_share/apps /apps nfs defaults 0 0 -$nfs_server:$nfs_share/data /data nfs defaults 0 0 -EOF - -setsebool -P use_nfs_home_dirs 1 - -mount -a - -df diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/nfsserver.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/nfsserver.sh deleted file mode 100755 index 14d53a4c0..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/scripts/nfsserver.sh +++ /dev/null @@ -1,212 +0,0 @@ -#!/bin/bash -if [[ $(id -u) -ne 0 ]] ; then - echo "Must be run as root" - exit 1 -fi - -# Disable requiretty to allow run sudo within scripts -sed -i -e 's/Defaults requiretty.*/ #Defaults requiretty/g' /etc/sudoers - -yum -y install epel-release -yum -y install nfs-utils nfs-utils-lib - -# Shares -NFS_MOUNT_POINT=/share -NFS_APPS=$NFS_MOUNT_POINT/apps -NFS_DATA=$NFS_MOUNT_POINT/data -NFS_HOME=$NFS_MOUNT_POINT/home -NFS_SCRATCH=/mnt/resource/scratch - -# Partitions all data disks attached to the VM -# -setup_data_disks() -{ - mountPoint="$1" - filesystem="$2" - devices="$3" - raidDevice="$4" - createdPartitions="" - numdevices=`echo $devices | wc -w` - if [ $numdevices -gt 1 ] - then - # Loop through and partition disks until not found - for disk in $devices; do - fdisk -l /dev/$disk || break - fdisk /dev/$disk << EOF -n -p -1 - - -t -fd -w -EOF - createdPartitions="$createdPartitions /dev/${disk}1" - done - else - disk=$(echo $devices | tr -d [:space:]) - echo "Warning: Only a single device to partition, $disk" - fdisk -l /dev/$disk || break - fdisk /dev/$disk << EOF -n -p -1 - - -w -EOF - createdPartitions="$createdPartitions /dev/${disk}1" - fi - - sleep 10 - - # Create RAID-0 volume - if [ -n "$createdPartitions" ]; then - devices=`echo $createdPartitions | wc -w` - if [ $numdevices -gt 1 ] - then - mdadm --create /dev/$raidDevice --level 0 
--raid-devices $devices $createdPartitions - sleep 10 - - mdadm /dev/$raidDevice - else - echo "Warning: mdadm is not called, we have one partition named, ${disk}1 for mountpoint, $mountPoint" - raidDevice=${disk}1 - fi - - if [ "$filesystem" == "xfs" ]; then - mkfs -t $filesystem /dev/$raidDevice - export xfsuuid="UUID=`blkid |grep dev/$raidDevice |cut -d " " -f 2 |cut -c 7-42`" -# echo "$xfsuuid $mountPoint $filesystem rw,noatime,attr2,inode64,nobarrier,sunit=1024,swidth=4096,nofail 0 2" >> /etc/fstab - echo "$xfsuuid $mountPoint $filesystem rw,noatime,attr2,inode64,nobarrier,nofail 0 2" >> /etc/fstab - else - mkfs.ext4 -i 2048 -I 512 -J size=400 -Odir_index,filetype /dev/$raidDevice - sleep 5 - tune2fs -o user_xattr /dev/$raidDevice - export ext4uuid="UUID=`blkid |grep dev/$raidDevice |cut -d " " -f 2 |cut -c 7-42`" - echo "$ext4uuid $mountPoint $filesystem noatime,nodiratime,nobarrier,nofail 0 2" >> /etc/fstab - fi - - sleep 10 - mount -a - fi -} - -setup_single_disk() -{ - mountPoint="$1" - filesystem="$2" - device="$3" - - fdisk -l /dev/$device || break - fdisk /dev/$device << EOF -n -p -1 - - -p -w -EOF - - if [ "$filesystem" == "xfs" ]; then - mkfs -t $filesystem /dev/$device - echo "/dev/$device $mountPoint $filesystem rw,noatime,attr2,inode64,nobarrier,nofail 0 2" >> /etc/fstab - else - mkfs.ext4 -F -i 2048 -I 512 -J size=400 -Odir_index,filetype /dev/$device - sleep 5 - tune2fs -o user_xattr /dev/$device - echo "/dev/$device $mountPoint $filesystem noatime,nodiratime,nobarrier,nofail 0 2" >> /etc/fstab - fi - - sleep 10 - - mount /dev/$device $mountPoint -} - -setup_disks() -{ - # Dump the current disk config for debugging - fdisk -l - - # Dump the scsi config - lsscsi - - # Get the root/OS disk so we know which device it uses and can ignore it later - rootDevice=`mount | grep "on / type" | awk '{print $1}' | sed 's/[0-9]//g'` - - # Get the TMP disk so we know which device and can ignore it later - tmpDevice=`mount | grep "on /mnt/resource type" | awk 
'{print $1}' | sed 's/[0-9]//g'` - - # Get the data disk sizes from fdisk, we ignore the disks above - dataDiskSize=`fdisk -l | grep '^Disk /dev/' | grep -v $rootDevice | grep -v $tmpDevice | awk '{print $3}' | sort -n -r | tail -1` - - # Compute number of disks - nbDisks=`fdisk -l | grep '^Disk /dev/' | grep -v $rootDevice | grep -v $tmpDevice | wc -l` - echo "nbDisks=$nbDisks" - - dataDevices="`fdisk -l | grep '^Disk /dev/' | grep $dataDiskSize | awk '{print $2}' | awk -F: '{print $1}' | sort | head -$nbDisks | tr '\n' ' ' | sed 's|/dev/||g'`" - - mkdir -p $NFS_MOUNT_POINT - - - if [ "$nbDisks" -eq "1" ]; then - setup_single_disk $NFS_MOUNT_POINT "ext4" "$dataDevices" - elif [ "$nbDisks" -gt "1" ]; then - setup_data_disks $NFS_MOUNT_POINT "xfs" "$dataDevices" "md10" - fi - - mkdir -p $NFS_APPS - mkdir -p $NFS_DATA - mkdir -p $NFS_HOME - mkdir -p $NFS_SCRATCH - chmod 777 $NFS_APPS - chmod 777 $NFS_DATA - chmod 777 $NFS_HOME - chmod 777 $NFS_SCRATCH - - ln -s $NFS_SCRATCH /scratch - - echo "$NFS_APPS *(rw,sync,no_root_squash)" >> /etc/exports - echo "$NFS_DATA *(rw,sync,no_root_squash)" >> /etc/exports - echo "$NFS_HOME *(rw,sync,no_root_squash)" >> /etc/exports - echo "$NFS_SCRATCH *(rw,sync,no_root_squash)" >> /etc/exports - - exportfs - exportfs -a - exportfs -} - -tune_nfs() -{ - cores=$(grep processor /proc/cpuinfo | wc -l) - nfs_proc=$(($cores * 4)) - replace="s/#RPCNFSDCOUNT=16/RPCNFSDCOUNT=$nfs_proc/g" - sed -i -e "$replace" /etc/sysconfig/nfs - - grep RPCNFSDCOUNT /etc/sysconfig/nfs -} - -systemctl enable rpcbind -systemctl enable nfs-server -systemctl enable nfs-lock -systemctl enable nfs-idmap -systemctl enable nfs - -systemctl start rpcbind -systemctl start nfs-server -systemctl start nfs-lock -systemctl start nfs-idmap -systemctl start nfs - -setup_disks -tune_nfs -systemctl restart nfs-server - -ln -s /share/apps /apps -ln -s /share/data /data - -df - - diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsclient.sh 
b/examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsclient.sh deleted file mode 100755 index fd037df76..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsclient.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -set -e -# arg: $1 = pbs_server -pbs_server=$1 - -if [ "$(rpm -qa pbspro-execution)" = "" ];then - yum install -y pbspro-execution-19.1.1-0.x86_64.rpm - - sed -i "s/CHANGE_THIS_TO_PBS_PRO_SERVER_HOSTNAME/${pbs_server}/g" /etc/pbs.conf - sed -i "s/CHANGE_THIS_TO_PBS_PRO_SERVER_HOSTNAME/${pbs_server}/g" /var/spool/pbs/mom_priv/config - sed -i "s/^if /#if /g" /opt/pbs/lib/init.d/limits.pbs_mom - sed -i "s/^fi/#fi /g" /opt/pbs/lib/init.d/limits.pbs_mom - systemctl enable pbs - systemctl start pbs - - # Retrieve the VMSS name to be used as the pool name for multiple VMSS support - poolName=$(curl -s -H Metadata:true "http://169.254.169.254/metadata/instance?api-version=2018-10-01" | jq -r '.compute.vmScaleSetName') - /opt/pbs/bin/qmgr -c "c n $(hostname) resources_available.pool_name='$poolName'" - -else - echo "PBS client was already installed" -fi diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsdownload.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsdownload.sh deleted file mode 100755 index b4317516b..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsdownload.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -filename=pbspro_19.1.1.centos7.zip - -if [ ! 
-f "$filename" ];then - wget -q https://github.com/PBSPro/pbspro/releases/download/v19.1.1/$filename - unzip $filename -fi - diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsserver.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsserver.sh deleted file mode 100755 index 14ee54d1a..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/scripts/pbsserver.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -set -e -admin_user=$(whoami) - -if [ "$(rpm -qa pbspro-server)" = "" ];then - yum install -y pbspro-server-19.1.1-0.x86_64.rpm - systemctl enable pbs - systemctl start pbs - /opt/pbs/bin/qmgr -c "s s managers += ${admin_user}@*" - /opt/pbs/bin/qmgr -c 's s flatuid=t' - /opt/pbs/bin/qmgr -c 's s job_history_enable=t' - /opt/pbs/bin/qmgr -c 'c r pool_name type=string,flag=h' - - # Update the sched_config file to schedule jobs that request pool_name - sed -i "s/^resources: \"ncpus,/resources: \"ncpus, pool_name,/g" /var/spool/pbs/sched_priv/sched_config - systemctl restart pbs -else - echo "PBSPro already installed" -fi diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/rebootlustre.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/rebootlustre.sh deleted file mode 100755 index 2d33c180b..000000000 --- a/examples/lustre_rdma_avs/azhpc_install_config/scripts/rebootlustre.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -vmlist=$1 -osscount=$2 -totalcount=$((osscount+2)) -index=0 -#prep headnode -cp -r /share/home/hpcuser/.ssh /root/ -echo "vmlist is ${vmlist[@]}" - -#needs to be done sequentially -for vmname in ${vmlist[@]}; do - echo "Rebooting $vmname" - ssh hpcuser@${vmname} "sudo reboot 2>/dev/null; exit 2>/dev/null" 2>/dev/null - index=$((index+1)) -done -exit 0 diff --git a/examples/lustre_rdma_avs/azhpc_install_config/scripts/waitforreboot.sh b/examples/lustre_rdma_avs/azhpc_install_config/scripts/waitforreboot.sh deleted file mode 100755 index 73411ca61..000000000 --- 
a/examples/lustre_rdma_avs/azhpc_install_config/scripts/waitforreboot.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -sleep 180 #enough time for node reboot to continue process diff --git a/examples/lustre_rdma_avs/config.json b/examples/lustre_rdma_avs/config.json deleted file mode 100644 index d241c4631..000000000 --- a/examples/lustre_rdma_avs/config.json +++ /dev/null @@ -1,278 +0,0 @@ -{ - "location": "variables.location", - "resource_group": "variables.resource_group", - "install_from": "headnode", - "admin_user": "hpcadmin", - "vnet": { - "name": "hpcvnet", - "address_prefix": "10.2.0.0/20", - "subnets": { - "compute": "10.2.0.0/22", - "storage": "10.2.4.0/24" - } - }, - "variables": { - "location": "", - "image": "OpenLogic:CentOS:7.6:latest", - "lustreimage": "OpenLogic:CentOS:7.6:latest", - "hpcimage": "OpenLogic:CentOS:7.6:latest", - "compute_instances": 2, - "lustre_instances": 2, - "low_priority": false, - "storage_account": "", - "storage_key": "sakey.{{variables.storage_account}}", - "storage_container": "", - "log_analytics_lfs_name": "", - "la_resourcegroup": "", - "la_name": "", - "log_analytics_workspace": "laworkspace.{{variables.la_resourcegroup}}.{{variables.la_name}}", - "log_analytics_key": "lakey.{{variables.la_resourcegroup}}.{{variables.la_name}}", - "lustre_version": "2.10", - "lustre_avset": "{{variables.resource_group}}avset", - "lustre_mount": "/lustre" - }, - "resources": { - "headnode": { - "type": "vm", - "vm_type": "Standard_HB120rs_v2", - "accelerated_networking": false, - "public_ip": true, - "image": "variables.image", - "subnet": "compute", - "tags": [ - "disable-selinux", - "cndefault", - "lfsrepo", - "lfsclient", - "lfsazimport", - "localuser", - "pbsserver", - "loginnode", - "rebootlustre", - "nfsserver", - "allnodes" - ] - }, - "compute": { - "type": "vm", - "vm_type": "Standard_HB120rs_v2", - "instances": "variables.compute_instances", - "availability_set": "variables.lustre_avset", - "low_priority": 
"variables.low_priority", - "accelerated_networking": false, - "image": "variables.hpcimage", - "subnet": "storage", - "tags": [ - "cndefault", - "lfsrepo", - "lfsclient", - "localuser", - "pbsclient", - "nfsclient", - "disable-selinux", - "allnodes" - ] - }, - "lfsmaster": { - "type": "vm", - "vm_type": "Standard_HB120rs_v2", - "availability_set": "variables.lustre_avset", - "accelerated_networking": false, - "image": "variables.lustreimage", - "subnet": "storage", - "tags": [ - "cndefault", - "lustre", - "lfsmaster", - "lfsrepo", - "localuser", - "nfsclient", - "disable-selinux", - "lfsloganalytics", - "allnodes" - ] - }, - "lustre": { - "type": "vm", - "vm_type": "Standard_HB120rs_v2", - "instances": "variables.lustre_instances", - "availability_set": "variables.lustre_avset", - "accelerated_networking": false, - "image": "variables.lustreimage", - "subnet": "storage", - "tags": [ - "cndefault", - "lfsrepo", - "localuser", - "nfsclient", - "lustre", - "ossnode", - "disable-selinux", - "lfsloganalytics", - "allnodes" - ] - } - }, - "install": [ - { - "script": "disable-selinux.sh", - "tag": "disable-selinux", - "sudo": true - }, - { - "script": "cndefault.sh", - "tag": "cndefault", - "sudo": true - }, - { - "script": "nfsserver.sh", - "tag": "nfsserver", - "sudo": true - }, - { - "script": "nfsclient.sh", - "args": [ - "$( Note: The HC nodes are used for the cluster, although this node type may be easily changed by use of the vm_type variable for lustre inside config.json. 
- -The configuration file requires the following variables to be set: - -| Variable | Description | -|-------------------------|----------------------------------------------| -| location | The locaton of the project | -| resource_group | The resource group for the project | -| storage_account | The storage account for HSM | -| storage_key | The storage key for HSM | -| storage_container | The container to use for HSM | -| log_analytics_lfs_name | The lustre filesystem name for Log Analytics | -| la_resourcegroup | The resource group for Log Analytics | -| la_name | The Log Analytics Workspace name | - -> Note: you can remove log anaytics and/or HSM from the config file if not required. - -> Note: Key Vault should be used for the keys to keep them out of the config files. diff --git a/examples/lustre_rdma_avs/scripts/installOFED.sh b/examples/lustre_rdma_avs/scripts/installOFED.sh deleted file mode 100755 index c267519fc..000000000 --- a/examples/lustre_rdma_avs/scripts/installOFED.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -yum -y groupinstall --skip-broken "Infiniband Support" 2>/dev/null -echo "done installing Infiniband" -exit 0 diff --git a/examples/lustre_rdma_avs/scripts/lfsclient.sh b/examples/lustre_rdma_avs/scripts/lfsclient.sh deleted file mode 100755 index 5fc8f9a62..000000000 --- a/examples/lustre_rdma_avs/scripts/lfsclient.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash - -# arg: $1 = lfsserver -# arg: $2 = mount point (default: /lustre) -master=$1 -lfs_mount=${2:-/lustre} - -cp -r /share/home/hpcuser/.ssh /root/ - -#Include the correct rdma options -cat >/etc/modprobe.d/lustre.conf<> /etc/fstab -mount -a -chmod 777 $lfs_mount diff --git a/examples/lustre_rdma_avs/scripts/lfsmaster.sh b/examples/lustre_rdma_avs/scripts/lfsmaster.sh deleted file mode 100755 index d2dcdb02e..000000000 --- a/examples/lustre_rdma_avs/scripts/lfsmaster.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -# arg: $1 = device (e.g. 
L=/dev/sdb Lv2=/dev/nvme0n1) -device=$1 - -# this will only install MDS on first node in a scaleset -echo "pssh_nodenum is $PSSH_NODENUM" - -cp -r /share/home/hpcuser/.ssh /root/ - -#Include the correct rdma options -cat >/etc/modprobe.d/lustre.conf<> /etc/fstab - mount -a - - # set up hsm - lctl set_param -P mdt.*-MDT0000.hsm_control=enabled - lctl set_param -P mdt.*-MDT0000.hsm.default_archive_id=1 - lctl set_param mdt.*-MDT0000.hsm.max_requests=128 - - # allow any user and group ids to write - lctl set_param mdt.*-MDT0000.identity_upcall=NONE - - diff --git a/examples/lustre_rdma_avs/scripts/lfsoss.sh b/examples/lustre_rdma_avs/scripts/lfsoss.sh deleted file mode 100755 index 8f39aac68..000000000 --- a/examples/lustre_rdma_avs/scripts/lfsoss.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -# arg: $1 = lfsmaster -# arg: $2 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) -master=$1 -device=$2 - -cp -r /share/home/hpcuser/.ssh /root/ - -index=$(($PSSH_NODENUM + 1)) -myuser="hpcuser" - -capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") -masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) - - lnetctl net add --net o2ib --if ib0 #double check - mkfs.lustre \ - --fsname=LustreFS \ - --backfstype=ldiskfs \ - --reformat \ - --ost \ - --mgsnode="${masterib}@o2ib" \ - --index=$index \ - --mountfsoptions="errors=remount-ro" \ - $device - - -mkdir /mnt/oss -echo "$device /mnt/oss lustre noatime,nodiratime,nobarrier 0 2" >> /etc/fstab -mount -a diff --git a/examples/lustre_rdma_avs/scripts/lfsrepo.sh b/examples/lustre_rdma_avs/scripts/lfsrepo.sh deleted file mode 100755 index db1eeb165..000000000 --- a/examples/lustre_rdma_avs/scripts/lfsrepo.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -lustre_version=${1-2.10} - -cat << EOF >/etc/yum.repos.d/LustrePack.repo -[lustreserver] -name=lustreserver -baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/server/ -enabled=1 -gpgcheck=0 - -[e2fs] -name=e2fs 
-baseurl=https://downloads.whamcloud.com/public/e2fsprogs/latest/el7/ -enabled=1 -gpgcheck=0 - -[lustreclient] -name=lustreclient -baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/client/ -enabled=1 -gpgcheck=0 -EOF - -#Include the correct rdma options -#cat >/etc/modprobe.d/lustre.conf</dev/null - diff --git a/examples/lustre_rdma_avs/scripts/lustreinstall2.sh b/examples/lustre_rdma_avs/scripts/lustreinstall2.sh deleted file mode 100755 index 60f3e759e..000000000 --- a/examples/lustre_rdma_avs/scripts/lustreinstall2.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -yum -y --nogpgcheck --enablerepo=lustreserver install kmod-lustre kmod-lustre-osd-ldiskfs lustre-osd-ldiskfs-mount lustre lustre-resource-agents -modprobe -v lustre - -sed -i 's/ResourceDisk\.Format=y/ResourceDisk.Format=n/g' /etc/waagent.conf -sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf - -weak-modules --add-kernel --no-initramfs -systemctl enable lustre -umount /mnt/resource diff --git a/examples/lustre_rdma_avs/scripts/lustrenetwork.sh b/examples/lustre_rdma_avs/scripts/lustrenetwork.sh deleted file mode 100755 index f95d33864..000000000 --- a/examples/lustre_rdma_avs/scripts/lustrenetwork.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf -service waagent restart -service rdma start -modprobe lnet -lctl network configure -lnetctl net add --net o2ib --if ib0 #need this to come up every time -sleep 5 diff --git a/examples/lustre_rdma_avs/scripts/rebootlustre.sh b/examples/lustre_rdma_avs/scripts/rebootlustre.sh deleted file mode 100755 index 2d33c180b..000000000 --- a/examples/lustre_rdma_avs/scripts/rebootlustre.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -vmlist=$1 -osscount=$2 -totalcount=$((osscount+2)) -index=0 -#prep headnode -cp -r /share/home/hpcuser/.ssh /root/ -echo "vmlist is ${vmlist[@]}" - -#needs to be done sequentially -for vmname in ${vmlist[@]}; do - echo 
"Rebooting $vmname" - ssh hpcuser@${vmname} "sudo reboot 2>/dev/null; exit 2>/dev/null" 2>/dev/null - index=$((index+1)) -done -exit 0 diff --git a/examples/lustre_rdma_avs/writeup b/examples/lustre_rdma_avs/writeup deleted file mode 100644 index 3eb58e3c6..000000000 --- a/examples/lustre_rdma_avs/writeup +++ /dev/null @@ -1,20 +0,0 @@ -- lustre-ipoib - This is a created implementation of Lustre using ip over infiniband (IPoIB) -- lustre-rdma - This is a created implementation of Lustre using native Remote Direct Memory Access (RDMA) - -Changes to files to enable Infiniband functionality: -lfsmaster.sh -lfsoss.sh -lfsclient.sh -lfsrepo.sh -lfspkgs.sh - -Addition for the installation of new Mellanox OFED (MOFED) for the Lustre kernel : installMOFED.sh - -Addition for correct drives placement of OSSes : installdrives.sh -*installdrives.sh takes about 15 minutes to run so please either remote this entity, or wait it out. - -Additions for correct Lustre kernel : -lustreinstall1.sh -lustreinstall2.sh - -Addition for pause after MDS/OSS reboot : waitforreboot.sh diff --git a/experimental/lustre_rdma_avs/writeup b/experimental/lustre_rdma_avs/writeup index 3eb58e3c6..93562dd26 100644 --- a/experimental/lustre_rdma_avs/writeup +++ b/experimental/lustre_rdma_avs/writeup @@ -10,9 +10,6 @@ lfspkgs.sh Addition for the installation of new Mellanox OFED (MOFED) for the Lustre kernel : installMOFED.sh -Addition for correct drives placement of OSSes : installdrives.sh -*installdrives.sh takes about 15 minutes to run so please either remote this entity, or wait it out. 
- Additions for correct Lustre kernel : lustreinstall1.sh lustreinstall2.sh From 5f91e16e31b0ef7e1d3f4600d013e71522abac4e Mon Sep 17 00:00:00 2001 From: Xavier Pillons Date: Tue, 7 Jul 2020 10:47:52 +0200 Subject: [PATCH 26/36] remove temp files --- .../azhpc_install_config/hostlists/compute | 2 - .../azhpc_install_config/hostlists/headnode | 1 - .../azhpc_install_config/hostlists/lfsmaster | 1 - .../azhpc_install_config/hostlists/linux | 6 - .../azhpc_install_config/hostlists/lustre | 2 - .../hostlists/tags/cndefault | 6 - .../hostlists/tags/disable-selinux | 6 - .../hostlists/tags/lfsazimport | 1 - .../hostlists/tags/lfsclient | 3 - .../hostlists/tags/lfsloganalytics | 3 - .../hostlists/tags/lfsmaster | 1 - .../hostlists/tags/lfsrepo | 6 - .../hostlists/tags/localuser | 6 - .../hostlists/tags/loginnode | 1 - .../hostlists/tags/lustre | 3 - .../hostlists/tags/nfsclient | 5 - .../hostlists/tags/nfsserver | 1 - .../hostlists/tags/ossnode | 2 - .../hostlists/tags/pbsclient | 2 - .../hostlists/tags/pbsserver | 1 - .../hostlists/tags/rebootlustre | 1 - .../azhpc_install_config/hpcadmin_id_rsa | 27 --- .../azhpc_install_config/hpcadmin_id_rsa.pub | 1 - .../install/00_install_node_setup.sh | 48 ---- .../install/01_disable-selinux.sh | 18 -- .../install/02_cndefault.sh | 18 -- .../install/03_nfsserver.sh | 18 -- .../install/04_nfsclient.sh | 18 -- .../install/05_localuser.sh | 18 -- .../install/06_lfsrepo.sh | 18 -- .../install/07_lustreinstall1.sh | 18 -- .../install/08_rebootlustre.sh | 18 -- .../install/09_waitforreboot.sh | 7 - .../install/10_installOFED.sh | 18 -- .../install/11_lustreinstall2.sh | 18 -- .../install/12_lustrenetwork.sh | 18 -- .../install/13_lfsmaster.sh | 18 -- .../azhpc_install_config/install/14_lfsoss.sh | 18 -- .../azhpc_install_config/install/15_lfshsm.sh | 18 -- .../install/16_lfsclient.sh | 18 -- .../install/17_lfsimport.sh | 18 -- .../install/18_lfsloganalytics.sh | 18 -- .../install/19_pbsdownload.sh | 18 -- .../install/20_pbsserver.sh | 19 
-- .../install/21_pbsclient.sh | 19 -- .../azhpc_install_config/scripts/cndefault.sh | 23 -- .../scripts/disable-selinux.sh | 6 - .../scripts/installOFED.sh | 4 - .../azhpc_install_config/scripts/lfsclient.sh | 48 ---- .../azhpc_install_config/scripts/lfshsm.sh | 95 -------- .../azhpc_install_config/scripts/lfsimport.sh | 31 --- .../scripts/lfsloganalytics.sh | 31 --- .../azhpc_install_config/scripts/lfsmaster.sh | 31 --- .../azhpc_install_config/scripts/lfsoss.sh | 30 --- .../azhpc_install_config/scripts/lfsrepo.sh | 27 --- .../azhpc_install_config/scripts/localuser.sh | 40 ---- .../scripts/lustreinstall1.sh | 8 - .../scripts/lustreinstall2.sh | 10 - .../scripts/lustrenetwork.sh | 9 - .../azhpc_install_config/scripts/nfsclient.sh | 34 --- .../azhpc_install_config/scripts/nfsserver.sh | 212 ------------------ .../azhpc_install_config/scripts/pbsclient.sh | 22 -- .../scripts/pbsdownload.sh | 9 - .../azhpc_install_config/scripts/pbsserver.sh | 19 -- .../scripts/rebootlustre.sh | 16 -- .../scripts/waitforreboot.sh | 2 - 66 files changed, 1212 deletions(-) delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/compute delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/headnode delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/lfsmaster delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/linux delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/lustre delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/cndefault delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/disable-selinux delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsazimport delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsclient delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsloganalytics delete mode 
100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsmaster delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsrepo delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/localuser delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/loginnode delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lustre delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsclient delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsserver delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/ossnode delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsclient delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsserver delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/rebootlustre delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa delete mode 100644 experimental/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa.pub delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/00_install_node_setup.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/01_disable-selinux.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/02_cndefault.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/03_nfsserver.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/04_nfsclient.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/05_localuser.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/06_lfsrepo.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/07_lustreinstall1.sh delete mode 100755 
experimental/lustre_rdma_avs/azhpc_install_config/install/08_rebootlustre.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/09_waitforreboot.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/10_installOFED.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/11_lustreinstall2.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/12_lustrenetwork.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/13_lfsmaster.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/14_lfsoss.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/15_lfshsm.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/16_lfsclient.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/17_lfsimport.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/18_lfsloganalytics.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/19_pbsdownload.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/20_pbsserver.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/install/21_pbsclient.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/cndefault.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/disable-selinux.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/installOFED.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsclient.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfshsm.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsimport.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsloganalytics.sh delete mode 100755 
experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsmaster.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsoss.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsrepo.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/localuser.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall1.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall2.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustrenetwork.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/nfsclient.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/nfsserver.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsclient.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsdownload.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsserver.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/rebootlustre.sh delete mode 100755 experimental/lustre_rdma_avs/azhpc_install_config/scripts/waitforreboot.sh diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/compute b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/compute deleted file mode 100644 index 232110d4a..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/compute +++ /dev/null @@ -1,2 +0,0 @@ -compute0001 -compute0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/headnode b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/headnode deleted file mode 100644 index 1a9798066..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/headnode +++ /dev/null @@ -1 +0,0 @@ -headnode diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/lfsmaster 
b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/lfsmaster deleted file mode 100644 index a47bf87fe..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/lfsmaster +++ /dev/null @@ -1 +0,0 @@ -lfsmaster diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/linux b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/linux deleted file mode 100644 index 337053fb6..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/linux +++ /dev/null @@ -1,6 +0,0 @@ -headnode -compute0001 -compute0002 -lfsmaster -lustre0001 -lustre0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/lustre b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/lustre deleted file mode 100644 index b8f9b2061..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/lustre +++ /dev/null @@ -1,2 +0,0 @@ -lustre0001 -lustre0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/cndefault b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/cndefault deleted file mode 100644 index 337053fb6..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/cndefault +++ /dev/null @@ -1,6 +0,0 @@ -headnode -compute0001 -compute0002 -lfsmaster -lustre0001 -lustre0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/disable-selinux b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/disable-selinux deleted file mode 100644 index 337053fb6..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/disable-selinux +++ /dev/null @@ -1,6 +0,0 @@ -headnode -compute0001 -compute0002 -lfsmaster -lustre0001 -lustre0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsazimport b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsazimport deleted file mode 100644 index 1a9798066..000000000 --- 
a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsazimport +++ /dev/null @@ -1 +0,0 @@ -headnode diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsclient b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsclient deleted file mode 100644 index 8af893f49..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsclient +++ /dev/null @@ -1,3 +0,0 @@ -headnode -compute0001 -compute0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsloganalytics b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsloganalytics deleted file mode 100644 index 6453c2e60..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsloganalytics +++ /dev/null @@ -1,3 +0,0 @@ -lfsmaster -lustre0001 -lustre0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsmaster b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsmaster deleted file mode 100644 index a47bf87fe..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsmaster +++ /dev/null @@ -1 +0,0 @@ -lfsmaster diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsrepo b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsrepo deleted file mode 100644 index 337053fb6..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lfsrepo +++ /dev/null @@ -1,6 +0,0 @@ -headnode -compute0001 -compute0002 -lfsmaster -lustre0001 -lustre0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/localuser b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/localuser deleted file mode 100644 index 337053fb6..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/localuser +++ /dev/null @@ -1,6 +0,0 @@ -headnode -compute0001 -compute0002 -lfsmaster -lustre0001 -lustre0002 diff 
--git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/loginnode b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/loginnode deleted file mode 100644 index 1a9798066..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/loginnode +++ /dev/null @@ -1 +0,0 @@ -headnode diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lustre b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lustre deleted file mode 100644 index 6453c2e60..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/lustre +++ /dev/null @@ -1,3 +0,0 @@ -lfsmaster -lustre0001 -lustre0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsclient b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsclient deleted file mode 100644 index 748d1c5dc..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsclient +++ /dev/null @@ -1,5 +0,0 @@ -compute0001 -compute0002 -lfsmaster -lustre0001 -lustre0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsserver b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsserver deleted file mode 100644 index 1a9798066..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/nfsserver +++ /dev/null @@ -1 +0,0 @@ -headnode diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/ossnode b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/ossnode deleted file mode 100644 index b8f9b2061..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/ossnode +++ /dev/null @@ -1,2 +0,0 @@ -lustre0001 -lustre0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsclient b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsclient deleted file mode 100644 index 232110d4a..000000000 --- 
a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsclient +++ /dev/null @@ -1,2 +0,0 @@ -compute0001 -compute0002 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsserver b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsserver deleted file mode 100644 index 1a9798066..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/pbsserver +++ /dev/null @@ -1 +0,0 @@ -headnode diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/rebootlustre b/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/rebootlustre deleted file mode 100644 index 1a9798066..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hostlists/tags/rebootlustre +++ /dev/null @@ -1 +0,0 @@ -headnode diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa b/experimental/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa deleted file mode 100644 index 7846d2b39..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa +++ /dev/null @@ -1,27 +0,0 @@ ------BEGIN RSA PRIVATE KEY----- -MIIEpAIBAAKCAQEA1JCjVUGcKCYN3RCERznjr7e1Chsf+DG30uluSXk3I/6nesto -5gLGfKiTjeHWvX5tqFITAA84r140AgsIcUHEpaWwk06QIVUTj6kDHbubP0i1V2EY -2sa6cm6hPQmsFIiOK578BLuv/Zda/arVJ1dq1q+1t0tt84TTCrsROszNw8t9Kc3Z -Gn2SY7F52Z8nttmN7OEsfUtg6K6f/5IwbJb7U8b/0jF6yWDpzrmqN33BJfrZ1VWs -jswhblxJZ0juAU/oAB0xtOzqM2vwUZy9FmcfRPo/U1gM4DUG1h37oWWkoQLhgURu -p3Lztqq8msXXsnk3ZnIkMWWNJ429fN2ui751QQIDAQABAoIBAGYQfRy2wDBW9Vks -UReSKE17PCZ6F8Oou8c95oLI/Tz/TZOcj+XBd2Tr3M3HnsCmMCkeH5lrtaAe74H7 -ojYfijivcjWJB5O5sgbM9H4WUtj0JH6sVK7XtTa1AB66wjGpz/oKAKCVLk/pmPss -R+T4CIjFHc/BHC5NnLgOUpuVM0fLUUUF8NmIvT6K0P4j7GZx12d1TDkqo+/rd1ku -EOuCjl8Q4bTO0qtJEXy2dmn38m6QGNS765j8gQ21wWY+Q7EX4JaJ+oO2ZgGuyYul -Cu+AFlCR4SkOok0DN6RG4KQ7Sly57HrZWwLI46FXmjiJqE/7wNvMwuHdUmnVbkoY -v04fxAECgYEA8ii6KMsPIxnMSCBpsRoFSOcPdSyoFyhMCCuiR9liCGRG4wz4u1i6 
-ZFal1+d/rX6qxCTIZxvU8zn54Qsrr+44zV++4+Sd/nhrc+qWOxGggAscbYNG3w2g -GTGinERFPRs5iGmdJ0n+uy/TSPe5t0qH85AdKcU47mfrNb3Q08rEfxECgYEA4Lbj -zkCUa4UN6CP36FtOUNtpnrn7dxfDNpcS8CTt/oob2OifOUGhgPbCio5at7zE8cH0 -hWrUWFPDfBRliGdG/ZzdmIOaC0MU9eQG4JxkblgYccKpcYsTq45NDyhQJ0lbBjRG -Sp42HOnvZ8p0m9przrnQF22Bvr5E+VF1wVk18zECgYEA7pI9RS84jIZAAdcdCYPv -LPGnAvOp7paewXXrfQmnUUkppUsESd6SU4QiA2FpIk4mgvMSFLMQy0eU7KeKtNrn -Tz5C3FZBaZDNm/fDZhJpo3xO13179wh/cBK8d2OzKw6FUeVrFGgL8/KcH8kfSHq/ -EbAraxmIiygKTHnjIKUljWECgYAQxhYjIzbw/7GWDnlG4unppzcvHfrjXOa5gHVt -b5REV9LUUijwgTGpCsJizVWAOZsJ4Mx72QmYvkftTyh1EiB+deMkq04oYQ2DfU32 -HjZw9ip882bqjtMdDzY5V20EQbmFsQk+MKkhZ2Tzfm1N5PP/LmeWGBqDPnivk6ES -mbIpQQKBgQDqnc9KivmjPIHz2BpJh8icWkdvZ2WzycI3Sly6Suh0E6Q+epMTXUm3 -21TIEkkAlBYXkHs0ZhL7l7jzv5yYSGB8ZNDzk+UquE5OuxMwWsd3trqyJ3LMj9C5 -hV6JTHqNSw8xubCES0oRgJkcCedoQ0qxMwypnJarWPh/LSVCu3BZ2A== ------END RSA PRIVATE KEY----- diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa.pub b/experimental/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa.pub deleted file mode 100644 index 20776c3a0..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/hpcadmin_id_rsa.pub +++ /dev/null @@ -1 +0,0 @@ -ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDUkKNVQZwoJg3dEIRHOeOvt7UKGx/4MbfS6W5JeTcj/qd6y2jmAsZ8qJON4da9fm2oUhMADzivXjQCCwhxQcSlpbCTTpAhVROPqQMdu5s/SLVXYRjaxrpybqE9CawUiI4rnvwEu6/9l1r9qtUnV2rWr7W3S23zhNMKuxE6zM3Dy30pzdkafZJjsXnZnye22Y3s4Sx9S2Dorp//kjBslvtTxv/SMXrJYOnOuao3fcEl+tnVVayOzCFuXElnSO4BT+gAHTG07Ooza/BRnL0WZx9E+j9TWAzgNQbWHfuhZaShAuGBRG6ncvO2qryaxdeyeTdmciQxZY0njb183a6LvnVB diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/00_install_node_setup.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/00_install_node_setup.sh deleted file mode 100755 index d5e1850c6..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/00_install_node_setup.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -cd "$( dirname "${BASH_SOURCE[0]}" )/.." 
- -tag=linux - -if [ ! -f "hostlists/$tag" ]; then - echo "no hostlist ($tag), exiting" - exit 0 -fi - -# wait for DNS to update for all hostnames -for h in $(/dev/null 2>&1; do - echo "Waiting for host - $h (sleeping for 5 seconds)" - sleep 5 - done -done - -if [ "$1" != "" ]; then - tag=tags/$1 -else - sudo yum install -y epel-release > install/00_install_node_setup.log 2>&1 - sudo yum install -y pssh nc >> install/00_install_node_setup.log 2>&1 - - # setting up keys - cat < ~/.ssh/config - Host * - StrictHostKeyChecking no - UserKnownHostsFile /dev/null - LogLevel ERROR -EOF - cp hpcadmin_id_rsa.pub ~/.ssh/id_rsa.pub - cp hpcadmin_id_rsa ~/.ssh/id_rsa - chmod 600 ~/.ssh/id_rsa - chmod 644 ~/.ssh/config - chmod 644 ~/.ssh/id_rsa.pub - -fi - -pssh -p 50 -t 0 -i -h hostlists/$tag 'rpm -q rsync || sudo yum install -y rsync' >> install/00_install_node_setup.log 2>&1 - -prsync -p 50 -a -h hostlists/$tag ~/azhpc_install_config ~ >> install/00_install_node_setup.log 2>&1 -prsync -p 50 -a -h hostlists/$tag ~/.ssh ~ >> install/00_install_node_setup.log 2>&1 - -pssh -p 50 -t 0 -i -h hostlists/$tag 'echo "AcceptEnv PSSH_NODENUM PSSH_HOST" | sudo tee -a /etc/ssh/sshd_config' >> install/00_install_node_setup.log 2>&1 -pssh -p 50 -t 0 -i -h hostlists/$tag 'sudo systemctl restart sshd' >> install/00_install_node_setup.log 2>&1 -pssh -p 50 -t 0 -i -h hostlists/$tag "echo 'Defaults env_keep += \"PSSH_NODENUM PSSH_HOST\"' | sudo tee -a /etc/sudoers" >> install/00_install_node_setup.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/01_disable-selinux.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/01_disable-selinux.sh deleted file mode 100755 index aff9f6abd..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/01_disable-selinux.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-disable-selinux} - -if [ ! 
-f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/disable-selinux.sh" >> install/01_disable-selinux.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/02_cndefault.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/02_cndefault.sh deleted file mode 100755 index 89df21b38..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/02_cndefault.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-cndefault} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/cndefault.sh" >> install/02_cndefault.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/03_nfsserver.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/03_nfsserver.sh deleted file mode 100755 index 9fe8fc049..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/03_nfsserver.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-nfsserver} - -if [ ! 
-f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/nfsserver.sh" >> install/03_nfsserver.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/04_nfsclient.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/04_nfsclient.sh deleted file mode 100755 index 3ef1d7dd2..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/04_nfsclient.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-nfsclient} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/nfsclient.sh '$(> install/04_nfsclient.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/05_localuser.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/05_localuser.sh deleted file mode 100755 index 547517af7..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/05_localuser.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-localuser} - -if [ ! 
-f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/localuser.sh '$(> install/05_localuser.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/06_lfsrepo.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/06_lfsrepo.sh deleted file mode 100755 index c51d1a7bf..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/06_lfsrepo.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-lfsrepo} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsrepo.sh '2.10'" >> install/06_lfsrepo.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/07_lustreinstall1.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/07_lustreinstall1.sh deleted file mode 100755 index 9c9e725e1..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/07_lustreinstall1.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-lustre} - -if [ ! 
-f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lustreinstall1.sh" >> install/07_lustreinstall1.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/08_rebootlustre.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/08_rebootlustre.sh deleted file mode 100755 index cafe2dccc..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/08_rebootlustre.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-rebootlustre} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/rebootlustre.sh '$(> install/08_rebootlustre.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/09_waitforreboot.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/09_waitforreboot.sh deleted file mode 100755 index e7f2585c1..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/09_waitforreboot.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." 
- -scripts/waitforreboot.sh >> install/09_waitforreboot.log 2>&1 - diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/10_installOFED.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/10_installOFED.sh deleted file mode 100755 index 0a9d5144c..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/10_installOFED.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-lustre} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/installOFED.sh" >> install/10_installOFED.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/11_lustreinstall2.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/11_lustreinstall2.sh deleted file mode 100755 index 415de3119..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/11_lustreinstall2.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-lustre} - -if [ ! 
-f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lustreinstall2.sh" >> install/11_lustreinstall2.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/12_lustrenetwork.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/12_lustrenetwork.sh deleted file mode 100755 index 210bc389e..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/12_lustrenetwork.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-lustre} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lustrenetwork.sh" >> install/12_lustrenetwork.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/13_lfsmaster.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/13_lfsmaster.sh deleted file mode 100755 index 5dead31c8..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/13_lfsmaster.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-lfsmaster} - -if [ ! 
-f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsmaster.sh '/dev/sdb'" >> install/13_lfsmaster.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/14_lfsoss.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/14_lfsoss.sh deleted file mode 100755 index 0b2f013ae..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/14_lfsoss.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-ossnode} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsoss.sh '$(head -n1 hostlists/tags/lfsmaster)' '/dev/nvme0n1'" >> install/14_lfsoss.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/15_lfshsm.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/15_lfshsm.sh deleted file mode 100755 index 479abe10e..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/15_lfshsm.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-lustre} - -if [ ! 
-f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfshsm.sh '$(head -n1 hostlists/tags/lustre)' 'lustretesting' 'TXOO/DhcJHGjjcNQ58f9SGCRF3RUuz3/UHaE70KbDAHhIkd38Ic5YXVlFcdxuytgk8pDg0sp5J9lCdOWr++sXA==' 'hsm' '2.10'" >> install/15_lfshsm.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/16_lfsclient.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/16_lfsclient.sh deleted file mode 100755 index e6e74eb5c..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/16_lfsclient.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-lfsclient} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsclient.sh '$(head -n1 hostlists/tags/lfsmaster)' '/lustre'" >> install/16_lfsclient.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/17_lfsimport.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/17_lfsimport.sh deleted file mode 100755 index c23853cd8..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/17_lfsimport.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-lfsazimport} - -if [ ! 
-f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsimport.sh 'lustretesting' 'TXOO/DhcJHGjjcNQ58f9SGCRF3RUuz3/UHaE70KbDAHhIkd38Ic5YXVlFcdxuytgk8pDg0sp5J9lCdOWr++sXA==' 'hsm' '/lustre' '2.10'" >> install/17_lfsimport.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/18_lfsloganalytics.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/18_lfsloganalytics.sh deleted file mode 100755 index d2a6ff976..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/18_lfsloganalytics.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-lfsloganalytics} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/lfsloganalytics.sh 'lfs' 'eb2e4150-e0fa-494d-8f60-291e27820eff' '0iKHSuo3C36gwxYYZSBIIVB8g5l7A1qztuF77oVwZlFV9iKqke/Jajc+qVLkt1SB7LNimpeb3Q++qerMtnZvuw=='" >> install/18_lfsloganalytics.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/19_pbsdownload.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/19_pbsdownload.sh deleted file mode 100755 index 9731feb81..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/19_pbsdownload.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-loginnode} - -if [ ! 
-f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; scripts/pbsdownload.sh" >> install/19_pbsdownload.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/20_pbsserver.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/20_pbsserver.sh deleted file mode 100755 index 0a2c0cf2d..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/20_pbsserver.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-pbsserver} - -if [ ! -f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pscp.pssh -p 50 -h hostlists/tags/$tag pbspro_19.1.1.centos7/pbspro-server-19.1.1-0.x86_64.rpm $(pwd) >> install/20_pbsserver.log 2>&1 -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/pbsserver.sh" >> install/20_pbsserver.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/install/21_pbsclient.sh b/experimental/lustre_rdma_avs/azhpc_install_config/install/21_pbsclient.sh deleted file mode 100755 index 1c354d17f..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/install/21_pbsclient.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -# expecting to be in $tmp_dir -cd "$( dirname "${BASH_SOURCE[0]}" )/.." - -tag=${1:-pbsclient} - -if [ ! 
-f "hostlists/tags/$tag" ]; then - echo " Tag is not assigned to any resource (not running)" - exit 0 -fi - -if [ "$(wc -l < hostlists/tags/$tag)" = "0" ]; then - echo " Tag does not contain any resources (not running)" - exit 0 -fi - -pscp.pssh -p 50 -h hostlists/tags/$tag pbspro_19.1.1.centos7/pbspro-execution-19.1.1-0.x86_64.rpm $(pwd) >> install/21_pbsclient.log 2>&1 -pssh -p 50 -t 0 -i -h hostlists/tags/$tag "cd azhpc_install_config; sudo scripts/pbsclient.sh '$(> install/21_pbsclient.log 2>&1 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/cndefault.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/cndefault.sh deleted file mode 100755 index 303ebac1b..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/cndefault.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# Script to be run on all compute nodes -if ! rpm -q epel-release; then - yum -y install epel-release -fi - -yum -y install git jq htop - -# change access to resource so that temp jobs can be written there -chmod 777 /mnt/resource - -# If running on Cycle -# - enable METADATA access -# - remove Jetpack convergence -# - Disable Fail2Ban service -# - Fix PBS limits -if [ -e $CYCLECLOUD_HOME/bin/jetpack ]; then - DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - $DIR/azhpc4cycle.sh enable_metada_access - $DIR/azhpc4cycle.sh disable_jetpack_converge - $DIR/azhpc4cycle.sh disable_fail2ban - $DIR/azhpc4cycle.sh fix_pbs_limits -fi diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/disable-selinux.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/disable-selinux.sh deleted file mode 100755 index 00c87bbf2..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/disable-selinux.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -# set to permissive for now (until reboot) -setenforce 0 -# prep to have selinux disabled after reboot -sed -i 's/SELINUX=.*$/SELINUX=disabled/g' /etc/selinux/config diff 
--git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/installOFED.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/installOFED.sh deleted file mode 100755 index c267519fc..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/installOFED.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -yum -y groupinstall --skip-broken "Infiniband Support" 2>/dev/null -echo "done installing Infiniband" -exit 0 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsclient.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsclient.sh deleted file mode 100755 index 26603bebd..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsclient.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -# arg: $1 = lfsserver -# arg: $2 = mount point (default: /lustre) -master=$1 -lfs_mount=${2:-/lustre} -mkdir ~/.ssh - -cp -r /share/home/hpcuser/.ssh ~/ - -#Include the correct rdma options -cat >/etc/modprobe.d/lustre.conf<> /etc/fstab -mount -a -chmod 777 $lfs_mount diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfshsm.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfshsm.sh deleted file mode 100755 index 0af1fc5e2..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfshsm.sh +++ /dev/null @@ -1,95 +0,0 @@ -#!/bin/bash - -# arg: $1 = lfsserver -# arg: $2 = storage account -# arg: $3 = storage key -# arg: $4 = storage container -# arg: $5 = lustre version (default 2.10) -master=$1 -storage_account=$2 -storage_key=$3 -storage_container=$4 -lustre_version=${5-2.10} - -# remove the patch version -ndots=${lustre_version//[^.]} -if [ "${#ndots}" = "2" ]; then - lustre_version=${lustre_version%.*} -fi - -# adding kernel module for lustre client -if [ "$lustre_version" = "2.10" ]; then - yum install -y kmod-lustre-client - weak-modules --add-kernel $(uname -r) -fi - -if ! 
rpm -q lemur-azure-hsm-agent lemur-azure-data-movers; then - yum -y install \ - https://azurehpc.azureedge.net/rpms/lemur-azure-hsm-agent-1.0.0-lustre_${lustre_version}.x86_64.rpm \ - https://azurehpc.azureedge.net/rpms/lemur-azure-data-movers-1.0.0-lustre_${lustre_version}.x86_64.rpm -fi - -mkdir -p /var/run/lhsmd -chmod 755 /var/run/lhsmd - -mkdir -p /etc/lhsmd -chmod 755 /etc/lhsmd - -cat </etc/lhsmd/agent -# Lustre NID and filesystem name for the front end filesystem, the agent will mount this -client_device="${master}@tcp:/LustreFS" - -# Do you want to use S3 and POSIX, in this example we use POSIX -enabled_plugins=["lhsm-plugin-az"] - -## Directory to look for the plugins -plugin_dir="/usr/libexec/lhsmd" - -# TBD, I used 16 -handler_count=16 - -# TBD -snapshots { - enabled = false -} -EOF -chmod 600 /etc/lhsmd/agent - -cat </etc/lhsmd/lhsm-plugin-az -az_storage_account = "$storage_account" -az_storage_key = "$storage_key" - -num_threads = 32 - -# -# One or more archive definition is required. 
-# -archive "az-blob" { - id = 1 # Must be unique to this endpoint - container = "$storage_container" # Container used for this archive - prefix = "" # Optional prefix - num_threads = 32 -} -EOF -chmod 600 /etc/lhsmd/lhsm-plugin-az - -cat </etc/systemd/system/lhsmd.service -[Unit] -Description=The lhsmd server -After=syslog.target network.target remote-fs.target nss-lookup.target - -[Service] -Type=simple -PIDFile=/run/lhsmd.pid -ExecStartPre=/bin/mkdir -p /var/run/lhsmd -ExecStart=/sbin/lhsmd -config /etc/lhsmd/agent -Restart=always - -[Install] -WantedBy=multi-user.target -EOF -chmod 600 /etc/systemd/system/lhsmd.service - -systemctl daemon-reload -systemctl enable lhsmd -systemctl start lhsmd diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsimport.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsimport.sh deleted file mode 100755 index fd9fad30b..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsimport.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -# arg: $1 = storage account -# arg: $2 = storage key -# arg: $3 = storage container -# arg: $3 = lfs mount -# arg: $4 = lustre mount (default=/lustre) -# arg: $5 = lustre version (default=2.10) -storage_account=$1 -storage_key=$2 -storage_container=$3 -lfs_mount=${4:-/lustre} -lustre_version=${5-2.10} - -# remove the patch version -ndots=${lustre_version//[^.]} -if [ "${#ndots}" = "2" ]; then - lustre_version=${lustre_version%.*} -fi - -if ! 
rpm -q lemur-azure-hsm-agent lemur-azure-data-movers; then - yum -y install \ - https://azurehpc.azureedge.net/rpms/lemur-azure-hsm-agent-1.0.0-lustre_${lustre_version}.x86_64.rpm \ - https://azurehpc.azureedge.net/rpms/lemur-azure-data-movers-1.0.0-lustre_${lustre_version}.x86_64.rpm -fi - -cd $lfs_mount -export STORAGE_ACCOUNT=$storage_account -export STORAGE_KEY=$storage_key -/sbin/azure-import ${storage_container} - diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsloganalytics.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsloganalytics.sh deleted file mode 100755 index ce6b43f3d..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsloganalytics.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -# arg: $1 = name -# arg: $2 = log analytics workspace id -# arg: $3 = log analytics key - -name=$1 -log_analytics_workspace_id=$2 -log_analytics_key=$3 - -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -sed "s#__FS_NAME__#${name}#g;s#__LOG_ANALYTICS_WORKSPACE_ID__#${log_analytics_workspace_id}#g;s#__LOG_ANALYTICS_KEY__#${log_analytics_key}#g" $DIR/lfsloganalyticsd.sh.in >/usr/bin/lfsloganalyticsd.sh - -chmod +x /usr/bin/lfsloganalyticsd.sh - -cat </lib/systemd/system/lfsloganalytics.service -[Unit] -Description=Lustre logging service to Log Analytics. - -[Service] -Type=simple -ExecStart=/bin/bash /usr/bin/lfsloganalyticsd.sh -Restart=always - -[Install] -WantedBy=multi-user.target -EOF - -systemctl enable lfsloganalytics -systemctl start lfsloganalytics \ No newline at end of file diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsmaster.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsmaster.sh deleted file mode 100755 index d2dcdb02e..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsmaster.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -# arg: $1 = device (e.g. 
L=/dev/sdb Lv2=/dev/nvme0n1) -device=$1 - -# this will only install MDS on first node in a scaleset -echo "pssh_nodenum is $PSSH_NODENUM" - -cp -r /share/home/hpcuser/.ssh /root/ - -#Include the correct rdma options -cat >/etc/modprobe.d/lustre.conf<> /etc/fstab - mount -a - - # set up hsm - lctl set_param -P mdt.*-MDT0000.hsm_control=enabled - lctl set_param -P mdt.*-MDT0000.hsm.default_archive_id=1 - lctl set_param mdt.*-MDT0000.hsm.max_requests=128 - - # allow any user and group ids to write - lctl set_param mdt.*-MDT0000.identity_upcall=NONE - - diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsoss.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsoss.sh deleted file mode 100755 index 8f39aac68..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsoss.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -# arg: $1 = lfsmaster -# arg: $2 = device (e.g. L=/dev/sdb Lv2=/dev/nvme0n1) -master=$1 -device=$2 - -cp -r /share/home/hpcuser/.ssh /root/ - -index=$(($PSSH_NODENUM + 1)) -myuser="hpcuser" - -capture=$(ssh hpcuser@$master "sudo ip address show dev ib0") -masterib=$(echo $capture | awk -F 'inet' '{print $2}' | cut -d / -f 1 ) - - lnetctl net add --net o2ib --if ib0 #double check - mkfs.lustre \ - --fsname=LustreFS \ - --backfstype=ldiskfs \ - --reformat \ - --ost \ - --mgsnode="${masterib}@o2ib" \ - --index=$index \ - --mountfsoptions="errors=remount-ro" \ - $device - - -mkdir /mnt/oss -echo "$device /mnt/oss lustre noatime,nodiratime,nobarrier 0 2" >> /etc/fstab -mount -a diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsrepo.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsrepo.sh deleted file mode 100755 index db1eeb165..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lfsrepo.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -lustre_version=${1-2.10} - -cat << EOF >/etc/yum.repos.d/LustrePack.repo -[lustreserver] -name=lustreserver 
-baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/server/ -enabled=1 -gpgcheck=0 - -[e2fs] -name=e2fs -baseurl=https://downloads.whamcloud.com/public/e2fsprogs/latest/el7/ -enabled=1 -gpgcheck=0 - -[lustreclient] -name=lustreclient -baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/client/ -enabled=1 -gpgcheck=0 -EOF - -#Include the correct rdma options -#cat >/etc/modprobe.d/lustre.conf<$home_root/$new_user/.ssh/config -Host * - StrictHostKeyChecking no - UserKnownHostsFile /dev/null - LogLevel ERROR -EOF - ssh-keygen -f $home_root/$new_user/.ssh/id_rsa -t rsa -N '' - # add admin user public key (the only user in /home) - cat /home/*/.ssh/id_rsa.pub >$home_root/$new_user/.ssh/authorized_keys - cat $home_root/$new_user/.ssh/id_rsa.pub >>$home_root/$new_user/.ssh/authorized_keys - chown $new_user:$new_user $home_root/$new_user/.ssh - chown $new_user:$new_user $home_root/$new_user/.ssh/* - chmod 700 $home_root/$new_user/.ssh - chmod 600 $home_root/$new_user/.ssh/id_rsa - chmod 644 $home_root/$new_user/.ssh/id_rsa.pub - chmod 644 $home_root/$new_user/.ssh/config - chmod 644 $home_root/$new_user/.ssh/authorized_keys -fi -echo "$new_user ALL=(ALL) NOPASSWD: ALL" | tee -a /etc/sudoers diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall1.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall1.sh deleted file mode 100755 index c052001a0..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall1.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -# jump the gun here to ensure passwordless ssh as root between all lustre nodes to faciltate node reboot -cp -r /share/home/hpcuser/.ssh ~/ - -yum -y --nogpgcheck --disablerepo=* --enablerepo=e2fs install e2fsprogs - -yum -y --nogpgcheck --disablerepo=base,extras,updates --enablerepo=lustreserver install kernel kernel-devel kernel-headers kernel-tools kernel-tools-libs 
2>/dev/null - diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall2.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall2.sh deleted file mode 100755 index 60f3e759e..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustreinstall2.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -yum -y --nogpgcheck --enablerepo=lustreserver install kmod-lustre kmod-lustre-osd-ldiskfs lustre-osd-ldiskfs-mount lustre lustre-resource-agents -modprobe -v lustre - -sed -i 's/ResourceDisk\.Format=y/ResourceDisk.Format=n/g' /etc/waagent.conf -sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf - -weak-modules --add-kernel --no-initramfs -systemctl enable lustre -umount /mnt/resource diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustrenetwork.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustrenetwork.sh deleted file mode 100755 index f95d33864..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/lustrenetwork.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -sed -i 's/# OS.EnableRDMA=y/OS.EnableRDMA=y/g' /etc/waagent.conf -service waagent restart -service rdma start -modprobe lnet -lctl network configure -lnetctl net add --net o2ib --if ib0 #need this to come up every time -sleep 5 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/nfsclient.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/nfsclient.sh deleted file mode 100755 index 678bac4dd..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/nfsclient.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -# arg: $1 = nfsserver -nfs_server=$1 -nfs_share=${2-/share} -if [ -z "$nfs_server" ]; then - echo "The nfs_server is required" - exit 1 -fi - -yum install -y nfs-utils - -mkdir -p /scratch -mkdir -p /apps -mkdir -p /data -mkdir -p /share/home -mount $nfs_server:$nfs_share/apps /apps -mount $nfs_server:$nfs_share/data /data -mount 
$nfs_server:$nfs_share/home /share/home - -chmod 777 /scratch - -cat << EOF >> /etc/fstab -$nfs_server:$nfs_share/home /share/home nfs defaults 0 0 -$nfs_server:/mnt/resource/scratch /scratch nfs defaults 0 0 -$nfs_server:$nfs_share/apps /apps nfs defaults 0 0 -$nfs_server:$nfs_share/data /data nfs defaults 0 0 -EOF - -setsebool -P use_nfs_home_dirs 1 - -mount -a - -df diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/nfsserver.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/nfsserver.sh deleted file mode 100755 index 14d53a4c0..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/nfsserver.sh +++ /dev/null @@ -1,212 +0,0 @@ -#!/bin/bash -if [[ $(id -u) -ne 0 ]] ; then - echo "Must be run as root" - exit 1 -fi - -# Disable requiretty to allow run sudo within scripts -sed -i -e 's/Defaults requiretty.*/ #Defaults requiretty/g' /etc/sudoers - -yum -y install epel-release -yum -y install nfs-utils nfs-utils-lib - -# Shares -NFS_MOUNT_POINT=/share -NFS_APPS=$NFS_MOUNT_POINT/apps -NFS_DATA=$NFS_MOUNT_POINT/data -NFS_HOME=$NFS_MOUNT_POINT/home -NFS_SCRATCH=/mnt/resource/scratch - -# Partitions all data disks attached to the VM -# -setup_data_disks() -{ - mountPoint="$1" - filesystem="$2" - devices="$3" - raidDevice="$4" - createdPartitions="" - numdevices=`echo $devices | wc -w` - if [ $numdevices -gt 1 ] - then - # Loop through and partition disks until not found - for disk in $devices; do - fdisk -l /dev/$disk || break - fdisk /dev/$disk << EOF -n -p -1 - - -t -fd -w -EOF - createdPartitions="$createdPartitions /dev/${disk}1" - done - else - disk=$(echo $devices | tr -d [:space:]) - echo "Warning: Only a single device to partition, $disk" - fdisk -l /dev/$disk || break - fdisk /dev/$disk << EOF -n -p -1 - - -w -EOF - createdPartitions="$createdPartitions /dev/${disk}1" - fi - - sleep 10 - - # Create RAID-0 volume - if [ -n "$createdPartitions" ]; then - devices=`echo $createdPartitions | wc -w` - if [ $numdevices 
-gt 1 ] - then - mdadm --create /dev/$raidDevice --level 0 --raid-devices $devices $createdPartitions - sleep 10 - - mdadm /dev/$raidDevice - else - echo "Warning: mdadm is not called, we have one partition named, ${disk}1 for mountpoint, $mountPoint" - raidDevice=${disk}1 - fi - - if [ "$filesystem" == "xfs" ]; then - mkfs -t $filesystem /dev/$raidDevice - export xfsuuid="UUID=`blkid |grep dev/$raidDevice |cut -d " " -f 2 |cut -c 7-42`" -# echo "$xfsuuid $mountPoint $filesystem rw,noatime,attr2,inode64,nobarrier,sunit=1024,swidth=4096,nofail 0 2" >> /etc/fstab - echo "$xfsuuid $mountPoint $filesystem rw,noatime,attr2,inode64,nobarrier,nofail 0 2" >> /etc/fstab - else - mkfs.ext4 -i 2048 -I 512 -J size=400 -Odir_index,filetype /dev/$raidDevice - sleep 5 - tune2fs -o user_xattr /dev/$raidDevice - export ext4uuid="UUID=`blkid |grep dev/$raidDevice |cut -d " " -f 2 |cut -c 7-42`" - echo "$ext4uuid $mountPoint $filesystem noatime,nodiratime,nobarrier,nofail 0 2" >> /etc/fstab - fi - - sleep 10 - mount -a - fi -} - -setup_single_disk() -{ - mountPoint="$1" - filesystem="$2" - device="$3" - - fdisk -l /dev/$device || break - fdisk /dev/$device << EOF -n -p -1 - - -p -w -EOF - - if [ "$filesystem" == "xfs" ]; then - mkfs -t $filesystem /dev/$device - echo "/dev/$device $mountPoint $filesystem rw,noatime,attr2,inode64,nobarrier,nofail 0 2" >> /etc/fstab - else - mkfs.ext4 -F -i 2048 -I 512 -J size=400 -Odir_index,filetype /dev/$device - sleep 5 - tune2fs -o user_xattr /dev/$device - echo "/dev/$device $mountPoint $filesystem noatime,nodiratime,nobarrier,nofail 0 2" >> /etc/fstab - fi - - sleep 10 - - mount /dev/$device $mountPoint -} - -setup_disks() -{ - # Dump the current disk config for debugging - fdisk -l - - # Dump the scsi config - lsscsi - - # Get the root/OS disk so we know which device it uses and can ignore it later - rootDevice=`mount | grep "on / type" | awk '{print $1}' | sed 's/[0-9]//g'` - - # Get the TMP disk so we know which device and can ignore it later 
- tmpDevice=`mount | grep "on /mnt/resource type" | awk '{print $1}' | sed 's/[0-9]//g'` - - # Get the data disk sizes from fdisk, we ignore the disks above - dataDiskSize=`fdisk -l | grep '^Disk /dev/' | grep -v $rootDevice | grep -v $tmpDevice | awk '{print $3}' | sort -n -r | tail -1` - - # Compute number of disks - nbDisks=`fdisk -l | grep '^Disk /dev/' | grep -v $rootDevice | grep -v $tmpDevice | wc -l` - echo "nbDisks=$nbDisks" - - dataDevices="`fdisk -l | grep '^Disk /dev/' | grep $dataDiskSize | awk '{print $2}' | awk -F: '{print $1}' | sort | head -$nbDisks | tr '\n' ' ' | sed 's|/dev/||g'`" - - mkdir -p $NFS_MOUNT_POINT - - - if [ "$nbDisks" -eq "1" ]; then - setup_single_disk $NFS_MOUNT_POINT "ext4" "$dataDevices" - elif [ "$nbDisks" -gt "1" ]; then - setup_data_disks $NFS_MOUNT_POINT "xfs" "$dataDevices" "md10" - fi - - mkdir -p $NFS_APPS - mkdir -p $NFS_DATA - mkdir -p $NFS_HOME - mkdir -p $NFS_SCRATCH - chmod 777 $NFS_APPS - chmod 777 $NFS_DATA - chmod 777 $NFS_HOME - chmod 777 $NFS_SCRATCH - - ln -s $NFS_SCRATCH /scratch - - echo "$NFS_APPS *(rw,sync,no_root_squash)" >> /etc/exports - echo "$NFS_DATA *(rw,sync,no_root_squash)" >> /etc/exports - echo "$NFS_HOME *(rw,sync,no_root_squash)" >> /etc/exports - echo "$NFS_SCRATCH *(rw,sync,no_root_squash)" >> /etc/exports - - exportfs - exportfs -a - exportfs -} - -tune_nfs() -{ - cores=$(grep processor /proc/cpuinfo | wc -l) - nfs_proc=$(($cores * 4)) - replace="s/#RPCNFSDCOUNT=16/RPCNFSDCOUNT=$nfs_proc/g" - sed -i -e "$replace" /etc/sysconfig/nfs - - grep RPCNFSDCOUNT /etc/sysconfig/nfs -} - -systemctl enable rpcbind -systemctl enable nfs-server -systemctl enable nfs-lock -systemctl enable nfs-idmap -systemctl enable nfs - -systemctl start rpcbind -systemctl start nfs-server -systemctl start nfs-lock -systemctl start nfs-idmap -systemctl start nfs - -setup_disks -tune_nfs -systemctl restart nfs-server - -ln -s /share/apps /apps -ln -s /share/data /data - -df - - diff --git 
a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsclient.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsclient.sh deleted file mode 100755 index fd037df76..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsclient.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -set -e -# arg: $1 = pbs_server -pbs_server=$1 - -if [ "$(rpm -qa pbspro-execution)" = "" ];then - yum install -y pbspro-execution-19.1.1-0.x86_64.rpm - - sed -i "s/CHANGE_THIS_TO_PBS_PRO_SERVER_HOSTNAME/${pbs_server}/g" /etc/pbs.conf - sed -i "s/CHANGE_THIS_TO_PBS_PRO_SERVER_HOSTNAME/${pbs_server}/g" /var/spool/pbs/mom_priv/config - sed -i "s/^if /#if /g" /opt/pbs/lib/init.d/limits.pbs_mom - sed -i "s/^fi/#fi /g" /opt/pbs/lib/init.d/limits.pbs_mom - systemctl enable pbs - systemctl start pbs - - # Retrieve the VMSS name to be used as the pool name for multiple VMSS support - poolName=$(curl -s -H Metadata:true "http://169.254.169.254/metadata/instance?api-version=2018-10-01" | jq -r '.compute.vmScaleSetName') - /opt/pbs/bin/qmgr -c "c n $(hostname) resources_available.pool_name='$poolName'" - -else - echo "PBS client was already installed" -fi diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsdownload.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsdownload.sh deleted file mode 100755 index b4317516b..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsdownload.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -filename=pbspro_19.1.1.centos7.zip - -if [ ! 
-f "$filename" ];then - wget -q https://github.com/PBSPro/pbspro/releases/download/v19.1.1/$filename - unzip $filename -fi - diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsserver.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsserver.sh deleted file mode 100755 index 14ee54d1a..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/pbsserver.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -set -e -admin_user=$(whoami) - -if [ "$(rpm -qa pbspro-server)" = "" ];then - yum install -y pbspro-server-19.1.1-0.x86_64.rpm - systemctl enable pbs - systemctl start pbs - /opt/pbs/bin/qmgr -c "s s managers += ${admin_user}@*" - /opt/pbs/bin/qmgr -c 's s flatuid=t' - /opt/pbs/bin/qmgr -c 's s job_history_enable=t' - /opt/pbs/bin/qmgr -c 'c r pool_name type=string,flag=h' - - # Update the sched_config file to schedule jobs that request pool_name - sed -i "s/^resources: \"ncpus,/resources: \"ncpus, pool_name,/g" /var/spool/pbs/sched_priv/sched_config - systemctl restart pbs -else - echo "PBSPro already installed" -fi diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/rebootlustre.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/rebootlustre.sh deleted file mode 100755 index 2d33c180b..000000000 --- a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/rebootlustre.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -vmlist=$1 -osscount=$2 -totalcount=$((osscount+2)) -index=0 -#prep headnode -cp -r /share/home/hpcuser/.ssh /root/ -echo "vmlist is ${vmlist[@]}" - -#needs to be done sequentially -for vmname in ${vmlist[@]}; do - echo "Rebooting $vmname" - ssh hpcuser@${vmname} "sudo reboot 2>/dev/null; exit 2>/dev/null" 2>/dev/null - index=$((index+1)) -done -exit 0 diff --git a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/waitforreboot.sh b/experimental/lustre_rdma_avs/azhpc_install_config/scripts/waitforreboot.sh deleted file mode 100755 index 73411ca61..000000000 --- 
a/experimental/lustre_rdma_avs/azhpc_install_config/scripts/waitforreboot.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -sleep 180 #enough time for node reboot to continue process From a49ed49dd794b5812a2e750b2140b56914e5f4f5 Mon Sep 17 00:00:00 2001 From: Narjit Chadha Date: Tue, 7 Jul 2020 11:35:41 -0500 Subject: [PATCH 27/36] error fixed in config.json for lustre_ipoib_nvmedrives --- experimental/lustre_ipoib_nvmedrives/config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/lustre_ipoib_nvmedrives/config.json b/experimental/lustre_ipoib_nvmedrives/config.json index 4a7bd988b..2eeda9e70 100644 --- a/experimental/lustre_ipoib_nvmedrives/config.json +++ b/experimental/lustre_ipoib_nvmedrives/config.json @@ -197,7 +197,7 @@ "copy": [ "pbspro_19.1.1.centos7/pbspro-execution-19.1.1-0.x86_64.rpm" ], - "tag": "lfsclient", + "tag": "pbsclient", "sudo": true } From bdf20540b1b33636e35cd5d8eba119a3e783bec2 Mon Sep 17 00:00:00 2001 From: Narjit Chadha Date: Tue, 7 Jul 2020 11:39:24 -0500 Subject: [PATCH 28/36] change in config.json for lustre_ipoib to fix PBS --- experimental/lustre_ipoib/config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/lustre_ipoib/config.json b/experimental/lustre_ipoib/config.json index 0127dc390..d96038e21 100644 --- a/experimental/lustre_ipoib/config.json +++ b/experimental/lustre_ipoib/config.json @@ -216,7 +216,7 @@ "copy": [ "pbspro_19.1.1.centos7/pbspro-execution-19.1.1-0.x86_64.rpm" ], - "tag": "lfsclient", + "tag": "pbsclient", "sudo": false } From 7fe7aa96a255928c3bfc8ad8b34035ac0dd2cce3 Mon Sep 17 00:00:00 2001 From: Narjit Chadha Date: Thu, 9 Jul 2020 13:29:06 -0500 Subject: [PATCH 29/36] The headnode cannot communicate with Lustre nodes over Infiband anyway, so removing Infiniband components from headnode --- experimental/lustre_ipoib/config.json | 3 --- experimental/lustre_ipoib_nvmedrives/config.json | 3 --- experimental/lustre_rdma_avs/config.json | 3 --- 
experimental/lustre_rdma_nvmedrives/config.json | 7 ------- 4 files changed, 16 deletions(-) diff --git a/experimental/lustre_ipoib/config.json b/experimental/lustre_ipoib/config.json index d96038e21..be7c23449 100644 --- a/experimental/lustre_ipoib/config.json +++ b/experimental/lustre_ipoib/config.json @@ -41,9 +41,6 @@ "tags": [ "disable-selinux", "cndefault", - "lfsrepo", - "lfsclient", - "lfsazimport", "localuser", "pbsserver", "loginnode", diff --git a/experimental/lustre_ipoib_nvmedrives/config.json b/experimental/lustre_ipoib_nvmedrives/config.json index 2eeda9e70..ae8fc7902 100644 --- a/experimental/lustre_ipoib_nvmedrives/config.json +++ b/experimental/lustre_ipoib_nvmedrives/config.json @@ -41,9 +41,6 @@ "tags": [ "disable-selinux", "cndefault", - "lfsrepo", - "lfsclient", - "lfsazimport", "localuser", "pbsserver", "loginnode", diff --git a/experimental/lustre_rdma_avs/config.json b/experimental/lustre_rdma_avs/config.json index d241c4631..80b951da7 100644 --- a/experimental/lustre_rdma_avs/config.json +++ b/experimental/lustre_rdma_avs/config.json @@ -43,9 +43,6 @@ "tags": [ "disable-selinux", "cndefault", - "lfsrepo", - "lfsclient", - "lfsazimport", "localuser", "pbsserver", "loginnode", diff --git a/experimental/lustre_rdma_nvmedrives/config.json b/experimental/lustre_rdma_nvmedrives/config.json index dd2c4d6a9..d3fe8b3ba 100644 --- a/experimental/lustre_rdma_nvmedrives/config.json +++ b/experimental/lustre_rdma_nvmedrives/config.json @@ -40,10 +40,7 @@ "tags": [ "disable-selinux", "cndefault", - "lfsrepo", "rebootlustre", - "lfsclient", - "lfsazimport", "localuser", "pbsserver", "allnodes", @@ -129,14 +126,10 @@ }, { "type": "local_script", -<<<<<<< HEAD "script": "wait.sh", "args": [ "180" ] -======= - "script": "wait.sh" ->>>>>>> 9aba5d253a4a5a012d9d828c45d3110d9f5164df }, { "script": "installOFED.sh", From 51f305c268cf092cf6bf51eb4430607d101e3166 Mon Sep 17 00:00:00 2001 From: Xavier Pillons Date: Fri, 10 Jul 2020 18:18:24 +0200 Subject: [PATCH 
30/36] added pipeline --- experimental/lustre_rdma_avs/config.json | 293 +++++++++++----------- experimental/lustre_rdma_avs/pipeline.yml | 40 +++ experimental/lustre_rdma_avs/test.yml | 16 ++ 3 files changed, 202 insertions(+), 147 deletions(-) create mode 100644 experimental/lustre_rdma_avs/pipeline.yml create mode 100644 experimental/lustre_rdma_avs/test.yml diff --git a/experimental/lustre_rdma_avs/config.json b/experimental/lustre_rdma_avs/config.json index 80b951da7..b303ef66d 100644 --- a/experimental/lustre_rdma_avs/config.json +++ b/experimental/lustre_rdma_avs/config.json @@ -12,25 +12,24 @@ } }, "variables": { - "location": "", - "image": "OpenLogic:CentOS:7.6:latest", - "lustreimage": "OpenLogic:CentOS:7.6:latest", - "hpcimage": "OpenLogic:CentOS:7.6:latest", - "compute_instances": 2, - "lustre_instances": 2, - "low_priority": false, - "storage_account": "", - "storage_key": "sakey.{{variables.storage_account}}", - "storage_container": "", - "log_analytics_lfs_name": "", - "la_resourcegroup": "", - "la_name": "", - "log_analytics_workspace": "laworkspace.{{variables.la_resourcegroup}}.{{variables.la_name}}", - "log_analytics_key": "lakey.{{variables.la_resourcegroup}}.{{variables.la_name}}", - "lustre_version": "2.10", - "lustre_avset": "{{variables.resource_group}}avset", - "lustre_mount": "/lustre" + "location": "", + "lustreimage": "7.7 OpenLogic:CentOS-HPC:7.7:7.7.2020042000", + "hpcimage": "7.7 OpenLogic:CentOS-HPC:7.7:7.7.2020042000", + "compute_instances": 2, + "lustre_instances": 2, + "low_priority": false, + "storage_account": "", + "storage_key": "sakey.{{variables.storage_account}}", + "storage_container": "", + "log_analytics_lfs_name": "", + "la_resourcegroup": "", + "la_name": "", + "log_analytics_workspace": "laworkspace.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "log_analytics_key": "lakey.{{variables.la_resourcegroup}}.{{variables.la_name}}", + "lustre_version": "2.12.4", + "lustre_avset": 
"{{variables.resource_group}}avset", + "lustre_mount": "/lustre" }, "resources": { "headnode": { @@ -38,7 +37,7 @@ "vm_type": "Standard_HB120rs_v2", "accelerated_networking": false, "public_ip": true, - "image": "variables.image", + "image": "variables.hpcimage", "subnet": "compute", "tags": [ "disable-selinux", @@ -46,68 +45,68 @@ "localuser", "pbsserver", "loginnode", - "rebootlustre", + "rebootlustre", "nfsserver", - "allnodes" + "allnodes" + ] + }, + "compute": { + "type": "vm", + "vm_type": "Standard_HB120rs_v2", + "instances": "variables.compute_instances", + "availability_set": "variables.lustre_avset", + "low_priority": "variables.low_priority", + "accelerated_networking": false, + "image": "variables.hpcimage", + "subnet": "storage", + "tags": [ + "cndefault", + "lfsrepo", + "lfsclient", + "localuser", + "pbsclient", + "nfsclient", + "disable-selinux", + "allnodes" + ] + }, + "lfsmaster": { + "type": "vm", + "vm_type": "Standard_HB120rs_v2", + "availability_set": "variables.lustre_avset", + "accelerated_networking": false, + "image": "variables.lustreimage", + "subnet": "storage", + "tags": [ + "cndefault", + "lustre", + "lfsmaster", + "lfsrepo", + "localuser", + "nfsclient", + "disable-selinux", + "lfsloganalytics", + "allnodes" ] }, - "compute": { - "type": "vm", - "vm_type": "Standard_HB120rs_v2", - "instances": "variables.compute_instances", - "availability_set": "variables.lustre_avset", - "low_priority": "variables.low_priority", - "accelerated_networking": false, - "image": "variables.hpcimage", - "subnet": "storage", - "tags": [ - "cndefault", - "lfsrepo", - "lfsclient", - "localuser", - "pbsclient", - "nfsclient", - "disable-selinux", - "allnodes" - ] - }, - "lfsmaster": { - "type": "vm", - "vm_type": "Standard_HB120rs_v2", - "availability_set": "variables.lustre_avset", - "accelerated_networking": false, - "image": "variables.lustreimage", - "subnet": "storage", - "tags": [ - "cndefault", - "lustre", - "lfsmaster", - "lfsrepo", - "localuser", - 
"nfsclient", - "disable-selinux", - "lfsloganalytics", - "allnodes" - ] - }, "lustre": { "type": "vm", "vm_type": "Standard_HB120rs_v2", "instances": "variables.lustre_instances", - "availability_set": "variables.lustre_avset", + "availability_set": "variables.lustre_avset", "accelerated_networking": false, "image": "variables.lustreimage", "subnet": "storage", "tags": [ "cndefault", "lfsrepo", - "localuser", - "nfsclient", - "lustre", - "ossnode", + "localuser", + "nfsclient", + "lustre", + "ossnode", "disable-selinux", "lfsloganalytics", - "allnodes" + "allnodes" ] } }, @@ -151,42 +150,42 @@ ], "sudo": true }, - { - "script": "lustreinstall1.sh", - "tag": "lustre", - "sudo": true - }, - { - "script": "rebootlustre.sh", - "tag": "rebootlustre", - "sudo": true, - "args": [ - "$(>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + # configuration file to use + azhpc.config: experimental/lustre_rdma_avs/config.json + # pipeline directory + azhpc.pipeline_dir: experimental/lustre_rdma_avs + # destination of scripts. Default is hpcuser@headnode:/apps + #azhpc.script_remote_dest: 'hpcadmin@headnode:.' 
+ +# Add the variables needed in your configuration file +# Uncomment and set values below, or leave commented and thru pipeline variables + # azhpc.variables.location: westeurope + azhpc.variables.compute_instances: 4 + azhpc.variables.low_priority: false + azhpc.variables.lustre_instances: 4 + azhpc.variables.log_analytics_lfs_name: lfs + azhpc.variables.lustre_mount: /lustre + + +# <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +jobs: +- template: ../../ci/templates/jobs/build.yml + parameters: + extra_steps: ../../../experimental/lustre_rdma_avs/test.yml diff --git a/experimental/lustre_rdma_avs/test.yml b/experimental/lustre_rdma_avs/test.yml new file mode 100644 index 000000000..b5d22d3b3 --- /dev/null +++ b/experimental/lustre_rdma_avs/test.yml @@ -0,0 +1,16 @@ +steps: +- template: ../../ci/templates/steps/azhpc-run.yml + parameters: + user: hpcuser + command: /apps/ci/check_pbs_nodes.sh $(azhpc.variables.compute_instances) + +- template: ../../ci/templates/steps/azhpc-run.yml + parameters: + user: hpcadmin + command: /apps/ci/check_mountpoints.sh $(azhpc.variables.lustre_mount) + +- template: ../../ci/templates/steps/azhpc-run.yml + parameters: + user: hpcadmin + command: /apps/ci/check_lustre_client.sh $(azhpc.variables.lustre_mount) + From ab1ad3caf461ee48b3161b7d7b64acf8a95f6a88 Mon Sep 17 00:00:00 2001 From: Xavier Pillons Date: Fri, 10 Jul 2020 18:27:37 +0200 Subject: [PATCH 31/36] fix image --- experimental/lustre_rdma_avs/config.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/experimental/lustre_rdma_avs/config.json b/experimental/lustre_rdma_avs/config.json index b303ef66d..b4b533823 100644 --- a/experimental/lustre_rdma_avs/config.json +++ b/experimental/lustre_rdma_avs/config.json @@ -14,8 +14,8 @@ "variables": { "location": "", - "lustreimage": "7.7 OpenLogic:CentOS-HPC:7.7:7.7.2020042000", - "hpcimage": "7.7 OpenLogic:CentOS-HPC:7.7:7.7.2020042000", + "lustreimage": "OpenLogic:CentOS-HPC:7.7:7.7.2020043000", + 
"hpcimage": "OpenLogic:CentOS-HPC:7.7:7.7.2020043000", "compute_instances": 2, "lustre_instances": 2, "low_priority": false, From 0b03083daa0a0d62e978b5251f5343e2f705bd25 Mon Sep 17 00:00:00 2001 From: Xavier Pillons Date: Fri, 10 Jul 2020 19:15:22 +0200 Subject: [PATCH 32/36] use plain CentOS Image --- experimental/lustre_rdma_avs/config.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/experimental/lustre_rdma_avs/config.json b/experimental/lustre_rdma_avs/config.json index b4b533823..bde631807 100644 --- a/experimental/lustre_rdma_avs/config.json +++ b/experimental/lustre_rdma_avs/config.json @@ -14,8 +14,8 @@ "variables": { "location": "", - "lustreimage": "OpenLogic:CentOS-HPC:7.7:7.7.2020043000", - "hpcimage": "OpenLogic:CentOS-HPC:7.7:7.7.2020043000", + "lustreimage": "OpenLogic:CentOS:7.7:7.7.2020042900", + "hpcimage": "OpenLogic:CentOS:7.7:7.7.2020042900", "compute_instances": 2, "lustre_instances": 2, "low_priority": false, From 0f3279b39a537745c0cdc8facc595e3a3c66c0af Mon Sep 17 00:00:00 2001 From: Xavier Pillons Date: Fri, 10 Jul 2020 19:58:19 +0200 Subject: [PATCH 33/36] use the lfsrepo from the shared scripts dir --- .../lustre_rdma_avs/scripts/lfsrepo.sh | 27 ------------------- 1 file changed, 27 deletions(-) delete mode 100755 experimental/lustre_rdma_avs/scripts/lfsrepo.sh diff --git a/experimental/lustre_rdma_avs/scripts/lfsrepo.sh b/experimental/lustre_rdma_avs/scripts/lfsrepo.sh deleted file mode 100755 index db1eeb165..000000000 --- a/experimental/lustre_rdma_avs/scripts/lfsrepo.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -lustre_version=${1-2.10} - -cat << EOF >/etc/yum.repos.d/LustrePack.repo -[lustreserver] -name=lustreserver -baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/server/ -enabled=1 -gpgcheck=0 - -[e2fs] -name=e2fs -baseurl=https://downloads.whamcloud.com/public/e2fsprogs/latest/el7/ -enabled=1 -gpgcheck=0 - -[lustreclient] -name=lustreclient 
-baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/client/ -enabled=1 -gpgcheck=0 -EOF - -#Include the correct rdma options -#cat >/etc/modprobe.d/lustre.conf< Date: Fri, 10 Jul 2020 14:12:16 -0500 Subject: [PATCH 34/36] Need to have a diffrent repo than standard for the Lustre kernel --- experimental/lustre_rdma_avs/scripts/lfsrepo.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/experimental/lustre_rdma_avs/scripts/lfsrepo.sh b/experimental/lustre_rdma_avs/scripts/lfsrepo.sh index db1eeb165..00e0d9dd9 100755 --- a/experimental/lustre_rdma_avs/scripts/lfsrepo.sh +++ b/experimental/lustre_rdma_avs/scripts/lfsrepo.sh @@ -1,10 +1,16 @@ #!/bin/bash lustre_version=${1-2.10} +if [ "$lustre_version" = "2.10" -o "$lustre_version" = "2.12" ]; then + lustre_dir=latest-${lustre_version}-release + else + lustre_dir="lustre-$lustre_version" +fi + cat << EOF >/etc/yum.repos.d/LustrePack.repo [lustreserver] name=lustreserver -baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/server/ +baseurl=https://downloads.whamcloud.com/public/lustre/${lustre_dir}/el7/server/ enabled=1 gpgcheck=0 @@ -16,7 +22,7 @@ gpgcheck=0 [lustreclient] name=lustreclient -baseurl=https://downloads.whamcloud.com/public/lustre/latest-${lustre_version}-release/el7/client/ +baseurl=https://downloads.whamcloud.com/public/lustre/${lustre_dir}/el7/client/ enabled=1 gpgcheck=0 EOF From a7473c952b248d54ff6dc107bd1b518e5090b274 Mon Sep 17 00:00:00 2001 From: chadnar2 <52789065+chadnar2@users.noreply.github.com> Date: Fri, 10 Jul 2020 14:15:45 -0500 Subject: [PATCH 35/36] Update config.json --- experimental/lustre_rdma_avs/config.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/experimental/lustre_rdma_avs/config.json b/experimental/lustre_rdma_avs/config.json index bde631807..e0f9b3561 100644 --- a/experimental/lustre_rdma_avs/config.json +++ 
b/experimental/lustre_rdma_avs/config.json @@ -12,7 +12,7 @@ } }, "variables": { - "location": "", "resource_group": "", "lustreimage": "OpenLogic:CentOS:7.7:7.7.2020042900", "hpcimage": "OpenLogic:CentOS:7.7:7.7.2020042900", @@ -271,4 +271,4 @@ "sudo": true } ] -} \ No newline at end of file +} From d685b87932e6b93f17e8ea1998bab8ce16d59a1b Mon Sep 17 00:00:00 2001 From: Narjit Chadha Date: Tue, 14 Jul 2020 11:19:02 -0500 Subject: [PATCH 36/36] Changed headnode in lustre_rdma_avs to Standard_D8s_v3 --- experimental/lustre_rdma_avs/config.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/experimental/lustre_rdma_avs/config.json b/experimental/lustre_rdma_avs/config.json index bde631807..c0ca22b73 100644 --- a/experimental/lustre_rdma_avs/config.json +++ b/experimental/lustre_rdma_avs/config.json @@ -34,7 +34,7 @@ "resources": { "headnode": { "type": "vm", - "vm_type": "Standard_HB120rs_v2", + "vm_type": "Standard_D8s_v3", "accelerated_networking": false, "public_ip": true, "image": "variables.hpcimage", @@ -271,4 +271,4 @@ "sudo": true } ] -} \ No newline at end of file +}