This article was tested on BCM 10 with Ubuntu 22.04.
1. clone the default software-image
cmsh
softwareimage
clone default-image amlfs-image
2. Install Azure LTS kernel inside the amlfs-image
cm-chroot-sw-img /cm/images/amlfs-image/
apt update && apt install linux-image-azure-lts-22.04
3. install Azure LTS headers inside the amlfs-image
cm-chroot-sw-img /cm/images/amlfs-image/
apt update && apt install linux-headers-azure-lts-22.04
4. set the kernel version to the Azure LTS version
cmsh
softwareimage use amlfs-image
set kernelversion 5.15.0-1054-azure
commit
5. remove unnecessary kernels and headers to avoid confusion
Note: this step is optional, and in some cases it may fail if the kernel version of the head node is the same as the kernel version you're trying to remove.
cm-chroot-sw-img /cm/images/amlfs-image/
apt list --installed 'linux-headers*'
apt remove linux-headers-5.19.0-45-generic linux-headers-generic-hwe-22.04
apt list --installed 'linux-image*'
apt remove linux-image-5.19.0-45-generic linux-image-generic-hwe-22.04
6. create and run the following script inside the amlfs-image
Note: some browsers render straight double quotes (") as typographic quotes; after pasting, make sure that every double quote in the script is a straight double quote (").
cm-chroot-sw-img /cm/images/amlfs-image/
cat << EOF > /root/repos.sh
#!/bin/bash
set -ex
apt update && apt install -y ca-certificates curl apt-transport-https lsb-release gnupg
source /etc/lsb-release
echo "deb [arch=amd64] https://packages.microsoft.com/repos/amlfs-\${DISTRIB_CODENAME}/ \${DISTRIB_CODENAME} main" | tee /etc/apt/sources.list.d/amlfs.list
curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | tee /etc/apt/trusted.gpg.d/microsoft.gpg > /dev/null
apt update
EOF
bash /root/repos.sh
7. install the Azure Lustre metapackage that matches the running kernel
apt install amlfs-lustre-client-2.15.3-43-gd7e07df=5.15.0-1054-azure
8. upgrade the lustre client
apt upgrade linux-image-5.15.0-1054-azure amlfs-lustre-client-2.15.3-43-gd7e07df
9. (optional) for GPU nodes, install cuda-driver package:
apt install cuda-driver
10. make sure that the initrd can be generated successfully after all changes
cmsh
softwareimage use amlfs-image
createramdisk -w
NOTE: createramdisk may take a little while to finish
11. reboot the compute node
cmsh
device use cnode001
reboot
12. after the node boots up successfully, check that the lustre (and, for GPU nodes, nvidia) kernel modules are loaded
root@cnode001:~# lsmod | grep lustre
lustre 839680 4
lmv 184320 2 lustre
mdc 221184 2 lustre
lov 270336 5 mdc,lustre
ptlrpc 1155072 8 fld,osc,fid,mgc,lov,mdc,lmv,lustre
obdclass 3153920 13 fld,osc,fid,ptlrpc,mgc,lov,mdc,lmv,lustre
lnet 589824 7 osc,obdclass,ptlrpc,mgc,ksocklnd,lmv,lustre
libcfs 225280 12 fld,lnet,osc,fid,obdclass,ptlrpc,mgc,ksocklnd,lov,mdc,lmv,lustre
root@cnode001:~# lsmod | grep nvidia
nvidia_peermem 16384 0
nvidia_drm 94208 0
nvidia_modeset 1327104 1 nvidia_drm
nvidia_uvm 1515520 0
nvidia 56172544 36 nvidia_uvm,nvidia_peermem,nvidia_modeset
ib_uverbs 163840 91 nvidia_peermem,rdma_ucm,mlx5_ib
drm_kms_helper 311296 4 hyperv_drm,nvidia_drm
drm 622592 5 drm_kms_helper,nvidia,hyperv_drm,nvidia_drm
i2c_core 90112 5 drm_kms_helper,i2c_algo_bit,igb,nvidia,drm