Create persistent UDEV rules to rename the disks consistently based on HW address

This article was tested on DGX OS 6.2 with BCM 10.
1. Edit /cm/images/<IMAGENAME>/usr/lib/udev/rules.d/60-persistent-storage-<DGXTYPE>.rules and add the following lines:

   a. For DGX H100

########## persistent nvme rules by HW address ##########
KERNEL=="nvme[0-9]n[0-9]", ATTRS{address}=="0000:01:00.0", SYMLINK+="disk/by-id/osdisk-1"
KERNEL=="nvme[0-9]n[0-9]", ATTRS{address}=="0000:02:00.0", SYMLINK+="disk/by-id/osdisk-2"
KERNEL=="nvme[0-9]n[0-9]", ATTRS{address}=="0000:ab:00.0", SYMLINK+="disk/by-id/raiddisk-1"
KERNEL=="nvme[0-9]n[0-9]", ATTRS{address}=="0000:ac:00.0", SYMLINK+="disk/by-id/raiddisk-2"
KERNEL=="nvme[0-9]n[0-9]", ATTRS{address}=="0000:ad:00.0", SYMLINK+="disk/by-id/raiddisk-3"
KERNEL=="nvme[0-9]n[0-9]", ATTRS{address}=="0000:ae:00.0", SYMLINK+="disk/by-id/raiddisk-4"
KERNEL=="nvme[0-9]n[0-9]", ATTRS{address}=="0000:2a:00.0", SYMLINK+="disk/by-id/raiddisk-5"
KERNEL=="nvme[0-9]n[0-9]", ATTRS{address}=="0000:2b:00.0", SYMLINK+="disk/by-id/raiddisk-6"
KERNEL=="nvme[0-9]n[0-9]", ATTRS{address}=="0000:2c:00.0", SYMLINK+="disk/by-id/raiddisk-7"
KERNEL=="nvme[0-9]n[0-9]", ATTRS{address}=="0000:2d:00.0", SYMLINK+="disk/by-id/raiddisk-8"
########## persistent nvme rules by HW address ##########

   b. For DGX A100

########## persistent nvme rules by HW address ##########
KERNEL=="nvme[0-9]n[0-9]", ATTRS{address}=="0000:22:00.0", SYMLINK+="disk/by-id/osdisk-1"
KERNEL=="nvme[0-9]n[0-9]", ATTRS{address}=="0000:23:00.0", SYMLINK+="disk/by-id/osdisk-2"
KERNEL=="nvme[0-9]n[0-9]", ATTRS{address}=="0000:08:00.0", SYMLINK+="disk/by-id/raiddisk-1"
KERNEL=="nvme[0-9]n[0-9]", ATTRS{address}=="0000:09:00.0", SYMLINK+="disk/by-id/raiddisk-2"
KERNEL=="nvme[0-9]n[0-9]", ATTRS{address}=="0000:52:00.0", SYMLINK+="disk/by-id/raiddisk-3"
KERNEL=="nvme[0-9]n[0-9]", ATTRS{address}=="0000:53:00.0", SYMLINK+="disk/by-id/raiddisk-4"
KERNEL=="nvme[0-9]n[0-9]", ATTRS{address}=="0000:89:00.0", SYMLINK+="disk/by-id/raiddisk-5"
KERNEL=="nvme[0-9]n[0-9]", ATTRS{address}=="0000:8a:00.0", SYMLINK+="disk/by-id/raiddisk-6"
KERNEL=="nvme[0-9]n[0-9]", ATTRS{address}=="0000:ca:00.0", SYMLINK+="disk/by-id/raiddisk-7"
KERNEL=="nvme[0-9]n[0-9]", ATTRS{address}=="0000:cb:00.0", SYMLINK+="disk/by-id/raiddisk-8"
########## persistent nvme rules by HW address ##########

Note: <DGXTYPE> has to be replaced with either a100 or h100, depending on the DGX model.
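
The PCIe addresses above are the usual ones for DGX H100 and DGX A100 systems, but they should be confirmed on the target hardware. On a running node, the address that ATTRS{address} matches can be listed for each NVMe controller with, for example:

grep -H . /sys/class/nvme/nvme*/address
udevadm info -a -n /dev/nvme0n1 | grep -m1 'ATTRS{address}'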

2. Edit /cm/node-installer/usr/lib/udev/rules.d/60-persistent-storage-<DGXTYPE>.rules and add the same lines as above, so that the node installer also sees the persistent names during provisioning
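
Once a node has been rebooted or re-provisioned with the updated rules, the symlinks can be verified on the node itself (verification only; the exact set of raiddisk-* links depends on how many data drives are installed):

ls -l /dev/disk/by-id/osdisk-* /dev/disk/by-id/raiddisk-*
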
3. Append nvme_core.multipath=n to the kernel parameters to disable the NVMe multipath capability
cmsh
softwareimage use <IMAGENAME>
append kernelparameters " nvme_core.multipath=n"
commit
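
After the image change has been rolled out and the node has rebooted, NVMe multipath should be reported as disabled. A quick check on the node (verification only):

grep -o 'nvme_core.multipath=[^ ]*' /proc/cmdline
cat /sys/module/nvme_core/parameters/multipath

The second command should print N.
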
4. Create disksetup using the persistent names created by the UDEV rules
cmsh
category use <categoryname>
set disksetup
### the disksetup will open in the default editor; copy/paste the following XML; save changes and exit
commit

<?xml version="1.0" encoding="UTF-8"?>
<diskSetup>
    <device>
        <blockdev>/dev/disk/by-id/osdisk-1</blockdev>
        <partition id="boot1" partitiontype="esp">
            <size>512M</size>
            <type>linux</type>
            <filesystem>fat</filesystem>
            <mountPoint>/boot/efi</mountPoint>
            <mountOptions>defaults,noatime,nodiratime</mountOptions>
        </partition>
        <partition id="slash1">
            <size>max</size>
            <type>linux raid</type>
        </partition>
    </device>
    <device>
        <blockdev>/dev/disk/by-id/osdisk-2</blockdev>
        <partition id="boot2" partitiontype="esp">
            <size>512M</size>
            <type>linux</type>
            <filesystem>fat</filesystem>
            <mountOptions>defaults,noatime,nodiratime</mountOptions>
        </partition>
        <partition id="slash2">
            <size>max</size>
            <type>linux raid</type>
        </partition>
    </device>
    <device>
        <blockdev>/dev/disk/by-id/raiddisk-1</blockdev>
        <partition id="raid1" partitiontype="esp">
            <size>max</size>
            <type>linux raid</type>
        </partition>
    </device>
    <device>
        <blockdev>/dev/disk/by-id/raiddisk-2</blockdev>
        <partition id="raid2" partitiontype="esp">
            <size>max</size>
            <type>linux raid</type>
        </partition>
    </device>
    <device>
        <blockdev>/dev/disk/by-id/raiddisk-3</blockdev>
        <partition id="raid3" partitiontype="esp">
            <size>max</size>
            <type>linux raid</type>
        </partition>
    </device>
    <device>
        <blockdev>/dev/disk/by-id/raiddisk-4</blockdev>
        <partition id="raid4" partitiontype="esp">
            <size>max</size>
            <type>linux raid</type>
        </partition>
    </device>
    <device>
        <blockdev>/dev/disk/by-id/raiddisk-5</blockdev>
        <partition id="raid5" partitiontype="esp">
            <size>max</size>
            <type>linux raid</type>
        </partition>
    </device>
    <device>
        <blockdev>/dev/disk/by-id/raiddisk-6</blockdev>
        <partition id="raid6" partitiontype="esp">
            <size>max</size>
            <type>linux raid</type>
        </partition>
    </device>
    <device>
        <blockdev>/dev/disk/by-id/raiddisk-7</blockdev>
        <partition id="raid7" partitiontype="esp">
            <size>max</size>
            <type>linux raid</type>
        </partition>
    </device>
    <device>
        <blockdev>/dev/disk/by-id/raiddisk-8</blockdev>
        <partition id="raid8" partitiontype="esp">
            <size>max</size>
            <type>linux raid</type>
        </partition>
    </device>
    <raid id="slash">
        <member>slash1</member>
        <member>slash2</member>
        <level>1</level>
        <filesystem>ext4</filesystem>
        <mountPoint>/</mountPoint>
        <mountOptions>defaults,noatime,nodiratime</mountOptions>
    </raid>
    <raid id="raid">
        <member>raid1</member>
        <member>raid2</member>
        <member>raid3</member>
        <member>raid4</member>
        <member>raid5</member>
        <member>raid6</member>
        <member>raid7</member>
        <member>raid8</member>
        <level>0</level>
        <filesystem>ext4</filesystem>
        <mountPoint>/raid</mountPoint>
        <mountOptions>defaults,noatime,nodiratime</mountOptions>
    </raid>
</diskSetup> 
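
Changing the disksetup only takes effect when the node is re-provisioned with a FULL install. Once a node has been re-provisioned, the resulting layout can be sanity-checked from the node; the expected result is a RAID1 device on the two osdisk partitions mounted at / and a RAID0 device across the raiddisk partitions mounted at /raid:

cat /proc/mdstat
lsblk -o NAME,SIZE,TYPE,MOUNTPOINT
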
5. Add a finalize script to replace the /dev/nvmeXnY device entry for /boot/efi in /etc/fstab with its UUID
cmsh
category use <categoryname>
set finalizescript
### the finalizescript will open in the default editor; copy/paste the following script; save changes and exit
commit

#!/bin/bash
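# The node installer mounts the freshly provisioned root filesystem under
# /localdisk, so /localdisk/etc/fstab below is the fstab of the node being
# deployed. The sed replaces the /boot/efi line written by the installer
# (a /dev/nvmeXnY device path) with a UUID-based entry, so the mount no
# longer depends on NVMe enumeration order.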

sed -i "s/.*\/boot\/efi.*/UUID=\"$(blkid -l -t PARTLABEL=/boot/efi -s UUID -o value)\" \/boot\/efi    vfat       defaults,noatime,nodiratime              0 2/" /localdisk/etc/fstab 
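
For reference, after the finalize script runs, the /boot/efi line in the node's /etc/fstab should look roughly like the following (the UUID shown is only a placeholder):

UUID="XXXX-XXXX" /boot/efi    vfat       defaults,noatime,nodiratime              0 2
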
6. Create a service to regenerate the contents of /boot/efi/EFI
cm-chroot-sw-img /cm/images/<IMAGENAME>
cat > /etc/systemd/system/nvsm-bootefi.service << EOF
[Unit]
Description=Fix /boot/efi partition
Before=nvsm.service nvsm-notifier.service

[Service]
Type=oneshot
RemainAfterExit=yes
ExecStartPre=/usr/sbin/grub-install
ExecStart=/bin/cp -r --preserve /boot/efi/EFI/dgx /boot/efi/EFI/ubuntu

[Install]
WantedBy=multi-user.target
EOF

systemctl enable nvsm-bootefi.service
exit    ### leave the software image chroot
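
After a node boots with the updated image, the unit should have run once before NVSM starts. A quick status check on the node (verification only):

systemctl status nvsm-bootefi.service
ls /boot/efi/EFI/ubuntu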

IMPORTANT NOTE:

On some systems it is necessary to disable the NVMe/hard disk boot option in the BIOS menu to avoid falling back to booting from the local hard disk when GRUB is installed on the MBR via the grub-install command.

Updated on September 5, 2024