VGPU support with multiple physical GPU on same board

dsoliveira · February 6, 2024, 3:36pm

I recently deployed OpenNebula (ON) with an Nvidia A16-64GB; this card has 4 physical GPUs.
I can configure VMs normally with vGPU; however, they only work on the same BUS.
Using onehost show (id) I can see them all correctly:
3f:0*
41:0*
43:0*
45:0*

However, when instantiating a new VM, only BUS 3f:0* is used, if I change the template with the parameter:
PCI = [
BUS = "43",…
…] as in the example above, I receive the following error message:

DEPLOY: /var/tmp/one/vgpu: line 85: echo: write error: Input/output error. Error creating mediated device. error: Failed to create domain from /var/lib/one//datastores/0/13/deployment.0 error: device not found: mediated device '3e87b439-543f-512e-a402-1ba761e1f0c2' not found

Bare-metal: Ubuntu 20.04.6 LTS
ON Bare-metal: 6.8.0
ON Frontend: 6.8.0

mkutouski · February 7, 2024, 8:28am

Could you please share the output of the `onevm show <vmid> -j` command, as well as the deployment.0 file for that VM?

dsoliveira · February 7, 2024, 11:53am

Sure bro,

onevm show 23 -j

{
  "VM": {
    "ID": "23",
    "UID": "0",
    "GID": "0",
    "UNAME": "oneadmin",
    "GNAME": "oneadmin",
    "NAME": "teste7",
    "PERMISSIONS": {
      "OWNER_U": "1",
      "OWNER_M": "1",
      "OWNER_A": "0",
      "GROUP_U": "0",
      "GROUP_M": "0",
      "GROUP_A": "0",
      "OTHER_U": "0",
      "OTHER_M": "0",
      "OTHER_A": "0"
    },
    "LAST_POLL": "1707306538",
    "STATE": "3",
    "LCM_STATE": "36",
    "PREV_STATE": "3",
    "PREV_LCM_STATE": "36",
    "RESCHED": "0",
    "STIME": "1707306383",
    "ETIME": "0",
    "DEPLOY_ID": {
    },
    "MONITORING": {
      "DISK_SIZE": [
        {
          "ID": "0",
          "SIZE": "339"
        },
        {
          "ID": "1",
          "SIZE": "1"
        }
      ],
      "ID": "23",
      "TIMESTAMP": "1707306538"
    },
    "SCHED_ACTIONS": {
    },
    "TEMPLATE": {
      "AUTOMATIC_DS_REQUIREMENTS": "(\"CLUSTERS/ID\" @> 0)",
      "AUTOMATIC_NIC_REQUIREMENTS": "(\"CLUSTERS/ID\" @> 0)",
      "AUTOMATIC_REQUIREMENTS": "(CLUSTER_ID = 0) & !(PUBLIC_CLOUD = YES) & !(PIN_POLICY = PINNED)",
      "CONTEXT": {
        "DISK_ID": "1",
        "NETWORK": "YES",
        "PCI0_ADDRESS": "01:01.0",
        "SSH_PUBLIC_KEY": "",
        "TARGET": "hda"
      },
      "CPU": "1",
      "DISK": [
        {
          "ALLOW_ORPHANS": "YES",
          "CLONE": "YES",
          "CLONE_TARGET": "SYSTEM",
          "CLUSTER_ID": "0",
          "DATASTORE": "default",
          "DATASTORE_ID": "1",
          "DEV_PREFIX": "vd",
          "DISK_ID": "0",
          "DISK_SNAPSHOT_TOTAL_SIZE": "0",
          "DISK_TYPE": "FILE",
          "DRIVER": "qcow2",
          "FORMAT": "qcow2",
          "IMAGE": "Ubuntu18-gpu-nvidia-720-disk-0",
          "IMAGE_ID": "2",
          "IMAGE_STATE": "2",
          "LN_TARGET": "SYSTEM",
          "ORIGINAL_SIZE": "2252",
          "READONLY": "NO",
          "SAVE": "NO",
          "SIZE": "2252",
          "SOURCE": "/var/lib/one//datastores/1/af84ffc50d1048b08d3ca75652de97ee",
          "TARGET": "vda",
          "TM_MAD": "ssh",
          "TYPE": "FILE"
        }
      ],
      "GRAPHICS": {
        "LISTEN": "0.0.0.0",
        "PORT": "5923",
        "TYPE": "VNC"
      },
      "MEMORY": "768",
      "MEMORY_RESIZE_MODE": "BALLOONING",
      "OS": {
        "ARCH": "x86_64",
        "FIRMWARE": "",
        "FIRMWARE_SECURE": "YES",
        "UUID": "17a75a28-4510-451a-8974-02970a5f5415"
      },
      "PCI": {
        "ADDRESS": "0000:3f:00:5",
        "BUS": "3f",
        "CLASS": "0302",
        "DEVICE": "25b6",
        "DOMAIN": "0000",
        "FUNCTION": "5",
        "NUMA_NODE": "0",
        "PCI_ID": "0",
        "PROFILE": "nvidia-720",
        "SHORT_ADDRESS": "3f:00.5",
        "SLOT": "00",
        "UUID": "3e87b439-543f-512e-a402-1ba761e1f0c2",
        "VENDOR": "10de",
        "VM_ADDRESS": "01:01.0",
        "VM_BUS": "0x01",
        "VM_DOMAIN": "0x0000",
        "VM_FUNCTION": "0",
        "VM_SLOT": "0x01"
      },
      "TEMPLATE_ID": "3",
      "TM_MAD_SYSTEM": "ssh",
      "VMID": "23"
    },
    "USER_TEMPLATE": {
      "ERROR": "Wed Feb  7 08:46:43 2024: DEPLOY: /var/tmp/one/vgpu: line 85: echo: write error: Input/output error Error creating mediated de... see more details in VM log",
      "HOT_RESIZE": {
        "CPU_HOT_ADD_ENABLED": "NO",
        "MEMORY_HOT_ADD_ENABLED": "NO"
      },
      "LOGO": "images/logos/ubuntu.png",
      "LXD_SECURITY_PRIVILEGED": "true",
      "MEMORY_UNIT_COST": "MB"
    },
    "HISTORY_RECORDS": {
      "HISTORY": {
        "OID": "23",
        "SEQ": "0",
        "HOSTNAME": "10.128.255.6",
        "HID": "1",
        "CID": "0",
        "STIME": "1707306394",
        "ETIME": "0",
        "VM_MAD": "kvm",
        "TM_MAD": "ssh",
        "DS_ID": "0",
        "PSTIME": "1707306394",
        "PETIME": "1707306400",
        "RSTIME": "1707306400",
        "RETIME": "1707306403",
        "ESTIME": "0",
        "EETIME": "0",
        "ACTION": "0",
        "UID": "-1",
        "GID": "-1",
        "REQUEST_ID": "-1"
      }
    },
    "BACKUPS": {
      "BACKUP_CONFIG": {
      },
      "BACKUP_IDS": {
      }
    }
  }
}

cat deployment.0

<domain type='kvm' xmlns:qemu='http://libvirt.org/schemas/domain/qemu/1.0'>
	<name>one-23</name>
	<title>teste7</title>
	<uuid>17a75a28-4510-451a-8974-02970a5f5415</uuid>
	<cputune>
		<shares>1024</shares>
	</cputune>
	<memory>786432</memory>
	<os>
		<type arch='x86_64'>hvm</type>
	</os>
	<devices>
		<emulator><![CDATA[/usr/bin/qemu-kvm-one]]></emulator>
		<disk type='file' device='disk'>
			<source file='/var/lib/one//datastores/0/23/disk.0'/>
			<target dev='vda' bus='virtio'/>
			<driver name='qemu' type='qcow2' cache='none' discard='unmap'/>
		</disk>
		<disk type='file' device='cdrom'>
			<source file='/var/lib/one//datastores/0/23/disk.1'/>
			<target dev='hda' bus='ide'/>
			<readonly/>
			<driver name='qemu' type='raw'/>
		</disk>
		<controller type='scsi' index='0' model='virtio-scsi'>
			<driver queues='1'/>
		</controller>
		<graphics type='vnc' listen='0.0.0.0' port='5923'/>
		<hostdev mode='subsystem' type='mdev' model='vfio-pci'>
			<source>
				<address  uuid='3e87b439-543f-512e-a402-1ba761e1f0c2'/>
			</source>
				<address type='pci' domain='0x0000' bus='0x01' slot='0x01' function='0'/>
		</hostdev>
	</devices>
	<features>
		<acpi/>
	</features>
	<devices>
		<channel type='unix'>
			<source mode='bind'/><target type='virtio' name='org.qemu.guest_agent.0'/>
		</channel>
	</devices>
	<metadata>
		<one:vm xmlns:one="http://opennebula.org/xmlns/libvirt/1.0">
			<one:system_datastore><![CDATA[/var/lib/one//datastores/0/23]]></one:system_datastore>
			<one:name><![CDATA[teste7]]></one:name>
			<one:uname><![CDATA[oneadmin]]></one:uname>
			<one:uid>0</one:uid>
			<one:gname><![CDATA[oneadmin]]></one:gname>
			<one:gid>0</one:gid>
			<one:opennebula_version>6.8.0</one:opennebula_version>
			<one:stime>1707306383</one:stime>
			<one:deployment_time>1707306400</one:deployment_time>
		</one:vm>
	</metadata>
</domain>

dsoliveira · February 7, 2024, 5:36pm

Update:
If I specify SHORT_ADDRESS directly in the template, it is possible to create a VM using another BUS. However, because the FUNCTION is then explicit, I can only create a single VM from that template.

onetemplate show 3 -x

  <PCI>
      <BUS><![CDATA[45]]></BUS>
      <DOMAIN><![CDATA[0000]]></DOMAIN>
      <PROFILE><![CDATA[nvidia-720]]></PROFILE>
      <SHORT_ADDRESS><![CDATA[45:00.4]]></SHORT_ADDRESS>
    </PCI>

onevm show 30 -j

"PCI": {
        "ADDRESS": "0000:45:00:4",
        "BUS": "45",
        "DOMAIN": "0000",
        "FUNCTION": "4",
        "NUMA_NODE": "0",
        "PCI_ID": "0",
        "PROFILE": "nvidia-720",
        "SHORT_ADDRESS": "45:00.4",
        "SLOT": "00",
        "UUID": "aa874448-6a29-5f4a-8838-58d6373c2e57",
        "VM_ADDRESS": "01:01.0",
        "VM_BUS": "0x01",
        "VM_DOMAIN": "0x0000",
        "VM_FUNCTION": "0",
        "VM_SLOT": "0x01"
      },

mkutouski · February 12, 2024, 10:22am

It looks more like a configuration issue.
As described in the docs, OpenNebula schedules PCI devices itself. You can request a device by its type/class or vendor.
So basically it should be enough to have only CLASS, DEVICE and VENDOR in the VM template, e.g.:

PCI=[
  CLASS="<classid>",
  DEVICE="<deviceid>",
  VENDOR="<vendorid>" ]

During VM instantiation OpenNebula starts picking devices from the first free one. Usually devices are allocated by bus.
If you want to use a specific one, then you need to use its SHORT_ADDRESS and specify it at instantiation, as you tried and described here.
So either you let OpenNebula pick which device of the given type/class/vendor to use, or you select one.

dsoliveira · February 13, 2024, 9:41am

Hello @mkutouski
You are right
However, I am using multiple physical GPUs. If I create a 16 GB template and then use a 2 GB template, in theory they should use different chips, but an error is generated because it is not possible to have two profiles on the same chip.
I understand that it should be possible to create a template with a specific profile for each GPU, so that they are all used according to the free slots.

Topic		Replies	Views
GPU Passthrough no longer works since VGPU support was added CLI / API	4	1337	October 6, 2022
Passthrough vGPU Community Support	0	191	July 12, 2024
PCI Passthrough for graphics card consisting of multiple PCI devices General	0	771	June 20, 2019
VMS attached GPU failed to start VM Configuration / Contextualization	3	538	June 18, 2020
Support to vGPU General	2	354	January 11, 2024

VGPU support with multiple physical GPU on same board

Related topics