vGPU support with multiple physical GPUs on the same board

I recently deployed OpenNebula (ON) with an Nvidia A16-64GB; this card has 4 physical GPUs.
I can configure VMs with vGPU normally; however, they only work on the same bus.
Using onehost show <id>, I can see all of them correctly:
3f:0*
41:0*
43:0*
45:0*

However, when instantiating a new VM, only bus 3f:0* is used. If I change the template with the parameter:
PCI = [
BUS = "43",…
…] as in the example above, I receive the following error message:

DEPLOY: /var/tmp/one/vgpu: line 85: echo: write error: Input/output error Error creating mediated device error: Failed to create domain from /var/lib/one//datastores/0/13/deployment.0 error: device not found: mediated device '3e87b439-543f-512e-a402-1ba761e1f0c2' not found

Bare-metal: Ubuntu 20.04.6 LTS
ON Bare-metal: 6.8.0
ON Frontend: 6.8.0

Could you please share the output of the onevm show <vmid> -j command, as well as the deployment.0 file for that VM?

Sure bro,

onevm show 23 -j

{
  "VM": {
    "ID": "23",
    "UID": "0",
    "GID": "0",
    "UNAME": "oneadmin",
    "GNAME": "oneadmin",
    "NAME": "teste7",
    "PERMISSIONS": {
      "OWNER_U": "1",
      "OWNER_M": "1",
      "OWNER_A": "0",
      "GROUP_U": "0",
      "GROUP_M": "0",
      "GROUP_A": "0",
      "OTHER_U": "0",
      "OTHER_M": "0",
      "OTHER_A": "0"
    },
    "LAST_POLL": "1707306538",
    "STATE": "3",
    "LCM_STATE": "36",
    "PREV_STATE": "3",
    "PREV_LCM_STATE": "36",
    "RESCHED": "0",
    "STIME": "1707306383",
    "ETIME": "0",
    "DEPLOY_ID": {
    },
    "MONITORING": {
      "DISK_SIZE": [
        {
          "ID": "0",
          "SIZE": "339"
        },
        {
          "ID": "1",
          "SIZE": "1"
        }
      ],
      "ID": "23",
      "TIMESTAMP": "1707306538"
    },
    "SCHED_ACTIONS": {
    },
    "TEMPLATE": {
      "AUTOMATIC_DS_REQUIREMENTS": "(\"CLUSTERS/ID\" @> 0)",
      "AUTOMATIC_NIC_REQUIREMENTS": "(\"CLUSTERS/ID\" @> 0)",
      "AUTOMATIC_REQUIREMENTS": "(CLUSTER_ID = 0) & !(PUBLIC_CLOUD = YES) & !(PIN_POLICY = PINNED)",
      "CONTEXT": {
        "DISK_ID": "1",
        "NETWORK": "YES",
        "PCI0_ADDRESS": "01:01.0",
        "SSH_PUBLIC_KEY": "",
        "TARGET": "hda"
      },
      "CPU": "1",
      "DISK": [
        {
          "ALLOW_ORPHANS": "YES",
          "CLONE": "YES",
          "CLONE_TARGET": "SYSTEM",
          "CLUSTER_ID": "0",
          "DATASTORE": "default",
          "DATASTORE_ID": "1",
          "DEV_PREFIX": "vd",
          "DISK_ID": "0",
          "DISK_SNAPSHOT_TOTAL_SIZE": "0",
          "DISK_TYPE": "FILE",
          "DRIVER": "qcow2",
          "FORMAT": "qcow2",
          "IMAGE": "Ubuntu18-gpu-nvidia-720-disk-0",
          "IMAGE_ID": "2",
          "IMAGE_STATE": "2",
          "LN_TARGET": "SYSTEM",
          "ORIGINAL_SIZE": "2252",
          "READONLY": "NO",
          "SAVE": "NO",
          "SIZE": "2252",
          "SOURCE": "/var/lib/one//datastores/1/af84ffc50d1048b08d3ca75652de97ee",
          "TARGET": "vda",
          "TM_MAD": "ssh",
          "TYPE": "FILE"
        }
      ],
      "GRAPHICS": {
        "LISTEN": "0.0.0.0",
        "PORT": "5923",
        "TYPE": "VNC"
      },
      "MEMORY": "768",
      "MEMORY_RESIZE_MODE": "BALLOONING",
      "OS": {
        "ARCH": "x86_64",
        "FIRMWARE": "",
        "FIRMWARE_SECURE": "YES",
        "UUID": "17a75a28-4510-451a-8974-02970a5f5415"
      },
      "PCI": {
        "ADDRESS": "0000:3f:00:5",
        "BUS": "3f",
        "CLASS": "0302",
        "DEVICE": "25b6",
        "DOMAIN": "0000",
        "FUNCTION": "5",
        "NUMA_NODE": "0",
        "PCI_ID": "0",
        "PROFILE": "nvidia-720",
        "SHORT_ADDRESS": "3f:00.5",
        "SLOT": "00",
        "UUID": "3e87b439-543f-512e-a402-1ba761e1f0c2",
        "VENDOR": "10de",
        "VM_ADDRESS": "01:01.0",
        "VM_BUS": "0x01",
        "VM_DOMAIN": "0x0000",
        "VM_FUNCTION": "0",
        "VM_SLOT": "0x01"
      },
      "TEMPLATE_ID": "3",
      "TM_MAD_SYSTEM": "ssh",
      "VMID": "23"
    },
    "USER_TEMPLATE": {
      "ERROR": "Wed Feb  7 08:46:43 2024: DEPLOY: /var/tmp/one/vgpu: line 85: echo: write error: Input/output error Error creating mediated de... see more details in VM log",
      "HOT_RESIZE": {
        "CPU_HOT_ADD_ENABLED": "NO",
        "MEMORY_HOT_ADD_ENABLED": "NO"
      },
      "LOGO": "images/logos/ubuntu.png",
      "LXD_SECURITY_PRIVILEGED": "true",
      "MEMORY_UNIT_COST": "MB"
    },
    "HISTORY_RECORDS": {
      "HISTORY": {
        "OID": "23",
        "SEQ": "0",
        "HOSTNAME": "10.128.255.6",
        "HID": "1",
        "CID": "0",
        "STIME": "1707306394",
        "ETIME": "0",
        "VM_MAD": "kvm",
        "TM_MAD": "ssh",
        "DS_ID": "0",
        "PSTIME": "1707306394",
        "PETIME": "1707306400",
        "RSTIME": "1707306400",
        "RETIME": "1707306403",
        "ESTIME": "0",
        "EETIME": "0",
        "ACTION": "0",
        "UID": "-1",
        "GID": "-1",
        "REQUEST_ID": "-1"
      }
    },
    "BACKUPS": {
      "BACKUP_CONFIG": {
      },
      "BACKUP_IDS": {
      }
    }
  }
}

cat deployment.0

<domain type='kvm' xmlns:qemu='http://libvirt.org/schemas/domain/qemu/1.0'>
	<name>one-23</name>
	<title>teste7</title>
	<uuid>17a75a28-4510-451a-8974-02970a5f5415</uuid>
	<cputune>
		<shares>1024</shares>
	</cputune>
	<memory>786432</memory>
	<os>
		<type arch='x86_64'>hvm</type>
	</os>
	<devices>
		<emulator><![CDATA[/usr/bin/qemu-kvm-one]]></emulator>
		<disk type='file' device='disk'>
			<source file='/var/lib/one//datastores/0/23/disk.0'/>
			<target dev='vda' bus='virtio'/>
			<driver name='qemu' type='qcow2' cache='none' discard='unmap'/>
		</disk>
		<disk type='file' device='cdrom'>
			<source file='/var/lib/one//datastores/0/23/disk.1'/>
			<target dev='hda' bus='ide'/>
			<readonly/>
			<driver name='qemu' type='raw'/>
		</disk>
		<controller type='scsi' index='0' model='virtio-scsi'>
			<driver queues='1'/>
		</controller>
		<graphics type='vnc' listen='0.0.0.0' port='5923'/>
		<hostdev mode='subsystem' type='mdev' model='vfio-pci'>
			<source>
				<address  uuid='3e87b439-543f-512e-a402-1ba761e1f0c2'/>
			</source>
				<address type='pci' domain='0x0000' bus='0x01' slot='0x01' function='0'/>
		</hostdev>
	</devices>
	<features>
		<acpi/>
	</features>
	<devices>
		<channel type='unix'>
			<source mode='bind'/><target type='virtio' name='org.qemu.guest_agent.0'/>
		</channel>
	</devices>
	<metadata>
		<one:vm xmlns:one="http://opennebula.org/xmlns/libvirt/1.0">
			<one:system_datastore><![CDATA[/var/lib/one//datastores/0/23]]></one:system_datastore>
			<one:name><![CDATA[teste7]]></one:name>
			<one:uname><![CDATA[oneadmin]]></one:uname>
			<one:uid>0</one:uid>
			<one:gname><![CDATA[oneadmin]]></one:gname>
			<one:gid>0</one:gid>
			<one:opennebula_version>6.8.0</one:opennebula_version>
			<one:stime>1707306383</one:stime>
			<one:deployment_time>1707306400</one:deployment_time>
		</one:vm>
	</metadata>
</domain>


Update:
If I specify SHORT_ADDRESS directly in the template, it is possible to create a VM using another bus. However, as the FUNCTION is then explicit, I can only create a single VM from the same template.

onetemplate show 3 -x

  <PCI>
      <BUS><![CDATA[45]]></BUS>
      <DOMAIN><![CDATA[0000]]></DOMAIN>
      <PROFILE><![CDATA[nvidia-720]]></PROFILE>
      <SHORT_ADDRESS><![CDATA[45:00.4]]></SHORT_ADDRESS>
    </PCI>

onevm show 30 -j

"PCI": {
        "ADDRESS": "0000:45:00:4",
        "BUS": "45",
        "DOMAIN": "0000",
        "FUNCTION": "4",
        "NUMA_NODE": "0",
        "PCI_ID": "0",
        "PROFILE": "nvidia-720",
        "SHORT_ADDRESS": "45:00.4",
        "SLOT": "00",
        "UUID": "aa874448-6a29-5f4a-8838-58d6373c2e57",
        "VM_ADDRESS": "01:01.0",
        "VM_BUS": "0x01",
        "VM_DOMAIN": "0x0000",
        "VM_FUNCTION": "0",
        "VM_SLOT": "0x01"
      },

It looks more like a configuration issue.
As written in the docs, OpenNebula schedules devices. One can request a device by its type / class or vendor.
So basically it should be enough to have only CLASS, DEVICE and VENDOR in the VM template, e.g.:

PCI=[
  CLASS="<classid>",
  DEVICE="<deviceid>",
  VENDOR="<vendorid>" ]

During VM instantiation OpenNebula starts picking devices from the first free one. Usually devices are allocated by bus.
If you want to use a specific one, then you need to set SHORT_ADDRESS and specify it at instantiation, as you tried and described here.
So either you let OpenNebula pick which device of the given type/class/vendor to use, or you select one.

1 Like

Hello @mkutouski
You are right
However, I am using multiple physical GPUs. If I create a VM from a 16 GB template and then another from a 2 GB template, in theory they should use different chips, but an error is generated because it is not possible to have two different profiles on the same chip.
I understand that it should be possible to create a template with a specific profile for each GPU, so that all of them are used according to their free slots.