Version: OpenNebula 6.10.3 (64156dd6) Enterprise Edition
Setup: 1 Zone with 3 Frontends
Problem description:
After an unsuccessful VM creation attempt (via the Sunstone GUI), requests to oned (via the GUI or via CLI commands like “onevm”) are no longer being processed by the oned daemon.
The oned service log (/var/log/one/oned.log) shows the following error every second:
Mon Jul 7 12:29:08 2025 [Z0][ONE][E]: SQL command was: INSERT INTO vm_pool (
oid, name, body, uid, gid, state, lcm_state, owner_u, group_u, other_u, short_body, body_json
) VALUES (
1339,
'vm-machine-name',
'<VM>...</VM>', -- (very long XML, see below)
5, 0, 1, 0, 1, 0, 0,
'<VM>...</VM>', -- (shorter XML, see below)
'{...}' -- (JSON, see below)
)
The JSON body contains the template used to create the VM, with its specific values (private data has been hidden):
{
"VM": {
"ID": "1339",
"UID": "...",
"GID": "...",
"UNAME": "...",
"GNAME": "...",
"NAME": "...",
"LAST_POLL": "...",
"STATE": "...",
"LCM_STATE": "...",
"PREV_STATE": "...",
"PREV_LCM_STATE": "...",
"RESCHED": "...",
"STIME": "...",
"ETIME": "...",
"DEPLOY_ID": "...",
"TEMPLATE": {
"AUTOMATIC_DS_REQUIREMENTS": "...",
"AUTOMATIC_NIC_REQUIREMENTS": "...",
"AUTOMATIC_REQUIREMENTS": "...",
"CONTEXT": {
"DISK_ID": "...",
"ETH0_DNS": "...",
"ETH0_EXTERNAL": "...",
"ETH0_GATEWAY": "...",
"ETH0_IP": "...",
"ETH0_IP6": "...",
"ETH0_IP6_GATEWAY": "...",
"ETH0_IP6_METHOD": "...",
"ETH0_IP6_METRIC": "...",
"ETH0_IP6_PREFIX_LENGTH": "...",
"ETH0_IP6_ULA": "...",
"ETH0_MAC": "...",
"ETH0_MASK": "...",
"ETH0_METHOD": "...",
"ETH0_METRIC": "...",
"ETH0_MTU": "...",
"ETH0_NETWORK": "...",
"ETH0_SEARCH_DOMAIN": "...",
"ETH0_VLAN_ID": "...",
"ETH0_VROUTER_IP": "...",
"ETH0_VROUTER_IP6": "...",
"ETH0_VROUTER_MANAGEMENT": "...",
"NETWORK": "...",
"PASSWORD": "...",
"PCI0_ADDRESS": "...",
"PCI0_IP": "...",
"PCI0_MAC": "...",
"SET_HOSTNAME": "...",
"SSH_PUBLIC_KEY": "...",
"START_SCRIPT_BASE64": "...",
"TARGET": "..."
},
"CPU": "...",
"CPU_MODEL": {
"MODEL": "..."
},
"DISK": [
{
"ALLOW_ORPHANS": "...",
"CEPH_HOST": "...",
"CEPH_SECRET": "...",
"CEPH_USER": "...",
"CLONE": "...",
"CLONE_TARGET": "...",
"CLUSTER_ID": "...",
"DATASTORE": "...",
"DATASTORE_ID": "...",
"DEV_PREFIX": "...",
"DISK_ID": "...",
"DISK_SNAPSHOT_TOTAL_SIZE": "...",
"DISK_TYPE": "...",
"DRIVER": "...",
"FORMAT": "...",
"IMAGE": "...",
"IMAGE_ID": "...",
"IMAGE_STATE": "...",
"IMAGE_UNAME": "...",
"LN_TARGET": "...",
"ORIGINAL_SIZE": "...",
"POOL_NAME": "...",
"READONLY": "...",
"SAVE": "...",
"SIZE": "...",
"SOURCE": "...",
"TARGET": "...",
"TM_MAD": "...",
"TYPE": "..."
}
],
"GRAPHICS": {
"LISTEN": "...",
"TYPE": "..."
},
"MEMORY": "...",
"MEMORY_MAX": "...",
"MEMORY_RESIZE_MODE": "...",
"NIC": [
{
"AR_ID": "...",
"BRIDGE": "...",
"BRIDGE_TYPE": "...",
"CLUSTER_ID": "...",
"DNS": "...",
"GATEWAY": "...",
"IP": "...",
"MAC": "...",
"NAME": "...",
"NETWORK": "...",
"NETWORK_ID": "...",
"NETWORK_UNAME": "...",
"NIC_ID": "...",
"SECURITY_GROUPS": "...",
"TARGET": "...",
"VN_MAD": "..."
}
],
"OS": {
"UUID": "..."
},
"PCI": {
"AR_ID": "...",
"BRIDGE": "...",
"BRIDGE_TYPE": "...",
"CLASS": "...",
"CLUSTER_ID": "...",
"DEVICE": "...",
"IP": "...",
"MAC": "...",
"NAME": "...",
"NETWORK": "...",
"NETWORK_ID": "...",
"NIC_ID": "...",
"PCI_ID": "...",
"SECURITY_GROUPS": "...",
"TARGET": "...",
"TYPE": "...",
"VENDOR": "...",
"VM_ADDRESS": "...",
"VM_BUS": "...",
"VM_DOMAIN": "...",
"VM_FUNCTION": "...",
"VM_SLOT": "...",
"VN_MAD": "..."
},
"SECURITY_GROUP_RULE": [
{
"PROTOCOL": "...",
"RULE_TYPE": "...",
"SECURITY_GROUP_ID": "...",
"SECURITY_GROUP_NAME": "..."
}
// ... (possibly more rules)
],
"TEMPLATE_ID": "...",
"VCPU": "...",
"VCPU_MAX": "...",
"VMID": "..."
},
"USER_TEMPLATE": {
"HOT_RESIZE": {
"CPU_HOT_ADD_ENABLED": "...",
"MEMORY_HOT_ADD_ENABLED": "..."
},
"HYPERVISOR": "...",
"LOGO": "...",
"MEMORY_UNIT_COST": "...",
"SCHED_REQUIREMENTS": "..."
}
}
}
Steps tried to solve:
- Restart the OpenNebula services on the 3 zone hosts (`systemctl restart opennebula`).
- Stop the services and repair consistency on the DB (`onedb fsck`), getting the following output:
Removing possibly corrupted records from VM showback please run 'oneshowback calculate` to recalculate the showback
VM 1339 is in Image XXX VM id list, but it should not
VNet XX AR 0 has leased ... to VM 1339, but it is actually free
VNet XX has 20 used leases, but it is actually 19
VNet XXX AR 0 has leased ... to VM 1339, but it is actually free
....
So multiple actions were performed to clean up the remaining configuration of the failed VM, but the failed action is still constantly hitting the service and preventing any other action (stop a VM, create a VNet…) from being performed, making the service unusable.
- Additionally, the scheduler logs (/var/log/one/sched.log) from the different hosts only show the following entry:
Mon Jul 7 11:37:50 2025 [Z0][SCHED][E]: oned is not leader
We haven’t found any workaround to clean up this action and unblock requests to the daemon.