Skip to content

Commit

Permalink
lxd/instance/drivers/qemu: Pick a random vsock Context ID
Browse files Browse the repository at this point in the history
When acquiring a new Context ID for the communication via vsock, use the UUID of the instance as a seed for generating random uint32 candidates. The loop is kept open until a free Context ID is found or the timeout of 5s is reached. The syscall to the vsock returns ENODEV in case the Context ID is not yet assigned.
If the Context ID of a stopped VM has in the meantime been acquired by another VM, a new one is generated on the next start.

Fixes https://github.com/lxc/lxd/issues/11508

Signed-off-by: Julian Pelizäus <[email protected]>
  • Loading branch information
roosterfish committed Jun 28, 2023
1 parent 4db7b8c commit 23bd9af
Showing 1 changed file with 95 additions and 26 deletions.
121 changes: 95 additions & 26 deletions lxd/instance/drivers/driver_qemu.go
Original file line number Diff line number Diff line change
Expand Up @@ -375,16 +375,10 @@ func (d *qemu) getAgentClient() (*http.Client, error) {
return nil, err
}

vsockID := d.vsockID() // Default to using the vsock ID that will be used on next start.

// But if vsock ID from last VM start is present in volatile, then use that.
// This allows a running VM to be recovered after DB record deletion and that agent connection still work
// after the VM's instance ID has changed.
if d.localConfig["volatile.vsock_id"] != "" {
volatileVsockID, err := strconv.Atoi(d.localConfig["volatile.vsock_id"])
if err == nil {
vsockID = volatileVsockID
}
// Existing vsock ID from volatile.
vsockID, err := d.vsockID()
if err != nil {
return nil, err
}

agent, err := lxdvsock.HTTPClient(vsockID, shared.HTTPSDefaultPort, clientCert, clientKey, agentCert)
Expand Down Expand Up @@ -1151,9 +1145,15 @@ func (d *qemu) start(stateful bool, op *operationlock.InstanceOperation) error {

volatileSet := make(map[string]string)

// New or existing vsock ID from volatile.
vsockID, err := d.vsockID()
if err != nil {
return err
}

// Update vsock ID in volatile if needed for recovery (do this before UpdateBackupFile() call).
oldVsockID := d.localConfig["volatile.vsock_id"]
newVsockID := strconv.Itoa(d.vsockID())
newVsockID := strconv.Itoa(vsockID)
if oldVsockID != newVsockID {
volatileSet["volatile.vsock_id"] = newVsockID
}
Expand Down Expand Up @@ -2943,6 +2943,12 @@ func (d *qemu) generateQemuConfigFile(cpuInfo *cpuTopology, mountInfo *storagePo

cfg = append(cfg, qemuTablet(&tabletOpts)...)

// Existing vsock ID from volatile.
vsockID, err := d.vsockID()
if err != nil {
return "", nil, err
}

devBus, devAddr, multi = bus.allocate(busFunctionGroupGeneric)
vsockOpts := qemuVsockOpts{
dev: qemuDevOpts{
Expand All @@ -2951,7 +2957,7 @@ func (d *qemu) generateQemuConfigFile(cpuInfo *cpuTopology, mountInfo *storagePo
devAddr: devAddr,
multifunction: multi,
},
vsockID: d.vsockID(),
vsockID: vsockID,
}

cfg = append(cfg, qemuVsock(&vsockOpts)...)
Expand Down Expand Up @@ -7426,22 +7432,85 @@ func (d *qemu) DeviceEventHandler(runConf *deviceConfig.RunConfig) error {
return nil
}

// freeVsockID reports whether the given vsock Context ID is not yet acquired.
//
// The Context ID is probed by dialing it on shared.HTTPSDefaultPort:
//   - a dial failure carrying unix.ENODEV means the ID is free (returns true);
//   - any other dial failure is treated as "not free" (returns false);
//   - a successful dial means the ID is in use; the probe connection is closed
//     and false is returned.
func (d *qemu) freeVsockID(vsockID uint32) bool {
	c, err := lxdvsock.Dial(vsockID, shared.HTTPSDefaultPort)
	if err != nil {
		// Decide entirely inside the error branch: on failure there is no
		// usable connection, so it must never reach the Close call below.
		var unixErrno unix.Errno

		// The syscall to the vsock device returned "no such device".
		// This means the address (Context ID) is free.
		// Every other error (including non-errno errors) is conservatively
		// reported as "not free".
		return errors.As(err, &unixErrno) && unixErrno == unix.ENODEV
	}

	// Address is already in use. The connection was only a probe, so a
	// failure to close it does not change the answer.
	_ = c.Close()

	return false
}

// nextVsockID returns a free vsock Context ID for the instance.
//
// Candidates are drawn from the generator returned by
// util.GetStableRandomGenerator for instanceUUID. Each candidate is probed
// with freeVsockID until a free one is found or 5 seconds have elapsed.
//
// Returns an error if the generator cannot be created or if no free Context ID
// is found before the timeout.
func (d *qemu) nextVsockID(instanceUUID string) (uint32, error) {
	// VMADDR_CID_ANY (-1U) is a wildcard address and cannot be assigned to a guest.
	const vsockCIDAny = ^uint32(0)

	r, err := util.GetStableRandomGenerator(instanceUUID)
	if err != nil {
		return 0, fmt.Errorf("Failed generating stable random seed from instance UUID %q: %w", instanceUUID, err)
	}

	timeout := 5 * time.Second

	// Try to find a new Context ID.
	for start := time.Now(); time.Since(start) <= timeout; {
		candidateVsockID := r.Uint32()

		// Don't try to acquire the reserved Context IDs 0-2 or the wildcard ID.
		if candidateVsockID <= 2 || candidateVsockID == vsockCIDAny {
			continue
		}

		if d.freeVsockID(candidateVsockID) {
			return candidateVsockID, nil
		}
	}

	// timeout is a time.Duration (nanoseconds); convert explicitly so the
	// message reports seconds rather than the raw nanosecond count.
	return 0, fmt.Errorf("Timeout exceeded after %d seconds", int(timeout.Seconds()))
}

// vsockID returns the vsock Context ID for the VM.
func (d *qemu) vsockID() int {
// We use the system's own VsockID as the base.
//
// This is either "2" for a physical system or the VM's own id if
// running inside of a VM.
//
// To this we add 1 for backward compatibility with prior logic
// which would start at id 3 rather than id 2. Removing that offset
// would cause conflicts between existing VMs until they're all rebooted.
//
// We then add the VM's own instance id (1 or higher) to give us a
// unique, non-clashing context ID for our guest.
func (d *qemu) vsockID() (int, error) {
// Resolution order:
//  1. If "volatile.vsock_id" is set, parse it and return it when freeVsockID reports it as free.
//  2. Otherwise derive a new ID via nextVsockID using the instance's "volatile.uuid".
//
// Returns an error when the stored ID cannot be parsed, when the instance has no valid UUID,
// or when nextVsockID fails to find a free Context ID.
//
// Check if vsock ID from last VM start is present in volatile, then use that.
// This allows a running VM to be recovered after DB record deletion and that an agent connection still works
// after the VM's instance ID has changed.
existingVsockID, ok := d.localConfig["volatile.vsock_id"]
if ok {
vsockID, err := strconv.Atoi(existingVsockID)
if err != nil {
return 0, fmt.Errorf("Failed to parse volatile.vsock_id: %q: %w", existingVsockID, err)
}

// NOTE(review): the next two lines look like the removed pre-change implementation left over
// from the diff rendering (single-value return in a function declared to return (int, error));
// confirm they are not part of the committed function.
info := DriverStatuses()[instancetype.VM].Info
return info.Features["vhost_vsock"].(int) + 1 + d.id
// Check if the vsock ID from last VM start is still not acquired in case the VM was stopped.
// NOTE(review): freeVsockID returns false when the ID is in use, so this appears to fall through
// to generating a new ID even when the in-use ID belongs to this VM while it is running — confirm
// this is intended for callers that need the ID of a running VM.
// NOTE(review): the uint32 conversion assumes the stored value is a non-negative 32-bit number — TODO confirm.
if d.freeVsockID(uint32(vsockID)) {
return vsockID, nil
}
}

// The instance UUID is handed to nextVsockID, which uses it to create its random generator.
// A nil result from uuid.Parse is reported as a missing UUID.
instanceUUID := uuid.Parse(d.localConfig["volatile.uuid"])
if instanceUUID == nil {
return 0, fmt.Errorf("Instance does not have a UUID")
}

vsockID, err := d.nextVsockID(instanceUUID.String())
if err != nil {
return 0, fmt.Errorf("Failed to find a free vsock Context ID: %w", err)
}

return int(vsockID), nil
}

// InitPID returns the instance's current process ID.
Expand Down

0 comments on commit 23bd9af

Please sign in to comment.