Skip to content

xenopsd: Don't balloon down memory on same-host migration #6437

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions ocaml/xapi-idl/xen/xenops_interface.ml
Original file line number Diff line number Diff line change
Expand Up @@ -718,6 +718,11 @@ module XenopsAPI (R : RPC) = struct
~description:["when true, verify remote server certificate"]
Types.bool
in
(* Boolean parameter of "VM.migrate": true when the migration is a
   localhost migration (see the description string below). *)
let localhost_migration =
  let description = ["when true, localhost migration is being performed"] in
  Param.mk ~name:"localhost_migration" ~description Types.bool
in
declare "VM.migrate" []
(debug_info_p
@-> vm_id_p
Expand All @@ -727,6 +732,7 @@ module XenopsAPI (R : RPC) = struct
@-> xenops_url
@-> compress
@-> verify_dest
@-> localhost_migration
@-> returning task_id_p err
)

Expand Down
14 changes: 8 additions & 6 deletions ocaml/xapi/xapi_vm_migrate.ml
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ let assert_licensed_storage_motion ~__context =

let rec migrate_with_retries ~__context ~queue_name ~max ~try_no ~dbg:_ ~vm_uuid
~xenops_vdi_map ~xenops_vif_map ~xenops_vgpu_map ~xenops_url ~compress
~verify_cert =
~verify_cert ~localhost_migration =
let open Xapi_xenops_queue in
let module Client = (val make_client queue_name : XENOPS) in
let dbg = Context.string_of_task_and_tracing __context in
Expand All @@ -254,7 +254,7 @@ let rec migrate_with_retries ~__context ~queue_name ~max ~try_no ~dbg:_ ~vm_uuid
progress := "Client.VM.migrate" ;
let t1 =
Client.VM.migrate dbg vm_uuid xenops_vdi_map xenops_vif_map
xenops_vgpu_map xenops_url compress verify_dest
xenops_vgpu_map xenops_url compress verify_dest localhost_migration
in
progress := "sync_with_task" ;
ignore (Xapi_xenops.sync_with_task __context queue_name t1)
Expand All @@ -281,7 +281,7 @@ let rec migrate_with_retries ~__context ~queue_name ~max ~try_no ~dbg:_ ~vm_uuid
(Printexc.to_string e) !progress try_no max ;
migrate_with_retries ~__context ~queue_name ~max ~try_no:(try_no + 1)
~dbg ~vm_uuid ~xenops_vdi_map ~xenops_vif_map ~xenops_vgpu_map
~xenops_url ~compress ~verify_cert
~xenops_url ~compress ~verify_cert ~localhost_migration
(* Something else went wrong *)
| e ->
debug
Expand Down Expand Up @@ -374,7 +374,8 @@ let pool_migrate ~__context ~vm ~host ~options =
Pool_features.assert_enabled ~__context ~f:Features.Xen_motion ;
let dbg = Context.string_of_task __context in
let localhost = Helpers.get_localhost ~__context in
if host = localhost then
let localhost_migration = host = localhost in
if localhost_migration then
info "This is a localhost migration" ;
let open Xapi_xenops_queue in
let queue_name = queue_of_vm ~__context ~self:vm in
Expand Down Expand Up @@ -431,7 +432,7 @@ let pool_migrate ~__context ~vm ~host ~options =
let verify_cert = Stunnel_client.pool () in
migrate_with_retry ~__context ~queue_name ~dbg ~vm_uuid
~xenops_vdi_map:[] ~xenops_vif_map:[] ~xenops_vgpu_map
~xenops_url ~compress ~verify_cert ;
~xenops_url ~compress ~verify_cert ~localhost_migration ;
(* Delete all record of this VM locally (including caches) *)
Xapi_xenops.Xenopsd_metadata.delete ~__context vm_uuid
)
Expand Down Expand Up @@ -1586,7 +1587,8 @@ let migrate_send' ~__context ~vm ~dest ~live:_ ~vdi_map ~vif_map ~vgpu_map
let dbg = Context.string_of_task __context in
migrate_with_retry ~__context ~queue_name ~dbg ~vm_uuid
~xenops_vdi_map ~xenops_vif_map ~xenops_vgpu_map
~xenops_url:remote.xenops_url ~compress ~verify_cert ;
~xenops_url:remote.xenops_url ~compress ~verify_cert
~localhost_migration:is_same_host ;
Xapi_xenops.Xenopsd_metadata.delete ~__context vm_uuid
)
with
Expand Down
41 changes: 27 additions & 14 deletions ocaml/xenopsd/lib/xenops_server.ml
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ type vm_migrate_op = {
; vmm_tmp_dest_id: Vm.id
; vmm_compress: bool
; vmm_verify_dest: bool
; vmm_localhost_migration: bool
}
[@@deriving rpcty]

Expand Down Expand Up @@ -2628,19 +2629,30 @@ and perform_exn ?result (op : operation) (t : Xenops_task.task_handle) : unit =
~path:(Uri.path_unencoded url ^ snippet ^ id_str)
~query:(Uri.query url) ()
in
(* CA-78365: set the memory dynamic range to a single value to stop
ballooning. *)
let atomic =
VM_set_memory_dynamic_range
(id, vm.Vm.memory_dynamic_min, vm.Vm.memory_dynamic_min)
in
let (_ : unit) =
perform_atomic ~progress_callback:(fun _ -> ()) atomic t
in
(* Waiting here is not essential but adds a degree of safety and
reducess unnecessary memory copying. *)
( try B.VM.wait_ballooning t vm
with Xenopsd_error Ballooning_timeout_before_migration -> ()
(* CA-78365: set the memory dynamic range to a single value
to stop ballooning, if ballooning is enabled at all *)
( if vm.memory_dynamic_min <> vm.memory_dynamic_max then
(* There's no need to balloon down when doing localhost migration -
we're not copying any memory in the first place. This would
likely increase VDI migration time as swap would be engaged.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this just a guess or do you have evidence from tests?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part comes from a report on the xcp-ng Discord

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made the observation on Windows VMs whilst performing VDI migrations in a production environment.

For example, on a VM with 32GB dynamic MAX and 16GB dynamic MIN with 20GB in use, the ballooning would mean waiting for 4GB to be pushed into the page file (my assumption being that this would also produce changed blocks that would then have to be sent to the new SR if we are migrating the disc backing the page file). The free 12GB may also have been used by the guest OS read cache and would be evicted, meaning that subsequent reads that might have been cache hits would instead have to go to disc.

Instead change the ballooning target to the current state *)
let new_balloon_target =
if vmm.vmm_localhost_migration then
(B.VM.get_state vm).memory_actual
else
vm.memory_dynamic_min
in
let atomic =
VM_set_memory_dynamic_range
(id, new_balloon_target, new_balloon_target)
in
let (_ : unit) =
perform_atomic ~progress_callback:(fun _ -> ()) atomic t
in
(* Waiting here is not essential but adds a degree of safety and
reduces unnecessary memory copying. *)
try B.VM.wait_ballooning t vm
with Xenopsd_error Ballooning_timeout_before_migration -> ()
) ;
(* Find out the VM's current memory_limit: this will be used to allocate
memory on the receiver *)
Expand Down Expand Up @@ -3597,7 +3609,7 @@ module VM = struct
(* Wraps [VM_s3resume id] in [Atomic] and hands it to [queue_operation]
   for VM [id]; the first argument is ignored. *)
let s3resume _ dbg id =
  let op = Atomic (VM_s3resume id) in
  queue_operation dbg id op

(* Server-side implementation of "VM.migrate". Arguments arrive positionally,
   so the trailing booleans must stay in the order of the interface
   declaration and of the client call: compress, verify_dest,
   localhost_migration. All three are [bool], so the type checker cannot
   detect a swap; taking [localhost_migration] before [verify_dest] would
   silently exchange the certificate-verification flag and the same-host
   flag. Each flag is stored in the matching [vmm_*] field below. *)
let migrate _context dbg id vmm_vdi_map vmm_vif_map vmm_vgpu_pci_map vmm_url
    (compress : bool) (verify_dest : bool) (localhost_migration : bool) =
  (* Temporary VM id: the first 24 characters of [uuid], eleven '0'
     characters, then '1' for the destination or '0' for the source.
     [String.sub] raises [Invalid_argument] if [uuid] is shorter than 24
     characters. *)
  let tmp_uuid_of uuid ~kind =
    Printf.sprintf "%s00000000000%c" (String.sub uuid 0 24)
      (match kind with `dest -> '1' | `src -> '0')
Expand All @@ -3614,6 +3626,7 @@ module VM = struct
        ; vmm_tmp_dest_id= tmp_uuid_of id ~kind:`dest
        ; vmm_compress= compress
        ; vmm_verify_dest= verify_dest
        ; vmm_localhost_migration= localhost_migration
        }
    )

Expand Down
Loading