From 1c137f0d5a26531b3e3d3fa10cf5b604cf01374c Mon Sep 17 00:00:00 2001 From: Zheng Li Date: Fri, 8 Jan 2016 13:56:06 +0000 Subject: [PATCH 1/2] CA-173605: avoid memory double counting In the fix for CA-155888, we solved a concurrency issue. It was due to the fact that the memory transfer from the squeezed to the domain wasn't atomic, so that the same memory might be allocated to two competing domains. However, the method we used was quite conservative: the memory reservation for a domain will only be dismissed _after_ the completion of the whole domain build process. It means that, at certain stage during the this process, when we calculate the used/free memory on the host, the same amount of memory could be counted twice: once as the memory physically allocated to an established domain, once as the memory reservation (before it is eventually dismissed). This can be observed more clearly in cases of domain migration, where the whole process takes rather long time. The consequence of this is, under certain circumstances, a memory allocation involved domain operation (like VM.start) that should succeed might be failed by squeezed, as it considers the host having less free memory (or, more occupied memory). This patch doesn't fundamentally change how the CA-155888 fix works. Instead, it adds some clue information to the memory reservation record, so that when counting the total memory reservation on a host, the calculation can use that information to figure out that a domain might have already been built with that memory allocation so that the reservation shouldn't be counted again here. Signed-off-by: Zheng Li --- src/memory_server.ml | 6 ++++-- src/squeeze_xen.ml | 2 +- src/squeezed_state.ml | 18 ++++++++++++------ 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/memory_server.ml b/src/memory_server.ml index 8e5bb9755..6cbed1460 100644 --- a/src/memory_server.ml +++ b/src/memory_server.ml @@ -129,10 +129,12 @@ let transfer_reservation_to_domain _ dbg session_id reservation_id domid = Xenctrl.with_intf (fun xc -> try - let kib = Client.immediate (get_client ()) (fun xs -> Client.read xs (reservation_path _service session_id reservation_id)) in + let reservation_id_path = reservation_path _service session_id reservation_id in + let kib = Client.immediate (get_client ()) (fun xs -> Client.read xs reservation_id_path) in (* This code is single-threaded, no need to make this transactional: *) Client.immediate (get_client ()) (fun xs -> Client.write xs (Printf.sprintf "/local/domain/%d/memory/initial-reservation" domid) kib); - Client.immediate (get_client ()) (fun xs -> Client.write xs (Printf.sprintf "/local/domain/%d/memory/reservation-id" domid) reservation_id); + Client.immediate (get_client ()) (fun xs -> Client.write xs (Printf.sprintf "/local/domain/%d/memory/reservation-id" domid) reservation_id); + Client.immediate (get_client ()) (fun xs -> Client.write xs (path [reservation_id_path; "in-transfer"]) (string_of_int domid)); Opt.iter (fun maxmem -> Squeeze_xen.Domain.set_maxmem_noexn xc domid maxmem) (try Some (Int64.of_string kib) with _ -> None); diff --git a/src/squeeze_xen.ml b/src/squeeze_xen.ml index 4e56e38db..b24decbcb 100644 --- a/src/squeeze_xen.ml +++ b/src/squeeze_xen.ml @@ -560,7 +560,7 @@ let make_host ~verbose ~xc = ) in (* Sum up the 'reservations' which exist separately from domains *) - let non_domain_reservations = Squeezed_state.total_reservations Squeezed_state._service in + let non_domain_reservations = Squeezed_state.total_reservations Squeezed_state._service domain_infolist in if verbose && non_domain_reservations <> 0L then debug "Total non-domain reservations = %Ld" non_domain_reservations; reserved_kib := Int64.add !reserved_kib non_domain_reservations; diff --git a/src/squeezed_state.ml b/src/squeezed_state.ml index 51232f426..45a153095 100644 --- a/src/squeezed_state.ml +++ b/src/squeezed_state.ml @@ -23,6 +23,7 @@ let _service = "squeezed" let listdir path = try List.filter (fun x -> x <> "") (Client.immediate (get_client ()) (fun xs -> Client.directory xs path)) with Xs_protocol.Enoent _ -> [] let xs_read path = try Client.immediate (get_client ()) (fun xs -> Client.read xs path) with Xs_protocol.Enoent _ as e -> begin debug "xenstsore-read %s returned ENOENT" path; raise e end +let xs_read_option path = try Some (Client.immediate (get_client ()) (fun xs -> Client.read xs path)) with Xs_protocol.Enoent _ -> None let path = String.concat "/" @@ -42,9 +43,14 @@ let del_reservation service session_id reservation_id = Client.immediate (get_client ()) (fun xs -> Client.rm xs (reservation_path service session_id reservation_id)) (** Return the total amount of memory reserved *) -let total_reservations service = - let session_ids = listdir (path [ ""; service; "state" ]) in - let session_total session_id = - let rids = listdir (path [ ""; service; "state"; session_id ]) in - List.fold_left Int64.add 0L (List.map (fun r -> Int64.of_string (xs_read (path [ ""; service; "state"; session_id; r]))) rids) in - List.fold_left Int64.add 0L (List.map session_total session_ids) +let total_reservations service domain_infolist = + let dom_list = List.map (fun di -> di.Xenctrl.domid) domain_infolist in + let session_ids = listdir (path [ ""; service; "state" ]) in + let already_counted sid rid = + match xs_read_option (path [ ""; service; "state"; sid; rid; "in-transfer"]) with + | Some domid when List.mem (int_of_string domid) dom_list -> true + | _ -> false in + let session_total sid = + let rids = listdir (path [ ""; service; "state"; sid ]) in + List.fold_left Int64.add 0L (List.map (fun rid -> if already_counted sid rid then 0L else Int64.of_string (xs_read (path [ ""; service; "state"; sid; rid]))) rids) in + List.fold_left Int64.add 0L (List.map session_total session_ids) From 5c9c28658a4f910e8b04bf81af675c0b31e272bb Mon Sep 17 00:00:00 2001 From: Zheng Li Date: Mon, 11 Jan 2016 23:13:50 +0000 Subject: [PATCH 2/2] CA-173605: swap the order of memory observation on host and running domains This patch simply swaps the order of getting host memory information and domains memory information during memory planning. As already stated in the source code comment, we don't currently have a way to atomically get the current memory layout/consumption of both the host and all its running domains at the same time. We have to do this one step after another, which can potentially cause inconsistency due to time gaps. We accommodate this by adding tolerances in the planning and also expect an amiable memory allocation speed and an eventual convergence. This is generally fine in most situations, the worst case that could happen is that 1) there is intensive domain memory allocation ongoing and 2) the inconsistency causes us to think that we have _more_ free host memory than we actually has (as the time of the actual planing), in which case we might be short of memory by encouraging multiple operations to compete for the same amount. By swapping the host memory inspection step to later (than domains memory inspection), we makes the calculation more conservative in such cases: the host free_mem observation will be closer to the final memory planning step hence smaller (in cases of intensive memory allocation) than the other way around; also a domain's memory_actual observation will be further ahead so that the estimated memory reservation for that domain (to come from the host free_mem pool) will come out larger. So, eventually the host's available free memory will be estimated more conservative than the current implementation, and therefore safer. Signed-off-by: Zheng Li --- src/squeeze_xen.ml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/squeeze_xen.ml b/src/squeeze_xen.ml index b24decbcb..935ddb67a 100644 --- a/src/squeeze_xen.ml +++ b/src/squeeze_xen.ml @@ -439,17 +439,6 @@ let make_host ~verbose ~xc = then domain_infolist else List.filter (fun di -> di.Xenctrl.domid > 0) domain_infolist in - (* - For the host free memory we sum the free pages and the pages needing - scrubbing: we don't want to adjust targets simply because the scrubber - is slow. - *) - let physinfo = Xenctrl.physinfo xc in - let free_pages_kib = Xenctrl.pages_to_kib (Int64.of_nativeint physinfo.Xenctrl.free_pages) - and scrub_pages_kib = Xenctrl.pages_to_kib (Int64.of_nativeint physinfo.Xenctrl.scrub_pages) - and total_pages_kib = Xenctrl.pages_to_kib (Int64.of_nativeint physinfo.Xenctrl.total_pages) in - let free_mem_kib = free_pages_kib +* scrub_pages_kib in - let cnx = xc in let domains = List.concat (List.map @@ -559,6 +548,17 @@ let make_host ~verbose ~xc = domain_infolist ) in + (* + For the host free memory we sum the free pages and the pages needing + scrubbing: we don't want to adjust targets simply because the scrubber + is slow. + *) + let physinfo = Xenctrl.physinfo xc in + let free_pages_kib = Xenctrl.pages_to_kib (Int64.of_nativeint physinfo.Xenctrl.free_pages) + and scrub_pages_kib = Xenctrl.pages_to_kib (Int64.of_nativeint physinfo.Xenctrl.scrub_pages) + and total_pages_kib = Xenctrl.pages_to_kib (Int64.of_nativeint physinfo.Xenctrl.total_pages) in + let free_mem_kib = free_pages_kib +* scrub_pages_kib in + (* Sum up the 'reservations' which exist separately from domains *) let non_domain_reservations = Squeezed_state.total_reservations Squeezed_state._service domain_infolist in if verbose && non_domain_reservations <> 0L