diff --git a/nova/conf/vmware.py b/nova/conf/vmware.py
index 5aeb0e3a6f9..0bd0bfef9c7 100644
--- a/nova/conf/vmware.py
+++ b/nova/conf/vmware.py
@@ -515,6 +515,34 @@ The sync-loop thread for custom traits runs continuously,
 sleeping after syncing the traits. This setting defines how long to sleep
 between runs.
 
+Possible values:
+ * integer >= 0: time in seconds to sleep between runs
+ * integer < 0: disable the sync-loop
+"""),
+    cfg.IntOpt('external_customer_drs_rules_sync_loop_spacing',
+               default=1800,
+               help="""
+Amount of time in seconds to wait between external customer DRS rules
+sync-loop runs
+
+The sync-loop thread for external customer DRS rules runs continuously,
+sleeping after ensuring all necessary DRS rules exist and cleaning up
+no-longer-necessary ones. This setting defines how long to sleep between runs.
+
+Possible values:
+ * integer >= 0: time in seconds to sleep between runs
+ * integer < 0: disable the sync-loop
+"""),
+    cfg.IntOpt('external_customer_vm_groups_sync_loop_spacing',
+               default=1800,
+               help="""
+Amount of time in seconds to wait between external customer VmGroup
+sync-loop runs
+
+The sync-loop thread for external customer VmGroups runs continuously,
+sleeping after updating the members of the VmGroups found in the cluster that
+match a certain prefix. This setting defines how long to sleep between runs.
+
 Possible values:
  * integer >= time in seconds to sleep between runs
  * intger < 0: disable the sync-loop
diff --git a/nova/utils.py b/nova/utils.py
index efeef40a763..a1352606eb7 100644
--- a/nova/utils.py
+++ b/nova/utils.py
@@ -94,6 +94,7 @@
 BIGVM_EXCLUSIVE_TRAIT = 'CUSTOM_HANA_EXCLUSIVE_HOST'
 
 EXTERNAL_CUSTOMER_SUPPORTED_TRAIT = 'CUSTOM_EXTERNAL_CUSTOMER_SUPPORTED'
+EXTERNAL_CUSTOMER_DOMAIN_TRAIT = 'CUSTOM_EXTERNAL_CUSTOMER_{}'
 
 _FILE_CACHE = {}
 
diff --git a/nova/virt/vmwareapi/cluster_util.py b/nova/virt/vmwareapi/cluster_util.py
index c4d6749e00a..dad48c248af 100644
--- a/nova/virt/vmwareapi/cluster_util.py
+++ b/nova/virt/vmwareapi/cluster_util.py
@@ -20,6 +20,7 @@
 from nova import exception
 from nova.i18n import _
 from nova import utils
+from nova.virt.vmwareapi import error_util
 
 LOG = logging.getLogger(__name__)
 
@@ -220,6 +221,35 @@ def update_vm_group_membership(session, cluster, vm_group_name, vm_ref,
     reconfigure_cluster(session, cluster, config_spec)
 
 
+def set_vm_group_members(session, cluster_ref, vm_group_name, vm_refs):
+    """Replace the current members of the VmGroup with the given list
+
+    The VmGroup has to exist.
+    """
+
+    @utils.synchronized(f"set-vm-group-members-{vm_group_name}")
+    def _set_vm_group_members(session, cluster_ref, vm_group_name, vm_refs):
+        """Local function so the lock name includes the VmGroup name"""
+        cluster_config = session._call_method(
+            vim_util, "get_object_property", cluster_ref, "configurationEx")
+
+        client_factory = session.vim.client.factory
+        config_spec = client_factory.create('ns0:ClusterConfigSpecEx')
+
+        group = _get_vm_group(cluster_config, vm_group_name)
+        if not group:
+            raise error_util.VmGroupDoesNotExist(vm_group_name=vm_group_name)
+
+        group.vm = vm_refs
+
+        group_spec = create_group_spec(client_factory, group, 'edit')
+        config_spec.groupSpec = [group_spec]
+
+        reconfigure_cluster(session, cluster_ref, config_spec)
+
+    return _set_vm_group_members(session, cluster_ref, vm_group_name, vm_refs)
+
+
 def create_vm_rule(client_factory, name, vm_refs, policy='affinity',
                    rule=None):
     """Create a ClusterAffinityRuleSpec or ClusterAntiAffinityRuleSpec object
@@ -249,6 +279,29 @@ def create_vm_rule(client_factory, name, vm_refs, policy='affinity',
     return rule
 
 
+def create_vm_host_rule(client_factory, name, host_group_name, vm_group_name,
+                        policy='affinity', mandatory=True):
+    """Create a ClusterVmHostRuleInfo object
+
+    This rule defines anti-/affinity between a VmGroup and a HostGroup.
+    """
+    rule = client_factory.create('ns0:ClusterVmHostRuleInfo')
+    rule.name = name
+    rule.enabled = True
+    rule.mandatory = mandatory
+    rule.vmGroupName = vm_group_name
+
+    if policy == 'affinity':
+        rule.affineHostGroupName = host_group_name
+    elif policy == 'anti-affinity':
+        rule.antiAffineHostGroupName = host_group_name
+    else:
+        msg = _('%s policy is not supported.') % policy
+        raise exception.ValidationError(msg)
+
+    return rule
+
+
 def create_rule_spec(client_factory, rule, operation='add'):
     """Create a ClusterRuleSpec object"""
     rule_spec = client_factory.create('ns0:ClusterRuleSpec')
@@ -271,18 +324,8 @@ def _create_cluster_rules_spec(client_factory, name, vm_refs,
 def _create_cluster_group_rules_spec(client_factory, name, vm_group_name,
                                      host_group_name, policy='affinity',
                                      rule=None):
-    rules_info = client_factory.create('ns0:ClusterVmHostRuleInfo')
-    rules_info.name = name
-    rules_info.enabled = True
-    rules_info.mandatory = True
-    rules_info.vmGroupName = vm_group_name
-    if policy == 'affinity':
-        rules_info.affineHostGroupName = host_group_name
-    elif policy == 'anti-affinity':
-        rules_info.antiAffineHostGroupName = host_group_name
-    else:
-        msg = _('%s policy is not supported.') % policy
-        raise exception.ValidationError(msg)
+    rules_info = create_vm_host_rule(
+        client_factory, name, host_group_name, vm_group_name, policy)
 
     if rule is not None:
         rules_info.key = rule.key
diff --git a/nova/virt/vmwareapi/constants.py b/nova/virt/vmwareapi/constants.py
index e1c92aa8434..4f693ee1b4f 100644
--- a/nova/virt/vmwareapi/constants.py
+++ b/nova/virt/vmwareapi/constants.py
@@ -249,6 +249,8 @@
 # distinguish between what has been automatically created and what is
 # admin-created.
DRS_PREFIX = 'NOVA_' +# DRS rule prefix for external customer HostGroup affinity rules +DRS_EXT_CUSTOMER_PREFIX = f"{DRS_PREFIX}external_customer_" # Prefix for custom attributes (CustomFieldDef names) that's used by Nova to diff --git a/nova/virt/vmwareapi/driver.py b/nova/virt/vmwareapi/driver.py index bb92c587f99..ff7533c5d8d 100644 --- a/nova/virt/vmwareapi/driver.py +++ b/nova/virt/vmwareapi/driver.py @@ -260,6 +260,8 @@ def init_host(self, host): LOG.debug("Starting green server-group sync-loop thread") utils.spawn(self._server_group_sync_loop, host) utils.spawn(self._custom_traits_sync_loop, host) + utils.spawn(self._external_customer_drs_rules_sync_loop) + utils.spawn(self._external_customer_vm_groups_sync_loop) def get_nodenames_by_uuid(self, refresh=False): """Overwritten method to return a locally-cached node UUID @@ -611,6 +613,40 @@ def update_provider_tree(self, provider_tree, nodename, allocations=None): # where cpu traits are added. In the vmware world, this is where we # would add nested providers representing tenant VDC and similar. + self._update_provider_tree_for_external_customers( + provider_tree, nodename, stats['hostgroups']) + + def _update_provider_tree_for_external_customers(self, provider_tree, + nodename, hostgroups): + existing_traits = { + t for t in provider_tree.data(nodename).traits + if t.startswith(tuple( + utils.EXTERNAL_CUSTOMER_DOMAIN_TRAIT.format( + prefix.upper().removesuffix('-').replace('-', '_')) + for prefix in CONF.external_customer_domain_name_prefixes)) or + t == utils.EXTERNAL_CUSTOMER_SUPPORTED_TRAIT} + + wanted_traits = set() + # we can have generic host groups and special ones per domain + for prefix in CONF.external_customer_domain_name_prefixes: + hg_prefix = prefix.removesuffix('-') + + for hg_name in hostgroups: + if not hg_name.startswith(hg_prefix): + continue + + trait = utils.EXTERNAL_CUSTOMER_DOMAIN_TRAIT.format( + hg_name.upper().replace('-', '_')) + wanted_traits.add(trait) + + if wanted_traits: + wanted_traits.add(utils.EXTERNAL_CUSTOMER_SUPPORTED_TRAIT) + + traits_to_add = wanted_traits - existing_traits + provider_tree.add_traits(nodename, *traits_to_add) + traits_to_remove = existing_traits - wanted_traits + provider_tree.remove_traits(nodename, *traits_to_remove) + def prepare_for_spawn(self, instance): """Perform pre-checks for spawn.""" self._vmops.prepare_for_spawn(instance) @@ -981,6 +1017,61 @@ def _custom_traits_sync_loop(self, compute_host): time.sleep(CONF.vmware.custom_traits_sync_loop_spacing) + def _external_customer_drs_rules_sync_loop(self): + """Clean up and validate DRS rules for external customers HostGroups + + While during normal operations we already ensure that DRS rules get + created appropriately, we do not clean up no-longer-necessary ones. + This sync loop will ensure DRS rules and clean up, too. + """ + # we create a context here, so we can follow the logs for the sync-loop + # more easily + context = nova_context.get_admin_context() # noqa:F841 + + while CONF.vmware.external_customer_drs_rules_sync_loop_spacing >= 0: + LOG.debug('Starting external customer DRS rules sync-loop') + try: + self._vmops.sync_external_customer_drs_rules(do_cleanup=True) + except Exception as e: + LOG.exception("Finished external customer DRS rules sync-loop " + "with error: %s", e) + else: + LOG.debug('Finished external customer DRS rules sync-loop') + + time.sleep(CONF. 
+ vmware.external_customer_drs_rules_sync_loop_spacing) + + def _external_customer_vm_groups_sync_loop(self): + """Sync the instances of this host into members for VmGroups + + We pick the list of existing VmGroup objects from the cluster and sync + the members for them. _external_customer_drs_rules_sync_loop() takes + care of creating/deleting VmGroup objects for existing HostGroups. + """ + context = nova_context.get_admin_context() + + while CONF.vmware.external_customer_vm_groups_sync_loop_spacing >= 0: + LOG.debug('Starting external customer VmGroup sync-loop') + try: + drs_prefix = constants.DRS_EXT_CUSTOMER_PREFIX + + vm_groups = cluster_util.fetch_cluster_groups( + self._session, self._cluster_ref, group_type='vm') + vm_group_names = [vm_group_name for vm_group_name in vm_groups + if vm_group_name.startswith(drs_prefix)] + + for vm_group_name in vm_group_names: + self._vmops.sync_external_customer_vm_group(context, + vm_group_name) + except Exception as e: + LOG.exception("Finished external customer VmGroup sync-loop " + "with error: %s", e) + else: + LOG.debug('Finished external customer VmGroup sync-loop') + + time.sleep(CONF. + vmware.external_customer_vm_groups_sync_loop_spacing) + def check_can_live_migrate_destination(self, context, instance, src_compute_info, dst_compute_info, block_migration=False, diff --git a/nova/virt/vmwareapi/error_util.py b/nova/virt/vmwareapi/error_util.py index b6c0d30c351..b1e297b621a 100644 --- a/nova/virt/vmwareapi/error_util.py +++ b/nova/virt/vmwareapi/error_util.py @@ -54,3 +54,7 @@ class EvcModeDoesNotExist(exception.Invalid): class InClustervMotionCheckError(exception.Invalid): msg_fmt = _("in cluster migration failed: %(reason)s") + + +class VmGroupDoesNotExist(exception.Invalid): + msg_fmt = _("VmGroup %(vm_group_name)s does not exist") diff --git a/nova/virt/vmwareapi/host.py b/nova/virt/vmwareapi/host.py index 6de474c51f4..76ce144e453 100644 --- a/nova/virt/vmwareapi/host.py +++ b/nova/virt/vmwareapi/host.py @@ -22,12 +22,14 @@ from oslo_utils import units from oslo_utils import versionutils from oslo_vmware import exceptions as vexc +from oslo_vmware import vim_util as vutil import nova.conf from nova import context from nova import exception from nova import objects from nova.objects import fields as obj_fields +from nova.virt.vmwareapi import cluster_util from nova.virt.vmwareapi import ds_util from nova.virt.vmwareapi import vim_util from nova.virt.vmwareapi import vm_util @@ -98,6 +100,9 @@ def update_status(self): # Get cpu, memory stats from the cluster per_host_stats = vm_util.get_stats_from_cluster_per_host( self._session, self._cluster) + + cluster_hostgroups = cluster_util.fetch_cluster_groups( + self._session, self._cluster, group_type="host") except (vexc.VimConnectionException, vexc.VimAttributeException) as ex: # VimAttributeException is thrown when vpxd service is down LOG.warning("Failed to connect with %(node)s. 
" @@ -133,11 +138,30 @@ def update_status(self): data[self._cluster_node_name] = self._merge_stats( self._cluster_node_name, cluster_stats, defaults) + data[self._cluster_node_name]['hostgroups'] = {} + for hg in cluster_hostgroups.values(): + # ignore empty hostgroups + if not getattr(hg, 'host', None): + continue + + data[self._cluster_node_name]['hostgroups'][hg.name] = [ + vutil.get_moref_value(h_ref) for h_ref in hg.host] + for h_ref in hg.host: + host_ref_value = vutil.get_moref_value(h_ref) + if host_ref_value not in per_host_stats: + continue + h_name = per_host_stats[host_ref_value]["name"] + data[h_name].setdefault('hostgroups', []).append(hg.name) + self._stats = data if self._auto_service_disabled: self._set_host_enabled(True) return data + @property + def hostgroups(self): + return self._stats[self._cluster_node_name].get('hostgroups', {}) + def _merge_stats(self, host, stats, defaults): result = deepcopy(defaults) result["hypervisor_hostname"] = host diff --git a/nova/virt/vmwareapi/vmops.py b/nova/virt/vmwareapi/vmops.py index 4fcd03182d7..20d08cb2cea 100644 --- a/nova/virt/vmwareapi/vmops.py +++ b/nova/virt/vmwareapi/vmops.py @@ -19,6 +19,7 @@ Class for VM tasks like spawn, snapshot, suspend, resume etc. """ +from collections import defaultdict import contextlib import copy import itertools @@ -1241,6 +1242,8 @@ def prepare_for_spawn(self, instance): def update_cluster_placement(self, context, instance, remove=False): self.sync_instance_server_group(context, instance) self.update_admin_vm_group_membership(instance, remove=remove) + self.update_external_customer_placement(context, instance, + remove=remove) def sync_instance_server_group(self, context, instance): try: @@ -1273,6 +1276,233 @@ def update_admin_vm_group_membership(self, instance, remove=False): vm_group_name, vm_ref, remove=remove) + def _get_external_customer_vm_group_for_instance(self, instance, + hostgroups): + """Return the VmGroup name for the specific instance""" + domain_name = instance.system_metadata['domain_name'] + prefixes = tuple(CONF.external_customer_domain_name_prefixes) + drs_prefix = constants.DRS_EXT_CUSTOMER_PREFIX + + vm_group_name = None + if domain_name in hostgroups: + vm_group_name = f"{drs_prefix}{domain_name}" + elif not domain_name.startswith(prefixes): + vm_group_name = f"{drs_prefix}internal_workload" + else: + prefixes_and_hg_names = [(prefix, prefix.removesuffix('-')) + for prefix in prefixes] + for prefix, hg_name in prefixes_and_hg_names: + if not domain_name.startswith(prefix): + continue + + if hg_name not in hostgroups: + continue + + vm_group_name = f"{drs_prefix}{hg_name}" + break + else: + LOG.error("Could not find hostgroup for instance %s in " + "domain %s in the existing hostgroups %s", + instance.uuid, domain_name, hostgroups) + + return vm_group_name + + def _get_instances_by_external_customer_vm_group(self, context): + """Get Instances assigned to this host grouped by VmGroup name""" + InstanceList = objects.instance.InstanceList + filters = {'host': self._compute_host, 'deleted': False} + instances = InstanceList.get_by_filters( + context, filters, expected_attrs=['system_metadata']) + + hostgroups = self._vc_state.hostgroups + + grouped_instances = defaultdict(list) + for instance in instances: + vm_group_name = self._get_external_customer_vm_group_for_instance( + instance, hostgroups) + grouped_instances[vm_group_name].append(instance.uuid) + + return grouped_instances + + def update_external_customer_placement(self, context, instance, + remove=False): + 
"""Ensure DRS rules and VmGroup assignment for the instance""" + hg_prefixes = tuple(prefix.removesuffix('-') + for prefix in CONF.external_customer_domain_name_prefixes) + hostgroups = [hg_name for hg_name in self._vc_state.hostgroups + if hg_name.startswith(hg_prefixes)] + + if not hostgroups: + # This cluster does not manage external customer VMs. + return + + self.sync_external_customer_drs_rules() + + vm_group_name = self._get_external_customer_vm_group_for_instance( + instance, hostgroups) + if vm_group_name is None: + if remove: + # For removals we do not require a VmGroup because the VM would + # not be managed by the cluster soon anyways. + return + # FIXME build a VMware exception for this + raise exception.NovaException('') + self.sync_external_customer_vm_group(context, vm_group_name) + + def sync_external_customer_vm_group(self, context, vm_group_name): + LOG.debug('Starting sync for external customer VmGroup %s', + vm_group_name) + + @utils.synchronized("vmware-external-customer-vm-group-" + f"{vm_group_name}") + def _sync_vm_group(context, vm_group_name): + vg_instances = \ + self._get_instances_by_external_customer_vm_group(context) + if vm_group_name not in vg_instances: + LOG.info("Sync for external customer VmGroup %s done: " + "No instances assigned.", vm_group_name) + return + + instance_uuids = vg_instances[vm_group_name] + + expected_members = self._filter_instances_for_drs( + context, instance_uuids, vm_group_name=vm_group_name) + if not expected_members: + LOG.info("Sync for external customer VmGroup %s done: " + "No expected members.", vm_group_name) + return + + LOG.debug('Updating external customer VmGroup %s with %s members', + vm_group_name, len(expected_members)) + cluster_util.set_vm_group_members( + self._session, self._cluster, vm_group_name, + list(expected_members.values())) + + LOG.debug('Sync for external customer VmGroup %s done', + vm_group_name) + + _sync_vm_group(context, vm_group_name) + + @utils.synchronized('sync-external-customers-drs-rules') + def sync_external_customer_drs_rules(self, do_cleanup=True): + """Ensure DRS rules for anti-/affinity to HostGroups exist + + We delete any DRS rules matching the naming scheme but referencing a + non-existing HostGroup to keep the cluster clean. + + For each HostGroup there's are 2 rules. Configured with affinity to the + HostGroup, we have a rule referencing a VmGroup named after the + HostGroup. To keep internal workload away from the HostGroup, a second + rule configured with anti-affinity referencing the pre-defined internal + workload VmGroup exists. + + :param:do_cleanup: If set to `False`, no remove operations for groups + and rules are executed. Can be useful outside the + sync-loop to reduce possible cluster + reconfiguration calls which can hang on the cluster + lock waiting for vCenter-external live-migrations. 
+ """ + LOG.debug("Syncing external customer DRS rules") + drs_prefix = constants.DRS_EXT_CUSTOMER_PREFIX + + client_factory = self._session.vim.client.factory + config_spec = client_factory.create('ns0:ClusterConfigSpecEx') + config_spec.groupSpec = [] + config_spec.rulesSpec = [] + + # fetch the existing Group objects and split them into HostGroup and + # VmGroup objects, filtering for HostGroups that contain hosts + _existing_groups = cluster_util.fetch_cluster_groups(self._session, + self._cluster) + + hg_prefixes = tuple(prefix.removesuffix('-') + for prefix in CONF.external_customer_domain_name_prefixes) + existing_hostgroups = {g.name for g in _existing_groups.values() + if getattr(g, 'host', None) and g.name.startswith(hg_prefixes)} + existing_vm_groups = {g.name for g in _existing_groups.values() + if g.name.startswith(drs_prefix)} + + # fetch the existing rules and deduplicate them + _existing_rules = sorted(cluster_util.get_rules_by_prefix( + self._session, self._cluster, drs_prefix), + key=attrgetter('name')) + existing_rules = {} + for rule_name, rules in itertools.groupby(_existing_rules, + key=attrgetter('name')): + rules = list(rules) + existing_rules[rule_name] = rules[0] + if len(rules) == 1: + continue + # we found a duplicated rule and have to delete all but one + for rule in rules[1:]: + config_spec.rulesSpec.append( + cluster_util.create_rule_spec( + client_factory, rule, 'remove')) + + expected_vm_group_names = { + f"{drs_prefix}{hg_name}" for hg_name in existing_hostgroups} + if existing_hostgroups: + # If there are no hostgroups we have to take care of, it means our + # cluster doesn't handle external workload and we also don't need + # the internal_workload groups. + expected_vm_group_names.add(f"{drs_prefix}internal_workload") + + # Creat missing VmGroup objects so we can create rules for them + missing_vm_group_names = expected_vm_group_names - existing_vm_groups + for vm_group_name in missing_vm_group_names: + group = cluster_util.create_vm_group(client_factory, + vm_group_name, []) + config_spec.groupSpec.append( + cluster_util.create_group_spec( + client_factory, group, 'add')) + + expected_rule_names = set() + # NOTE: Experiments have shown that we can create VmGroup objects in + # the same call that we create a rule that uses them. + for hg_name in existing_hostgroups: + for affinity, vm_group_name in ( + ('affinity', f"{drs_prefix}{hg_name}"), + ('anti-affinity', f"{drs_prefix}internal_workload")): + rule_name = f"{drs_prefix}{hg_name}_{affinity}" + expected_rule_names.add(rule_name) + if rule := existing_rules.get(rule_name): + # Make sure our existing rule is enabled + if not rule.enabled: + rule.enabled = True + config_spec.rulesSpec.append( + cluster_util.create_rule_spec( + client_factory, rule, 'edit')) + else: + # Since we ensure that VmGroups we expect exist/get + # created, we don't have to handle non-existing VmGroups + # here. 
+                    rule = cluster_util.create_vm_host_rule(client_factory,
+                        rule_name, hg_name, vm_group_name, policy=affinity)
+                    config_spec.rulesSpec.append(
+                        cluster_util.create_rule_spec(
+                            client_factory, rule, 'add'))
+
+        # Clean up no longer necessary DRS rules
+        if do_cleanup:
+            for rule_name in existing_rules.keys() - expected_rule_names:
+                config_spec.rulesSpec.append(
+                    cluster_util.create_rule_spec(
+                        client_factory, existing_rules[rule_name], 'remove'))
+
+        if not config_spec.rulesSpec and not config_spec.groupSpec:
+            return
+
+        rule_changes = [str((rs.info.name, rs.operation))
+                        for rs in config_spec.rulesSpec]
+        group_changes = [str((gs.info.name, gs.operation))
+                         for gs in config_spec.groupSpec]
+        LOG.info('Updating external customer DRS rules and groups: %s and %s',
+                 ', '.join(rule_changes), ', '.join(group_changes))
+        cluster_util.reconfigure_cluster(self._session, self._cluster,
+                                         config_spec)
+        LOG.info('Updated external customer DRS rules and groups: %s and %s',
+                 ', '.join(rule_changes), ', '.join(group_changes))
+
     def _build_template_vm_inventory_path(self, vi):
         vm_folder_name = self._session._call_method(vutil,
                                                     "get_object_property",
@@ -4013,9 +4243,22 @@ def set_compute_host(self, compute_host):
         """Called by the driver on init_host() so we know the compute host"""
         self._compute_host = compute_host
 
-    def sync_server_group(self, context, sg_uuid):
-        """Sync a server group by its uuid for the current host/cluster
+    def _filter_instances_for_drs(self, context, instance_uuids, sg_uuid=None,
+                                  vm_group_name=None):
+        """Filter the given Instance UUIDs down to DRS-applicable ones
+
+        We cannot use all instances when configuring DRS rules/groups, because
+        some instances race against other, parallel tasks. This method
+        filters down the given instance UUIDs to the ones we should be able to
+        safely use with DRS.
+
+        We return a dictionary mapping the remaining instance UUIDs to MoRefs.
         """
+        if (sg_uuid, vm_group_name) == (None, None):
+            raise ValueError(
+                "Either `sg_uuid` or `vm_group_name` is required for "
+                "`VMwareVMOps._filter_instances_for_drs()`.")
+
         # we have to ignore instances currently in a volatitle state, where
         # either VMware cannot support them being in a DRS rule or we expect
         # them to go away during the syncing process, which could lead to
@@ -4028,6 +4271,99 @@ def sync_server_group(self, context, sg_uuid):
             task_states.REBUILDING, task_states.REBUILD_BLOCK_DEVICE_MAPPING,
         ]
+
+        # First we check for all instances, which have ongoing migrations
+        # on the given host, either as source or destination
+
+        # Decision matrix for migrations:
+        # Mig-Status        Action/Host
+        #                   Source      Dest
+        # Preparing         Remove      N/A
+        # Running           Remove      Add
+        # (Other states are consistent with default behaviour)
+        #
+        # So the Instance.host will always be the source of the migration,
+        # and we want to remove the rules.
+        # We only need to specially handle the case of a running migration
+        # on the destination host, and add it
+
+        MigrationList = objects.migration.MigrationList
+        filters = {
+            "host": self._compute_host,
+            "instance_uuid": instance_uuids,
+            "status": ["preparing", "running"],
+        }
+
+        expected_members = {}
+
+        migrations_by_instance_uuid = {}
+        for migration in MigrationList.get_by_filters(context, filters):
+            instance_uuid = migration.instance_uuid
+            if migration.source_compute == self._compute_host:
+                # The host is the source of a migration
+                # That means the instance will be part of the instance list
+                # So we have to remember that instance to be removed from
+                # the DRS rule-set
+                migrations_by_instance_uuid[instance_uuid] = \
+                    migration
+            else:
+                # We now handle the destination side
+                if migration.status == "preparing":
+                    # Not even started, we can ignore that one
+                    continue
+
+                # Polling the cache, as vm_util.get_vm_ref is very slow
+                # for the negative search.
+                # We just have to ensure, that the cache holds a value
+                # before syncing the server group on the destination host
+                # Race conditions are averted by the calling function's lock
+                moref = vm_util.vm_ref_cache_get(instance_uuid)
+                if moref:
+                    expected_members[instance_uuid] = moref
+
+        # retrieve the instances, because the given UUIDs contain all members
+        # and we need to filter them for our host
+        InstanceList = objects.instance.InstanceList
+        filters = {'host': self._compute_host, 'uuid': instance_uuids,
+                   'deleted': False}
+        instances = InstanceList.get_by_filters(context, filters,
+                                                expected_attrs=[])
+
+        if sg_uuid:
+            msg_prefix = "Excluding member %s of server-group %s, "
+            msg_value = sg_uuid
+        elif vm_group_name:
+            msg_prefix = "Excluding instance %s from VmGroup %s, "
+            msg_value = vm_group_name
+
+        for instance in instances:
+            task_state = instance.task_state
+            if task_state in STATES_EXCLUDING_MEMBERS_FROM_DRS_RULES:
+                LOG.debug(msg_prefix + "because it's in task_state %s.",
+                          instance.uuid, msg_value, task_state)
+                continue
+
+            migration = migrations_by_instance_uuid.get(instance.uuid)
+            if migration:
+                LOG.debug(msg_prefix + "due to being on the source side of "
+                          "ongoing migration %s.",
+                          instance.uuid, msg_value, migration.uuid)
+                continue
+
+            try:
+                moref = vm_util.get_vm_ref(self._session, instance)
+            except exception.InstanceNotFound:
+                LOG.warning(msg_prefix + "because we could not find "
+                            "a moref for it.",
+                            instance.uuid, msg_value)
+                continue
+            expected_members[instance.uuid] = moref
+
+        return expected_members
+
+    def sync_server_group(self, context, sg_uuid):
+        """Sync a server group by its uuid for the current host/cluster
+        """
         LOG.debug('Starting sync for server-group %s', sg_uuid)
 
         @utils.synchronized('vmware-server-group-{}'.format(sg_uuid))
@@ -4052,87 +4388,8 @@ def _sync_sync_server_group(context, sg_uuid):
                 LOG.debug('Sync for server-group %s done')
                 return
 
-            # First we check for all instances, which have ongoing migrations
-            # on the given host, either as source or destination
-
-            # Decision matrix for migrations:
-            # Mig-Status        Action/Host
-            #                   Source      Dest
-            # Preparing         Remove      N/A
-            # Running           Remove      Add
-            # (Other states are consistent with default behaviour)
-            #
-            # So the Instance.host will always be the source of the migration,
-            # and we want to remove the rules.
- # We only need to handle specially the case a running migration - # on the destination host, and add it - - MigrationList = objects.migration.MigrationList - filters = { - "host": self._compute_host, - "instance_uuid": sg.members, - "status": ["preparing", "running"], - } - - expected_members = {} - - migrations_by_instance_uuid = {} - for migration in MigrationList.get_by_filters(context, filters): - instance_uuid = migration.instance_uuid - if migration.source_compute == self._compute_host: - # The host is the source of a migration - # That means the instance will be part of the instance list - # So we have to remember that instance to be removed from - # the DRS rule-set - migrations_by_instance_uuid[instance_uuid] = \ - migration - else: - # We now handle the destination side - if migration.status == "preparing": - # Not even started, we can ignore that one - continue - - # Polling the cache, as vm_util.get_vm_ref is very slow - # for the negative search. - # We just have to ensure, that the cache holds a value - # before syncing the server group on the destination host - # Race conditions are averted by this functions lock - moref = vm_util.vm_ref_cache_get(instance_uuid) - if moref: - expected_members[instance_uuid] = moref - - # retrieve the instances, because sg.members contains all members - # and we need to filter them for our host - InstanceList = objects.instance.InstanceList - filters = {'host': self._compute_host, 'uuid': sg.members, - 'deleted': False} - instances = InstanceList.get_by_filters(context, filters, - expected_attrs=[]) - - for instance in instances: - task_state = instance.task_state - if task_state in STATES_EXCLUDING_MEMBERS_FROM_DRS_RULES: - LOG.debug("Excluding member %s of server-group %s, " - "because it's in task_state %s.", - instance.uuid, sg.uuid, task_state) - continue - - migration = migrations_by_instance_uuid.get(instance.uuid) - if migration: - LOG.debug("Excluding member %s of server-group %s, " - "due to being on the source side of " - "ongoing migration %s.", - instance.uuid, sg.uuid, migration.uuid) - continue - - try: - moref = vm_util.get_vm_ref(self._session, instance) - except exception.InstanceNotFound: - LOG.warning('Could not find moref for instance %s. ' - 'Ignoring member of server-group %s', - instance.uuid, sg.uuid) - continue - expected_members[instance.uuid] = moref + expected_members = self._filter_instances_for_drs( + context, sg.members, sg_uuid=sg_uuid) rule_members_by_name = {} if sg.policy == 'soft-anti-affinity':
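The three *_sync_loop_spacing options (the pre-existing custom-traits one and the two added here) gate their loops the same way: the loop condition is `>= 0`, so any negative value disables the sync-loop entirely, and otherwise the value is the sleep between runs. A minimal standalone sketch of that loop shape; `run_sync_loop`, `spacing` and `sync` are stand-ins, not names from the tree:

import time

def run_sync_loop(spacing, sync, log=print):
    # Mirrors the driver's loop shape: a negative spacing never enters the
    # loop, so the greenthread exits immediately and the sync is disabled.
    while spacing >= 0:
        log('Starting sync-loop run')
        try:
            sync()
        except Exception as e:  # keep the loop alive, like the driver does
            log('Finished sync-loop run with error: %s' % e)
        else:
            log('Finished sync-loop run')
        time.sleep(spacing)

# run_sync_loop(1800, lambda: None)   # sync every 30 minutes
# run_sync_loop(-1, lambda: None)     # disabled: returns immediately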
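How a HostGroup name ends up as a Placement trait can be illustrated without the driver. The sketch below mirrors only the wanted-traits half of _update_provider_tree_for_external_customers(); the trait templates come from nova/utils.py, while the HostGroup and prefix names ('extcust', 'extcust-team-a') are made up:

EXTERNAL_CUSTOMER_SUPPORTED_TRAIT = 'CUSTOM_EXTERNAL_CUSTOMER_SUPPORTED'
EXTERNAL_CUSTOMER_DOMAIN_TRAIT = 'CUSTOM_EXTERNAL_CUSTOMER_{}'

def wanted_traits(hostgroup_names, domain_name_prefixes):
    """Traits a node should expose for its external-customer HostGroups."""
    traits = set()
    for prefix in domain_name_prefixes:
        hg_prefix = prefix.removesuffix('-')
        for hg_name in hostgroup_names:
            if not hg_name.startswith(hg_prefix):
                continue
            # HostGroup name -> trait suffix: upper-case, '-' becomes '_'
            traits.add(EXTERNAL_CUSTOMER_DOMAIN_TRAIT.format(
                hg_name.upper().replace('-', '_')))
    if traits:
        # any matching HostGroup also makes the node "supported" in general
        traits.add(EXTERNAL_CUSTOMER_SUPPORTED_TRAIT)
    return traits

assert wanted_traits(['extcust', 'extcust-team-a'], ['extcust-']) == {
    'CUSTOM_EXTERNAL_CUSTOMER_SUPPORTED',
    'CUSTOM_EXTERNAL_CUSTOMER_EXTCUST',
    'CUSTOM_EXTERNAL_CUSTOMER_EXTCUST_TEAM_A',
}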
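The stats wiring in host.py boils down to two shapes: the cluster node gets a dict of HostGroup name to host MoRef values, and every host node gets the list of HostGroup names it belongs to. A reduced sketch with plain dicts and invented MoRef/host names ('host-11', 'esx-01') standing in for what update_status() builds; the real code also skips empty HostGroups and resolves MoRefs via oslo.vmware:

def attach_hostgroup_stats(data, cluster_node, hostgroups, per_host_stats):
    """hostgroups: HostGroup name -> list of host MoRef values (strings)."""
    data[cluster_node]['hostgroups'] = dict(hostgroups)
    for hg_name, host_refs in hostgroups.items():
        for ref in host_refs:
            if ref not in per_host_stats:
                continue
            h_name = per_host_stats[ref]['name']
            # every host node lists the HostGroups it is a member of
            data[h_name].setdefault('hostgroups', []).append(hg_name)
    return data

stats = {'cluster1': {}, 'esx-01': {}, 'esx-02': {}}
stats = attach_hostgroup_stats(
    stats, 'cluster1',
    {'extcust': ['host-11', 'host-12']},
    {'host-11': {'name': 'esx-01'}, 'host-12': {'name': 'esx-02'}})
assert stats['cluster1']['hostgroups'] == {'extcust': ['host-11', 'host-12']}
assert stats['esx-01']['hostgroups'] == ['extcust']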
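The VmGroup an instance lands in is decided purely from its domain_name, the configured prefixes and the HostGroups present on the cluster. A sketch of that decision order, equivalent to _get_external_customer_vm_group_for_instance() but taking the domain name directly; the domain and HostGroup names are hypothetical, the prefix value is the real constants.DRS_EXT_CUSTOMER_PREFIX:

DRS_EXT_CUSTOMER_PREFIX = 'NOVA_external_customer_'

def vm_group_for_domain(domain_name, hostgroups, prefixes):
    if domain_name in hostgroups:
        # dedicated HostGroup for exactly this domain
        return f"{DRS_EXT_CUSTOMER_PREFIX}{domain_name}"
    if not domain_name.startswith(tuple(prefixes)):
        # not an external customer at all -> internal workload group
        return f"{DRS_EXT_CUSTOMER_PREFIX}internal_workload"
    for prefix in prefixes:
        hg_name = prefix.removesuffix('-')
        if domain_name.startswith(prefix) and hg_name in hostgroups:
            # generic HostGroup shared by all domains with this prefix
            return f"{DRS_EXT_CUSTOMER_PREFIX}{hg_name}"
    return None  # external customer domain, but no matching HostGroup

hostgroups = ['extcust', 'extcust-special-domain']
prefixes = ['extcust-']
assert vm_group_for_domain('extcust-special-domain', hostgroups, prefixes) \
    == 'NOVA_external_customer_extcust-special-domain'
assert vm_group_for_domain('extcust-other-domain', hostgroups, prefixes) \
    == 'NOVA_external_customer_extcust'
assert vm_group_for_domain('internal-domain', hostgroups, prefixes) \
    == 'NOVA_external_customer_internal_workload'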
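For every matching HostGroup the sync converges on exactly two ClusterVmHostRuleInfo rules plus the shared internal-workload VmGroup. The naming scheme, reduced to plain strings; the 'extcust' HostGroup name is made up:

DRS_EXT_CUSTOMER_PREFIX = 'NOVA_external_customer_'

def expected_groups_and_rules(hostgroup_names):
    """VmGroup and rule names sync_external_customer_drs_rules() converges on."""
    vm_groups = {f"{DRS_EXT_CUSTOMER_PREFIX}{hg}" for hg in hostgroup_names}
    if hostgroup_names:
        vm_groups.add(f"{DRS_EXT_CUSTOMER_PREFIX}internal_workload")
    rules = {}
    for hg in hostgroup_names:
        # the customer's VmGroup must run on the HostGroup ...
        rules[f"{DRS_EXT_CUSTOMER_PREFIX}{hg}_affinity"] = (
            'affinity', f"{DRS_EXT_CUSTOMER_PREFIX}{hg}", hg)
        # ... and internal workload must stay off of it
        rules[f"{DRS_EXT_CUSTOMER_PREFIX}{hg}_anti-affinity"] = (
            'anti-affinity', f"{DRS_EXT_CUSTOMER_PREFIX}internal_workload", hg)
    return vm_groups, rules

groups, rules = expected_groups_and_rules(['extcust'])
assert sorted(groups) == ['NOVA_external_customer_extcust',
                          'NOVA_external_customer_internal_workload']
assert sorted(rules) == ['NOVA_external_customer_extcust_affinity',
                         'NOVA_external_customer_extcust_anti-affinity']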
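Duplicate rules with the same name are collapsed by sorting and grouping before reconciliation starts. The same sort/groupby step in isolation, with a namedtuple standing in for the suds rule objects (the real code then removes the extras via create_rule_spec(..., 'remove')):

from collections import namedtuple
from itertools import groupby
from operator import attrgetter

Rule = namedtuple('Rule', ['key', 'name'])  # stand-in for ClusterVmHostRuleInfo

def split_duplicates(fetched_rules):
    """Keep one rule per name, return the rest for removal."""
    keep, remove = {}, []
    for name, rules in groupby(sorted(fetched_rules, key=attrgetter('name')),
                               key=attrgetter('name')):
        rules = list(rules)
        keep[name] = rules[0]
        remove.extend(rules[1:])
    return keep, remove

keep, remove = split_duplicates([Rule(11, 'NOVA_external_customer_x_affinity'),
                                 Rule(12, 'NOVA_external_customer_x_affinity'),
                                 Rule(13, 'NOVA_external_customer_y_affinity')])
assert [r.key for r in remove] == [12]
assert sorted(keep) == ['NOVA_external_customer_x_affinity',
                        'NOVA_external_customer_y_affinity']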
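The migration decision matrix in _filter_instances_for_drs() reduces to a three-way outcome per ongoing migration touching this host. A tiny sketch of just that table; drs_member_action is an invented name:

def drs_member_action(is_source, status):
    """Outcome for an instance with an ongoing preparing/running migration."""
    if is_source:
        # Instance.host still points at the source, so the instance shows up
        # in the host's instance list - drop it from the DRS rule/group.
        return 'remove'
    if status == 'preparing':
        # destination side, migration not really started yet
        return 'ignore'
    # destination side of a running migration - add it, if a MoRef is cached
    return 'add'

assert drs_member_action(is_source=True, status='preparing') == 'remove'
assert drs_member_action(is_source=True, status='running') == 'remove'
assert drs_member_action(is_source=False, status='preparing') == 'ignore'
assert drs_member_action(is_source=False, status='running') == 'add'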