diff --git a/Makefile b/Makefile index 03d0d533a..d01652d0c 100755 --- a/Makefile +++ b/Makefile @@ -84,6 +84,7 @@ PLUGIN_SCRIPT_DEST := /etc/xapi.d/plugins/ LIBEXEC := /opt/xensource/libexec/ UDEV_RULES_DIR := /etc/udev/rules.d/ UDEV_SCRIPTS_DIR := /etc/udev/scripts/ +SYSTEMD_CONF_DIR := /etc/systemd/system/ SYSTEMD_SERVICE_DIR := /usr/lib/systemd/system/ INIT_DIR := /etc/rc.d/init.d/ MPATH_CONF_DIR := /etc/multipath.xenserver/ @@ -138,6 +139,8 @@ install: precheck mkdir -p $(SM_STAGING)$(UDEV_RULES_DIR) mkdir -p $(SM_STAGING)$(UDEV_SCRIPTS_DIR) mkdir -p $(SM_STAGING)$(INIT_DIR) + mkdir -p $(SM_STAGING)$(SYSTEMD_CONF_DIR) + mkdir -p $(SM_STAGING)$(SYSTEMD_CONF_DIR)/linstor-satellite.service.d mkdir -p $(SM_STAGING)$(SYSTEMD_SERVICE_DIR) mkdir -p $(SM_STAGING)$(MPATH_CONF_DIR) mkdir -p $(SM_STAGING)$(MODPROBE_DIR) @@ -163,6 +166,10 @@ install: precheck $(SM_STAGING)/$(SM_DEST) install -m 644 etc/logrotate.d/$(SMLOG_CONF) \ $(SM_STAGING)/$(LOGROTATE_DIR) + install -m 644 etc/systemd/system/linstor-satellite.service.d/override.conf \ + $(SM_STAGING)/$(SYSTEMD_CONF_DIR)/linstor-satellite.service.d/ + install -m 644 etc/systemd/system/var-lib-linstor.service \ + $(SM_STAGING)/$(SYSTEMD_CONF_DIR) install -m 644 etc/make-dummy-sr.service \ $(SM_STAGING)/$(SYSTEMD_SERVICE_DIR) install -m 644 systemd/xs-sm.service \ @@ -206,6 +213,9 @@ install: precheck install -m 755 drivers/iscsilib.py $(SM_STAGING)$(SM_DEST) install -m 755 drivers/fcoelib.py $(SM_STAGING)$(SM_DEST) mkdir -p $(SM_STAGING)$(LIBEXEC) + install -m 755 scripts/fork-log-daemon $(SM_STAGING)$(LIBEXEC) + install -m 755 scripts/linstor-kv-tool $(SM_STAGING)$(BIN_DEST) + install -m 755 scripts/safe-umount $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/local-device-change $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/check-device-sharing $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/usb_change $(SM_STAGING)$(LIBEXEC) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index a72d43e09..ca5015e31 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -19,8 +19,11 @@ try: from linstorjournaler import LinstorJournaler from linstorvhdutil import LinstorVhdUtil - from linstorvolumemanager \ - import LinstorVolumeManager, LinstorVolumeManagerError + from linstorvolumemanager import get_controller_uri + from linstorvolumemanager import get_controller_node_name + from linstorvolumemanager import LinstorVolumeManager + from linstorvolumemanager import LinstorVolumeManagerError + LINSTOR_AVAILABLE = True except ImportError: LINSTOR_AVAILABLE = False @@ -28,16 +31,24 @@ from lock import Lock import blktap2 import cleanup +import distutils import errno import functools +import lvutil +import os +import re import scsiutil +import signal +import socket import SR import SRCommand +import subprocess import time import traceback import util import VDI import vhdutil +import xml.etree.ElementTree as xml_parser import xmlrpc.client import xs_errors @@ -48,6 +59,23 @@ HIDDEN_TAG = 'hidden' +XHA_CONFIG_PATH = '/etc/xensource/xhad.conf' + +FORK_LOG_DAEMON = '/opt/xensource/libexec/fork-log-daemon' + +# This flag can be disabled to debug the DRBD layer. +# When this config var is False, the HA can only be used under +# specific conditions: +# - Only one heartbeat diskless VDI is present in the pool. +# - The other hearbeat volumes must be diskful and limited to a maximum of 3. +USE_HTTP_NBD_SERVERS = True + +# Useful flag to trace calls using cProfile. +TRACE_PERFS = False + +# Enable/Disable VHD key hash support. 
+USE_KEY_HASH = False + # ============================================================================== # TODO: Supports 'VDI_INTRODUCE', 'VDI_RESET_ON_BOOT/2', 'SR_TRIM', @@ -72,9 +100,9 @@ CONFIGURATION = [ ['group-name', 'LVM group name'], - ['hosts', 'host names to use'], ['redundancy', 'replication count'], - ['provisioning', '"thin" or "thick" are accepted'] + ['provisioning', '"thin" or "thick" are accepted (optional, defaults to thin)'], + ['monitor-db-quorum', 'disable controller when only one host is online (optional, defaults to true)'] ] DRIVER_INFO = { @@ -92,7 +120,8 @@ OPS_EXCLUSIVE = [ 'sr_create', 'sr_delete', 'sr_attach', 'sr_detach', 'sr_scan', - 'sr_update', 'vdi_create', 'vdi_delete', 'vdi_clone', 'vdi_snapshot' + 'sr_update', 'sr_probe', 'vdi_init', 'vdi_create', 'vdi_delete', + 'vdi_attach', 'vdi_detach', 'vdi_clone', 'vdi_snapshot', ] # ============================================================================== @@ -136,7 +165,9 @@ def attach_thin(session, journaler, linstor, sr_uuid, vdi_uuid): # If the virtual VHD size is lower than the LINSTOR volume size, # there is nothing to do. vhd_size = compute_volume_size( - LinstorVhdUtil(session, linstor).get_size_virt(vdi_uuid), + # TODO: Replace pylint comment with this feature when possible: + # https://github.com/PyCQA/pylint/pull/2926 + LinstorVhdUtil(session, linstor).get_size_virt(vdi_uuid), # pylint: disable = E1120 image_type ) @@ -180,12 +211,16 @@ def detach_thin(session, linstor, sr_uuid, vdi_uuid): device_path = linstor.get_device_path(vdi_uuid) new_volume_size = LinstorVolumeManager.round_up_volume_size( - LinstorVhdUtil(session, linstor).get_size_phys(device_path) + # TODO: Replace pylint comment with this feature when possible: + # https://github.com/PyCQA/pylint/pull/2926 + LinstorVhdUtil(session, linstor).get_size_phys(vdi_uuid) # pylint: disable = E1120 ) volume_info = linstor.get_volume_info(vdi_uuid) old_volume_size = volume_info.virtual_size - deflate(vdi_uuid, device_path, new_volume_size, old_volume_size) + deflate( + linstor, vdi_uuid, device_path, new_volume_size, old_volume_size + ) finally: lock.release() @@ -197,7 +232,7 @@ def inflate(journaler, linstor, vdi_uuid, vdi_path, new_size, old_size): return util.SMlog( - 'Inflate {} (new VHD size={}, previous={})' + 'Inflate {} (size={}, previous={})' .format(vdi_uuid, new_size, old_size) ) @@ -206,8 +241,15 @@ def inflate(journaler, linstor, vdi_uuid, vdi_path, new_size, old_size): ) linstor.resize_volume(vdi_uuid, new_size) + result_size = linstor.get_volume_size(vdi_uuid) + if result_size < new_size: + util.SMlog( + 'WARNING: Cannot inflate volume to {}B, result size: {}B' + .format(new_size, result_size) + ) + if not util.zeroOut( - vdi_path, new_size - vhdutil.VHD_FOOTER_SIZE, + vdi_path, result_size - vhdutil.VHD_FOOTER_SIZE, vhdutil.VHD_FOOTER_SIZE ): raise xs_errors.XenError( @@ -215,11 +257,11 @@ def inflate(journaler, linstor, vdi_uuid, vdi_path, new_size, old_size): opterr='Failed to zero out VHD footer {}'.format(vdi_path) ) - vhdutil.setSizePhys(vdi_path, new_size, False) + LinstorVhdUtil(None, linstor).set_size_phys(vdi_path, result_size, False) journaler.remove(LinstorJournaler.INFLATE, vdi_uuid) -def deflate(vdi_uuid, vdi_path, new_size, old_size): +def deflate(linstor, vdi_uuid, vdi_path, new_size, old_size): new_size = LinstorVolumeManager.round_up_volume_size(new_size) if new_size >= old_size: return @@ -229,16 +271,86 @@ def deflate(vdi_uuid, vdi_path, new_size, old_size): .format(vdi_uuid, new_size, old_size) ) - 
vhdutil.setSizePhys(vdi_path, new_size) + LinstorVhdUtil(None, linstor).set_size_phys(vdi_path, new_size) # TODO: Change the LINSTOR volume size using linstor.resize_volume. +IPS_XHA_CACHE = None + + +def get_ips_from_xha_config_file(): + if IPS_XHA_CACHE: + return IPS_XHA_CACHE + + ips = dict() + host_id = None + try: + # Ensure there is no dirty read problem. + # For example if the HA is reloaded. + tree = util.retry( + lambda: xml_parser.parse(XHA_CONFIG_PATH), + maxretry=10, + period=1 + ) + except: + return (None, ips) + + def parse_host_nodes(ips, node): + current_id = None + current_ip = None + + for sub_node in node: + if sub_node.tag == 'IPaddress': + current_ip = sub_node.text + elif sub_node.tag == 'HostID': + current_id = sub_node.text + else: + continue + + if current_id and current_ip: + ips[current_id] = current_ip + return + util.SMlog('Ill-formed XHA file, missing IPaddress or/and HostID') + + def parse_common_config(ips, node): + for sub_node in node: + if sub_node.tag == 'host': + parse_host_nodes(ips, sub_node) + + def parse_local_config(ips, node): + for sub_node in node: + if sub_node.tag == 'localhost': + for host_node in sub_node: + if host_node.tag == 'HostID': + return host_node.text + + for node in tree.getroot(): + if node.tag == 'common-config': + parse_common_config(ips, node) + elif node.tag == 'local-config': + host_id = parse_local_config(ips, node) + else: + continue + + if ips and host_id: + break + + return (host_id and ips.get(host_id), ips) + + +def activate_lvm_group(group_name): + path = group_name.split('/') + assert path and len(path) <= 2 + try: + lvutil.setActiveVG(path[0], True) + except Exception as e: + util.SMlog('Cannot active VG `{}`: {}'.format(path[0], e)) + # ============================================================================== # Usage example: # xe sr-create type=linstor name-label=linstor-sr # host-uuid=d2deba7a-c5ad-4de1-9a20-5c8df3343e93 -# device-config:hosts=node-linstor1,node-linstor2,node-linstor3 # device-config:group-name=vg_loop device-config:redundancy=2 @@ -250,6 +362,11 @@ class LinstorSR(SR.SR): MANAGER_PLUGIN = 'linstor-manager' + INIT_STATUS_NOT_SET = 0 + INIT_STATUS_IN_PROGRESS = 1 + INIT_STATUS_OK = 2 + INIT_STATUS_FAIL = 3 + # -------------------------------------------------------------------------- # SR methods. # -------------------------------------------------------------------------- @@ -265,8 +382,6 @@ def load(self, sr_uuid): ) # Check parameters. - if 'hosts' not in self.dconf or not self.dconf['hosts']: - raise xs_errors.XenError('LinstorConfigHostsMissing') if 'group-name' not in self.dconf or not self.dconf['group-name']: raise xs_errors.XenError('LinstorConfigGroupNameMissing') if 'redundancy' not in self.dconf or not self.dconf['redundancy']: @@ -289,6 +404,10 @@ def load(self, sr_uuid): else: self._provisioning = self.PROVISIONING_DEFAULT + monitor_db_quorum = self.dconf.get('monitor-db-quorum') + self._monitor_db_quorum = (monitor_db_quorum is None) or \ + distutils.util.strtobool(monitor_db_quorum) + # Note: We don't have access to the session field if the # 'vdi_attach_from_config' command is executed. 
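# Illustrative sketch (not part of the patch): a minimal, standalone version of the
# xhad.conf parsing done by get_ips_from_xha_config_file() above. The XML layout in
# SAMPLE_XHAD_CONF is an assumption inferred from that parser, not an official
# xhad.conf sample; only the tags the parser reads are included.
import xml.etree.ElementTree as ET

SAMPLE_XHAD_CONF = """
<xhad-config>
  <common-config>
    <host><HostID>1</HostID><IPaddress>192.0.2.11</IPaddress></host>
    <host><HostID>2</HostID><IPaddress>192.0.2.12</IPaddress></host>
  </common-config>
  <local-config>
    <localhost><HostID>2</HostID></localhost>
  </local-config>
</xhad-config>
"""

def parse_xha_ips(xml_text):
    # Returns (local_host_ip_or_None, {host_id: ip}) like the driver helper.
    root = ET.fromstring(xml_text)
    ips = {
        host.findtext('HostID'): host.findtext('IPaddress')
        for host in root.findall('./common-config/host')
    }
    local_id = root.findtext('./local-config/localhost/HostID')
    return (ips.get(local_id), ips)

assert parse_xha_ips(SAMPLE_XHAD_CONF) == \
    ('192.0.2.12', {'1': '192.0.2.11', '2': '192.0.2.12'})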
self._has_session = self.sr_ref and self.session is not None @@ -307,8 +426,8 @@ def load(self, sr_uuid): self.lock = Lock(vhdutil.LOCK_TYPE_SR, self.uuid) self.sr_vditype = SR.DEFAULT_TAP - self._hosts = self.dconf['hosts'].split(',') - self._redundancy = int(self.dconf['redundancy'] or 1) + if self.cmd == 'sr_create': + self._redundancy = int(self.dconf['redundancy']) or 1 self._linstor = None # Ensure that LINSTOR attribute exists. self._journaler = None @@ -317,46 +436,72 @@ def load(self, sr_uuid): self._is_master = True self._group_name = self.dconf['group-name'] - self._master_uri = None - self._vdi_shared_locked = False + self._vdi_shared_time = 0 - self._initialized = False + self._init_status = self.INIT_STATUS_NOT_SET + + self._vdis_loaded = False + self._all_volume_info_cache = None + self._all_volume_metadata_cache = None def _locked_load(method): - @functools.wraps(method) - def wrap(self, *args, **kwargs): - if self._initialized: - return method(self, *args, **kwargs) - self._initialized = True + def wrapped_method(self, *args, **kwargs): + self._init_status = self.INIT_STATUS_OK + return method(self, *args, **kwargs) - if not self._has_session: - if self.srcmd.cmd == 'vdi_attach_from_config': - # We must have a valid LINSTOR instance here without using - # the XAPI. - self._master_uri = 'linstor://{}'.format( - util.get_master_address() - ) - self._journaler = LinstorJournaler( - self._master_uri, self._group_name, logger=util.SMlog - ) + def load(self, *args, **kwargs): + # Activate all LVMs to make drbd-reactor happy. + if self.srcmd.cmd == 'sr_attach': + activate_lvm_group(self._group_name) - try: + if not self._has_session: + if self.srcmd.cmd in ( + 'vdi_attach_from_config', + 'vdi_detach_from_config', + # When on-slave (is_open) is executed we have an + # empty command. + None + ): + def create_linstor(uri, attempt_count=30): self._linstor = LinstorVolumeManager( - self._master_uri, + uri, self._group_name, - logger=util.SMlog - ) - return - except Exception as e: - util.SMlog( - 'Ignore exception. Failed to build LINSTOR ' - 'instance without session: {}'.format(e) + logger=util.SMlog, + attempt_count=attempt_count ) - return - self._master_uri = 'linstor://{}'.format( - util.get_master_rec(self.session)['address'] - ) + controller_uri = get_controller_uri() + if controller_uri: + create_linstor(controller_uri) + else: + def connect(): + # We must have a valid LINSTOR instance here without using + # the XAPI. Fallback with the HA config file. + for ip in get_ips_from_xha_config_file()[1].values(): + controller_uri = 'linstor://' + ip + try: + util.SMlog('Connecting from config to LINSTOR controller using: {}'.format(ip)) + create_linstor(controller_uri, attempt_count=0) + return controller_uri + except: + pass + + controller_uri = util.retry(connect, maxretry=30, period=1) + if not controller_uri: + raise xs_errors.XenError( + 'SRUnavailable', + opterr='No valid controller URI to attach/detach from config' + ) + + self._journaler = LinstorJournaler( + controller_uri, self._group_name, logger=util.SMlog + ) + + if self.srcmd.cmd is None: + # Only useful on on-slave plugin (is_open). + self._vhdutil = LinstorVhdUtil(self.session, self._linstor) + + return wrapped_method(self, *args, **kwargs) if not self._is_master: if self.cmd in [ @@ -374,37 +519,12 @@ def wrap(self, *args, **kwargs): # behaviors if the GC is executed during an action on a slave. 
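# Illustrative sketch (not part of the patch): the controller-discovery fallback used
# in load() above when no XAPI session is available. Candidate IPs come from the XHA
# config file and each one is probed until a LINSTOR controller answers.
# probe_controller() is a hypothetical stand-in for building a LinstorVolumeManager
# with attempt_count=0; in the driver, util.retry re-runs the whole sweep for ~30s.
import time

def probe_controller(uri):
    # Stand-in: the real code raises if no controller is reachable at this URI.
    raise ConnectionError('no controller at {}'.format(uri))

def find_controller(candidate_ips, sweeps=30, period=1):
    for _ in range(sweeps):
        for ip in candidate_ips:
            uri = 'linstor://' + ip
            try:
                probe_controller(uri)
                return uri
            except Exception:
                continue
        time.sleep(period)
    raise RuntimeError('No valid controller URI to attach/detach from config')

# Example: find_controller(['192.0.2.11', '192.0.2.12'])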
if self.cmd.startswith('vdi_'): self._shared_lock_vdi(self.srcmd.params['vdi_uuid']) - self._vdi_shared_locked = True + self._vdi_shared_time = time.time() - self._journaler = LinstorJournaler( - self._master_uri, self._group_name, logger=util.SMlog - ) - - # Ensure ports are opened and LINSTOR controller/satellite - # are activated. - if self.srcmd.cmd == 'sr_create': - # TODO: Disable if necessary - self._enable_linstor_on_all_hosts(status=True) - - try: - # Try to open SR if exists. - self._linstor = LinstorVolumeManager( - self._master_uri, - self._group_name, - repair=self._is_master, - logger=util.SMlog - ) - self._vhdutil = LinstorVhdUtil(self.session, self._linstor) - except Exception as e: - if self.srcmd.cmd == 'sr_create' or \ - self.srcmd.cmd == 'sr_detach': - # Ignore exception in this specific case: sr_create. - # At this moment the LinstorVolumeManager cannot be - # instantiated. Concerning the sr_detach command, we must - # ignore LINSTOR exceptions (if the volume group doesn't - # exist for example after a bad user action). - pass - else: + if self.srcmd.cmd != 'sr_create' and self.srcmd.cmd != 'sr_detach': + try: + self._reconnect() + except Exception as e: raise xs_errors.XenError('SRUnavailable', opterr=str(e)) if self._linstor: @@ -416,41 +536,87 @@ def wrap(self, *args, **kwargs): if hosts: util.SMlog('Failed to join node(s): {}'.format(hosts)) + # Ensure we use a non-locked volume when vhdutil is called. + if ( + self._is_master and self.cmd.startswith('vdi_') and + self.cmd != 'vdi_create' + ): + self._linstor.ensure_volume_is_not_locked( + self.srcmd.params['vdi_uuid'] + ) + try: - # If the command is a SR command on the master, we must - # load all VDIs and clean journal transactions. - # We must load the VDIs in the snapshot case too. + # If the command is a SR scan command on the master, + # we must load all VDIs and clean journal transactions. + # We must load the VDIs in the snapshot case too only if + # there is at least one entry in the journal. + # + # If the command is a SR command we want at least to remove + # resourceless volumes. if self._is_master and self.cmd not in [ 'vdi_attach', 'vdi_detach', 'vdi_activate', 'vdi_deactivate', 'vdi_epoch_begin', 'vdi_epoch_end', 'vdi_update', 'vdi_destroy' ]: - self._load_vdis() - self._undo_all_journal_transactions() + load_vdis = ( + self.cmd == 'sr_scan' or + self.cmd == 'sr_attach' + ) or len( + self._journaler.get_all(LinstorJournaler.INFLATE) + ) or len( + self._journaler.get_all(LinstorJournaler.CLONE) + ) + + if load_vdis: + self._load_vdis() + self._linstor.remove_resourceless_volumes() self._synchronize_metadata() except Exception as e: + if self.cmd == 'sr_scan' or self.cmd == 'sr_attach': + # Always raise, we don't want to remove VDIs + # from the XAPI database otherwise. 
+ raise e util.SMlog( 'Ignoring exception in LinstorSR.load: {}'.format(e) ) util.SMlog(traceback.format_exc()) - return method(self, *args, **kwargs) + return wrapped_method(self, *args, **kwargs) + + @functools.wraps(wrapped_method) + def wrap(self, *args, **kwargs): + if self._init_status in \ + (self.INIT_STATUS_OK, self.INIT_STATUS_IN_PROGRESS): + return wrapped_method(self, *args, **kwargs) + if self._init_status == self.INIT_STATUS_FAIL: + util.SMlog( + 'Can\'t call method {} because initialization failed' + .format(method) + ) + else: + try: + self._init_status = self.INIT_STATUS_IN_PROGRESS + return load(self, *args, **kwargs) + except Exception: + if self._init_status != self.INIT_STATUS_OK: + self._init_status = self.INIT_STATUS_FAIL + raise return wrap - @_locked_load def cleanup(self): - if self._vdi_shared_locked: + if self._vdi_shared_time: self._shared_lock_vdi(self.srcmd.params['vdi_uuid'], locked=False) @_locked_load def create(self, uuid, size): util.SMlog('LinstorSR.create for {}'.format(self.uuid)) - if self._redundancy > len(self._hosts): + host_adresses = util.get_host_addresses(self.session) + if self._redundancy > len(host_adresses): raise xs_errors.XenError( 'LinstorSRCreate', opterr='Redundancy greater than host count' @@ -472,15 +638,39 @@ def create(self, uuid, size): opterr='group name must be unique' ) + if srs: + raise xs_errors.XenError( + 'LinstorSRCreate', + opterr='LINSTOR SR must be unique in a pool' + ) + + online_hosts = util.get_online_hosts(self.session) + if len(online_hosts) < len(host_adresses): + raise xs_errors.XenError( + 'LinstorSRCreate', + opterr='Not enough online hosts' + ) + + ips = {} + for host_ref in online_hosts: + record = self.session.xenapi.host.get_record(host_ref) + hostname = record['hostname'] + ips[hostname] = record['address'] + + # Ensure ports are opened and LINSTOR satellites + # are activated. In the same time the drbd-reactor instances + # must be stopped. + self._prepare_sr_on_all_hosts(self._group_name, enabled=True) + # Create SR. # Throw if the SR already exists. try: self._linstor = LinstorVolumeManager.create_sr( - self._master_uri, self._group_name, - self._hosts, + ips, self._redundancy, thin_provisioning=self._provisioning == 'thin', + auto_quorum=self._monitor_db_quorum, logger=util.SMlog ) self._vhdutil = LinstorVhdUtil(self.session, self._linstor) @@ -488,30 +678,83 @@ def create(self, uuid, size): util.SMlog('Failed to create LINSTOR SR: {}'.format(e)) raise xs_errors.XenError('LinstorSRCreate', opterr=str(e)) + try: + util.SMlog( + "Finishing SR creation, enable drbd-reactor on all hosts..." + ) + self._update_drbd_reactor_on_all_hosts(enabled=True) + except Exception as e: + try: + self._linstor.destroy() + except Exception as e2: + util.SMlog( + 'Failed to destroy LINSTOR SR after creation fail: {}' + .format(e2) + ) + raise e + @_locked_load def delete(self, uuid): util.SMlog('LinstorSR.delete for {}'.format(self.uuid)) cleanup.gc_force(self.session, self.uuid) - if self.vdis: + if self.vdis or self._linstor._volumes: raise xs_errors.XenError('SRNotEmpty') - try: - # TODO: Use specific exceptions. If the LINSTOR group doesn't - # exist, we can remove it without problem. + node_name = get_controller_node_name() + if not node_name: + raise xs_errors.XenError( + 'LinstorSRDelete', + opterr='Cannot get controller node name' + ) - # TODO: Maybe remove all volumes unused by the SMAPI. - # We must ensure it's a safe idea... 
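# Illustrative sketch (not part of the patch): the _init_status guard that the
# reworked _locked_load decorator above implements, reduced to a standalone
# decorator. A failed initialization is remembered, so later commands on the same
# SR instance are skipped instead of re-running the heavy load path.
import functools

NOT_SET, IN_PROGRESS, OK, FAIL = range(4)

def locked_load(method):
    @functools.wraps(method)
    def wrap(self, *args, **kwargs):
        if self._init_status in (OK, IN_PROGRESS):
            return method(self, *args, **kwargs)
        if self._init_status == FAIL:
            print('skipping {}: initialization failed earlier'.format(method.__name__))
            return None
        try:
            self._init_status = IN_PROGRESS
            self._load()  # one-time heavy setup (connect to LINSTOR, load VDIs, ...)
            self._init_status = OK
            return method(self, *args, **kwargs)
        except Exception:
            if self._init_status != OK:
                self._init_status = FAIL
            raise
    return wrap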
+ host = None + if node_name == 'localhost': + host = util.get_this_host_ref(self.session) + else: + for slave in util.get_all_slaves(self.session): + r_name = self.session.xenapi.host.get_record(slave)['hostname'] + if r_name == node_name: + host = slave + break + + if not host: + raise xs_errors.XenError( + 'LinstorSRDelete', + opterr='Failed to find host with hostname: {}'.format( + node_name + ) + ) - self._linstor.destroy() - Lock.cleanupAll(self.uuid) + try: + self._update_drbd_reactor_on_all_hosts( + controller_node_name=node_name, enabled=False + ) + + args = { + 'groupName': self._group_name, + } + self._exec_manager_command( + host, 'destroy', args, 'LinstorSRDelete' + ) except Exception as e: + try: + self._update_drbd_reactor_on_all_hosts( + controller_node_name=node_name, enabled=True + ) + except Exception as e2: + util.SMlog( + 'Failed to restart drbd-reactor after destroy fail: {}' + .format(e2) + ) util.SMlog('Failed to delete LINSTOR SR: {}'.format(e)) raise xs_errors.XenError( 'LinstorSRDelete', opterr=str(e) ) + Lock.cleanupAll(self.uuid) + @_locked_load def update(self, uuid): util.SMlog('LinstorSR.update for {}'.format(self.uuid)) @@ -558,6 +801,9 @@ def probe(self): @_locked_load def scan(self, uuid): + if self._init_status == self.INIT_STATUS_FAIL: + return + util.SMlog('LinstorSR.scan for {}'.format(self.uuid)) if not self._linstor: raise xs_errors.XenError( @@ -565,6 +811,9 @@ def scan(self, uuid): opterr='no such volume group: {}'.format(self._group_name) ) + # Note: `scan` can be called outside this module, so ensure the VDIs + # are loaded. + self._load_vdis() self._update_physical_size() for vdi_uuid in self.vdis.keys(): @@ -588,10 +837,9 @@ def vdi(self, uuid): # -------------------------------------------------------------------------- def _shared_lock_vdi(self, vdi_uuid, locked=True): - pools = self.session.xenapi.pool.get_all() - master = self.session.xenapi.pool.get_master(pools[0]) + master = util.get_master_ref(self.session) - method = 'lockVdi' + command = 'lockVdi' args = { 'groupName': self._group_name, 'srUuid': self.uuid, @@ -599,48 +847,128 @@ def _shared_lock_vdi(self, vdi_uuid, locked=True): 'locked': str(locked) } - ret = self.session.xenapi.host.call_plugin( - master, self.MANAGER_PLUGIN, method, args - ) - util.SMlog( - 'call-plugin ({} with {}) returned: {}' - .format(method, args, ret) - ) - if ret == 'False': - raise xs_errors.XenError( - 'VDIUnavailable', - opterr='Plugin {} failed'.format(self.MANAGER_PLUGIN) - ) + # Note: We must avoid to unlock the volume if the timeout is reached + # because during volume unlock, the SR lock is not used. Otherwise + # we could destroy a valid lock acquired from another host... + # + # This code is not very clean, the ideal solution would be to acquire + # the SR lock during volume unlock (like lock) but it's not easy + # to implement without impacting performance. + if not locked: + elapsed_time = time.time() - self._vdi_shared_time + timeout = LinstorVolumeManager.LOCKED_EXPIRATION_DELAY * 0.7 + if elapsed_time >= timeout: + util.SMlog( + 'Avoid unlock call of {} because timeout has been reached' + .format(vdi_uuid) + ) + return + + self._exec_manager_command(master, command, args, 'VDIUnavailable') # -------------------------------------------------------------------------- # Network. 
# -------------------------------------------------------------------------- - def _enable_linstor(self, host, status): - method = 'enable' - args = {'enabled': str(bool(status))} + def _exec_manager_command(self, host_ref, command, args, error): + host_rec = self.session.xenapi.host.get_record(host_ref) + host_uuid = host_rec['uuid'] + + try: + ret = self.session.xenapi.host.call_plugin( + host_ref, self.MANAGER_PLUGIN, command, args + ) + except Exception as e: + util.SMlog( + 'call-plugin on {} ({}:{} with {}) raised'.format( + host_uuid, self.MANAGER_PLUGIN, command, args + ) + ) + raise e - ret = self.session.xenapi.host.call_plugin( - host, self.MANAGER_PLUGIN, method, args - ) util.SMlog( - 'call-plugin ({} with {}) returned: {}'.format(method, args, ret) + 'call-plugin on {} ({}:{} with {}) returned: {}'.format( + host_uuid, self.MANAGER_PLUGIN, command, args, ret + ) ) if ret == 'False': raise xs_errors.XenError( - 'SRUnavailable', + error, opterr='Plugin {} failed'.format(self.MANAGER_PLUGIN) ) - def _enable_linstor_on_master(self, status): - pools = self.session.xenapi.pool.get_all() - master = self.session.xenapi.pool.get_master(pools[0]) - self._enable_linstor(master, status) + def _prepare_sr(self, host, group_name, enabled): + self._exec_manager_command( + host, + 'prepareSr' if enabled else 'releaseSr', + {'groupName': group_name}, + 'SRUnavailable' + ) + + def _prepare_sr_on_all_hosts(self, group_name, enabled): + master = util.get_master_ref(self.session) + self._prepare_sr(master, group_name, enabled) - def _enable_linstor_on_all_hosts(self, status): - self._enable_linstor_on_master(status) for slave in util.get_all_slaves(self.session): - self._enable_linstor(slave, status) + self._prepare_sr(slave, group_name, enabled) + + def _update_drbd_reactor(self, host, enabled): + self._exec_manager_command( + host, + 'updateDrbdReactor', + {'enabled': str(enabled)}, + 'SRUnavailable' + ) + + def _update_drbd_reactor_on_all_hosts( + self, enabled, controller_node_name=None + ): + if controller_node_name == 'localhost': + controller_node_name = self.session.xenapi.host.get_record( + util.get_this_host_ref(self.session) + )['hostname'] + assert controller_node_name + assert controller_node_name != 'localhost' + + controller_host = None + secondary_hosts = [] + + hosts = self.session.xenapi.host.get_all_records() + for host_ref, host_rec in hosts.iteritems(): + hostname = host_rec['hostname'] + if controller_node_name == hostname: + controller_host = host_ref + else: + secondary_hosts.append((host_ref, hostname)) + + action_name = 'Starting' if enabled else 'Stopping' + if controller_node_name and not controller_host: + util.SMlog('Failed to find controller host: `{}`'.format( + controller_node_name + )) + + if enabled and controller_host: + util.SMlog('{} drbd-reactor on controller host `{}`...'.format( + action_name, controller_node_name + )) + # If enabled is true, we try to start the controller on the desired + # node name first. + self._update_drbd_reactor(controller_host, enabled) + + for host_ref, hostname in secondary_hosts: + util.SMlog('{} drbd-reactor on host {}...'.format( + action_name, hostname + )) + self._update_drbd_reactor(host_ref, enabled) + + if not enabled and controller_host: + util.SMlog('{} drbd-reactor on controller host `{}`...'.format( + action_name, controller_node_name + )) + # If enabled is false, we disable the drbd-reactor service of + # the controller host last. Why? 
Otherwise the linstor-controller + # of other nodes can be started, and we don't want that. + self._update_drbd_reactor(controller_host, enabled) # -------------------------------------------------------------------------- # Metadata. @@ -653,7 +981,7 @@ def _synchronize_metadata_and_xapi(self): # Now update the VDI information in the metadata if required. xenapi = self.session.xenapi - volumes_metadata = self._linstor.volumes_with_metadata + volumes_metadata = self._linstor.get_volumes_with_metadata() for vdi_uuid, volume_metadata in volumes_metadata.items(): try: vdi_ref = xenapi.VDI.get_by_uuid(vdi_uuid) @@ -708,36 +1036,43 @@ def _update_stats(self, virt_alloc_delta): # Update size attributes of the SR parent class. self.virtual_allocation = valloc + virt_alloc_delta - # Physical size contains the total physical size. - # i.e. the sum of the sizes of all devices on all hosts, not the AVG. self._update_physical_size() # Notify SR parent class. self._db_update() def _update_physical_size(self): - # Physical size contains the total physical size. - # i.e. the sum of the sizes of all devices on all hosts, not the AVG. - self.physical_size = self._linstor.physical_size + # We use the size of the smallest disk, this is an approximation that + # ensures the displayed physical size is reachable by the user. + (min_physical_size, pool_count) = self._linstor.get_min_physical_size() + self.physical_size = min_physical_size * pool_count / \ + self._linstor.redundancy - # `self._linstor.physical_free_size` contains the total physical free - # memory. If Thin provisioning is used we can't use it, we must use - # LINSTOR volume size to gives a good idea of the required - # usable memory to the users. - self.physical_utilisation = self._linstor.total_allocated_volume_size - - # If Thick provisioning is used, we can use this line instead: - # self.physical_utilisation = \ - # self.physical_size - self._linstor.physical_free_size + self.physical_utilisation = self._linstor.allocated_volume_size # -------------------------------------------------------------------------- # VDIs. # -------------------------------------------------------------------------- def _load_vdis(self): - if self.vdis: + if self._vdis_loaded: return + assert self._is_master + + # We use a cache to avoid repeated JSON parsing. + # The performance gain is not big but we can still + # enjoy it with a few lines. + self._create_linstor_cache() + self._load_vdis_ex() + self._destroy_linstor_cache() + + # We must mark VDIs as loaded only if the load is a success. + self._vdis_loaded = True + + self._undo_all_journal_transactions() + + def _load_vdis_ex(self): # 1. Get existing VDIs in XAPI. xenapi = self.session.xenapi xapi_vdi_uuids = set() @@ -745,8 +1080,8 @@ def _load_vdis(self): xapi_vdi_uuids.add(xenapi.VDI.get_uuid(vdi)) # 2. Get volumes info. - all_volume_info = self._linstor.volumes_with_info - volumes_metadata = self._linstor.volumes_with_metadata + all_volume_info = self._all_volume_info_cache + volumes_metadata = self._all_volume_metadata_cache # 3. Get CBT vdis. # See: https://support.citrix.com/article/CTX230619 @@ -758,7 +1093,8 @@ def _load_vdis(self): introduce = False - if self.cmd == 'sr_scan': + # Try to introduce VDIs only during scan/attach. 
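# Illustrative sketch (not part of the patch): numeric example of the physical-size
# estimate introduced in _update_physical_size() above. With three storage pools of
# 500/600/800 GiB and redundancy 2, the SR reports a conservative capacity based on
# the smallest pool: smallest_pool_size * pool_count / redundancy.
GIB = 1024 ** 3
pool_sizes = [500 * GIB, 600 * GIB, 800 * GIB]  # hypothetical per-host pools
redundancy = 2

min_physical_size = min(pool_sizes)
physical_size = min_physical_size * len(pool_sizes) // redundancy
assert physical_size == 750 * GIB  # an approximation that stays reachable in practice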
+ if self.cmd == 'sr_scan' or self.cmd == 'sr_attach': has_clone_entries = list(self._journaler.get_all( LinstorJournaler.CLONE ).items()) @@ -782,6 +1118,9 @@ def _load_vdis(self): if not introduce: continue + if vdi_uuid.startswith('DELETED_'): + continue + volume_metadata = volumes_metadata.get(vdi_uuid) if not volume_metadata: util.SMlog( @@ -836,10 +1175,10 @@ def _load_vdis(self): util.SMlog( 'Introducing VDI {} '.format(vdi_uuid) + - ' (name={}, virtual_size={}, physical_size={})'.format( + ' (name={}, virtual_size={}, allocated_size={})'.format( name_label, volume_info.virtual_size, - volume_info.physical_size + volume_info.allocated_size ) ) @@ -857,7 +1196,7 @@ def _load_vdis(self): sm_config, managed, str(volume_info.virtual_size), - str(volume_info.physical_size) + str(volume_info.allocated_size) ) is_a_snapshot = volume_metadata.get(IS_A_SNAPSHOT_TAG) @@ -881,9 +1220,11 @@ def _load_vdis(self): vdi = self.vdi(vdi_uuid) self.vdis[vdi_uuid] = vdi - if vdi.vdi_type == vhdutil.VDI_TYPE_VHD: + if USE_KEY_HASH and vdi.vdi_type == vhdutil.VDI_TYPE_VHD: + # TODO: Replace pylint comment with this feature when possible: + # https://github.com/PyCQA/pylint/pull/2926 vdi.sm_config_override['key_hash'] = \ - self._vhdutil.get_key_hash(vdi_uuid) + self._vhdutil.get_key_hash(vdi_uuid) # pylint: disable = E1120 # 4.c. Update CBT status of disks either just added # or already in XAPI. @@ -940,7 +1281,7 @@ def _load_vdis(self): else: geneology[vdi.parent] = [vdi_uuid] if not vdi.hidden: - self.virtual_allocation += vdi.utilisation + self.virtual_allocation += vdi.size # 9. Remove all hidden leaf nodes to avoid introducing records that # will be GC'ed. @@ -1014,13 +1355,18 @@ def _handle_interrupted_inflate(self, vdi_uuid, old_size): util.SMlog('Cannot deflate missing VDI {}'.format(vdi_uuid)) return - current_size = self._linstor.get_volume_info(self.uuid).virtual_size + assert not self._all_volume_info_cache + volume_info = self._linstor.get_volume_info(vdi_uuid) + + current_size = volume_info.virtual_size + assert current_size > 0 + util.zeroOut( vdi.path, current_size - vhdutil.VHD_FOOTER_SIZE, vhdutil.VHD_FOOTER_SIZE ) - deflate(vdi_uuid, vdi.path, old_size, current_size) + deflate(self._linstor, vdi_uuid, vdi.path, old_size, current_size) def _handle_interrupted_clone( self, vdi_uuid, clone_info, force_undo=False @@ -1033,7 +1379,7 @@ def _handle_interrupted_clone( base_uuid, snap_uuid = clone_info.split('_') # Use LINSTOR data because new VDIs may not be in the XAPI. - volume_names = self._linstor.volumes_with_name + volume_names = self._linstor.get_volumes_with_name() # Check if we don't have a base VDI. (If clone failed at startup.) if base_uuid not in volume_names: @@ -1089,7 +1435,7 @@ def _undo_clone(self, volume_names, vdi_uuid, base_uuid, snap_uuid): if base_type == vhdutil.VDI_TYPE_VHD: vhd_info = self._vhdutil.get_vhd_info(base_uuid, False) if vhd_info.hidden: - vhdutil.setHidden(base_path, False) + self._vhdutil.set_hidden(base_path, False) elif base_type == vhdutil.VDI_TYPE_RAW and \ base_metadata.get(HIDDEN_TAG): self._linstor.update_volume_metadata( @@ -1099,10 +1445,6 @@ def _undo_clone(self, volume_names, vdi_uuid, base_uuid, snap_uuid): # Remove the child nodes. 
if snap_uuid and snap_uuid in volume_names: util.SMlog('Destroying snap {}...'.format(snap_uuid)) - snap_metadata = self._linstor.get_volume_metadata(snap_uuid) - - if snap_metadata.get(VDI_TYPE_TAG) != vhdutil.VDI_TYPE_VHD: - raise util.SMException('Clone {} not VHD'.format(snap_uuid)) try: self._linstor.destroy_volume(snap_uuid) @@ -1150,10 +1492,64 @@ def _undo_clone(self, volume_names, vdi_uuid, base_uuid, snap_uuid): util.SMlog('*** INTERRUPTED CLONE OP: rollback success') + # -------------------------------------------------------------------------- + # Cache. + # -------------------------------------------------------------------------- + + def _create_linstor_cache(self): + # TODO: use a nonlocal with python3. + class context: + reconnect = False + + def create_cache(): + try: + if context.reconnect: + self._reconnect() + return self._linstor.get_volumes_with_info() + except Exception as e: + context.reconnect = True + raise e + + self._all_volume_metadata_cache = \ + self._linstor.get_volumes_with_metadata() + self._all_volume_info_cache = util.retry( + create_cache, + maxretry=10, + period=3 + ) + + def _destroy_linstor_cache(self): + self._all_volume_info_cache = None + self._all_volume_metadata_cache = None + # -------------------------------------------------------------------------- # Misc. # -------------------------------------------------------------------------- + def _reconnect(self): + controller_uri = get_controller_uri() + + self._journaler = LinstorJournaler( + controller_uri, self._group_name, logger=util.SMlog + ) + + # Try to open SR if exists. + # We can repair only if we are on the master AND if + # we are trying to execute an exclusive operation. + # Otherwise we could try to delete a VDI being created or + # during a snapshot. An exclusive op is the guarantee that + # the SR is locked. + self._linstor = LinstorVolumeManager( + controller_uri, + self._group_name, + repair=( + self._is_master and + self.srcmd.cmd in self.ops_exclusive + ), + logger=util.SMlog + ) + self._vhdutil = LinstorVhdUtil(self.session, self._linstor) + def _ensure_space_available(self, amount_needed): space_available = self._linstor.max_volume_size_allowed if (space_available < amount_needed): @@ -1233,7 +1629,7 @@ def raise_bad_load(e): if ( self.sr.srcmd.cmd == 'vdi_attach_from_config' or self.sr.srcmd.cmd == 'vdi_detach_from_config' - ) and self.sr.srcmd.params['vdi_uuid'] == self.uuid: + ): self.vdi_type = vhdutil.VDI_TYPE_RAW self.path = self.sr.srcmd.params['vdi_path'] else: @@ -1297,11 +1693,11 @@ def create(self, sr_uuid, vdi_uuid, size): # 2. Compute size and check space available. size = vhdutil.validate_and_round_vhd_size(int(size)) - util.SMlog('LinstorVDI.create: type={}, size={}'.format( - self.vdi_type, size - )) - volume_size = compute_volume_size(size, self.vdi_type) + util.SMlog( + 'LinstorVDI.create: type={}, vhd-size={}, volume-size={}' + .format(self.vdi_type, size, volume_size) + ) self.sr._ensure_space_available(volume_size) # 3. Set sm_config attribute of VDI parent class. @@ -1310,8 +1706,15 @@ def create(self, sr_uuid, vdi_uuid, size): # 4. Create! 
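# Illustrative sketch (not part of the patch): the Python 3 'nonlocal' form hinted at
# by the TODO in _create_linstor_cache() above. util.retry, self._reconnect and
# self._linstor are names taken from the patch; this is only a possible shape for the
# helper once the class-based workaround is no longer needed.
def _create_linstor_cache(self):
    reconnect = False

    def create_cache():
        nonlocal reconnect
        try:
            if reconnect:
                self._reconnect()
            return self._linstor.get_volumes_with_info()
        except Exception:
            reconnect = True
            raise

    self._all_volume_metadata_cache = self._linstor.get_volumes_with_metadata()
    self._all_volume_info_cache = util.retry(create_cache, maxretry=10, period=3)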
failed = False try: + volume_name = None + if self.ty == 'ha_statefile': + volume_name = 'xcp-persistent-ha-statefile' + elif self.ty == 'redo_log': + volume_name = 'xcp-persistent-redo-log' + self._linstor.create_volume( - self.uuid, volume_size, persistent=False + self.uuid, volume_size, persistent=False, + volume_name=volume_name ) volume_info = self._linstor.get_volume_info(self.uuid) @@ -1320,16 +1723,16 @@ def create(self, sr_uuid, vdi_uuid, size): if self.vdi_type == vhdutil.VDI_TYPE_RAW: self.size = volume_info.virtual_size else: - vhdutil.create( + self.sr._vhdutil.create( self.path, size, False, self.MAX_METADATA_VIRT_SIZE ) self.size = self.sr._vhdutil.get_size_virt(self.uuid) if self._key_hash: - vhdutil.setKey(self.path, self._key_hash) + self.sr._vhdutil.set_key(self.path, self._key_hash) # Because vhdutil commands modify the volume data, - # we must retrieve a new time the utilisation size. + # we must retrieve a new time the utilization size. volume_info = self._linstor.get_volume_info(self.uuid) volume_metadata = { @@ -1344,6 +1747,13 @@ def create(self, sr_uuid, vdi_uuid, size): METADATA_OF_POOL_TAG: '' } self._linstor.set_volume_metadata(self.uuid, volume_metadata) + + # Set the open timeout to 1min to reduce CPU usage + # in http-disk-server when a secondary server tries to open + # an already opened volume. + if self.ty == 'ha_statefile' or self.ty == 'redo_log': + self._linstor.set_auto_promote_timeout(self.uuid, 600) + self._linstor.mark_volume_as_persistent(self.uuid) except util.CommandException as e: failed = True @@ -1364,11 +1774,11 @@ def create(self, sr_uuid, vdi_uuid, size): '{}'.format(e) ) - self.utilisation = volume_info.physical_size + self.utilisation = volume_info.allocated_size self.sm_config['vdi_type'] = self.vdi_type self.ref = self._db_introduce() - self.sr._update_stats(volume_info.virtual_size) + self.sr._update_stats(self.size) return VDI.VDI.get_params(self) @@ -1407,14 +1817,15 @@ def delete(self, sr_uuid, vdi_uuid, data_only=False): del self.sr.vdis[self.uuid] # TODO: Check size after delete. - self.sr._update_stats(-self.capacity) + self.sr._update_stats(-self.size) self.sr._kick_gc() return super(LinstorVDI, self).delete(sr_uuid, vdi_uuid, data_only) def attach(self, sr_uuid, vdi_uuid): util.SMlog('LinstorVDI.attach for {}'.format(self.uuid)) + attach_from_config = self.sr.srcmd.cmd == 'vdi_attach_from_config' if ( - self.sr.srcmd.cmd != 'vdi_attach_from_config' or + not attach_from_config or self.sr.srcmd.params['vdi_uuid'] != self.uuid ) and self.sr._journaler.has_entries(self.uuid): raise xs_errors.XenError( @@ -1423,50 +1834,62 @@ def attach(self, sr_uuid, vdi_uuid): 'scan SR first to trigger auto-repair' ) - writable = 'args' not in self.sr.srcmd.params or \ - self.sr.srcmd.params['args'][0] == 'true' + if not attach_from_config or self.sr._is_master: + writable = 'args' not in self.sr.srcmd.params or \ + self.sr.srcmd.params['args'][0] == 'true' - # We need to inflate the volume if we don't have enough place - # to mount the VHD image. I.e. the volume capacity must be greater - # than the VHD size + bitmap size. 
- need_inflate = True - if self.vdi_type == vhdutil.VDI_TYPE_RAW or not writable or \ - self.capacity >= compute_volume_size(self.size, self.vdi_type): - need_inflate = False - - if need_inflate: - try: - self._prepare_thin(True) - except Exception as e: - raise xs_errors.XenError( - 'VDIUnavailable', - opterr='Failed to attach VDI during "prepare thin": {}' - .format(e) - ) + # We need to inflate the volume if we don't have enough place + # to mount the VHD image. I.e. the volume capacity must be greater + # than the VHD size + bitmap size. + need_inflate = True + if ( + self.vdi_type == vhdutil.VDI_TYPE_RAW or + not writable or + self.capacity >= compute_volume_size(self.size, self.vdi_type) + ): + need_inflate = False - if not util.pathexists(self.path): - raise xs_errors.XenError( - 'VDIUnavailable', opterr='Could not find: {}'.format(self.path) - ) + if need_inflate: + try: + self._prepare_thin(True) + except Exception as e: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Failed to attach VDI during "prepare thin": {}' + .format(e) + ) if not hasattr(self, 'xenstore_data'): self.xenstore_data = {} - - # TODO: Is it useful? - self.xenstore_data.update(scsiutil.update_XS_SCSIdata( - self.uuid, scsiutil.gen_synthetic_page_data(self.uuid) - )) - self.xenstore_data['storage-type'] = LinstorSR.DRIVER_TYPE - self.attached = True + if ( + USE_HTTP_NBD_SERVERS and + attach_from_config and + self.path.startswith('/dev/http-nbd/') + ): + return self._attach_using_http_nbd() + + # Ensure we have a path... + while vdi_uuid: + path = self._linstor.get_device_path(vdi_uuid) + if not util.pathexists(path): + raise xs_errors.XenError( + 'VDIUnavailable', opterr='Could not find: {}'.format(path) + ) + vdi_uuid = self.sr._vhdutil.get_vhd_info(vdi_uuid).parentUuid + self.attached = True return VDI.VDI.attach(self, self.sr.uuid, self.uuid) def detach(self, sr_uuid, vdi_uuid): util.SMlog('LinstorVDI.detach for {}'.format(self.uuid)) + detach_from_config = self.sr.srcmd.cmd == 'vdi_detach_from_config' self.attached = False + if detach_from_config and self.path.startswith('/dev/http-nbd/'): + return self._detach_using_http_nbd() + if self.vdi_type == vhdutil.VDI_TYPE_RAW: return @@ -1503,9 +1926,23 @@ def detach(self, sr_uuid, vdi_uuid): def resize(self, sr_uuid, vdi_uuid, size): util.SMlog('LinstorVDI.resize for {}'.format(self.uuid)) + if not self.sr._is_master: + raise xs_errors.XenError( + 'VDISize', + opterr='resize on slave not allowed' + ) + if self.hidden: raise xs_errors.XenError('VDIUnavailable', opterr='hidden VDI') + # Compute the virtual VHD and DRBD volume size. + size = vhdutil.validate_and_round_vhd_size(int(size)) + volume_size = compute_volume_size(size, self.vdi_type) + util.SMlog( + 'LinstorVDI.resize: type={}, vhd-size={}, volume-size={}' + .format(self.vdi_type, size, volume_size) + ) + if size < self.size: util.SMlog( 'vdi_resize: shrinking not supported: ' @@ -1513,18 +1950,13 @@ def resize(self, sr_uuid, vdi_uuid, size): ) raise xs_errors.XenError('VDISize', opterr='shrinking not allowed') - # Compute the virtual VHD size. - size = vhdutil.validate_and_round_vhd_size(int(size)) - if size == self.size: return VDI.VDI.get_params(self) - # Compute the LINSTOR volume size. - new_volume_size = compute_volume_size(size, self.vdi_type) if self.vdi_type == vhdutil.VDI_TYPE_RAW: old_volume_size = self.size else: - old_volume_size = self.capacity + old_volume_size = self.utilisation if self.sr._provisioning == 'thin': # VDI is currently deflated, so keep it deflated. 
new_volume_size = old_volume_size @@ -1533,7 +1965,7 @@ def resize(self, sr_uuid, vdi_uuid, size): space_needed = new_volume_size - old_volume_size self.sr._ensure_space_available(space_needed) - old_capacity = self.capacity + old_size = self.size if self.vdi_type == vhdutil.VDI_TYPE_RAW: self._linstor.resize(self.uuid, new_volume_size) else: @@ -1542,7 +1974,7 @@ def resize(self, sr_uuid, vdi_uuid, size): self.sr._journaler, self._linstor, self.uuid, self.path, new_volume_size, old_volume_size ) - vhdutil.setSizeVirtFast(self.path, size) + self.sr._vhdutil.set_size_virt_fast(self.path, size) # Reload size attributes. self._load_this() @@ -1552,7 +1984,7 @@ def resize(self, sr_uuid, vdi_uuid, size): self.session.xenapi.VDI.set_physical_utilisation( vdi_ref, str(self.utilisation) ) - self.sr._update_stats(self.capacity - old_capacity) + self.sr._update_stats(self.size - old_size) return VDI.VDI.get_params(self) def clone(self, sr_uuid, vdi_uuid): @@ -1574,8 +2006,8 @@ def compose(self, sr_uuid, vdi1, vdi2): if not blktap2.VDI.tap_pause(self.session, self.sr.uuid, self.uuid): raise util.SMException('Failed to pause VDI {}'.format(self.uuid)) try: - vhdutil.setParent(self.path, parent_path, False) - vhdutil.setHidden(parent_path) + self.sr._vhdutil.set_parent(self.path, parent_path, False) + self.sr._vhdutil.set_hidden(parent_path) self.sr.session.xenapi.VDI.set_managed( self.sr.srcmd.params['args'][0], False ) @@ -1598,25 +2030,40 @@ def generate_config(self, sr_uuid, vdi_uuid): util.SMlog('LinstorVDI.generate_config for {}'.format(self.uuid)) - if not self.path or not util.pathexists(self.path): - available = False - # Try to refresh symlink path... - try: - self.path = self._linstor.get_device_path(vdi_uuid) - available = util.pathexists(self.path) - except Exception: - pass - if not available: - raise xs_errors.XenError('VDIUnavailable') - resp = {} resp['device_config'] = self.sr.dconf resp['sr_uuid'] = sr_uuid resp['vdi_uuid'] = self.uuid resp['sr_sm_config'] = self.sr.sm_config - resp['vdi_path'] = self.path resp['command'] = 'vdi_attach_from_config' + # By default, we generate a normal config. + # But if the disk is persistent, we must use a HTTP/NBD + # server to ensure we can always write or read data. + # Why? DRBD is unsafe when used with more than 4 hosts: + # We are limited to use 1 diskless and 3 full. + # We can't increase this limitation, so we use a NBD/HTTP device + # instead. + volume_name = self._linstor.get_volume_name(self.uuid) + if not USE_HTTP_NBD_SERVERS or volume_name not in [ + 'xcp-persistent-ha-statefile', 'xcp-persistent-redo-log' + ]: + if not self.path or not util.pathexists(self.path): + available = False + # Try to refresh symlink path... + try: + self.path = self._linstor.get_device_path(vdi_uuid) + available = util.pathexists(self.path) + except Exception: + pass + if not available: + raise xs_errors.XenError('VDIUnavailable') + + resp['vdi_path'] = self.path + else: + # Axiom: DRBD device is present on at least one host. 
+ resp['vdi_path'] = '/dev/http-nbd/' + volume_name + config = xmlrpc.client.dumps(tuple([resp]), 'vdi_attach_from_config') return xmlrpc.client.dumps((config,), "", True) @@ -1652,19 +2099,28 @@ def reset_leaf(self, sr_uuid, vdi_uuid): .format(self.uuid) ) - vhdutil.killData(self.path) + self.sr._vhdutil.kill_data(self.path) def _load_this(self): - volume_metadata = self._linstor.get_volume_metadata(self.uuid) - volume_info = self._linstor.get_volume_info(self.uuid) + volume_metadata = None + if self.sr._all_volume_metadata_cache: + volume_metadata = self.sr._all_volume_metadata_cache.get(self.uuid) + if volume_metadata is None: + volume_metadata = self._linstor.get_volume_metadata(self.uuid) + + volume_info = None + if self.sr._all_volume_info_cache: + volume_info = self.sr._all_volume_info_cache.get(self.uuid) + if volume_info is None: + volume_info = self._linstor.get_volume_info(self.uuid) - # Contains the physical size used on all disks. + # Contains the max physical size used on a disk. # When LINSTOR LVM driver is used, the size should be similar to # virtual size (i.e. the LINSTOR max volume size). # When LINSTOR Thin LVM driver is used, the used physical size should # be lower than virtual size at creation. # The physical size increases after each write in a new block. - self.utilisation = volume_info.physical_size + self.utilisation = volume_info.allocated_size self.capacity = volume_info.virtual_size if self.vdi_type == vhdutil.VDI_TYPE_RAW: @@ -1691,7 +2147,7 @@ def _mark_hidden(self, hidden=True): return if self.vdi_type == vhdutil.VDI_TYPE_VHD: - vhdutil.setHidden(self.path, hidden) + self.sr._vhdutil.set_hidden(self.path, hidden) else: self._linstor.update_volume_metadata(self.uuid, { HIDDEN_TAG: hidden @@ -1739,25 +2195,14 @@ def _prepare_thin(self, attach): else: fn = 'attach' if attach else 'detach' - # We assume the first pool is always the one currently in use. - pools = self.session.xenapi.pool.get_all() - master = self.session.xenapi.pool.get_master(pools[0]) + master = util.get_master_ref(self.session) + args = { 'groupName': self.sr._group_name, 'srUuid': self.sr.uuid, 'vdiUuid': self.uuid } - ret = self.session.xenapi.host.call_plugin( - master, self.sr.MANAGER_PLUGIN, fn, args - ) - util.SMlog( - 'call-plugin ({} with {}) returned: {}'.format(fn, args, ret) - ) - if ret == 'False': - raise xs_errors.XenError( - 'VDIUnavailable', - opterr='Plugin {} failed'.format(self.sr.MANAGER_PLUGIN) - ) + self.sr._exec_manager_command(master, fn, args, 'VDIUnavailable') # Reload size attrs after inflate or deflate! self._load_this() @@ -1807,9 +2252,7 @@ def _determine_type_and_path(self): 'VDIUnavailable', opterr='failed to get vdi_type in metadata' ) - self._update_device_name( - self._linstor.get_volume_name(self.uuid) - ) + self._update_device_name(self._linstor.get_volume_name(self.uuid)) def _update_device_name(self, device_name): self._device_name = device_name @@ -1832,7 +2275,7 @@ def _create_snapshot(self, snap_uuid, snap_of_uuid=None): # 2. Write the snapshot content. is_raw = (self.vdi_type == vhdutil.VDI_TYPE_RAW) - vhdutil.snapshot( + self.sr._vhdutil.snapshot( snap_path, self.path, is_raw, self.MAX_METADATA_VIRT_SIZE ) @@ -1862,7 +2305,7 @@ def _create_snapshot(self, snap_uuid, snap_of_uuid=None): volume_info = self._linstor.get_volume_info(snap_uuid) snap_vdi.size = self.sr._vhdutil.get_size_virt(snap_uuid) - snap_vdi.utilisation = volume_info.physical_size + snap_vdi.utilisation = volume_info.allocated_size # 6. Update sm config. 
snap_vdi.sm_config = {} @@ -1932,6 +2375,9 @@ def _snapshot(self, snap_type, cbtlog=None, cbt_consistency=None): elif depth >= vhdutil.MAX_CHAIN_SIZE: raise xs_errors.XenError('SnapshotChainTooLong') + # Ensure we have a valid path if we don't have a local diskful. + self.sr._linstor.get_device_path(self.uuid) + volume_path = self.path if not util.pathexists(volume_path): raise xs_errors.XenError( @@ -2057,7 +2503,7 @@ def _snapshot(self, snap_type, cbtlog=None, cbt_consistency=None): raise if snap_type != VDI.SNAPSHOT_INTERNAL: - self.sr._update_stats(self.capacity) + self.sr._update_stats(self.size) # 10. Return info on the new user-visible leaf VDI. ret_vdi = snap_vdi @@ -2088,10 +2534,318 @@ def _snapshot(self, snap_type, cbtlog=None, cbt_consistency=None): return ret_vdi.get_params() + @staticmethod + def _start_persistent_http_server(volume_name): + pid_path = None + http_server = None + + try: + if volume_name == 'xcp-persistent-ha-statefile': + port = '8076' + else: + port = '8077' + + try: + # Use a timeout call because XAPI may be unusable on startup + # or if the host has been ejected. So in this case the call can + # block indefinitely. + session = util.timeout_call(5, util.get_localAPI_session) + host_ip = util.get_this_host_address(session) + except: + # Fallback using the XHA file if session not available. + host_ip, _ = get_ips_from_xha_config_file() + if not host_ip: + raise Exception( + 'Cannot start persistent HTTP server: no XAPI session, nor XHA config file' + ) + + arguments = [ + 'http-disk-server', + '--disk', + '/dev/drbd/by-res/{}/0'.format(volume_name), + '--ip', + host_ip, + '--port', + port + ] + + util.SMlog('Starting {} on port {}...'.format(arguments[0], port)) + http_server = subprocess.Popen( + [FORK_LOG_DAEMON] + arguments, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + # Ensure we use another group id to kill this process without + # touch the current one. + preexec_fn=os.setsid + ) + + pid_path = '/run/http-server-{}.pid'.format(volume_name) + with open(pid_path, 'w') as pid_file: + pid_file.write(str(http_server.pid)) + + reg_server_ready = re.compile("Server ready!$") + def is_ready(): + while http_server.poll() is None: + line = http_server.stdout.readline() + if reg_server_ready.search(line): + return True + return False + try: + if not util.timeout_call(10, is_ready): + raise Exception('Failed to wait HTTP server startup, bad output') + except util.TimeoutException: + raise Exception('Failed to wait for HTTP server startup during given delay') + except Exception as e: + if pid_path: + try: + os.remove(pid_path) + except Exception: + pass + + if http_server: + # Kill process and children in this case... + try: + os.killpg(os.getpgid(http_server.pid), signal.SIGTERM) + except: + pass + + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Failed to start http-server: {}'.format(e) + ) + + def _start_persistent_nbd_server(self, volume_name): + pid_path = None + nbd_path = None + nbd_server = None + + try: + # We use a precomputed device size. + # So if the XAPI is modified, we must update these values! 
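# Illustrative sketch (not part of the patch): the "spawn a daemon, then wait for a
# readiness line on its stdout" pattern used by _start_persistent_http_server()
# above. The child command here is a trivial stand-in; the real code wraps
# http-disk-server with fork-log-daemon and uses util.timeout_call for a hard
# timeout, whereas this sketch only checks a deadline between lines.
import re
import subprocess
import time

child = subprocess.Popen(
    ['python3', '-c', 'print("starting"); print("Server ready!")'],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    universal_newlines=True,
)

ready_re = re.compile(r'Server ready!$')
deadline = time.time() + 10
ready = False
while child.poll() is None and time.time() < deadline:
    line = child.stdout.readline()  # note: blocking read; the driver adds a real timeout
    if ready_re.search(line.rstrip('\n')):
        ready = True
        break

if not ready:
    raise RuntimeError('server did not report readiness in time')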
+ if volume_name == 'xcp-persistent-ha-statefile': + # See: https://github.com/xapi-project/xen-api/blob/703479fa448a8d7141954bb6e8964d8e25c4ac2e/ocaml/xapi/xha_statefile.ml#L32-L37 + port = '8076' + device_size = 4 * 1024 * 1024 + else: + # See: https://github.com/xapi-project/xen-api/blob/703479fa448a8d7141954bb6e8964d8e25c4ac2e/ocaml/database/redo_log.ml#L41-L44 + port = '8077' + device_size = 256 * 1024 * 1024 + + try: + session = util.timeout_call(5, util.get_localAPI_session) + ips = util.get_host_addresses(session) + except Exception as e: + _, ips = get_ips_from_xha_config_file() + if not ips: + raise Exception( + 'Cannot start persistent NBD server: no XAPI session, nor XHA config file ({})'.format(e) + ) + ips = ips.values() + + arguments = [ + 'nbd-http-server', + '--socket-path', + '/run/{}.socket'.format(volume_name), + '--nbd-name', + volume_name, + '--urls', + ','.join(map(lambda ip: 'http://' + ip + ':' + port, ips)), + '--device-size', + str(device_size) + ] + + util.SMlog('Starting {} using port {}...'.format(arguments[0], port)) + nbd_server = subprocess.Popen( + [FORK_LOG_DAEMON] + arguments, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + # Ensure we use another group id to kill this process without + # touch the current one. + preexec_fn=os.setsid + ) + + pid_path = '/run/nbd-server-{}.pid'.format(volume_name) + with open(pid_path, 'w') as pid_file: + pid_file.write(str(nbd_server.pid)) + + reg_nbd_path = re.compile("NBD `(/dev/nbd[0-9]+)` is now attached.$") + def get_nbd_path(): + while nbd_server.poll() is None: + line = nbd_server.stdout.readline() + match = reg_nbd_path.search(line) + if match: + return match.group(1) + # Use a timeout to never block the smapi if there is a problem. + try: + nbd_path = util.timeout_call(10, get_nbd_path) + if nbd_path is None: + raise Exception('Empty NBD path (NBD server is probably dead)') + except util.TimeoutException: + raise Exception('Unable to read NBD path') + + util.SMlog('Create symlink: {} -> {}'.format(self.path, nbd_path)) + os.symlink(nbd_path, self.path) + except Exception as e: + if pid_path: + try: + os.remove(pid_path) + except Exception: + pass + + if nbd_path: + try: + os.remove(nbd_path) + except Exception: + pass + + if nbd_server: + # Kill process and children in this case... 
+ try: + os.killpg(os.getpgid(nbd_server.pid), signal.SIGTERM) + except: + pass + + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Failed to start nbd-server: {}'.format(e) + ) + + @classmethod + def _kill_persistent_server(self, type, volume_name, sig): + try: + path = '/run/{}-server-{}.pid'.format(type, volume_name) + if not os.path.exists(path): + return + + pid = None + with open(path, 'r') as pid_file: + try: + pid = int(pid_file.read()) + except Exception: + pass + + if pid is not None and util.check_pid_exists(pid): + util.SMlog('Kill {} server {} (pid={})'.format(type, path, pid)) + try: + os.killpg(os.getpgid(pid), sig) + except Exception as e: + util.SMlog('Failed to kill {} server: {}'.format(type, e)) + + os.remove(path) + except: + pass + + @classmethod + def _kill_persistent_http_server(self, volume_name, sig=signal.SIGTERM): + return self._kill_persistent_server('nbd', volume_name, sig) + + @classmethod + def _kill_persistent_nbd_server(self, volume_name, sig=signal.SIGTERM): + return self._kill_persistent_server('http', volume_name, sig) + + def _check_http_nbd_volume_name(self): + volume_name = self.path[14:] + if volume_name not in [ + 'xcp-persistent-ha-statefile', 'xcp-persistent-redo-log' + ]: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Unsupported path: {}'.format(self.path) + ) + return volume_name + + def _attach_using_http_nbd(self): + volume_name = self._check_http_nbd_volume_name() + + # Ensure there is no NBD and HTTP server running. + self._kill_persistent_nbd_server(volume_name) + self._kill_persistent_http_server(volume_name) + + # 0. Fetch drbd path. + must_get_device_path = True + if not self.sr._is_master: + # We are on a slave, we must try to find a diskful locally. + try: + volume_info = self._linstor.get_volume_info(self.uuid) + except Exception as e: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Cannot get volume info of {}: {}' + .format(self.uuid, e) + ) + + hostname = socket.gethostname() + must_get_device_path = hostname in volume_info.diskful + + drbd_path = None + if must_get_device_path or self.sr._is_master: + # If we are master, we must ensure we have a diskless + # or diskful available to init HA. + # It also avoid this error in xensource.log + # (/usr/libexec/xapi/cluster-stack/xhad/ha_set_pool_state): + # init exited with code 8 [stdout = ''; stderr = 'SF: failed to write in State-File \x10 (fd 4208696). (sys 28)\x0A'] + # init returned MTC_EXIT_CAN_NOT_ACCESS_STATEFILE (State-File is inaccessible) + available = False + try: + drbd_path = self._linstor.get_device_path(self.uuid) + available = util.pathexists(drbd_path) + except Exception: + pass + + if not available: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Cannot get device path of {}'.format(self.uuid) + ) + + # 1. Prepare http-nbd folder. + try: + if not os.path.exists('/dev/http-nbd/'): + os.makedirs('/dev/http-nbd/') + elif os.path.islink(self.path): + os.remove(self.path) + except OSError as e: + if e.errno != errno.EEXIST: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Cannot prepare http-nbd: {}'.format(e) + ) + + # 2. Start HTTP service if we have a diskful or if we are master. + http_service = None + if drbd_path: + assert(drbd_path in ( + '/dev/drbd/by-res/xcp-persistent-ha-statefile/0', + '/dev/drbd/by-res/xcp-persistent-redo-log/0' + )) + self._start_persistent_http_server(volume_name) + + # 3. Start NBD server in all cases. 
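# Illustrative sketch (not part of the patch): the pid-file plus process-group
# termination pattern used by _kill_persistent_server() above. The servers are
# started with preexec_fn=os.setsid, so killing the process group also reaps any
# children they forked. Path and function names here are hypothetical.
import os
import signal

def stop_daemon(pid_path, sig=signal.SIGTERM):
    if not os.path.exists(pid_path):
        return
    try:
        with open(pid_path) as pid_file:
            pid = int(pid_file.read().strip())
        os.killpg(os.getpgid(pid), sig)  # whole session created by os.setsid
    except (ValueError, ProcessLookupError, PermissionError) as e:
        print('ignoring: {}'.format(e))
    finally:
        os.remove(pid_path)

# Example: stop_daemon('/run/http-server-xcp-persistent-ha-statefile.pid')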
+ try: + self._start_persistent_nbd_server(volume_name) + except Exception as e: + if drbd_path: + self._kill_persistent_http_server(volume_name) + raise + + self.attached = True + return VDI.VDI.attach(self, self.sr.uuid, self.uuid) + + def _detach_using_http_nbd(self): + volume_name = self._check_http_nbd_volume_name() + self._kill_persistent_nbd_server(volume_name) + self._kill_persistent_http_server(volume_name) + # ------------------------------------------------------------------------------ if __name__ == '__main__': - SRCommand.run(LinstorSR, DRIVER_INFO) + def run(): + SRCommand.run(LinstorSR, DRIVER_INFO) + + if not TRACE_PERFS: + run() + else: + util.make_profile('LinstorSR', run) else: SR.registerSR(LinstorSR) diff --git a/drivers/blktap2.py b/drivers/blktap2.py index 3a419aadf..7b4735636 100755 --- a/drivers/blktap2.py +++ b/drivers/blktap2.py @@ -49,6 +49,12 @@ from xmlrpc.client import ServerProxy, Transport from socket import socket, AF_UNIX, SOCK_STREAM +try: + from linstorvolumemanager import log_drbd_openers + LINSTOR_AVAILABLE = True +except ImportError: + LINSTOR_AVAILABLE = False + PLUGIN_TAP_PAUSE = "tapdisk-pause" SOCKPATH = "/var/xapi/xcp-rrdd" @@ -811,7 +817,22 @@ def launch_on_tap(cls, blktap, path, _type, options): TapCtl.attach(pid, minor) try: - TapCtl.open(pid, minor, _type, path, options) + retry_open = 0 + while True: + try: + TapCtl.open(pid, minor, _type, path, options) + except TapCtl.CommandFailure as e: + err = ( + 'status' in e.info and e.info['status'] + ) or None + if err in (errno.EIO, errno.EROFS, errno.EAGAIN): + if retry_open < 5: + retry_open += 1 + time.sleep(1) + continue + if LINSTOR_AVAILABLE and err == errno.EROFS: + log_drbd_openers(path) + break try: tapdisk = cls.__from_blktap(blktap) node = '/sys/dev/block/%d:%d' % (tapdisk.major(), tapdisk.minor) diff --git a/drivers/cleanup.py b/drivers/cleanup.py index fe9f996cd..1fda9f69a 100755 --- a/drivers/cleanup.py +++ b/drivers/cleanup.py @@ -53,8 +53,10 @@ try: from linstorjournaler import LinstorJournaler from linstorvhdutil import LinstorVhdUtil - from linstorvolumemanager \ - import LinstorVolumeManager, LinstorVolumeManagerError + from linstorvolumemanager import get_controller_uri + from linstorvolumemanager import LinstorVolumeManager + from linstorvolumemanager import LinstorVolumeManagerError + LINSTOR_AVAILABLE = True except ImportError: LINSTOR_AVAILABLE = False @@ -477,7 +479,7 @@ def set_task_status(self, status): # # VDI # -class VDI: +class VDI(object): """Object representing a VDI of a VHD-based SR""" POLL_INTERVAL = 1 @@ -760,6 +762,12 @@ def delete(self): lock.Lock.cleanupAll(self.uuid) self._clear() + def getParent(self): + return vhdutil.getParent(self.path, lambda x: x.strip()) + + def repair(self, parent): + vhdutil.repair(parent) + def __str__(self): strHidden = "" if self.hidden: @@ -874,12 +882,14 @@ def _reportCoalesceError(vdi, ce): xapi.message.create(msg_name, "3", "SR", vdi.sr.uuid, msg_body) _reportCoalesceError = staticmethod(_reportCoalesceError) + def coalesce(self): + vhdutil.coalesce(self.path) + def _doCoalesceVHD(vdi): try: - startTime = time.time() vhdSize = vdi.getSizeVHD() - vhdutil.coalesce(vdi.path) + vdi.coalesce() endTime = time.time() vdi.sr.recordStorageSpeed(startTime, endTime, vhdSize) except util.CommandException as ce: @@ -918,13 +928,12 @@ def _coalesceVHD(self, timeOut): # Try a repair and reraise the exception parent = "" try: - parent = vhdutil.getParent(self.path, lambda x: x.strip()) - if not self._vdi_is_raw(parent): - # Repair error 
is logged and ignored. Error reraised later - util.SMlog('Coalesce failed on %s, attempting repair on ' \ - 'parent %s' % (self.uuid, parent)) - vhdutil.repair(parent) - except Exception as e: + parent = self.getParent() + # Repair error is logged and ignored. Error reraised later + util.SMlog('Coalesce failed on %s, attempting repair on ' \ + 'parent %s' % (self.uuid, parent)) + self.repair(parent) + except Exception, e: util.SMlog('(error ignored) Failed to repair parent %s ' \ 'after failed coalesce on %s, err: %s' % (parent, self.path, e)) @@ -1509,17 +1518,28 @@ def delete(self): self.sr.unlock() VDI.delete(self) - def pauseVDIs(self, vdiList): - self.sr._linstor.ensure_volume_list_is_not_locked( - vdiList, timeout=self.VOLUME_LOCK_TIMEOUT - ) - return super(VDI).pauseVDIs(vdiList) + def validate(self, fast=False): + if not self.sr._vhdutil.check(self.uuid, fast=fast): + raise util.SMException('VHD {} corrupted'.format(self)) - def _liveLeafCoalesce(self, vdi): + def pause(self, failfast=False): self.sr._linstor.ensure_volume_is_not_locked( - vdi.uuid, timeout=self.VOLUME_LOCK_TIMEOUT + self.uuid, timeout=self.VOLUME_LOCK_TIMEOUT + ) + return super(LinstorVDI, self).pause(failfast) + + def coalesce(self): + self.sr._vhdutil.force_coalesce(self.path) + + def getParent(self): + return self.sr._vhdutil.get_parent( + self.sr._linstor.get_volume_uuid_from_device_path(self.path) + ) + + def repair(self, parent_uuid): + self.sr._vhdutil.force_repair( + self.sr._linstor.get_device_path(parent_uuid) ) - return super(VDI)._liveLeafCoalesce(vdi) def _relinkSkip(self): abortFlag = IPCFlag(self.sr.uuid) @@ -1545,6 +1565,19 @@ def _relinkSkip(self): blktap2.VDI.tap_unpause(session, sr_uuid, vdi_uuid) self.children = [] + def _setParent(self, parent): + self.sr._vhdutil.force_parent(self.path, parent.path) + self.parent = parent + self.parentUuid = parent.uuid + parent.children.append(self) + try: + self.setConfig(self.DB_VHD_PARENT, self.parentUuid) + Util.log("Updated the vhd-parent field for child %s with %s" % \ + (self.uuid, self.parentUuid)) + except: + Util.log("Failed to update %s with vhd-parent field %s" % \ + (self.uuid, self.parentUuid)) + def _setHidden(self, hidden=True): HIDDEN_TAG = 'hidden' @@ -1563,7 +1596,7 @@ def _queryVHDBlocks(self): # # SR # -class SR: +class SR(object): class LogFilter: def __init__(self, sr): self.sr = sr @@ -2955,7 +2988,6 @@ def __init__(self, uuid, xapi, createLock, force): ) SR.__init__(self, uuid, xapi, createLock, force) - self._master_uri = 'linstor://localhost' self.path = LinstorVolumeManager.DEV_ROOT_PATH self._reloadLinstor() @@ -2982,6 +3014,12 @@ def scan(self, force=False): self.logFilter.logState() self._handleInterruptedCoalesceLeaf() + def pauseVDIs(self, vdiList): + self._linstor.ensure_volume_list_is_not_locked( + vdiList, timeout=LinstorVDI.VOLUME_LOCK_TIMEOUT + ) + return super(LinstorSR, self).pauseVDIs(vdiList) + def _reloadLinstor(self): session = self.xapi.session host_ref = util.get_this_host_ref(session) @@ -2994,12 +3032,13 @@ def _reloadLinstor(self): dconf = session.xenapi.PBD.get_device_config(pbd) group_name = dconf['group-name'] + controller_uri = get_controller_uri() self.journaler = LinstorJournaler( - self._master_uri, group_name, logger=util.SMlog + controller_uri, group_name, logger=util.SMlog ) self._linstor = LinstorVolumeManager( - self._master_uri, + controller_uri, group_name, repair=True, logger=util.SMlog @@ -3032,8 +3071,8 @@ def _load_vdi_info(self): # TODO: Ensure metadata contains the right info. 
- all_volume_info = self._linstor.volumes_with_info - volumes_metadata = self._linstor.volumes_with_metadata + all_volume_info = self._linstor.get_volumes_with_info() + volumes_metadata = self._linstor.get_volumes_with_metadata() for vdi_uuid, volume_info in all_volume_info.items(): try: if not volume_info.name and \ @@ -3048,7 +3087,7 @@ def _load_vdi_info(self): except Exception as e: Util.log( ' [VDI {}: failed to load VDI info]: {}' - .format(self.uuid, e) + .format(vdi_uuid, e) ) info = vhdutil.VHDInfo(vdi_uuid) info.error = 1 @@ -3064,8 +3103,10 @@ def _calcExtraSpaceNeeded(self, child, parent): virtual_size = LinstorVolumeManager.round_up_volume_size( parent.sizeVirt + meta_overhead + bitmap_overhead ) - # TODO: Check result. - return virtual_size - self._linstor.get_volume_size(parent.uuid) + volume_size = self._linstor.get_volume_size(parent.uuid) + + assert virtual_size >= volume_size + return virtual_size - volume_size def _hasValidDevicePath(self, uuid): try: @@ -3075,6 +3116,16 @@ def _hasValidDevicePath(self, uuid): return False return True + def _liveLeafCoalesce(self, vdi): + self.lock() + try: + self._linstor.ensure_volume_is_not_locked( + vdi.uuid, timeout=LinstorVDI.VOLUME_LOCK_TIMEOUT + ) + return super(LinstorSR, self)._liveLeafCoalesce(vdi) + finally: + self.unlock() + def _handleInterruptedCoalesceLeaf(self): entries = self.journaler.get_all(VDI.JRN_LEAF) for uuid, parentUuid in entries.items(): @@ -3101,7 +3152,6 @@ def _undoInterruptedCoalesceLeaf(self, childUuid, parentUuid): 'Renaming parent back: {} -> {}'.format(childUuid, parentUuid) ) parent.rename(parentUuid) - util.fistpoint.activate('LVHDRT_coaleaf_undo_after_rename', self.uuid) child = self.getVDI(childUuid) if not child: @@ -3117,9 +3167,6 @@ def _undoInterruptedCoalesceLeaf(self, childUuid, parentUuid): Util.log('Updating the VDI record') child.setConfig(VDI.DB_VHD_PARENT, parentUuid) child.setConfig(VDI.DB_VDI_TYPE, vhdutil.VDI_TYPE_VHD) - util.fistpoint.activate( - 'LVHDRT_coaleaf_undo_after_rename2', self.uuid - ) # TODO: Maybe deflate here. @@ -3128,10 +3175,7 @@ def _undoInterruptedCoalesceLeaf(self, childUuid, parentUuid): if not parent.hidden: parent._setHidden(True) self._updateSlavesOnUndoLeafCoalesce(parent, child) - util.fistpoint.activate('LVHDRT_coaleaf_undo_end', self.uuid) Util.log('*** leaf-coalesce undo successful') - if util.fistpoint.is_active('LVHDRT_coaleaf_stop_after_recovery'): - child.setConfig(VDI.DB_LEAFCLSC, VDI.LEAFCLSC_DISABLED) def _finishInterruptedCoalesceLeaf(self, childUuid, parentUuid): Util.log('*** FINISH LEAF-COALESCE') @@ -3144,32 +3188,21 @@ def _finishInterruptedCoalesceLeaf(self, childUuid, parentUuid): except XenAPI.Failure: pass self._updateSlavesOnResize(vdi) - util.fistpoint.activate('LVHDRT_coaleaf_finish_end', self.uuid) Util.log('*** finished leaf-coalesce successfully') def _checkSlaves(self, vdi): try: - states = self._linstor.get_usage_states(vdi.uuid) - for node_name, state in states.items(): - self._checkSlave(node_name, vdi, state) + all_openers = self._linstor.get_volume_openers(vdi.uuid) + for openers in all_openers.itervalues(): + for opener in openers.values(): + if opener['process-name'] != 'tapdisk': + raise util.SMException( + 'VDI {} is in use: {}'.format(vdi.uuid, all_openers) + ) except LinstorVolumeManagerError as e: if e.code != LinstorVolumeManagerError.ERR_VOLUME_NOT_EXISTS: raise - @staticmethod - def _checkSlave(node_name, vdi, state): - # If state is None, LINSTOR doesn't know the host state - # (bad connection?). 
- if state is None: - raise util.SMException( - 'Unknown state for VDI {} on {}'.format(vdi.uuid, node_name) - ) - - if state: - raise util.SMException( - 'VDI {} is in use on {}'.format(vdi.uuid, node_name) - ) - ################################################################################ # diff --git a/drivers/linstor-manager b/drivers/linstor-manager index f7ce18099..9e96aacac 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -14,32 +14,52 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . +# We must modify default import path, we don't want to import modules +# installed in plugins folder and instead we must import from LINSTOR driver +# folder. +import sys +sys.path[0] = '/opt/xensource/sm/' + import base64 import distutils.util -import subprocess -import sys +import os +import socket +import XenAPI import XenAPIPlugin -sys.path.append('/opt/xensource/sm/') from linstorjournaler import LinstorJournaler -from linstorvolumemanager import LinstorVolumeManager +from linstorvolumemanager import get_controller_uri, get_local_volume_openers, LinstorVolumeManager from lock import Lock import json import LinstorSR +import re import util import vhdutil +BACKING_DISK_RE = re.compile('^/dev/([^/]+)/(?:[^/]+)$') +LVM_PLUGIN = 'lvm.py' +THIN_POOL = 'thin_pool' FIREWALL_PORT_SCRIPT = '/etc/xapi.d/plugins/firewall-port' -LINSTOR_PORTS = [3366, 3370, 3376, 3377, '7000:8000'] +LINSTOR_PORTS = [3366, 3370, 3376, 3377, 8076, 8077] +DRBD_PORTS = '7000:8000' + +DRBD_REACTOR_CONF = '/etc/drbd-reactor.d/sm-linstor.toml' +DRBD_REACTOR_CONF_CONTENT = """[[promoter]] -def get_linstor_uri(session): - return 'linstor://{}'.format(util.get_master_rec(session)['address']) +[promoter.resources.xcp-persistent-database] +start = [ "var-lib-linstor.service", "linstor-controller.service" ] +""" +DRBD_REACTOR_DEPS = [ + '/run/systemd/system/linstor-controller.service.d/reactor.conf', + '/run/systemd/system/var-lib-linstor.service.d/reactor.conf' +] -def update_port(port, open): - fn = 'open' if open else 'close' + +def update_linstor_port(port, open_ports): + fn = 'open' if open_ports else 'close' args = ( FIREWALL_PORT_SCRIPT, fn, str(port), 'tcp' ) @@ -50,28 +70,238 @@ def update_port(port, open): raise Exception('Failed to {} port: {} {}'.format(fn, out, err)) -def update_all_ports(open): - for port in LINSTOR_PORTS: - update_port(port, open) +def has_iptables_rule(rule): + (ret, stdout, stderr) = util.doexec(['iptables', '-C'] + rule) + return not ret -def update_service(start): - fn = 'enable' if start else 'disable' - args = ('systemctl', fn, '--now', 'linstor-satellite') - (ret, out, err) = util.doexec(args) - if ret == 0: +def update_drbd_ports(open_ports): + # We want to use a static rule regarding DRBD volumes, + # so we can't use the XAPI firewall port script, we have to manually + # check for existing rules before updating iptables service. 
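+    # Idempotency sketch (standard iptables semantics assumed): `iptables -C`
+    # exits 0 when the rule already exists and non-zero otherwise, so the
+    # sequence below boils down to:
+    #   iptables -C INPUT -p tcp --dport 7000:8000 -j ACCEPT   # already there?
+    #   iptables -I INPUT 1 -p tcp --dport 7000:8000 -j ACCEPT # open ports
+    #   iptables -D INPUT -p tcp --dport 7000:8000 -j ACCEPT   # close ports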
+ rule = ['INPUT', '-p', 'tcp', '--dport', DRBD_PORTS, '-j', 'ACCEPT'] + if open_ports == has_iptables_rule(rule): return - raise Exception('Failed to {} satellite: {} {}'.format(fn, out, err)) + if open_ports: + rule.insert(1, '1') + (ret, stdout, stderr) = util.doexec(['iptables', '-I'] + rule) + if ret: + raise Exception('Failed to add DRBD rule: {}'.format(stderr)) + else: + (ret, stdout, stderr) = util.doexec(['iptables', '-D'] + rule) + if ret: + raise Exception('Failed to remove DRBD rule: {}'.format(stderr)) + (ret, stdout, stderr) = util.doexec(['service', 'iptables', 'save']) + if ret: + raise Exception('Failed to save DRBD rule: {}'.format(stderr)) + + +def update_all_ports(open_ports): + for port in LINSTOR_PORTS: + update_linstor_port(port, open_ports) + update_drbd_ports(open_ports) + + +def update_linstor_satellite_service(start): + service = 'linstor-satellite' + + # Stop services in all cases first. + # Ensure we don't have an invalid cache used by a satellite. + # (We found an issue with a new added disk which used a volume group name + # formerly involved by another disk. To avoid this kind of problem, we + # always restart the satellite.) + util.enable_and_start_service(service, False) + if start: + util.enable_and_start_service(service, True) + + +def update_drbd_reactor_service(start): + if start: + util.atomicFileWrite(DRBD_REACTOR_CONF, None, DRBD_REACTOR_CONF_CONTENT) + else: + try: + os.remove(DRBD_REACTOR_CONF) + except Exception: + pass + + util.stop_service('drbd-reactor') + + try: + util.stop_service('drbd-promote@xcp\x2dpersistent\x2ddatabase.service') + except Exception as e: + if str(e).rstrip().endswith(' not loaded.'): + pass + raise e + + util.stop_service('linstor-controller') + util.stop_service('var-lib-linstor.service') + + for dep in DRBD_REACTOR_DEPS: + try: + os.remove(dep) + except Exception: + pass + + util.doexec(['systemctl', 'daemon-reload']) + util.enable_and_start_service('drbd-reactor', start) + + +def exec_create_sr(session, name, description, disks, volume_group, redundancy, provisioning, force): + disk_hostnames = disks.keys() + thin = provisioning == 'thin' + + # Create volumes. 
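+    # `disks` is expected to map each hostname to the list of block devices
+    # used to build the volume group, e.g. (hypothetical device names):
+    #   {'host-1': ['/dev/sdb'], 'host-2': ['/dev/sdb', '/dev/sdc']}
+    # Hosts absent from this map are skipped when creating PVs/VGs below.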
+ hosts = session.xenapi.host.get_all_records() + hostnames = [] + for host_ref, host_record in hosts.items(): + hostname = host_record['hostname'] + hostnames.append(hostname) + + if force: + try: + session.xenapi.host.call_plugin( + host_ref, LVM_PLUGIN, 'destroy_volume_group', { + 'vg_name': volume_group, + 'force': 'True' + } + ) + except Exception as e: + try: + response = session.xenapi.host.call_plugin( + host_ref, LVM_PLUGIN, 'list_volume_groups', { + 'vg_name': volume_group + } + ) + if response != '{}': + raise e + except Exception: + raise e + + if hostname not in disk_hostnames or not disks[hostname]: + if force or session.xenapi.host.call_plugin( + host_ref, LVM_PLUGIN, 'list_volume_groups', { + 'vg_name': volume_group + } + ) == '{}': + continue + raise Exception('Volume group should not exist on `{}`, you must remove it manually'.format(hostname)) + + host_disks = disks[hostname] + if type(host_disks) is list: + host_disks = ','.join(disks[hostname]) + else: + raise Exception('Disk value of `{}` must be a disk list'.format(hostname)) + + session.xenapi.host.call_plugin( + host_ref, LVM_PLUGIN, 'create_physical_volume', { + 'devices': host_disks, + 'force': str(force) + } + ) + + session.xenapi.host.call_plugin( + host_ref, LVM_PLUGIN, 'create_volume_group', { + 'vg_name': volume_group, + 'devices': host_disks + } + ) + + if thin: + session.xenapi.host.call_plugin( + host_ref, LVM_PLUGIN, 'create_thin_pool', { + 'vg_name': volume_group, + 'lv_name': THIN_POOL + } + ) + + # Create SR. + master_ref = session.xenapi.pool.get_all_records().values()[0]['master'] + + device_config = { + 'redundancy': str(redundancy), + 'provisioning': 'thin' if thin else 'thick', + 'group-name': '{}/{}'.format(volume_group, THIN_POOL) if thin else volume_group, + 'hosts': ','.join(hostnames), + 'monitor-db-quorum': str(len(hostnames) > 2) + } + sr_ref = session.xenapi.SR.create( + master_ref, device_config, '0', name, description, 'linstor', '', True, {} + ) + return session.xenapi.SR.get_uuid(sr_ref) + + +def get_drbd_volumes(volume_group=None): + drbd_volumes = {} + (ret, stdout, stderr) = util.doexec(['drbdsetup', 'show', '--json']) + if ret: + raise Exception('Failed to get JSON object: {}'.format(stderr)) + + config = json.loads(stdout) + for resource in config: + for volume in resource['_this_host']['volumes']: + backing_disk = volume['backing-disk'] + match = BACKING_DISK_RE.match(backing_disk) + if not match: + continue + + cur_volume_group = match.groups()[0] + if volume_group and cur_volume_group != volume_group: + continue + + minor = int(volume['device_minor']) + if cur_volume_group in drbd_volumes: + drbd_volumes[cur_volume_group].append(minor) + else: + drbd_volumes[cur_volume_group] = [minor] + return drbd_volumes + + +def force_destroy_drbd_volume(minor): + (ret, stdout, stderr) = util.doexec(['drbdsetup', 'detach', minor, '--force']) + if ret: + raise Exception('Failed to detach volume: {}'.format(stderr)) + (ret, stdout, stderr) = util.doexec(['drbdsetup', 'del-minor', minor]) + if ret: + raise Exception('Failed to destroy volume: {}'.format(stderr)) + +# ------------------------------------------------------------------------------ + + +def prepare_sr(session, args): + try: + LinstorSR.activate_lvm_group(args['groupName']) + + update_all_ports(open_ports=True) + # We don't want to enable and start drbd-reactor daemon during + # SR creation. 
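+        # (drbd-reactor's promoter would otherwise bring up linstor-controller
+        # as soon as the xcp-persistent-database resource is promotable, see
+        # DRBD_REACTOR_CONF_CONTENT above.)
+        # Usage sketch, with a hypothetical group name; this entry point is
+        # exposed as `prepareSr` in the dispatch table below:
+        #   session.xenapi.host.call_plugin(
+        #       host_ref, 'linstor-manager', 'prepareSr',
+        #       {'groupName': 'linstor_group/thin_pool'}
+        #   )  # returns 'True' or 'False' as a string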
+ update_drbd_reactor_service(start=False) + update_linstor_satellite_service(start=True) + return str(True) + except Exception as e: + util.SMlog('linstor-manager:prepare_sr error: {}'.format(e)) + return str(False) + + +def release_sr(session, args): + try: + update_linstor_satellite_service(start=False) + update_drbd_reactor_service(start=False) + update_all_ports(open_ports=False) + return str(True) + except Exception as e: + util.SMlog('linstor-manager:release_sr error: {}'.format(e)) + return str(False) -def enable(session, args): +def update_drbd_reactor(session, args): try: enabled = distutils.util.strtobool(args['enabled']) - update_all_ports(open=enabled) - update_service(start=enabled) + update_drbd_reactor_service(start=enabled) return str(True) except Exception as e: - util.SMlog('linstor-manager:disable error: {}'.format(e)) + util.SMlog( + 'linstor-manager:update_drbd_reactor error: {}'.format(e) + ) return str(False) @@ -81,12 +311,12 @@ def attach(session, args): vdi_uuid = args['vdiUuid'] group_name = args['groupName'] - linstor_uri = get_linstor_uri(session) + controller_uri = get_controller_uri() journaler = LinstorJournaler( - linstor_uri, group_name, logger=util.SMlog + controller_uri, group_name, logger=util.SMlog ) linstor = LinstorVolumeManager( - linstor_uri, + controller_uri, group_name, logger=util.SMlog ) @@ -104,7 +334,7 @@ def detach(session, args): group_name = args['groupName'] linstor = LinstorVolumeManager( - get_linstor_uri(session), + get_controller_uri(), group_name, logger=util.SMlog ) @@ -115,10 +345,37 @@ def detach(session, args): return str(False) +def destroy(session, args): + try: + group_name = args['groupName'] + + # When destroy is called, there are no running drbd-reactor daemons. + # So the controllers are stopped too, we must start an instance. 
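+        # Assumption (inferred from the promoter config above): the
+        # var-lib-linstor.service unit mounts the xcp-persistent-database
+        # volume on /var/lib/linstor, so it must run before a temporary local
+        # controller can be started.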
+ util.restart_service('var-lib-linstor.service') + util.restart_service('linstor-controller') + + linstor = LinstorVolumeManager( + 'linstor://localhost', + group_name, + logger=util.SMlog + ) + linstor.destroy() + return str(True) + except Exception as e: + util.stop_service('linstor-controller') + util.stop_service('var-lib-linstor.service') + util.SMlog('linstor-manager:destroy error: {}'.format(e)) + return str(False) + + def check(session, args): try: device_path = args['devicePath'] - return str(vhdutil.check(device_path)) + ignore_missing_footer = distutils.util.strtobool( + args['ignoreMissingFooter'] + ) + fast = distutils.util.strtobool(args['fast']) + return str(vhdutil.check(device_path, ignore_missing_footer, fast)) except Exception as e: util.SMlog('linstor-manager:check error: {}'.format(e)) raise @@ -131,7 +388,7 @@ def get_vhd_info(session, args): include_parent = distutils.util.strtobool(args['includeParent']) linstor = LinstorVolumeManager( - get_linstor_uri(session), + get_controller_uri(), group_name, logger=util.SMlog ) @@ -143,7 +400,7 @@ def get_vhd_info(session, args): ) vhd_info = vhdutil.getVHDInfo( - device_path, extract_uuid, include_parent + device_path, extract_uuid, include_parent, False ) return json.dumps(vhd_info.__dict__) except Exception as e: @@ -166,7 +423,7 @@ def get_parent(session, args): group_name = args['groupName'] linstor = LinstorVolumeManager( - get_linstor_uri(session), + get_controller_uri(), group_name, logger=util.SMlog ) @@ -228,6 +485,37 @@ def get_block_bitmap(session, args): raise +def set_parent(session, args): + try: + device_path = args['devicePath'] + parent_path = args['parentPath'] + vhdutil.setParent(device_path, parent_path, False) + return '' + except Exception as e: + util.SMlog('linstor-manager:set_parent error: {}'.format(e)) + raise + + +def coalesce(session, args): + try: + device_path = args['devicePath'] + vhdutil.coalesce(device_path) + return '' + except Exception as e: + util.SMlog('linstor-manager:coalesce error: {}'.format(e)) + raise + + +def repair(session, args): + try: + device_path = args['devicePath'] + vhdutil.repair(device_path) + return '' + except Exception as e: + util.SMlog('linstor-manager:repair error: {}'.format(e)) + raise + + def lock_vdi(session, args): lock = None try: @@ -236,10 +524,13 @@ def lock_vdi(session, args): group_name = args['groupName'] locked = distutils.util.strtobool(args['locked']) + # We must lock to mark the VDI. lock = Lock(vhdutil.LOCK_TYPE_SR, sr_uuid) + if locked: + lock.acquire() linstor = LinstorVolumeManager( - get_linstor_uri(session), + get_controller_uri(), group_name, logger=util.SMlog ) @@ -249,16 +540,523 @@ def lock_vdi(session, args): except Exception as e: util.SMlog('linstor-manager:lock_vdi error: {}'.format(e)) finally: - if lock: + if locked and lock: lock.release() return str(False) +def has_controller_running(session, args): + (ret, stdout, stderr) = util.doexec([ + 'systemctl', 'is-active', '--quiet', 'linstor-controller' + ]) + return str(ret == 0) + + +def add_host(session, args): + group_name = args['groupName'] + + # 1. Find SR and PBDs. + srs = dict() + for sr_ref, sr in session.xenapi.SR.get_all_records().items(): + if sr.get('type') == 'linstor': + srs[sr_ref] = sr + + pbds = dict() + for pbd_ref, pbd in session.xenapi.PBD.get_all_records().items(): + device_config = pbd.get('device_config') + if ( + device_config and + device_config.get('group-name') == group_name + and pbd['SR'] in srs + ): + pbds[pbd_ref] = pbd + + # 2. 
Ensure there is at least one PBD and all PBDs are used in + # the same SR. + if not pbds: + raise Exception( + 'Failed to find PBDs of group `{}`'.format(group_name) + ) + + sr_ref = None + for pbd in pbds.values(): + if not sr_ref: + sr_ref = pbd['SR'] + elif pbd['SR'] != sr_ref: + raise Exception( + 'Group `{}` is used by many SRs!'.format(group_name) + ) + + # 3. Ensure node doesn't exist. + linstor = LinstorVolumeManager( + get_controller_uri(), + group_name, + logger=util.SMlog + ) + + node_name = socket.gethostname() + has_node = linstor.has_node(node_name) + + pbd_id = 0 + new_pbd_ref = None + + try: + # 4. Enable services. + update_all_ports(open_ports=True) + update_drbd_reactor_service(start=True) + update_linstor_satellite_service(start=True) + + # 5. Try to create local node. + if not has_node: + linstor.create_node(node_name, util.get_this_host_address(session)) + + # 6. Recreate PBDs. + # Use the redundancy given by Linstor instead of smapi config. + redundancy = linstor.redundancy + default_device_config = None + this_host = util.get_this_host_ref(session) + create_new_pbd = True + + assert pbds + pbds = pbds.items() + for pbd_ref, pbd in pbds: + device_config = pbd['device_config'] + + hosts = filter( + lambda host: len(host.strip()), + device_config.get('hosts', []).split(',') + ) + hosts.append(node_name) + hosts = ','.join(list(set(hosts))) + + # Should be the same on all hosts. + provisioning = device_config['provisioning'] + + if not default_device_config: + default_device_config = { + 'group-name': group_name, + 'redundancy': redundancy, + 'hosts': hosts, + 'provisioning': provisioning + } + + if pbd['currently_attached']: + session.xenapi.PBD.unplug(pbd_ref) + session.xenapi.PBD.destroy(pbd_ref) + pbd_id += 1 + + host = pbd['host'] + if host == this_host: + create_new_pbd = False + + pbd_ref = session.xenapi.PBD.create({ + 'host': host, + 'SR': sr_ref, + 'device_config': { + 'group-name': group_name, + 'redundancy': redundancy, + 'hosts': hosts, + 'provisioning': provisioning + } + }) + try: + session.xenapi.PBD.plug(pbd_ref) + except Exception as e: + util.SMlog('Failed to replug PBD: {}'.format(e)) + + # 7. Create new PBD. + if create_new_pbd: + new_pbd_ref = session.xenapi.PBD.create({ + 'host': this_host, + 'SR': sr_ref, + 'device_config': default_device_config + }) + try: + session.xenapi.PBD.plug(new_pbd_ref) + except Exception as e: + util.SMlog('Failed to plug new PBD: {}'.format(e)) + + return str(True) + except Exception as e: + stop_services = not has_node + if stop_services: + try: + linstor.destroy_node(node_name) + except Exception: + pass + + for pbd_ref, pbd in pbds[:pbd_id]: + try: + session.xenapi.PBD.unplug(pbd_ref) + except Exception: + pass + + try: + session.xenapi.PBD.destroy(pbd_ref) + except Exception: + pass + + try: + device_config = pbd['device_config'] + session.xenapi.PBD.create({ + 'host': host, + 'SR': sr_ref, + 'device_config': { + 'group-name': group_name, + 'redundancy': redundancy, + 'hosts': device_config['hosts'], + 'provisioning': device_config['provisioning'] + } + }) + except Exception as pbd_error: + util.SMlog('Failed to recreate PBD: {}'.format(pbd_error)) + pass + + try: + session.xenapi.PBD.plug(pbd_ref) + except Exception: + pass + + if new_pbd_ref: + try: + session.xenapi.PBD.unplug(new_pbd_ref) + except Exception: + pass + + try: + session.xenapi.PBD.destroy(new_pbd_ref) + except Exception: + pass + + try: + # If we failed to remove the node, we don't stop services. 
+ if stop_services and not linstor.has_node(node_name): + update_linstor_satellite_service(start=False) + update_drbd_reactor_service(start=False) + update_all_ports(open_ports=False) + except Exception: + pass + + raise e + + +def remove_host(session, args): + group_name = args['groupName'] + + # 1. Find SRs and PBDs. + srs = dict() + for sr_ref, sr in session.xenapi.SR.get_all_records().items(): + if sr.get('type') == 'linstor': + srs[sr_ref] = sr + + pbds = dict() + for pbd_ref, pbd in session.xenapi.PBD.get_all_records().items(): + device_config = pbd.get('device_config') + if ( + device_config and + device_config.get('group-name') == group_name + and pbd['SR'] in srs + ): + pbds[pbd_ref] = pbd + + # 2. Remove node. + linstor = LinstorVolumeManager( + get_controller_uri(), + group_name, + logger=util.SMlog + ) + + node_name = socket.gethostname() + if linstor.has_node(node_name): + linstor.destroy_node(node_name) + if linstor.has_node(node_name): + raise Exception('Failed to remove node! Unknown error.') + + redundancy = linstor.redundancy + this_host = util.get_this_host_ref(session) + + # 3. Update PBDs. + for pbd_ref, pbd in pbds.items(): + host = pbd['host'] + if host == this_host: + if pbd['currently_attached']: + session.xenapi.PBD.unplug(pbd_ref) + session.xenapi.PBD.destroy(pbd_ref) + continue + + device_config = pbd['device_config'] + hosts = device_config.get('hosts', []).split(',') + try: + hosts.remove(node_name) + except Exception as e: + continue + hosts = ','.join(list(set(hosts))) + + if pbd['currently_attached']: + session.xenapi.PBD.unplug(pbd_ref) + session.xenapi.PBD.destroy(pbd_ref) + + pbd_ref = session.xenapi.PBD.create({ + 'host': host, + 'SR': pbd['SR'], + 'device_config': { + 'group-name': group_name, + 'redundancy': redundancy, + 'hosts': hosts, + 'provisioning': device_config['provisioning'] + } + }) + + try: + session.xenapi.PBD.plug(pbd_ref) + except Exception as e: + util.SMlog('Failed to replug PBD: {}'.format(e)) + + # 3. Stop services. + try: + update_linstor_satellite_service(start=False) + update_drbd_reactor_service(start=False) + update_all_ports(open_ports=False) + except Exception as e: + util.SMlog('Error while stopping services: {}'.format(e)) + pass + + return str('True') + + +def create_sr(session, args): + try: + # Use a complex parsing contrary to the other functions because + # this helper is a public method and is not easy to use. 
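+        # CLI usage sketch (hypothetical UUIDs and device names), matching the
+        # arguments parsed below:
+        #   xe host-call-plugin host-uuid=<master-uuid> plugin=linstor-manager \
+        #     fn=createSr args:name=linstor-sr args:volume_group=linstor_group \
+        #     args:redundancy=2 args:provisioning=thin \
+        #     args:disks='{"host-1": ["/dev/sdb"], "host-2": ["/dev/sdb"]}'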
+ name = args.get('name') + if not name: + raise Exception('`name` is empty') + + description = args.get('description') or '' + + disks = args.get('disks') + if not disks: + raise Exception('`disks` is empty') + try: + disks = json.loads(disks) + except Exception as e: + raise Exception('failed to decode `disks`: {}'.format(e)) + if type(disks) is not dict: + raise Exception('`disks` must be a JSON object') + + volume_group = args.get('volume_group') + if not volume_group: + raise Exception('`volume_group` is empty') + + redundancy = args.get('redundancy') + if not redundancy: + raise Exception('`redundancy` is empty') + + try: + redundancy = int(redundancy) + except Exception: + raise Exception('`redundancy` is not a number') + + provisioning = args.get('provisioning') + if not provisioning: + provisioning = 'thin' + elif provisioning != 'thin' and provisioning != 'thick': + raise Exception('unsupported provisioning') + + force = distutils.util.strtobool(args.get('force') or '0') + + return exec_create_sr( + session, name, description, disks, volume_group, redundancy, provisioning, force + ) + except Exception as e: + util.SMlog('linstor-manager:create_sr error: {}'.format(e)) + raise + + +def demote_drbd_resource(session, args): + try: + resource_name = args['resource_name'] + (ret, stdout, stderr) = util.doexec(['drbdsetup', 'secondary', resource_name]) + if ret: + raise Exception('Failed to demote resource: {}'.format(stderr)) + return str(True) + except Exception as e: + util.SMlog('linstor-manager:demote_drbd_resource error: {}'.format(e)) + return str(False) + + +def list_drbd_volumes(session, args): + try: + volume_group = args.get('volume_group') + return json.dumps(get_drbd_volumes(volume_group)) + except Exception as e: + util.SMlog('linstor-manager:list_drbd_volumes error: {}'.format(e)) + raise + + +def destroy_drbd_volume(session, args): + try: + minor = args.get('minor') + if not minor: + raise Exception('Cannot destroy DRBD volume without minor.') + force_destroy_drbd_volume(minor) + return str(True) + except Exception as e: + util.SMlog('linstor-manager:destroy_drbd_volume error: {}'.format(e)) + return str(False) + + +def destroy_drbd_volumes(session, args): + try: + volume_group = args.get('volume_group') + if not volume_group: + raise Exception('Cannot destroy DRBD volumes without volume group.') + for minor in get_drbd_volumes(volume_group).get(volume_group, []): + force_destroy_drbd_volume(str(minor)) + return str(True) + except Exception as e: + util.SMlog('linstor-manager:destroy_drbd_volumes error: {}'.format(e)) + return str(False) + + +def get_drbd_openers(session, args): + try: + resource_name = args.get('resourceName') + volume = args.get('volume') + return get_local_volume_openers(resource_name, volume) + except Exception as e: + util.SMlog('linstor-manager:get_drbd_openers error: {}'.format(e)) + raise + + +def health_check(session, args): + group_name = args['groupName'] + + result = { + 'controller-uri': '', + 'nodes': {}, + 'storage-pools': {}, + 'warnings': [], + 'errors': [] + } + + def format_result(): + return json.dumps(result) + + # 1. Get controller. + try: + controller_uri = get_controller_uri() + + result['controller-uri'] = controller_uri + try: + if controller_uri == 'linstor://localhost': + # Replace `localhost` with IP to give a better info for users. + result['controller-uri'] = 'linstor://' + util.get_this_host_address(session) + except Exception: + # Ignore error: can be a XAPI restart or something else. 
+ pass + + linstor = LinstorVolumeManager( + controller_uri, + group_name, + logger=util.SMlog + ) + except Exception as e: + # Probably a network issue, or offline controller. + result['errors'].append('Cannot join SR: `{}`.'.format(e)) + return format_result() + + try: + # 2. Check node statuses. + nodes = linstor.get_nodes_info() + result['nodes'] = nodes + for node_name, status in nodes.items(): + if status != 'ONLINE': + result['warnings'].append('Node `{}` is {}.'.format(node_name, status)) + + # 3. Check storage pool statuses. + storage_pools_per_node = linstor.get_storage_pools_info() + result['storage-pools'] = storage_pools_per_node + for node_name, storage_pools in storage_pools_per_node.items(): + for storage_pool in storage_pools: + free_size = storage_pool['free-size'] + capacity = storage_pool['capacity'] + if free_size < 0 or capacity <= 0: + result['errors'].append( + 'Cannot get free size and/or capacity of storage pool `{}`.' + .format(storage_pool['uuid']) + ) + elif free_size > capacity: + result['errors'].append( + 'Free size of storage pool `{}` is greater than capacity.' + .format(storage_pool['uuid']) + ) + else: + remaining_percent = free_size / float(capacity) * 100.0 + threshold = 10.0 + if remaining_percent < threshold: + result['warnings'].append( + 'Remaining size of storage pool `{}` is below {}% of its capacity.' + .format(storage_pool['uuid'], threshold) + ) + + # 4. Check resource statuses. + all_resources = linstor.get_resources_info() + result['resources'] = all_resources + + for resource_name, resource_by_node in all_resources.items(): + for node_name, resource in resource_by_node.items(): + for volume_index, volume in enumerate(resource['volumes']): + disk_state = volume['disk-state'] + if disk_state in ['UpToDate', 'Created', 'Attached']: + continue + if disk_state == 'DUnknown': + result['warnings'].append( + 'Unknown state for volume `{}` at index {} for resource `{}` on node `{}`' + .format(volume['device-path'], volume_index, resource_name, node_name) + ) + continue + if disk_state in ['Inconsistent', 'Failed', 'To: Creating', 'To: Attachable', 'To: Attaching']: + result['errors'].append( + 'Invalid state `{}` for volume `{}` at index {} for resource `{}` on node `{}`' + .format(disk_state, volume['device-path'], volume_index, resource_name, node_name) + ) + continue + if disk_state == 'Diskless': + if resource['diskful']: + result['errors'].append( + 'Unintentional diskless state detected for volume `{}` at index {} for resource `{}` on node `{}`' + .format(volume['device-path'], volume_index, resource_name, node_name) + ) + elif resource['tie-breaker']: + volume['disk-state'] = 'TieBreaker' + continue + result['warnings'].append( + 'Unhandled state `{}` for volume `{}` at index {} for resource `{}` on node `{}`' + .format(disk_state, volume['device-path'], volume_index, resource_name, node_name) + ) + + except Exception as e: + result['errors'].append('Unexpected error: `{}`'.format(e)) + + return format_result() + + if __name__ == '__main__': XenAPIPlugin.dispatch({ - 'enable': enable, + 'prepareSr': prepare_sr, + 'releaseSr': release_sr, + 'updateDrbdReactor': update_drbd_reactor, 'attach': attach, 'detach': detach, + 'destroy': destroy, + + # vhdutil wrappers called by linstorvhdutil. + # Note: When a VHD is open in RO mode (so for all vhdutil getters), + # the LVM layer is used directly to bypass DRBD verifications. + # In this case there can't be EROFS errors. 
+ # Note 2: We assume linstorvhdutil executes remote calls on diskful + # DRBDs, otherwise we still have EROFS errors... 'check': check, 'getVHDInfo': get_vhd_info, 'hasParent': has_parent, @@ -268,5 +1066,22 @@ if __name__ == '__main__': 'getDepth': get_depth, 'getKeyHash': get_key_hash, 'getBlockBitmap': get_block_bitmap, - 'lockVdi': lock_vdi + + # Called by cleanup.py to coalesce when a primary + # is opened on a non-local host. + 'setParent': set_parent, + 'coalesce': coalesce, + 'repair': repair, + + 'lockVdi': lock_vdi, + 'hasControllerRunning': has_controller_running, + 'addHost': add_host, + 'removeHost': remove_host, + 'createSr': create_sr, + 'listDrbdVolumes': list_drbd_volumes, + 'demoteDrbdResource': demote_drbd_resource, + 'destroyDrbdVolume': destroy_drbd_volume, + 'destroyDrbdVolumes': destroy_drbd_volumes, + 'getDrbdOpeners': get_drbd_openers, + 'healthCheck': health_check }) diff --git a/drivers/linstorjournaler.py b/drivers/linstorjournaler.py index bc7cff7c2..a61d9f11b 100755 --- a/drivers/linstorjournaler.py +++ b/drivers/linstorjournaler.py @@ -16,7 +16,8 @@ # -from linstorvolumemanager import LinstorVolumeManager +from linstorvolumemanager import \ + get_controller_uri, LinstorVolumeManager, LinstorVolumeManagerError import linstor import re import util @@ -52,20 +53,10 @@ def __init__(self, uri, group_name, logger=default_logger.__func__): self._namespace = '{}journal/'.format( LinstorVolumeManager._build_sr_namespace() ) - - def connect(): - self._journal = linstor.KV( - LinstorVolumeManager._build_group_name(group_name), - uri=uri, - namespace=self._namespace - ) - - util.retry( - connect, - maxretry=60, - exceptions=[linstor.errors.LinstorNetworkError] - ) self._logger = logger + self._journal = self._create_journal_instance( + uri, group_name, self._namespace + ) def create(self, type, identifier, value): # TODO: Maybe rename to 'add' in the future (in Citrix code too). @@ -116,6 +107,7 @@ def remove(self, type, identifier): ) def get(self, type, identifier): + self._reset_namespace() return self._journal.get(self._get_key(type, identifier)) def get_all(self, type): @@ -150,6 +142,34 @@ def hasJournals(self, identifier): def _reset_namespace(self): self._journal.namespace = self._namespace + @classmethod + def _create_journal_instance(cls, uri, group_name, namespace): + def connect(uri): + if not uri: + uri = get_controller_uri() + if not uri: + raise LinstorVolumeManagerError( + 'Unable to find controller uri...' + ) + return linstor.KV( + LinstorVolumeManager._build_group_name(group_name), + uri=uri, + namespace=namespace + ) + + try: + return connect(uri) + except (linstor.errors.LinstorNetworkError, LinstorVolumeManagerError): + pass + + return util.retry( + lambda: connect(None), + maxretry=10, + exceptions=[ + linstor.errors.LinstorNetworkError, LinstorVolumeManagerError + ] + ) + @staticmethod def _get_key(type, identifier): return '{}/{}'.format(type, identifier) diff --git a/drivers/linstorvhdutil.py b/drivers/linstorvhdutil.py index 7a1356627..83d7f8be5 100644 --- a/drivers/linstorvhdutil.py +++ b/drivers/linstorvhdutil.py @@ -25,9 +25,46 @@ MANAGER_PLUGIN = 'linstor-manager' +# EMEDIUMTYPE constant (124) is not available in python2. 
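+# (124 is "wrong medium type" on Linux; Python 3 exposes it as
+# errno.EMEDIUMTYPE, the hard-coded value keeps this importable on Python 2.)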
+EMEDIUMTYPE = 124 + + +def call_vhd_util_on_host(session, host_ref, method, device_path, args): + try: + response = session.xenapi.host.call_plugin( + host_ref, MANAGER_PLUGIN, method, args + ) + except Exception as e: + util.SMlog('call-plugin ({} with {}) exception: {}'.format( + method, args, e + )) + raise + + util.SMlog('call-plugin ({} with {}) returned: {}'.format( + method, args, response + )) + + return response + + +class LinstorCallException(Exception): + def __init__(self, cmd_err): + self.cmd_err = cmd_err + + def __str__(self): + return str(self.cmd_err) + + +class ErofsLinstorCallException(LinstorCallException): + pass + + +class NoPathLinstorCallException(LinstorCallException): + pass + def linstorhostcall(local_method, remote_method): - def decorated(func): + def decorated(response_parser): def wrapper(*args, **kwargs): self = args[0] vdi_uuid = args[1] @@ -41,50 +78,56 @@ def wrapper(*args, **kwargs): # Try to read locally if the device is not in use or if the device # is up to date and not diskless. - (node_names, in_use) = \ - self._linstor.find_up_to_date_diskfull_nodes(vdi_uuid) + (node_names, in_use_by) = \ + self._linstor.find_up_to_date_diskful_nodes(vdi_uuid) + local_e = None try: - if not in_use or socket.gethostname() in node_names: - return local_method(device_path, *args[2:], **kwargs) - except util.CommandException as e: - # EMEDIUMTYPE constant (124) is not available in python2. - if e.code != errno.EROFS and e.code != 124: - raise + if not in_use_by or socket.gethostname() in node_names: + return self._call_local_vhd_util(local_method, device_path, *args[2:], **kwargs) + except ErofsLinstorCallException as e: + local_e = e.cmd_err + except Exception as e: + local_e = e + + util.SMlog( + 'unable to execute `{}` locally, retry using a readable host... (cause: {})'.format( + remote_method, local_e if local_e else 'local diskless + in use or not up to date' + ) + ) + + if in_use_by: + node_names = {in_use_by} # B. Execute the plugin on master or slave. 
- def exec_remote_method(): - host_ref = self._get_readonly_host( - vdi_uuid, device_path, node_names - ) - args = { - 'devicePath': device_path, - 'groupName': self._linstor.group_name - } - args.update(**kwargs) + remote_args = { + 'devicePath': device_path, + 'groupName': self._linstor.group_name + } + remote_args.update(**kwargs) + remote_args = {str(key): str(value) for key, value in remote_args.iteritems()} - try: - response = self._session.xenapi.host.call_plugin( - host_ref, MANAGER_PLUGIN, remote_method, args - ) - except Exception as e: - util.SMlog('call-plugin ({} with {}) exception: {}'.format( - remote_method, args, e - )) - raise - - util.SMlog('call-plugin ({} with {}) returned: {}'.format( - remote_method, args, response - )) - if response == 'False': - raise xs_errors.XenError( - 'VDIUnavailable', - opterr='Plugin {} failed'.format(MANAGER_PLUGIN) - ) - kwargs['response'] = response - - util.retry(exec_remote_method, 5, 3) - return func(*args, **kwargs) + try: + def remote_call(): + host_ref = self._get_readonly_host(vdi_uuid, device_path, node_names) + return call_vhd_util_on_host(self._session, host_ref, remote_method, device_path, remote_args) + response = util.retry(remote_call, 5, 2) + except Exception as remote_e: + self._raise_openers_exception(device_path, local_e or remote_e) + + return response_parser(self, vdi_uuid, response) + return wrapper + return decorated + + +def linstormodifier(): + def decorated(func): + def wrapper(*args, **kwargs): + self = args[0] + + ret = func(*args, **kwargs) + self._linstor.invalidate_resource_cache() + return ret return wrapper return decorated @@ -94,17 +137,33 @@ def __init__(self, session, linstor): self._session = session self._linstor = linstor + # -------------------------------------------------------------------------- + # Getters: read locally and try on another host in case of failure. 
+ # -------------------------------------------------------------------------- + + def check(self, vdi_uuid, ignore_missing_footer=False, fast=False): + kwargs = { + 'ignoreMissingFooter': ignore_missing_footer, + 'fast': fast + } + return self._check(vdi_uuid, **kwargs) # pylint: disable = E1123 + @linstorhostcall(vhdutil.check, 'check') - def check(self, vdi_uuid, **kwargs): - return distutils.util.strtobool(kwargs['response']) + def _check(self, vdi_uuid, response): + return distutils.util.strtobool(response) def get_vhd_info(self, vdi_uuid, include_parent=True): - kwargs = {'includeParent': str(include_parent)} - return self._get_vhd_info(vdi_uuid, self._extract_uuid, **kwargs) + kwargs = { + 'includeParent': include_parent, + 'resolveParent': False + } + # TODO: Replace pylint comment with this feature when possible: + # https://github.com/PyCQA/pylint/pull/2926 + return self._get_vhd_info(vdi_uuid, self._extract_uuid, **kwargs) # pylint: disable = E1123 @linstorhostcall(vhdutil.getVHDInfo, 'getVHDInfo') - def _get_vhd_info(self, vdi_uuid, *args, **kwargs): - obj = json.loads(kwargs['response']) + def _get_vhd_info(self, vdi_uuid, response): + obj = json.loads(response) vhd_info = vhdutil.VHDInfo(vdi_uuid) vhd_info.sizeVirt = obj['sizeVirt'] @@ -118,35 +177,91 @@ def _get_vhd_info(self, vdi_uuid, *args, **kwargs): return vhd_info @linstorhostcall(vhdutil.hasParent, 'hasParent') - def has_parent(self, vdi_uuid, **kwargs): - return distutils.util.strtobool(kwargs['response']) + def has_parent(self, vdi_uuid, response): + return distutils.util.strtobool(response) def get_parent(self, vdi_uuid): return self._get_parent(vdi_uuid, self._extract_uuid) @linstorhostcall(vhdutil.getParent, 'getParent') - def _get_parent(self, vdi_uuid, *args, **kwargs): - return kwargs['response'] + def _get_parent(self, vdi_uuid, response): + return response @linstorhostcall(vhdutil.getSizeVirt, 'getSizeVirt') - def get_size_virt(self, vdi_uuid, **kwargs): - return int(kwargs['response']) + def get_size_virt(self, vdi_uuid, response): + return int(response) @linstorhostcall(vhdutil.getSizePhys, 'getSizePhys') - def get_size_phys(self, vdi_uuid, **kwargs): - return int(kwargs['response']) + def get_size_phys(self, vdi_uuid, response): + return int(response) @linstorhostcall(vhdutil.getDepth, 'getDepth') - def get_depth(self, vdi_uuid, **kwargs): - return int(kwargs['response']) + def get_depth(self, vdi_uuid, response): + return int(response) @linstorhostcall(vhdutil.getKeyHash, 'getKeyHash') - def get_key_hash(self, vdi_uuid, **kwargs): - return kwargs['response'] or None + def get_key_hash(self, vdi_uuid, response): + return response or None @linstorhostcall(vhdutil.getBlockBitmap, 'getBlockBitmap') - def get_block_bitmap(self, vdi_uuid, **kwargs): - return base64.b64decode(kwargs['response']) + def get_block_bitmap(self, vdi_uuid, response): + return base64.b64decode(response) + + # -------------------------------------------------------------------------- + # Setters: only used locally. 
+ # -------------------------------------------------------------------------- + + @linstormodifier() + def create(self, path, size, static, msize=0): + return self._call_local_vhd_util_or_fail(vhdutil.create, path, size, static, msize) + + @linstormodifier() + def set_size_virt_fast(self, path, size): + return self._call_local_vhd_util_or_fail(vhdutil.setSizeVirtFast, path, size) + + @linstormodifier() + def set_size_phys(self, path, size, debug=True): + return self._call_local_vhd_util_or_fail(vhdutil.setSizePhys, path, size, debug) + + @linstormodifier() + def set_parent(self, path, parentPath, parentRaw=False): + return self._call_local_vhd_util_or_fail(vhdutil.setParent, path, parentPath, parentRaw) + + @linstormodifier() + def set_hidden(self, path, hidden=True): + return self._call_local_vhd_util_or_fail(vhdutil.setHidden, path, hidden) + + @linstormodifier() + def set_key(self, path, key_hash): + return self._call_local_vhd_util_or_fail(vhdutil.setKey, path, key_hash) + + @linstormodifier() + def kill_data(self, path): + return self._call_local_vhd_util_or_fail(vhdutil.killData, path) + + @linstormodifier() + def snapshot(self, path, parent, parentRaw, msize=0, checkEmpty=True): + return self._call_local_vhd_util_or_fail(vhdutil.snapshot, path, parent, parentRaw, msize, checkEmpty) + + # -------------------------------------------------------------------------- + # Remote setters: write locally and try on another host in case of failure. + # -------------------------------------------------------------------------- + + @linstormodifier() + def force_parent(self, path, parentPath, parentRaw=False): + kwargs = { + 'parentPath': str(parentPath), + 'parentRaw': parentRaw + } + return self._call_vhd_util(vhdutil.setParent, 'setParent', path, use_parent=False, **kwargs) + + @linstormodifier() + def force_coalesce(self, path): + return self._call_vhd_util(vhdutil.coalesce, 'coalesce', path, use_parent=True) + + @linstormodifier() + def force_repair(self, path): + return self._call_vhd_util(vhdutil.repair, 'repair', path, use_parent=False) # -------------------------------------------------------------------------- # Helpers. @@ -161,7 +276,7 @@ def _extract_uuid(self, device_path): def _get_readonly_host(self, vdi_uuid, device_path, node_names): """ When vhd-util is called to fetch VDI info we must find a - diskfull DRBD disk to read the data. It's the goal of this function. + diskful DRBD disk to read the data. It's the goal of this function. Why? Because when a VHD is open in RO mode, the LVM layer is used directly to bypass DRBD verifications (we can have only one process that reads/writes to disk with DRBD devices). 
@@ -170,7 +285,7 @@ def _get_readonly_host(self, vdi_uuid, device_path, node_names): if not node_names: raise xs_errors.XenError( 'VDIUnavailable', - opterr='Unable to find diskfull node: {} (path={})' + opterr='Unable to find diskful node: {} (path={})' .format(vdi_uuid, device_path) ) @@ -184,3 +299,134 @@ def _get_readonly_host(self, vdi_uuid, device_path, node_names): opterr='Unable to find a valid host from VDI: {} (path={})' .format(vdi_uuid, device_path) ) + + # -------------------------------------------------------------------------- + + def _raise_openers_exception(self, device_path, e): + if isinstance(e, util.CommandException): + e_str = 'cmd: `{}`, code: `{}`, reason: `{}`'.format(e.cmd, e.code, e.reason) + else: + e_str = str(e) + + e_with_openers = None + try: + volume_uuid = self._linstor.get_volume_uuid_from_device_path( + device_path + ) + e_wrapper = Exception( + e_str + ' (openers: {})'.format( + self._linstor.get_volume_openers(volume_uuid) + ) + ) + except Exception as illformed_e: + e_wrapper = Exception( + e_str + ' (unable to get openers: {})'.format(illformed_e) + ) + util.SMlog('raise opener exception: {}'.format(e_wrapper)) + raise e_wrapper # pylint: disable = E0702 + + def _call_local_vhd_util(self, local_method, device_path, *args, **kwargs): + try: + def local_call(): + try: + return local_method(device_path, *args, **kwargs) + except util.CommandException as e: + if e.code == errno.EROFS or e.code == EMEDIUMTYPE: + raise ErofsLinstorCallException(e) # Break retry calls. + if e.code == errno.ENOENT: + raise NoPathLinstorCallException(e) + raise e + # Retry only locally if it's not an EROFS exception. + return util.retry(local_call, 5, 2, exceptions=[util.CommandException]) + except util.CommandException as e: + util.SMlog('failed to execute locally vhd-util (sys {})'.format(e.code)) + raise e + + def _call_local_vhd_util_or_fail(self, local_method, device_path, *args, **kwargs): + try: + return self._call_local_vhd_util(local_method, device_path, *args, **kwargs) + except ErofsLinstorCallException as e: + # Volume is locked on a host, find openers. + self._raise_openers_exception(device_path, e.cmd_err) + + def _call_vhd_util(self, local_method, remote_method, device_path, use_parent, *args, **kwargs): + # Note: `use_parent` exists to know if the VHD parent is used by the local/remote method. + # Normally in case of failure, if the parent is unused we try to execute the method on + # another host using the DRBD opener list. In the other case, if the parent is required, + # we must check where this last one is open instead of the child. + + # A. Try to write locally... + try: + return self._call_local_vhd_util(local_method, device_path, *args, **kwargs) + except Exception: + pass + + util.SMlog('unable to execute `{}` locally, retry using a writable host...'.format(remote_method)) + + # B. Execute the command on another host. + # B.1. Get host list. + try: + hosts = self._session.xenapi.host.get_all_records() + except Exception as e: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Unable to get host list to run vhd-util command `{}` (path={}): {}' + .format(remote_method, device_path, e) + ) + + # B.2. Prepare remote args. 
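+        # Every value is stringified because XAPI plugin arguments must be a
+        # (string -> string) map; the remote side re-parses booleans/integers.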
+ remote_args = { + 'devicePath': device_path, + 'groupName': self._linstor.group_name + } + remote_args.update(**kwargs) + remote_args = {str(key): str(value) for key, value in remote_args.iteritems()} + + volume_uuid = self._linstor.get_volume_uuid_from_device_path( + device_path + ) + parent_volume_uuid = None + if use_parent: + parent_volume_uuid = self.get_parent(volume_uuid) + + openers_uuid = parent_volume_uuid if use_parent else volume_uuid + + # B.3. Call! + def remote_call(): + try: + all_openers = self._linstor.get_volume_openers(openers_uuid) + except Exception as e: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Unable to get DRBD openers to run vhd-util command `{}` (path={}): {}' + .format(remote_method, device_path, e) + ) + + no_host_found = True + for hostname, openers in all_openers.iteritems(): + if not openers: + continue + + try: + host_ref = next(ref for ref, rec in hosts.iteritems() if rec['hostname'] == hostname) + except StopIteration: + continue + + no_host_found = False + try: + return call_vhd_util_on_host(self._session, host_ref, remote_method, device_path, remote_args) + except Exception: + pass + + if no_host_found: + try: + return local_method(device_path, *args, **kwargs) + except Exception as e: + self._raise_openers_exception(device_path, e) + + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='No valid host found to run vhd-util command `{}` (path=`{}`, openers=`{}`): {}' + .format(remote_method, device_path, openers, e) + ) + return util.retry(remote_call, 5, 2) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 182b88992..a9f39e03c 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -16,15 +16,103 @@ # +import distutils.util +import errno +import glob import json import linstor import os.path import re +import shutil import socket +import stat import time import util +import uuid +# Contains the data of the "/var/lib/linstor" directory. +DATABASE_VOLUME_NAME = 'xcp-persistent-database' +DATABASE_SIZE = 1 << 30 # 1GB. +DATABASE_PATH = '/var/lib/linstor' +DATABASE_MKFS = 'mkfs.ext4' + +REG_DRBDADM_PRIMARY = re.compile("([^\\s]+)\\s+role:Primary") +REG_DRBDSETUP_IP = re.compile('[^\\s]+\\s+(.*):.*$') + +DRBD_BY_RES_PATH = '/dev/drbd/by-res/' + +PLUGIN = 'linstor-manager' + + +# ============================================================================== + +def get_local_volume_openers(resource_name, volume): + if not resource_name or volume is None: + raise Exception('Cannot get DRBD openers without resource name and/or volume.') + + path = '/sys/kernel/debug/drbd/resources/{}/volumes/{}/openers'.format( + resource_name, volume + ) + + with open(path, 'r') as openers: + # Not a big cost, so read all lines directly. + lines = openers.readlines() + + result = {} + + opener_re = re.compile('(.*)\\s+([0-9]+)\\s+([0-9]+)') + for line in lines: + match = opener_re.match(line) + assert match + + groups = match.groups() + process_name = groups[0] + pid = groups[1] + open_duration_ms = groups[2] + result[pid] = { + 'process-name': process_name, + 'open-duration': open_duration_ms + } + + return json.dumps(result) + +def get_all_volume_openers(resource_name, volume): + PLUGIN_CMD = 'getDrbdOpeners' + + volume = str(volume) + openers = {} + + # Make sure this call never stucks because this function can be called + # during HA init and in this case we can wait forever. 
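+    # Each host is expected to answer with the JSON produced by
+    # get_local_volume_openers above, e.g. for an openers line
+    # "tapdisk 6543 12000" (hypothetical values):
+    #   {"6543": {"process-name": "tapdisk", "open-duration": "12000"}}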
+ session = util.timeout_call(10, util.get_localAPI_session) + + hosts = session.xenapi.host.get_all_records() + for host_ref, host_record in hosts.items(): + node_name = host_record['hostname'] + try: + if not session.xenapi.host_metrics.get_record( + host_record['metrics'] + )['live']: + # Ensure we call plugin on online hosts only. + continue + + openers[node_name] = json.loads( + session.xenapi.host.call_plugin(host_ref, PLUGIN, PLUGIN_CMD, { + 'resourceName': resource_name, + 'volume': volume + }) + ) + except Exception as e: + util.SMlog('Failed to get openers of `{}` on `{}`: {}'.format( + resource_name, node_name, e + )) + + return openers + + +# ============================================================================== + def round_up(value, divisor): assert divisor divisor = int(divisor) @@ -37,6 +125,148 @@ def round_down(value, divisor): return value - (value % int(divisor)) +# ============================================================================== + +def get_remote_host_ip(node_name): + (ret, stdout, stderr) = util.doexec([ + 'drbdsetup', 'show', DATABASE_VOLUME_NAME, '--json' + ]) + if ret != 0: + return + + try: + conf = json.loads(stdout) + if not conf: + return + + for connection in conf[0]['connections']: + if connection['net']['_name'] == node_name: + value = connection['path']['_remote_host'] + res = REG_DRBDSETUP_IP.match(value) + if res: + return res.groups()[0] + break + except Exception: + pass + + +def _get_controller_uri(): + PLUGIN_CMD = 'hasControllerRunning' + + # Try to find controller using drbdadm. + (ret, stdout, stderr) = util.doexec([ + 'drbdadm', 'status', DATABASE_VOLUME_NAME + ]) + if ret == 0: + # If we are here, the database device exists locally. + + if stdout.startswith('{} role:Primary'.format(DATABASE_VOLUME_NAME)): + # Nice case, we have the controller running on this local host. + return 'linstor://localhost' + + # Try to find the host using DRBD connections. + res = REG_DRBDADM_PRIMARY.search(stdout) + if res: + node_name = res.groups()[0] + ip = get_remote_host_ip(node_name) + if ip: + return 'linstor://' + ip + + # Worst case: we use many hosts in the pool (>= 4), so we can't find the + # primary using drbdadm because we don't have all connections to the + # replicated volume. `drbdadm status xcp-persistent-database` returns + # 3 connections by default. + try: + session = util.timeout_call(10, util.get_localAPI_session) + + for host_ref, host_record in session.xenapi.host.get_all_records().items(): + node_name = host_record['hostname'] + try: + if distutils.util.strtobool( + session.xenapi.host.call_plugin(host_ref, PLUGIN, PLUGIN_CMD, {}) + ): + return 'linstor://' + host_record['address'] + except Exception as e: + # Can throw and exception if a host is offline. So catch it. + util.SMlog('Unable to search controller on `{}`: {}'.format( + node_name, e + )) + except: + # Not found, maybe we are trying to create the SR... 
+ pass + +def get_controller_uri(): + retries = 0 + while True: + uri = _get_controller_uri() + if uri: + return uri + + retries += 1 + if retries >= 10: + break + time.sleep(1) + + +def get_controller_node_name(): + PLUGIN_CMD = 'hasControllerRunning' + + (ret, stdout, stderr) = util.doexec([ + 'drbdadm', 'status', DATABASE_VOLUME_NAME + ]) + + if ret == 0: + if stdout.startswith('{} role:Primary'.format(DATABASE_VOLUME_NAME)): + return 'localhost' + + res = REG_DRBDADM_PRIMARY.search(stdout) + if res: + return res.groups()[0] + + session = util.timeout_call(5, util.get_localAPI_session) + + for host_ref, host_record in session.xenapi.host.get_all_records().items(): + node_name = host_record['hostname'] + try: + if not session.xenapi.host_metrics.get_record( + host_record['metrics'] + )['live']: + continue + + if distutils.util.strtobool(session.xenapi.host.call_plugin( + host_ref, PLUGIN, PLUGIN_CMD, {} + )): + return node_name + except Exception as e: + util.SMlog('Failed to call plugin to get controller on `{}`: {}'.format( + node_name, e + )) + + +def demote_drbd_resource(node_name, resource_name): + PLUGIN_CMD = 'demoteDrbdResource' + + session = util.timeout_call(5, util.get_localAPI_session) + + for host_ref, host_record in session.xenapi.host.get_all_records().items(): + if host_record['hostname'] != node_name: + continue + + try: + session.xenapi.host.call_plugin( + host_ref, PLUGIN, PLUGIN_CMD, {'resource_name': resource_name} + ) + except Exception as e: + util.SMlog('Failed to demote resource `{}` on `{}`: {}'.format( + resource_name, node_name, e + )) + raise Exception( + 'Can\'t demote resource `{}`, unable to find node `{}`' + .format(resource_name, node_name) + ) + +# ============================================================================== + class LinstorVolumeManagerError(Exception): ERR_GENERIC = 0, ERR_VOLUME_EXISTS = 1, @@ -50,6 +280,7 @@ def __init__(self, message, code=ERR_GENERIC): def code(self): return self._code + # ============================================================================== # Note: @@ -63,7 +294,17 @@ class LinstorVolumeManager(object): A volume in this context is a physical part of the storage layer. """ - DEV_ROOT_PATH = '/dev/drbd/by-res/' + __slots__ = ( + '_linstor', '_logger', + '_uri', '_base_group_name', + '_redundancy', '_group_name', + '_volumes', '_storage_pools', + '_storage_pools_time', + '_kv_cache', '_resource_cache', '_volume_info_cache', + '_kv_cache_dirty', '_resource_cache_dirty', '_volume_info_cache_dirty' + ) + + DEV_ROOT_PATH = DRBD_BY_RES_PATH # Default LVM extent size. BLOCK_SIZE = 4 * 1024 * 1024 @@ -90,7 +331,7 @@ class LinstorVolumeManager(object): # Property namespaces. NAMESPACE_SR = 'xcp/sr' - NAMESPACE_VOLUME = 'volume' + NAMESPACE_VOLUME = 'xcp/volume' # Regex to match properties. REG_PROP = '^([^/]+)/{}$' @@ -106,6 +347,10 @@ class LinstorVolumeManager(object): PREFIX_SR = 'xcp-sr-' PREFIX_VOLUME = 'xcp-volume-' + # Limit request number when storage pool info is asked, we fetch + # the current pool status after N elapsed seconds. + STORAGE_POOLS_FETCH_INTERVAL = 15 + @staticmethod def default_logger(*args): print(args) @@ -117,38 +362,43 @@ def default_logger(*args): class VolumeInfo(object): __slots__ = ( 'name', - 'physical_size', # Total physical size used by this volume on - # all disks. - 'virtual_size' # Total virtual available size of this volume - # (i.e. the user size at creation). + 'allocated_size', # Allocated size, place count is not used. 
+ 'virtual_size', # Total virtual available size of this volume + # (i.e. the user size at creation). + 'diskful' # Array of nodes that have a diskful volume. ) def __init__(self, name): self.name = name - self.physical_size = 0 + self.allocated_size = 0 self.virtual_size = 0 + self.diskful = [] def __repr__(self): - return 'VolumeInfo("{}", {}, {})'.format( - self.name, self.physical_size, self.virtual_size + return 'VolumeInfo("{}", {}, {}, {})'.format( + self.name, self.allocated_size, self.virtual_size, + self.diskful ) # -------------------------------------------------------------------------- def __init__( - self, uri, group_name, repair=False, logger=default_logger.__func__ + self, uri, group_name, repair=False, logger=default_logger.__func__, + attempt_count=30 ): """ - Create a new LinstorApi object. + Create a new LinstorVolumeManager object. :param str uri: URI to communicate with the LINSTOR controller. :param str group_name: The SR goup name to use. :param bool repair: If true we try to remove bad volumes due to a crash or unexpected behavior. :param function logger: Function to log messages. + :param int attempt_count: Number of attempts to join the controller. """ - self._uri = uri - self._linstor = self._create_linstor_instance(uri) + self._linstor = self._create_linstor_instance( + uri, attempt_count=attempt_count + ) self._base_group_name = group_name # Ensure group exists. @@ -164,6 +414,16 @@ def __init__( self._logger = logger self._redundancy = groups[0].select_filter.place_count self._group_name = group_name + self._volumes = set() + self._storage_pools_time = 0 + + # To increate performance and limit request count to LINSTOR services, + # we use caches. + self._kv_cache = self._create_kv_cache() + self._resource_cache = None + self._resource_cache_dirty = True + self._volume_info_cache = None + self._volume_info_cache_dirty = True self._build_volumes(repair=repair) @property @@ -175,6 +435,15 @@ def group_name(self): """ return self._base_group_name + @property + def redundancy(self): + """ + Give the used redundancy. + :return: The redundancy. + :rtype: int + """ + return self._redundancy + @property def volumes(self): """ @@ -184,66 +453,6 @@ def volumes(self): """ return self._volumes - @property - def volumes_with_name(self): - """ - Give a volume dictionnary that contains names actually owned. - :return: A volume/name dict. - :rtype: dict(str, str) - """ - return self._get_volumes_by_property(self.REG_VOLUME_NAME) - - @property - def volumes_with_info(self): - """ - Give a volume dictionnary that contains VolumeInfos. - :return: A volume/VolumeInfo dict. - :rtype: dict(str, VolumeInfo) - """ - - volumes = {} - - all_volume_info = self._get_volumes_info() - volume_names = self.volumes_with_name - for volume_uuid, volume_name in volume_names.items(): - if volume_name: - volume_info = all_volume_info.get(volume_name) - if volume_info: - volumes[volume_uuid] = volume_info - continue - - # Well I suppose if this volume is not available, - # LINSTOR has been used directly without using this API. - volumes[volume_uuid] = self.VolumeInfo('') - - return volumes - - @property - def volumes_with_metadata(self): - """ - Give a volume dictionnary that contains metadata. - :return: A volume/metadata dict. 
- :rtype: dict(str, dict) - """ - - volumes = {} - - metadata = self._get_volumes_by_property(self.REG_METADATA) - for volume_uuid, volume_metadata in metadata.items(): - if volume_metadata: - volume_metadata = json.loads(volume_metadata) - if isinstance(volume_metadata, dict): - volumes[volume_uuid] = volume_metadata - continue - raise LinstorVolumeManagerError( - 'Expected dictionary in volume metadata: {}' - .format(volume_uuid) - ) - - volumes[volume_uuid] = {} - - return volumes - @property def max_volume_size_allowed(self): """ @@ -284,26 +493,67 @@ def physical_free_size(self): return self._compute_size('free_capacity') @property - def total_allocated_volume_size(self): + def allocated_volume_size(self): """ - Give the sum of all created volumes. - :return: The physical required size to use the volumes. + Give the allocated size for all volumes. The place count is not + used here. When thick lvm is used, the size for one volume should + be equal to the virtual volume size. With thin lvm, the size is equal + or lower to the volume size. + :return: The allocated size of all volumes. :rtype: int """ - size = 0 - for resource in self._linstor.resource_list_raise().resources: + # Paths: /res_name/vol_number/size + sizes = {} + + for resource in self._get_resource_cache().resources: + if resource.name not in sizes: + current = sizes[resource.name] = {} + else: + current = sizes[resource.name] + for volume in resource.volumes: # We ignore diskless pools of the form "DfltDisklessStorPool". - if volume.storage_pool_name == self._group_name: - current_size = volume.usable_size - if current_size < 0: - raise LinstorVolumeManagerError( - 'Failed to get usable size of `{}` on `{}`' - .format(resource.name, volume.storage_pool_name) - ) - size += current_size - return size * 1024 + if volume.storage_pool_name != self._group_name: + continue + + current_size = volume.allocated_size + if current_size < 0: + raise LinstorVolumeManagerError( + 'Failed to get allocated size of `{}` on `{}`' + .format(resource.name, volume.storage_pool_name) + ) + current[volume.number] = max(current_size, current.get(volume.number) or 0) + + total_size = 0 + for volumes in sizes.itervalues(): + for size in volumes.itervalues(): + total_size += size + + return total_size * 1024 + + def get_min_physical_size(self): + """ + Give the minimum physical size of the SR. + I.e. the size of the smallest disk + the number of pools. + :return: The physical min size. 
+ :rtype: tuple(int, int) + """ + size = None + pool_count = 0 + for pool in self._get_storage_pools(force=True): + space = pool.free_space + if space: + pool_count += 1 + current_size = space.total_capacity + if current_size < 0: + raise LinstorVolumeManagerError( + 'Failed to get pool total_capacity attr of `{}`' + .format(pool.node_name) + ) + if size is None or current_size < size: + size = current_size + return (pool_count, (size or 0) * 1024) @property def metadata(self): @@ -346,12 +596,8 @@ def disconnected_hosts(self): :rtype: set(str) """ - pools = self._linstor.storage_pool_list_raise( - filter_by_stor_pools=[self._group_name] - ).storage_pools - disconnected_hosts = set() - for pool in pools: + for pool in self._get_storage_pools(): for report in pool.reports: if report.ret_code & linstor.consts.WARN_NOT_CONNECTED == \ linstor.consts.WARN_NOT_CONNECTED: @@ -367,23 +613,33 @@ def check_volume_exists(self, volume_uuid): """ return volume_uuid in self._volumes - def create_volume(self, volume_uuid, size, persistent=True): + def create_volume( + self, volume_uuid, size, persistent=True, volume_name=None, + no_diskless=False + ): """ Create a new volume on the SR. :param str volume_uuid: The volume uuid to use. :param int size: volume size in B. :param bool persistent: If false the volume will be unavailable on the next constructor call LinstorSR(...). + :param str volume_name: If set, this name is used in the LINSTOR + database instead of a generated name. + :param bool no_diskless: If set, the default group redundancy is not + used, instead the volume is created on all nodes. :return: The current device path of the volume. :rtype: str """ self._logger('Creating LINSTOR volume {}...'.format(volume_uuid)) - volume_name = self.build_volume_name(util.gen_uuid()) + if not volume_name: + volume_name = self.build_volume_name(util.gen_uuid()) volume_properties = self._create_volume_with_properties( - volume_uuid, volume_name, size, place_resources=True + volume_uuid, volume_name, size, place_resources=True, + no_diskless=no_diskless ) + # Volume created! Now try to find the device path. try: self._logger( 'Find device path of LINSTOR volume {}...'.format(volume_uuid) @@ -396,8 +652,10 @@ def create_volume(self, volume_uuid, size, persistent=True): 'LINSTOR volume {} created!'.format(volume_uuid) ) return device_path - except Exception: - self._force_destroy_volume(volume_uuid, volume_properties) + except Exception as e: + # There is an issue to find the path. + # At this point the volume has just been created, so force flag can be used. + self._destroy_volume(volume_uuid, force=True) raise def mark_volume_as_persistent(self, volume_uuid): @@ -426,7 +684,7 @@ def destroy_volume(self, volume_uuid): volume_properties[self.PROP_NOT_EXISTS] = self.STATE_NOT_EXISTS self._volumes.remove(volume_uuid) - self._destroy_volume(volume_uuid, volume_properties) + self._destroy_volume(volume_uuid) def lock_volume(self, volume_uuid, locked=True): """ @@ -476,12 +734,15 @@ def ensure_volume_list_is_not_locked(self, volume_uuids, timeout=None): waiting = False + volume_properties = self._get_kv_cache() + start = time.time() while True: # Can't delete in for loop, use a copy of the list. 
remaining = checked.copy() for volume_uuid in checked: - volume_properties = self._get_volume_properties(volume_uuid) + volume_properties.namespace = \ + self._build_volume_namespace(volume_uuid) timestamp = volume_properties.get( self.PROP_IS_READONLY_TIMESTAMP ) @@ -519,6 +780,7 @@ def ensure_volume_list_is_not_locked(self, volume_uuids, timeout=None): # We must wait to use the volume. After that we can modify it # ONLY if the SR is locked to avoid bad reads on the slaves. time.sleep(1) + volume_properties = self._create_kv_cache() if waiting: self._logger('No volume locked now!') @@ -542,6 +804,9 @@ def resize_volume(self, volume_uuid, new_size): volume_nr=0, size=new_size // 1024 ) + + self._mark_resource_cache_as_dirty() + error_str = self._get_error_str(result) if error_str: raise LinstorVolumeManagerError( @@ -587,6 +852,25 @@ def get_volume_size(self, volume_uuid): ) return size * 1024 + + def set_auto_promote_timeout(self, volume_uuid, timeout): + """ + Define the blocking time of open calls when a DRBD + is already open on another host. + :param str volume_uuid: The volume uuid to modify. + """ + + volume_name = self.get_volume_name(volume_uuid) + result = self._linstor.resource_dfn_modify(volume_name, { + 'DrbdOptions/Resource/auto-promote-timeout': timeout + }) + error_str = self._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not change the auto promote timeout of `{}`: {}' + .format(volume_uuid, error_str) + ) + def get_volume_info(self, volume_uuid): """ Get the volume info of a particular volume. @@ -596,7 +880,7 @@ def get_volume_info(self, volume_uuid): """ volume_name = self.get_volume_name(volume_uuid) - return self._get_volumes_info(filter=[volume_name])[volume_name] + return self._get_volumes_info()[volume_name] def get_device_path(self, volume_uuid): """ @@ -620,7 +904,7 @@ def get_volume_uuid_from_device_path(self, device_path): expected_volume_name = \ self.get_volume_name_from_device_path(device_path) - volume_names = self.volumes_with_name + volume_names = self.get_volumes_with_name() for volume_uuid, volume_name in volume_names.items(): if volume_name == expected_volume_name: return volume_uuid @@ -631,26 +915,24 @@ def get_volume_uuid_from_device_path(self, device_path): def get_volume_name_from_device_path(self, device_path): """ - Get the volume name of a device_path on the current host. + Get the volume name of a device_path. :param str device_path: The dev path to find the volume name. - :return: The volume name of the local device path. + :return: The volume name of the device path. 
:rtype: str """ - node_name = socket.gethostname() - resources = self._linstor.resource_list_raise( - filter_by_nodes=[node_name] - ).resources - - real_device_path = os.path.realpath(device_path) - for resource in resources: - if resource.volumes[0].device_path == real_device_path: - return resource.name + # Assume that we have a path like this: + # - "/dev/drbd/by-res/xcp-volume-/0" + # - "../xcp-volume-/0" + if device_path.startswith(DRBD_BY_RES_PATH): + prefix_len = len(DRBD_BY_RES_PATH) + else: + assert device_path.startswith('../') + prefix_len = 3 - raise LinstorVolumeManagerError( - 'Unable to find volume name from dev path `{}`' - .format(device_path) - ) + res_name_end = device_path.find('/', prefix_len) + assert res_name_end != -1 + return device_path[prefix_len:res_name_end] def update_volume_uuid(self, volume_uuid, new_volume_uuid, force=False): """ @@ -664,6 +946,8 @@ def update_volume_uuid(self, volume_uuid, new_volume_uuid, force=False): deleted VDI. """ + assert volume_uuid != new_volume_uuid + self._logger( 'Trying to update volume UUID {} to {}...' .format(volume_uuid, new_volume_uuid) @@ -685,36 +969,45 @@ def update_volume_uuid(self, volume_uuid, new_volume_uuid, force=False): .format(volume_uuid) ) - new_volume_properties = self._get_volume_properties( + # 1. Copy in temp variables metadata and volume_name. + metadata = volume_properties.get(self.PROP_METADATA) + volume_name = volume_properties.get(self.PROP_VOLUME_NAME) + + # 2. Switch to new volume namespace. + volume_properties.namespace = self._build_volume_namespace( new_volume_uuid ) - if list(new_volume_properties.items()): + + if list(volume_properties.items()): raise LinstorVolumeManagerError( 'Cannot update volume uuid {} to {}: ' .format(volume_uuid, new_volume_uuid) + 'this last one is not empty' ) - assert volume_properties.namespace != \ - new_volume_properties.namespace - try: - # 1. Mark new volume properties with PROP_UPDATING_UUID_SRC. + # 3. Mark new volume properties with PROP_UPDATING_UUID_SRC. # If we crash after that, the new properties can be removed # properly. - new_volume_properties[self.PROP_NOT_EXISTS] = self.STATE_NOT_EXISTS - new_volume_properties[self.PROP_UPDATING_UUID_SRC] = volume_uuid - - # 2. Copy the properties. - for property in [self.PROP_METADATA, self.PROP_VOLUME_NAME]: - new_volume_properties[property] = \ - volume_properties.get(property) + volume_properties[self.PROP_NOT_EXISTS] = self.STATE_NOT_EXISTS + volume_properties[self.PROP_UPDATING_UUID_SRC] = volume_uuid + + # 4. Copy the properties. + # Note: On new volumes, during clone for example, the metadata + # may be missing. So we must test it to avoid this error: + # "None has to be a str/unicode, but is " + if metadata: + volume_properties[self.PROP_METADATA] = metadata + volume_properties[self.PROP_VOLUME_NAME] = volume_name - # 3. Ok! - new_volume_properties[self.PROP_NOT_EXISTS] = self.STATE_EXISTS + # 5. Ok! + volume_properties[self.PROP_NOT_EXISTS] = self.STATE_EXISTS except Exception as e: try: - new_volume_properties.clear() + # Clear the new volume properties in case of failure. + assert volume_properties.namespace == \ + self._build_volume_namespace(new_volume_uuid) + volume_properties.clear() except Exception as e: self._logger( 'Failed to clear new volume properties: {} (ignoring...)' @@ -725,11 +1018,21 @@ def update_volume_uuid(self, volume_uuid, new_volume_uuid, force=False): ) try: - # 4. After this point, it's ok we can remove the + # 6. 
After this point, it's ok we can remove the # PROP_UPDATING_UUID_SRC property and clear the src properties # without problems. + + # 7. Switch to old volume namespace. + volume_properties.namespace = self._build_volume_namespace( + volume_uuid + ) volume_properties.clear() - new_volume_properties.pop(self.PROP_UPDATING_UUID_SRC) + + # 8. Switch a last time to new volume namespace. + volume_properties.namespace = self._build_volume_namespace( + new_volume_uuid + ) + volume_properties.pop(self.PROP_UPDATING_UUID_SRC) except Exception as e: raise LinstorVolumeManagerError( 'Failed to clear volume properties ' @@ -743,7 +1046,7 @@ def update_volume_uuid(self, volume_uuid, new_volume_uuid, force=False): 'UUID update succeeded of {} to {}! (properties={})' .format( volume_uuid, new_volume_uuid, - self._get_filtered_properties(new_volume_properties) + self._get_filtered_properties(volume_properties) ) ) @@ -788,6 +1091,73 @@ def get_usage_states(self, volume_uuid): return states + def get_volume_openers(self, volume_uuid): + """ + Get openers of a volume. + :param str volume_uuid: The volume uuid to monitor. + :return: A dictionnary that contains openers. + :rtype: dict(str, obj) + """ + return get_all_volume_openers(self.get_volume_name(volume_uuid), '0') + + + def get_volumes_with_name(self): + """ + Give a volume dictionnary that contains names actually owned. + :return: A volume/name dict. + :rtype: dict(str, str) + """ + return self._get_volumes_by_property(self.REG_VOLUME_NAME) + + def get_volumes_with_info(self): + """ + Give a volume dictionnary that contains VolumeInfos. + :return: A volume/VolumeInfo dict. + :rtype: dict(str, VolumeInfo) + """ + + volumes = {} + + all_volume_info = self._get_volumes_info() + volume_names = self.get_volumes_with_name() + for volume_uuid, volume_name in volume_names.items(): + if volume_name: + volume_info = all_volume_info.get(volume_name) + if volume_info: + volumes[volume_uuid] = volume_info + continue + + # Well I suppose if this volume is not available, + # LINSTOR has been used directly without using this API. + volumes[volume_uuid] = self.VolumeInfo('') + + return volumes + + def get_volumes_with_metadata(self): + """ + Give a volume dictionnary that contains metadata. + :return: A volume/metadata dict. + :rtype: dict(str, dict) + """ + + volumes = {} + + metadata = self._get_volumes_by_property(self.REG_METADATA) + for volume_uuid, volume_metadata in metadata.items(): + if volume_metadata: + volume_metadata = json.loads(volume_metadata) + if isinstance(volume_metadata, dict): + volumes[volume_uuid] = volume_metadata + continue + raise LinstorVolumeManagerError( + 'Expected dictionary in volume metadata: {}' + .format(volume_uuid) + ) + + volumes[volume_uuid] = {} + + return volumes + def get_volume_metadata(self, volume_uuid): """ Get the metadata of a volume. @@ -910,17 +1280,11 @@ def find_best_nodes(): rsc_name=clone_volume_name, storage_pool=self._group_name )) - for node_name in diskless_node_names: - resources.append(linstor.ResourceData( - node_name=node_name, - rsc_name=clone_volume_name, - diskless=True - )) # 5. Create resources! 
- def clean(properties): + def clean(): try: - self._destroy_volume(clone_uuid, properties) + self._destroy_volume(clone_uuid, force=True) except Exception as e: self._logger( 'Unable to destroy volume {} after shallow clone fail: {}' @@ -928,12 +1292,16 @@ def clean(properties): ) def create(): - try: - volume_properties = self._create_volume_with_properties( - clone_uuid, clone_volume_name, size, - place_resources=False - ) + # Note: placed outside try/except block because we create only definition first. + # There is no reason to call `clean` before the real resource creation. + volume_properties = self._create_volume_with_properties( + clone_uuid, clone_volume_name, size, + place_resources=False + ) + # After this point, `clean` can be called for any fail because the clone UUID + # is really unique. No risk to remove existing data. + try: result = self._linstor.resource_create(resources) error_str = self._get_error_str(result) if error_str: @@ -946,7 +1314,7 @@ def create(): ) return volume_properties except Exception: - clean(volume_properties) + clean() raise # Retry because we can get errors like this: @@ -962,7 +1330,7 @@ def create(): self._volumes.add(clone_uuid) return device_path except Exception as e: - clean(volume_properties) + clean() raise def remove_resourceless_volumes(self): @@ -974,83 +1342,337 @@ def remove_resourceless_volumes(self): """ resource_names = self._fetch_resource_names() - for volume_uuid, volume_name in self.volumes_with_name.items(): + for volume_uuid, volume_name in self.get_volumes_with_name().items(): if not volume_name or volume_name not in resource_names: + # Don't force, we can be sure of what's happening. self.destroy_volume(volume_uuid) - def destroy(self, force=False): + def destroy(self): """ Destroy this SR. Object should not be used after that. :param bool force: Try to destroy volumes before if true. """ - if (force): - for volume_uuid in self._volumes: - self.destroy_volume(volume_uuid) + if self._volumes: + raise LinstorVolumeManagerError( + 'Cannot destroy LINSTOR volume manager: ' + 'It exists remaining volumes' + ) + + controller_is_running = self._controller_is_running() + uri = 'linstor://localhost' + try: + if controller_is_running: + self._start_controller(start=False) + + # 1. Umount LINSTOR database. + self._mount_database_volume( + self.build_device_path(DATABASE_VOLUME_NAME), + mount=False, + force=True + ) + + # 2. Refresh instance. + self._start_controller(start=True) + self._linstor = self._create_linstor_instance( + uri, keep_uri_unmodified=True + ) - # TODO: Throw exceptions in the helpers below if necessary. - # TODO: What's the required action if it exists remaining volumes? + # 3. Destroy database volume. + self._destroy_resource(DATABASE_VOLUME_NAME) - self._destroy_resource_group(self._linstor, self._group_name) + # 4. Destroy group and storage pools. 
+ self._destroy_resource_group(self._linstor, self._group_name) + for pool in self._get_storage_pools(force=True): + self._destroy_storage_pool( + self._linstor, pool.name, pool.node_name + ) + except Exception as e: + self._start_controller(start=controller_is_running) + raise e - pools = self._linstor.storage_pool_list_raise( - filter_by_stor_pools=[self._group_name] - ).storage_pools - for pool in pools: - self._destroy_storage_pool( - self._linstor, pool.name, pool.node_name + try: + self._start_controller(start=False) + for file in glob.glob(DATABASE_PATH + '/'): + os.remove(file) + except Exception as e: + util.SMlog( + 'Ignoring failure after LINSTOR SR destruction: {}' + .format(e) ) - def find_up_to_date_diskfull_nodes(self, volume_uuid): + def find_up_to_date_diskful_nodes(self, volume_uuid): """ - Find all nodes that contain a specific volume using diskfull disks. + Find all nodes that contain a specific volume using diskful disks. The disk must be up to data to be used. :param str volume_uuid: The volume to use. :return: The available nodes. - :rtype: tuple(set(str), bool) + :rtype: tuple(set(str), str) """ volume_name = self.get_volume_name(volume_uuid) - in_use = False + in_use_by = None node_names = set() - resource_list = self._linstor.resource_list_raise( - filter_by_resources=[volume_name] + + resource_states = filter( + lambda resource_state: resource_state.name == volume_name, + self._get_resource_cache().resource_states ) - for resource_state in resource_list.resource_states: + + for resource_state in resource_states: volume_state = resource_state.volume_states[0] if volume_state.disk_state == 'UpToDate': node_names.add(resource_state.node_name) if resource_state.in_use: - in_use = True + in_use_by = resource_state.node_name + + return (node_names, in_use_by) + + def invalidate_resource_cache(self): + """ + If resources are impacted by external commands like vhdutil, + it's necessary to call this function to invalidate current resource + cache. + """ + self._mark_resource_cache_as_dirty() - return (node_names, in_use) + def has_node(self, node_name): + """ + Check if a node exists in the LINSTOR database. + :rtype: bool + """ + result = self._linstor.node_list() + error_str = self._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Failed to list nodes using `{}`: {}' + .format(node_name, error_str) + ) + return bool(result[0].node(node_name)) + + def create_node(self, node_name, ip): + """ + Create a new node in the LINSTOR database. + :param str node_name: Node name to use. + :param str ip: Host IP to communicate. + """ + result = self._linstor.node_create( + node_name, + linstor.consts.VAL_NODE_TYPE_CMBD, + ip + ) + errors = self._filter_errors(result) + if errors: + error_str = self._get_error_str(errors) + raise LinstorVolumeManagerError( + 'Failed to create node `{}`: {}'.format(node_name, error_str) + ) + + def destroy_node(self, node_name): + """ + Destroy a node in the LINSTOR database. + :param str node_name: Node name to remove. + """ + result = self._linstor.node_delete(node_name) + errors = self._filter_errors(result) + if errors: + error_str = self._get_error_str(errors) + raise LinstorVolumeManagerError( + 'Failed to destroy node `{}`: {}'.format(node_name, error_str) + ) + + def get_nodes_info(self): + """ + Get all nodes + statuses, used or not by the pool. 
+ :rtype: dict(str, dict) + """ + try: + nodes = {} + for node in self._linstor.node_list_raise().nodes: + nodes[node.name] = node.connection_status + return nodes + except Exception as e: + raise LinstorVolumeManagerError( + 'Failed to get all nodes: `{}`'.format(e) + ) + + def get_storage_pools_info(self): + """ + Give all storage pools of current group name. + :rtype: dict(str, list) + """ + storage_pools = {} + for pool in self._get_storage_pools(force=True): + if pool.node_name not in storage_pools: + storage_pools[pool.node_name] = [] + + size = -1 + capacity = -1 + + space = pool.free_space + if space: + size = space.free_capacity + if size < 0: + size = -1 + else: + size *= 1024 + capacity = space.total_capacity + if capacity <= 0: + capacity = -1 + else: + capacity *= 1024 + + storage_pools[pool.node_name].append({ + 'storage-pool-name': pool.name, + 'uuid': pool.uuid, + 'free-size': size, + 'capacity': capacity + }) + + return storage_pools + + def get_resources_info(self): + """ + Give all resources of current group name. + :rtype: dict(str, list) + """ + resources = {} + resource_list = self._linstor.resource_list_raise() + for resource in resource_list.resources: + if resource.name not in resources: + resources[resource.name] = {} + + resources[resource.name][resource.node_name] = { + 'volumes': [], + 'diskful': linstor.consts.FLAG_DISKLESS not in resource.flags, + 'tie-breaker': linstor.consts.FLAG_TIE_BREAKER in resource.flags + } + + for volume in resource.volumes: + # We ignore diskless pools of the form "DfltDisklessStorPool". + if volume.storage_pool_name != self._group_name: + continue + + usable_size = volume.usable_size + if usable_size < 0: + usable_size = -1 + else: + usable_size *= 1024 + + allocated_size = volume.allocated_size + if allocated_size < 0: + allocated_size = -1 + else: + allocated_size *= 1024 + + resources[resource.name][resource.node_name]['volumes'].append({ + 'storage-pool-name': volume.storage_pool_name, + 'uuid': volume.uuid, + 'number': volume.number, + 'device-path': volume.device_path, + 'usable-size': usable_size, + 'allocated-size': allocated_size + }) + + for resource_state in resource_list.resource_states: + resource = resources[resource_state.rsc_name][resource_state.node_name] + resource['in-use'] = resource_state.in_use + + volumes = resource['volumes'] + for volume_state in resource_state.volume_states: + volume = next((x for x in volumes if x['number'] == volume_state.number), None) + if volume: + volume['disk-state'] = volume_state.disk_state + + return resources @classmethod def create_sr( - cls, uri, group_name, node_names, redundancy, - thin_provisioning=False, + cls, group_name, ips, redundancy, + thin_provisioning, auto_quorum, logger=default_logger.__func__ ): """ Create a new SR on the given nodes. - :param str uri: URI to communicate with the LINSTOR controller. :param str group_name: The SR group_name to use. - :param list[str] node_names: String list of nodes. + :param set(str) ips: Node ips. :param int redundancy: How many copy of volumes should we store? + :param bool thin_provisioning: Use thin or thick provisioning. + :param bool auto_quorum: DB quorum is monitored by LINSTOR. :param function logger: Function to log messages. :return: A new LinstorSr instance. 
:rtype: LinstorSr """ + try: + cls._start_controller(start=True) + sr = cls._create_sr( + group_name, + ips, + redundancy, + thin_provisioning, + auto_quorum, + logger + ) + finally: + # Controller must be stopped and volume unmounted because + # it is the role of the drbd-reactor daemon to do the right + # actions. + cls._start_controller(start=False) + cls._mount_volume( + cls.build_device_path(DATABASE_VOLUME_NAME), + DATABASE_PATH, + mount=False + ) + return sr + + @classmethod + def _create_sr( + cls, group_name, ips, redundancy, + thin_provisioning, auto_quorum, + logger=default_logger.__func__ + ): # 1. Check if SR already exists. - lin = cls._create_linstor_instance(uri) + uri = 'linstor://localhost' + + lin = cls._create_linstor_instance(uri, keep_uri_unmodified=True) + + node_names = ips.keys() + for node_name, ip in ips.iteritems(): + while True: + # Try to create node. + result = lin.node_create( + node_name, + linstor.consts.VAL_NODE_TYPE_CMBD, + ip + ) + + errors = cls._filter_errors(result) + if cls._check_errors( + errors, [linstor.consts.FAIL_EXISTS_NODE] + ): + # If it already exists, remove, then recreate. + result = lin.node_delete(node_name) + error_str = cls._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Failed to remove old node `{}`: {}' + .format(node_name, error_str) + ) + elif not errors: + break # Created! + else: + raise LinstorVolumeManagerError( + 'Failed to create node `{}` with ip `{}`: {}'.format( + node_name, ip, cls._get_error_str(errors) + ) + ) + driver_pool_name = group_name + base_group_name = group_name group_name = cls._build_group_name(group_name) pools = lin.storage_pool_list_raise(filter_by_stor_pools=[group_name]) - - # TODO: Maybe if the SR already exists and if the nodes are the same, - # we can try to use it directly. pools = pools.storage_pools if pools: existing_node_names = [pool.node_name for pool in pools] @@ -1076,9 +1698,14 @@ def create_sr( ) # 2. Create storage pool on each node + resource group. + reg_volume_group_not_found = re.compile( + ".*Volume group '.*' not found$" + ) + i = 0 try: # 2.a. Create storage pools. + storage_pool_count = 0 while i < len(node_names): node_name = node_names[i] @@ -1089,17 +1716,35 @@ def create_sr( driver_pool_name=driver_pool_name ) - error_str = cls._get_error_str(result) - if error_str: - raise LinstorVolumeManagerError( - 'Could not create SP `{}` on node `{}`: {}'.format( - group_name, - node_name, - error_str + errors = linstor.Linstor.filter_api_call_response_errors( + result + ) + if errors: + if len(errors) == 1 and errors[0].is_error( + linstor.consts.FAIL_STOR_POOL_CONFIGURATION_ERROR + ) and reg_volume_group_not_found.match(errors[0].message): + logger( + 'Volume group `{}` not found on `{}`. Ignoring...' + .format(group_name, node_name) ) - ) + cls._destroy_storage_pool(lin, group_name, node_name) + else: + error_str = cls._get_error_str(result) + raise LinstorVolumeManagerError( + 'Could not create SP `{}` on node `{}`: {}' + .format(group_name, node_name, error_str) + ) + else: + storage_pool_count += 1 i += 1 + if not storage_pool_count: + raise LinstorVolumeManagerError( + 'Unable to create SR `{}`: No VG group found'.format( + group_name, + ) + ) + # 2.b. Create resource group. result = lin.resource_group_create( name=group_name, @@ -1125,30 +1770,78 @@ def create_sr( ) ) - # 3. Remove storage pools/resource/volume group in the case of errors. + # 3. Create the LINSTOR database volume and mount it. 
+ try: + logger('Creating database volume...') + volume_path = cls._create_database_volume( + lin, group_name, node_names, redundancy, auto_quorum + ) + except LinstorVolumeManagerError as e: + if e.code != LinstorVolumeManagerError.ERR_VOLUME_EXISTS: + logger('Destroying database volume after creation fail...') + cls._force_destroy_database_volume(lin, group_name) + raise + + try: + logger('Mounting database volume...') + + # First we must disable the controller to move safely the + # LINSTOR config. + cls._start_controller(start=False) + + cls._mount_database_volume(volume_path) + except Exception as e: + # Ensure we are connected because controller has been + # restarted during mount call. + logger('Destroying database volume after mount fail...') + + try: + cls._start_controller(start=True) + except Exception: + pass + + lin = cls._create_linstor_instance( + uri, keep_uri_unmodified=True + ) + cls._force_destroy_database_volume(lin, group_name) + raise e + + cls._start_controller(start=True) + lin = cls._create_linstor_instance(uri, keep_uri_unmodified=True) + + # 4. Remove storage pools/resource/volume group in the case of errors. except Exception as e: + logger('Destroying resource group and storage pools after fail...') try: cls._destroy_resource_group(lin, group_name) - except Exception: + except Exception as e2: + logger('Failed to destroy resource group: {}'.format(e2)) pass j = 0 i = min(i, len(node_names) - 1) while j <= i: try: cls._destroy_storage_pool(lin, group_name, node_names[j]) - except Exception: + except Exception as e2: + logger('Failed to destroy resource group: {}'.format(e2)) pass j += 1 raise e - # 4. Return new instance. + # 5. Return new instance. instance = cls.__new__(cls) - instance._uri = uri instance._linstor = lin instance._logger = logger instance._redundancy = redundancy + instance._base_group_name = base_group_name instance._group_name = group_name instance._volumes = set() + instance._storage_pools_time = 0 + instance._kv_cache = instance._create_kv_cache() + instance._resource_cache = None + instance._resource_cache_dirty = True + instance._volume_info_cache = None + instance._volume_info_cache_dirty = True return instance @classmethod @@ -1196,6 +1889,32 @@ def round_down_volume_size(cls, volume_size): # Private helpers. 
# -------------------------------------------------------------------------- + def _create_kv_cache(self): + self._kv_cache = self._create_linstor_kv('/') + self._kv_cache_dirty = False + return self._kv_cache + + def _get_kv_cache(self): + if self._kv_cache_dirty: + self._kv_cache = self._create_kv_cache() + return self._kv_cache + + def _create_resource_cache(self): + self._resource_cache = self._linstor.resource_list_raise() + self._resource_cache_dirty = False + return self._resource_cache + + def _get_resource_cache(self): + if self._resource_cache_dirty: + self._resource_cache = self._create_resource_cache() + return self._resource_cache + + def _mark_resource_cache_as_dirty(self): + self._resource_cache_dirty = True + self._volume_info_cache_dirty = True + + # -------------------------------------------------------------------------- + def _ensure_volume_exists(self, volume_uuid): if volume_uuid not in self._volumes: raise LinstorVolumeManagerError( @@ -1224,12 +1943,13 @@ def _fetch_resource_names(self): resource_names.add(dfn.name) return resource_names - def _get_volumes_info(self, filter=None): + def _get_volumes_info(self, volume_name=None): all_volume_info = {} - resources = self._linstor.resource_list_raise( - filter_by_resources=filter - ) - for resource in resources.resources: + + if not self._volume_info_cache_dirty: + return self._volume_info_cache + + for resource in self._get_resource_cache().resources: if resource.name not in all_volume_info: current = all_volume_info[resource.name] = self.VolumeInfo( resource.name @@ -1237,6 +1957,9 @@ def _get_volumes_info(self, filter=None): else: current = all_volume_info[resource.name] + if linstor.consts.FLAG_DISKLESS not in resource.flags: + current.diskful.append(resource.node_name) + for volume in resource.volumes: # We ignore diskless pools of the form "DfltDisklessStorPool". 
if volume.storage_pool_name == self._group_name: @@ -1245,22 +1968,32 @@ def _get_volumes_info(self, filter=None): 'Failed to get allocated size of `{}` on `{}`' .format(resource.name, volume.storage_pool_name) ) - current.physical_size += volume.allocated_size + allocated_size = volume.allocated_size - if volume.usable_size < 0: - raise LinstorVolumeManagerError( - 'Failed to get usable size of `{}` on `{}`' - .format(resource.name, volume.storage_pool_name) - ) - virtual_size = volume.usable_size + current.allocated_size = current.allocated_size and \ + max(current.allocated_size, allocated_size) or \ + allocated_size - current.virtual_size = current.virtual_size and \ - min(current.virtual_size, virtual_size) or virtual_size + usable_size = volume.usable_size + if usable_size > 0 and ( + usable_size < current.virtual_size or + not current.virtual_size + ): + current.virtual_size = usable_size + + if current.virtual_size <= 0: + raise LinstorVolumeManagerError( + 'Failed to get usable size of `{}` on `{}`' + .format(resource.name, volume.storage_pool_name) + ) for current in all_volume_info.values(): - current.physical_size *= 1024 + current.allocated_size *= 1024 current.virtual_size *= 1024 + self._volume_info_cache_dirty = False + self._volume_info_cache = all_volume_info + return all_volume_info def _get_volume_node_names_and_size(self, volume_name): @@ -1289,12 +2022,8 @@ def _get_volume_node_names_and_size(self, volume_name): return (node_names, size * 1024) def _compute_size(self, attr): - pools = self._linstor.storage_pool_list_raise( - filter_by_stor_pools=[self._group_name] - ).storage_pools - capacity = 0 - for pool in pools: + for pool in self._get_storage_pools(force=True): space = pool.free_space if space: size = getattr(space, attr) @@ -1308,45 +2037,104 @@ def _compute_size(self, attr): def _get_node_names(self): node_names = set() - pools = self._linstor.storage_pool_list_raise( - filter_by_stor_pools=[self._group_name] - ).storage_pools - for pool in pools: + for pool in self._get_storage_pools(): node_names.add(pool.node_name) return node_names - def _check_volume_creation_errors(self, result, volume_uuid): - errors = self._filter_errors(result) - if self._check_errors(errors, [ - linstor.consts.FAIL_EXISTS_RSC, linstor.consts.FAIL_EXISTS_RSC_DFN - ]): - raise LinstorVolumeManagerError( - 'Failed to create volume `{}` from SR `{}`, it already exists' - .format(volume_uuid, self._group_name), - LinstorVolumeManagerError.ERR_VOLUME_EXISTS - ) + def _get_storage_pools(self, force=False): + cur_time = time.time() + elsaped_time = cur_time - self._storage_pools_time - if errors: + if force or elsaped_time >= self.STORAGE_POOLS_FETCH_INTERVAL: + self._storage_pools = self._linstor.storage_pool_list_raise( + filter_by_stor_pools=[self._group_name] + ).storage_pools + self._storage_pools_time = time.time() + + return self._storage_pools + + def _create_volume( + self, volume_uuid, volume_name, size, place_resources, + no_diskless=False + ): + if no_diskless and not place_resources: raise LinstorVolumeManagerError( - 'Failed to create volume `{}` from SR `{}`: {}'.format( - volume_uuid, - self._group_name, - self._get_error_str(errors) - ) + 'Could not create volume `{}` from SR `{}`: it\'s impossible ' + .format(volume_uuid, self._group_name) + + 'to force no diskless without placing resources' ) - def _create_volume(self, volume_uuid, volume_name, size, place_resources): size = self.round_up_volume_size(size) + self._mark_resource_cache_as_dirty() - 
self._check_volume_creation_errors(self._linstor.resource_group_spawn( - rsc_grp_name=self._group_name, - rsc_dfn_name=volume_name, - vlm_sizes=['{}B'.format(size)], - definitions_only=not place_resources - ), volume_uuid) + resources = [] + if no_diskless: + for node_name in self._get_node_names(): + resources.append(linstor.ResourceData( + node_name=node_name, + rsc_name=volume_name, + storage_pool=self._group_name + )) + + def create_definition(): + self._check_volume_creation_errors( + self._linstor.resource_group_spawn( + rsc_grp_name=self._group_name, + rsc_dfn_name=volume_name, + vlm_sizes=['{}B'.format(size)], + definitions_only=True + ), + volume_uuid, + self._group_name + ) + self._configure_volume_peer_slots(self._linstor, volume_name) + + def clean(): + try: + self._destroy_volume(volume_uuid, force=True) + except Exception as e: + self._logger( + 'Unable to destroy volume {} after creation fail: {}' + .format(volume_uuid, e) + ) + + def create(): + try: + create_definition() + if no_diskless: + # Create a physical resource on each node. + result = self._linstor.resource_create(resources) + error_str = self._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not create volume `{}` from SR `{}`: {}'.format( + volume_uuid, self._group_name, error_str + ) + ) + elif place_resources: + # Basic case when we use the default redundancy of the group. + self._check_volume_creation_errors( + self._linstor.resource_auto_place( + rsc_name=volume_name, + place_count=self._redundancy, + diskless_on_remaining=not no_diskless + ), + volume_uuid, + self._group_name + ) + except LinstorVolumeManagerError as e: + if e.code != LinstorVolumeManagerError.ERR_VOLUME_EXISTS: + clean() + raise + except Exception: + clean() + raise + + util.retry(create, maxretry=5) def _create_volume_with_properties( - self, volume_uuid, volume_name, size, place_resources + self, volume_uuid, volume_name, size, place_resources, + no_diskless=False ): if self.check_volume_exists(volume_uuid): raise LinstorVolumeManagerError( @@ -1375,9 +2163,11 @@ def _create_volume_with_properties( volume_properties[self.PROP_VOLUME_NAME] = volume_name self._create_volume( - volume_uuid, volume_name, size, place_resources + volume_uuid, volume_name, size, place_resources, no_diskless ) + assert volume_properties.namespace == \ + self._build_volume_namespace(volume_uuid) return volume_properties except LinstorVolumeManagerError as e: # Do not destroy existing resource! @@ -1385,12 +2175,8 @@ def _create_volume_with_properties( # before the `self._create_volume` case. # It can only happen if the same volume uuid is used in the same # call in another host. 
- if e.code == LinstorVolumeManagerError.ERR_VOLUME_EXISTS: - raise - self._force_destroy_volume(volume_uuid, volume_properties) - raise - except Exception: - self._force_destroy_volume(volume_uuid, volume_properties) + if e.code != LinstorVolumeManagerError.ERR_VOLUME_EXISTS: + self._destroy_volume(volume_uuid, force=True) raise def _find_device_path(self, volume_uuid, volume_name): @@ -1417,68 +2203,73 @@ def _find_device_path(self, volume_uuid, volume_name): def _request_device_path(self, volume_uuid, volume_name, activate=False): node_name = socket.gethostname() - resources = self._linstor.resource_list( - filter_by_nodes=[node_name], - filter_by_resources=[volume_name] + + resources = filter( + lambda resource: resource.node_name == node_name and + resource.name == volume_name, + self._get_resource_cache().resources ) - if not resources or not resources[0]: - raise LinstorVolumeManagerError( - 'No response list for dev path of `{}`'.format(volume_uuid) - ) - if isinstance(resources[0], linstor.responses.ResourceResponse): - if not resources[0].resources: - if activate: - self._activate_device_path(node_name, volume_name) - return self._request_device_path(volume_uuid, volume_name) - raise LinstorVolumeManagerError( - 'Empty dev path for `{}`, but definition "seems" to exist' - .format(volume_uuid) + if not resources: + if activate: + self._mark_resource_cache_as_dirty() + self._activate_device_path( + self._linstor, node_name, volume_name ) - # Contains a path of the /dev/drbd form. - return resources[0].resources[0].volumes[0].device_path - - raise LinstorVolumeManagerError( - 'Unable to get volume dev path `{}`: {}'.format( - volume_uuid, str(resources[0]) + return self._request_device_path(volume_uuid, volume_name) + raise LinstorVolumeManagerError( + 'Empty dev path for `{}`, but definition "seems" to exist' + .format(volume_uuid) ) - ) - - def _activate_device_path(self, node_name, volume_name): - result = self._linstor.resource_create([ - linstor.ResourceData(node_name, volume_name, diskless=True) - ]) - if linstor.Linstor.all_api_responses_no_error(result): - return - errors = linstor.Linstor.filter_api_call_response_errors(result) - if len(errors) == 1 and errors[0].is_error( - linstor.consts.FAIL_EXISTS_RSC - ): - return - - raise LinstorVolumeManagerError( - 'Unable to activate device path of `{}` on node `{}`: {}' - .format(volume_name, node_name, ', '.join( - [str(x) for x in result])) - ) + # Contains a path of the /dev/drbd form. + return resources[0].volumes[0].device_path - def _destroy_resource(self, resource_name): + def _destroy_resource(self, resource_name, force=False): result = self._linstor.resource_dfn_delete(resource_name) error_str = self._get_error_str(result) - if error_str: + if not error_str: + self._mark_resource_cache_as_dirty() + return + + if not force: + self._mark_resource_cache_as_dirty() raise LinstorVolumeManagerError( - 'Could not destroy resource `{}` from SR `{}`: {}' + 'Could not destroy resource `{}` from SR `{}`: {}' .format(resource_name, self._group_name, error_str) ) - def _destroy_volume(self, volume_uuid, volume_properties): - assert volume_properties.namespace == \ - self._build_volume_namespace(volume_uuid) + # If force is used, ensure there is no opener. 
+ all_openers = get_all_volume_openers(resource_name, '0') + for openers in all_openers.itervalues(): + if openers: + self._mark_resource_cache_as_dirty() + raise LinstorVolumeManagerError( + 'Could not force destroy resource `{}` from SR `{}`: {} (openers=`{}`)' + .format(resource_name, self._group_name, error_str, all_openers) + ) + + # Maybe the resource is blocked in primary mode. DRBD/LINSTOR issue? + resource_states = filter( + lambda resource_state: resource_state.name == resource_name, + self._get_resource_cache().resource_states + ) + + # Mark only after computation of states. + self._mark_resource_cache_as_dirty() + + for resource_state in resource_states: + volume_state = resource_state.volume_states[0] + if resource_state.in_use: + demote_drbd_resource(resource_state.node_name, resource_name) + break + self._destroy_resource(resource_name) + def _destroy_volume(self, volume_uuid, force=False): + volume_properties = self._get_volume_properties(volume_uuid) try: volume_name = volume_properties.get(self.PROP_VOLUME_NAME) if volume_name in self._fetch_resource_names(): - self._destroy_resource(volume_name) + self._destroy_resource(volume_name, force) # Assume this call is atomic. volume_properties.clear() @@ -1487,19 +2278,8 @@ def _destroy_volume(self, volume_uuid, volume_properties): 'Cannot destroy volume `{}`: {}'.format(volume_uuid, e) ) - def _force_destroy_volume(self, volume_uuid, volume_properties): - try: - self._destroy_volume(volume_uuid, volume_properties) - except Exception as e: - self._logger('Ignore fail: {}'.format(e)) - def _build_volumes(self, repair): - properties = linstor.KV( - self._get_store_name(), - uri=self._uri, - namespace=self._build_volume_namespace() - ) - + properties = self._kv_cache resource_names = self._fetch_resource_names() self._volumes = set() @@ -1517,9 +2297,7 @@ def _build_volumes(self, repair): self.REG_NOT_EXISTS, ignore_inexisting_volumes=False ) for volume_uuid, not_exists in existing_volumes.items(): - properties.namespace = self._build_volume_namespace( - volume_uuid - ) + properties.namespace = self._build_volume_namespace(volume_uuid) src_uuid = properties.get(self.PROP_UPDATING_UUID_SRC) if src_uuid: @@ -1569,7 +2347,7 @@ def _build_volumes(self, repair): # Little optimization, don't call `self._destroy_volume`, # we already have resource name list. if volume_name in resource_names: - self._destroy_resource(volume_name) + self._destroy_resource(volume_name, force=True) # Assume this call is atomic. properties.clear() @@ -1579,37 +2357,42 @@ def _build_volumes(self, repair): 'Cannot clean volume {}: {}'.format(volume_uuid, e) ) + # The volume can't be removed, maybe it's still in use, + # in this case rename it with the "DELETED_" prefix. + # This prefix is mandatory if it exists a snap transaction to + # rollback because the original VDI UUID can try to be renamed + # with the UUID we are trying to delete... 
+ if not volume_uuid.startswith('DELETED_'): + self.update_volume_uuid( + volume_uuid, 'DELETED_' + volume_uuid, force=True + ) + for dest_uuid, src_uuid in updating_uuid_volumes.items(): - dest_properties = self._get_volume_properties(dest_uuid) - if int(dest_properties.get(self.PROP_NOT_EXISTS) or - self.STATE_EXISTS): - dest_properties.clear() + dest_namespace = self._build_volume_namespace(dest_uuid) + + properties.namespace = dest_namespace + if int(properties.get(self.PROP_NOT_EXISTS)): + properties.clear() continue - src_properties = self._get_volume_properties(src_uuid) - src_properties.clear() + properties.namespace = self._build_volume_namespace(src_uuid) + properties.clear() - dest_properties.pop(self.PROP_UPDATING_UUID_SRC) + properties.namespace = dest_namespace + properties.pop(self.PROP_UPDATING_UUID_SRC) if src_uuid in self._volumes: self._volumes.remove(src_uuid) self._volumes.add(dest_uuid) def _get_sr_properties(self): - return linstor.KV( - self._get_store_name(), - uri=self._uri, - namespace=self._build_sr_namespace() - ) + return self._create_linstor_kv(self._build_sr_namespace()) def _get_volumes_by_property( self, reg_prop, ignore_inexisting_volumes=True ): - base_properties = linstor.KV( - self._get_store_name(), - uri=self._uri, - namespace=self._build_volume_namespace() - ) + base_properties = self._get_kv_cache() + base_properties.namespace = self._build_volume_namespace() volume_properties = {} for volume_uuid in self._volumes: @@ -1625,15 +2408,17 @@ def _get_volumes_by_property( return volume_properties - def _get_volume_properties(self, volume_uuid): + def _create_linstor_kv(self, namespace): return linstor.KV( - self._get_store_name(), - uri=self._uri, - namespace=self._build_volume_namespace(volume_uuid) + self._group_name, + uri=self._linstor.controller_host(), + namespace=namespace ) - def _get_store_name(self): - return 'xcp-sr-{}'.format(self._group_name) + def _get_volume_properties(self, volume_uuid): + properties = self._get_kv_cache() + properties.namespace = self._build_volume_namespace(volume_uuid) + return properties @classmethod def _build_sr_namespace(cls): @@ -1653,46 +2438,429 @@ def _get_error_str(cls, result): ]) @classmethod - def _create_linstor_instance(cls, uri): - def connect(): + def _create_linstor_instance( + cls, uri, keep_uri_unmodified=False, attempt_count=30 + ): + retry = False + + def connect(uri): + if not uri: + uri = get_controller_uri() + if not uri: + raise LinstorVolumeManagerError( + 'Unable to find controller uri...' 
+ ) instance = linstor.Linstor(uri, keep_alive=True) instance.connect() return instance + try: + return connect(uri) + except (linstor.errors.LinstorNetworkError, LinstorVolumeManagerError): + pass + + if not keep_uri_unmodified: + uri = None + return util.retry( - connect, - maxretry=60, - exceptions=[linstor.errors.LinstorNetworkError] + lambda: connect(uri), + maxretry=attempt_count, + period=1, + exceptions=[ + linstor.errors.LinstorNetworkError, + LinstorVolumeManagerError + ] ) @classmethod - def _destroy_storage_pool(cls, lin, group_name, node_name): - result = lin.storage_pool_delete(node_name, group_name) + def _configure_volume_peer_slots(cls, lin, volume_name): + result = lin.resource_dfn_modify(volume_name, {}, peer_slots=3) error_str = cls._get_error_str(result) if error_str: raise LinstorVolumeManagerError( - 'Failed to destroy SP `{}` on node `{}`: {}'.format( - group_name, - node_name, - error_str + 'Could not configure volume peer slots of {}: {}' + .format(volume_name, error_str) + ) + + @classmethod + def _activate_device_path(cls, lin, node_name, volume_name): + result = lin.resource_create([ + linstor.ResourceData(node_name, volume_name, diskless=True) + ]) + if linstor.Linstor.all_api_responses_no_error(result): + return + errors = linstor.Linstor.filter_api_call_response_errors(result) + if len(errors) == 1 and errors[0].is_error( + linstor.consts.FAIL_EXISTS_RSC + ): + return + + raise LinstorVolumeManagerError( + 'Unable to activate device path of `{}` on node `{}`: {}' + .format(volume_name, node_name, ', '.join( + [str(x) for x in result])) + ) + + @classmethod + def _request_database_path(cls, lin, activate=False): + node_name = socket.gethostname() + + try: + resources = filter( + lambda resource: resource.node_name == node_name and + resource.name == DATABASE_VOLUME_NAME, + lin.resource_list_raise().resources + ) + except Exception as e: + raise LinstorVolumeManagerError( + 'Unable to get resources during database creation: {}' + .format(e) + ) + + if not resources: + if activate: + cls._activate_device_path( + lin, node_name, DATABASE_VOLUME_NAME + ) + return cls._request_database_path( + DATABASE_VOLUME_NAME, DATABASE_VOLUME_NAME ) + raise LinstorVolumeManagerError( + 'Empty dev path for `{}`, but definition "seems" to exist' + .format(DATABASE_PATH) ) + # Contains a path of the /dev/drbd form. + return resources[0].volumes[0].device_path @classmethod - def _destroy_resource_group(cls, lin, group_name): - result = lin.resource_group_delete(group_name) + def _create_database_volume( + cls, lin, group_name, node_names, redundancy, auto_quorum + ): + try: + dfns = lin.resource_dfn_list_raise().resource_definitions + except Exception as e: + raise LinstorVolumeManagerError( + 'Unable to get definitions during database creation: {}' + .format(e) + ) + + if dfns: + raise LinstorVolumeManagerError( + 'Could not create volume `{}` from SR `{}`, '.format( + DATABASE_VOLUME_NAME, group_name + ) + 'LINSTOR volume list must be empty.' + ) + + # Workaround to use thin lvm. Without this line an error is returned: + # "Not enough available nodes" + # I don't understand why but this command protect against this bug. + try: + pools = lin.storage_pool_list_raise( + filter_by_stor_pools=[group_name] + ) + except Exception as e: + raise LinstorVolumeManagerError( + 'Failed to get storage pool list before database creation: {}' + .format(e) + ) + + # Ensure we have a correct list of storage pools. 
+ nodes_with_pool = map(lambda pool: pool.node_name, pools.storage_pools) + assert nodes_with_pool # We must have at least one storage pool! + for node_name in nodes_with_pool: + assert node_name in node_names + util.SMlog('Nodes with storage pool: {}'.format(nodes_with_pool)) + + # Create the database definition. + size = cls.round_up_volume_size(DATABASE_SIZE) + cls._check_volume_creation_errors(lin.resource_group_spawn( + rsc_grp_name=group_name, + rsc_dfn_name=DATABASE_VOLUME_NAME, + vlm_sizes=['{}B'.format(size)], + definitions_only=True + ), DATABASE_VOLUME_NAME, group_name) + cls._configure_volume_peer_slots(lin, DATABASE_VOLUME_NAME) + + # Create real resources on the first nodes. + resources = [] + + diskful_nodes = [] + diskless_nodes = [] + for node_name in node_names: + if node_name in nodes_with_pool: + diskful_nodes.append(node_name) + else: + diskless_nodes.append(node_name) + + assert diskful_nodes + for node_name in diskful_nodes[:redundancy]: + util.SMlog('Create database diskful on {}'.format(node_name)) + resources.append(linstor.ResourceData( + node_name=node_name, + rsc_name=DATABASE_VOLUME_NAME, + storage_pool=group_name + )) + # Create diskless resources on the remaining set. + for node_name in diskful_nodes[redundancy:] + diskless_nodes: + util.SMlog('Create database diskless on {}'.format(node_name)) + resources.append(linstor.ResourceData( + node_name=node_name, + rsc_name=DATABASE_VOLUME_NAME, + diskless=True + )) + + result = lin.resource_create(resources) error_str = cls._get_error_str(result) if error_str: raise LinstorVolumeManagerError( - 'Failed to destroy RG `{}`: {}'.format(group_name, error_str) + 'Could not create database volume from SR `{}`: {}'.format( + group_name, error_str + ) + ) + + # We must modify the quorum. Otherwise we can't use correctly the + # drbd-reactor daemon. + if auto_quorum: + result = lin.resource_dfn_modify(DATABASE_VOLUME_NAME, { + 'DrbdOptions/auto-quorum': 'disabled', + 'DrbdOptions/Resource/quorum': 'majority' + }) + error_str = cls._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not activate quorum on database volume: {}' + .format(error_str) + ) + + # Create database and ensure path exists locally and + # on replicated devices. + current_device_path = cls._request_database_path(lin, activate=True) + + # Ensure diskless paths exist on other hosts. Otherwise PBDs can't be + # plugged. + for node_name in node_names: + cls._activate_device_path(lin, node_name, DATABASE_VOLUME_NAME) + + # We use realpath here to get the /dev/drbd path instead of + # /dev/drbd/by-res/. 
+ expected_device_path = cls.build_device_path(DATABASE_VOLUME_NAME) + util.wait_for_path(expected_device_path, 5) + + device_realpath = os.path.realpath(expected_device_path) + if current_device_path != device_realpath: + raise LinstorVolumeManagerError( + 'Invalid path, current={}, expected={} (realpath={})' + .format( + current_device_path, + expected_device_path, + device_realpath + ) + ) + + try: + util.pread2([DATABASE_MKFS, expected_device_path]) + except Exception as e: + raise LinstorVolumeManagerError( + 'Failed to execute {} on database volume: {}' + .format(DATABASE_MKFS, e) + ) + + return expected_device_path + + @classmethod + def _destroy_database_volume(cls, lin, group_name): + error_str = cls._get_error_str( + lin.resource_dfn_delete(DATABASE_VOLUME_NAME) + ) + if error_str: + raise LinstorVolumeManagerError( + 'Could not destroy resource `{}` from SR `{}`: {}' + .format(DATABASE_VOLUME_NAME, group_name, error_str) ) + @classmethod + def _mount_database_volume(cls, volume_path, mount=True, force=False): + backup_path = DATABASE_PATH + '-' + str(uuid.uuid4()) + + try: + # 1. Create a backup config folder. + database_not_empty = bool(os.listdir(DATABASE_PATH)) + if database_not_empty: + try: + os.mkdir(backup_path) + except Exception as e: + raise LinstorVolumeManagerError( + 'Failed to create backup path {} of LINSTOR config: {}' + .format(backup_path, e) + ) + + # 2. Move the config in the mounted volume. + if database_not_empty: + cls._move_files(DATABASE_PATH, backup_path) + + cls._mount_volume(volume_path, DATABASE_PATH, mount) + + if database_not_empty: + cls._move_files(backup_path, DATABASE_PATH, force) + + # 3. Remove useless backup directory. + try: + os.rmdir(backup_path) + except Exception: + raise LinstorVolumeManagerError( + 'Failed to remove backup path {} of LINSTOR config {}' + .format(backup_path, e) + ) + except Exception as e: + def force_exec(fn): + try: + fn() + except Exception: + pass + + if mount == cls._is_mounted(DATABASE_PATH): + force_exec(lambda: cls._move_files( + DATABASE_PATH, backup_path + )) + force_exec(lambda: cls._mount_volume( + volume_path, DATABASE_PATH, not mount + )) + + if mount != cls._is_mounted(DATABASE_PATH): + force_exec(lambda: cls._move_files( + backup_path, DATABASE_PATH + )) + + force_exec(lambda: os.rmdir(backup_path)) + raise e + + @classmethod + def _force_destroy_database_volume(cls, lin, group_name): + try: + cls._destroy_database_volume(lin, group_name) + except Exception: + pass + + @classmethod + def _destroy_storage_pool(cls, lin, group_name, node_name): + def destroy(): + result = lin.storage_pool_delete(node_name, group_name) + errors = cls._filter_errors(result) + if cls._check_errors(errors, [ + linstor.consts.FAIL_NOT_FOUND_STOR_POOL, + linstor.consts.FAIL_NOT_FOUND_STOR_POOL_DFN + ]): + return + + if errors: + raise LinstorVolumeManagerError( + 'Failed to destroy SP `{}` on node `{}`: {}'.format( + group_name, + node_name, + cls._get_error_str(errors) + ) + ) + + # We must retry to avoid errors like: + # "can not be deleted as volumes / snapshot-volumes are still using it" + # after LINSTOR database volume destruction. 
+ return util.retry(destroy, maxretry=10) + + @classmethod + def _destroy_resource_group(cls, lin, group_name): + def destroy(): + result = lin.resource_group_delete(group_name) + errors = cls._filter_errors(result) + if cls._check_errors(errors, [ + linstor.consts.FAIL_NOT_FOUND_RSC_GRP + ]): + return + + if errors: + raise LinstorVolumeManagerError( + 'Failed to destroy RG `{}`: {}' + .format(group_name, cls._get_error_str(errors)) + ) + + return util.retry(destroy, maxretry=10) + @classmethod def _build_group_name(cls, base_name): # If thin provisioning is used we have a path like this: # `VG/LV`. "/" is not accepted by LINSTOR. return '{}{}'.format(cls.PREFIX_SR, base_name.replace('/', '_')) + @classmethod + def _check_volume_creation_errors(cls, result, volume_uuid, group_name): + errors = cls._filter_errors(result) + if cls._check_errors(errors, [ + linstor.consts.FAIL_EXISTS_RSC, linstor.consts.FAIL_EXISTS_RSC_DFN + ]): + raise LinstorVolumeManagerError( + 'Failed to create volume `{}` from SR `{}`, it already exists' + .format(volume_uuid, group_name), + LinstorVolumeManagerError.ERR_VOLUME_EXISTS + ) + + if errors: + raise LinstorVolumeManagerError( + 'Failed to create volume `{}` from SR `{}`: {}'.format( + volume_uuid, + group_name, + cls._get_error_str(errors) + ) + ) + + @classmethod + def _move_files(cls, src_dir, dest_dir, force=False): + def listdir(dir): + ignored = ['lost+found'] + return filter(lambda file: file not in ignored, os.listdir(dir)) + + try: + if not force: + files = listdir(dest_dir) + if files: + raise LinstorVolumeManagerError( + 'Cannot move files from {} to {} because destination ' + 'contains: {}'.format(src_dir, dest_dir, files) + ) + except LinstorVolumeManagerError: + raise + except Exception as e: + raise LinstorVolumeManagerError( + 'Cannot list dir {}: {}'.format(dest_dir, e) + ) + + try: + for file in listdir(src_dir): + try: + dest_file = os.path.join(dest_dir, file) + if not force and os.path.exists(dest_file): + raise LinstorVolumeManagerError( + 'Cannot move {} because it already exists in the ' + 'destination'.format(file) + ) + shutil.move(os.path.join(src_dir, file), dest_file) + except LinstorVolumeManagerError: + raise + except Exception as e: + raise LinstorVolumeManagerError( + 'Cannot move {}: {}'.format(file, e) + ) + except Exception as e: + if not force: + try: + cls._move_files(dest_dir, src_dir, force=True) + except Exception: + pass + + raise LinstorVolumeManagerError( + 'Failed to move files from {} to {}: {}'.format( + src_dir, dest_dir, e + ) + ) + @staticmethod def _get_filtered_properties(properties): return dict(properties.items()) @@ -1711,3 +2879,110 @@ def _check_errors(result, codes): if err.is_error(code): return True return False + + @classmethod + def _controller_is_running(cls): + return cls._service_is_running('linstor-controller') + + @classmethod + def _start_controller(cls, start=True): + return cls._start_service('linstor-controller', start) + + @staticmethod + def _start_service(name, start=True): + action = 'start' if start else 'stop' + (ret, out, err) = util.doexec([ + 'systemctl', action, name + ]) + if ret != 0: + raise LinstorVolumeManagerError( + 'Failed to {} {}: {} {}' + .format(action, name, out, err) + ) + + @staticmethod + def _service_is_running(name): + (ret, out, err) = util.doexec([ + 'systemctl', 'is-active', '--quiet', name + ]) + return not ret + + @staticmethod + def _is_mounted(mountpoint): + (ret, out, err) = util.doexec(['mountpoint', '-q', mountpoint]) + return ret == 0 + + 
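
A side note on the helpers above: `systemctl is-active --quiet` and `mountpoint -q` both report their result purely through the exit code (0 means active/mounted), which is why the wrappers only test the return value. A minimal standalone sketch of the same checks, using plain `subprocess` instead of the in-tree `util.doexec` wrapper; the unit name and path below are examples only:

```python
#!/usr/bin/env python
# Sketch of the service/mountpoint probes used by LinstorVolumeManager,
# written against subprocess directly. 'linstor-controller' and
# '/var/lib/linstor' are example values, not hard requirements.
import subprocess


def service_is_running(name):
    # `systemctl is-active --quiet <unit>` exits 0 when the unit is active.
    return subprocess.call(['systemctl', 'is-active', '--quiet', name]) == 0


def is_mounted(mountpoint):
    # `mountpoint -q <path>` exits 0 when the path is a mountpoint.
    return subprocess.call(['mountpoint', '-q', mountpoint]) == 0


if __name__ == '__main__':
    print('linstor-controller active: {}'.format(
        service_is_running('linstor-controller')))
    print('/var/lib/linstor mounted: {}'.format(
        is_mounted('/var/lib/linstor')))
```
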
@classmethod + def _mount_volume(cls, volume_path, mountpoint, mount=True): + if mount: + try: + util.pread(['mount', volume_path, mountpoint]) + except Exception as e: + raise LinstorVolumeManagerError( + 'Failed to mount volume {} on {}: {}' + .format(volume_path, mountpoint, e) + ) + else: + try: + if cls._is_mounted(mountpoint): + util.pread(['umount', mountpoint]) + except Exception as e: + raise LinstorVolumeManagerError( + 'Failed to umount volume {} on {}: {}' + .format(volume_path, mountpoint, e) + ) + + +# ============================================================================== + +# Check if a path is a DRBD resource and log the process name/pid +# that opened it. +def log_drbd_openers(path): + # Ignore if it's not a symlink to DRBD resource. + if not path.startswith(DRBD_BY_RES_PATH): + return + + # Compute resource name. + res_name_end = path.find('/', len(DRBD_BY_RES_PATH)) + if res_name_end == -1: + return + res_name = path[len(DRBD_BY_RES_PATH):res_name_end] + + volume_end = path.rfind('/') + if volume_end == res_name_end: + return + volume = path[volume_end + 1:] + + try: + # Ensure path is a DRBD. + drbd_path = os.path.realpath(path) + stats = os.stat(drbd_path) + if not stat.S_ISBLK(stats.st_mode) or os.major(stats.st_rdev) != 147: + return + + # Find where the device is open. + (ret, stdout, stderr) = util.doexec(['drbdadm', 'status', res_name]) + if ret != 0: + util.SMlog('Failed to execute `drbdadm status` on `{}`: {}'.format( + res_name, stderr + )) + return + + # Is it a local device? + if stdout.startswith('{} role:Primary'.format(res_name)): + util.SMlog( + 'DRBD resource `{}` is open on local host: {}' + .format(path, get_local_volume_openers(res_name, volume)) + ) + return + + # Is it a remote device? + util.SMlog( + 'DRBD resource `{}` is open on hosts: {}' + .format(path, get_all_volume_openers(res_name, volume)) + ) + except Exception as e: + util.SMlog( + 'Got exception while trying to determine where DRBD resource ' + + '`{}` is open: {}'.format(path, e) + ) diff --git a/drivers/on_slave.py b/drivers/on_slave.py index bb3f5db65..524424f61 100755 --- a/drivers/on_slave.py +++ b/drivers/on_slave.py @@ -79,6 +79,7 @@ def _is_open(session, args): import CephFSSR import EXTSR import GlusterFSSR + import LinstorSR import LVHDSR import MooseFSSR import NFSSR @@ -109,8 +110,28 @@ def _is_open(session, args): } cmd.params = {"command": None} + sr_uuid = srRec["uuid"] + + # Another ugly piece of code to load a real Linstor SR, otherwise + # we can't fetch the VDI path. + if srType == 'linstor': + host_ref = util.get_this_host_ref(session) + sr_ref = session.xenapi.SR.get_by_uuid(sr_uuid) + + pbd = util.find_my_pbd(session, host_ref, sr_ref) + if pbd is None: + raise util.SMException('Failed to find Linstor PBD') + + cmd.dconf = session.xenapi.PBD.get_device_config(pbd) + driver = SR.driver(srType) - sr = driver(cmd, srRec["uuid"]) + sr = driver(cmd, sr_uuid) + + # session_ref param is required to have a valid session when SR object is created. + # It's not the case here, so attach the current session object to make LinstorSR happy. 
+ if srType == 'linstor': + sr.session = session + vdi = sr.vdi(vdiUuid) tapdisk = blktap2.Tapdisk.find_by_path(vdi.path) util.SMlog("Tapdisk for %s: %s" % (vdi.path, tapdisk)) diff --git a/drivers/tapdisk-pause b/drivers/tapdisk-pause index 932fc3ca6..75328757b 100755 --- a/drivers/tapdisk-pause +++ b/drivers/tapdisk-pause @@ -30,7 +30,7 @@ import vhdutil import lvmcache try: - from linstorvolumemanager import LinstorVolumeManager + from linstorvolumemanager import get_controller_uri, LinstorVolumeManager LINSTOR_AVAILABLE = True except ImportError: LINSTOR_AVAILABLE = False @@ -152,10 +152,6 @@ class Tapdisk: # "B" path. Note: "A", "B" and "OLD_A" are UUIDs. session = self.session - linstor_uri = 'linstor://{}'.format( - util.get_master_rec(session)['address'] - ) - host_ref = util.get_this_host_ref(session) sr_ref = session.xenapi.SR.get_by_uuid(self.sr_uuid) @@ -167,7 +163,7 @@ class Tapdisk: group_name = dconf['group-name'] device_path = LinstorVolumeManager( - linstor_uri, + get_controller_uri(), group_name, logger=util.SMlog ).get_device_path(self.vdi_uuid) diff --git a/drivers/util.py b/drivers/util.py index e73482288..376aaf703 100755 --- a/drivers/util.py +++ b/drivers/util.py @@ -699,32 +699,10 @@ def get_master_ref(session): return session.xenapi.pool.get_master(pools[0]) -def get_master_rec(session): - return session.xenapi.host.get_record(get_master_ref(session)) - - def is_master(session): return get_this_host_ref(session) == get_master_ref(session) -def get_master_address(): - address = None - try: - fd = open('/etc/xensource/pool.conf', 'r') - try: - items = fd.readline().split(':') - if items[0].strip() == 'master': - address = 'localhost' - else: - address = items[1].strip() - finally: - fd.close() - except Exception: - pass - return address - - - def get_localhost_ref(session): filename = '/etc/xensource-inventory' try: @@ -765,6 +743,17 @@ def get_hosts_attached_on(session, vdi_uuids): host_refs[key[len('host_'):]] = True return host_refs.keys() +def get_this_host_address(session): + host_uuid = get_this_host() + host_ref = session.xenapi.host.get_by_uuid(host_uuid) + return session.xenapi.host.get_record(host_ref)['address'] + +def get_host_addresses(session): + addresses = [] + hosts = session.xenapi.host.get_all_records() + for record in hosts.itervalues(): + addresses.append(record['address']) + return addresses def get_this_host_ref(session): host_uuid = get_this_host() @@ -1955,3 +1944,95 @@ def sessions_less_than_targets(other_config, device_config): return (sessions < targets) else: return False + + +def enable_and_start_service(name, start): + attempt = 0 + while True: + attempt += 1 + fn = 'enable' if start else 'disable' + args = ('systemctl', fn, '--now', name) + (ret, out, err) = doexec(args) + if ret == 0: + return + elif attempt >= 3: + raise Exception( + 'Failed to {} {}: {} {}'.format(fn, name, out, err) + ) + time.sleep(1) + + +def stop_service(name): + args = ('systemctl', 'stop', name) + (ret, out, err) = doexec(args) + if ret == 0: + return + raise Exception('Failed to stop {}: {} {}'.format(name, out, err)) + + +def restart_service(name): + attempt = 0 + while True: + attempt += 1 + SMlog('Restarting service {} {}...'.format(name, attempt)) + args = ('systemctl', 'restart', name) + (ret, out, err) = doexec(args) + if ret == 0: + return + elif attempt >= 3: + SMlog('Restart service FAILED {} {}'.format(name, attempt)) + raise Exception( + 'Failed to restart {}: {} {}'.format(name, out, err) + ) + time.sleep(1) + + +def check_pid_exists(pid): + try: 
+ os.kill(pid, 0) + except OSError: + return False + else: + return True + + +def make_profile(name, function): + """ + Helper to execute cProfile using unique log file. + """ + + import cProfile + import itertools + import os.path + import time + + assert name + assert function + + FOLDER = '/tmp/sm-perfs/' + makedirs(FOLDER) + + filename = time.strftime('{}_%Y%m%d_%H%M%S.prof'.format(name)) + + def gen_path(path): + yield path + root, ext = os.path.splitext(path) + for i in itertools.count(start=1, step=1): + yield root + '.{}.'.format(i) + ext + + for profile_path in gen_path(FOLDER + filename): + try: + file = open_atomic(profile_path, 'w') + file.close() + break + except OSError as e: + if e.errno == errno.EEXIST: + pass + else: + raise + + try: + SMlog('* Start profiling of {} ({}) *'.format(name, filename)) + cProfile.runctx('function()', None, locals(), profile_path) + finally: + SMlog('* End profiling of {} ({}) *'.format(name, filename)) diff --git a/drivers/vhdutil.py b/drivers/vhdutil.py index 3a3027e70..279786ea7 100755 --- a/drivers/vhdutil.py +++ b/drivers/vhdutil.py @@ -93,13 +93,16 @@ def ioretry(cmd, text=True): errlist=[errno.EIO, errno.EAGAIN]) -def getVHDInfo(path, extractUuidFunction, includeParent=True): +def getVHDInfo(path, extractUuidFunction, includeParent=True, resolveParent=True): """Get the VHD info. The parent info may optionally be omitted: vhd-util tries to verify the parent by opening it, which results in error if the VHD resides on an inactive LV""" opts = "-vsf" if includeParent: opts += "p" + if not resolveParent: + opts += "u" + cmd = [VHD_UTIL, "query", OPT_LOG_ERR, opts, "-n", path] ret = ioretry(cmd) fields = ret.strip().split('\n') diff --git a/etc/systemd/system/linstor-satellite.service.d/override.conf b/etc/systemd/system/linstor-satellite.service.d/override.conf new file mode 100644 index 000000000..b1686b4f3 --- /dev/null +++ b/etc/systemd/system/linstor-satellite.service.d/override.conf @@ -0,0 +1,5 @@ +[Service] +Environment=LS_KEEP_RES=^xcp-persistent* + +[Unit] +After=drbd.service diff --git a/etc/systemd/system/var-lib-linstor.service b/etc/systemd/system/var-lib-linstor.service new file mode 100644 index 000000000..e9deb9042 --- /dev/null +++ b/etc/systemd/system/var-lib-linstor.service @@ -0,0 +1,21 @@ +# Regarding the current version of systemd (v.219) used in XCP-ng, we can't use +# the ReadWriteOnly option (to apply the -w flag, it's not the same than -o rw). +# This file is a workaround to avoid RO. It must be replaced with the code below +# in a mount unit. Compatible with version >= 246. +# +# [Unit] +# Description=Filesystem for the LINSTOR controller +# +# [Mount] +# What=/dev/drbd/by-res/xcp-persistent-database/0 +# Where=/var/lib/linstor +# ReadWriteOnly=true + +[Unit] +Description=Mount filesystem for the LINSTOR controller + +[Service] +Type=oneshot +ExecStart=/bin/mount -w /dev/drbd/by-res/xcp-persistent-database/0 /var/lib/linstor +ExecStop=/opt/xensource/libexec/safe-umount /var/lib/linstor +RemainAfterExit=true diff --git a/linstor/linstor-monitord.c b/linstor/linstor-monitord.c index 8161813d7..47740598c 100644 --- a/linstor/linstor-monitord.c +++ b/linstor/linstor-monitord.c @@ -14,8 +14,10 @@ * along with this program. If not, see . */ +#include #include #include +#include #include #include #include @@ -39,7 +41,8 @@ #define POOL_CONF_ABS_FILE POOL_CONF_DIR "/" POOL_CONF_FILE // In milliseconds. 
-#define POLL_TIMEOUT 2000 +#define UPDATE_LINSTOR_NODE_TIMEOUT 2000 +#define SR_SCAN_TIMEOUT 720000 // ----------------------------------------------------------------------------- @@ -130,24 +133,120 @@ static inline int isMasterHost (int *error) { typedef struct { int inotifyFd; + struct timespec lastScanTime; + int isMaster; // TODO: Should be completed with at least a hostname field. } State; // ----------------------------------------------------------------------------- -static inline int execCommand (char *argv[]) { +typedef struct { + char *data; + size_t size; + size_t capacity; +} Buffer; + +#define max(a, b) ({ \ + __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a > _b ? _a : _b; \ +}) + +static inline ssize_t readAll (int fd, Buffer *buffer) { + assert(buffer->capacity >= buffer->size); + + ssize_t ret = 0; + do { + size_t byteCount = buffer->capacity - buffer->size; + if (byteCount < 16) { + const size_t newCapacity = max(buffer->capacity << 1, 64); + char *p = realloc(buffer->data, newCapacity); + if (!p) + return -errno; + + buffer->data = p; + buffer->capacity = newCapacity; + + byteCount = buffer->capacity - buffer->size; + } + + ret = read(fd, buffer->data + buffer->size, byteCount); + if (ret > 0) + buffer->size += ret; + else if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) + ret = 0; + } while (ret > 0); + + return ret; +} + +// ----------------------------------------------------------------------------- + +static inline int execCommand (char *argv[], Buffer *buffer) { + int pipefd[2]; + if (buffer) { + if (pipe(pipefd) < 0) { + syslog(LOG_ERR, "Failed to exec pipe: `%s`.", strerror(errno)); + return -errno; + } + + if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) < 0) { + syslog(LOG_ERR, "Failed to exec fcntl on pipe in: `%s`.", strerror(errno)); + close(pipefd[0]); + close(pipefd[1]); + return -errno; + } + } + const pid_t pid = fork(); - if (pid < 0) + if (pid < 0) { + syslog(LOG_ERR, "Failed to fork: `%s`.", strerror(errno)); + if (buffer) { + close(pipefd[0]); + close(pipefd[1]); + } return -errno; + } // Child process. if (pid == 0) { + if (buffer) { + close(STDOUT_FILENO); + dup(pipefd[1]); + + close(pipefd[0]); + close(pipefd[1]); + } + if (execvp(*argv, argv) < 0) syslog(LOG_ERR, "Failed to exec `%s` command.", *argv); exit(EXIT_FAILURE); } // Main process. + int ret = 0; + if (buffer) { + close(pipefd[1]); + + do { + struct pollfd fds = { pipefd[0], POLLIN | POLLHUP, 0 }; + const int res = poll(&fds, 1, 0); + if (res < 0) { + if (errno == EAGAIN) + continue; + syslog(LOG_ERR, "Failed to poll from command: `%s`.", strerror(errno)); + ret = -errno; + } else if (res > 0) { + if (fds.revents & POLLIN) + ret = readAll(pipefd[0], buffer); + if (fds.revents & POLLHUP) + break; // Input has been closed. 
+ } + } while (ret >= 0); + + close(pipefd[0]); + } + int status; if (waitpid(pid, &status, 0) < 0) { syslog(LOG_ERR, "Failed to wait command: `%s`.", *argv); @@ -163,7 +262,7 @@ static inline int execCommand (char *argv[]) { } else if (WIFSIGNALED(status)) syslog(LOG_ERR, "`%s` terminated by signal %d.", *argv, WTERMSIG(status)); - return 0; + return ret; } // ----------------------------------------------------------------------------- @@ -188,23 +287,6 @@ static inline int addInotifyWatch (int inotifyFd, const char *filepath, uint32_t // ----------------------------------------------------------------------------- -static inline int updateLinstorServices () { - int error; - const int isMaster = isMasterHost(&error); - if (error) - return error; - - syslog(LOG_INFO, "%s linstor-controller...", isMaster ? "Enabling" : "Disabling"); - char *argv[] = { - "systemctl", - isMaster ? "enable" : "disable", - "--now", - "linstor-controller", - NULL - }; - return execCommand(argv); -} - static inline int updateLinstorNode (State *state) { char buffer[256]; if (gethostname(buffer, sizeof buffer) == -1) { @@ -219,14 +301,53 @@ static inline int updateLinstorNode (State *state) { // ----------------------------------------------------------------------------- +#define UUID_PARAM "uuid=" +#define UUID_PARAM_LEN (sizeof(UUID_PARAM) - 1) +#define UUID_LENGTH 36 + +static inline void scanLinstorSr (const char *uuid) { + char uuidBuf[UUID_LENGTH + UUID_PARAM_LEN + 1] = UUID_PARAM; + strncpy(uuidBuf + UUID_PARAM_LEN, uuid, UUID_LENGTH); + uuidBuf[UUID_LENGTH + UUID_PARAM_LEN] = '\0'; + execCommand((char *[]){ "xe", "sr-scan", uuidBuf, NULL }, NULL); +} + +// Called to update the physical/virtual size used by LINSTOR SRs in XAPI DB. +static inline int scanLinstorSrs () { + Buffer srs = {}; + const int ret = execCommand((char *[]){ "xe", "sr-list", "type=linstor", "--minimal", NULL }, &srs); + if (ret) { + free(srs.data); + return ret; + } + + const char *end = srs.data + srs.size; + char *pos = srs.data; + for (char *off; (off = memchr(pos, ',', end - pos)); pos = off + 1) + if (off - pos == UUID_LENGTH) + scanLinstorSr(pos); + + if (end - pos >= UUID_LENGTH) { + for (--end; end - pos >= UUID_LENGTH && isspace(*end); --end) {} + if (isalnum(*end)) + scanLinstorSr(pos); + } + + free(srs.data); + + return 0; +} + +// ----------------------------------------------------------------------------- + #define PROCESS_MODE_DEFAULT 0 #define PROCESS_MODE_WAIT_FILE_CREATION 1 static inline int waitForPoolConfCreation (State *state, int *wdFile); -static inline int processPoolConfEvents (int inotifyFd, int wd, char **buffer, size_t *bufferSize, int mode, int *process) { +static inline int processPoolConfEvents (State *state, int wd, char **buffer, size_t *bufferSize, int mode, int *process) { size_t size = 0; - if (ioctl(inotifyFd, FIONREAD, (char *)&size) == -1) { + if (ioctl(state->inotifyFd, FIONREAD, (char *)&size) == -1) { syslog(LOG_ERR, "Failed to get buffer size from inotify descriptor: `%s`.", strerror(errno)); return -errno; } @@ -241,7 +362,7 @@ static inline int processPoolConfEvents (int inotifyFd, int wd, char **buffer, s *bufferSize = size; } - if ((size = (size_t)read(inotifyFd, *buffer, size)) == (size_t)-1) { + if ((size = (size_t)read(state->inotifyFd, *buffer, size)) == (size_t)-1) { syslog(LOG_ERR, "Failed to read buffer from inotify descriptor: `%s`.", strerror(errno)); return -errno; } @@ -280,10 +401,9 @@ static inline int processPoolConfEvents (int inotifyFd, int wd, char **buffer, s 
syslog(LOG_INFO, "Updating linstor services... (Inotify mask=%" PRIu32 ")", mask); if (mask & (IN_DELETE_SELF | IN_MOVE_SELF | IN_UNMOUNT)) { syslog(LOG_ERR, "Watched `" POOL_CONF_ABS_FILE "` file has been removed!"); - inotify_rm_watch(inotifyFd, wd); // Do not forget to remove watch to avoid leaks. + inotify_rm_watch(state->inotifyFd, wd); // Do not forget to remove watch to avoid leaks. return -EIO; } - ret = updateLinstorServices(); } else { if (mask & (IN_CREATE | IN_MOVED_TO)) { syslog(LOG_ERR, "Watched `" POOL_CONF_ABS_FILE "` file has been recreated!"); @@ -303,16 +423,24 @@ static inline int waitAndProcessEvents (State *state, int wd, int mode) { struct timespec previousTime = getCurrentTime(); do { - struct timespec currentTime = getCurrentTime(); + const struct timespec currentTime = getCurrentTime(); const int64_t elapsedTime = convertToMilliseconds(getTimeDiff(¤tTime, &previousTime)); int timeout; - if (elapsedTime >= POLL_TIMEOUT) { + if (elapsedTime >= UPDATE_LINSTOR_NODE_TIMEOUT) { updateLinstorNode(state); - timeout = POLL_TIMEOUT; + timeout = UPDATE_LINSTOR_NODE_TIMEOUT; previousTime = getCurrentTime(); } else { - timeout = POLL_TIMEOUT - elapsedTime; + timeout = UPDATE_LINSTOR_NODE_TIMEOUT - elapsedTime; + } + + const int64_t elapsedScanTime = convertToMilliseconds(getTimeDiff(¤tTime, &state->lastScanTime)); + if (elapsedScanTime >= SR_SCAN_TIMEOUT) { + state->isMaster = isMasterHost(&ret); + if (state->isMaster) + scanLinstorSrs(); + state->lastScanTime = getCurrentTime(); } struct pollfd fds = { state->inotifyFd, POLLIN, 0 }; @@ -323,7 +451,9 @@ static inline int waitAndProcessEvents (State *state, int wd, int mode) { syslog(LOG_ERR, "Failed to poll from inotify descriptor: `%s`.", strerror(errno)); ret = -errno; } else if (res > 0) { - ret = processPoolConfEvents(state->inotifyFd, wd, &buffer, &bufferSize, mode, &process); + state->isMaster = isMasterHost(&ret); + if (!ret) + ret = processPoolConfEvents(state, wd, &buffer, &bufferSize, mode, &process); } } while (ret >= 0 && process); @@ -350,7 +480,8 @@ static inline int waitForPoolConfCreation (State *state, int *wdFile) { do { do { // Update LINSTOR services... - ret = updateLinstorServices(); + int ret; + state->isMaster = isMasterHost(&ret); // Ok we can't read the pool configuration file. // Maybe the file doesn't exist. Waiting its creation... @@ -378,7 +509,9 @@ int main (int argc, char *argv[]) { setlogmask(LOG_UPTO(LOG_INFO)); State state = { - .inotifyFd = -1 + .inotifyFd = -1, + .lastScanTime = getCurrentTime(), + .isMaster = 0 }; const int inotifyFd = createInotifyInstance(); diff --git a/scripts/fork-log-daemon b/scripts/fork-log-daemon new file mode 100755 index 000000000..665a60baf --- /dev/null +++ b/scripts/fork-log-daemon @@ -0,0 +1,36 @@ +#!/usr/bin/env python + +import select +import signal +import subprocess +import sys +import syslog + +def main(): + process = subprocess.Popen(sys.argv[1:], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + signal.signal(signal.SIGTERM, signal.SIG_IGN) + write_to_stdout = True + + while process.poll() is None: + while True: + output = process.stdout.readline() + if not output: + break + + if write_to_stdout: + try: + print(output) + sys.stdout.flush() + except Exception: + # Probably a broken pipe. So the process reading stdout is dead. 
+ write_to_stdout = False + syslog.syslog(output) + +if __name__ == "__main__": + syslog.openlog(ident=sys.argv[1], facility=syslog.LOG_DAEMON) + try: + main() + except Exception as e: + syslog.syslog(sys.argv[1] + ' terminated with exception: {}'.format(e)) + finally: + syslog.syslog(sys.argv[1] + ' is now terminated!') diff --git a/scripts/linstor-kv-tool b/scripts/linstor-kv-tool new file mode 100755 index 000000000..c9070270c --- /dev/null +++ b/scripts/linstor-kv-tool @@ -0,0 +1,78 @@ +#!/usr/bin/env python +# +# Copyright (C) 2022 Vates SAS +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import argparse +import json +import linstor + + +def dump_kv(controller_uri, group_name, namespace): + kv = linstor.KV( + group_name, + uri=controller_uri, + namespace=namespace + ) + print(json.dumps(kv, sort_keys=True, indent=2)) + + +def remove_volume(controller_uri, group_name, vdi_name): + assert vdi_name + kv = linstor.KV( + group_name, + uri=controller_uri, + namespace='/xcp/volume/{}'.format(vdi_name) + ) + + for key, value in list(kv.items()): + del kv[key] + + +def remove_all_volumes(controller_uri, group_name): + kv = linstor.KV( + group_name, + uri=controller_uri, + namespace='/' + ) + + for key, value in list(kv.items()): + if key.startswith('xcp/volume/') or key.startswith('xcp/sr/journal/'): + size = key.rindex('/') + kv.namespace = key[:size] + del kv[key[size + 1:]] + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-u', '--uri', required=True) + parser.add_argument('-g', '--group-name', required=True) + parser.add_argument('-n', '--namespace', default='/') + + action = parser.add_mutually_exclusive_group(required=True) + action.add_argument('--dump-volumes', action='store_true') + action.add_argument('--remove-volume', metavar='VDI_UUID') + action.add_argument('--remove-all-volumes', action='store_true') + + args = parser.parse_args() + if args.dump_volumes: + dump_kv(args.uri, args.group_name, args.namespace) + elif args.remove_volume: + remove_volume(args.uri, args.group_name, args.remove_volume) + elif args.remove_all_volumes: + remove_all_volumes(args.uri, args.group_name) + + +if __name__ == '__main__': + main() diff --git a/scripts/safe-umount b/scripts/safe-umount new file mode 100755 index 000000000..9c1dcc400 --- /dev/null +++ b/scripts/safe-umount @@ -0,0 +1,39 @@ +#!/usr/bin/env python2 + +import argparse +import subprocess +import sys +import time + + +def safe_umount(path): + retry_count = 10 + not_mounted_str = 'umount: {}: not mounted'.format(path) + + last_code = 0 + while retry_count: + proc = subprocess.Popen(['mountpoint', '-q', path]) + proc.wait() + if proc.returncode: + return 0 + + proc = subprocess.Popen(['umount', path], stderr=subprocess.PIPE) + (stdout, stderr) = proc.communicate() + if not proc.returncode: + return 0 + + error = stderr.strip() + if error == not_mounted_str: + return 0 + + retry_count -= 1 + last_code = proc.returncode + time.sleep(0.500) + return 
last_code + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('path') + args = parser.parse_args() + sys.exit(safe_umount(args.path)) diff --git a/tests/test_on_slave.py b/tests/test_on_slave.py index 1aad3639a..d6f57130a 100644 --- a/tests/test_on_slave.py +++ b/tests/test_on_slave.py @@ -13,7 +13,15 @@ class Test_on_slave_is_open(unittest.TestCase): - MOCK_IMPORTS = ['SRCommand', 'SR', 'NFSSR', 'EXTSR', 'LVHDSR', 'blktap2'] + MOCK_IMPORTS = [ + 'SRCommand', + 'SR', + 'NFSSR', + 'EXTSR', + 'LVHDSR', + 'LinstorSR', + 'blktap2' + ] def fake_import(self, name, *args): print('Asked to import {}'.format(name))
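
As a usage note for the new `linstor-kv-tool` script: the volume metadata it manages can also be inspected programmatically through the same `linstor.KV` API the script builds on. Below is a minimal sketch mirroring the `--dump-volumes` action; the controller URI and group name are placeholders and `python-linstor` must be installed on the host running it:

```python
#!/usr/bin/env python
# Sketch: dump the 'xcp/volume/...' entries of a LINSTOR KV store, similar
# to `linstor-kv-tool --dump-volumes`. URI and group name are placeholders.
import json
import linstor

CONTROLLER_URI = 'linstor://192.168.1.1'   # placeholder
GROUP_NAME = 'xcp-sr-linstor_group'        # placeholder


def dump_volumes(uri, group_name):
    # Open the KV store at the root namespace and keep only volume entries,
    # the same filter the tool applies.
    kv = linstor.KV(group_name, uri=uri, namespace='/')
    return {
        key: value for key, value in kv.items()
        if key.startswith('xcp/volume/')
    }


if __name__ == '__main__':
    print(json.dumps(dump_volumes(CONTROLLER_URI, GROUP_NAME),
                     sort_keys=True, indent=2))
```

The equivalent CLI invocation, given the options defined above, would be `linstor-kv-tool --dump-volumes -u linstor://<controller-address> -g <group-name>`.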