from __future__ import with_statement
import ConfigParser
import copy
import errno
import glob
import logging
import os
import pprint
import re
import shutil
import signal
import subprocess
import sys
import tempfile
import thread
import threading
import time
import traceback
import types
import unittest.case
from collections import OrderedDict
from subprocess import CalledProcessError
from unittest import TestCase
import cassandra
import ccmlib.repository
from cassandra import ConsistencyLevel
from cassandra.auth import PlainTextAuthProvider
from cassandra.cluster import Cluster as PyCluster
from cassandra.cluster import NoHostAvailable
from cassandra.cluster import ExecutionProfile, EXEC_PROFILE_DEFAULT
from cassandra.policies import RetryPolicy, WhiteListRoundRobinPolicy
from ccmlib.cluster import Cluster
from ccmlib.cluster_factory import ClusterFactory
from ccmlib.common import get_version_from_build, is_win
from nose.exc import SkipTest
from nose.tools import assert_greater_equal
from six import print_
from plugins.dtestconfig import _CONFIG as CONFIG
# We don't want test files to know about the plugins module, so we import
# constants here and re-export them.
from plugins.dtestconfig import GlobalConfigObject
from tools.context import log_filter
from tools.funcutils import merge_dicts
LOG_SAVED_DIR = "logs"
try:
os.mkdir(LOG_SAVED_DIR)
except OSError:
pass
LAST_LOG = os.path.join(LOG_SAVED_DIR, "last")
LAST_TEST_DIR = 'last_test_dir'
DEFAULT_DIR = './'
config = ConfigParser.RawConfigParser()
if len(config.read(os.path.expanduser('~/.cassandra-dtest'))) > 0:
if config.has_option('main', 'default_dir'):
DEFAULT_DIR = os.path.expanduser(config.get('main', 'default_dir'))
CASSANDRA_DIR = os.environ.get('CASSANDRA_DIR', DEFAULT_DIR)
NO_SKIP = os.environ.get('SKIP', '').lower() in ('no', 'false')
DEBUG = os.environ.get('DEBUG', '').lower() in ('yes', 'true')
TRACE = os.environ.get('TRACE', '').lower() in ('yes', 'true')
KEEP_LOGS = os.environ.get('KEEP_LOGS', '').lower() in ('yes', 'true')
KEEP_TEST_DIR = os.environ.get('KEEP_TEST_DIR', '').lower() in ('yes', 'true')
PRINT_DEBUG = os.environ.get('PRINT_DEBUG', '').lower() in ('yes', 'true')
OFFHEAP_MEMTABLES = os.environ.get('OFFHEAP_MEMTABLES', '').lower() in ('yes', 'true')
NUM_TOKENS = os.environ.get('NUM_TOKENS', '256')
RECORD_COVERAGE = os.environ.get('RECORD_COVERAGE', '').lower() in ('yes', 'true')
IGNORE_REQUIRE = os.environ.get('IGNORE_REQUIRE', '').lower() in ('yes', 'true')
DATADIR_COUNT = os.environ.get('DATADIR_COUNT', '3')
ENABLE_ACTIVE_LOG_WATCHING = os.environ.get('ENABLE_ACTIVE_LOG_WATCHING', '').lower() in ('yes', 'true')
RUN_STATIC_UPGRADE_MATRIX = os.environ.get('RUN_STATIC_UPGRADE_MATRIX', '').lower() in ('yes', 'true')
# default values for configuration from the configuration plugin
_default_config = GlobalConfigObject(
vnodes=True,
)
if CONFIG is None:
CONFIG = _default_config
DISABLE_VNODES = not CONFIG.vnodes
if os.environ.get('DISABLE_VNODES', '').lower() in ('yes', 'true'):
print 'DISABLE_VNODES environment variable deprecated. Use `./run_dtests.py --vnodes false` instead.'
CURRENT_TEST = ""
logging.basicConfig(filename=os.path.join(LOG_SAVED_DIR, "dtest.log"),
filemode='w',
format='%(asctime)s,%(msecs)d %(name)s %(current_test)s %(levelname)s %(message)s',
datefmt='%H:%M:%S',
level=logging.DEBUG)
LOG = logging.getLogger('dtest')
# set python-driver log level to INFO by default for dtest
logging.getLogger('cassandra').setLevel(logging.INFO)
def get_sha(repo_dir):
try:
output = subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=repo_dir).strip()
prefix = 'github:apache/'
local_repo_location = os.environ.get('LOCAL_GIT_REPO')
if local_repo_location is not None:
prefix = 'local:{}:'.format(local_repo_location) # local: slugs take the form 'local:/some/path/to/cassandra/:branch_name_or_sha'
return "{}{}".format(prefix, output)
except CalledProcessError as e:
if re.search('Not a git repository', e.message) is not None:
# we tried to get a sha, but repo_dir isn't a git repo. No big deal, must just be working from a non-git install.
return None
else:
# git call failed for some unknown reason
raise
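# Illustrative return values (hypothetical sha): without LOCAL_GIT_REPO set,
#   get_sha('/path/to/cassandra') -> 'github:apache/8b1336f0d...'
# and with LOCAL_GIT_REPO=/repos/cassandra,
#   get_sha('/repos/cassandra')   -> 'local:/repos/cassandra:8b1336f0d...'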
# There are times when we want to know the C* version we're testing against
# before we call Tester.setUp. In the general case, we can't know that -- the
# test method could use any version it wants for self.cluster. However, we can
# get the version from build.xml in the C* repository specified by
# CASSANDRA_VERSION or CASSANDRA_DIR. This should use the same resolution
# strategy as the actual checkout code in Tester.setUp; if it does not, that is
# a bug.
_cassandra_version_slug = os.environ.get('CASSANDRA_VERSION')
# Prefer CASSANDRA_VERSION if it's set in the environment. If not, use CASSANDRA_DIR
if _cassandra_version_slug:
# fetch but don't build the specified C* version
ccm_repo_cache_dir, _ = ccmlib.repository.setup(_cassandra_version_slug)
CASSANDRA_VERSION_FROM_BUILD = get_version_from_build(ccm_repo_cache_dir)
CASSANDRA_GITREF = get_sha(ccm_repo_cache_dir) # will be set None when not a git repo
else:
CASSANDRA_VERSION_FROM_BUILD = get_version_from_build(CASSANDRA_DIR)
CASSANDRA_GITREF = get_sha(CASSANDRA_DIR)
# Determine the location of the libjemalloc shared library so that we can
# specify it through environment variables when starting Cassandra. This
# reduces startup time, making the dtests run faster.
def find_libjemalloc():
if is_win():
# let the normal bat script handle finding libjemalloc
return ""
this_dir = os.path.dirname(os.path.realpath(__file__))
script = os.path.join(this_dir, "findlibjemalloc.sh")
try:
p = subprocess.Popen([script], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate()
if stderr or not stdout:
return "-" # tells C* not to look for libjemalloc
else:
return stdout
except Exception as exc:
print "Failed to run script to prelocate libjemalloc ({}): {}".format(script, exc)
return ""
CASSANDRA_LIBJEMALLOC = find_libjemalloc()
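# CASSANDRA_LIBJEMALLOC takes one of three shapes: a path to the shared
# library (passed through to C*), "-" (tells C* not to look for libjemalloc),
# or "" (defer to Cassandra's own startup scripts, e.g. on Windows).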
# copy the initial environment variables so we can reset them later:
initial_environment = copy.deepcopy(os.environ)
class DtestTimeoutError(Exception):
pass
def reset_environment_vars():
os.environ.clear()
os.environ.update(initial_environment)
def warning(msg):
LOG.warning(msg, extra={"current_test": CURRENT_TEST})
if PRINT_DEBUG:
print "WARN: " + msg
def debug(msg):
LOG.debug(msg, extra={"current_test": CURRENT_TEST})
if PRINT_DEBUG:
print msg
debug("Python driver version in use: {}".format(cassandra.__version__))
def retry_till_success(fun, *args, **kwargs):
timeout = kwargs.pop('timeout', 60)
bypassed_exception = kwargs.pop('bypassed_exception', Exception)
deadline = time.time() + timeout
while True:
try:
return fun(*args, **kwargs)
except bypassed_exception:
if time.time() > deadline:
raise
else:
# brief pause before next attempt
time.sleep(0.25)
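# Example usage (illustrative; `ping` is a hypothetical helper). Positional
# args are forwarded to the function; 'timeout' and 'bypassed_exception' are
# consumed by the retry loop itself:
#   def ping(session):
#       return session.execute("SELECT release_version FROM system.local")
#   retry_till_success(ping, session, timeout=30, bypassed_exception=NoHostAvailable)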
class FlakyRetryPolicy(RetryPolicy):
"""
A retry policy that retries 5 times by default, but can be configured to
retry more times.
"""
def __init__(self, max_retries=5):
self.max_retries = max_retries
def on_read_timeout(self, *args, **kwargs):
if kwargs['retry_num'] < self.max_retries:
debug("Retrying read after timeout. Attempt #" + str(kwargs['retry_num']))
return (self.RETRY, None)
else:
return (self.RETHROW, None)
def on_write_timeout(self, *args, **kwargs):
if kwargs['retry_num'] < self.max_retries:
debug("Retrying write after timeout. Attempt #" + str(kwargs['retry_num']))
return (self.RETRY, None)
else:
return (self.RETHROW, None)
def on_unavailable(self, *args, **kwargs):
if kwargs['retry_num'] < self.max_retries:
debug("Retrying request after UE. Attempt #" + str(kwargs['retry_num']))
return (self.RETRY, None)
else:
return (self.RETHROW, None)
class Runner(threading.Thread):
def __init__(self, func):
threading.Thread.__init__(self)
self.__func = func
self.__error = None
self.__stopped = False
self.daemon = True
def run(self):
i = 0
while True:
if self.__stopped:
return
try:
self.__func(i)
except Exception as e:
self.__error = e
return
i = i + 1
def stop(self):
self.__stopped = True
self.join()
if self.__error is not None:
raise self.__error
def check(self):
if self.__error is not None:
raise self.__error
def make_execution_profile(retry_policy=FlakyRetryPolicy(), consistency_level=ConsistencyLevel.ONE, **kwargs):
return ExecutionProfile(retry_policy=retry_policy,
consistency_level=consistency_level,
**kwargs)
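# A sketch (illustrative values) of how the pieces above combine: build a
# profile that retries flaky operations more aggressively and reads at QUORUM.
#   profile = make_execution_profile(retry_policy=FlakyRetryPolicy(max_retries=10),
#                                    consistency_level=ConsistencyLevel.QUORUM)
#   profiles = {EXEC_PROFILE_DEFAULT: profile}
# Tester._create_session accepts such a dict via its execution_profiles parameter.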
class Tester(TestCase):
maxDiff = None
allow_log_errors = False # scan the log of each node for errors after every test.
cluster_options = None
def set_node_to_current_version(self, node):
version = os.environ.get('CASSANDRA_VERSION')
cdir = CASSANDRA_DIR
if version:
node.set_install_dir(version=version)
else:
node.set_install_dir(install_dir=cdir)
def init_config(self):
init_default_config(self.cluster, self.cluster_options)
def setUp(self):
self.set_current_tst_name()
kill_windows_cassandra_procs()
maybe_cleanup_cluster_from_last_test_file()
self.test_path = get_test_path()
self.cluster = create_ccm_cluster(self.test_path, name='test')
self.maybe_begin_active_log_watch()
maybe_setup_jacoco(self.test_path)
self.init_config()
write_last_test_file(self.test_path, self.cluster)
set_log_levels(self.cluster)
self.connections = []
self.runners = []
# this is intentionally spelled 'tst' instead of 'test' to avoid
# making unittest think it's a test method
def set_current_tst_name(self):
global CURRENT_TEST
CURRENT_TEST = self.id() + self._testMethodName
def maybe_begin_active_log_watch(self):
if ENABLE_ACTIVE_LOG_WATCHING:
if not self.allow_log_errors:
self.begin_active_log_watch()
def begin_active_log_watch(self):
"""
Calls into ccm to start actively watching logs.
In the event that errors are seen in logs, ccm will call back to _log_error_handler.
When the cluster is no longer in use, stop_active_log_watch should be called to end log watching.
(otherwise a 'daemon' thread will (needlessly) run until the process exits).
"""
# log watching happens in another thread, but we want it to halt the main
# thread's execution, which we have to do by registering a signal handler
signal.signal(signal.SIGINT, self._catch_interrupt)
self._log_watch_thread = self.cluster.actively_watch_logs_for_error(self._log_error_handler, interval=0.25)
def _log_error_handler(self, errordata):
"""
Callback handler used in conjunction with begin_active_log_watch.
When called, prepares exception instance, then will indirectly
cause _catch_interrupt to be called, which can raise the exception in the main
program thread.
        @param errordata is a dictionary mapping node name to failure list.
"""
# in some cases self.allow_log_errors may get set after proactive log checking has been enabled
# so we need to double-check first thing before proceeding
if self.allow_log_errors:
return
reportable_errordata = OrderedDict()
for nodename, errors in errordata.items():
filtered_errors = list(self.__filter_errors(['\n'.join(msg) for msg in errors]))
            if len(filtered_errors) != 0:
reportable_errordata[nodename] = filtered_errors
# no errors worthy of halting the test
if not reportable_errordata:
return
message = "Errors seen in logs for: {nodes}".format(nodes=", ".join(reportable_errordata.keys()))
for nodename, errors in reportable_errordata.items():
for error in errors:
message += "\n{nodename}: {error}".format(nodename=nodename, error=error)
try:
debug('Errors were just seen in logs, ending test (if not ending already)!')
print_("Error details: \n{message}".format(message=message))
self.test_is_ending # will raise AttributeError if not present
except AttributeError:
self.test_is_ending = True
self.exit_with_exception = AssertionError("Log error encountered during active log scanning, see stdout")
# thread.interrupt_main will SIGINT in the main thread, which we can
# catch to raise an exception with useful information
thread.interrupt_main()
"""
Finds files matching the glob pattern specified as argument on
the given keyspace in all nodes
"""
def glob_data_dirs(self, path, ks="ks"):
result = []
for node in self.cluster.nodelist():
for data_dir in node.data_directories():
ks_dir = os.path.join(data_dir, ks, path)
result.extend(glob.glob(ks_dir))
return result
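    # Example (illustrative directory pattern): collect every sstable data
    # file for keyspace 'ks' across all nodes and data directories:
    #   paths = self.glob_data_dirs(os.path.join('*', '*-Data.db'))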
def _catch_interrupt(self, signal, frame):
"""
Signal handler for registering on SIGINT.
If called will look for a stored exception and raise it to abort test.
If a stored exception is not present, this handler has likely caught a
user interrupt via CTRL-C, and will raise a KeyboardInterrupt.
"""
try:
# check if we have a persisted exception to fail with
raise self.exit_with_exception
except AttributeError:
# looks like this was just a plain CTRL-C event
raise KeyboardInterrupt()
def copy_logs(self, cluster, directory=None, name=None):
"""Copy the current cluster's log files somewhere, by default to LOG_SAVED_DIR with a name of 'last'"""
if directory is None:
directory = LOG_SAVED_DIR
if name is None:
name = LAST_LOG
else:
name = os.path.join(directory, name)
if not os.path.exists(directory):
os.mkdir(directory)
logs = [(node.name, node.logfilename(), node.debuglogfilename(), node.gclogfilename(), node.compactionlogfilename())
for node in self.cluster.nodes.values()]
        if len(logs) != 0:
basedir = str(int(time.time() * 1000)) + '_' + self.id()
logdir = os.path.join(directory, basedir)
os.mkdir(logdir)
for n, log, debuglog, gclog, compactionlog in logs:
if os.path.exists(log):
self.assertGreaterEqual(os.path.getsize(log), 0)
shutil.copyfile(log, os.path.join(logdir, n + ".log"))
if os.path.exists(debuglog):
self.assertGreaterEqual(os.path.getsize(debuglog), 0)
shutil.copyfile(debuglog, os.path.join(logdir, n + "_debug.log"))
if os.path.exists(gclog):
self.assertGreaterEqual(os.path.getsize(gclog), 0)
shutil.copyfile(gclog, os.path.join(logdir, n + "_gc.log"))
if os.path.exists(compactionlog):
self.assertGreaterEqual(os.path.getsize(compactionlog), 0)
shutil.copyfile(compactionlog, os.path.join(logdir, n + "_compaction.log"))
if os.path.exists(name):
os.unlink(name)
if not is_win():
os.symlink(basedir, name)
def cql_connection(self, node, keyspace=None, user=None,
password=None, compression=True, protocol_version=None, port=None, ssl_opts=None, **kwargs):
return self._create_session(node, keyspace, user, password, compression,
protocol_version, port=port, ssl_opts=ssl_opts, **kwargs)
def exclusive_cql_connection(self, node, keyspace=None, user=None,
password=None, compression=True, protocol_version=None, port=None, ssl_opts=None, **kwargs):
node_ip = get_ip_from_node(node)
wlrr = WhiteListRoundRobinPolicy([node_ip])
return self._create_session(node, keyspace, user, password, compression,
protocol_version, port=port, ssl_opts=ssl_opts, load_balancing_policy=wlrr, **kwargs)
def _create_session(self, node, keyspace, user, password, compression, protocol_version,
port=None, ssl_opts=None, execution_profiles=None, **kwargs):
node_ip = get_ip_from_node(node)
if not port:
port = get_port_from_node(node)
if protocol_version is None:
protocol_version = get_eager_protocol_version(node.cluster.version())
if user is not None:
auth_provider = get_auth_provider(user=user, password=password)
else:
auth_provider = None
profiles = {EXEC_PROFILE_DEFAULT: make_execution_profile(**kwargs)
} if not execution_profiles else execution_profiles
cluster = PyCluster([node_ip],
auth_provider=auth_provider,
compression=compression,
protocol_version=protocol_version,
port=port,
ssl_options=ssl_opts,
connect_timeout=10,
allow_beta_protocol_version=True,
execution_profiles=profiles)
session = cluster.connect(wait_for_all_pools=True)
if keyspace is not None:
session.set_keyspace(keyspace)
self.connections.append(session)
return session
def patient_cql_connection(self, node, keyspace=None,
user=None, password=None, timeout=30, compression=True,
protocol_version=None, port=None, ssl_opts=None, **kwargs):
"""
Returns a connection after it stops throwing NoHostAvailables due to not being ready.
If the timeout is exceeded, the exception is raised.
"""
if is_win():
timeout *= 2
expected_log_lines = ('Control connection failed to connect, shutting down Cluster:', '[control connection] Error connecting to ')
with log_filter('cassandra.cluster', expected_log_lines):
session = retry_till_success(
self.cql_connection,
node,
keyspace=keyspace,
user=user,
password=password,
timeout=timeout,
compression=compression,
protocol_version=protocol_version,
port=port,
ssl_opts=ssl_opts,
bypassed_exception=NoHostAvailable,
**kwargs
)
return session
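    # Example (illustrative): after restarting node1, wait up to 60 seconds
    # for it to accept CQL connections before querying it:
    #   node1 = self.cluster.nodelist()[0]
    #   session = self.patient_cql_connection(node1, timeout=60)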
def patient_exclusive_cql_connection(self, node, keyspace=None,
user=None, password=None, timeout=30, compression=True,
protocol_version=None, port=None, ssl_opts=None, **kwargs):
"""
Returns a connection after it stops throwing NoHostAvailables due to not being ready.
If the timeout is exceeded, the exception is raised.
"""
if is_win():
timeout *= 2
return retry_till_success(
self.exclusive_cql_connection,
node,
keyspace=keyspace,
user=user,
password=password,
timeout=timeout,
compression=compression,
protocol_version=protocol_version,
port=port,
ssl_opts=ssl_opts,
bypassed_exception=NoHostAvailable,
**kwargs
)
@classmethod
def tearDownClass(cls):
reset_environment_vars()
if os.path.exists(LAST_TEST_DIR):
with open(LAST_TEST_DIR) as f:
test_path = f.readline().strip('\n')
name = f.readline()
try:
cluster = ClusterFactory.load(test_path, name)
# Avoid waiting too long for node to be marked down
if KEEP_TEST_DIR:
cluster.stop(gently=RECORD_COVERAGE)
else:
cluster.remove()
os.rmdir(test_path)
except IOError:
# after a restart, /tmp will be emptied so we'll get an IOError when loading the old cluster here
pass
try:
os.remove(LAST_TEST_DIR)
except IOError:
# Ignore - see comment above
pass
def tearDown(self):
# test_is_ending prevents active log watching from being able to interrupt the test
# which we don't want to happen once tearDown begins
self.test_is_ending = True
reset_environment_vars()
for con in self.connections:
con.cluster.shutdown()
for runner in self.runners:
try:
runner.stop()
except:
pass
failed = did_fail()
try:
if not self.allow_log_errors and self.check_logs_for_errors():
failed = True
raise AssertionError('Unexpected error in log, see stdout')
finally:
try:
# save the logs for inspection
if failed or KEEP_LOGS:
self.copy_logs(self.cluster)
except Exception as e:
print "Error saving log:", str(e)
finally:
log_watch_thread = getattr(self, '_log_watch_thread', None)
cleanup_cluster(self.cluster, self.test_path, log_watch_thread)
def check_logs_for_errors(self):
for node in self.cluster.nodelist():
errors = list(self.__filter_errors(
['\n'.join(msg) for msg in node.grep_log_for_errors()]))
            if len(errors) != 0:
for error in errors:
print_("Unexpected error in {node_name} log, error: \n{error}".format(node_name=node.name, error=error))
return True
def go(self, func):
runner = Runner(func)
self.runners.append(runner)
runner.start()
return runner
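    # Example usage (illustrative CQL): run a background writer for the
    # duration of a disruptive operation, then surface any error it hit.
    # The Runner passes an incrementing iteration counter to the function:
    #   writer = self.go(lambda i: session.execute(
    #       "INSERT INTO ks.cf (key, c, v) VALUES ('k', 'c%d', 'v')" % i))
    #   ...  # restart nodes, run repair, etc.
    #   writer.check()  # re-raises any exception from the background thread
    #   writer.stop()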
def skip(self, msg):
if not NO_SKIP:
raise SkipTest(msg)
def __filter_errors(self, errors):
"""Filter errors, removing those that match self.ignore_log_patterns"""
if not hasattr(self, 'ignore_log_patterns'):
self.ignore_log_patterns = []
for e in errors:
for pattern in self.ignore_log_patterns:
if re.search(pattern, e):
break
else:
yield e
    # Disable docstring printing in nosetest output
def shortDescription(self):
return None
def get_jfr_jvm_args(self):
"""
@return The JVM arguments required for attaching flight recorder to a Java process.
"""
return ["-XX:+UnlockCommercialFeatures", "-XX:+FlightRecorder"]
def start_jfr_recording(self, nodes):
"""
Start Java flight recorder provided the cluster was started with the correct jvm arguments.
"""
for node in nodes:
p = subprocess.Popen(['jcmd', str(node.pid), 'JFR.start'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stdout, stderr = p.communicate()
debug(stdout)
debug(stderr)
def dump_jfr_recording(self, nodes):
"""
Save Java flight recorder results to file for analyzing with mission control.
"""
for node in nodes:
p = subprocess.Popen(['jcmd', str(node.pid), 'JFR.dump',
'recording=1', 'filename=recording_{}.jfr'.format(node.address())],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stdout, stderr = p.communicate()
debug(stdout)
debug(stderr)
def get_eager_protocol_version(cassandra_version):
"""
Returns the highest protocol version accepted
by the given C* version
"""
if cassandra_version >= '2.2':
protocol_version = 4
elif cassandra_version >= '2.1':
protocol_version = 3
elif cassandra_version >= '2.0':
protocol_version = 2
else:
protocol_version = 1
return protocol_version
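# Illustrative mapping: 2.2+ (including 3.x) -> protocol 4, 2.1.x -> 3,
# 2.0.x -> 2, anything older -> 1.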
# We default to UTF8Type because it's simpler to use in tests
def create_cf(session, name, key_type="varchar", speculative_retry=None, read_repair=None, compression=None,
gc_grace=None, columns=None, validation="UTF8Type", compact_storage=False):
additional_columns = ""
if columns is not None:
for k, v in columns.items():
additional_columns = "{}, {} {}".format(additional_columns, k, v)
if additional_columns == "":
query = 'CREATE COLUMNFAMILY %s (key %s, c varchar, v varchar, PRIMARY KEY(key, c)) WITH comment=\'test cf\'' % (name, key_type)
else:
query = 'CREATE COLUMNFAMILY %s (key %s PRIMARY KEY%s) WITH comment=\'test cf\'' % (name, key_type, additional_columns)
if compression is not None:
query = '%s AND compression = { \'sstable_compression\': \'%sCompressor\' }' % (query, compression)
else:
# if a compression option is omitted, C* will default to lz4 compression
query += ' AND compression = {}'
if read_repair is not None:
query = '%s AND read_repair_chance=%f AND dclocal_read_repair_chance=%f' % (query, read_repair, read_repair)
if gc_grace is not None:
query = '%s AND gc_grace_seconds=%d' % (query, gc_grace)
if speculative_retry is not None:
query = '%s AND speculative_retry=\'%s\'' % (query, speculative_retry)
if compact_storage:
query += ' AND COMPACT STORAGE'
session.execute(query)
time.sleep(0.2)
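# For example (illustrative call), create_cf(session, 'cf', columns={'v2': 'int'})
# issues roughly:
#   CREATE COLUMNFAMILY cf (key varchar PRIMARY KEY, v2 int)
#   WITH comment='test cf' AND compression = {}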
def create_ks(session, name, rf):
query = 'CREATE KEYSPACE %s WITH replication={%s}'
if isinstance(rf, types.IntType):
# we assume simpleStrategy
session.execute(query % (name, "'class':'SimpleStrategy', 'replication_factor':%d" % rf))
else:
assert_greater_equal(len(rf), 0, "At least one datacenter/rf pair is needed")
# we assume networkTopologyStrategy
options = (', ').join(['\'%s\':%d' % (d, r) for d, r in rf.iteritems()])
session.execute(query % (name, "'class':'NetworkTopologyStrategy', %s" % options))
session.execute('USE {}'.format(name))
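# Illustrative calls: an int replication factor assumes SimpleStrategy,
#   create_ks(session, 'ks', 2)
#     -> CREATE KEYSPACE ks WITH replication={'class':'SimpleStrategy', 'replication_factor':2}
# while a dict maps datacenters to rf for NetworkTopologyStrategy:
#   create_ks(session, 'ks', {'dc1': 3, 'dc2': 2})
#     -> CREATE KEYSPACE ks WITH replication={'class':'NetworkTopologyStrategy', 'dc1':3, 'dc2':2}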
def get_auth_provider(user, password):
return PlainTextAuthProvider(username=user, password=password)
def make_auth(user, password):
def private_auth(node_ip):
return {'username': user, 'password': password}
return private_auth
def get_port_from_node(node):
"""
Return the port that this node is listening on.
We only use this to connect the native driver,
so we only care about the binary port.
"""
try:
return node.network_interfaces['binary'][1]
except Exception:
raise RuntimeError("No network interface defined on this node object. {}".format(node.network_interfaces))
def get_ip_from_node(node):
if node.network_interfaces['binary']:
node_ip = node.network_interfaces['binary'][0]
else:
node_ip = node.network_interfaces['thrift'][0]
return node_ip
def kill_windows_cassandra_procs():
# On Windows, forcefully terminate any leftover previously running cassandra processes. This is a temporary
# workaround until we can determine the cause of intermittent hung-open tests and file-handles.
if is_win():
try:
import psutil
for proc in psutil.process_iter():
try:
pinfo = proc.as_dict(attrs=['pid', 'name', 'cmdline'])
except psutil.NoSuchProcess:
pass
else:
if (pinfo['name'] == 'java.exe' and '-Dcassandra' in pinfo['cmdline']):
print 'Found running cassandra process with pid: ' + str(pinfo['pid']) + '. Killing.'
psutil.Process(pinfo['pid']).kill()
except ImportError:
debug("WARN: psutil not installed. Cannot detect and kill "
"running cassandra processes - you may see cascading dtest failures.")
def get_test_path():
test_path = tempfile.mkdtemp(prefix='dtest-')
# ccm on cygwin needs absolute path to directory - it crosses from cygwin space into
# regular Windows space on wmic calls which will otherwise break pathing
if sys.platform == "cygwin":
process = subprocess.Popen(["cygpath", "-m", test_path], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
test_path = process.communicate()[0].rstrip()
return test_path
# nose will discover this as a test, so we manually make it not a test
get_test_path.__test__ = False
def create_ccm_cluster(test_path, name):
debug("cluster ccm directory: " + test_path)
version = os.environ.get('CASSANDRA_VERSION')
cdir = CASSANDRA_DIR
if version:
cluster = Cluster(test_path, name, cassandra_version=version)
else:
cluster = Cluster(test_path, name, cassandra_dir=cdir)
if DISABLE_VNODES:
cluster.set_configuration_options(values={'num_tokens': None})
else:
cluster.set_configuration_options(values={'initial_token': None, 'num_tokens': NUM_TOKENS})
if OFFHEAP_MEMTABLES:
cluster.set_configuration_options(values={'memtable_allocation_type': 'offheap_objects'})
cluster.set_datadir_count(DATADIR_COUNT)
cluster.set_environment_variable('CASSANDRA_LIBJEMALLOC', CASSANDRA_LIBJEMALLOC)
return cluster
def cleanup_cluster(cluster, test_path, log_watch_thread=None):
with log_filter('cassandra'): # quiet noise from driver when nodes start going down
if KEEP_TEST_DIR:
cluster.stop(gently=RECORD_COVERAGE)
else:
# when recording coverage the jvm has to exit normally
# or the coverage information is not written by the jacoco agent
# otherwise we can just kill the process
if RECORD_COVERAGE:
cluster.stop(gently=True)
# Cleanup everything:
try:
if log_watch_thread:
stop_active_log_watch(log_watch_thread)
finally:
debug("removing ccm cluster {name} at: {path}".format(name=cluster.name, path=test_path))
cluster.remove()
debug("clearing ssl stores from [{0}] directory".format(test_path))
for filename in ('keystore.jks', 'truststore.jks', 'ccm_node.cer'):
try:
os.remove(os.path.join(test_path, filename))
except OSError as e:
                # once we port to py3, which has better reporting for exceptions raised while
                # handling other exceptions, we should just assert e.errno == errno.ENOENT
if e.errno != errno.ENOENT: # ENOENT = no such file or directory
raise
os.rmdir(test_path)
cleanup_last_test_dir()
def cleanup_last_test_dir():
if os.path.exists(LAST_TEST_DIR):
os.remove(LAST_TEST_DIR)
def stop_active_log_watch(log_watch_thread):
"""
Joins the log watching thread, which will then exit.
Should be called after each test, ideally after nodes are stopped but before cluster files are removed.
Can be called multiple times without error.
If not called, log watching thread will remain running until the parent process exits.
"""
log_watch_thread.join(timeout=60)
def maybe_cleanup_cluster_from_last_test_file():
# cleaning up if a previous execution didn't trigger tearDown (which
# can happen if it is interrupted by KeyboardInterrupt)
if os.path.exists(LAST_TEST_DIR):
with open(LAST_TEST_DIR) as f:
test_path = f.readline().strip('\n')
name = f.readline()
try:
cluster = ClusterFactory.load(test_path, name)
# Avoid waiting too long for node to be marked down
cleanup_cluster(cluster, test_path)
except IOError:
# after a restart, /tmp will be emptied so we'll get an IOError when loading the old cluster here
pass
def init_default_config(cluster, cluster_options):
# the failure detector can be quite slow in such tests with quick start/stop
phi_values = {'phi_convict_threshold': 5}
timeout = 10000
if cluster_options is not None:
values = merge_dicts(cluster_options, phi_values)
else:
values = merge_dicts(phi_values, {
'read_request_timeout_in_ms': timeout,
'range_request_timeout_in_ms': timeout,
'write_request_timeout_in_ms': timeout,
'truncate_request_timeout_in_ms': timeout,
'request_timeout_in_ms': timeout
})
    # No more thrift in 4.0, and start_rpc doesn't exist anymore
if cluster.version() >= '4' and 'start_rpc' in values:
del values['start_rpc']
cluster.set_configuration_options(values)
debug("Done setting configuration options:\n" + pprint.pformat(cluster._config_options, indent=4))
def write_last_test_file(test_path, cluster):
with open(LAST_TEST_DIR, 'w') as f:
f.write(test_path + '\n')
f.write(cluster.name)
def set_log_levels(cluster):
if DEBUG:
cluster.set_log_level("DEBUG")
if TRACE:
cluster.set_log_level("TRACE")
if os.environ.get('DEBUG', 'no').lower() not in ('no', 'false', 'yes', 'true'):
classes_to_debug = os.environ.get('DEBUG').split(":")
cluster.set_log_level('DEBUG', None if len(classes_to_debug) == 0 else classes_to_debug)
if os.environ.get('TRACE', 'no').lower() not in ('no', 'false', 'yes', 'true'):
classes_to_trace = os.environ.get('TRACE').split(":")
cluster.set_log_level('TRACE', None if len(classes_to_trace) == 0 else classes_to_trace)
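# Example (illustrative logger names): beyond 'yes'/'true', DEBUG and TRACE
# also accept a colon-separated list of classes to raise to that level, e.g.
#   DEBUG=org.apache.cassandra.gms.Gossiper:org.apache.cassandra.service.StorageService nosetests ...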
def maybe_setup_jacoco(test_path, cluster_name='test'):
"""Setup JaCoCo code coverage support"""
if not RECORD_COVERAGE:
return
# use explicit agent and execfile locations
# or look for a cassandra build if they are not specified
cdir = CASSANDRA_DIR
agent_location = os.environ.get('JACOCO_AGENT_JAR', os.path.join(cdir, 'build/lib/jars/jacocoagent.jar'))
jacoco_execfile = os.environ.get('JACOCO_EXECFILE', os.path.join(cdir, 'build/jacoco/jacoco.exec'))
if os.path.isfile(agent_location):
debug("Jacoco agent found at {}".format(agent_location))
with open(os.path.join(
test_path, cluster_name, 'cassandra.in.sh'), 'w') as f:
f.write('JVM_OPTS="$JVM_OPTS -javaagent:{jar_path}=destfile={exec_file}"'
.format(jar_path=agent_location, exec_file=jacoco_execfile))
if os.path.isfile(jacoco_execfile):
debug("Jacoco execfile found at {}, execution data will be appended".format(jacoco_execfile))
else:
debug("Jacoco execfile will be created at {}".format(jacoco_execfile))
else:
debug("Jacoco agent not found or is not file. Execution will not be recorded.")
def did_fail():
if sys.exc_info() == (None, None, None):
return False
exc_class, _, _ = sys.exc_info()
return not issubclass(exc_class, unittest.case.SkipTest)
class ReusableClusterTester(Tester):
"""
A Tester designed for reusing the same cluster across multiple
test methods. This makes test suites with many small tests run
much, much faster. However, there are a couple of downsides:
First, test setup and teardown must be diligent about cleaning
up any data or schema elements that may interfere with other
tests.
Second, errors triggered by one test method may cascade
into other test failures. In an attempt to limit this, the
cluster will be restarted if a test fails or an exception is
caught. However, there may still be undetected problems in
Cassandra that cause cascading failures.
"""
test_path = None
cluster = None
cluster_options = None
@classmethod
def setUpClass(cls):
kill_windows_cassandra_procs()
maybe_cleanup_cluster_from_last_test_file()
cls.initialize_cluster()
def setUp(self):
self.set_current_tst_name()
self.connections = []