# topology_test.py

import re
import time
from threading import Thread
from unittest import skip

from cassandra import ConsistencyLevel
from ccmlib.node import TimeoutError, ToolError
from nose.plugins.attrib import attr

from dtest import Tester, debug, create_ks, create_cf
from tools.assertions import assert_almost_equal, assert_all, assert_none
from tools.data import insert_c1c2, query_c1c2
from tools.decorators import no_vnodes, since


class TestTopology(Tester):

    def do_not_join_ring_test(self):
        """
        Check that AssertionError is not thrown on SizeEstimatesRecorder
        before the node joins the ring.

        A single node is started with join_ring=False and a size recorder
        interval of 1, left running for 40 seconds, then stopped non-gently.
        The test body makes no assertion of its own.
        NOTE(review): presumably the Tester harness reports errors found in
        the node log -- confirm.

        @jira_ticket CASSANDRA-9034
        """
        recorder_args = ["-Dcassandra.size_recorder_interval=1"]
        (node1,) = self.cluster.populate(1).nodelist()

        node1.start(wait_for_binary_proto=True, join_ring=False, jvm_args=recorder_args)

        # The recorder's initial delay is 30s, so sleep past it before stopping.
        time.sleep(40)

        node1.stop(gently=False)

    @since('3.0.11')
    def size_estimates_multidc_test(self):
        """
        Test that primary ranges are correctly generated on
        system.size_estimates for a multi-dc, multi-keyspace scenario.

        A [2, 1] cluster is started with two fixed tokens per node, keyspaces
        ks1 and ks2 are created, size estimates are refreshed on every node,
        and the (range_start, range_end) rows of system.size_estimates are
        compared, per node and per keyspace, against hard-coded expectations.

        @jira_ticket CASSANDRA-9639
        """
        debug("Creating cluster")
        cluster = self.cluster
        cluster.set_configuration_options(values={'num_tokens': 2})
        cluster.populate([2, 1])
        node1_1, node1_2, node2_1 = cluster.nodelist()
        all_nodes = (node1_1, node1_2, node2_1)

        debug("Setting tokens")
        # One comma-separated pair of initial tokens per node, in nodelist order.
        token_assignments = ('-6639341390736545756,-2688160409776496397',
                             '-2506475074448728501,8473270337963525440',
                             '-3736333188524231709,8673615181726552074')
        for node, tokens in zip(all_nodes, token_assignments):
            node.set_configuration_options(values={'initial_token': tokens})
        cluster.set_configuration_options(values={'num_tokens': 2})

        debug("Starting cluster")
        cluster.start()

        ring_output, _, _ = node1_1.nodetool('ring')
        debug("Nodetool ring output {}".format(ring_output))

        debug("Creating keyspaces")
        conn = self.patient_cql_connection(node1_1)
        create_ks(conn, 'ks1', 3)
        create_ks(conn, 'ks2', {'dc1': 2})
        table_columns = {'c1': 'text', 'c2': 'text'}
        create_cf(conn, 'ks1.cf1', columns=dict(table_columns))
        create_cf(conn, 'ks2.cf2', columns=dict(table_columns))

        debug("Refreshing size estimates")
        for node in all_nodes:
            node.nodetool('refreshsizeestimates')

        # Reference layout the expectations below were written against:
        #
        # CREATE KEYSPACE ks1 WITH replication =
        #     {'class': 'SimpleStrategy', 'replication_factor': '3'}
        # CREATE KEYSPACE ks2 WITH replication =
        #     {'class': 'NetworkTopologyStrategy', 'dc1': '2'}  AND durable_writes = true;
        #
        # Datacenter: dc1
        # ==========
        # Address     Token
        #             8473270337963525440
        # 127.0.0.1   -6639341390736545756
        # 127.0.0.1   -2688160409776496397
        # 127.0.0.2   -2506475074448728501
        # 127.0.0.2   8473270337963525440
        #
        # Datacenter: dc2
        # ==========
        # Address     Token
        #             8673615181726552074
        # 127.0.0.3   -3736333188524231709
        # 127.0.0.3   8673615181726552074

        ranges_query = ("SELECT range_start, range_end FROM system.size_estimates "
                        "WHERE keyspace_name = '{}'")
        ks1_query = ranges_query.format('ks1')
        ks2_query = ranges_query.format('ks2')

        debug("Checking node1_1 size_estimates primary ranges")
        conn = self.patient_exclusive_cql_connection(node1_1)
        assert_all(conn, ks1_query, [['-3736333188524231709', '-2688160409776496397'],
                                     ['-9223372036854775808', '-6639341390736545756'],
                                     ['8673615181726552074', '-9223372036854775808']])
        assert_all(conn, ks2_query, [['-3736333188524231709', '-2688160409776496397'],
                                     ['-6639341390736545756', '-3736333188524231709'],
                                     ['-9223372036854775808', '-6639341390736545756'],
                                     ['8473270337963525440', '8673615181726552074'],
                                     ['8673615181726552074', '-9223372036854775808']])

        debug("Checking node1_2 size_estimates primary ranges")
        conn = self.patient_exclusive_cql_connection(node1_2)
        assert_all(conn, ks1_query, [['-2506475074448728501', '8473270337963525440'],
                                     ['-2688160409776496397', '-2506475074448728501']])
        assert_all(conn, ks2_query, [['-2506475074448728501', '8473270337963525440'],
                                     ['-2688160409776496397', '-2506475074448728501']])

        debug("Checking node2_1 size_estimates primary ranges")
        conn = self.patient_exclusive_cql_connection(node2_1)
        assert_all(conn, ks1_query, [['-6639341390736545756', '-3736333188524231709'],
                                     ['8473270337963525440', '8673615181726552074']])
        # ks2 is only replicated to dc1, so the dc2 node is expected to hold no rows for it.
        assert_none(conn, ks2_query)

    def simple_decommission_test(self):
        """
        Check that AssertionError is not thrown on SizeEstimatesRecorder
        after a node is decommissioned.

        Starts three nodes with a size recorder interval of 1, writes data
        with stress through node1, decommissions and stops node2, then waits
        10 seconds. The test body makes no assertion of its own.

        @jira_ticket CASSANDRA-9912
        """
        recorder_args = ["-Dcassandra.size_recorder_interval=1"]
        lower_distributed_rf = ("ALTER KEYSPACE system_distributed WITH REPLICATION = "
                                "{'class':'SimpleStrategy', 'replication_factor':'2'};")

        cluster = self.cluster
        cluster.populate(3)
        cluster.start(wait_for_binary_proto=True, jvm_args=recorder_args)
        node1, node2, _ = cluster.nodelist()

        conn = self.patient_cql_connection(node1)

        if cluster.version() >= '2.2':
            # With system_distributed at RF 2 the decommission does not need to be forced.
            conn.execute(lower_distributed_rf)

        # Load some data through node1.
        stress_args = ['write', 'n=10K', 'no-warmup', '-rate', 'threads=8']
        node1.stress(stress_args)

        # Take node2 out of the ring, then shut it down.
        node2.decommission()
        node2.stop()

        # Do not remove this sleep: it gives the cluster time to hit the
        # AssertionError described in CASSANDRA-9912.
        time.sleep(10)

    @skip('Hangs on CI for 2.1')
    def concurrent_decommission_not_allowed_test(self):
        """
        Test concurrent decommission is not allowed.

        A first decommission of node2 runs in a background thread; once
        'DECOMMISSIONING' shows up in node2's debug.log a second decommission
        is issued and must raise ToolError. After the first one finishes,
        every inserted key is read back through node1.
        """
        row_count = 10000

        cluster = self.cluster
        cluster.set_configuration_options(values={'stream_throughput_outbound_megabits_per_sec': 1})
        cluster.populate(2).start(wait_other_notice=True)
        node1, node2 = cluster.nodelist()

        writer = self.patient_cql_connection(node2)
        create_ks(writer, 'ks', 1)
        create_cf(writer, 'cf', columns={'c1': 'text', 'c2': 'text'})
        insert_c1c2(writer, n=row_count, consistency=ConsistencyLevel.ALL)

        log_mark = node2.mark_log()

        # Kick off the first decommission on a separate thread.
        first_decommission = Thread(target=node2.nodetool, args=('decommission',))
        first_decommission.start()

        # The second attempt is only meaningful once the first one is under way.
        node2.watch_log_for('DECOMMISSIONING', filename='debug.log')

        # A decommission issued while one is already running must be rejected.
        with self.assertRaises(ToolError):
            node2.nodetool('decommission')

        # Once node2 has left, all of its data must be readable from node1.
        first_decommission.join()
        node2.watch_log_for('DECOMMISSIONED', from_mark=log_mark)
        reader = self.patient_cql_connection(node1)
        reader.execute('USE ks')
        for key in xrange(0, row_count):
            query_c1c2(reader, key, ConsistencyLevel.ONE)

    @since('3.10')
    def resumable_decommission_test(self):
        """
        @jira_ticket CASSANDRA-12008

        Test decommission operation is resumable.

        A failure-injecting byteman script (decommission_failure_inject.btm)
        is submitted to node2 and the first decommission is expected to raise
        ToolError. The decommission is then run again and must reach
        'DECOMMISSIONED'. Finally every inserted key is read back at
        ConsistencyLevel.ONE, first with only node1 running and then with
        only node3 running.
        """
        self.ignore_log_patterns = [r'Streaming error occurred', r'Error while decommissioning node', r'Remote peer 127.0.0.2 failed stream session']
        cluster = self.cluster
        cluster.set_configuration_options(values={'stream_throughput_outbound_megabits_per_sec': 1})
        cluster.populate(3, install_byteman=True).start(wait_other_notice=True)
        node1, node2, node3 = cluster.nodelist()

        session = self.patient_cql_connection(node2)
        # reduce system_distributed RF to 2 so we don't require forceful decommission
        session.execute("ALTER KEYSPACE system_distributed WITH REPLICATION = {'class':'SimpleStrategy', 'replication_factor':'2'};")
        create_ks(session, 'ks', 2)
        create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})
        insert_c1c2(session, n=10000, consistency=ConsistencyLevel.ALL)

        # Select and submit the byteman script outside the assertRaises block so
        # that only the decommission itself can satisfy the expected ToolError.
        if cluster.version() >= '4.0':
            script = ['./byteman/4.0/decommission_failure_inject.btm']
        else:
            script = ['./byteman/pre4.0/decommission_failure_inject.btm']
        node2.byteman_submit(script)

        # Execute first decommission, should fail
        with self.assertRaises(ToolError):
            node2.nodetool('decommission')

        # Make sure previous ToolError is due to decommission
        node2.watch_log_for('Error while decommissioning node')

        # Decommission again
        mark = node2.mark_log()
        node2.nodetool('decommission')

        # Check decommission is done and we skipped transferred ranges
        node2.watch_log_for('DECOMMISSIONED', from_mark=mark)
        # NOTE(review): the result of this grep_log call is discarded, so a missing
        # "Skipping transferred range" line does not fail the test -- confirm
        # whether an assertion on the returned matches is wanted here.
        node2.grep_log("Skipping transferred range .* of keyspace ks, endpoint /127.0.0.3", filename='debug.log')

        # Check data is correctly forwarded to node1 and node3
        cluster.remove(node2)
        node3.stop(gently=False)
        session = self.patient_exclusive_cql_connection(node1)
        session.execute('USE ks')
        for i in xrange(0, 10000):
            query_c1c2(session, i, ConsistencyLevel.ONE)
        node1.stop(gently=False)
        # Mark node3's log while it is still stopped: taking the mark after
        # start() would race with the startup message we wait for below.
        mark = node3.mark_log()
        node3.start()
        session.shutdown()
        node3.watch_log_for('Starting listening for CQL clients', from_mark=mark)
        session = self.patient_exclusive_cql_connection(node3)
        session.execute('USE ks')
        for i in xrange(0, 10000):
            query_c1c2(session, i, ConsistencyLevel.ONE)

    @no_vnodes()
    def movement_test(self):
        """
        Start three nodes on a deliberately unbalanced ring, load 30000 keys,
        move each node to the token returned by cluster.balanced_tokens(3),
        run cleanup, then check that every key is still readable and that
        the nodes' data sizes are almost equal to one another.
        """
        key_count = 30000
        cluster = self.cluster

        # Tokens 0, 2**48 and 2**62 give an unbalanced ring to start from.
        cluster.populate(3, tokens=[0, 2**48, 2**62]).start()
        node1, node2, node3 = cluster.nodelist()

        session = self.patient_cql_connection(node1)
        create_ks(session, 'ks', 1)
        create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})

        insert_c1c2(session, n=key_count, consistency=ConsistencyLevel.ONE)

        cluster.flush()

        def relocate(node, token, ip):
            # Move the node, then wait for its log to report the NORMAL state for ip.
            log_mark = node.mark_log()
            node.move(token)  # can't assume 0 is balanced with m3p
            node.watch_log_for('{} state jump to NORMAL'.format(ip), from_mark=log_mark, timeout=180)
            time.sleep(3)

        target_tokens = cluster.balanced_tokens(3)

        # Rebalance one node at a time, in ring order.
        for position, node in enumerate((node1, node2, node3)):
            relocate(node, target_tokens[position], '127.0.0.{}'.format(position + 1))

        time.sleep(1)

        cluster.cleanup()

        # Every key must still be readable after the moves.
        for key in xrange(0, key_count):
            query_c1c2(session, key, ConsistencyLevel.ONE)

        # With the ring balanced, load should be roughly even across nodes.
        size1 = node1.data_size()
        size2 = node2.data_size()
        size3 = node3.data_size()

        assert_almost_equal(size1, size2)
        assert_almost_equal(size1, size3)
        assert_almost_equal(size2, size3)

    @no_vnodes()
    def decommission_test(self):
        """
        Start four nodes on balanced tokens, load 30000 keys at QUORUM into an
        RF=2 keyspace, decommission and stop node4, run cleanup, then check
        that every key is still readable and compare the data sizes of the
        nodes that are still running.
        """
        key_count = 30000
        cluster = self.cluster

        cluster.populate(4, tokens=cluster.balanced_tokens(4)).start()
        node1, _, _, node4 = cluster.nodelist()

        session = self.patient_cql_connection(node1)
        create_ks(session, 'ks', 2)
        create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})

        insert_c1c2(session, n=key_count, consistency=ConsistencyLevel.QUORUM)

        cluster.flush()

        def running_node_sizes():
            # Data size of each node that is currently running, in nodelist order.
            return [member.data_size() for member in cluster.nodelist() if member.is_running()]

        sizes_before = running_node_sizes()
        init_size = sizes_before[0]
        assert_almost_equal(*sizes_before)

        time.sleep(.5)
        node4.decommission()
        node4.stop()
        cluster.cleanup()
        time.sleep(.5)

        # Every key must still be readable once node4 is gone.
        for key in xrange(0, key_count):
            query_c1c2(session, key, ConsistencyLevel.QUORUM)

        sizes_after = running_node_sizes()
        debug(sizes_after)
        assert_almost_equal(sizes_after[0], sizes_after[1])
        assert_almost_equal((2.0 / 3.0) * sizes_after[0], sizes_after[2])
        assert_almost_equal(sizes_after[2], init_size)

    @no_vnodes()
    def move_single_node_test(self):
        """
        Test moving a node in a single-node cluster (#4200).

        The node starts on token 0, 10000 keys are inserted and flushed, the
        node is moved to token 2**25, cleanup is run, and every key is read back.
        """
        key_count = 10000
        cluster = self.cluster

        # Single node, initially placed on token 0.
        cluster.populate(1, tokens=[0]).start()
        node1 = cluster.nodelist()[0]
        time.sleep(0.2)

        session = self.patient_cql_connection(node1)
        create_ks(session, 'ks', 1)
        create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})

        insert_c1c2(session, n=key_count, consistency=ConsistencyLevel.ONE)

        cluster.flush()

        new_token = 2**25
        node1.move(new_token)
        time.sleep(1)

        cluster.cleanup()

        # Every key must still be readable after the move.
        for key in xrange(0, key_count):
            query_c1c2(session, key, ConsistencyLevel.ONE)

    @since('3.0')
    def decommissioned_node_cant_rejoin_test(self):
        """
        @jira_ticket CASSANDRA-8801

        Test that a decommissioned node can't rejoin the cluster by:

        - creating a cluster,
        - decommissioning a node and restarting it,
        - asserting that the "decommissioned node won't rejoin" error is in
          the errors grepped from that node's log, and
        - asserting that the node is not running.
        """
        rejoin_err = 'This node was decommissioned and will not rejoin the ring'

        # Copy any existing ignore patterns (or start empty) and add the expected error.
        patterns = list(getattr(self, 'ignore_log_patterns', []))
        self.ignore_log_patterns = patterns
        patterns.append(rejoin_err)

        self.cluster.populate(3).start(wait_for_binary_proto=True)
        _, _, node3 = self.cluster.nodelist()

        debug('decommissioning...')
        needs_force = self.cluster.version() >= '4.0'
        node3.decommission(force=needs_force)
        debug('stopping...')
        node3.stop()
        debug('attempting restart...')
        node3.start(wait_other_notice=False)
        try:
            # usually takes 3 seconds, so give it a generous 15
            node3.watch_log_for(rejoin_err, timeout=15)
        except TimeoutError:
            # A bare TimeoutError tells the reader of the test output little;
            # swallow it and let the string assertion below report the failure.
            pass

        logged_errors = '\n'.join('\n'.join(err_lines)
                                  for err_lines in node3.grep_log_for_errors())
        self.assertIn(rejoin_err, logged_errors)

        # Give the node some time to shut down once it has detected
        # its invalid state. If it doesn't shut down in the 30 seconds,
        # consider filing a bug. It shouldn't take more than 10, in most cases.
        deadline = time.time() + 30
        while time.time() < deadline and node3.is_running():
            time.sleep(1)

        self.assertFalse(node3.is_running())

    @since('3.0')
    def crash_during_decommission_test(self):
        """
        If a node crashes whilst another node is being decommissioned,
        upon restarting the crashed node should not have invalid entries
        for the decommissioned node.

        node1 is decommissioned on a background thread while node2 is
        repeatedly killed and restarted until either the decommission thread
        ends or node2's `nodetool status` shows a "null" entry for 127.0.0.1.
        After the decommission completes and a 30 second pause, node2's
        status must not contain such an entry.

        @jira_ticket CASSANDRA-10231
        """
        cluster = self.cluster
        self.ignore_log_patterns = [r'Streaming error occurred', 'Stream failed']
        cluster.populate(3).start(wait_other_notice=True)

        node1, node2 = cluster.nodelist()[0:2]

        t = DecommissionInParallel(node1)
        t.start()

        node1.watch_log_for("DECOMMISSIONING", filename='debug.log')
        # Raw string: \s and \. are regex escapes, not valid string escapes.
        null_status_pattern = re.compile(r".N(?:\s*)127\.0\.0\.1(?:.*)null(?:\s*)rack1")
        while t.is_alive():
            out = self.show_status(node2)
            if null_status_pattern.search(out):
                debug("Matched null status entry")
                break
            # Crash and restart node2 while the decommission is still in flight.
            debug("Restarting node2")
            node2.stop(gently=False)
            node2.start(wait_for_binary_proto=True, wait_other_notice=False)

        debug("Waiting for decommission to complete")
        t.join()
        self.show_status(node2)

        debug("Sleeping for 30 seconds to allow gossip updates")
        time.sleep(30)
        out = self.show_status(node2)
        self.assertFalse(null_status_pattern.search(out))

    @since('3.12')
    @attr('resource-intensive')
    def stop_decommission_too_few_replicas_multi_dc_test(self):
        """
        Decommission should fail when it would result in the number of live replicas being less than
        the replication factor. --force should bypass this requirement.

        The refusal is exercised twice on a [2, 2] cluster: once with keyspace
        'ks' created with {'dc1': 2, 'dc2': 2}, and once with keyspace 'ks2'
        created with replication 4. A forced decommission must then succeed.

        @jira_ticket CASSANDRA-12510
        @expected_errors ToolError when # nodes will drop below configured replicas in NTS/SimpleStrategy
        """
        cluster = self.cluster
        cluster.populate([2, 2]).start(wait_for_binary_proto=True)
        _, node2, _, node4 = cluster.nodelist()

        def expect_decommission_refused():
            # A plain decommission of node4 must be rejected with ToolError.
            with self.assertRaises(ToolError):
                node4.nodetool('decommission')

        conn = self.patient_cql_connection(node2)
        conn.execute("ALTER KEYSPACE system_distributed WITH REPLICATION = {'class':'SimpleStrategy', 'replication_factor':'2'};")

        create_ks(conn, 'ks', {'dc1': 2, 'dc2': 2})
        expect_decommission_refused()

        conn.execute('DROP KEYSPACE ks')
        create_ks(conn, 'ks2', 4)
        expect_decommission_refused()

        # With --force the same decommission has to go through.
        node4.nodetool('decommission --force')
        decommissioned = node4.watch_log_for("DECOMMISSIONED", timeout=120)
        self.assertTrue(decommissioned, "Node failed to decommission when passed --force")

    def show_status(self, node):
        """
        Run `nodetool status` on node, pass a header line and the first
        element of the result to debug(), and return that first element.
        """
        (status_output, _, _) = node.nodetool('status')
        header = "Status as reported by node {}".format(node.address())
        debug(header)
        debug(status_output)
        return status_output


class DecommissionInParallel(Thread):
    """Thread that runs `nodetool decommission` against a single node."""

    def __init__(self, node):
        """Remember the node to decommission; the thread is not started here."""
        super(DecommissionInParallel, self).__init__()
        self.node = node

    def run(self):
        """
        Decommission self.node and wait for 'DECOMMISSIONED' in its log.

        The first two elements of the nodetool result are passed to debug().
        A ToolError raised inside the try block is reported via debug() and
        not re-raised.
        """
        target = self.node
        # Only log lines written after this point count for the watch below.
        start_mark = target.mark_log()
        try:
            stdout, stderr, _ = target.nodetool("decommission")
            target.watch_log_for("DECOMMISSIONED", from_mark=start_mark)
            debug(stdout)
            debug(stderr)
        except ToolError as exc:
            debug("Decommission failed with exception: " + str(exc))