Skip to content

Commit b5cb3bf

Browse files
authored
Merge pull request apache#819 from datastax/python-simulacron-tests
Added some tests around network partitioning, closing connections and …
2 parents d81a515 + 5c77d93 commit b5cb3bf

5 files changed

Lines changed: 228 additions & 30 deletions

File tree

tests/integration/simulacron/__init__.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,19 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License
14+
try:
15+
import unittest2 as unittest
16+
except ImportError:
17+
import unittest # noqa
18+
19+
from tests.integration.simulacron.utils import stop_simulacron, clear_queries
1420

15-
from tests.integration.simulacron.utils import stop_simulacron
1621

1722
def teardown_package():
18-
stop_simulacron()
23+
stop_simulacron()
24+
25+
26+
class SimulacronBase(unittest.TestCase):
27+
def tearDown(self):
28+
clear_queries()
29+
stop_simulacron()

tests/integration/simulacron/test_connection.py

Lines changed: 183 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,20 +25,22 @@
2525

2626
from cassandra import OperationTimedOut
2727
from cassandra.cluster import (EXEC_PROFILE_DEFAULT, Cluster, ExecutionProfile,
28-
_Scheduler)
28+
_Scheduler, NoHostAvailable)
2929
from cassandra.policies import HostStateListener, RoundRobinPolicy
3030
from tests.integration import (CASSANDRA_VERSION, PROTOCOL_VERSION,
3131
requiressimulacron)
3232
from tests.integration.util import assert_quiescent_pool_state
33+
from tests.integration.simulacron import SimulacronBase
3334
from tests.integration.simulacron.utils import (NO_THEN, PrimeOptions,
3435
prime_query, prime_request,
3536
start_and_prime_cluster_defaults,
3637
start_and_prime_singledc,
37-
stop_simulacron)
38+
clear_queries)
3839

3940

4041
class TrackDownListener(HostStateListener):
41-
hosts_marked_down = []
42+
def __init__(self):
43+
self.hosts_marked_down = []
4244

4345
def on_down(self, host):
4446
self.hosts_marked_down.append(host)
@@ -50,8 +52,21 @@ def submit(self, fn, *args, **kwargs):
5052
self.called_functions.append(fn.__name__)
5153
return super(ThreadTracker, self).submit(fn, *args, **kwargs)
5254

55+
56+
class OrderedRoundRobinPolicy(RoundRobinPolicy):
57+
58+
def make_query_plan(self, working_keyspace=None, query=None):
59+
self._position += 1
60+
61+
hosts = []
62+
for _ in range(10):
63+
hosts.extend(sorted(self._live_hosts, key=lambda x : x.address))
64+
65+
return hosts
66+
67+
5368
@requiressimulacron
54-
class ConnectionTest(unittest.TestCase):
69+
class ConnectionTests(SimulacronBase):
5570

5671
def test_heart_beat_timeout(self):
5772
"""
@@ -64,24 +79,23 @@ def test_heart_beat_timeout(self):
6479
@test_category metadata
6580
"""
6681
number_of_dcs = 3
67-
nodes_per_dc = 100
82+
nodes_per_dc = 20
6883

6984
query_to_prime = "INSERT INTO test3rf.test (k, v) VALUES (0, 1);"
7085

7186
idle_heartbeat_timeout = 5
7287
idle_heartbeat_interval = 1
7388

7489
start_and_prime_cluster_defaults(number_of_dcs, nodes_per_dc, CASSANDRA_VERSION)
75-
self.addCleanup(stop_simulacron)
7690

7791
listener = TrackDownListener()
78-
executor = ThreadTracker(max_workers=16)
92+
executor = ThreadTracker(max_workers=8)
7993

8094
# We need to disable compression since it's not supported in simulacron
8195
cluster = Cluster(compression=False,
8296
idle_heartbeat_interval=idle_heartbeat_interval,
8397
idle_heartbeat_timeout=idle_heartbeat_timeout,
84-
executor_threads=16,
98+
executor_threads=8,
8599
execution_profiles={
86100
EXEC_PROFILE_DEFAULT: ExecutionProfile(load_balancing_policy=RoundRobinPolicy())})
87101
self.addCleanup(cluster.shutdown)
@@ -112,7 +126,7 @@ def test_heart_beat_timeout(self):
112126

113127
# We allow for some extra time for all the hosts to be set to on_down
114128
# The callbacks should start happening after idle_heartbeat_timeout + idle_heartbeat_interval
115-
time.sleep((idle_heartbeat_timeout + idle_heartbeat_interval) * 2)
129+
time.sleep((idle_heartbeat_timeout + idle_heartbeat_interval) * 2.5)
116130

117131
for host in cluster.metadata.all_hosts():
118132
self.assertIn(host, listener.hosts_marked_down)
@@ -133,7 +147,6 @@ def test_callbacks_and_pool_when_oto(self):
133147
@test_category metadata
134148
"""
135149
start_and_prime_singledc()
136-
self.addCleanup(stop_simulacron)
137150

138151
cluster = Cluster(protocol_version=PROTOCOL_VERSION, compression=False)
139152
session = cluster.connect()
@@ -155,3 +168,163 @@ def test_callbacks_and_pool_when_oto(self):
155168
# PYTHON-630 -- only the errback should be called
156169
errback.assert_called_once()
157170
callback.assert_not_called()
171+
172+
def test_close_when_query(self):
173+
"""
174+
Test to ensure the driver behaves correctly if the connection is closed
175+
just when querying
176+
@since 3.12
177+
@expected_result NoHostAvailable is raised
178+
179+
@test_category connection
180+
"""
181+
start_and_prime_singledc()
182+
183+
cluster = Cluster(protocol_version=PROTOCOL_VERSION, compression=False)
184+
session = cluster.connect()
185+
self.addCleanup(cluster.shutdown)
186+
187+
query_to_prime = "SELECT * from testkesypace.testtable"
188+
189+
for close_type in ("disconnect", "shutdown_read", "shutdown_write"):
190+
then = {
191+
"result": "close_connection",
192+
"delay_in_ms": 0,
193+
"close_type": close_type,
194+
"scope": "connection"
195+
}
196+
197+
prime_query(query_to_prime, then=then)
198+
self.assertRaises(NoHostAvailable, session.execute, query_to_prime)
199+
200+
def test_retry_after_defunct(self):
201+
"""
202+
We test cluster._retry is called if the connection is defunct
203+
in the middle of a query
204+
205+
Finally we verify the driver recovers correctly in the event
206+
of a network partition
207+
208+
@since 3.12
209+
@expected_result the driver is able to query even if a host is marked
210+
as down in the middle of the query, it will go to the next one if the timeout
211+
hasn't expired
212+
213+
@test_category connection
214+
"""
215+
number_of_dcs = 3
216+
nodes_per_dc = 2
217+
218+
query_to_prime = "INSERT INTO test3rf.test (k, v) VALUES (0, 1);"
219+
220+
idle_heartbeat_timeout = 1
221+
idle_heartbeat_interval = 5
222+
223+
simulacron_cluster = start_and_prime_cluster_defaults(number_of_dcs, nodes_per_dc, CASSANDRA_VERSION)
224+
225+
dc_ids = sorted(simulacron_cluster.data_center_ids)
226+
last_host = dc_ids.pop()
227+
prime_query(query_to_prime,
228+
cluster_name="{}/{}".format(simulacron_cluster.cluster_name, last_host))
229+
230+
roundrobin_lbp = OrderedRoundRobinPolicy()
231+
cluster = Cluster(compression=False,
232+
idle_heartbeat_interval=idle_heartbeat_interval,
233+
idle_heartbeat_timeout=idle_heartbeat_timeout,
234+
execution_profiles={
235+
EXEC_PROFILE_DEFAULT: ExecutionProfile(load_balancing_policy=roundrobin_lbp)})
236+
237+
session = cluster.connect(wait_for_all_pools=True)
238+
self.addCleanup(cluster.shutdown)
239+
240+
# This simulates we only have access to one DC
241+
for dc_id in dc_ids:
242+
datacenter_path = "{}/{}".format(simulacron_cluster.cluster_name, dc_id)
243+
prime_query(query_to_prime, then=NO_THEN, cluster_name=datacenter_path)
244+
prime_request(PrimeOptions(then=NO_THEN, cluster_name=datacenter_path))
245+
246+
# Only the last datacenter will respond, therefore the first host won't
247+
# We want to make sure the returned hosts are 127.0.0.1, 127.0.0.2, ... 127.0.0.8
248+
roundrobin_lbp._position = 0
249+
250+
# After 3 + 1 seconds the connection should be marked as down and another host retried
251+
response_future = session.execute_async(query_to_prime, timeout=4 * idle_heartbeat_interval
252+
+ idle_heartbeat_timeout)
253+
response_future.result()
254+
self.assertGreater(len(response_future.attempted_hosts), 1)
255+
256+
# No error should be raised here since the hosts have been marked
257+
# as down and there's still 1 DC available
258+
for _ in range(10):
259+
session.execute(query_to_prime)
260+
261+
# Might take some time to close the previous connections and reconnect
262+
time.sleep(10)
263+
assert_quiescent_pool_state(self, cluster)
264+
clear_queries()
265+
266+
time.sleep(10)
267+
assert_quiescent_pool_state(self, cluster)
268+
269+
def test_idle_connection_is_not_closed(self):
270+
"""
271+
Test to ensure that the connections aren't closed if they are idle
272+
@since 3.12
273+
@jira_ticket PYTHON-573
274+
@expected_result the connections aren't closed nor the hosts are
275+
set to down if the connection is idle
276+
277+
@test_category connection
278+
"""
279+
start_and_prime_singledc()
280+
281+
idle_heartbeat_timeout = 1
282+
idle_heartbeat_interval = 1
283+
284+
listener = TrackDownListener()
285+
cluster = Cluster(compression=False,
286+
idle_heartbeat_interval=idle_heartbeat_interval,
287+
idle_heartbeat_timeout=idle_heartbeat_timeout)
288+
session = cluster.connect(wait_for_all_pools=True)
289+
cluster.register_listener(listener)
290+
291+
self.addCleanup(cluster.shutdown)
292+
293+
time.sleep(20)
294+
295+
self.assertEqual(listener.hosts_marked_down, [])
296+
297+
def test_host_is_not_set_to_down_after_query_oto(self):
298+
"""
299+
Test to ensure that the connections aren't closed if there's an
300+
OperationTimedOut in a normal query. This should only happen from the
301+
heart beat thread (in the case of a OperationTimedOut) with the default
302+
configuration
303+
@since 3.12
304+
@expected_result the connections aren't closed nor the hosts are
305+
set to down
306+
307+
@test_category connection
308+
"""
309+
start_and_prime_singledc()
310+
311+
query_to_prime = "SELECT * FROM madeup_keyspace.madeup_table"
312+
313+
prime_query(query_to_prime, then=NO_THEN)
314+
315+
listener = TrackDownListener()
316+
cluster = Cluster(compression=False)
317+
session = cluster.connect(wait_for_all_pools=True)
318+
cluster.register_listener(listener)
319+
320+
futures = []
321+
for _ in range(10):
322+
future = session.execute_async(query_to_prime)
323+
futures.append(future)
324+
325+
for f in futures:
326+
f._event.wait()
327+
self.assertIsInstance(f._final_exception, OperationTimedOut)
328+
329+
self.assertEqual(listener.hosts_marked_down, [])
330+
assert_quiescent_pool_state(self, cluster)

tests/integration/simulacron/test_policies.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
from cassandra.policies import ConstantSpeculativeExecutionPolicy, RoundRobinPolicy
2323

2424
from tests.integration import PROTOCOL_VERSION, greaterthancass21, requiressimulacron, SIMULACRON_JAR
25-
from tests import notwindows
2625
from tests.integration.simulacron.utils import start_and_prime_singledc, prime_query, \
2726
stop_simulacron, NO_THEN, clear_queries
2827

tests/integration/simulacron/utils.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from tests.integration import CASSANDRA_VERSION, SIMULACRON_JAR
1818
import subprocess
1919
import time
20+
import nose
2021

2122
DEFAULT_CLUSTER = "python_simulacron_cluster"
2223

@@ -280,7 +281,7 @@ def start_and_prime_singledc(cluster_name=DEFAULT_CLUSTER):
280281
:param cluster_name: name of the cluster to start and prime
281282
:return:
282283
"""
283-
start_and_prime_cluster_defaults(number_of_dc=1, nodes_per_dc=3, cluster_name=cluster_name)
284+
return start_and_prime_cluster_defaults(number_of_dc=1, nodes_per_dc=3, cluster_name=cluster_name)
284285

285286

286287
def start_and_prime_cluster_defaults(number_of_dc=1, nodes_per_dc=3, version=None, cluster_name=DEFAULT_CLUSTER):
@@ -291,9 +292,11 @@ def start_and_prime_cluster_defaults(number_of_dc=1, nodes_per_dc=3, version=Non
291292
"""
292293
start_simulacron()
293294
data_centers = ",".join([str(nodes_per_dc)] * number_of_dc)
294-
prime_cluster(data_centers=data_centers, version=version, cluster_name=cluster_name)
295+
simulacron_cluster = prime_cluster(data_centers=data_centers, version=version, cluster_name=cluster_name)
295296
prime_driver_defaults()
296297

298+
return simulacron_cluster
299+
297300

298301
default_column_types = {
299302
"key": "bigint",
@@ -316,6 +319,13 @@ def prime_query(query, rows=default_rows, column_types=default_column_types, whe
316319
Shortcut function for priming a query
317320
:return:
318321
"""
322+
# If then is set, then rows and column_types should not be set
323+
if then:
324+
nose.tools.assert_equal(rows, default_rows)
325+
nose.tools.assert_equal(column_types, default_column_types)
326+
rows=None
327+
column_types=None
328+
319329
query = PrimeQuery(query, rows=rows, column_types=column_types, when=when, then=then, cluster_name=cluster_name)
320330
response = prime_request(query)
321331
return response

0 commit comments

Comments
 (0)