New test to restart the entire ONOS cluster before verifying it.
New cord tester API to restart the entire cluster, used by the test.
Changed the restarts to wait for ONOS to start using wait_for_onos_start instead of a hard delay.
Added a robot test for the cluster restart.

Change-Id: I8dbc163462570a6a8eaf8e7684c790fc3fea8f48
diff --git a/src/test/cluster/clusterTest.py b/src/test/cluster/clusterTest.py
index c3005ee..d5ad5fb 100644
--- a/src/test/cluster/clusterTest.py
+++ b/src/test/cluster/clusterTest.py
@@ -23,7 +23,7 @@
 from twisted.internet import defer
 from onosclidriver import OnosCliDriver
 from CordContainer import Container, Onos, Quagga
-from CordTestServer import cord_test_onos_restart, cord_test_onos_shutdown, cord_test_onos_add_cluster, cord_test_quagga_restart
+from CordTestServer import cord_test_onos_restart, cord_test_onos_shutdown, cord_test_onos_add_cluster, cord_test_quagga_restart, cord_test_restart_cluster
 from portmaps import g_subscriber_port_map
 from scapy.all import *
 import time, monotonic
@@ -61,7 +61,7 @@
     acl = cluster_acl()
     dhcprelay = cluster_dhcprelay()
     subscriber = cluster_subscriber()
-    testcaseLoggers = ('test_cluster_controller_restarts',)
+    testcaseLoggers = ('test_cluster_controller_restarts', 'test_cluster_single_controller_restarts', 'test_cluster_restarts')
 
     def setUp(self):
         if self._testMethodName not in self.testcaseLoggers:
@@ -450,6 +450,60 @@
             time.sleep(60)
             check_exception(controller, inclusive = True)
 
+    def test_cluster_restarts(self):
+        '''Test the cluster by repeatedly restarting the entire cluster'''
+        controllers = self.get_controllers()
+        ctlr_len = len(controllers)
+        if ctlr_len <= 1:
+            log.info('ONOS is not running in cluster mode. This test only works for cluster mode')
+            assert_greater(ctlr_len, 1)
+
+        #this call would verify the cluster for once
+        onos_map = self.get_cluster_container_names_ips()
+
+        def check_exception():
+            controller_list = controllers
+            storage_exceptions = []
+            for node in controller_list:
+                onosLog = OnosLog(host = node)
+                ##check the logs for storage exception
+                _, output = onosLog.get_log(('ERROR', 'Exception',))
+                if output and output.find('StorageException$Timeout') >= 0:
+                    log.info('\nStorage Exception Timeout found on node: %s\n' %node)
+                    log.info('Dumping the ERROR and Exception logs for node: %s\n' %node)
+                    log.info('\n' + '-' * 50 + '\n')
+                    log.info('%s' %output)
+                    log.info('\n' + '-' * 50 + '\n')
+                    storage_exceptions.append(node)
+
+            failed = self.verify_leaders(controller_list)
+            if failed:
+                log.info('Leaders command failed on nodes: %s' %failed)
+                if storage_exceptions:
+                    log.info('Storage exception seen on nodes: %s' %storage_exceptions)
+                    assert_equal(len(failed), 0)
+                    return
+
+            for ctlr in controller_list:
+                ips = self.get_cluster_current_member_ips(controller = ctlr,
+                                                          nodes_filter = \
+                                                          lambda nodes: [ n for n in nodes if n['state'] in [ 'ACTIVE', 'READY'] ])
+                log.info('ONOS cluster on node %s formed with controllers: %s' %(ctlr, ips))
+                assert_equal(len(ips), len(controllers))
+
+        tries = 10
+        for num in range(tries):
+            log.info('ITERATION: %d. Restarting cluster with controllers at %s' %(num+1, controllers))
+            try:
+                cord_test_restart_cluster()
+                log.info('Delaying before verifying cluster status')
+                time.sleep(60)
+            except:
+                time.sleep(10)
+                continue
+            #check for exceptions on the adjacent nodes
+            check_exception()
+
     #pass
     def test_cluster_formation_and_verification(self,onos_instances = ONOS_INSTANCES):
 	status = self.verify_cluster_status(onos_instances = onos_instances)
diff --git a/src/test/robot/cluster_controller.robot b/src/test/robot/cluster_controller.robot
index 61269ba..e2a72d7 100644
--- a/src/test/robot/cluster_controller.robot
+++ b/src/test/robot/cluster_controller.robot
@@ -19,6 +19,11 @@
   ${rc}=  Run Cord Tester  cluster:cluster_exchange.test_cluster_single_controller_restarts
   Should Be Equal As Integers  ${rc}  0
 
+Verify Onos Cluster Restart Functionality
+  [Documentation]  Verify ONOS cluster by restarting the entire cluster
+  ${rc}=  Run Cord Tester  cluster:cluster_exchange.test_cluster_restarts
+  Should Be Equal As Integers  ${rc}  0
+
 *** Keywords ***
 Cord Cluster Setup
   [Documentation]  Configure a ${NODES} node ONOS cluster for cord tester
diff --git a/src/test/utils/CordContainer.py b/src/test/utils/CordContainer.py
index a5dbdc6..b64c8fa 100644
--- a/src/test/utils/CordContainer.py
+++ b/src/test/utils/CordContainer.py
@@ -524,7 +524,7 @@
         cls.setup_cluster(cls.cluster_instances)
 
     @classmethod
-    def restart_cluster(cls, network_cfg = None):
+    def restart_cluster(cls, network_cfg = None, timeout = 10, setup = False):
         if cls.cluster_mode is False:
             return
         if not cls.cluster_instances:
@@ -535,19 +535,21 @@
             with open('{}/network-cfg.json'.format(cls.host_config_dir), 'w') as f:
                 f.write(json_data)
 
+        cls.cleanup_cluster()
+        if timeout > 0:
+            time.sleep(timeout)
+
         for onos in cls.cluster_instances:
-            if onos.exists():
-                onos.kill()
-            onos.remove_container(onos.name, force=True)
             print('Restarting ONOS container %s' %onos.name)
             onos.start(ports = onos.ports, environment = onos.env,
                        host_config = onos.host_config, volumes = onos.volumes, tty = True)
-            print('Waiting %d seconds for ONOS %s to boot' %(onos.boot_delay, onos.name))
-            time.sleep(onos.boot_delay)
             onos.ipaddr = onos.ip()
+            onos.wait_for_onos_start(onos.ipaddr)
+            onos.install_cord_apps(onos.ipaddr)
 
-        ##form the cluster
-        cls.setup_cluster(cls.cluster_instances)
+        ##form the cluster as appropriate
+        if setup is True:
+            cls.setup_cluster(cls.cluster_instances)
 
     @classmethod
     def cluster_ips(cls):
@@ -586,11 +588,9 @@
                 print('Restarting ONOS container %s' %onos.name)
                 onos.start(ports = onos.ports, environment = onos.env,
                            host_config = onos.host_config, volumes = onos.volumes, tty = True)
-                #onos.ipaddr = onos.ip()
-                #onos.wait_for_onos_start(onos.ipaddr)
-                print('Waiting %d seconds for ONOS %s to boot' %(onos.boot_delay, onos.name))
-                time.sleep(onos.boot_delay)
                 onos.ipaddr = onos.ip()
+                onos.wait_for_onos_start(onos.ipaddr)
+                onos.install_cord_apps(onos.ipaddr)
 
     @classmethod
     def install_cord_apps(cls, onos_ip = None):
diff --git a/src/test/utils/CordTestServer.py b/src/test/utils/CordTestServer.py
index e529219..de17261 100644
--- a/src/test/utils/CordTestServer.py
+++ b/src/test/utils/CordTestServer.py
@@ -64,6 +64,13 @@
     def shutdown_onos(self, kwargs):
         return self.__shutdown_onos(**kwargs)
 
+    def __restart_cluster(self, config = None, timeout = 10, setup = False):
+        Onos.restart_cluster(network_cfg = config, timeout = timeout, setup = setup)
+        return 'DONE'
+
+    def restart_cluster(self, kwargs):
+        return self.__restart_cluster(**kwargs)
+
     def __add_cluster_onos(self, count = 1, config = None):
         Onos.add_cluster(count = count, network_cfg = config)
         return 'DONE'
@@ -213,6 +220,17 @@
     return False
 
 @nottest
+def __cord_test_restart_cluster(**kwargs):
+    return rpc_server_instance().restart_cluster(kwargs)
+
+@nottest
+def cord_test_restart_cluster(config = None, timeout = 10, setup = False):
+    data = __cord_test_restart_cluster(config = config, timeout = timeout, setup = setup)
+    if data == 'DONE':
+        return True
+    return False
+
+@nottest
 def __cord_test_onos_add_cluster(**kwargs):
     return rpc_server_instance().add_cluster_onos(kwargs)