More support for ONOS in cluster mode
Add a ONOS cluster test case to randomly kill controller and check for exceptions.
Also verify leader status on all the nodes.

Change-Id: I0c5f9e35069c249dcee7135df6f59d2b6e8ad2b4
diff --git a/src/test/cluster/clusterTest.py b/src/test/cluster/clusterTest.py
index 55b47f8..b4c5f16 100644
--- a/src/test/cluster/clusterTest.py
+++ b/src/test/cluster/clusterTest.py
@@ -32,13 +32,15 @@
 from Cluster import *
 from EapTLS import TLSAuthTest
 from ACL import ACLTest
+from OnosLog import OnosLog
+from CordLogger import CordLogger
 import os
 import json
 import random
 import collections
 log.setLevel('INFO')
 
-class cluster_exchange(unittest.TestCase):
+class cluster_exchange(CordLogger):
     test_path = os.path.dirname(os.path.realpath(__file__))
     onos_config_path = os.path.join(test_path, '..', 'setup/onos-config')
     mac = RandMAC()._fix()
@@ -59,12 +61,26 @@
     acl = cluster_acl()
     dhcprelay = cluster_dhcprelay()
     subscriber = cluster_subscriber()
+    testcaseLoggers = ('test_cluster_controller_kills',)
+
+    def setUp(self):
+        if self._testMethodName not in self.testcaseLoggers:
+            super(cluster_exchange, self).setUp()
+
+    def tearDown(self):
+        if self._testMethodName not in self.testcaseLoggers:
+            super(cluster_exchange, self).tearDown()
 
     def get_controller(self):
         controller = os.getenv('ONOS_CONTROLLER_IP') or 'localhost'
         controller = controller.split(',')[0]
         return controller
 
+    @classmethod
+    def get_controllers(cls):
+        controllers = os.getenv('ONOS_CONTROLLER_IP') or ''
+        return controllers.split(',')
+
     def cliEnter(self,controller = None):
         retries = 0
         while retries < 3:
@@ -78,6 +94,27 @@
     def cliExit(self):
         self.cli.disconnect()
 
+    def get_leader(self, controller = None):
+        self.cliEnter(controller = controller)
+        result = json.loads(self.cli.leaders(jsonFormat = True))
+        if result is None:
+            log.info('Leaders command failure for controller %s' %controller)
+        else:
+            log.info('Leaders returned: %s' %result)
+        self.cliExit()
+        return result
+
+    def get_leaders(self, controller = None):
+        result = []
+        if type(controller) in [ list, tuple ]:
+            for c in controller:
+                leaders = self.get_leader(controller = c)
+                result.append(leaders)
+        else:
+            leaders = self.get_leader(controller = controller)
+            result.append(leaders)
+        return result
+
     def verify_cluster_status(self,controller = None,onos_instances=ONOS_INSTANCES,verify=False):
 	tries = 0
 	try:
@@ -132,14 +169,14 @@
             return cluster_ips
 
     def get_cluster_container_names_ips(self,controller=None):
-	onos_names_ips = {}
-	onos_ips = self.get_cluster_current_member_ips(controller=controller)
-	#onos_names = [Onos.NAME]
-	onos_names_ips[onos_ips[0]] = Onos.NAME
-	for i in range(1,len(onos_ips)):
-	    name = '{0}-{1}'.format(Onos.NAME,i+1)
-	    onos_names_ips[onos_ips[i]] = name
-	    #onos_names.append(name)
+        onos_names_ips = {}
+        onos_ips = self.get_cluster_current_member_ips(controller=controller)
+        onos_names_ips[onos_ips[0]] = Onos.NAME
+        onos_names_ips[Onos.NAME] = onos_ips[0]
+        for i in range(1,len(onos_ips)):
+            name = '{0}-{1}'.format(Onos.NAME,i+1)
+            onos_names_ips[onos_ips[i]] = name
+            onos_names_ips[name] = onos_ips[i]
 
         return onos_names_ips
 
@@ -266,6 +303,66 @@
 	log.info('Cluster new master is %s'%new_master_ip)
 	return True
 
+    def test_cluster_controller_kills(self):
+        '''Test the cluster by repeatedly killing the controllers'''
+        controllers = self.get_controllers()
+        ctlr_len = len(controllers)
+        if ctlr_len <= 1:
+            log.info('ONOS is not running in cluster mode. This test only works for cluster mode')
+            assert_greater(ctlr_len, 1)
+
+        #this call would verify the cluster for once
+        onos_map = self.get_cluster_container_names_ips()
+
+        def check_storage_exception(controller = None):
+            adjacent_controller = None
+            adjacent_controllers = None
+            if controller:
+                adjacent_controllers = set(controllers) - set( [controller] )
+                adjacent_controller = next(iter(adjacent_controllers))
+            for node in controllers:
+                onosLog = OnosLog(host = node)
+                ##check the logs for storage exception
+                _, output = onosLog.get_log(('ERROR', 'Exception',))
+                if output and output.find('StorageException') >= 0:
+                    log.info('Storage Exception found on node: %s' %node)
+                    log.info('%s' %output)
+                    assert_equal('Storage Exception on node {}'.format(node), False)
+                    return controller
+
+            try:
+                ips = self.get_cluster_current_member_ips(controller = controller)
+                print('ONOS cluster formed with controllers: %s' %ips)
+                st = True
+            except:
+                st = False
+
+            leaders = self.get_leaders(controllers)
+            failed = filter(lambda l: l == None, leaders)
+            assert_equal(len(failed), 0)
+
+            if st is False:
+                log.info('No storage exception and ONOS cluster was not formed successfully')
+            else:
+                controller = None
+
+            return controller
+
+        next_controller = None
+        tries = 10
+        for num in range(tries):
+            index = num % ctlr_len
+            #index = random.randrange(0, ctlr_len)
+            controller = onos_map[controllers[index]] if next_controller is None else next_controller
+            log.info('Restarting Controller %s' %controller)
+            try:
+                cord_test_onos_restart(node = controller)
+                time.sleep(30)
+            except:
+                time.sleep(5)
+                continue
+            next_controller = check_storage_exception(controller = controller)
+
     #pass
     def test_cluster_formation_and_verification(self,onos_instances = ONOS_INSTANCES):
 	status = self.verify_cluster_status(onos_instances = onos_instances)
diff --git a/src/test/utils/CordContainer.py b/src/test/utils/CordContainer.py
index fb71403..c4a73a0 100644
--- a/src/test/utils/CordContainer.py
+++ b/src/test/utils/CordContainer.py
@@ -201,7 +201,9 @@
     def restart(self, timeout =10):
         return self.dckr.restart(self.name, timeout)
 
-def get_mem():
+def get_mem(instances = 1):
+    if instances <= 0:
+        instances = 1
     with open('/proc/meminfo', 'r') as fd:
         meminfo = fd.readlines()
         mem = 0
@@ -209,7 +211,7 @@
             if m.startswith('MemTotal:') or m.startswith('SwapTotal:'):
                 mem += int(m.split(':')[1].strip().split()[0])
 
-        mem = max(mem/1024/1024/2, 1)
+        mem = max(mem/1024/1024/2/instances, 1)
         mem = min(mem, 16)
         return str(mem) + 'G'
 
@@ -299,7 +301,9 @@
 
     quagga_config = ( { 'bridge' : 'quagga-br', 'ip': '10.10.0.4', 'mask' : 16 }, )
     SYSTEM_MEMORY = (get_mem(),) * 2
+    INSTANCE_MEMORY = (get_mem(instances=3),) * 2
     JAVA_OPTS = '-Xms{} -Xmx{} -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode'.format(*SYSTEM_MEMORY)#-XX:+PrintGCDetails -XX:+PrintGCTimeStamps'
+    JAVA_OPTS_CLUSTER = '-Xms{} -Xmx{} -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode'.format(*INSTANCE_MEMORY)
     env = { 'ONOS_APPS' : 'drivers,openflow,proxyarp,vrouter', 'JAVA_OPTS' : JAVA_OPTS }
     onos_cord_apps = ( ('cord-config', '1.0-SNAPSHOT'),
                        ('aaa', '1.0-SNAPSHOT'),
@@ -369,6 +373,7 @@
         self.boot_delay = boot_delay
         if cluster is True:
             self.ports = []
+            self.env['JAVA_OPTS'] = self.JAVA_OPTS_CLUSTER
             if os.access(self.cluster_cfg, os.F_OK):
                 try:
                     os.unlink(self.cluster_cfg)
diff --git a/src/test/utils/OnosLog.py b/src/test/utils/OnosLog.py
index 5baa6f1..f5dcb77 100644
--- a/src/test/utils/OnosLog.py
+++ b/src/test/utils/OnosLog.py
@@ -6,7 +6,7 @@
     CLI_USER = 'karaf'
     CLI_PASSWD = 'karaf'
     CLI_PORT = 8101
-    HOST = os.getenv('ONOS_CONTROLLER_IP', '172.17.0.2')
+    HOST = os.getenv('ONOS_CONTROLLER_IP', '172.17.0.2').split(',')[0]
     last_snapshot_map = {}
 
     def __init__(self, host = HOST):
@@ -30,7 +30,7 @@
         cmd = 'cat /root/onos/apache-karaf-3.0.5/data/log/karaf.log'
         st, output = self.ssh_agent.run_cmd(cmd)
         if st is False:
-            return output
+            return st, output
         exception_map = {'Exception' : [] }
         last_snapshot = self.get_last_snapshot(self.ssh_agent.host)
         lines = output.splitlines()