More support for ONOS in cluster mode
Add a ONOS cluster test case to randomly kill controller and check for exceptions.
Also verify leader status on all the nodes.
Change-Id: I0c5f9e35069c249dcee7135df6f59d2b6e8ad2b4
diff --git a/src/test/cluster/clusterTest.py b/src/test/cluster/clusterTest.py
index 55b47f8..b4c5f16 100644
--- a/src/test/cluster/clusterTest.py
+++ b/src/test/cluster/clusterTest.py
@@ -32,13 +32,15 @@
from Cluster import *
from EapTLS import TLSAuthTest
from ACL import ACLTest
+from OnosLog import OnosLog
+from CordLogger import CordLogger
import os
import json
import random
import collections
log.setLevel('INFO')
-class cluster_exchange(unittest.TestCase):
+class cluster_exchange(CordLogger):
test_path = os.path.dirname(os.path.realpath(__file__))
onos_config_path = os.path.join(test_path, '..', 'setup/onos-config')
mac = RandMAC()._fix()
@@ -59,12 +61,26 @@
acl = cluster_acl()
dhcprelay = cluster_dhcprelay()
subscriber = cluster_subscriber()
+ testcaseLoggers = ('test_cluster_controller_kills',)
+
+ def setUp(self):
+ if self._testMethodName not in self.testcaseLoggers:
+ super(cluster_exchange, self).setUp()
+
+ def tearDown(self):
+ if self._testMethodName not in self.testcaseLoggers:
+ super(cluster_exchange, self).tearDown()
def get_controller(self):
controller = os.getenv('ONOS_CONTROLLER_IP') or 'localhost'
controller = controller.split(',')[0]
return controller
+ @classmethod
+ def get_controllers(cls):
+ controllers = os.getenv('ONOS_CONTROLLER_IP') or ''
+ return controllers.split(',')
+
def cliEnter(self,controller = None):
retries = 0
while retries < 3:
@@ -78,6 +94,27 @@
def cliExit(self):
self.cli.disconnect()
+ def get_leader(self, controller = None):
+ self.cliEnter(controller = controller)
+ result = json.loads(self.cli.leaders(jsonFormat = True))
+ if result is None:
+ log.info('Leaders command failure for controller %s' %controller)
+ else:
+ log.info('Leaders returned: %s' %result)
+ self.cliExit()
+ return result
+
+ def get_leaders(self, controller = None):
+ result = []
+ if type(controller) in [ list, tuple ]:
+ for c in controller:
+ leaders = self.get_leader(controller = c)
+ result.append(leaders)
+ else:
+ leaders = self.get_leader(controller = controller)
+ result.append(leaders)
+ return result
+
def verify_cluster_status(self,controller = None,onos_instances=ONOS_INSTANCES,verify=False):
tries = 0
try:
@@ -132,14 +169,14 @@
return cluster_ips
def get_cluster_container_names_ips(self,controller=None):
- onos_names_ips = {}
- onos_ips = self.get_cluster_current_member_ips(controller=controller)
- #onos_names = [Onos.NAME]
- onos_names_ips[onos_ips[0]] = Onos.NAME
- for i in range(1,len(onos_ips)):
- name = '{0}-{1}'.format(Onos.NAME,i+1)
- onos_names_ips[onos_ips[i]] = name
- #onos_names.append(name)
+ onos_names_ips = {}
+ onos_ips = self.get_cluster_current_member_ips(controller=controller)
+ onos_names_ips[onos_ips[0]] = Onos.NAME
+ onos_names_ips[Onos.NAME] = onos_ips[0]
+ for i in range(1,len(onos_ips)):
+ name = '{0}-{1}'.format(Onos.NAME,i+1)
+ onos_names_ips[onos_ips[i]] = name
+ onos_names_ips[name] = onos_ips[i]
return onos_names_ips
@@ -266,6 +303,66 @@
log.info('Cluster new master is %s'%new_master_ip)
return True
+ def test_cluster_controller_kills(self):
+ '''Test the cluster by repeatedly killing the controllers'''
+ controllers = self.get_controllers()
+ ctlr_len = len(controllers)
+ if ctlr_len <= 1:
+ log.info('ONOS is not running in cluster mode. This test only works for cluster mode')
+ assert_greater(ctlr_len, 1)
+
+ #this call would verify the cluster for once
+ onos_map = self.get_cluster_container_names_ips()
+
+ def check_storage_exception(controller = None):
+ adjacent_controller = None
+ adjacent_controllers = None
+ if controller:
+ adjacent_controllers = set(controllers) - set( [controller] )
+ adjacent_controller = next(iter(adjacent_controllers))
+ for node in controllers:
+ onosLog = OnosLog(host = node)
+ ##check the logs for storage exception
+ _, output = onosLog.get_log(('ERROR', 'Exception',))
+ if output and output.find('StorageException') >= 0:
+ log.info('Storage Exception found on node: %s' %node)
+ log.info('%s' %output)
+ assert_equal('Storage Exception on node {}'.format(node), False)
+ return controller
+
+ try:
+ ips = self.get_cluster_current_member_ips(controller = controller)
+ print('ONOS cluster formed with controllers: %s' %ips)
+ st = True
+ except:
+ st = False
+
+ leaders = self.get_leaders(controllers)
+ failed = filter(lambda l: l == None, leaders)
+ assert_equal(len(failed), 0)
+
+ if st is False:
+ log.info('No storage exception and ONOS cluster was not formed successfully')
+ else:
+ controller = None
+
+ return controller
+
+ next_controller = None
+ tries = 10
+ for num in range(tries):
+ index = num % ctlr_len
+ #index = random.randrange(0, ctlr_len)
+ controller = onos_map[controllers[index]] if next_controller is None else next_controller
+ log.info('Restarting Controller %s' %controller)
+ try:
+ cord_test_onos_restart(node = controller)
+ time.sleep(30)
+ except:
+ time.sleep(5)
+ continue
+ next_controller = check_storage_exception(controller = controller)
+
#pass
def test_cluster_formation_and_verification(self,onos_instances = ONOS_INSTANCES):
status = self.verify_cluster_status(onos_instances = onos_instances)
diff --git a/src/test/utils/CordContainer.py b/src/test/utils/CordContainer.py
index fb71403..c4a73a0 100644
--- a/src/test/utils/CordContainer.py
+++ b/src/test/utils/CordContainer.py
@@ -201,7 +201,9 @@
def restart(self, timeout =10):
return self.dckr.restart(self.name, timeout)
-def get_mem():
+def get_mem(instances = 1):
+ if instances <= 0:
+ instances = 1
with open('/proc/meminfo', 'r') as fd:
meminfo = fd.readlines()
mem = 0
@@ -209,7 +211,7 @@
if m.startswith('MemTotal:') or m.startswith('SwapTotal:'):
mem += int(m.split(':')[1].strip().split()[0])
- mem = max(mem/1024/1024/2, 1)
+ mem = max(mem/1024/1024/2/instances, 1)
mem = min(mem, 16)
return str(mem) + 'G'
@@ -299,7 +301,9 @@
quagga_config = ( { 'bridge' : 'quagga-br', 'ip': '10.10.0.4', 'mask' : 16 }, )
SYSTEM_MEMORY = (get_mem(),) * 2
+ INSTANCE_MEMORY = (get_mem(instances=3),) * 2
JAVA_OPTS = '-Xms{} -Xmx{} -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode'.format(*SYSTEM_MEMORY)#-XX:+PrintGCDetails -XX:+PrintGCTimeStamps'
+ JAVA_OPTS_CLUSTER = '-Xms{} -Xmx{} -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode'.format(*INSTANCE_MEMORY)
env = { 'ONOS_APPS' : 'drivers,openflow,proxyarp,vrouter', 'JAVA_OPTS' : JAVA_OPTS }
onos_cord_apps = ( ('cord-config', '1.0-SNAPSHOT'),
('aaa', '1.0-SNAPSHOT'),
@@ -369,6 +373,7 @@
self.boot_delay = boot_delay
if cluster is True:
self.ports = []
+ self.env['JAVA_OPTS'] = self.JAVA_OPTS_CLUSTER
if os.access(self.cluster_cfg, os.F_OK):
try:
os.unlink(self.cluster_cfg)
diff --git a/src/test/utils/OnosLog.py b/src/test/utils/OnosLog.py
index 5baa6f1..f5dcb77 100644
--- a/src/test/utils/OnosLog.py
+++ b/src/test/utils/OnosLog.py
@@ -6,7 +6,7 @@
CLI_USER = 'karaf'
CLI_PASSWD = 'karaf'
CLI_PORT = 8101
- HOST = os.getenv('ONOS_CONTROLLER_IP', '172.17.0.2')
+ HOST = os.getenv('ONOS_CONTROLLER_IP', '172.17.0.2').split(',')[0]
last_snapshot_map = {}
def __init__(self, host = HOST):
@@ -30,7 +30,7 @@
cmd = 'cat /root/onos/apache-karaf-3.0.5/data/log/karaf.log'
st, output = self.ssh_agent.run_cmd(cmd)
if st is False:
- return output
+ return st, output
exception_map = {'Exception' : [] }
last_snapshot = self.get_last_snapshot(self.ssh_agent.host)
lines = output.splitlines()