VOL-1670 - close and reestablish connections
When gRPC connectivity to the core is broken, the ofagent will
break the connection to ONOS and then work to reconnect to
the core. After connecting to the core, the connection to
ONOS will be reestablished.
Change-Id: I75e645de3784a64ef4f9992df8baf37959cbbd86
diff --git a/python/ofagent/agent.py b/python/ofagent/agent.py
index aa45f67..e4d415d 100755
--- a/python/ofagent/agent.py
+++ b/python/ofagent/agent.py
@@ -35,10 +35,10 @@
cached_generation_id = None
def __init__(self,
+ connection_manager,
controller_endpoint,
datapath_id,
device_id,
- rpc_stub,
enable_tls=False,
key_file=None,
cert_file=None,
@@ -47,7 +47,7 @@
self.controller_endpoint = controller_endpoint
self.datapath_id = datapath_id
self.device_id = device_id
- self.rpc_stub = rpc_stub
+ self.connection_manager = connection_manager
self.enable_tls = enable_tls
self.key_file = key_file
self.cert_file = cert_file
@@ -142,7 +142,7 @@
def protocol(self):
cxn = OpenFlowConnection(self) # Low level message handler
self.proto_handler = OpenFlowProtocolHandler(
- self.datapath_id, self.device_id, self, cxn, self.rpc_stub)
+ self.datapath_id, self.device_id, self, cxn)
return cxn
def clientConnectionFailed(self, connector, reason):
@@ -166,7 +166,6 @@
else:
log.error('unknown-change-event', change_event=event)
-
if __name__ == '__main__':
"""Run this to test the agent for N concurrent sessions:
python agent [<number-of-desired-instances>]
diff --git a/python/ofagent/connection_mgr.py b/python/ofagent/connection_mgr.py
index 3b10280..9c64854 100755
--- a/python/ofagent/connection_mgr.py
+++ b/python/ofagent/connection_mgr.py
@@ -37,12 +37,6 @@
# _ = third_party
class ConnectionManager(object):
- running = False
- core_ready = False
- channel = None
- subscription = None
- grpc_client = None
-
def __init__(self, consul_endpoint,
vcore_endpoint, vcore_grpc_timeout, vcore_binding_key,
vcore_transaction_key, controller_endpoints, instance_id,
@@ -73,6 +67,8 @@
self.devices_refresh_interval = devices_refresh_interval
self.subscription_refresh_interval = subscription_refresh_interval
self.subscription = None
+ self.connecting = True
+ self.monitor = True
self.running = False
@@ -84,8 +80,6 @@
log.debug('starting')
self.running = True
- ConnectionManager.core_ready = True # Assume core is ready until proven otherwise
- ConnectionManager.running = True
# Get a subscription to vcore
reactor.callInThread(self.get_vcore_subscription)
@@ -97,25 +91,25 @@
return self
- @classmethod
- def liveness_probe(cls):
- # Pod restarts when liveness condition fails
- return ConnectionManager.running
+ def grpc_client_terminated(self):
+ if not self.connecting and self.grpc_client is not None:
+ self.connecting = True
+ self._reset_grpc_attributes()
+ self.delete_all_agents()
+ reactor.callInThread(self.get_vcore_subscription)
- @classmethod
- def readiness_probe(cls):
+ def liveness_probe(self):
+ return self.running
+
+ def readiness_probe(self):
# Pod is isolated when readiness condition fails
- return bool(ConnectionManager.core_ready and ConnectionManager.channel and ConnectionManager.subscription and ConnectionManager.grpc_client)
+ return bool(not self.connecting and self.channel and self.subscription and self.grpc_client)
def stop(self):
log.debug('stopping')
- # clean up all controller connections
- for agent in self.agent_map.itervalues():
- agent.stop()
- self.running = False
-
+ self.delete_all_agents()
self._reset_grpc_attributes()
-
+ self.running = False
log.info('stopped')
def resolve_endpoint(self, endpoint):
@@ -148,10 +142,6 @@
self.subscription = None
self.grpc_client = None
- ConnectionManager.channel = None
- ConnectionManager.subscription = None
- ConnectionManager.grpc_client = None
-
log.debug('stop-reset-grpc-attributes')
def _assign_grpc_attributes(self):
@@ -166,15 +156,13 @@
# Establish a connection to the vcore GRPC server
self.channel = grpc.insecure_channel('{}:{}'.format(host, port))
- # For Readiness probe
- ConnectionManager.channel = self.channel
-
log.debug('stop-assign-grpc-attributes')
@inlineCallbacks
def get_vcore_subscription(self):
log.debug('start-get-vcore-subscription')
+ self.connecting = True
while self.running and self.subscription is None:
try:
# If a subscription is not yet assigned then establish new GRPC connection
@@ -190,9 +178,6 @@
subscription = yield self.grpc_client.subscribe(
OfAgentSubscriber(ofagent_id=container_name))
- #For Readiness probes
- ConnectionManager.subscription = subscription
- ConnectionManager.grpc_client = self.grpc_client
# If the subscriber id matches the current instance
# ... then the subscription has succeeded
if subscription is not None and subscription.ofagent_id == container_name:
@@ -201,6 +186,7 @@
log.debug('subscription-with-vcore-successful', subscription=subscription)
self.subscription = subscription
self.grpc_client.start()
+ self.connecting = False
# Sleep a bit in between each subscribe
yield asleep(self.subscription_refresh_interval)
@@ -228,30 +214,32 @@
log.debug('stop-get-vcore-subscription')
+ def get_rpc_client(self):
+ return self.grpc_client if not self.connecting else None
+
@inlineCallbacks
def get_list_of_logical_devices_from_voltha(self):
while self.running:
log.info('retrieve-logical-device-list')
- try:
- devices = yield \
- self.grpc_client.list_logical_devices()
+ rpc = self.get_rpc_client()
+ if rpc is not None:
+ try:
+ devices = yield rpc.list_logical_devices()
- ConnectionManager.core_ready = True # We've successfully talked to the core
+ for device in devices:
+ log.info("logical-device-entry", id=device.id,
+ datapath_id=device.datapath_id)
- for device in devices:
- log.info("logical-device-entry", id=device.id,
- datapath_id=device.datapath_id)
+ returnValue(devices)
- returnValue(devices)
+ except _Rendezvous, e:
+ rpc.stop()
+ status = e.code()
+ log.error('vcore-communication-failure', exception=e, status=status)
- except _Rendezvous, e:
- status = e.code()
- log.error('vcore-communication-failure', exception=e, status=status)
- ConnectionManager.core_ready = False # Will be reflected in readiness probe
-
- except Exception as e:
- log.exception('logical-devices-retrieval-failure', exception=e)
- ConnectionManager.core_ready = False # will be reflected in readiness probe
+ except Exception as e:
+ rpc.stop()
+ log.exception('logical-devices-retrieval-failure', exception=e)
log.info('reconnect', after_delay=self.vcore_retry_interval)
yield asleep(self.vcore_retry_interval)
@@ -295,13 +283,18 @@
datapath_id = device.datapath_id
device_id = device.id
for controller_endpoint in self.controller_endpoints:
- agent = Agent(controller_endpoint, datapath_id,
- device_id, self.grpc_client, self.enable_tls,
+ agent = Agent(self, controller_endpoint, datapath_id,
+ device_id, self.enable_tls,
self.key_file, self.cert_file)
agent.start()
self.agent_map[(datapath_id,controller_endpoint)] = agent
self.device_id_to_datapath_id_map[device_id] = datapath_id
+ def delete_all_agents(self):
+ for agent in self.agent_map.itervalues(): agent.stop()
+ self.agent_map = {}
+ self.device_id_to_datapath_id_map = {}
+
def delete_agent(self, datapath_id):
for controller_endpoint in self.controller_endpoints:
agent = self.agent_map[(datapath_id,controller_endpoint)]
@@ -321,11 +314,9 @@
# see https://jira.opencord.org/browse/CORD-821
try:
- if self.channel is not None and self.grpc_client is not None and \
- self.subscription is not None:
+ if self.channel is not None and self.get_rpc_client() is not None and self.subscription is not None:
# get current list from Voltha
- devices = yield \
- self.get_list_of_logical_devices_from_voltha()
+ devices = yield self.get_list_of_logical_devices_from_voltha()
# update agent list and mapping tables as needed
self.refresh_agent_connections(devices)
diff --git a/python/ofagent/grpc_client.py b/python/ofagent/grpc_client.py
index c8b2580..8e7b393 100755
--- a/python/ofagent/grpc_client.py
+++ b/python/ofagent/grpc_client.py
@@ -82,8 +82,13 @@
return self
def stop(self):
+ log.debug('stop requested')
+ if self.stopped:
+ log.debug('already stopped, no action taken')
+ return
log.debug('stopping')
self.stopped = True
+ self.connection_manager.grpc_client_terminated()
log.info('stopped')
def get_core_transaction_metadata(self):
@@ -92,7 +97,7 @@
def start_packet_out_stream(self):
def packet_generator():
- while 1:
+ while True:
try:
packet = self.packet_out_queue.get(block=True, timeout=1.0)
except Empty:
@@ -110,7 +115,7 @@
except _Rendezvous, e:
log.error('grpc-exception', status=e.code())
if e.code() == StatusCode.UNAVAILABLE:
- os.system("kill -15 {}".format(os.getpid()))
+ self.stop()
reactor.callInThread(stream_packets_out)
@@ -132,7 +137,7 @@
except _Rendezvous, e:
log.error('grpc-exception', status=e.code())
if e.code() == StatusCode.UNAVAILABLE:
- os.system("kill -15 {}".format(os.getpid()))
+ self.stop()
reactor.callInThread(receive_packet_in_stream)
@@ -152,7 +157,7 @@
except _Rendezvous, e:
log.error('grpc-exception', status=e.code())
if e.code() == StatusCode.UNAVAILABLE:
- os.system("kill -15 {}".format(os.getpid()))
+ self.stop()
reactor.callInThread(receive_change_events)
diff --git a/python/ofagent/main.py b/python/ofagent/main.py
index 58afe15..ec2ad0f 100755
--- a/python/ofagent/main.py
+++ b/python/ofagent/main.py
@@ -276,6 +276,7 @@
args.enable_tls,
args.key_file,
args.cert_file).start()
+ Probe.connection_manager = self.connection_manager
self.log.info('started-internal-services')
@inlineCallbacks
diff --git a/python/ofagent/of_protocol_handler.py b/python/ofagent/of_protocol_handler.py
index a4b3f70..f8f817c 100755
--- a/python/ofagent/of_protocol_handler.py
+++ b/python/ofagent/of_protocol_handler.py
@@ -34,21 +34,18 @@
MAX_METER_BANDS = 255
MAX_METER_COLORS = 255
- def __init__(self, datapath_id, device_id, agent, cxn, rpc):
+ def __init__(self, datapath_id, device_id, agent, cxn):
"""
The upper half of the OpenFlow protocol, focusing on message
exchanges.
:param agent: Reference to the Agent() instance, can be used to
indicate critical errors to break the connection.
:param cxn: The lower level message serdes part of the OF protocol.
- :param rpc: The application level stub on which RPC calls
- are made as result of processing incoming OpenFlow request messages.
"""
self.datapath_id = datapath_id
self.device_id = device_id
self.agent = agent
self.cxn = cxn
- self.rpc = rpc
self.role = None
@inlineCallbacks
@@ -98,12 +95,14 @@
@inlineCallbacks
def handle_feature_request(self, req):
- device_info = yield self.rpc.get_device_info(self.device_id)
- kw = pb2dict(device_info.switch_features)
- self.cxn.send(ofp.message.features_reply(
- xid=req.xid,
- datapath_id=self.datapath_id,
- **kw))
+ rpc = self.agent.connection_manager.get_rpc_client()
+ if rpc is not None:
+ device_info = yield rpc.get_device_info(self.device_id)
+ kw = pb2dict(device_info.switch_features)
+ self.cxn.send(ofp.message.features_reply(
+ xid=req.xid,
+ datapath_id=self.datapath_id,
+ **kw))
def handle_stats_request(self, req):
handler = self.stats_handlers.get(req.stats_type, None)
@@ -122,40 +121,47 @@
raise NotImplementedError()
def handle_flow_mod_request(self, req):
- if self.role == ofp.OFPCR_ROLE_MASTER or self.role == ofp.OFPCR_ROLE_EQUAL:
- try:
- grpc_req = to_grpc(req)
- except Exception, e:
- log.exception('failed-to-convert', e=e)
- else:
- return self.rpc.update_flow_table(self.device_id, grpc_req)
+ log.debug('flow mod request')
+ rpc = self.agent.connection_manager.get_rpc_client()
+ if rpc is not None:
+ if self.role == ofp.OFPCR_ROLE_MASTER or self.role == ofp.OFPCR_ROLE_EQUAL:
+ try:
+ grpc_req = to_grpc(req)
+ except Exception, e:
+ log.exception('failed-to-convert', e=e)
+ else:
+ return rpc.update_flow_table(self.device_id, grpc_req)
- elif self.role == ofp.OFPCR_ROLE_SLAVE:
- self.cxn.send(ofp.message.bad_request_error_msg(code=ofp.OFPBRC_IS_SLAVE))
+ elif self.role == ofp.OFPCR_ROLE_SLAVE:
+ self.cxn.send(ofp.message.bad_request_error_msg(code=ofp.OFPBRC_IS_SLAVE))
def handle_meter_mod_request(self, req):
log.info('Received handle_meter_mod_request', request=req)
- if self.role == ofp.OFPCR_ROLE_MASTER or self.role == ofp.OFPCR_ROLE_EQUAL:
- try:
- grpc_req = to_grpc(req)
- except Exception, e:
- log.exception('failed-to-convert-meter-mod-request', e=e)
- else:
- return self.rpc.update_meter_mod_table(self.device_id, grpc_req)
+ rpc = self.agent.connection_manager.get_rpc_client()
+ if rpc is not None:
+ if self.role == ofp.OFPCR_ROLE_MASTER or self.role == ofp.OFPCR_ROLE_EQUAL:
+ try:
+ grpc_req = to_grpc(req)
+ except Exception, e:
+ log.exception('failed-to-convert-meter-mod-request', e=e)
+ else:
+ return rpc.update_meter_mod_table(self.device_id, grpc_req)
- elif self.role == ofp.OFPCR_ROLE_SLAVE:
- self.cxn.send(ofp.message.bad_request_error_msg(code=ofp.OFPBRC_IS_SLAVE))
+ elif self.role == ofp.OFPCR_ROLE_SLAVE:
+ self.cxn.send(ofp.message.bad_request_error_msg(code=ofp.OFPBRC_IS_SLAVE))
@inlineCallbacks
def handle_meter_stats_request(self, req):
log.info('Received handle_meter_stats_request', request=req)
- try:
- meters = yield self.rpc.list_meters(self.device_id)
- self.cxn.send(ofp.message.meter_stats_reply(
- xid=req.xid, entries=[to_loxi(m.stats) for m in meters]))
- except Exception, e:
- log.exception("failed-meter-stats-request", req=req, e=e)
+ rpc = self.agent.connection_manager.get_rpc_client()
+ if rpc is not None:
+ try:
+ meters = yield rpc.list_meters(self.device_id)
+ self.cxn.send(ofp.message.meter_stats_reply(
+ xid=req.xid, entries=[to_loxi(m.stats) for m in meters]))
+ except Exception, e:
+ log.exception("failed-meter-stats-request", req=req, e=e)
def handle_get_async_request(self, req):
raise NotImplementedError()
@@ -168,10 +174,12 @@
@inlineCallbacks
def handle_group_mod_request(self, req):
- if self.role == ofp.OFPCR_ROLE_MASTER or self.role == ofp.OFPCR_ROLE_EQUAL:
- yield self.rpc.update_group_table(self.device_id, to_grpc(req))
- elif self.role == ofp.OFPCR_ROLE_SLAVE:
- self.cxn.send(ofp.message.bad_request_error_msg(code=ofp.OFPBRC_IS_SLAVE))
+ rpc = self.agent.connection_manager.get_rpc_client()
+ if rpc is not None:
+ if self.role == ofp.OFPCR_ROLE_MASTER or self.role == ofp.OFPCR_ROLE_EQUAL:
+ yield rpc.update_group_table(self.device_id, to_grpc(req))
+ elif self.role == ofp.OFPCR_ROLE_SLAVE:
+ self.cxn.send(ofp.message.bad_request_error_msg(code=ofp.OFPBRC_IS_SLAVE))
def handle_role_request(self, req):
if req.role == ofp.OFPCR_ROLE_MASTER or req.role == ofp.OFPCR_ROLE_SLAVE:
@@ -192,11 +200,13 @@
xid=req.xid, role=req.role))
def handle_packet_out_request(self, req):
- if self.role == ofp.OFPCR_ROLE_MASTER or self.role == ofp.OFPCR_ROLE_EQUAL:
- self.rpc.send_packet_out(self.device_id, to_grpc(req))
+ rpc = self.agent.connection_manager.get_rpc_client()
+ if rpc is not None:
+ if self.role == ofp.OFPCR_ROLE_MASTER or self.role == ofp.OFPCR_ROLE_EQUAL:
+ rpc.send_packet_out(self.device_id, to_grpc(req))
- elif self.role == ofp.OFPCR_ROLE_SLAVE:
- self.cxn.send(ofp.message.bad_request_error_msg(code=ofp.OFPBRC_IS_SLAVE))
+ elif self.role == ofp.OFPCR_ROLE_SLAVE:
+ self.cxn.send(ofp.message.bad_request_error_msg(code=ofp.OFPBRC_IS_SLAVE))
def handle_set_config_request(self, req):
# Handle set config appropriately
@@ -205,18 +215,20 @@
@inlineCallbacks
def handle_port_mod_request(self, req):
- if self.role == ofp.OFPCR_ROLE_MASTER or self.role == ofp.OFPCR_ROLE_EQUAL:
- port = yield self.rpc.get_port(self.device_id, str(req.port_no))
+ rpc = self.agent.connection_manager.get_rpc_client()
+ if rpc is not None:
+ if self.role == ofp.OFPCR_ROLE_MASTER or self.role == ofp.OFPCR_ROLE_EQUAL:
+ port = yield rpc.get_port(self.device_id, str(req.port_no))
- if port.ofp_port.config & ofp.OFPPC_PORT_DOWN != \
- req.config & ofp.OFPPC_PORT_DOWN:
- if req.config & ofp.OFPPC_PORT_DOWN:
- self.rpc.disable_port(self.device_id, port.id)
- else:
- self.rpc.enable_port(self.device_id, port.id)
+ if port.ofp_port.config & ofp.OFPPC_PORT_DOWN != \
+ req.config & ofp.OFPPC_PORT_DOWN:
+ if req.config & ofp.OFPPC_PORT_DOWN:
+ rpc.disable_port(self.device_id, port.id)
+ else:
+ rpc.enable_port(self.device_id, port.id)
- elif self.role == ofp.OFPCR_ROLE_SLAVE:
- self.cxn.send(ofp.message.bad_request_error_msg(code=ofp.OFPBRC_IS_SLAVE))
+ elif self.role == ofp.OFPCR_ROLE_SLAVE:
+ self.cxn.send(ofp.message.bad_request_error_msg(code=ofp.OFPBRC_IS_SLAVE))
def handle_table_mod_request(self, req):
raise NotImplementedError()
@@ -232,33 +244,41 @@
@inlineCallbacks
def handle_device_description_request(self, req):
- device_info = yield self.rpc.get_device_info(self.device_id)
- kw = pb2dict(device_info.desc)
- self.cxn.send(ofp.message.desc_stats_reply(xid=req.xid, **kw))
+ rpc = self.agent.connection_manager.get_rpc_client()
+ if rpc is not None:
+ device_info = yield rpc.get_device_info(self.device_id)
+ kw = pb2dict(device_info.desc)
+ self.cxn.send(ofp.message.desc_stats_reply(xid=req.xid, **kw))
def handle_experimenter_stats_request(self, req):
raise NotImplementedError()
@inlineCallbacks
def handle_flow_stats_request(self, req):
- try:
- flow_stats = yield self.rpc.list_flows(self.device_id)
- self.cxn.send(ofp.message.flow_stats_reply(
- xid=req.xid, entries=[to_loxi(f) for f in flow_stats]))
- except Exception, e:
- log.exception('failed-flow-stats-request', req=req)
+ rpc = self.agent.connection_manager.get_rpc_client()
+ if rpc is not None:
+ try:
+ flow_stats = yield rpc.list_flows(self.device_id)
+ self.cxn.send(ofp.message.flow_stats_reply(
+ xid=req.xid, entries=[to_loxi(f) for f in flow_stats]))
+ except Exception, e:
+ log.exception('failed-flow-stats-request', req=req)
@inlineCallbacks
def handle_group_stats_request(self, req):
- group_stats = yield self.rpc.list_groups(self.device_id)
- self.cxn.send(ofp.message.group_stats_reply(
- xid=req.xid, entries=[to_loxi(g.stats) for g in group_stats]))
+ rpc = self.agent.connection_manager.get_rpc_client()
+ if rpc is not None:
+ group_stats = yield rpc.list_groups(self.device_id)
+ self.cxn.send(ofp.message.group_stats_reply(
+ xid=req.xid, entries=[to_loxi(g.stats) for g in group_stats]))
@inlineCallbacks
def handle_group_descriptor_request(self, req):
- group_stats = yield self.rpc.list_groups(self.device_id)
- self.cxn.send(ofp.message.group_desc_stats_reply(
- xid=req.xid, entries=[to_loxi(g.desc) for g in group_stats]))
+ rpc = self.agent.connection_manager.get_rpc_client()
+ if rpc is not None:
+ group_stats = yield rpc.list_groups(self.device_id)
+ self.cxn.send(ofp.message.group_desc_stats_reply(
+ xid=req.xid, entries=[to_loxi(g.desc) for g in group_stats]))
def handle_group_features_request(self, req):
raise NotImplementedError()
@@ -277,26 +297,30 @@
@inlineCallbacks
def handle_port_stats_request(self, req):
- try:
- ports = yield self.rpc.list_ports(self.device_id)
- port_stats = [to_loxi(p.ofp_port_stats) for p in ports]
- of_message = ofp.message.port_stats_reply(
- xid=req.xid,entries=port_stats)
- self.cxn.send(of_message)
- except:
- log.exception('failed-port_stats-request', req=req)
+ rpc = self.agent.connection_manager.get_rpc_client()
+ if rpc is not None:
+ try:
+ ports = yield rpc.list_ports(self.device_id)
+ port_stats = [to_loxi(p.ofp_port_stats) for p in ports]
+ of_message = ofp.message.port_stats_reply(
+ xid=req.xid,entries=port_stats)
+ self.cxn.send(of_message)
+ except:
+ log.exception('failed-port_stats-request', req=req)
@inlineCallbacks
def handle_port_desc_request(self, req):
- port_list = yield self.rpc.get_port_list(self.device_id)
- try:
- self.cxn.send(ofp.message.port_desc_stats_reply(
- xid=req.xid,
- #flags=None,
- entries=[to_loxi(port.ofp_port) for port in port_list]
- ))
- except Exception as err:
- log.exception('failed-port-desc-reply', err=err)
+ rpc = self.agent.connection_manager.get_rpc_client()
+ if rpc is not None:
+ port_list = yield rpc.get_port_list(self.device_id)
+ try:
+ self.cxn.send(ofp.message.port_desc_stats_reply(
+ xid=req.xid,
+ #flags=None,
+ entries=[to_loxi(port.ofp_port) for port in port_list]
+ ))
+ except Exception as err:
+ log.exception('failed-port-desc-reply', err=err)
def handle_queue_stats_request(self, req):
raise NotImplementedError()
diff --git a/python/ofagent/probe.py b/python/ofagent/probe.py
index 612d661..1d2358e 100644
--- a/python/ofagent/probe.py
+++ b/python/ofagent/probe.py
@@ -15,11 +15,12 @@
#
from SimpleHTTPServer import SimpleHTTPRequestHandler
from structlog import get_logger
-from connection_mgr import ConnectionManager
log = get_logger()
class Probe(SimpleHTTPRequestHandler):
+ connection_manager = None
+
def do_GET(self):
if self.path == '/healthz':
@@ -30,7 +31,7 @@
def health_probe(self):
- if ConnectionManager.liveness_probe():
+ if Probe.connection_manager is None or Probe.connection_manager.liveness_probe():
self.send_response(200)
self.end_headers()
else :
@@ -39,7 +40,7 @@
def ready_probe(self):
- if ConnectionManager.readiness_probe():
+ if Probe.connection_manager is not None and Probe.connection_manager.readiness_probe():
self.send_response(200)
self.end_headers()
else :