Stephane Barbarie | 6e1bd50 | 2018-11-05 22:44:45 -0500 | [diff] [blame] | 1 | # |
| 2 | # Copyright 2017 the original author or authors. |
| 3 | # |
| 4 | # Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | # you may not use this file except in compliance with the License. |
| 6 | # You may obtain a copy of the License at |
| 7 | # |
| 8 | # http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | # |
| 10 | # Unless required by applicable law or agreed to in writing, software |
| 11 | # distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | # See the License for the specific language governing permissions and |
| 14 | # limitations under the License. |
| 15 | # |
| 16 | import os |
| 17 | |
| 18 | import sys |
| 19 | |
| 20 | from twisted.internet import reactor |
| 21 | from twisted.internet.defer import Deferred, inlineCallbacks, returnValue |
| 22 | |
William Kurkian | fc0dcda | 2019-04-08 16:54:36 -0400 | [diff] [blame] | 23 | from pyvoltha.common.utils.asleep import asleep |
| 24 | from pyvoltha.common.utils.consulhelpers import get_endpoint_from_consul |
Stephane Barbarie | 6e1bd50 | 2018-11-05 22:44:45 -0500 | [diff] [blame] | 25 | from structlog import get_logger |
| 26 | import grpc |
| 27 | from grpc import StatusCode |
| 28 | from grpc._channel import _Rendezvous |
William Kurkian | fc0dcda | 2019-04-08 16:54:36 -0400 | [diff] [blame] | 29 | from voltha_protos.voltha_pb2 import OfAgentSubscriber |
Stephane Barbarie | 6e1bd50 | 2018-11-05 22:44:45 -0500 | [diff] [blame] | 30 | from grpc_client import GrpcClient |
| 31 | |
| 32 | from agent import Agent |
William Kurkian | fc0dcda | 2019-04-08 16:54:36 -0400 | [diff] [blame] | 33 | from pyvoltha.common.utils.dockerhelpers import get_my_containers_name |
Stephane Barbarie | 6e1bd50 | 2018-11-05 22:44:45 -0500 | [diff] [blame] | 34 | |
| 35 | |
# Module-level structlog logger used by every method of ConnectionManager below.
log = get_logger()
# _ = third_party
| 38 | |
class ConnectionManager(object):
    """
    Own the gRPC session towards the voltha core (vcore) and the set of
    OpenFlow agents derived from the logical devices the core reports.

    One ``Agent`` is kept per ``(datapath_id, controller_endpoint)`` pair in
    ``agent_map``; ``device_id_to_datapath_id_map`` translates the logical
    device id back to its datapath id when events have to be forwarded.
    """

    def __init__(self, consul_endpoint,
                 vcore_endpoint, vcore_grpc_timeout, vcore_binding_key,
                 vcore_transaction_key, controller_endpoints, instance_id,
                 enable_tls=False, key_file=None, cert_file=None,
                 vcore_retry_interval=0.5, devices_refresh_interval=5,
                 subscription_refresh_interval=5):
        """
        :param consul_endpoint: consul address handed to
            ``get_endpoint_from_consul`` when an '@service' endpoint has to
            be resolved
        :param vcore_endpoint: 'host:port' of the vcore gRPC server, or
            '@service-name' to look it up through consul
        :param vcore_grpc_timeout: timeout value passed to ``GrpcClient``
        :param vcore_binding_key: binding key passed to ``GrpcClient``
        :param vcore_transaction_key: transaction key passed to ``GrpcClient``
        :param controller_endpoints: iterable of controller endpoints; an
            agent is created for each of them per logical device
        :param instance_id: identifier sent as ``ofagent_id`` when subscribing
        :param enable_tls: passed through to every ``Agent``
        :param key_file: passed through to every ``Agent``
        :param cert_file: passed through to every ``Agent``
        :param vcore_retry_interval: seconds to wait before retrying after a
            vcore failure
        :param devices_refresh_interval: seconds between two polls of the
            logical device list
        :param subscription_refresh_interval: seconds to wait after a
            successful subscription
        """
        log.info('init-connection-manager')
        log.info('list-of-controllers', controller_endpoints=controller_endpoints)
        self.controller_endpoints = controller_endpoints
        self.consul_endpoint = consul_endpoint
        self.vcore_endpoint = vcore_endpoint
        self.grpc_timeout = vcore_grpc_timeout
        self.core_binding_key = vcore_binding_key
        self.core_transaction_key = vcore_transaction_key
        self.instance_id = instance_id
        self.enable_tls = enable_tls
        self.key_file = key_file
        self.cert_file = cert_file

        self.channel = None
        self.grpc_client = None  # single, shared gRPC client to vcore
        # Defined up front so the attribute exists before the first
        # _assign_grpc_attributes() / _reset_grpc_attributes() call.
        self.is_alive = False

        self.agent_map = {}  # (datapath_id, controller_endpoint) -> Agent()
        self.device_id_to_datapath_id_map = {}

        self.vcore_retry_interval = vcore_retry_interval
        self.devices_refresh_interval = devices_refresh_interval
        self.subscription_refresh_interval = subscription_refresh_interval
        self.subscription = None

        self.running = False

    @staticmethod
    def _terminate_process():
        """
        Send SIGTERM to the current process so that the container runtime
        can restart ofagent.
        """
        # Local import: keeps this block self-contained without touching the
        # file-level import section.
        import signal
        os.kill(os.getpid(), signal.SIGTERM)

    def start(self):
        """
        Start the subscription and logical-device monitoring loops.

        Calling it again while already running is a no-op.

        :return: self, so that construction and start can be chained
        """
        if self.running:
            return self

        log.debug('starting')

        self.running = True

        # Get a subscription to vcore
        # NOTE(review): get_vcore_subscription is an inlineCallbacks
        # generator that is kicked off from a worker thread here - confirm
        # this is intentional with respect to Twisted's threading rules.
        reactor.callInThread(self.get_vcore_subscription)

        # Start monitoring logical devices and manage agents accordingly
        reactor.callLater(0, self.monitor_logical_devices)

        log.info('started')

        return self

    def stop(self):
        """
        Stop every agent, end the polling loops and drop the gRPC session.
        """
        log.debug('stopping')
        # clean up all controller connections
        # NOTE(review): stopped agents stay registered in agent_map; confirm
        # whether a later start() is expected to reuse or recreate them.
        for agent in self.agent_map.values():
            agent.stop()
        self.running = False

        self._reset_grpc_attributes()

        log.info('stopped')

    def resolve_endpoint(self, endpoint):
        """
        Resolve an endpoint specification into a host and a port.

        An endpoint that starts with '@' is treated as a service name and
        looked up through consul; anything else is expected to already be in
        'host:port' form.  When the consul lookup fails the process is asked
        to terminate (SIGTERM) and no address is returned.

        :param endpoint: 'host:port' or '@service-name'
        :return: ``(host, port)`` with ``port`` as int, or None when no
            address could be determined
        :raises ValueError: if the resolved address is not 'host:port' with
            an integer port
        """
        ip_port_endpoint = endpoint
        if endpoint.startswith('@'):
            try:
                ip_port_endpoint = get_endpoint_from_consul(
                    self.consul_endpoint, endpoint[1:])
                log.info(
                    '{}-service-endpoint-found'.format(endpoint),
                    address=ip_port_endpoint)
            except Exception as e:
                log.error(
                    '{}-service-endpoint-not-found'.format(endpoint),
                    exception=repr(e))
                log.error('committing-suicide')
                # Committing suicide in order to let docker restart ofagent
                self._terminate_process()
                # Nothing was resolved: do not try to parse the raw
                # '@service' string as if it were an address.
                ip_port_endpoint = None

        if not ip_port_endpoint:
            return None

        # Split on the last colon only, so exactly two parts come back
        # whenever a colon is present.
        host, port = ip_port_endpoint.rsplit(':', 1)
        return host, int(port)

    def _reset_grpc_attributes(self):
        """
        Stop the gRPC client (if any) and forget channel, client and
        subscription.
        """
        log.debug('start-reset-grpc-attributes')

        if self.grpc_client is not None:
            self.grpc_client.stop()

        # Rebind instead of `del`: the attribute must never be missing,
        # because the monitoring loop reads self.channel concurrently.
        self.is_alive = False
        self.channel = None
        self.subscription = None
        self.grpc_client = None

        log.debug('stop-reset-grpc-attributes')

    def _assign_grpc_attributes(self):
        """
        Resolve the vcore endpoint and open an insecure gRPC channel to it.

        :raises ValueError: if the vcore endpoint cannot be resolved
        """
        log.debug('start-assign-grpc-attributes')

        resolved = self.resolve_endpoint(self.vcore_endpoint)
        if resolved is None:
            # Explicit check rather than `assert`, which is stripped when
            # Python runs with -O.
            raise ValueError(
                'unable to resolve vcore endpoint {!r}'.format(
                    self.vcore_endpoint))
        host, port = resolved
        log.info('revolved-vcore-endpoint', endpoint=self.vcore_endpoint,
                 host=host, port=port)

        # Establish a connection to the vcore GRPC server
        self.channel = grpc.insecure_channel('{}:{}'.format(host, port))
        self.is_alive = True

        log.debug('stop-assign-grpc-attributes')

    @inlineCallbacks
    def get_vcore_subscription(self):
        """
        Keep trying to subscribe this ofagent instance with vcore until a
        subscription is obtained or the manager is stopped.

        Every failed attempt resets the gRPC state and is retried after
        ``vcore_retry_interval`` seconds.
        """
        log.debug('start-get-vcore-subscription')

        while self.running and self.subscription is None:
            try:
                # No subscription yet (loop condition), so a fresh GRPC
                # connection is established for this attempt.
                self._assign_grpc_attributes()

                # Send subscription request to register the current ofagent instance
                container_name = self.instance_id
                if self.grpc_client is None:
                    self.grpc_client = GrpcClient(
                        self, self.channel, self.grpc_timeout,
                        self.core_binding_key, self.core_transaction_key)
                subscription = yield self.grpc_client.subscribe(
                    OfAgentSubscriber(ofagent_id=container_name))

                # If the subscriber id matches the current instance
                # ... then the subscription has succeeded
                if subscription is not None and \
                        subscription.ofagent_id == container_name:
                    if self.subscription is None:
                        # Keep details on the current GRPC session and subscription
                        log.debug('subscription-with-vcore-successful',
                                  subscription=subscription)
                        self.subscription = subscription
                        self.grpc_client.start()

                    # Pause before re-evaluating the loop condition; with a
                    # subscription now recorded the loop ends afterwards.
                    yield asleep(self.subscription_refresh_interval)
                    continue

                # The subscription did not succeed, reset and move on
                log.info('subscription-with-vcore-unavailable',
                         subscription=subscription)

            except _Rendezvous as e:
                log.error('subscription-with-vcore-terminated',
                          exception=e, status=e.code())

            except Exception as e:
                log.exception('unexpected-subscription-termination-with-vcore', e=e)

            # Reset grpc details
            # The vcore instance is either not available for subscription
            # or a failure occurred with the existing communication.
            self._reset_grpc_attributes()

            # Sleep for a short period and retry
            yield asleep(self.vcore_retry_interval)

        log.debug('stop-get-vcore-subscription')

    @inlineCallbacks
    def get_list_of_logical_devices_from_voltha(self):
        """
        Fetch the logical device list through the gRPC client, retrying
        every ``vcore_retry_interval`` seconds until it succeeds.

        A gRPC failure with status UNAVAILABLE or DEADLINE_EXCEEDED
        additionally sends SIGTERM to the process.

        :return: (via Deferred) the device list, or None if the manager was
            stopped before a list could be retrieved
        """
        while self.running:
            log.info('retrieve-logical-device-list')
            try:
                devices = yield self.grpc_client.list_logical_devices()

                for device in devices:
                    log.info("logical-device-entry", id=device.id,
                             datapath_id=device.datapath_id)

                returnValue(devices)

            except _Rendezvous as e:
                status = e.code()
                log.error('vcore-communication-failure', exception=e, status=status)
                if status in (StatusCode.UNAVAILABLE,
                              StatusCode.DEADLINE_EXCEEDED):
                    self._terminate_process()

            except Exception as e:
                log.exception('logical-devices-retrieval-failure', exception=e)

            log.info('reconnect', after_delay=self.vcore_retry_interval)
            yield asleep(self.vcore_retry_interval)

    def refresh_agent_connections(self, devices):
        """
        Based on the new device list, update the following state in the class:
        * agent_map
        * device_id_to_datapath_id_map
        :param devices: full device list freshly received from Voltha
        :return: None
        """

        # Use datapath ids for deciding what's new and what's obsolete
        desired_datapath_ids = {d.datapath_id for d in devices}
        current_datapath_ids = {key[0] for key in self.agent_map}

        # if identical, nothing to do
        if desired_datapath_ids == current_datapath_ids:
            return

        # ... otherwise calculate differences
        to_add = desired_datapath_ids - current_datapath_ids
        to_del = current_datapath_ids - desired_datapath_ids

        # remove what we don't need
        for datapath_id in to_del:
            self.delete_agent(datapath_id)

        # start new agents as needed
        for device in devices:
            if device.datapath_id in to_add:
                self.create_agent(device)

        log.debug('updated-agent-list', count=len(self.agent_map))
        log.debug('updated-device-id-to-datapath-id-map',
                  map=str(self.device_id_to_datapath_id_map))

    def create_agent(self, device):
        """
        Create and start one agent per controller endpoint for ``device``
        and register them in the lookup maps.

        :param device: logical device exposing ``id`` and ``datapath_id``
        """
        datapath_id = device.datapath_id
        device_id = device.id
        for controller_endpoint in self.controller_endpoints:
            agent = Agent(controller_endpoint, datapath_id,
                          device_id, self.grpc_client, self.enable_tls,
                          self.key_file, self.cert_file)
            agent.start()
            self.agent_map[(datapath_id, controller_endpoint)] = agent
            self.device_id_to_datapath_id_map[device_id] = datapath_id

    def delete_agent(self, datapath_id):
        """
        Stop and unregister every agent that serves ``datapath_id``.

        :param datapath_id: datapath id whose agents must be removed
        """
        for controller_endpoint in self.controller_endpoints:
            agent = self.agent_map.pop((datapath_id, controller_endpoint), None)
            if agent is None:
                # Nothing registered for this controller; keep cleaning up
                # the remaining ones.
                continue
            device_id = agent.get_device_id()
            agent.stop()
            # All agents of one datapath share the same device id, so the
            # entry is already gone from the second controller onwards.
            self.device_id_to_datapath_id_map.pop(device_id, None)

    @inlineCallbacks
    def monitor_logical_devices(self):
        """
        Poll vcore for the logical device list every
        ``devices_refresh_interval`` seconds and reconcile the agents with
        it, for as long as the manager is running.
        """
        log.debug('start-monitor-logical-devices')

        while self.running:
            log.info('monitoring-logical-devices')

            # should change to a gRPC streaming call
            # see https://jira.opencord.org/browse/CORD-821

            try:
                if self.channel is not None and \
                        self.grpc_client is not None and \
                        self.subscription is not None:
                    # get current list from Voltha
                    devices = yield \
                        self.get_list_of_logical_devices_from_voltha()

                    # update agent list and mapping tables as needed;
                    # devices is None when the manager was stopped while the
                    # list was being retrieved
                    if devices is not None:
                        self.refresh_agent_connections(devices)
                else:
                    log.info('vcore-communication-unavailable')

            except _Rendezvous as e:
                log.error('vcore-communication-failure',
                          exception=repr(e), status=e.code())

            except Exception as e:
                log.exception('unexpected-vcore-communication-failure',
                              exception=repr(e))

            # wait before next poll - also after a failure, so a persistent
            # error cannot turn this loop into a busy loop
            yield asleep(self.devices_refresh_interval)

        log.debug('stop-monitor-logical-devices')

    def _agents_for_device(self, device_id):
        """
        Return the agents currently registered for a logical device.

        :param device_id: logical device id
        :return: list of agents, empty when the device is unknown
        """
        datapath_id = self.device_id_to_datapath_id_map.get(device_id)
        # Compare against None so that a datapath id of 0 is not mistaken
        # for an unknown device.
        if datapath_id is None:
            return []

        agents = []
        for controller_endpoint in self.controller_endpoints:
            agent = self.agent_map.get((datapath_id, controller_endpoint))
            if agent is None:
                log.debug('agent-not-found', datapath_id=datapath_id,
                          controller_endpoint=controller_endpoint)
                continue
            agents.append(agent)
        return agents

    def forward_packet_in(self, device_id, ofp_packet_in):
        """
        Hand a packet-in message to every agent of the given device.

        :param device_id: logical device id the packet originates from
        :param ofp_packet_in: packet-in message passed to each agent
        """
        for agent in self._agents_for_device(device_id):
            agent.forward_packet_in(ofp_packet_in)

    def forward_change_event(self, device_id, event):
        """
        Hand a change event to every agent of the given device.

        :param device_id: logical device id the event originates from
        :param event: change event passed to each agent
        """
        for agent in self._agents_for_device(device_id):
            agent.forward_change_event(event)