Zsolt Haraszti | ac9310d | 2016-09-20 12:56:35 -0700 | [diff] [blame] | 1 | # |
Zsolt Haraszti | 3eb27a5 | 2017-01-03 21:56:48 -0800 | [diff] [blame] | 2 | # Copyright 2017 the original author or authors. |
Zsolt Haraszti | ac9310d | 2016-09-20 12:56:35 -0700 | [diff] [blame] | 3 | # |
| 4 | # Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | # you may not use this file except in compliance with the License. |
| 6 | # You may obtain a copy of the License at |
| 7 | # |
| 8 | # http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | # |
| 10 | # Unless required by applicable law or agreed to in writing, software |
| 11 | # distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | # See the License for the specific language governing permissions and |
| 14 | # limitations under the License. |
| 15 | # |
| 16 | import re |
| 17 | |
| 18 | from structlog import get_logger |
| 19 | from twisted.internet import reactor |
| 20 | from twisted.internet.base import DelayedCall |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 21 | from twisted.internet.defer import inlineCallbacks, returnValue, Deferred |
khenaidoo | 08d48d2 | 2017-06-29 19:42:49 -0400 | [diff] [blame] | 22 | from simplejson import dumps, loads |
Zsolt Haraszti | ac9310d | 2016-09-20 12:56:35 -0700 | [diff] [blame] | 23 | |
Zsolt Haraszti | 023ea7c | 2016-10-16 19:30:34 -0700 | [diff] [blame] | 24 | from common.utils.asleep import asleep |
Zsolt Haraszti | ac9310d | 2016-09-20 12:56:35 -0700 | [diff] [blame] | 25 | |
Zsolt Haraszti | 2bdb6b3 | 2016-11-03 16:56:17 -0700 | [diff] [blame] | 26 | log = get_logger() |
| 27 | |
Zsolt Haraszti | ac9310d | 2016-09-20 12:56:35 -0700 | [diff] [blame] | 28 | |
class Worker(object):
    """
    Worker side of the coordinator. An instance of this class runs in every
    voltha instance. It monitors what work is assigned to this instance by
    the leader. This is all done via consul.
    """

    ASSIGNMENT_EXTRACTOR = '^%s(?P<member_id>[^/]+)/(?P<work_id>[^/]+)$'

    # Public methods:

    def __init__(self, instance_id, coordinator):
        """
        :param instance_id: id of this instance; it is appended to the
            coordinator's assignment prefix to form the key that is watched
        :param coordinator: coordinator object; this class reads its
            assignment_prefix, core_storage_suffix,
            core_store_assignment_key and worker_config attributes and
            calls its wait_for_a_leader, kv_get, consul_get_with_timeout
            and publish_peers_map_change methods
        """
        self.instance_id = instance_id
        self.coord = coordinator
        self.halted = False
        self.soak_time = 0.5  # soak till assignment list settles

        self.my_workload = set()  # set of work_id's assigned to me

        self.assignment_soak_timer = None
        self.assignment_core_store_soak_timer = None
        self.my_candidate_workload = set()  # we stash here during soaking

        self.assignment_match = re.compile(
            self.ASSIGNMENT_EXTRACTOR % self.coord.assignment_prefix).match

        # Core store id assigned to this instance (None until assigned)
        self.mycore_store_id = None

        # Fired with the core store id once the assignment has soaked
        self.wait_for_core_store_assignment = Deferred()

        # Last published peers map (None until the first one is seen)
        self.peers_map = None

    @inlineCallbacks
    def start(self):
        """
        Schedule the assignment-tracking and peer-tracking loops.
        :return: (via Deferred) this Worker instance
        """
        log.debug('starting')
        yield self._start_tracking_my_assignments()
        yield self._start_tracking_my_peers()
        log.info('started')
        returnValue(self)

    def stop(self):
        """
        Mark the worker as halted, so the tracking loops stop rescheduling
        themselves, and cancel any soak timer that is still pending.
        :return: None
        """
        log.debug('stopping')
        self.halted = True
        self._cancel_if_active(self.assignment_soak_timer)
        self._cancel_if_active(self.assignment_core_store_soak_timer)
        log.info('stopped')

    @inlineCallbacks
    def get_core_store_id(self):
        """
        Return the core store id, waiting for the assignment if necessary.
        :return: (via Deferred) the core store id of this instance
        """
        if not self.mycore_store_id:
            # Let's wait until we get assigned a store_id from the leader
            yield self.wait_for_core_store_assignment
        # Read the id from the instance rather than from the Deferred's
        # result: the id is set before the Deferred fires, and when several
        # callers wait on the same Deferred, callbacks added by earlier
        # waiters can replace the result that later waiters receive.
        returnValue(self.mycore_store_id)

    # Private methods:

    @staticmethod
    def _cancel_if_active(timer):
        """
        Cancel a delayed call unless it is unset, already called or already
        cancelled (cancelling in the latter two states raises).
        :param timer: a delayed call returned by reactor.callLater, or None
        :return: None
        """
        if timer is not None and timer.active():
            timer.cancel()

    def _config_delay(self, key, default):
        """
        Look up a delay in the coordinator's worker configuration.
        :param key: configuration key to look up
        :param default: value used when the key is not configured
        :return: the configured value, or `default`
        """
        return self.coord.worker_config.get(key, default)

    def _start_tracking_my_assignments(self):
        """Schedule the first run of the assignment-tracking loop."""
        reactor.callLater(0, self._track_my_assignments, 0)

    def _start_tracking_my_peers(self):
        """Schedule the first run of the peer-tracking loop."""
        reactor.callLater(0, self._track_my_peers, 0)

    @inlineCallbacks
    def _track_my_assignments(self, index):
        """
        Query the keys stored under assignment_prefix + instance_id and pick
        up the core store id assigned to this instance. Reschedules itself
        until a core store id is known or the worker is halted.
        :param index: index handed to coord.kv_get (presumably the consul
            blocking-query index -- confirm against the coordinator)
        """
        try:
            # if there is no leader yet, wait for a stable leader
            d = self.coord.wait_for_a_leader()
            if not d.called:
                yield d
                # additional time to let leader update
                # assignments, to minimize potential churn
                yield asleep(
                    self._config_delay('time_to_let_leader_update', 5))

            (index, results) = yield self.coord.kv_get(
                self.coord.assignment_prefix + self.instance_id,
                index=index, recurse=True)

            # 1. Check whether we have been assigned a full voltha instance
            if results and not self.mycore_store_id:
                # We have no store id set yet
                store_key = (self.coord.assignment_prefix +
                             self.instance_id + '/' +
                             self.coord.core_storage_suffix)
                core_stores = [c['Value'] for c in results
                               if c['Key'] == store_key and c['Value']]
                if core_stores:
                    self.mycore_store_id = core_stores[0]
                    log.debug('store-assigned',
                              mycore_store_id=self.mycore_store_id)
                    self._stash_and_restart_core_store_soak_timer()

            # 2. Check whether we have been assigned a work item
            # TODO: Depending on how workload gets load balanced we may
            # need to compare the current workload with the newer one and
            # add workload distribution here

        except Exception as e:
            log.exception('assignments-track-error', e=e)
            # to prevent flood
            yield asleep(self._config_delay(
                'assignments_track_error_to_avoid_flood', 1))

        finally:
            if not self.halted and not self.mycore_store_id:
                reactor.callLater(0, self._track_my_assignments, index)

    @inlineCallbacks
    def _track_my_peers(self, index):
        """
        Once a core store id is known, wait for changes of the core store
        assignment key and publish the resulting peers map (the mapping
        without this instance's own entry) whenever it differs from the
        last one seen. Reschedules itself until the worker is halted.
        :param index: index handed to coord.consul_get_with_timeout
        """
        try:
            prev_index = index
            if self.mycore_store_id:
                # Wait for updates to the store assignment key
                is_timeout, (tmp_index, mappings) = yield \
                    self.coord.consul_get_with_timeout(
                        key=self.coord.core_store_assignment_key,
                        recurse=True,
                        index=index,
                        timeout=10)

                if is_timeout:
                    # The finally clause reschedules with the old index
                    return

                # After timeout event the index returned from
                # consul_get_with_timeout is None. If we are here it's not a
                # timeout, therefore the index is a valid one.
                index = tmp_index

                if mappings and index != prev_index:
                    new_map = loads(mappings[0]['Value'])
                    # Remove my id from my peers list; tolerate a map that
                    # does not (yet) contain this instance
                    new_map.pop(self.mycore_store_id, None)
                    if self.peers_map is None or self.peers_map != new_map:
                        self.coord.publish_peers_map_change(new_map)
                        self.peers_map = new_map
                        log.info('peer-mapping-changed', mapping=new_map)
                else:
                    log.debug('no-mapping-change', mappings=mappings,
                              index=index, prev_index=prev_index)

        except Exception as e:
            log.exception('peer-track-error', e=e)
            # to prevent flood
            yield asleep(self._config_delay(
                'assignments_track_error_to_avoid_flood', 1))

        finally:
            if not self.halted:
                # Wait longer if we have not received a core id yet
                reactor.callLater(1 if self.mycore_store_id else 5,
                                  self._track_my_peers, index)

    def _stash_and_restart_soak_timer(self, candidate_workload):
        """
        Remember the candidate workload and (re)start the soak timer that
        promotes it to the actual workload once it expires.
        :param candidate_workload: workload to adopt after the soak time
        :return: None
        """
        log.debug('re-start-assignment-soaking')

        self._cancel_if_active(self.assignment_soak_timer)

        self.my_candidate_workload = candidate_workload
        self.assignment_soak_timer = reactor.callLater(
            self.soak_time, self._update_assignments)

    def _update_assignments(self):
        """
        Called when finally the dust has settled on our assignments.
        :return: None
        """
        log.debug('my-assignments-changed',
                  old_count=len(self.my_workload),
                  new_count=len(self.my_candidate_workload),
                  workload=self.my_workload)
        self.my_workload, self.my_candidate_workload = \
            self.my_candidate_workload, None

    def _stash_and_restart_core_store_soak_timer(self):
        """
        (Re)start the soak timer that announces the core store assignment
        once it expires.
        :return: None
        """
        log.debug('re-start-assignment-config-soaking')

        self._cancel_if_active(self.assignment_core_store_soak_timer)

        self.assignment_core_store_soak_timer = reactor.callLater(
            self.soak_time, self._process_config_assignment)

    def _process_config_assignment(self):
        """
        Fire the core-store-assignment Deferred with the assigned id.
        :return: None
        """
        log.debug('process-config-assignment',
                  mycore_store_id=self.mycore_store_id)
        # A Deferred may only be fired once; firing it again would raise
        if not self.wait_for_core_store_assignment.called:
            self.wait_for_core_store_assignment.callback(
                self.mycore_store_id)