Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 1 | # |
Zsolt Haraszti | 3eb27a5 | 2017-01-03 21:56:48 -0800 | [diff] [blame] | 2 | # Copyright 2017 the original author or authors. |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 3 | # |
| 4 | # Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | # you may not use this file except in compliance with the License. |
| 6 | # You may obtain a copy of the License at |
| 7 | # |
| 8 | # http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | # |
| 10 | # Unless required by applicable law or agreed to in writing, software |
| 11 | # distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | # See the License for the specific language governing permissions and |
| 14 | # limitations under the License. |
| 15 | # |
| 16 | |
Zsolt Haraszti | ac9310d | 2016-09-20 12:56:35 -0700 | [diff] [blame] | 17 | import re |
Zsolt Haraszti | 023ea7c | 2016-10-16 19:30:34 -0700 | [diff] [blame] | 18 | |
Zsolt Haraszti | ac9310d | 2016-09-20 12:56:35 -0700 | [diff] [blame] | 19 | from hash_ring import HashRing |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 20 | from structlog import get_logger |
| 21 | from twisted.internet import reactor |
| 22 | from twisted.internet.base import DelayedCall |
khenaidoo | 5431e4c | 2017-08-17 15:05:40 -0400 | [diff] [blame] | 23 | from twisted.internet.defer import inlineCallbacks, DeferredList, returnValue |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 24 | from simplejson import dumps, loads |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 25 | |
Zsolt Haraszti | 023ea7c | 2016-10-16 19:30:34 -0700 | [diff] [blame] | 26 | from common.utils.asleep import asleep |
khenaidoo | 08d48d2 | 2017-06-29 19:42:49 -0400 | [diff] [blame] | 27 | from common.utils.id_generation import get_next_core_id |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 28 | |
# Module-level structlog logger shared by the code in this module.
log = get_logger()
| 30 | |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 31 | |
class ConfigMappingException(Exception):
    """Signals that the core-store assignment record could not be created."""
| 34 | |
| 35 | |
class Leader(object):
    """
    A single instance of this object shall exist across the whole cluster.
    This is guaranteed by the coordinator, which instantiates this class
    only when it has secured the leadership lock, and which calls the halt()
    method when it loses the leadership lock.

    NOTE(review): the class visible here defines stop(), not halt() --
    presumably stop() is the method meant above; confirm against the
    coordinator.
    """

    # The patterns containing '%s' are completed with a key prefix taken from
    # the coordinator in __init__ before they are compiled.

    # Matches '<prefix><id>' where <id> contains no '/'; group 2 is the id.
    ID_EXTRACTOR = '^(%s)([^/]+)$'
    # Matches '<prefix>/<core_store_id>/root'.
    CORE_STORE_KEY_EXTRACTOR = '^%s/(?P<core_store_id>[^/]+)/root$'
    # Captures the run of digits that follows the last '_' of a string.
    START_TIMESTAMP_EXTRACTOR = '^.*_([0-9]+)$'
    # Matches '<prefix><id>/core_store'; group 2 is the id.
    ASSIGNMENT_ID_EXTRACTOR = '^(%s)([^/]+)/core_store$'
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 48 | |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 49 | # Public methods: |
| 50 | |
| 51 | def __init__(self, coordinator): |
Zsolt Haraszti | ac9310d | 2016-09-20 12:56:35 -0700 | [diff] [blame] | 52 | |
| 53 | self.coord = coordinator |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 54 | self.halted = False |
Zsolt Haraszti | ac9310d | 2016-09-20 12:56:35 -0700 | [diff] [blame] | 55 | self.soak_time = 3 # soak till membership/workload changes settle |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 56 | |
| 57 | self.workload = [] |
| 58 | self.members = [] |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 59 | self.core_store_ids = [] |
| 60 | self.core_store_assignment = None |
| 61 | |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 62 | self.reassignment_soak_timer = None |
khenaidoo | b1602a3 | 2017-07-27 16:59:52 -0400 | [diff] [blame] | 63 | self.core_store_reassignment_soak_timer = None |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 64 | |
Zsolt Haraszti | ac9310d | 2016-09-20 12:56:35 -0700 | [diff] [blame] | 65 | self.workload_id_match = re.compile( |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 66 | self.ID_EXTRACTOR % self.coord.workload_prefix).match |
Zsolt Haraszti | ac9310d | 2016-09-20 12:56:35 -0700 | [diff] [blame] | 67 | |
| 68 | self.member_id_match = re.compile( |
Rouzbahan Rashidi-Tabrizi | 1c3eba8 | 2016-10-27 21:47:18 -0400 | [diff] [blame] | 69 | self.ID_EXTRACTOR % self.coord.membership_prefix).match |
Zsolt Haraszti | ac9310d | 2016-09-20 12:56:35 -0700 | [diff] [blame] | 70 | |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 71 | self.core_data_id_match = re.compile( |
| 72 | self.CORE_STORE_KEY_EXTRACTOR % self.coord.core_store_prefix).match |
| 73 | |
Richard Jankowski | 4ea2663 | 2018-05-14 17:45:38 -0400 | [diff] [blame] | 74 | self.core_match = re.compile(self.coord.container_name_regex).match |
| 75 | self.timestamp_match = re.compile(self.START_TIMESTAMP_EXTRACTOR).match |
khenaidoo | 82ce00d | 2017-08-15 12:01:46 -0400 | [diff] [blame] | 76 | |
khenaidoo | 5431e4c | 2017-08-17 15:05:40 -0400 | [diff] [blame] | 77 | self.assignment_id_match = re.compile( |
| 78 | self.ASSIGNMENT_ID_EXTRACTOR % self.coord.assignment_prefix).match |
khenaidoo | 82ce00d | 2017-08-15 12:01:46 -0400 | [diff] [blame] | 79 | |
khenaidoo | d6e0e80 | 2017-08-29 19:55:44 -0400 | [diff] [blame] | 80 | self.members_tracking_sleep_to_prevent_flood = \ |
| 81 | self.coord.leader_config.get((self.coord.leader_config[ |
| 82 | 'members_track_error_to_prevent_flood']), 1) |
| 83 | |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 84 | @inlineCallbacks |
| 85 | def start(self): |
Zsolt Haraszti | 2bdb6b3 | 2016-11-03 16:56:17 -0700 | [diff] [blame] | 86 | log.debug('starting') |
khenaidoo | a8588f2 | 2017-06-16 12:13:34 -0400 | [diff] [blame] | 87 | # yield self._validate_workload() |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 88 | yield self._start_tracking_assignments() |
Zsolt Haraszti | 2bdb6b3 | 2016-11-03 16:56:17 -0700 | [diff] [blame] | 89 | log.info('started') |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 90 | |
Zsolt Haraszti | 2bdb6b3 | 2016-11-03 16:56:17 -0700 | [diff] [blame] | 91 | def stop(self): |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 92 | """Suspend leadership duties immediately""" |
Zsolt Haraszti | 2bdb6b3 | 2016-11-03 16:56:17 -0700 | [diff] [blame] | 93 | log.debug('stopping') |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 94 | self.halted = True |
| 95 | |
| 96 | # any active cancellations, releases, etc., should happen here |
| 97 | if isinstance(self.reassignment_soak_timer, DelayedCall): |
Zsolt Haraszti | ac9310d | 2016-09-20 12:56:35 -0700 | [diff] [blame] | 98 | if not self.reassignment_soak_timer.called: |
| 99 | self.reassignment_soak_timer.cancel() |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 100 | |
khenaidoo | b1602a3 | 2017-07-27 16:59:52 -0400 | [diff] [blame] | 101 | if isinstance(self.core_store_reassignment_soak_timer, DelayedCall): |
| 102 | if not self.core_store_reassignment_soak_timer.called: |
| 103 | self.core_store_reassignment_soak_timer.cancel() |
| 104 | |
Zsolt Haraszti | 2bdb6b3 | 2016-11-03 16:56:17 -0700 | [diff] [blame] | 105 | log.info('stopped') |
| 106 | |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 107 | # Private methods: |
| 108 | |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 109 | |
| 110 | def _start_tracking_assignments(self): |
| 111 | """ |
| 112 | We must track both the cluster member list as well as the workload |
| 113 | list. Upon change in either, we must rerun our sharding algorithm |
| 114 | and reassign work as/if needed. |
| 115 | """ |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 116 | reactor.callLater(0, self._track_members, 0) |
| 117 | |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 118 | @inlineCallbacks |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 119 | def _get_core_store_mappings(self): |
khenaidoo | a8588f2 | 2017-06-16 12:13:34 -0400 | [diff] [blame] | 120 | try: |
| 121 | # Get the mapping record |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 122 | (_, mappings) = yield self.coord.kv_get( |
khenaidoo | 08d48d2 | 2017-06-29 19:42:49 -0400 | [diff] [blame] | 123 | self.coord.core_store_assignment_key, recurse=True) |
khenaidoo | a8588f2 | 2017-06-16 12:13:34 -0400 | [diff] [blame] | 124 | if mappings: |
| 125 | self.core_store_assignment = loads(mappings[0]['Value']) |
| 126 | return |
| 127 | else: # Key has not been created yet |
| 128 | # Create the key with an empty dictionary value |
| 129 | value = dict() |
khenaidoo | 08d48d2 | 2017-06-29 19:42:49 -0400 | [diff] [blame] | 130 | result = yield self.coord.kv_put( |
| 131 | self.coord.core_store_assignment_key, |
| 132 | dumps(value)) |
khenaidoo | a8588f2 | 2017-06-16 12:13:34 -0400 | [diff] [blame] | 133 | if not result: |
| 134 | raise ConfigMappingException(self.instance_id) |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 135 | |
khenaidoo | a8588f2 | 2017-06-16 12:13:34 -0400 | [diff] [blame] | 136 | # Ensure the record was created |
| 137 | (_, mappings) = yield self.coord.kv_get( |
khenaidoo | 08d48d2 | 2017-06-29 19:42:49 -0400 | [diff] [blame] | 138 | self.coord.core_store_assignment_key, recurse=True) |
khenaidoo | a8588f2 | 2017-06-16 12:13:34 -0400 | [diff] [blame] | 139 | |
| 140 | self.core_store_assignment = loads(mappings[0]['Value']) |
| 141 | |
| 142 | except Exception, e: |
| 143 | log.exception('error', e=e) |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 144 | |
| 145 | @inlineCallbacks |
| 146 | def _update_core_store_references(self): |
| 147 | try: |
| 148 | # Get the current set of configs keys |
| 149 | (_, results) = yield self.coord.kv_get( |
| 150 | self.coord.core_store_prefix, recurse=False, keys=True) |
| 151 | |
| 152 | matches = (self.core_data_id_match(e) for e in results or []) |
| 153 | core_ids = [m.group(1) for m in matches if m is not None] |
| 154 | |
| 155 | self.core_store_ids = core_ids |
| 156 | |
| 157 | # Update the config mapping |
| 158 | self._get_core_store_mappings() |
| 159 | |
| 160 | log.debug('core-data', core_ids=core_ids, |
| 161 | assignment=self.core_store_assignment) |
| 162 | |
| 163 | except Exception, e: |
khenaidoo | a8588f2 | 2017-06-16 12:13:34 -0400 | [diff] [blame] | 164 | log.exception('error-update-store', e=e) |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 165 | |
khenaidoo | 82ce00d | 2017-08-15 12:01:46 -0400 | [diff] [blame] | 166 | def _sanitize_member_list(self, members): |
| 167 | # This method removes any duplicates from the member list using the |
| 168 | # voltha number from the member id and the time that voltha instance |
| 169 | # started, again from the member id. This method is meaningful only |
Richard Jankowski | 4ea2663 | 2018-05-14 17:45:38 -0400 | [diff] [blame] | 170 | # in a clustered environment (e.g. Docker swarm or Kubernetes). In |
| 171 | # a non-cluster environment the member id is formatted differently. |
| 172 | # In such a case, the method below will create an exception and |
| 173 | # return the member list as is. |
khenaidoo | 82ce00d | 2017-08-15 12:01:46 -0400 | [diff] [blame] | 174 | |
| 175 | try: |
| 176 | unique_members = {} |
| 177 | update_occurred = False |
| 178 | log.info('members', members=members) |
| 179 | for member in members: |
| 180 | log.info('member', member=member) |
| 181 | # Extract the swarm assigned number of the voltha instance |
| 182 | voltha_number = self.core_match(member['id']).group(1) |
| 183 | timestamp = self.timestamp_match(member['id']).group(1) |
| 184 | if voltha_number not in unique_members: |
| 185 | unique_members[voltha_number] = {'id': member['id'], |
| 186 | 'timestamp': timestamp, |
| 187 | 'host': member['host']} |
| 188 | else: |
| 189 | # Verify whether if this member has the latest timestamp. If |
| 190 | # yes, overwrite the previous one |
| 191 | if unique_members[voltha_number]['timestamp'] < timestamp: |
| 192 | unique_members[voltha_number] = {'id': member['id'], |
| 193 | 'timestamp': timestamp, |
| 194 | 'host': member['host']} |
khenaidoo | 5431e4c | 2017-08-17 15:05:40 -0400 | [diff] [blame] | 195 | update_occurred = True |
khenaidoo | 82ce00d | 2017-08-15 12:01:46 -0400 | [diff] [blame] | 196 | |
| 197 | if update_occurred: |
| 198 | updated_members = [] |
| 199 | for _, unique_member in unique_members.iteritems(): |
| 200 | updated_members.append({'host': unique_member['host'], |
| 201 | 'id': unique_member['id']}) |
| 202 | return updated_members |
| 203 | else: |
| 204 | return members |
Richard Jankowski | 4ea2663 | 2018-05-14 17:45:38 -0400 | [diff] [blame] | 205 | except Exception as e: |
| 206 | log.exception('extraction-error', e=e) |
khenaidoo | 82ce00d | 2017-08-15 12:01:46 -0400 | [diff] [blame] | 207 | return members |
| 208 | |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 209 | @inlineCallbacks |
khenaidoo | 5431e4c | 2017-08-17 15:05:40 -0400 | [diff] [blame] | 210 | def _is_temporal_state(self, members): |
| 211 | try: |
| 212 | # First get the current core assignments |
| 213 | (_, results) = yield self.coord.kv_get( |
| 214 | self.coord.assignment_prefix, |
| 215 | recurse=True) |
| 216 | |
| 217 | log.debug('core-assignments', assignment=results) |
| 218 | if results: |
| 219 | old_assignment = [ |
| 220 | {'id': self.assignment_id_match(e['Key']).group(2), |
| 221 | 'core': e['Value']} |
| 222 | for e in results] |
| 223 | |
| 224 | # If there are no curr_assignments then we are starting the |
| 225 | # system. In this case we should keep processing |
| 226 | if len(old_assignment) == 0: |
| 227 | returnValue(False) |
| 228 | |
| 229 | # Tackle the simplest scenario - #members >= #old_assignment |
Gertjan Van Droogenbroeck | 26eabe1 | 2017-10-12 17:35:37 -0500 | [diff] [blame] | 230 | if members is not None and len(members) >= len(old_assignment): |
khenaidoo | 5431e4c | 2017-08-17 15:05:40 -0400 | [diff] [blame] | 231 | returnValue(False) |
| 232 | |
| 233 | # Everything else is a temporal state |
| 234 | log.info('temporal-state-detected', members=members, |
| 235 | old_assignments=old_assignment) |
| 236 | |
| 237 | returnValue(True) |
| 238 | else: |
| 239 | returnValue(False) |
| 240 | except Exception as e: |
| 241 | log.exception('temporal-state-error', e=e) |
| 242 | returnValue(True) |
| 243 | |
| 244 | @inlineCallbacks |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 245 | def _track_members(self, index): |
khenaidoo | b1602a3 | 2017-07-27 16:59:52 -0400 | [diff] [blame] | 246 | previous_index = index |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 247 | try: |
Zack Williams | 18357ed | 2018-11-14 10:41:08 -0700 | [diff] [blame^] | 248 | log.debug('member-tracking-before') |
khenaidoo | d6e0e80 | 2017-08-29 19:55:44 -0400 | [diff] [blame] | 249 | is_timeout, (tmp_index, results) = yield \ |
Zack Williams | 18357ed | 2018-11-14 10:41:08 -0700 | [diff] [blame^] | 250 | self.coord.coordinator_get_with_timeout( |
khenaidoo | d6e0e80 | 2017-08-29 19:55:44 -0400 | [diff] [blame] | 251 | key=self.coord.membership_prefix, |
| 252 | recurse=True, |
| 253 | index=index, |
| 254 | timeout=10) |
khenaidoo | 5431e4c | 2017-08-17 15:05:40 -0400 | [diff] [blame] | 255 | # Check whether we are still the leader - a new regime may be in |
| 256 | # place by the time we see a membership update |
| 257 | if self.halted: |
| 258 | log.info('I am no longer the leader') |
| 259 | return |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 260 | |
khenaidoo | d6e0e80 | 2017-08-29 19:55:44 -0400 | [diff] [blame] | 261 | if is_timeout: |
| 262 | log.debug('timeout-or-no-membership-changed') |
| 263 | return |
| 264 | |
khenaidoo | b1602a3 | 2017-07-27 16:59:52 -0400 | [diff] [blame] | 265 | # This can happen if consul went down and came back with no data |
| 266 | if not results: |
| 267 | log.error('no-active-members') |
| 268 | # Bail out of leadership and go for an early election |
| 269 | self.coord._just_lost_leadership() |
| 270 | return |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 271 | |
khenaidoo | d6e0e80 | 2017-08-29 19:55:44 -0400 | [diff] [blame] | 272 | # After timeout event the index returned from |
Zack Williams | 18357ed | 2018-11-14 10:41:08 -0700 | [diff] [blame^] | 273 | # coordinator_get_with_timeout is None. If we are here it's not a |
khenaidoo | d6e0e80 | 2017-08-29 19:55:44 -0400 | [diff] [blame] | 274 | # timeout, therefore the index is a valid one. |
| 275 | index=tmp_index |
| 276 | |
| 277 | log.info('membership-tracking-data', index=index, results=results) |
| 278 | |
khenaidoo | b1602a3 | 2017-07-27 16:59:52 -0400 | [diff] [blame] | 279 | if previous_index != index: |
| 280 | log.info('membership-updated', |
| 281 | previous_index=previous_index, index=index) |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 282 | |
khenaidoo | b1602a3 | 2017-07-27 16:59:52 -0400 | [diff] [blame] | 283 | # Rebuild the membership, if any |
| 284 | |
| 285 | # Only members with valid session are considered active |
| 286 | members = [{'id': self.member_id_match(e['Key']).group(2), |
| 287 | 'host': loads(e['Value'])['host_address']} |
| 288 | for e in results if 'Session' in e] |
| 289 | |
khenaidoo | 82ce00d | 2017-08-15 12:01:46 -0400 | [diff] [blame] | 290 | if members: |
| 291 | updated_members = self._sanitize_member_list(members) |
| 292 | else: |
| 293 | updated_members = None |
| 294 | |
| 295 | log.info('active-members', active_members=members, |
| 296 | sanitized_members=updated_members) |
khenaidoo | b1602a3 | 2017-07-27 16:59:52 -0400 | [diff] [blame] | 297 | |
khenaidoo | 5431e4c | 2017-08-17 15:05:40 -0400 | [diff] [blame] | 298 | # Check if we are in a temporal state. If true wait for the |
| 299 | # next membership changes |
| 300 | temporal_state = yield self._is_temporal_state(updated_members) |
| 301 | if temporal_state: |
| 302 | log.info('temporal-state-detected') |
| 303 | pass # Wait for next member list change |
| 304 | elif updated_members != self.members: |
| 305 | # if the two sets are the same |
khenaidoo | b1602a3 | 2017-07-27 16:59:52 -0400 | [diff] [blame] | 306 | # update the current set of config |
| 307 | yield self._update_core_store_references() |
| 308 | log.info('membership-changed', |
| 309 | prev_members=self.members, |
khenaidoo | 82ce00d | 2017-08-15 12:01:46 -0400 | [diff] [blame] | 310 | curr_members=updated_members, |
khenaidoo | b1602a3 | 2017-07-27 16:59:52 -0400 | [diff] [blame] | 311 | core_store_mapping=self.core_store_assignment) |
khenaidoo | 82ce00d | 2017-08-15 12:01:46 -0400 | [diff] [blame] | 312 | self.members = updated_members |
khenaidoo | b1602a3 | 2017-07-27 16:59:52 -0400 | [diff] [blame] | 313 | self._restart_core_store_reassignment_soak_timer() |
| 314 | else: |
| 315 | log.debug('no-membership-change', index=index) |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 316 | |
| 317 | except Exception, e: |
Zsolt Haraszti | 2bdb6b3 | 2016-11-03 16:56:17 -0700 | [diff] [blame] | 318 | log.exception('members-track-error', e=e) |
Rouzbahan Rashidi-Tabrizi | 1c3eba8 | 2016-10-27 21:47:18 -0400 | [diff] [blame] | 319 | # to prevent flood |
khenaidoo | d6e0e80 | 2017-08-29 19:55:44 -0400 | [diff] [blame] | 320 | yield asleep(self.members_tracking_sleep_to_prevent_flood) |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 321 | finally: |
| 322 | if not self.halted: |
khenaidoo | 5431e4c | 2017-08-17 15:05:40 -0400 | [diff] [blame] | 323 | reactor.callLater(1, self._track_members, index) |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 324 | |
| 325 | def _restart_reassignment_soak_timer(self): |
| 326 | |
| 327 | if self.reassignment_soak_timer is not None: |
| 328 | assert isinstance(self.reassignment_soak_timer, DelayedCall) |
Zsolt Haraszti | 8dc1f5e | 2016-09-18 23:35:39 -0700 | [diff] [blame] | 329 | if not self.reassignment_soak_timer.called: |
| 330 | self.reassignment_soak_timer.cancel() |
Zsolt Haraszti | a341031 | 2016-09-18 23:29:04 -0700 | [diff] [blame] | 331 | |
| 332 | self.reassignment_soak_timer = reactor.callLater( |
| 333 | self.soak_time, self._reassign_work) |
| 334 | |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 335 | def _restart_core_store_reassignment_soak_timer(self): |
| 336 | |
khenaidoo | b1602a3 | 2017-07-27 16:59:52 -0400 | [diff] [blame] | 337 | if self.core_store_reassignment_soak_timer is not None: |
| 338 | assert isinstance(self.core_store_reassignment_soak_timer, DelayedCall) |
| 339 | if not self.core_store_reassignment_soak_timer.called: |
| 340 | self.core_store_reassignment_soak_timer.cancel() |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 341 | |
khenaidoo | b1602a3 | 2017-07-27 16:59:52 -0400 | [diff] [blame] | 342 | self.core_store_reassignment_soak_timer = reactor.callLater( |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 343 | self.soak_time, self._reassign_core_stores) |
| 344 | |
| 345 | @inlineCallbacks |
| 346 | def _reassign_core_stores(self): |
| 347 | |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 348 | def _get_core_data_id_from_instance(instance_name): |
| 349 | for id, instance in self.core_store_assignment.iteritems(): |
khenaidoo | a8588f2 | 2017-06-16 12:13:34 -0400 | [diff] [blame] | 350 | if instance and instance['id'] == instance_name: |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 351 | return id |
| 352 | |
| 353 | try: |
khenaidoo | a8588f2 | 2017-06-16 12:13:34 -0400 | [diff] [blame] | 354 | log.info('core-members', curr_members=self.members, |
| 355 | prev_members=self.core_store_assignment) |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 356 | |
| 357 | # 1. clear the mapping for instances that are no longer running |
| 358 | updated_mapping = dict() |
| 359 | existing_active_config_members = set() |
| 360 | cleared_config_ids = set() |
| 361 | inactive_members = set() |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 362 | if self.core_store_assignment: |
| 363 | for id, instance in self.core_store_assignment.iteritems(): |
| 364 | if instance not in self.members: |
| 365 | updated_mapping[id] = None |
| 366 | cleared_config_ids.add(id) |
khenaidoo | a8588f2 | 2017-06-16 12:13:34 -0400 | [diff] [blame] | 367 | if instance: |
| 368 | inactive_members.add(instance['id']) |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 369 | else: |
| 370 | updated_mapping[id] = instance |
khenaidoo | a8588f2 | 2017-06-16 12:13:34 -0400 | [diff] [blame] | 371 | existing_active_config_members.add(instance['id']) |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 372 | |
| 373 | # 2. Update the mapping with the new set |
| 374 | current_id = max(self.core_store_assignment) \ |
khenaidoo | 08d48d2 | 2017-06-29 19:42:49 -0400 | [diff] [blame] | 375 | if self.core_store_assignment else '0000' |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 376 | for instance in self.members: |
khenaidoo | a8588f2 | 2017-06-16 12:13:34 -0400 | [diff] [blame] | 377 | if instance['id'] not in existing_active_config_members: |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 378 | # Add the member to the config map |
| 379 | if cleared_config_ids: |
| 380 | # There is an empty slot |
| 381 | next_id = cleared_config_ids.pop() |
| 382 | updated_mapping[next_id] = instance |
| 383 | else: |
| 384 | # There are no empty slot, create new ids |
khenaidoo | 08d48d2 | 2017-06-29 19:42:49 -0400 | [diff] [blame] | 385 | current_id = get_next_core_id(current_id) |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 386 | updated_mapping[current_id] = instance |
| 387 | |
| 388 | self.core_store_assignment = updated_mapping |
khenaidoo | a8588f2 | 2017-06-16 12:13:34 -0400 | [diff] [blame] | 389 | log.info('updated-assignment', |
khenaidoo | 08d48d2 | 2017-06-29 19:42:49 -0400 | [diff] [blame] | 390 | core_store_assignment=self.core_store_assignment, |
| 391 | inactive_members=inactive_members) |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 392 | |
| 393 | # 3. save the mapping into consul |
khenaidoo | 08d48d2 | 2017-06-29 19:42:49 -0400 | [diff] [blame] | 394 | yield self.coord.kv_put(self.coord.core_store_assignment_key, |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 395 | dumps(self.core_store_assignment)) |
| 396 | |
| 397 | # 4. Assign the new workload to the newly created members |
khenaidoo | a8588f2 | 2017-06-16 12:13:34 -0400 | [diff] [blame] | 398 | curr_members_set = set([m['id'] for m in self.members]) |
khenaidoo | 08d48d2 | 2017-06-29 19:42:49 -0400 | [diff] [blame] | 399 | new_members = curr_members_set.difference( |
| 400 | existing_active_config_members) |
khenaidoo | a8588f2 | 2017-06-16 12:13:34 -0400 | [diff] [blame] | 401 | for new_member_id in new_members: |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 402 | yield self.coord.kv_put( |
| 403 | self.coord.assignment_prefix |
khenaidoo | a8588f2 | 2017-06-16 12:13:34 -0400 | [diff] [blame] | 404 | + new_member_id + '/' + |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 405 | self.coord.core_storage_suffix, |
khenaidoo | a8588f2 | 2017-06-16 12:13:34 -0400 | [diff] [blame] | 406 | _get_core_data_id_from_instance(new_member_id)) |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 407 | |
| 408 | # 5. Remove non-existent members |
khenaidoo | a8588f2 | 2017-06-16 12:13:34 -0400 | [diff] [blame] | 409 | for member_id in inactive_members: |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 410 | yield self.coord.kv_delete( |
khenaidoo | a8588f2 | 2017-06-16 12:13:34 -0400 | [diff] [blame] | 411 | self.coord.assignment_prefix + member_id, recurse=True) |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 412 | yield self.coord.kv_delete( |
khenaidoo | a8588f2 | 2017-06-16 12:13:34 -0400 | [diff] [blame] | 413 | self.coord.membership_prefix + member_id, |
khenaidoo | 032d330 | 2017-06-09 14:50:04 -0400 | [diff] [blame] | 414 | recurse=True) |
| 415 | |
| 416 | except Exception as e: |
| 417 | log.exception('config-reassignment-failure', e=e) |
| 418 | self._restart_core_store_reassignment_soak_timer() |