VOL-794: Assignment lists in etcd not updated correctly in the presence of vcore failures
* Fixed coordinator method _sanitize_member_list
* Fixed keys-only GET operation for etcd KV store
* Fixed key DELETE operation for etcd KV store
* Modified vcore manifest to allow a failed pod to be re-spawned on a
different host but with the same pod name
* Modified zookeeper manifest to allow a failed pod to be re-spawned
on a different host. With the previous manifest, if all 3 zookeeper
pods were scheduled on the same node and that node failed, none of
them would be re-spawned.
* Added NodePort for the Envoy service
* Removed anti-affinity rule from OFAgent and NetConf manifests to allow
the deployment of multiple pods on the same node
Change-Id: I052d952d81a81cafb96acfc1d57a192596e2e9a1
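
For reference, the core-number extractor added to the vcore manifests reflects
the Kubernetes StatefulSet naming scheme (pods vcore-0, vcore-1, vcore-2),
while the built-in default in coordinator.py/coordinator_etcd.py still targets
Docker-swarm-style container names. A minimal sketch of the extraction, using
illustrative member ids (main.py appends '_<utc-timestamp>' to the instance id):

    import re

    SWARM_RE = r'^.*\.([0-9]+)\..*$'   # built-in default (Docker swarm names)
    K8S_RE   = r'^.*-([0-9]+)_.*$'     # value passed via --core-number-extractor
    TS_RE    = r'^.*_([0-9]+)$'        # new START_TIMESTAMP_EXTRACTOR in leader.py

    swarm_id = 'vcore_vcore.2.abc123_1528157100'   # hypothetical swarm task name
    k8s_id   = 'vcore-2_1528157100'                # StatefulSet pod name + timestamp

    assert re.match(SWARM_RE, swarm_id).group(1) == '2'
    assert re.match(K8S_RE, k8s_id).group(1) == '2'
    assert re.match(TS_RE, k8s_id).group(1) == '1528157100'
    # The old default does not match the StatefulSet-style id, which is why
    # the extractor is now configurable:
    assert re.match(SWARM_RE, k8s_id) is None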
diff --git a/k8s/envoy_for_consul.yml b/k8s/envoy_for_consul.yml
index 6ed7923..e80812e 100644
--- a/k8s/envoy_for_consul.yml
+++ b/k8s/envoy_for_consul.yml
@@ -6,7 +6,7 @@
labels:
name: voltha
spec:
- clusterIP: None
+ type: NodePort
ports:
- name: rest
port: 8882
@@ -16,6 +16,7 @@
targetPort: 8001
- name: mystery2
port: 8443
+ nodePort: 32443
targetPort: 8443
- name: grpc
port: 50555
diff --git a/k8s/envoy_for_etcd.yml b/k8s/envoy_for_etcd.yml
index af5c1f4..d217eee 100644
--- a/k8s/envoy_for_etcd.yml
+++ b/k8s/envoy_for_etcd.yml
@@ -6,7 +6,7 @@
labels:
name: voltha
spec:
- clusterIP: None
+ type: NodePort
ports:
- name: rest
port: 8882
@@ -16,6 +16,7 @@
targetPort: 8001
- name: mystery2
port: 8443
+ nodePort: 32443
targetPort: 8443
- name: grpc
port: 50555
diff --git a/k8s/netconf.yml b/k8s/netconf.yml
index 9bb7e67..b15bc5c 100644
--- a/k8s/netconf.yml
+++ b/k8s/netconf.yml
@@ -26,16 +26,6 @@
cni: "calico"
spec:
terminationGracePeriodSeconds: 10
- affinity:
- podAntiAffinity:
- requiredDuringSchedulingIgnoredDuringExecution:
- - labelSelector:
- matchExpressions:
- - key: app
- operator: In
- values:
- - netconf
- topologyKey: kubernetes.io/hostname
containers:
- name: netconf
image: voltha-netconf
diff --git a/k8s/ofagent.yml b/k8s/ofagent.yml
index c2415ae..5449947 100644
--- a/k8s/ofagent.yml
+++ b/k8s/ofagent.yml
@@ -13,16 +13,6 @@
cni: "calico"
spec:
terminationGracePeriodSeconds: 10
- affinity:
- podAntiAffinity:
- requiredDuringSchedulingIgnoredDuringExecution:
- - labelSelector:
- matchExpressions:
- - key: app
- operator: In
- values:
- - ofagent
- topologyKey: kubernetes.io/hostname
containers:
- name: ofagent
image: voltha-ofagent
diff --git a/k8s/vcore_for_consul.yml b/k8s/vcore_for_consul.yml
index 2679451..b32f1c2 100644
--- a/k8s/vcore_for_consul.yml
+++ b/k8s/vcore_for_consul.yml
@@ -61,3 +61,4 @@
- "--backend=consul"
- "--pon-subnet=10.38.0.0/12"
- "--ponsim-comm=grpc"
+ - "--core-number-extractor=^.*-([0-9]+)_.*$"
diff --git a/k8s/vcore_for_etcd.yml b/k8s/vcore_for_etcd.yml
index 37ceeea..b2d5d5d 100644
--- a/k8s/vcore_for_etcd.yml
+++ b/k8s/vcore_for_etcd.yml
@@ -20,13 +20,19 @@
selector:
app: vcore
---
-apiVersion: apps/v1beta1
-kind: Deployment
+apiVersion: apps/v1
+kind: StatefulSet
metadata:
name: vcore
namespace: voltha
spec:
+ serviceName: vcore
replicas: 3
+ updateStrategy:
+ type: RollingUpdate
+ selector:
+ matchLabels:
+ app: vcore
template:
metadata:
labels:
@@ -34,6 +40,7 @@
annotations:
cni: "calico"
spec:
+ terminationGracePeriodSeconds: 0
containers:
- name: voltha
image: voltha-voltha
@@ -53,6 +60,7 @@
- "--backend=etcd"
- "--pon-subnet=10.38.0.0/12"
- "--ponsim-comm=grpc"
+ - "--core-number-extractor=^.*-([0-9]+)_.*$"
ports:
- containerPort: 8880
name: rest-port
diff --git a/k8s/zookeeper.yml b/k8s/zookeeper.yml
index 1c7f02d..93e4385 100644
--- a/k8s/zookeeper.yml
+++ b/k8s/zookeeper.yml
@@ -78,6 +78,7 @@
annotations:
cni: "calico"
spec:
+ terminationGracePeriodSeconds: 0
containers:
- name: zoo1
image: zookeeper:3.4.11
@@ -118,6 +119,7 @@
annotations:
cni: "calico"
spec:
+ terminationGracePeriodSeconds: 0
containers:
- name: zoo2
image: zookeeper:3.4.11
@@ -158,6 +160,7 @@
annotations:
cni: "calico"
spec:
+ terminationGracePeriodSeconds: 0
containers:
- name: zoo3
image: zookeeper:3.4.11
diff --git a/voltha/coordinator.py b/voltha/coordinator.py
index 9bad7d0..5b6f68f 100644
--- a/voltha/coordinator.py
+++ b/voltha/coordinator.py
@@ -65,7 +65,8 @@
instance_id,
rest_port,
config,
- consul='localhost:8500'):
+ consul='localhost:8500',
+ container_name_regex='^.*\.([0-9]+)\..*$'):
log.info('initializing-coordinator')
self.config = config['coordinator']
@@ -120,6 +121,8 @@
# TODO need to handle reconnect events properly
self.consul = Consul(host=self.host, port=self.port)
+ self.container_name_regex = container_name_regex
+
self.wait_for_leader_deferreds = []
self.peers_mapping_queue = MessageQueue()
diff --git a/voltha/coordinator_etcd.py b/voltha/coordinator_etcd.py
index 238223f..8ed25e0 100644
--- a/voltha/coordinator_etcd.py
+++ b/voltha/coordinator_etcd.py
@@ -66,7 +66,8 @@
rest_port,
config,
consul='localhost:8500',
- etcd='localhost:2379'):
+ etcd='localhost:2379',
+ container_name_regex='^.*\.([0-9]+)\..*$'):
log.info('initializing-coordinator')
self.config = config['coordinator']
@@ -129,6 +130,8 @@
self.etcd_url = u'http://' + kv_host + u':' + kv_port
self.etcd = Client(reactor, self.etcd_url)
+ self.container_name_regex = container_name_regex
+
self.wait_for_leader_deferreds = []
self.peers_mapping_queue = MessageQueue()
@@ -499,16 +502,19 @@
@inlineCallbacks
def _retry(self, operation, *args, **kw):
prefix = False
+ keys_only = False
for name, value in kw.items():
if name == 'acquire':
lease = value
kw['lease'] = lease
kw.pop('acquire')
elif name == 'keys':
+ keys_only = True
+ prefix = True
+ keyset = KeySet(bytes(args[0]), prefix=True)
kw['keys_only'] = True
kw.pop('keys')
elif name=='recurse':
-# if value == 'True':
prefix = True
keyset = KeySet(bytes(args[0]), prefix=True)
kw.pop('recurse')
@@ -536,8 +542,10 @@
result = (index, record)
else:
# Get values for all keys that match the prefix
+ # If keys_only requested, get only the keys
index = 0
records = []
+ keys = []
res = yield etcd.get(keyset, **kw)
if args[0] == 'service/voltha/assignments/':
log.info('assignments', result=res)
@@ -546,19 +554,22 @@
# Which index should be returned? The max over all keys?
if kv.mod_revision > index:
index = kv.mod_revision
- rec = dict()
- rec['Key'] = kv.key
- rec['Value'] = kv.value
- rec['ModifyIndex'] = kv.mod_revision
- rec['Session'] = self.lease.lease_id if self.lease else ''
- records.append(rec)
- result = (index, records)
+ if keys_only:
+ keys.append(kv.key)
+ else:
+ rec = dict()
+ rec['Key'] = kv.key
+ rec['Value'] = kv.value
+ rec['ModifyIndex'] = kv.mod_revision
+ rec['Session'] = self.lease.lease_id if self.lease else ''
+ records.append(rec)
+ result = (index, keys) if keys_only else (index, records)
elif operation == 'PUT':
key = bytes(args[0])
result = yield etcd.set(key, args[1], **kw)
elif operation == 'DELETE':
key = bytes(args[0])
- result = yield etcd.delete(key, **kw)
+ result = yield etcd.delete(keyset)
else:
# Default case - consider operation as a function call
result = yield operation(*args, **kw)
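
As background for the keys-only GET fix above, here is a small standalone
sketch of the Consul-to-etcd argument mapping that _retry now performs before
calling the etcd client. The helper name is illustrative; the real code is
inline in _retry and goes on to issue the read and assemble the
(index, keys-or-records) result:

    from txaioetcd import KeySet   # same client types the coordinator uses

    def translate_get_kwargs(key, **kw):
        # Illustrative helper mirroring the kwarg mapping in _retry():
        # Consul-style keys=True becomes a prefix read that returns key names
        # only; recurse=True becomes an ordinary prefix read.
        keys_only = False
        prefix = False
        keyset = None
        if kw.pop('keys', False):
            keys_only = True
            prefix = True
            keyset = KeySet(bytes(key), prefix=True)
            kw['keys_only'] = True     # forwarded to etcd.get(), as in the diff
        if kw.pop('recurse', False):
            prefix = True
            keyset = KeySet(bytes(key), prefix=True)
        return keys_only, prefix, keyset, kw

    # e.g. translate_get_kwargs(b'service/voltha/assignments/', keys=True)
    # -> keys_only=True, prefix=True, a prefix KeySet, and {'keys_only': True}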
diff --git a/voltha/leader.py b/voltha/leader.py
index 60d1e6c..54f1117 100644
--- a/voltha/leader.py
+++ b/voltha/leader.py
@@ -42,9 +42,8 @@
"""
ID_EXTRACTOR = '^(%s)([^/]+)$'
- CORE_STORE_KEY_EXTRACTOR = '^%s(?P<core_store_id>[^/]+)/root$'
- CORE_NUMBER_EXTRACTOR = '^.*\.([0-9]+)\..*$'
- START_TIMESTAMP_EXTRACTOR = '^.*\..*\..*_([0-9]+)$'
+ CORE_STORE_KEY_EXTRACTOR = '^%s/(?P<core_store_id>[^/]+)/root$'
+ START_TIMESTAMP_EXTRACTOR = '^.*_([0-9]+)$'
ASSIGNMENT_ID_EXTRACTOR = '^(%s)([^/]+)/core_store$'
# Public methods:
@@ -72,8 +71,8 @@
self.core_data_id_match = re.compile(
self.CORE_STORE_KEY_EXTRACTOR % self.coord.core_store_prefix).match
- self.core_match = re.compile(self.CORE_NUMBER_EXTRACTOR).match
- self.timestamp_match = re.compile(self.START_TIMESTAMP_EXTRACTOR ).match
+ self.core_match = re.compile(self.coord.container_name_regex).match
+ self.timestamp_match = re.compile(self.START_TIMESTAMP_EXTRACTOR).match
self.assignment_id_match = re.compile(
self.ASSIGNMENT_ID_EXTRACTOR % self.coord.assignment_prefix).match
@@ -168,9 +167,10 @@
# This method removes any duplicates from the member list using the
# voltha number from the member id and the time that voltha instance
# started, again from the member id. This method is meaningful only
- # in the swarm cluster. In a non-cluster environment the member id
- # is formatted differently. In such a case, the method below will
- # create an exception and return the member list as is.
+ # in a clustered environment (e.g. Docker swarm or Kubernetes). In
+ # a non-cluster environment the member id is formatted differently.
+ # In such a case, the method below will raise an exception and
+ # return the member list as is.
try:
unique_members = {}
@@ -202,7 +202,8 @@
return updated_members
else:
return members
- except:
+ except Exception as e:
+ log.exception('extraction-error', e=e)
return members
@inlineCallbacks
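
To illustrate the intent of the _sanitize_member_list changes, here is a
minimal sketch of the dedup idea with the Kubernetes extractor, assuming the
most recently started instance per core number is the one to keep (member ids
are illustrative; the real method operates on the records read from the KV
store):

    import re

    CORE_RE = r'^.*-([0-9]+)_.*$'   # --core-number-extractor from the manifests
    TS_RE   = r'^.*_([0-9]+)$'      # START_TIMESTAMP_EXTRACTOR

    def sanitize(member_ids):
        # Keep only the latest-started member per core number (sketch).
        latest = {}
        for mid in member_ids:
            core = re.match(CORE_RE, mid).group(1)
            ts = int(re.match(TS_RE, mid).group(1))
            if core not in latest or ts > latest[core][0]:
                latest[core] = (ts, mid)
        return [mid for _, mid in latest.values()]

    # A stale entry left behind by a failed vcore-1 pod loses to the
    # re-spawned instance with the newer start timestamp:
    members = ['vcore-0_1528157100', 'vcore-1_1528157100', 'vcore-1_1528157400']
    assert sorted(sanitize(members)) == ['vcore-0_1528157100', 'vcore-1_1528157400']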
diff --git a/voltha/main.py b/voltha/main.py
index 29585b4..c19a018 100755
--- a/voltha/main.py
+++ b/voltha/main.py
@@ -52,6 +52,7 @@
defs = dict(
config=os.environ.get('CONFIG', './voltha.yml'),
+ container_name_regex=os.environ.get('CORE_NUMBER_EXTRACTOR', '^.*\.([0-9]+)\..*$'),
consul=os.environ.get('CONSUL', 'localhost:8500'),
etcd=os.environ.get('ETCD', 'localhost:2379'),
inter_core_subnet=os.environ.get('INTER_CORE_SUBNET', None),
@@ -83,6 +84,12 @@
default=defs['config'],
help=_help)
+ _help = 'Regular expression for extracting core number from container name (default: %s)' % defs['container_name_regex']
+ parser.add_argument(
+ '-X', '--core-number-extractor', dest='container_name_regex', action='store',
+ default=defs['container_name_regex'],
+ help=_help)
+
_help = '<hostname>:<port> to consul agent (default: %s)' % defs['consul']
parser.add_argument(
'-C', '--consul', dest='consul', action='store',
@@ -281,6 +288,7 @@
self.log = setup_logging(self.config.get('logging', {}),
args.instance_id,
verbosity_adjust=verbosity_adjust)
+ self.log.info('core-number-extractor', regex=args.container_name_regex)
# configurable variables from voltha.yml file
#self.configurable_vars = self.config.get('Constants', {})
@@ -288,7 +296,7 @@
if not args.no_banner:
print_banner(self.log)
- # Create a unique instnce id using the passed-in instanceid and
+ # Create a unique instance id using the passed-in instance id and
# UTC timestamp
current_time = arrow.utcnow().timestamp
self.instance_id = self.args.instance_id + '_' + str(current_time)
@@ -340,7 +348,8 @@
rest_port=self.args.rest_port,
instance_id=self.instance_id,
config=self.config,
- consul=self.args.consul)
+ consul=self.args.consul,
+ container_name_regex=self.args.container_name_regex)
).start()
elif self.args.backend == 'etcd':
yield registry.register(
@@ -352,7 +361,8 @@
instance_id=self.instance_id,
config=self.config,
consul=self.args.consul,
- etcd=self.args.etcd)
+ etcd=self.args.etcd,
+ container_name_regex=self.args.container_name_regex)
).start()
self.log.info('waiting-for-config-assignment')