VOL-2101 readiness and liveliness check added to message-bus and etcd updated vendor folder Change-Id: I0c1b983bf47e2740e0a24ea0c3a05a803caff15c updated testcases

commit: 95f2152a3297f98e15748d7ec9234a57c5fd99ad [log] [tgz]
author: cbabu <Chidambar.babu@radisys.com> Wed Nov 13 14:25:18 2019 +0100
committer: Girish Gowdra <girish@opennetworking.org> Thu Nov 14 13:44:11 2019 +0000
tree: 88ca8aeec5159cb138dc4f2d15c953f3c3ac560e
parent: e4996f371dc6dff2a7c190aa10c833e09e165394 [diff]
diff --git a/vendor/github.com/opencord/voltha-lib-go/v2/pkg/db/kvstore/etcdclient.go b/vendor/github.com/opencord/voltha-lib-go/v2/pkg/db/kvstore/etcdclient.go
index e8bc92c..3ae767c 100644
--- a/vendor/github.com/opencord/voltha-lib-go/v2/pkg/db/kvstore/etcdclient.go
+++ b/vendor/github.com/opencord/voltha-lib-go/v2/pkg/db/kvstore/etcdclient.go

@@ -38,6 +38,9 @@
 	lockToMutexLock  sync.Mutex
 }
 
+// Connection Timeout in Seconds
+var connTimeout int = 2
+
 // NewEtcdClient returns a new client for the Etcd KV store
 func NewEtcdClient(addr string, timeout int) (*EtcdClient, error) {
 	duration := GetDuration(timeout)
@@ -188,8 +191,13 @@
 		return nil, fmt.Errorf("unexpected-type%T", value)
 	}
 
+	duration := GetDuration(connTimeout)
+
 	// Create a lease
-	resp, err := c.ectdAPI.Grant(context.Background(), ttl)
+	ctx, cancel := context.WithTimeout(context.Background(), duration)
+	defer cancel()
+
+	resp, err := c.ectdAPI.Grant(ctx, ttl)
 	if err != nil {
 		log.Error(err)
 		return nil, err
@@ -255,8 +263,12 @@
 func (c *EtcdClient) ReleaseAllReservations() error {
 	c.writeLock.Lock()
 	defer c.writeLock.Unlock()
+	duration := GetDuration(connTimeout)
+	ctx, cancel := context.WithTimeout(context.Background(), duration)
+	defer cancel()
+
 	for key, leaseID := range c.keyReservations {
-		_, err := c.ectdAPI.Revoke(context.Background(), *leaseID)
+		_, err := c.ectdAPI.Revoke(ctx, *leaseID)
 		if err != nil {
 			log.Errorw("cannot-release-reservation", log.Fields{"key": key, "error": err})
 			return err
@@ -277,8 +289,12 @@
 	if leaseID, ok = c.keyReservations[key]; !ok {
 		return nil
 	}
+	duration := GetDuration(connTimeout)
+	ctx, cancel := context.WithTimeout(context.Background(), duration)
+	defer cancel()
+
 	if leaseID != nil {
-		_, err := c.ectdAPI.Revoke(context.Background(), *leaseID)
+		_, err := c.ectdAPI.Revoke(ctx, *leaseID)
 		if err != nil {
 			log.Error(err)
 			return err
@@ -299,9 +315,12 @@
 	if leaseID, ok = c.keyReservations[key]; !ok {
 		return errors.New("key-not-reserved")
 	}
+	duration := GetDuration(connTimeout)
+	ctx, cancel := context.WithTimeout(context.Background(), duration)
+	defer cancel()
 
 	if leaseID != nil {
-		_, err := c.ectdAPI.KeepAliveOnce(context.Background(), *leaseID)
+		_, err := c.ectdAPI.KeepAliveOnce(ctx, *leaseID)
 		if err != nil {
 			log.Errorw("lease-may-have-expired", log.Fields{"error": err})
 			return err

diff --git a/vendor/github.com/opencord/voltha-lib-go/v2/pkg/kafka/client.go b/vendor/github.com/opencord/voltha-lib-go/v2/pkg/kafka/client.go
index 007aa74..488bf9f 100644
--- a/vendor/github.com/opencord/voltha-lib-go/v2/pkg/kafka/client.go
+++ b/vendor/github.com/opencord/voltha-lib-go/v2/pkg/kafka/client.go

@@ -55,6 +55,7 @@
 	DefaultNumberReplicas           = 1
 	DefaultAutoCreateTopic          = false
 	DefaultMetadataMaxRetry         = 3
+	DefaultLivenessChannelInterval  = time.Second * 30
 )
 
 // MsgClient represents the set of APIs  a Kafka MsgClient must implement
@@ -66,4 +67,6 @@
 	Subscribe(topic *Topic, kvArgs ...*KVArg) (<-chan *ca.InterContainerMessage, error)
 	UnSubscribe(topic *Topic, ch <-chan *ca.InterContainerMessage) error
 	Send(msg interface{}, topic *Topic, keys ...string) error
+	SendLiveness() error
+	EnableLivenessChannel(enable bool) chan bool
 }

diff --git a/vendor/github.com/opencord/voltha-lib-go/v2/pkg/kafka/kafka_inter_container_library.go b/vendor/github.com/opencord/voltha-lib-go/v2/pkg/kafka/kafka_inter_container_library.go
index c576bc6..3326191 100644
--- a/vendor/github.com/opencord/voltha-lib-go/v2/pkg/kafka/kafka_inter_container_library.go
+++ b/vendor/github.com/opencord/voltha-lib-go/v2/pkg/kafka/kafka_inter_container_library.go

@@ -764,6 +764,14 @@
 	return nil
 }
 
+func (kp *InterContainerProxy) EnableLivenessChannel(enable bool) chan bool {
+	return kp.kafkaClient.EnableLivenessChannel(enable)
+}
+
+func (kp *InterContainerProxy) SendLiveness() error {
+	return kp.kafkaClient.SendLiveness()
+}
+
 //formatRequest formats a request to send over kafka and returns an InterContainerMessage message on success
 //or an error on failure
 func encodeRequest(rpc string, toTopic *Topic, replyTopic *Topic, key string, kvArgs ...*KVArg) (*ic.InterContainerMessage, error) {

diff --git a/vendor/github.com/opencord/voltha-lib-go/v2/pkg/kafka/sarama_client.go b/vendor/github.com/opencord/voltha-lib-go/v2/pkg/kafka/sarama_client.go
index fc75026..a251c56 100644
--- a/vendor/github.com/opencord/voltha-lib-go/v2/pkg/kafka/sarama_client.go
+++ b/vendor/github.com/opencord/voltha-lib-go/v2/pkg/kafka/sarama_client.go

@@ -74,6 +74,11 @@
 	topicLockMap                  map[string]*sync.RWMutex
 	lockOfTopicLockMap            sync.RWMutex
 	metadataMaxRetry              int
+	alive                         bool
+	liveness                      chan bool
+	livenessChannelInterval       time.Duration
+	lastLivenessTime              time.Time
+	started                       bool
 }
 
 type SaramaClientOption func(*SaramaClient)
@@ -186,6 +191,12 @@
 	}
 }
 
+func LivenessChannelInterval(opt time.Duration) SaramaClientOption {
+	return func(args *SaramaClient) {
+		args.livenessChannelInterval = opt
+	}
+}
+
 func NewSaramaClient(opts ...SaramaClientOption) *SaramaClient {
 	client := &SaramaClient{
 		KafkaHost: DefaultKafkaHost,
@@ -205,6 +216,7 @@
 	client.numReplicas = DefaultNumberReplicas
 	client.autoCreateTopic = DefaultAutoCreateTopic
 	client.metadataMaxRetry = DefaultMetadataMaxRetry
+	client.livenessChannelInterval = DefaultLivenessChannelInterval
 
 	for _, option := range opts {
 		option(client)
@@ -216,6 +228,10 @@
 	client.topicLockMap = make(map[string]*sync.RWMutex)
 	client.lockOfTopicLockMap = sync.RWMutex{}
 	client.lockOfGroupConsumers = sync.RWMutex{}
+
+	// alive until proven otherwise
+	client.alive = true
+
 	return client
 }
 
@@ -259,12 +275,16 @@
 
 	log.Info("kafka-sarama-client-started")
 
+	sc.started = true
+
 	return nil
 }
 
 func (sc *SaramaClient) Stop() {
 	log.Info("stopping-sarama-client")
 
+	sc.started = false
+
 	//Send a message over the done channel to close all long running routines
 	sc.doneCh <- 1
 
@@ -438,6 +458,30 @@
 	return err
 }
 
+func (sc *SaramaClient) updateLiveness(alive bool) {
+	// Post a consistent stream of liveness data to the channel,
+	// so that in a live state, the core does not timeout and
+	// send a forced liveness message. Production of liveness
+	// events to the channel is rate-limited by livenessChannelInterval.
+	if sc.liveness != nil {
+		if sc.alive != alive {
+			log.Info("update-liveness-channel-because-change")
+			sc.liveness <- alive
+			sc.lastLivenessTime = time.Now()
+		} else if time.Now().Sub(sc.lastLivenessTime) > sc.livenessChannelInterval {
+			log.Info("update-liveness-channel-because-interval")
+			sc.liveness <- alive
+			sc.lastLivenessTime = time.Now()
+		}
+	}
+
+	// Only emit a log message when the state changes
+	if sc.alive != alive {
+		log.Info("set-client-alive", log.Fields{"alive": alive})
+		sc.alive = alive
+	}
+}
+
 // send formats and sends the request onto the kafka messaging bus.
 func (sc *SaramaClient) Send(msg interface{}, topic *Topic, keys ...string) error {
 
@@ -474,8 +518,68 @@
 	select {
 	case ok := <-sc.producer.Successes():
 		log.Debugw("message-sent", log.Fields{"status": ok.Topic})
+		sc.updateLiveness(true)
 	case notOk := <-sc.producer.Errors():
 		log.Debugw("error-sending", log.Fields{"status": notOk})
+		if strings.Contains(notOk.Error(), "Failed to produce") {
+			sc.updateLiveness(false)
+		}
+		return notOk
+	}
+	return nil
+}
+
+// Enable the liveness monitor channel. This channel will report
+// a "true" or "false" on every publish, which indicates whether
+// or not the channel is still live. This channel is then picked up
+// by the service (i.e. rw_core / ro_core) to update readiness status
+// and/or take other actions.
+func (sc *SaramaClient) EnableLivenessChannel(enable bool) chan bool {
+	log.Infow("kafka-enable-liveness-channel", log.Fields{"enable": enable})
+	if enable {
+		if sc.liveness == nil {
+			log.Info("kafka-create-liveness-channel")
+			// At least 1, so we can immediately post to it without blocking
+			// Setting a bigger number (10) allows the monitor to fall behind
+			// without blocking others. The monitor shouldn't really fall
+			// behind...
+			sc.liveness = make(chan bool, 10)
+			// post intial state to the channel
+			sc.liveness <- sc.alive
+		}
+	} else {
+		// TODO: Think about whether we need the ability to turn off
+		// liveness monitoring
+		panic("Turning off liveness reporting is not supported")
+	}
+	return sc.liveness
+}
+
+// send an empty message on the liveness channel to check whether connectivity has
+// been restored.
+func (sc *SaramaClient) SendLiveness() error {
+	if !sc.started {
+		return fmt.Errorf("SendLiveness() called while not started")
+	}
+
+	kafkaMsg := &sarama.ProducerMessage{
+		Topic: "_liveness_test",
+		Value: sarama.StringEncoder(time.Now().Format(time.RFC3339)), // for debugging / informative use
+	}
+
+	// Send message to kafka
+	sc.producer.Input() <- kafkaMsg
+	// Wait for result
+	// TODO: Use a lock or a different mechanism to ensure the response received corresponds to the message sent.
+	select {
+	case ok := <-sc.producer.Successes():
+		log.Debugw("liveness-message-sent", log.Fields{"status": ok.Topic})
+		sc.updateLiveness(true)
+	case notOk := <-sc.producer.Errors():
+		log.Debugw("liveness-error-sending", log.Fields{"status": notOk})
+		if strings.Contains(notOk.Error(), "Failed to produce") {
+			sc.updateLiveness(false)
+		}
 		return notOk
 	}
 	return nil
@@ -713,7 +817,8 @@
 	config := scc.NewConfig()
 	config.ClientID = uuid.New().String()
 	config.Group.Mode = scc.ConsumerModeMultiplex
-	//config.Consumer.Return.Errors = true
+	config.Consumer.Group.Heartbeat.Interval, _ = time.ParseDuration("1s")
+	config.Consumer.Return.Errors = true
 	//config.Group.Return.Notifications = false
 	//config.Consumer.MaxWaitTime = time.Duration(DefaultConsumerMaxwait) * time.Millisecond
 	//config.Consumer.MaxProcessingTime = time.Duration(DefaultMaxProcessingTime) * time.Millisecond
@@ -791,16 +896,20 @@
 		select {
 		case err, ok := <-consumer.Errors():
 			if ok {
+				sc.updateLiveness(false)
 				log.Warnw("group-consumers-error", log.Fields{"topic": topic.Name, "error": err})
 			} else {
+				log.Warnw("group-consumers-closed-err", log.Fields{"topic": topic.Name})
 				// channel is closed
 				break startloop
 			}
 		case msg, ok := <-consumer.Messages():
 			if !ok {
+				log.Warnw("group-consumers-closed-msg", log.Fields{"topic": topic.Name})
 				// Channel closed
 				break startloop
 			}
+			sc.updateLiveness(true)
 			log.Debugw("message-received", log.Fields{"timestamp": msg.Timestamp, "receivedTopic": msg.Topic})
 			msgBody := msg.Value
 			icm := &ic.InterContainerMessage{}

diff --git a/vendor/github.com/opencord/voltha-lib-go/v2/pkg/probe/probe.go b/vendor/github.com/opencord/voltha-lib-go/v2/pkg/probe/probe.go
index 9823566..7e6dbf9 100644
--- a/vendor/github.com/opencord/voltha-lib-go/v2/pkg/probe/probe.go
+++ b/vendor/github.com/opencord/voltha-lib-go/v2/pkg/probe/probe.go

@@ -47,6 +47,9 @@
 
 	// ServiceStatusFailed service has stopped because of an error
 	ServiceStatusFailed
+
+	// ServiceStatusNotReady service has started but is unable to accept requests
+	ServiceStatusNotReady
 )
 
 const (
@@ -71,6 +74,8 @@
 		return "Stopped"
 	case ServiceStatusFailed:
 		return "Failed"
+	case ServiceStatusNotReady:
+		return "NotReady"
 	}
 }
 
@@ -137,6 +142,13 @@
 	if p.status == nil {
 		p.status = make(map[string]ServiceStatus)
 	}
+
+	// if status hasn't changed, avoid doing useless work
+	existingStatus, ok := p.status[name]
+	if ok && (existingStatus == status) {
+		return
+	}
+
 	p.status[name] = status
 	if p.readyFunc != nil {
 		p.isReady = p.readyFunc(p.status)
@@ -158,17 +170,41 @@
 		})
 }
 
+func (p *Probe) GetStatus(name string) ServiceStatus {
+	p.mutex.Lock()
+	defer p.mutex.Unlock()
+
+	if p.status == nil {
+		p.status = make(map[string]ServiceStatus)
+	}
+
+	currentStatus, ok := p.status[name]
+	if ok {
+		return currentStatus
+	}
+
+	return ServiceStatusUnknown
+}
+
+func GetProbeFromContext(ctx context.Context) *Probe {
+	if ctx != nil {
+		if value := ctx.Value(ProbeContextKey); value != nil {
+			if p, ok := value.(*Probe); ok {
+				return p
+			}
+		}
+	}
+	return nil
+}
+
 // UpdateStatusFromContext a convenience function to pull the Probe reference from the
 // Context, if it exists, and then calling UpdateStatus on that Probe reference. If Context
 // is nil or if a Probe reference is not associated with the ProbeContextKey then nothing
 // happens
 func UpdateStatusFromContext(ctx context.Context, name string, status ServiceStatus) {
-	if ctx != nil {
-		if value := ctx.Value(ProbeContextKey); value != nil {
-			if p, ok := value.(*Probe); ok {
-				p.UpdateStatus(name, status)
-			}
-		}
+	p := GetProbeFromContext(ctx)
+	if p != nil {
+		p.UpdateStatus(name, status)
 	}
 }
 
@@ -225,6 +261,10 @@
 	log.Fatal(s.ListenAndServe())
 }
 
+func (p *Probe) IsReady() bool {
+	return p.isReady
+}
+
 // defaultReadyFunc if all services are running then ready, else not
 func defaultReadyFunc(services map[string]ServiceStatus) bool {
 	if len(services) == 0 {
commit	95f2152a3297f98e15748d7ec9234a57c5fd99ad	[log] [tgz]
author	cbabu <Chidambar.babu@radisys.com>	Wed Nov 13 14:25:18 2019 +0100
committer	Girish Gowdra <girish@opennetworking.org>	Thu Nov 14 13:44:11 2019 +0000
tree	88ca8aeec5159cb138dc4f2d15c953f3c3ac560e
parent	e4996f371dc6dff2a7c190aa10c833e09e165394 [diff]