VOL-2204 Report health status of Kafka service

Change-Id: I993b89f500bcbaa26da77f193575f5f20c3fb7c9
diff --git a/pkg/kafka/client.go b/pkg/kafka/client.go
index 488bf9f..bda7ed9 100755
--- a/pkg/kafka/client.go
+++ b/pkg/kafka/client.go
@@ -69,4 +69,5 @@
 	Send(msg interface{}, topic *Topic, keys ...string) error
 	SendLiveness() error
 	EnableLivenessChannel(enable bool) chan bool
+	EnableHealthinessChannel(enable bool) chan bool
 }
diff --git a/pkg/kafka/kafka_inter_container_library.go b/pkg/kafka/kafka_inter_container_library.go
index 3326191..4e04b30 100644
--- a/pkg/kafka/kafka_inter_container_library.go
+++ b/pkg/kafka/kafka_inter_container_library.go
@@ -768,6 +768,10 @@
 	return kp.kafkaClient.EnableLivenessChannel(enable)
 }
 
+func (kp *InterContainerProxy) EnableHealthinessChannel(enable bool) chan bool {
+	return kp.kafkaClient.EnableHealthinessChannel(enable)
+}
+
 func (kp *InterContainerProxy) SendLiveness() error {
 	return kp.kafkaClient.SendLiveness()
 }
diff --git a/pkg/kafka/sarama_client.go b/pkg/kafka/sarama_client.go
index ff521a7..ca88dfd 100755
--- a/pkg/kafka/sarama_client.go
+++ b/pkg/kafka/sarama_client.go
@@ -81,6 +81,8 @@
 	livenessChannelInterval       time.Duration
 	lastLivenessTime              time.Time
 	started                       bool
+	healthy                       bool
+	healthiness                   chan bool
 }
 
 type SaramaClientOption func(*SaramaClient)
@@ -231,8 +233,9 @@
 	client.lockOfTopicLockMap = sync.RWMutex{}
 	client.lockOfGroupConsumers = sync.RWMutex{}
 
-	// alive until proven otherwise
+	// healthy and alive until proven otherwise
 	client.alive = true
+	client.healthy = true
 
 	return client
 }
@@ -484,6 +487,15 @@
 	}
 }
 
+// Once unhealthy, we never go back
+func (sc *SaramaClient) setUnhealthy() {
+	sc.healthy = false
+	if sc.healthiness != nil {
+		log.Infow("set-client-unhealthy", log.Fields{"healthy": sc.healthy})
+		sc.healthiness <- sc.healthy
+	}
+}
+
 func (sc *SaramaClient) isLivenessError(err error) bool {
 	// Sarama producers and consumers encapsulate the error inside
 	// a ProducerError or ConsumerError struct.
@@ -600,6 +612,30 @@
 	return sc.liveness
 }
 
+// Enable the Healthiness monitor channel. This channel will report "false"
+// if the kafka consumers die, or some other problem occurs which is
+// catastrophic that would require re-creating the client.
+func (sc *SaramaClient) EnableHealthinessChannel(enable bool) chan bool {
+	log.Infow("kafka-enable-healthiness-channel", log.Fields{"enable": enable})
+	if enable {
+		if sc.healthiness == nil {
+			log.Info("kafka-create-healthiness-channel")
+			// At least 1, so we can immediately post to it without blocking
+			// Setting a bigger number (10) allows the monitor to fall behind
+			// without blocking others. The monitor shouldn't really fall
+			// behind...
+			sc.healthiness = make(chan bool, 10)
+			// post intial state to the channel
+			sc.healthiness <- sc.healthy
+		}
+	} else {
+		// TODO: Think about whether we need the ability to turn off
+		// liveness monitoring
+		panic("Turning off healthiness reporting is not supported")
+	}
+	return sc.healthiness
+}
+
 // send an empty message on the liveness channel to check whether connectivity has
 // been restored.
 func (sc *SaramaClient) SendLiveness() error {
@@ -936,6 +972,7 @@
 		}
 	}
 	log.Infow("partition-consumer-stopped", log.Fields{"topic": topic.Name})
+	sc.setUnhealthy()
 }
 
 func (sc *SaramaClient) consumeGroupMessages(topic *Topic, consumer *scc.Consumer, consumerChnls *consumerChannels) {
@@ -979,6 +1016,7 @@
 		}
 	}
 	log.Infow("group-consumer-stopped", log.Fields{"topic": topic.Name})
+	sc.setUnhealthy()
 }
 
 func (sc *SaramaClient) startConsumers(topic *Topic) error {