blob: b74de4a549ddafbbcc5abc2ad85fc62fb4d5c9dc [file] [log] [blame]
khenaidoob9203542018-09-17 22:56:37 -04001/*
2 * Copyright 2018-present Open Networking Foundation
3
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7
8 * http://www.apache.org/licenses/LICENSE-2.0
9
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16package core
17
18import (
19 "context"
sbarbari17d7e222019-11-05 10:02:29 -050020 "github.com/opencord/voltha-go/db/model"
khenaidoob9203542018-09-17 22:56:37 -040021 "github.com/opencord/voltha-go/rw_core/config"
sbarbari17d7e222019-11-05 10:02:29 -050022 "github.com/opencord/voltha-lib-go/v2/pkg/db"
Scott Baker807addd2019-10-24 15:16:21 -070023 "github.com/opencord/voltha-lib-go/v2/pkg/db/kvstore"
Scott Baker807addd2019-10-24 15:16:21 -070024 grpcserver "github.com/opencord/voltha-lib-go/v2/pkg/grpc"
25 "github.com/opencord/voltha-lib-go/v2/pkg/kafka"
26 "github.com/opencord/voltha-lib-go/v2/pkg/log"
27 "github.com/opencord/voltha-lib-go/v2/pkg/probe"
Scott Baker555307d2019-11-04 08:58:01 -080028 "github.com/opencord/voltha-protos/v2/go/voltha"
khenaidoob9203542018-09-17 22:56:37 -040029 "google.golang.org/grpc"
khenaidoob3244212019-08-27 14:32:27 -040030 "google.golang.org/grpc/codes"
31 "google.golang.org/grpc/status"
32 "time"
khenaidoob9203542018-09-17 22:56:37 -040033)
34
// Core is the root object of the rw_core component. It aggregates the
// device/logical-device/adapter managers, the north-bound gRPC server and
// handler, the Kafka inter-container proxy, and the KV-store backed data
// roots/proxies shared by those managers.
type Core struct {
	instanceId        string                     // unique identifier of this core instance
	deviceMgr         *DeviceManager             // manages physical devices
	logicalDeviceMgr  *LogicalDeviceManager      // manages logical devices
	grpcServer        *grpcserver.GrpcServer     // north-bound gRPC server
	grpcNBIAPIHandler *APIHandler                // handler registered on the gRPC server
	adapterMgr        *AdapterManager            // manages adapters
	config            *config.RWCoreFlags        // command-line/runtime configuration
	kmp               *kafka.InterContainerProxy // Kafka inter-container messaging proxy
	clusterDataRoot   model.Root                 // KV-backed root for cluster-wide (voltha) data
	localDataRoot     model.Root                 // KV-backed root for this core instance's data
	clusterDataProxy  *model.Proxy               // proxy rooted at "/" of clusterDataRoot
	localDataProxy    *model.Proxy               // proxy rooted at "/" of localDataRoot
	exitChannel       chan int                   // buffered (size 1); signaled by Stop
	kvClient          kvstore.Client             // client used for KV-store connectivity checks
	kafkaClient       kafka.Client               // underlying Kafka client handed to the kmp
	deviceOwnership   *DeviceOwnership           // tracks which core owns which device
}
53
// init registers this package with the shared logger, defaulting to JSON
// output at Warn level; levels can be changed at runtime via the log package.
func init() {
	log.AddPackage(log.JSON, log.WarnLevel, nil)
}
57
khenaidoo43c82122018-11-22 18:38:28 -050058func NewCore(id string, cf *config.RWCoreFlags, kvClient kvstore.Client, kafkaClient kafka.Client) *Core {
khenaidoob9203542018-09-17 22:56:37 -040059 var core Core
60 core.instanceId = id
61 core.exitChannel = make(chan int, 1)
62 core.config = cf
Richard Jankowskie4d77662018-10-17 13:53:21 -040063 core.kvClient = kvClient
khenaidoo43c82122018-11-22 18:38:28 -050064 core.kafkaClient = kafkaClient
Richard Jankowskie4d77662018-10-17 13:53:21 -040065
66 // Setup the KV store
sbarbari17d7e222019-11-05 10:02:29 -050067 backend := db.Backend{
khenaidoo7ccedd52018-12-14 16:48:54 -050068 Client: kvClient,
69 StoreType: cf.KVStoreType,
70 Host: cf.KVStoreHost,
71 Port: cf.KVStorePort,
72 Timeout: cf.KVStoreTimeout,
khenaidoo9cdc1a62019-01-24 21:57:40 -050073 PathPrefix: cf.KVStoreDataPrefix}
khenaidoo7ccedd52018-12-14 16:48:54 -050074 core.clusterDataRoot = model.NewRoot(&voltha.Voltha{}, &backend)
75 core.localDataRoot = model.NewRoot(&voltha.CoreInstance{}, &backend)
Stephane Barbarieef6650d2019-07-18 12:15:09 -040076 core.clusterDataProxy = core.clusterDataRoot.CreateProxy(context.Background(), "/", false)
77 core.localDataProxy = core.localDataRoot.CreateProxy(context.Background(), "/", false)
khenaidoob9203542018-09-17 22:56:37 -040078 return &core
79}
80
81func (core *Core) Start(ctx context.Context) {
David K. Bainbridgeb4a9ab02019-09-20 15:12:16 -070082
83 // If the context has a probe then fetch it and register our services
84 var p *probe.Probe
85 if value := ctx.Value(probe.ProbeContextKey); value != nil {
86 if _, ok := value.(*probe.Probe); ok {
87 p = value.(*probe.Probe)
88 p.RegisterService(
89 "message-bus",
90 "kv-store",
91 "device-manager",
92 "logical-device-manager",
93 "adapter-manager",
94 "grpc-service",
95 )
96 }
97 }
98
khenaidoob3244212019-08-27 14:32:27 -040099 log.Info("starting-core-services", log.Fields{"coreId": core.instanceId})
100
101 // Wait until connection to KV Store is up
102 if err := core.waitUntilKVStoreReachableOrMaxTries(ctx, core.config.MaxConnectionRetries, core.config.ConnectionRetryInterval); err != nil {
103 log.Fatal("Unable-to-connect-to-KV-store")
104 }
David K. Bainbridgeb4a9ab02019-09-20 15:12:16 -0700105 if p != nil {
106 p.UpdateStatus("kv-store", probe.ServiceStatusRunning)
107 }
khenaidoob3244212019-08-27 14:32:27 -0400108
Scott Bakeree6a0872019-10-29 15:59:52 -0700109 // core.kmp must be created before deviceMgr and adapterMgr, as they will make
110 // private copies of the poiner to core.kmp.
111 if err := core.initKafkaManager(ctx); err != nil {
112 log.Fatal("Failed-to-init-kafka-manager")
David K. Bainbridgeb4a9ab02019-09-20 15:12:16 -0700113 }
khenaidoob3244212019-08-27 14:32:27 -0400114
khenaidoo631fe542019-05-31 15:44:43 -0400115 log.Debugw("values", log.Fields{"kmp": core.kmp})
Richard Jankowski199fd862019-03-18 14:49:51 -0400116 core.deviceMgr = newDeviceManager(core)
khenaidooba6b6c42019-08-02 09:11:56 -0400117 core.adapterMgr = newAdapterManager(core.clusterDataProxy, core.instanceId, core.deviceMgr)
118 core.deviceMgr.adapterMgr = core.adapterMgr
khenaidoo2c6a0992019-04-29 13:46:56 -0400119 core.logicalDeviceMgr = newLogicalDeviceManager(core, core.deviceMgr, core.kmp, core.clusterDataProxy, core.config.DefaultCoreTimeout)
khenaidoo54e0ddf2019-02-27 16:21:33 -0500120
Scott Bakeree6a0872019-10-29 15:59:52 -0700121 // Start the KafkaManager. This must be done after the deviceMgr, adapterMgr, and
122 // logicalDeviceMgr have been created, as once the kmp is started, it will register
123 // the above with the kmp.
124
125 go core.startKafkaManager(ctx,
126 core.config.ConnectionRetryInterval,
127 core.config.LiveProbeInterval,
128 core.config.NotLiveProbeInterval)
khenaidoob3244212019-08-27 14:32:27 -0400129
khenaidoob9203542018-09-17 22:56:37 -0400130 go core.startDeviceManager(ctx)
131 go core.startLogicalDeviceManager(ctx)
132 go core.startGRPCService(ctx)
khenaidoo21d51152019-02-01 13:48:37 -0500133 go core.startAdapterManager(ctx)
khenaidoob9203542018-09-17 22:56:37 -0400134
khenaidoo1ce37ad2019-03-24 22:07:24 -0400135 // Setup device ownership context
136 core.deviceOwnership = NewDeviceOwnership(core.instanceId, core.kvClient, core.deviceMgr, core.logicalDeviceMgr,
137 "service/voltha/owns_device", 10)
138
khenaidoob3244212019-08-27 14:32:27 -0400139 log.Info("core-services-started")
khenaidoob9203542018-09-17 22:56:37 -0400140}
141
// Stop shuts the core down: it first posts to the exit channel, then stops
// the gRPC server, the logical device manager, the device manager and finally
// the Kafka proxy. Every component is nil-checked, so Stop is safe to call on
// a core that was only partially started.
func (core *Core) Stop(ctx context.Context) {
	log.Info("stopping-adaptercore")
	// Signal exit; the channel is buffered (size 1) so this send won't block
	// the first time it is used.
	if core.exitChannel != nil {
		core.exitChannel <- 1
	}
	// Stop all the started services
	if core.grpcServer != nil {
		core.grpcServer.Stop()
	}
	if core.logicalDeviceMgr != nil {
		core.logicalDeviceMgr.stop(ctx)
	}
	if core.deviceMgr != nil {
		core.deviceMgr.stop(ctx)
	}
	if core.kmp != nil {
		core.kmp.Stop()
	}
	log.Info("adaptercore-stopped")
}
162
khenaidoo631fe542019-05-31 15:44:43 -0400163//startGRPCService creates the grpc service handlers, registers it to the grpc server and starts the server
khenaidoob9203542018-09-17 22:56:37 -0400164func (core *Core) startGRPCService(ctx context.Context) {
165 // create an insecure gserver server
Scott Bakeree6a0872019-10-29 15:59:52 -0700166 core.grpcServer = grpcserver.NewGrpcServer(core.config.GrpcHost, core.config.GrpcPort, nil, false, probe.GetProbeFromContext(ctx))
khenaidoob9203542018-09-17 22:56:37 -0400167 log.Info("grpc-server-created")
168
khenaidoo54e0ddf2019-02-27 16:21:33 -0500169 core.grpcNBIAPIHandler = NewAPIHandler(core)
Richard Jankowski46464e92019-03-05 11:53:55 -0500170 log.Infow("grpc-handler", log.Fields{"core_binding_key": core.config.CoreBindingKey})
Richard Jankowskidbab94a2018-12-06 16:20:25 -0500171 core.logicalDeviceMgr.setGrpcNbiHandler(core.grpcNBIAPIHandler)
khenaidoob9203542018-09-17 22:56:37 -0400172 // Create a function to register the core GRPC service with the GRPC server
173 f := func(gs *grpc.Server) {
174 voltha.RegisterVolthaServiceServer(
175 gs,
Richard Jankowskidbab94a2018-12-06 16:20:25 -0500176 core.grpcNBIAPIHandler,
khenaidoob9203542018-09-17 22:56:37 -0400177 )
178 }
179
180 core.grpcServer.AddService(f)
181 log.Info("grpc-service-added")
182
David K. Bainbridgeb4a9ab02019-09-20 15:12:16 -0700183 /*
184 * Start the GRPC server
185 *
186 * This is a bit sub-optimal here as the grpcServer.Start call does not return (blocks)
187 * until something fails, but we want to send a "start" status update. As written this
188 * means that we are actually sending the "start" status update before the server is
189 * started, which means it is possible that the status is "running" before it actually is.
190 *
191 * This means that there is a small window in which the core could return its status as
192 * ready, when it really isn't.
193 */
194 probe.UpdateStatusFromContext(ctx, "grpc-service", probe.ServiceStatusRunning)
khenaidoob9203542018-09-17 22:56:37 -0400195 log.Info("grpc-server-started")
David K. Bainbridgeb4a9ab02019-09-20 15:12:16 -0700196 core.grpcServer.Start(context.Background())
197 probe.UpdateStatusFromContext(ctx, "grpc-service", probe.ServiceStatusStopped)
khenaidoob9203542018-09-17 22:56:37 -0400198}
199
Scott Bakeree6a0872019-10-29 15:59:52 -0700200// Initialize the kafka manager, but we will start it later
201func (core *Core) initKafkaManager(ctx context.Context) error {
202 log.Infow("initialize-kafka-manager", log.Fields{"host": core.config.KafkaAdapterHost,
khenaidoob9203542018-09-17 22:56:37 -0400203 "port": core.config.KafkaAdapterPort, "topic": core.config.CoreTopic})
Scott Bakeree6a0872019-10-29 15:59:52 -0700204
205 probe.UpdateStatusFromContext(ctx, "message-bus", probe.ServiceStatusPreparing)
206
207 // create the proxy
khenaidoob9203542018-09-17 22:56:37 -0400208 var err error
khenaidoo43c82122018-11-22 18:38:28 -0500209 if core.kmp, err = kafka.NewInterContainerProxy(
210 kafka.InterContainerHost(core.config.KafkaAdapterHost),
211 kafka.InterContainerPort(core.config.KafkaAdapterPort),
212 kafka.MsgClient(core.kafkaClient),
khenaidoo79232702018-12-04 11:00:41 -0500213 kafka.DefaultTopic(&kafka.Topic{Name: core.config.CoreTopic}),
214 kafka.DeviceDiscoveryTopic(&kafka.Topic{Name: core.config.AffinityRouterTopic})); err != nil {
khenaidoob9203542018-09-17 22:56:37 -0400215 log.Errorw("fail-to-create-kafka-proxy", log.Fields{"error": err})
216 return err
217 }
Scott Bakeree6a0872019-10-29 15:59:52 -0700218
219 probe.UpdateStatusFromContext(ctx, "message-bus", probe.ServiceStatusPrepared)
220
221 return nil
222}
223
/*
 * KafkaMonitorThread
 *
 * Responsible for starting the Kafka Interadapter Proxy and monitoring its liveness
 * state.
 *
 * Any producer that fails to send will cause KafkaInterContainerProxy to
 * post a false event on its liveness channel. Any producer that succeeds in sending
 * will cause KafkaInterContainerProxy to post a true event on its liveness
 * channel. Group receivers also update liveness state, and a receiver will typically
 * indicate a loss of liveness within 3-5 seconds of Kafka going down. Receivers
 * only indicate restoration of liveness if a message is received. During normal
 * operation, messages will be routinely produced and received, automatically
 * indicating liveness state. These routine liveness indications are rate-limited
 * inside sarama_client.
 *
 * This thread monitors the status of KafkaInterContainerProxy's liveness and pushes
 * that state to the core's readiness probes. If no liveness event has been seen
 * within a timeout, then the thread will make an attempt to produce a "liveness"
 * message, which will in turn trigger a liveness event on the liveness channel, true
 * or false depending on whether the attempt succeeded.
 *
 * The gRPC server in turn monitors the state of the readiness probe and will
 * start issuing UNAVAILABLE response while the probe is not ready.
 *
 * startupRetryInterval -- interval between attempts to start
 * liveProbeInterval -- interval between liveness checks when in a live state
 * notLiveProbeInterval -- interval between liveness checks when in a notLive state
 *
 * liveProbeInterval and notLiveProbeInterval can be configured separately,
 * though the current default is that both are set to 60 seconds.
 */

// startKafkaManager runs forever: it first retries core.kmp.Start until it
// succeeds, registers the adapter request handlers (fatal on failure), then
// loops translating liveness events into "message-bus" probe status updates.
// All interval parameters are in seconds.
func (core *Core) startKafkaManager(ctx context.Context, startupRetryInterval int, liveProbeInterval int, notLiveProbeInterval int) {
	log.Infow("starting-kafka-manager-thread", log.Fields{"host": core.config.KafkaAdapterHost,
		"port": core.config.KafkaAdapterPort, "topic": core.config.CoreTopic})

	started := false
	for !started {
		// If we haven't started yet, then try to start
		log.Infow("starting-kafka-proxy", log.Fields{})
		if err := core.kmp.Start(); err != nil {
			// We failed to start. Delay and then try again later.
			// Don't worry about liveness, as we can't be live until we've started.
			probe.UpdateStatusFromContext(ctx, "message-bus", probe.ServiceStatusNotReady)
			log.Infow("error-starting-kafka-messaging-proxy", log.Fields{"error": err})
			time.Sleep(time.Duration(startupRetryInterval) * time.Second)
		} else {
			// We started. We only need to do this once.
			// Next we'll fall through and start checking liveness.
			log.Infow("started-kafka-proxy", log.Fields{})

			// cannot do this until after the kmp is started
			if err := core.registerAdapterRequestHandlers(ctx, core.instanceId, core.deviceMgr, core.logicalDeviceMgr, core.adapterMgr, core.clusterDataProxy, core.localDataProxy); err != nil {
				log.Fatal("Failure-registering-adapterRequestHandler")
			}

			started = true
		}
	}

	log.Info("started-kafka-message-proxy")

	livenessChannel := core.kmp.EnableLivenessChannel(true)

	log.Info("enabled-kafka-liveness-channel")

	timeout := time.Duration(liveProbeInterval) * time.Second
	for {
		// A fresh timer each iteration; its duration depends on the current
		// liveness state (short when not live, long when live).
		timeoutTimer := time.NewTimer(timeout)
		select {
		case liveness := <-livenessChannel:
			log.Infow("kafka-manager-thread-liveness-event", log.Fields{"liveness": liveness})
			// there was a state change in Kafka liveness
			if !liveness {
				probe.UpdateStatusFromContext(ctx, "message-bus", probe.ServiceStatusNotReady)

				if core.grpcServer != nil {
					log.Info("kafka-manager-thread-set-server-notready")
				}

				// retry frequently while life is bad
				timeout = time.Duration(notLiveProbeInterval) * time.Second
			} else {
				probe.UpdateStatusFromContext(ctx, "message-bus", probe.ServiceStatusRunning)

				if core.grpcServer != nil {
					log.Info("kafka-manager-thread-set-server-ready")
				}

				// retry infrequently while life is good
				timeout = time.Duration(liveProbeInterval) * time.Second
			}
			// Stop the unfired timer and drain its channel if it already
			// fired, so the next iteration starts from a clean timer.
			if !timeoutTimer.Stop() {
				<-timeoutTimer.C
			}
		case <-timeoutTimer.C:
			log.Info("kafka-proxy-liveness-recheck")
			// send the liveness probe in a goroutine; we don't want to deadlock ourselves as
			// the liveness probe may wait (and block) writing to our channel.
			go func() {
				err := core.kmp.SendLiveness()
				if err != nil {
					// Catch possible error case if sending liveness after Sarama has been stopped.
					log.Warnw("error-kafka-send-liveness", log.Fields{"error": err})
				}
			}()
		}
	}
}
334
khenaidoob3244212019-08-27 14:32:27 -0400335// waitUntilKVStoreReachableOrMaxTries will wait until it can connect to a KV store or until maxtries has been reached
336func (core *Core) waitUntilKVStoreReachableOrMaxTries(ctx context.Context, maxRetries int, retryInterval int) error {
337 log.Infow("verifying-KV-store-connectivity", log.Fields{"host": core.config.KVStoreHost,
338 "port": core.config.KVStorePort, "retries": maxRetries, "retryInterval": retryInterval})
339 // Get timeout in seconds with 1 second set as minimum
340 timeout := int(core.config.DefaultCoreTimeout / 1000)
341 if timeout < 1 {
342 timeout = 1
343 }
344 count := 0
345 for {
346 if !core.kvClient.IsConnectionUp(timeout) {
347 log.Info("KV-store-unreachable")
348 if maxRetries != -1 {
349 if count >= maxRetries {
350 return status.Error(codes.Unavailable, "kv store unreachable")
351 }
352 }
353 count += 1
354 // Take a nap before retrying
355 time.Sleep(time.Duration(retryInterval) * time.Second)
356 log.Infow("retry-KV-store-connectivity", log.Fields{"retryCount": count, "maxRetries": maxRetries, "retryInterval": retryInterval})
357
358 } else {
359 break
360 }
361 }
362 log.Info("KV-store-reachable")
363 return nil
364}
365
khenaidoo54e0ddf2019-02-27 16:21:33 -0500366func (core *Core) registerAdapterRequestHandlers(ctx context.Context, coreInstanceId string, dMgr *DeviceManager,
khenaidoo297cd252019-02-07 22:10:23 -0500367 ldMgr *LogicalDeviceManager, aMgr *AdapterManager, cdProxy *model.Proxy, ldProxy *model.Proxy,
khenaidoo54e0ddf2019-02-27 16:21:33 -0500368) error {
Richard Jankowski199fd862019-03-18 14:49:51 -0400369 requestProxy := NewAdapterRequestHandlerProxy(core, coreInstanceId, dMgr, ldMgr, aMgr, cdProxy, ldProxy,
khenaidoo297cd252019-02-07 22:10:23 -0500370 core.config.InCompetingMode, core.config.LongRunningRequestTimeout, core.config.DefaultRequestTimeout)
khenaidoob9203542018-09-17 22:56:37 -0400371
khenaidoo54e0ddf2019-02-27 16:21:33 -0500372 // Register the broadcast topic to handle any core-bound broadcast requests
373 if err := core.kmp.SubscribeWithRequestHandlerInterface(kafka.Topic{Name: core.config.CoreTopic}, requestProxy); err != nil {
374 log.Fatalw("Failed-registering-broadcast-handler", log.Fields{"topic": core.config.CoreTopic})
375 return err
376 }
377
Kent Hagermana6d0c362019-07-30 12:50:21 -0400378 // Register the core-pair topic to handle core-bound requests destined to the core pair
379 if err := core.kmp.SubscribeWithDefaultRequestHandler(kafka.Topic{Name: core.config.CorePairTopic}, kafka.OffsetNewest); err != nil {
380 log.Fatalw("Failed-registering-pair-handler", log.Fields{"topic": core.config.CorePairTopic})
381 return err
382 }
383
khenaidoo54e0ddf2019-02-27 16:21:33 -0500384 log.Info("request-handler-registered")
khenaidoob9203542018-09-17 22:56:37 -0400385 return nil
386}
387
// startDeviceManager starts the device manager, passing it the logical device
// manager it collaborates with. Invoked as a goroutine from Start.
func (core *Core) startDeviceManager(ctx context.Context) {
	log.Info("DeviceManager-Starting...")
	core.deviceMgr.start(ctx, core.logicalDeviceMgr)
	log.Info("DeviceManager-Started")
}
393
// startLogicalDeviceManager starts the logical device manager. Invoked as a
// goroutine from Start.
func (core *Core) startLogicalDeviceManager(ctx context.Context) {
	log.Info("Logical-DeviceManager-Starting...")
	core.logicalDeviceMgr.start(ctx)
	log.Info("Logical-DeviceManager-Started")
}
khenaidoo21d51152019-02-01 13:48:37 -0500399
// startAdapterManager starts the adapter manager. Invoked as a goroutine from
// Start.
func (core *Core) startAdapterManager(ctx context.Context) {
	log.Info("Adapter-Manager-Starting...")
	core.adapterMgr.start(ctx)
	log.Info("Adapter-Manager-Started")
}