/*
 * Copyright 2018-present Open Networking Foundation

 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at

 * http://www.apache.org/licenses/LICENSE-2.0

 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package core

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/opencord/voltha-go/db/model"
	"github.com/opencord/voltha-go/rw_core/config"
	"github.com/opencord/voltha-lib-go/v3/pkg/db"
	"github.com/opencord/voltha-lib-go/v3/pkg/db/kvstore"
	grpcserver "github.com/opencord/voltha-lib-go/v3/pkg/grpc"
	"github.com/opencord/voltha-lib-go/v3/pkg/kafka"
	"github.com/opencord/voltha-lib-go/v3/pkg/log"
	"github.com/opencord/voltha-lib-go/v3/pkg/probe"
	"github.com/opencord/voltha-protos/v3/go/voltha"
	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// Core represents the read/write core attributes
type Core struct {
	instanceID        string
	deviceMgr         *DeviceManager
	logicalDeviceMgr  *LogicalDeviceManager
	grpcServer        *grpcserver.GrpcServer
	grpcNBIAPIHandler *APIHandler
	adapterMgr        *AdapterManager
	config            *config.RWCoreFlags
	kmp               kafka.InterContainerProxy
	clusterDataRoot   model.Root
	localDataRoot     model.Root
	clusterDataProxy  *model.Proxy
	localDataProxy    *model.Proxy
	exitChannel       chan struct{}
	stopOnce          sync.Once
	kvClient          kvstore.Client
	backend           db.Backend
	kafkaClient       kafka.Client
	deviceOwnership   *DeviceOwnership
}

// NewCore creates an instance of the rw core
func NewCore(ctx context.Context, id string, cf *config.RWCoreFlags, kvClient kvstore.Client, kafkaClient kafka.Client) *Core {
	var core Core
	core.instanceID = id
	core.exitChannel = make(chan struct{})
	core.config = cf
	core.kvClient = kvClient
	core.kafkaClient = kafkaClient

	// Configure the backend to push a liveness status at least every (cf.LiveProbeInterval / 2) seconds,
	// so that a liveness check (due to a liveness timeout) is not triggered while the backend is alive
	livenessChannelInterval := cf.LiveProbeInterval / 2

	// Setup the KV store
	core.backend = db.Backend{
		Client:                  kvClient,
		StoreType:               cf.KVStoreType,
		Host:                    cf.KVStoreHost,
		Port:                    cf.KVStorePort,
		Timeout:                 cf.KVStoreTimeout,
		LivenessChannelInterval: livenessChannelInterval,
		PathPrefix:              cf.KVStoreDataPrefix}
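	// Two data model roots are kept: clusterDataRoot is rooted at the voltha.Voltha
	// model (the cluster-wide view), while localDataRoot is rooted at voltha.CoreInstance
	// (data specific to this core instance); both are persisted through the same backend.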
	core.clusterDataRoot = model.NewRoot(&voltha.Voltha{}, &core.backend)
	core.localDataRoot = model.NewRoot(&voltha.CoreInstance{}, &core.backend)
	return &core
}
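
// Illustrative wiring (a sketch, not part of this file): the rw_core main is
// assumed to build the KV-store and Kafka clients from the parsed CLI flags and
// then drive the Core roughly like this:
//
//	core := NewCore(ctx, instanceID, cf, kvClient, kafkaClient)
//	if err := core.Start(ctx); err != nil {
//		// handle startup failure
//	}
//	defer core.Stop(ctx)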

// Start brings up core services
func (core *Core) Start(ctx context.Context) error {

	// If the context has a probe then fetch it and register our services
	var p *probe.Probe
	if value := ctx.Value(probe.ProbeContextKey); value != nil {
		if _, ok := value.(*probe.Probe); ok {
			p = value.(*probe.Probe)
			p.RegisterService(
				"message-bus",
				"kv-store",
				"device-manager",
				"logical-device-manager",
				"adapter-manager",
				"grpc-service",
			)
		}
	}
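	// Each service registered above is tracked individually; its status is updated
	// below as the corresponding component comes up, and the probe is assumed to fold
	// these statuses into the overall readiness that rw_core reports.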

	logger.Info("starting-core-services", log.Fields{"coreId": core.instanceID})

	// Wait until connection to KV Store is up
	if err := core.waitUntilKVStoreReachableOrMaxTries(ctx, core.config.MaxConnectionRetries, core.config.ConnectionRetryInterval); err != nil {
		logger.Fatal("Unable-to-connect-to-KV-store")
	}
	if p != nil {
		p.UpdateStatus("kv-store", probe.ServiceStatusRunning)
	}
	var err error

	core.clusterDataProxy, err = core.clusterDataRoot.CreateProxy(ctx, "/", false)
	if err != nil {
		probe.UpdateStatusFromContext(ctx, "kv-store", probe.ServiceStatusNotReady)
		return fmt.Errorf("Failed to create cluster data proxy")
	}
	core.localDataProxy, err = core.localDataRoot.CreateProxy(ctx, "/", false)
	if err != nil {
		probe.UpdateStatusFromContext(ctx, "kv-store", probe.ServiceStatusNotReady)
		return fmt.Errorf("Failed to create local data proxy")
	}

	// core.kmp must be created before deviceMgr and adapterMgr, as they will make
	// private copies of the pointer to core.kmp.
	core.initKafkaManager(ctx)

	logger.Debugw("values", log.Fields{"kmp": core.kmp})
	core.deviceMgr = newDeviceManager(core)
	core.adapterMgr = newAdapterManager(core.clusterDataProxy, core.instanceID, core.kafkaClient, core.deviceMgr)
	core.deviceMgr.adapterMgr = core.adapterMgr
	core.logicalDeviceMgr = newLogicalDeviceManager(core, core.deviceMgr, core.kmp, core.clusterDataProxy, core.config.DefaultCoreTimeout)

	// Start the KafkaManager. This must be done after the deviceMgr, adapterMgr, and
	// logicalDeviceMgr have been created, as once the kmp is started, it will register
	// the above with the kmp.

	go core.startKafkaManager(ctx,
		core.config.ConnectionRetryInterval,
		core.config.LiveProbeInterval,
		core.config.NotLiveProbeInterval)

	go core.startDeviceManager(ctx)
	go core.startLogicalDeviceManager(ctx)
	go core.startGRPCService(ctx)
	go core.startAdapterManager(ctx)
	go core.monitorKvstoreLiveness(ctx)

	// Setup device ownership context
	core.deviceOwnership = NewDeviceOwnership(core.instanceID, core.kvClient, core.deviceMgr, core.logicalDeviceMgr,
		"service/voltha/owns_device", 10)

	logger.Info("core-services-started")
	return nil
}

// Stop brings down core services
func (core *Core) Stop(ctx context.Context) {
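	// stopOnce makes Stop idempotent: only the first call runs the shutdown sequence,
	// any subsequent calls return immediately.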
	core.stopOnce.Do(func() {
		logger.Info("stopping-adaptercore")
		// Signal to the KVStoreMonitor that we are stopping.
		close(core.exitChannel)
		// Stop all the started services
		if core.grpcServer != nil {
			core.grpcServer.Stop()
		}
		if core.logicalDeviceMgr != nil {
			core.logicalDeviceMgr.stop(ctx)
		}
		if core.deviceMgr != nil {
			core.deviceMgr.stop(ctx)
		}
		if core.kmp != nil {
			core.kmp.Stop()
		}
		logger.Info("adaptercore-stopped")
	})
}

// startGRPCService creates the grpc service handlers, registers them with the grpc server and starts the server
func (core *Core) startGRPCService(ctx context.Context) {
	// create an insecure grpc server
	core.grpcServer = grpcserver.NewGrpcServer(core.config.GrpcHost, core.config.GrpcPort, nil, false, probe.GetProbeFromContext(ctx))
	logger.Info("grpc-server-created")

	core.grpcNBIAPIHandler = NewAPIHandler(core)
	logger.Infow("grpc-handler", log.Fields{"core_binding_key": core.config.CoreBindingKey})
	core.logicalDeviceMgr.setGrpcNbiHandler(core.grpcNBIAPIHandler)
	// Create a function to register the core GRPC service with the GRPC server
	f := func(gs *grpc.Server) {
		voltha.RegisterVolthaServiceServer(
			gs,
			core.grpcNBIAPIHandler,
		)
	}

	core.grpcServer.AddService(f)
	logger.Info("grpc-service-added")

	/*
	 * Start the GRPC server
	 *
	 * This is a bit sub-optimal here as the grpcServer.Start call does not return (blocks)
	 * until something fails, but we want to send a "start" status update. As written this
	 * means that we are actually sending the "start" status update before the server is
	 * started, which means it is possible that the status is "running" before it actually is.
	 *
	 * This means that there is a small window in which the core could return its status as
	 * ready, when it really isn't.
	 */
	probe.UpdateStatusFromContext(ctx, "grpc-service", probe.ServiceStatusRunning)
	logger.Info("grpc-server-started")
	core.grpcServer.Start(ctx)
	probe.UpdateStatusFromContext(ctx, "grpc-service", probe.ServiceStatusStopped)
}

// Initialize the kafka manager, but we will start it later
func (core *Core) initKafkaManager(ctx context.Context) {
	logger.Infow("initialize-kafka-manager", log.Fields{"host": core.config.KafkaAdapterHost,
		"port": core.config.KafkaAdapterPort, "topic": core.config.CoreTopic})

	probe.UpdateStatusFromContext(ctx, "message-bus", probe.ServiceStatusPreparing)

	// create the proxy
	core.kmp = kafka.NewInterContainerProxy(
		kafka.InterContainerHost(core.config.KafkaAdapterHost),
		kafka.InterContainerPort(core.config.KafkaAdapterPort),
		kafka.MsgClient(core.kafkaClient),
		kafka.DefaultTopic(&kafka.Topic{Name: core.config.CoreTopic}),
		kafka.DeviceDiscoveryTopic(&kafka.Topic{Name: core.config.AffinityRouterTopic}))

	probe.UpdateStatusFromContext(ctx, "message-bus", probe.ServiceStatusPrepared)
}

/*
 * KafkaMonitorThread
 *
 * Responsible for starting the Kafka Interadapter Proxy and monitoring its liveness
 * state.
 *
 * Any producer that fails to send will cause KafkaInterContainerProxy to
 * post a false event on its liveness channel. Any producer that succeeds in sending
 * will cause KafkaInterContainerProxy to post a true event on its liveness
 * channel. Group receivers also update liveness state, and a receiver will typically
 * indicate a loss of liveness within 3-5 seconds of Kafka going down. Receivers
 * only indicate restoration of liveness if a message is received. During normal
 * operation, messages will be routinely produced and received, automatically
 * indicating liveness state. These routine liveness indications are rate-limited
 * inside sarama_client.
 *
 * This thread monitors the status of KafkaInterContainerProxy's liveness and pushes
 * that state to the core's readiness probes. If no liveness event has been seen
 * within a timeout, then the thread will make an attempt to produce a "liveness"
 * message, which will in turn trigger a liveness event on the liveness channel, true
 * or false depending on whether the attempt succeeded.
 *
 * The gRPC server in turn monitors the state of the readiness probe and will
 * start issuing UNAVAILABLE responses while the probe is not ready.
 *
 * startupRetryInterval -- interval between attempts to start
 * liveProbeInterval -- interval between liveness checks when in a live state
 * notLiveProbeInterval -- interval between liveness checks when in a notLive state
 *
 * liveProbeInterval and notLiveProbeInterval can be configured separately,
 * though the current default is that both are set to 60 seconds.
 */

func (core *Core) startKafkaManager(ctx context.Context, startupRetryInterval time.Duration, liveProbeInterval time.Duration, notLiveProbeInterval time.Duration) {
	logger.Infow("starting-kafka-manager-thread", log.Fields{"host": core.config.KafkaAdapterHost,
		"port": core.config.KafkaAdapterPort, "topic": core.config.CoreTopic})

	started := false
	for !started {
		// If we haven't started yet, then try to start
		logger.Infow("starting-kafka-proxy", log.Fields{})
		if err := core.kmp.Start(); err != nil {
			// We failed to start. Delay and then try again later.
			// Don't worry about liveness, as we can't be live until we've started.
			probe.UpdateStatusFromContext(ctx, "message-bus", probe.ServiceStatusNotReady)
			logger.Infow("error-starting-kafka-messaging-proxy", log.Fields{"error": err})
			time.Sleep(startupRetryInterval)
		} else {
			// We started. We only need to do this once.
			// Next we'll fall through and start checking liveness.
			logger.Infow("started-kafka-proxy", log.Fields{})

			// cannot do this until after the kmp is started
			if err := core.registerAdapterRequestHandlers(ctx, core.instanceID, core.deviceMgr, core.logicalDeviceMgr, core.adapterMgr, core.clusterDataProxy, core.localDataProxy); err != nil {
				logger.Fatal("Failure-registering-adapterRequestHandler")
			}

			started = true
		}
	}

	logger.Info("started-kafka-message-proxy")

	livenessChannel := core.kmp.EnableLivenessChannel(true)

	logger.Info("enabled-kafka-liveness-channel")

	timeout := liveProbeInterval
	for {
		timeoutTimer := time.NewTimer(timeout)
		select {
		case liveness := <-livenessChannel:
			logger.Infow("kafka-manager-thread-liveness-event", log.Fields{"liveness": liveness})
			// there was a state change in Kafka liveness
			if !liveness {
				probe.UpdateStatusFromContext(ctx, "message-bus", probe.ServiceStatusNotReady)

				if core.grpcServer != nil {
					logger.Info("kafka-manager-thread-set-server-notready")
				}

				// retry frequently while life is bad
				timeout = notLiveProbeInterval
			} else {
				probe.UpdateStatusFromContext(ctx, "message-bus", probe.ServiceStatusRunning)

				if core.grpcServer != nil {
					logger.Info("kafka-manager-thread-set-server-ready")
				}

				// retry infrequently while life is good
				timeout = liveProbeInterval
			}
			if !timeoutTimer.Stop() {
				<-timeoutTimer.C
			}
		case <-timeoutTimer.C:
			logger.Info("kafka-proxy-liveness-recheck")
			// send the liveness probe in a goroutine; we don't want to deadlock ourselves as
			// the liveness probe may wait (and block) writing to our channel.
			go func() {
				err := core.kmp.SendLiveness()
				if err != nil {
					// Catch possible error case if sending liveness after Sarama has been stopped.
					logger.Warnw("error-kafka-send-liveness", log.Fields{"error": err})
				}
			}()
		}
	}
}

// waitUntilKVStoreReachableOrMaxTries will wait until it can connect to the KV store or until maxRetries has been reached
func (core *Core) waitUntilKVStoreReachableOrMaxTries(ctx context.Context, maxRetries int, retryInterval time.Duration) error {
	logger.Infow("verifying-KV-store-connectivity", log.Fields{"host": core.config.KVStoreHost,
		"port": core.config.KVStorePort, "retries": maxRetries, "retryInterval": retryInterval})
	count := 0
	for {
		if !core.kvClient.IsConnectionUp(ctx) {
			logger.Info("KV-store-unreachable")
			if maxRetries != -1 {
				if count >= maxRetries {
					return status.Error(codes.Unavailable, "kv store unreachable")
				}
			}
			count++
			// Take a nap before retrying
			time.Sleep(retryInterval)
			logger.Infow("retry-KV-store-connectivity", log.Fields{"retryCount": count, "maxRetries": maxRetries, "retryInterval": retryInterval})

		} else {
			break
		}
	}
	logger.Info("KV-store-reachable")
	return nil
}

func (core *Core) registerAdapterRequestHandlers(ctx context.Context, coreInstanceID string, dMgr *DeviceManager,
	ldMgr *LogicalDeviceManager, aMgr *AdapterManager, cdProxy *model.Proxy, ldProxy *model.Proxy,
) error {
	requestProxy := NewAdapterRequestHandlerProxy(core, coreInstanceID, dMgr, ldMgr, aMgr, cdProxy, ldProxy,
		core.config.InCompetingMode, core.config.LongRunningRequestTimeout, core.config.DefaultRequestTimeout)

	// Register the broadcast topic to handle any core-bound broadcast requests
	if err := core.kmp.SubscribeWithRequestHandlerInterface(kafka.Topic{Name: core.config.CoreTopic}, requestProxy); err != nil {
		logger.Fatalw("Failed-registering-broadcast-handler", log.Fields{"topic": core.config.CoreTopic})
		return err
	}

	// Register the core-pair topic to handle core-bound requests destined to the core pair
	if err := core.kmp.SubscribeWithDefaultRequestHandler(kafka.Topic{Name: core.config.CorePairTopic}, kafka.OffsetNewest); err != nil {
		logger.Fatalw("Failed-registering-pair-handler", log.Fields{"topic": core.config.CorePairTopic})
		return err
	}

	logger.Info("request-handler-registered")
	return nil
}

func (core *Core) startDeviceManager(ctx context.Context) {
	logger.Info("DeviceManager-Starting...")
	core.deviceMgr.start(ctx, core.logicalDeviceMgr)
	logger.Info("DeviceManager-Started")
}

func (core *Core) startLogicalDeviceManager(ctx context.Context) {
	logger.Info("Logical-DeviceManager-Starting...")
	core.logicalDeviceMgr.start(ctx)
	logger.Info("Logical-DeviceManager-Started")
}

func (core *Core) startAdapterManager(ctx context.Context) {
	logger.Info("Adapter-Manager-Starting...")
	err := core.adapterMgr.start(ctx)
	if err != nil {
		logger.Fatalf("failed-to-start-adapter-manager: error %v ", err)
	}
	logger.Info("Adapter-Manager-Started")
}

/*
 * Thread to monitor kvstore Liveness (connection status)
 *
 * This function constantly monitors Liveness State of kvstore as reported
 * periodically by backend and updates the Status of kv-store service registered
 * with rw_core probe.
 *
 * If no liveness event has been seen within a timeout, then the thread will
 * perform a "liveness" check attempt, which will in turn trigger a liveness event on
 * the liveness channel, true or false depending on whether the attempt succeeded.
 *
 * The gRPC server in turn monitors the state of the readiness probe and will
 * start issuing UNAVAILABLE response while the probe is not ready.
 */
func (core *Core) monitorKvstoreLiveness(ctx context.Context) {
	logger.Info("start-monitoring-kvstore-liveness")

	// Instruct backend to create Liveness channel for transporting state updates
	livenessChannel := core.backend.EnableLivenessChannel()

	logger.Debug("enabled-kvstore-liveness-channel")

	// Default state for kvstore is alive for rw_core
	timeout := core.config.LiveProbeInterval
loop:
	for {
		timeoutTimer := time.NewTimer(timeout)
		select {

		case liveness := <-livenessChannel:
			logger.Debugw("received-liveness-change-notification", log.Fields{"liveness": liveness})

			if !liveness {
				probe.UpdateStatusFromContext(ctx, "kv-store", probe.ServiceStatusNotReady)

				if core.grpcServer != nil {
					logger.Info("kvstore-set-server-notready")
				}

				timeout = core.config.NotLiveProbeInterval

			} else {
				probe.UpdateStatusFromContext(ctx, "kv-store", probe.ServiceStatusRunning)

				if core.grpcServer != nil {
					logger.Info("kvstore-set-server-ready")
				}

				timeout = core.config.LiveProbeInterval
			}

			if !timeoutTimer.Stop() {
				<-timeoutTimer.C
			}

		case <-core.exitChannel:
			break loop

		case <-timeoutTimer.C:
			logger.Info("kvstore-perform-liveness-check-on-timeout")

			// Trigger Liveness check if no liveness update received within the timeout period.
			// The Liveness check will push Live state to same channel which this routine is
			// reading and processing. Thus, do it asynchronously to avoid blocking for the
			// backend response and to avoid any possibility of deadlock
			go core.backend.PerformLivenessCheck(ctx)
		}
	}
}