blob: 0b541d2641adc8e82fd3fa99d8c202e9de5a3045 [file] [log] [blame]
khenaidoo5fc5cea2021-08-11 17:39:16 -04001/*
2 *
3 * Copyright 2018 gRPC authors.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 */
18
19package channelz
20
21import (
22 "net"
23 "sync"
24 "sync/atomic"
25 "time"
26
27 "google.golang.org/grpc/connectivity"
28 "google.golang.org/grpc/credentials"
29)
30
// entry represents a node in the channelz database. The channel, subChannel,
// listenSocket, normalSocket, server and dummyEntry types in this file all
// provide these methods.
type entry interface {
	// addChild adds the child e, whose channelz id is id, to the child list.
	addChild(id int64, e entry)
	// deleteChild removes the child whose channelz id is id from the child list.
	deleteChild(id int64)
	// triggerDelete tries to delete the entry from the channelz database. If the
	// child list is not empty, deletion from the database is put on hold until
	// the last child has been deleted from the database.
	triggerDelete()
	// deleteSelfIfReady checks whether triggerDelete() has been called before and
	// whether the child list is now empty. If both conditions are met, the entry
	// deletes itself from the database.
	deleteSelfIfReady()
	// getParentID returns the parent ID of the entry. A parent ID of 0 means the
	// entry has no parent.
	getParentID() int64
}
47
48// dummyEntry is a fake entry to handle entry not found case.
49type dummyEntry struct {
50 idNotFound int64
51}
52
53func (d *dummyEntry) addChild(id int64, e entry) {
54 // Note: It is possible for a normal program to reach here under race condition.
55 // For example, there could be a race between ClientConn.Close() info being propagated
56 // to addrConn and http2Client. ClientConn.Close() cancel the context and result
57 // in http2Client to error. The error info is then caught by transport monitor
58 // and before addrConn.tearDown() is called in side ClientConn.Close(). Therefore,
59 // the addrConn will create a new transport. And when registering the new transport in
60 // channelz, its parent addrConn could have already been torn down and deleted
61 // from channelz tracking, and thus reach the code here.
62 logger.Infof("attempt to add child of type %T with id %d to a parent (id=%d) that doesn't currently exist", e, id, d.idNotFound)
63}
64
65func (d *dummyEntry) deleteChild(id int64) {
66 // It is possible for a normal program to reach here under race condition.
67 // Refer to the example described in addChild().
68 logger.Infof("attempt to delete child with id %d from a parent (id=%d) that doesn't currently exist", id, d.idNotFound)
69}
70
71func (d *dummyEntry) triggerDelete() {
72 logger.Warningf("attempt to delete an entry (id=%d) that doesn't currently exist", d.idNotFound)
73}
74
75func (*dummyEntry) deleteSelfIfReady() {
76 // code should not reach here. deleteSelfIfReady is always called on an existing entry.
77}
78
79func (*dummyEntry) getParentID() int64 {
80 return 0
81}
82
// ChannelMetric defines the info channelz provides for a specific Channel, which
// includes ChannelInternalMetric and channelz-specific data, such as channelz id,
// child list, etc.
type ChannelMetric struct {
	// ID is the channelz id of this channel.
	ID int64
	// RefName is the human readable reference string of this channel.
	RefName string
	// ChannelData contains the channel internal metric reported by the channel
	// through ChannelzMetric().
	ChannelData *ChannelInternalMetric
	// NestedChans tracks the nested channel type children of this channel in the
	// format of a map from nested channel channelz id to corresponding reference
	// string.
	NestedChans map[int64]string
	// SubChans tracks the subchannel type children of this channel in the format
	// of a map from subchannel channelz id to corresponding reference string.
	SubChans map[int64]string
	// Sockets tracks the socket type children of this channel in the format of a
	// map from socket channelz id to corresponding reference string.
	// Note: the current grpc implementation doesn't allow a channel to have
	// sockets directly, therefore this field is unused.
	Sockets map[int64]string
	// Trace contains the most recent traced events.
	Trace *ChannelTrace
}
108
// SubChannelMetric defines the info channelz provides for a specific SubChannel,
// which includes ChannelInternalMetric and channelz-specific data, such as
// channelz id, child list, etc.
type SubChannelMetric struct {
	// ID is the channelz id of this subchannel.
	ID int64
	// RefName is the human readable reference string of this subchannel.
	RefName string
	// ChannelData contains the subchannel internal metric reported by the
	// subchannel through ChannelzMetric().
	ChannelData *ChannelInternalMetric
	// NestedChans tracks the nested channel type children of this subchannel in
	// the format of a map from nested channel channelz id to corresponding
	// reference string.
	// Note: the current grpc implementation doesn't allow a subchannel to have
	// nested channels as children, therefore this field is unused.
	NestedChans map[int64]string
	// SubChans tracks the subchannel type children of this subchannel in the
	// format of a map from subchannel channelz id to corresponding reference
	// string.
	// Note: the current grpc implementation doesn't allow a subchannel to have
	// subchannels as children, therefore this field is unused.
	SubChans map[int64]string
	// Sockets tracks the socket type children of this subchannel in the format of
	// a map from socket channelz id to corresponding reference string.
	Sockets map[int64]string
	// Trace contains the most recent traced events.
	Trace *ChannelTrace
}
136
// ChannelInternalMetric defines the struct that the implementor of the Channel
// interface should return from ChannelzMetric().
type ChannelInternalMetric struct {
	// The current connectivity state of the channel.
	State connectivity.State
	// The target this channel originally tried to connect to. May be absent.
	Target string
	// The number of calls started on the channel.
	CallsStarted int64
	// The number of calls that have completed with an OK status.
	CallsSucceeded int64
	// The number of calls that have completed with a non-OK status.
	CallsFailed int64
	// The last time a call was started on the channel.
	LastCallStartedTimestamp time.Time
}
153
// ChannelTrace stores traced events on a channel/subchannel and related info.
type ChannelTrace struct {
	// EventNum is the number of events that ever got traced (i.e. including
	// those that have been deleted).
	EventNum int64
	// CreationTime is the creation time of the trace.
	CreationTime time.Time
	// Events stores the most recent trace events (up to $maxTraceEntry; a newer
	// event will overwrite the oldest one).
	Events []*TraceEvent
}
164
// TraceEvent represents a single trace event.
type TraceEvent struct {
	// Desc is a simple description of the trace event.
	Desc string
	// Severity states the severity of this trace event.
	Severity Severity
	// Timestamp is the event time.
	Timestamp time.Time
	// RefID is the id of the entity that gets referenced in the event. RefID is 0
	// if no other entity is involved in this event.
	// e.g. SubChannel (id: 4[]) Created. --> RefID = 4, RefName = "" (inside [])
	RefID int64
	// RefName is the reference name for the entity that gets referenced in the
	// event.
	RefName string
	// RefType indicates the referenced entity type, i.e. Channel or SubChannel.
	RefType RefChannelType
}
182
// Channel is the interface that should be satisfied in order to be tracked by
// channelz as Channel or SubChannel.
type Channel interface {
	// ChannelzMetric returns the internal metric of the channel or subchannel.
	ChannelzMetric() *ChannelInternalMetric
}
188
189type dummyChannel struct{}
190
191func (d *dummyChannel) ChannelzMetric() *ChannelInternalMetric {
192 return &ChannelInternalMetric{}
193}
194
// channel is the channelz database entry for a channel.
type channel struct {
	// refName is the reference name returned by getRefName.
	refName string
	// c is the tracked Channel; deleteSelfFromMap replaces it with a
	// dummyChannel when the entry has to outlive the grpc object.
	c Channel
	// closeCalled is set by triggerDelete.
	closeCalled bool
	// nestedChans maps the channelz id of each nested channel child to its
	// reference name.
	nestedChans map[int64]string
	// subChans maps the channelz id of each subchannel child to its reference
	// name.
	subChans map[int64]string
	// id is the channelz id of this channel.
	id int64
	// pid is the channelz id of the parent; 0 for a top channel.
	pid int64
	// cm is the channelMap used to look up and delete entries.
	cm *channelMap
	// trace holds the trace events of this channel.
	trace *channelTrace
	// traceRefCount is the number of trace events that reference this channel.
	// Non-zero traceRefCount means the trace of this channel cannot be deleted.
	// It is read and written through sync/atomic.
	traceRefCount int32
}
209
210func (c *channel) addChild(id int64, e entry) {
211 switch v := e.(type) {
212 case *subChannel:
213 c.subChans[id] = v.refName
214 case *channel:
215 c.nestedChans[id] = v.refName
216 default:
217 logger.Errorf("cannot add a child (id = %d) of type %T to a channel", id, e)
218 }
219}
220
221func (c *channel) deleteChild(id int64) {
222 delete(c.subChans, id)
223 delete(c.nestedChans, id)
224 c.deleteSelfIfReady()
225}
226
227func (c *channel) triggerDelete() {
228 c.closeCalled = true
229 c.deleteSelfIfReady()
230}
231
232func (c *channel) getParentID() int64 {
233 return c.pid
234}
235
236// deleteSelfFromTree tries to delete the channel from the channelz entry relation tree, which means
237// deleting the channel reference from its parent's child list.
238//
239// In order for a channel to be deleted from the tree, it must meet the criteria that, removal of the
240// corresponding grpc object has been invoked, and the channel does not have any children left.
241//
242// The returned boolean value indicates whether the channel has been successfully deleted from tree.
243func (c *channel) deleteSelfFromTree() (deleted bool) {
244 if !c.closeCalled || len(c.subChans)+len(c.nestedChans) != 0 {
245 return false
246 }
247 // not top channel
248 if c.pid != 0 {
249 c.cm.findEntry(c.pid).deleteChild(c.id)
250 }
251 return true
252}
253
254// deleteSelfFromMap checks whether it is valid to delete the channel from the map, which means
255// deleting the channel from channelz's tracking entirely. Users can no longer use id to query the
256// channel, and its memory will be garbage collected.
257//
258// The trace reference count of the channel must be 0 in order to be deleted from the map. This is
259// specified in the channel tracing gRFC that as long as some other trace has reference to an entity,
260// the trace of the referenced entity must not be deleted. In order to release the resource allocated
261// by grpc, the reference to the grpc object is reset to a dummy object.
262//
263// deleteSelfFromMap must be called after deleteSelfFromTree returns true.
264//
265// It returns a bool to indicate whether the channel can be safely deleted from map.
266func (c *channel) deleteSelfFromMap() (delete bool) {
267 if c.getTraceRefCount() != 0 {
268 c.c = &dummyChannel{}
269 return false
270 }
271 return true
272}
273
274// deleteSelfIfReady tries to delete the channel itself from the channelz database.
275// The delete process includes two steps:
Joey Armstrongba3d9d12024-01-15 14:22:11 -0500276// 1. delete the channel from the entry relation tree, i.e. delete the channel reference from its
277// parent's child list.
278// 2. delete the channel from the map, i.e. delete the channel entirely from channelz. Lookup by id
279// will return entry not found error.
khenaidoo5fc5cea2021-08-11 17:39:16 -0400280func (c *channel) deleteSelfIfReady() {
281 if !c.deleteSelfFromTree() {
282 return
283 }
284 if !c.deleteSelfFromMap() {
285 return
286 }
287 c.cm.deleteEntry(c.id)
288 c.trace.clear()
289}
290
291func (c *channel) getChannelTrace() *channelTrace {
292 return c.trace
293}
294
295func (c *channel) incrTraceRefCount() {
296 atomic.AddInt32(&c.traceRefCount, 1)
297}
298
299func (c *channel) decrTraceRefCount() {
300 atomic.AddInt32(&c.traceRefCount, -1)
301}
302
303func (c *channel) getTraceRefCount() int {
304 i := atomic.LoadInt32(&c.traceRefCount)
305 return int(i)
306}
307
308func (c *channel) getRefName() string {
309 return c.refName
310}
311
// subChannel is the channelz database entry for a subchannel.
type subChannel struct {
	// refName is the reference name returned by getRefName.
	refName string
	// c is the tracked Channel; deleteSelfFromMap replaces it with a
	// dummyChannel when the entry has to outlive the grpc object.
	c Channel
	// closeCalled is set by triggerDelete.
	closeCalled bool
	// sockets maps the channelz id of each socket child to its reference name.
	sockets map[int64]string
	// id is the channelz id of this subchannel.
	id int64
	// pid is the channelz id of the parent.
	pid int64
	// cm is the channelMap used to look up and delete entries.
	cm *channelMap
	// trace holds the trace events of this subchannel.
	trace *channelTrace
	// traceRefCount is the number of trace events that reference this
	// subchannel. It is read and written through sync/atomic.
	traceRefCount int32
}
323
324func (sc *subChannel) addChild(id int64, e entry) {
325 if v, ok := e.(*normalSocket); ok {
326 sc.sockets[id] = v.refName
327 } else {
328 logger.Errorf("cannot add a child (id = %d) of type %T to a subChannel", id, e)
329 }
330}
331
332func (sc *subChannel) deleteChild(id int64) {
333 delete(sc.sockets, id)
334 sc.deleteSelfIfReady()
335}
336
337func (sc *subChannel) triggerDelete() {
338 sc.closeCalled = true
339 sc.deleteSelfIfReady()
340}
341
342func (sc *subChannel) getParentID() int64 {
343 return sc.pid
344}
345
346// deleteSelfFromTree tries to delete the subchannel from the channelz entry relation tree, which
347// means deleting the subchannel reference from its parent's child list.
348//
349// In order for a subchannel to be deleted from the tree, it must meet the criteria that, removal of
350// the corresponding grpc object has been invoked, and the subchannel does not have any children left.
351//
352// The returned boolean value indicates whether the channel has been successfully deleted from tree.
353func (sc *subChannel) deleteSelfFromTree() (deleted bool) {
354 if !sc.closeCalled || len(sc.sockets) != 0 {
355 return false
356 }
357 sc.cm.findEntry(sc.pid).deleteChild(sc.id)
358 return true
359}
360
361// deleteSelfFromMap checks whether it is valid to delete the subchannel from the map, which means
362// deleting the subchannel from channelz's tracking entirely. Users can no longer use id to query
363// the subchannel, and its memory will be garbage collected.
364//
365// The trace reference count of the subchannel must be 0 in order to be deleted from the map. This is
366// specified in the channel tracing gRFC that as long as some other trace has reference to an entity,
367// the trace of the referenced entity must not be deleted. In order to release the resource allocated
368// by grpc, the reference to the grpc object is reset to a dummy object.
369//
370// deleteSelfFromMap must be called after deleteSelfFromTree returns true.
371//
372// It returns a bool to indicate whether the channel can be safely deleted from map.
373func (sc *subChannel) deleteSelfFromMap() (delete bool) {
374 if sc.getTraceRefCount() != 0 {
375 // free the grpc struct (i.e. addrConn)
376 sc.c = &dummyChannel{}
377 return false
378 }
379 return true
380}
381
382// deleteSelfIfReady tries to delete the subchannel itself from the channelz database.
383// The delete process includes two steps:
Joey Armstrongba3d9d12024-01-15 14:22:11 -0500384// 1. delete the subchannel from the entry relation tree, i.e. delete the subchannel reference from
385// its parent's child list.
386// 2. delete the subchannel from the map, i.e. delete the subchannel entirely from channelz. Lookup
387// by id will return entry not found error.
khenaidoo5fc5cea2021-08-11 17:39:16 -0400388func (sc *subChannel) deleteSelfIfReady() {
389 if !sc.deleteSelfFromTree() {
390 return
391 }
392 if !sc.deleteSelfFromMap() {
393 return
394 }
395 sc.cm.deleteEntry(sc.id)
396 sc.trace.clear()
397}
398
399func (sc *subChannel) getChannelTrace() *channelTrace {
400 return sc.trace
401}
402
403func (sc *subChannel) incrTraceRefCount() {
404 atomic.AddInt32(&sc.traceRefCount, 1)
405}
406
407func (sc *subChannel) decrTraceRefCount() {
408 atomic.AddInt32(&sc.traceRefCount, -1)
409}
410
411func (sc *subChannel) getTraceRefCount() int {
412 i := atomic.LoadInt32(&sc.traceRefCount)
413 return int(i)
414}
415
416func (sc *subChannel) getRefName() string {
417 return sc.refName
418}
419
// SocketMetric defines the info channelz provides for a specific Socket, which
// includes SocketInternalMetric and channelz-specific data, such as channelz
// id, etc.
type SocketMetric struct {
	// ID is the channelz id of this socket.
	ID int64
	// RefName is the human readable reference string of this socket.
	RefName string
	// SocketData contains the socket internal metric reported by the socket
	// through ChannelzMetric().
	SocketData *SocketInternalMetric
}
431
// SocketInternalMetric defines the struct that the implementor of the Socket
// interface should return from ChannelzMetric().
type SocketInternalMetric struct {
	// The number of streams that have been started.
	StreamsStarted int64
	// The number of streams that have ended successfully:
	// On client side, receiving frame with eos bit set.
	// On server side, sending frame with eos bit set.
	StreamsSucceeded int64
	// The number of streams that have ended unsuccessfully:
	// On client side, termination without receiving frame with eos bit set.
	// On server side, termination without sending frame with eos bit set.
	StreamsFailed int64
	// The number of messages successfully sent on this socket.
	MessagesSent int64
	// NOTE(review): undocumented in the original; presumably the number of
	// messages received on this socket, mirroring MessagesSent — confirm.
	MessagesReceived int64
	// The number of keep alives sent. This is typically implemented with HTTP/2
	// ping messages.
	KeepAlivesSent int64
	// The last time a stream was created by this endpoint. Usually unset for
	// servers.
	LastLocalStreamCreatedTimestamp time.Time
	// The last time a stream was created by the remote endpoint. Usually unset
	// for clients.
	LastRemoteStreamCreatedTimestamp time.Time
	// The last time a message was sent by this endpoint.
	LastMessageSentTimestamp time.Time
	// The last time a message was received by this endpoint.
	LastMessageReceivedTimestamp time.Time
	// The amount of window, granted to the local endpoint by the remote endpoint.
	// This may be slightly out of date due to network latency. This does NOT
	// include stream level or TCP level flow control info.
	LocalFlowControlWindow int64
	// The amount of window, granted to the remote endpoint by the local endpoint.
	// This may be slightly out of date due to network latency. This does NOT
	// include stream level or TCP level flow control info.
	RemoteFlowControlWindow int64
	// The locally bound address.
	LocalAddr net.Addr
	// The remote bound address. May be absent.
	RemoteAddr net.Addr
	// Optional, represents the name of the remote endpoint, if different than
	// the original target name.
	RemoteName string
	// NOTE(review): undocumented in the original; presumably the socket options
	// reported for this socket (see SocketOptionData) — confirm.
	SocketOptions *SocketOptionData
	// NOTE(review): undocumented in the original; presumably the security info
	// of the connection (see credentials.ChannelzSecurityValue) — confirm.
	Security credentials.ChannelzSecurityValue
}
479
// Socket is the interface that should be satisfied in order to be tracked by
// channelz as Socket.
type Socket interface {
	// ChannelzMetric returns the internal metric of the socket.
	ChannelzMetric() *SocketInternalMetric
}
485
486type listenSocket struct {
487 refName string
488 s Socket
489 id int64
490 pid int64
491 cm *channelMap
492}
493
494func (ls *listenSocket) addChild(id int64, e entry) {
495 logger.Errorf("cannot add a child (id = %d) of type %T to a listen socket", id, e)
496}
497
498func (ls *listenSocket) deleteChild(id int64) {
499 logger.Errorf("cannot delete a child (id = %d) from a listen socket", id)
500}
501
502func (ls *listenSocket) triggerDelete() {
503 ls.cm.deleteEntry(ls.id)
504 ls.cm.findEntry(ls.pid).deleteChild(ls.id)
505}
506
507func (ls *listenSocket) deleteSelfIfReady() {
508 logger.Errorf("cannot call deleteSelfIfReady on a listen socket")
509}
510
511func (ls *listenSocket) getParentID() int64 {
512 return ls.pid
513}
514
515type normalSocket struct {
516 refName string
517 s Socket
518 id int64
519 pid int64
520 cm *channelMap
521}
522
523func (ns *normalSocket) addChild(id int64, e entry) {
524 logger.Errorf("cannot add a child (id = %d) of type %T to a normal socket", id, e)
525}
526
527func (ns *normalSocket) deleteChild(id int64) {
528 logger.Errorf("cannot delete a child (id = %d) from a normal socket", id)
529}
530
531func (ns *normalSocket) triggerDelete() {
532 ns.cm.deleteEntry(ns.id)
533 ns.cm.findEntry(ns.pid).deleteChild(ns.id)
534}
535
536func (ns *normalSocket) deleteSelfIfReady() {
537 logger.Errorf("cannot call deleteSelfIfReady on a normal socket")
538}
539
540func (ns *normalSocket) getParentID() int64 {
541 return ns.pid
542}
543
// ServerMetric defines the info channelz provides for a specific Server, which
// includes ServerInternalMetric and channelz-specific data, such as channelz id,
// child list, etc.
type ServerMetric struct {
	// ID is the channelz id of this server.
	ID int64
	// RefName is the human readable reference string of this server.
	RefName string
	// ServerData contains the server internal metric reported by the server
	// through ChannelzMetric().
	ServerData *ServerInternalMetric
	// ListenSockets tracks the listener socket type children of this server in
	// the format of a map from socket channelz id to corresponding reference
	// string.
	ListenSockets map[int64]string
}
559
// ServerInternalMetric defines the struct that the implementor of the Server
// interface should return from ChannelzMetric().
type ServerInternalMetric struct {
	// The number of incoming calls started on the server.
	CallsStarted int64
	// The number of incoming calls that have completed with an OK status.
	CallsSucceeded int64
	// The number of incoming calls that have completed with a non-OK status.
	CallsFailed int64
	// The last time a call was started on the server.
	LastCallStartedTimestamp time.Time
}
572
// Server is the interface to be satisfied in order to be tracked by channelz as
// Server.
type Server interface {
	// ChannelzMetric returns the internal metric of the server.
	ChannelzMetric() *ServerInternalMetric
}
578
// server is the channelz database entry for a server.
type server struct {
	// refName is the reference name of this server.
	refName string
	// s is the tracked Server.
	s Server
	// closeCalled is set by triggerDelete.
	closeCalled bool
	// sockets maps the channelz id of each normal socket child to its reference
	// name.
	sockets map[int64]string
	// listenSockets maps the channelz id of each listen socket child to its
	// reference name.
	listenSockets map[int64]string
	// id is the channelz id of this server.
	id int64
	// cm is the channelMap used to delete entries.
	cm *channelMap
}
588
589func (s *server) addChild(id int64, e entry) {
590 switch v := e.(type) {
591 case *normalSocket:
592 s.sockets[id] = v.refName
593 case *listenSocket:
594 s.listenSockets[id] = v.refName
595 default:
596 logger.Errorf("cannot add a child (id = %d) of type %T to a server", id, e)
597 }
598}
599
600func (s *server) deleteChild(id int64) {
601 delete(s.sockets, id)
602 delete(s.listenSockets, id)
603 s.deleteSelfIfReady()
604}
605
606func (s *server) triggerDelete() {
607 s.closeCalled = true
608 s.deleteSelfIfReady()
609}
610
611func (s *server) deleteSelfIfReady() {
612 if !s.closeCalled || len(s.sockets)+len(s.listenSockets) != 0 {
613 return
614 }
615 s.cm.deleteEntry(s.id)
616}
617
618func (s *server) getParentID() int64 {
619 return 0
620}
621
// tracedChannel is the set of trace-related methods that both channel and
// subChannel provide in this file.
type tracedChannel interface {
	// getChannelTrace returns the trace of the entity.
	getChannelTrace() *channelTrace
	// incrTraceRefCount adds one to the number of trace events referencing the
	// entity.
	incrTraceRefCount()
	// decrTraceRefCount subtracts one from the number of trace events
	// referencing the entity.
	decrTraceRefCount()
	// getRefName returns the reference name of the entity.
	getRefName() string
}
628
// channelTrace is the internal store of the trace events of a channel or
// subchannel.
type channelTrace struct {
	// cm is the channelMap used to decrement the trace reference count of
	// entities referenced by events that are dropped.
	cm *channelMap
	// createdTime is reported as the creation time of the trace by dumpData.
	createdTime time.Time
	// eventCount is the number of events ever appended, including those that
	// have since been dropped.
	eventCount int64
	// mu is held by append, clear and dumpData while they access events and
	// eventCount.
	mu sync.Mutex
	// events holds the retained events, oldest first.
	events []*TraceEvent
}
636
637func (c *channelTrace) append(e *TraceEvent) {
638 c.mu.Lock()
639 if len(c.events) == getMaxTraceEntry() {
640 del := c.events[0]
641 c.events = c.events[1:]
642 if del.RefID != 0 {
643 // start recursive cleanup in a goroutine to not block the call originated from grpc.
644 go func() {
645 // need to acquire c.cm.mu lock to call the unlocked attemptCleanup func.
646 c.cm.mu.Lock()
647 c.cm.decrTraceRefCount(del.RefID)
648 c.cm.mu.Unlock()
649 }()
650 }
651 }
652 e.Timestamp = time.Now()
653 c.events = append(c.events, e)
654 c.eventCount++
655 c.mu.Unlock()
656}
657
658func (c *channelTrace) clear() {
659 c.mu.Lock()
660 for _, e := range c.events {
661 if e.RefID != 0 {
662 // caller should have already held the c.cm.mu lock.
663 c.cm.decrTraceRefCount(e.RefID)
664 }
665 }
666 c.mu.Unlock()
667}
668
// Severity is the severity level of a trace event.
// The canonical enumeration of all valid values is here:
// https://github.com/grpc/grpc-proto/blob/9b13d199cc0d4703c7ea26c9c330ba695866eb23/grpc/channelz/v1/channelz.proto#L126.
type Severity int

// The severity levels, spelled out with their numeric values.
const (
	// CtUnknown indicates unknown severity of a trace event.
	CtUnknown Severity = 0
	// CtInfo indicates info level severity of a trace event.
	CtInfo Severity = 1
	// CtWarning indicates warning level severity of a trace event.
	CtWarning Severity = 2
	// CtError indicates error level severity of a trace event.
	CtError Severity = 3
)
684
// RefChannelType is the type of the entity being referenced in a trace event.
type RefChannelType int

// The kinds of entity a trace event can reference, spelled out with their
// numeric values.
const (
	// RefChannel indicates the referenced entity is a Channel.
	RefChannel RefChannelType = 0
	// RefSubChannel indicates the referenced entity is a SubChannel.
	RefSubChannel RefChannelType = 1
)
694
695func (c *channelTrace) dumpData() *ChannelTrace {
696 c.mu.Lock()
697 ct := &ChannelTrace{EventNum: c.eventCount, CreationTime: c.createdTime}
698 ct.Events = c.events[:len(c.events)]
699 c.mu.Unlock()
700 return ct
701}