blob: 17c2274cb3de5cea3ca8a999f38b9a57161c313d [file] [log] [blame]
Don Newton98fd8812019-09-23 15:15:02 -04001/*
2 *
3 * Copyright 2018 gRPC authors.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 */
18
19package channelz
20
21import (
22 "net"
23 "sync"
24 "sync/atomic"
25 "time"
26
27 "google.golang.org/grpc/connectivity"
28 "google.golang.org/grpc/credentials"
29 "google.golang.org/grpc/grpclog"
30)
31
// entry represents a node in the channelz database.
//
// All of its methods are unexported, so it can only be implemented by types
// declared in this package.
type entry interface {
	// addChild adds a child e, whose channelz id is id, to the child list.
	addChild(id int64, e entry)
	// deleteChild deletes the child whose channelz id is id from the child list.
	deleteChild(id int64)
	// triggerDelete tries to delete self from the channelz database. However, if
	// the child list is not empty, deletion from the database is put on hold
	// until the last child is deleted from the database.
	triggerDelete()
	// deleteSelfIfReady checks whether triggerDelete() has been called before, and
	// whether the child list is now empty. If both conditions are met, self is
	// deleted from the database.
	deleteSelfIfReady()
	// getParentID returns the parent ID of the entry. A parent ID of 0 means the
	// entry has no parent.
	getParentID() int64
}
48
// dummyEntry is a fake entry used to handle the entry-not-found case.
type dummyEntry struct {
	// idNotFound is the id that this entry's log messages report as not
	// currently existing.
	idNotFound int64
}
53
54func (d *dummyEntry) addChild(id int64, e entry) {
55 // Note: It is possible for a normal program to reach here under race condition.
56 // For example, there could be a race between ClientConn.Close() info being propagated
57 // to addrConn and http2Client. ClientConn.Close() cancel the context and result
58 // in http2Client to error. The error info is then caught by transport monitor
59 // and before addrConn.tearDown() is called in side ClientConn.Close(). Therefore,
60 // the addrConn will create a new transport. And when registering the new transport in
61 // channelz, its parent addrConn could have already been torn down and deleted
62 // from channelz tracking, and thus reach the code here.
63 grpclog.Infof("attempt to add child of type %T with id %d to a parent (id=%d) that doesn't currently exist", e, id, d.idNotFound)
64}
65
66func (d *dummyEntry) deleteChild(id int64) {
67 // It is possible for a normal program to reach here under race condition.
68 // Refer to the example described in addChild().
69 grpclog.Infof("attempt to delete child with id %d from a parent (id=%d) that doesn't currently exist", id, d.idNotFound)
70}
71
72func (d *dummyEntry) triggerDelete() {
73 grpclog.Warningf("attempt to delete an entry (id=%d) that doesn't currently exist", d.idNotFound)
74}
75
76func (*dummyEntry) deleteSelfIfReady() {
77 // code should not reach here. deleteSelfIfReady is always called on an existing entry.
78}
79
80func (*dummyEntry) getParentID() int64 {
81 return 0
82}
83
// ChannelMetric defines the info channelz provides for a specific Channel, which
// includes ChannelInternalMetric and channelz-specific data, such as channelz id,
// child list, etc.
type ChannelMetric struct {
	// ID is the channelz id of this channel.
	ID int64
	// RefName is the human readable reference string of this channel.
	RefName string
	// ChannelData contains the channel internal metric reported by the channel
	// through ChannelzMetric().
	ChannelData *ChannelInternalMetric
	// NestedChans tracks the nested channel type children of this channel in the
	// format of a map from nested channel channelz id to corresponding reference
	// string.
	NestedChans map[int64]string
	// SubChans tracks the subchannel type children of this channel in the format
	// of a map from subchannel channelz id to corresponding reference string.
	SubChans map[int64]string
	// Sockets tracks the socket type children of this channel in the format of a
	// map from socket channelz id to corresponding reference string.
	// Note that the current grpc implementation doesn't allow a channel to have
	// sockets directly; therefore, this field is unused.
	Sockets map[int64]string
	// Trace contains the most recent traced events.
	Trace *ChannelTrace
}
109
// SubChannelMetric defines the info channelz provides for a specific SubChannel,
// which includes ChannelInternalMetric and channelz-specific data, such as
// channelz id, child list, etc.
type SubChannelMetric struct {
	// ID is the channelz id of this subchannel.
	ID int64
	// RefName is the human readable reference string of this subchannel.
	RefName string
	// ChannelData contains the subchannel internal metric reported by the
	// subchannel through ChannelzMetric().
	ChannelData *ChannelInternalMetric
	// NestedChans tracks the nested channel type children of this subchannel in
	// the format of a map from nested channel channelz id to corresponding
	// reference string.
	// Note that the current grpc implementation doesn't allow a subchannel to
	// have nested channels as children; therefore, this field is unused.
	NestedChans map[int64]string
	// SubChans tracks the subchannel type children of this subchannel in the
	// format of a map from subchannel channelz id to corresponding reference
	// string.
	// Note that the current grpc implementation doesn't allow a subchannel to
	// have subchannels as children; therefore, this field is unused.
	SubChans map[int64]string
	// Sockets tracks the socket type children of this subchannel in the format of
	// a map from socket channelz id to corresponding reference string.
	Sockets map[int64]string
	// Trace contains the most recent traced events.
	Trace *ChannelTrace
}
137
// ChannelInternalMetric defines the struct that the implementor of the Channel
// interface should return from ChannelzMetric().
type ChannelInternalMetric struct {
	// The current connectivity state of the channel.
	State connectivity.State
	// The target this channel originally tried to connect to. May be absent.
	Target string
	// The number of calls started on the channel.
	CallsStarted int64
	// The number of calls that have completed with an OK status.
	CallsSucceeded int64
	// The number of calls that have completed with a non-OK status.
	CallsFailed int64
	// The last time a call was started on the channel.
	LastCallStartedTimestamp time.Time
}
154
// ChannelTrace stores traced events on a channel/subchannel and related info.
type ChannelTrace struct {
	// EventNum is the number of events that ever got traced (i.e. including those
	// that have been deleted).
	EventNum int64
	// CreationTime is the creation time of the trace.
	CreationTime time.Time
	// Events stores the most recent trace events (up to $maxTraceEntry; a newer
	// event will overwrite the oldest one).
	Events []*TraceEvent
}
165
// TraceEvent represents a single trace event.
type TraceEvent struct {
	// Desc is a simple description of the trace event.
	Desc string
	// Severity states the severity of this trace event.
	Severity Severity
	// Timestamp is the event time.
	Timestamp time.Time
	// RefID is the id of the entity that gets referenced in the event. RefID is 0
	// if no other entity is involved in this event.
	// e.g. SubChannel (id: 4[]) Created. --> RefID = 4, RefName = "" (inside [])
	RefID int64
	// RefName is the reference name for the entity that gets referenced in the
	// event.
	RefName string
	// RefType indicates the referenced entity type, i.e. Channel or SubChannel.
	RefType RefChannelType
}
183
// Channel is the interface that should be satisfied in order to be tracked by
// channelz as Channel or SubChannel.
type Channel interface {
	// ChannelzMetric returns the internal metric of the channel or subchannel.
	ChannelzMetric() *ChannelInternalMetric
}
189
190type dummyChannel struct{}
191
192func (d *dummyChannel) ChannelzMetric() *ChannelInternalMetric {
193 return &ChannelInternalMetric{}
194}
195
// channel is the channelz database entry for a channel. It holds the tracked
// grpc object together with the entry's child lists, its position in the entry
// tree and its trace.
type channel struct {
	// refName is the reference string of this channel.
	refName string
	// c is the tracked grpc object. deleteSelfFromMap replaces it with a
	// dummyChannel when the entry has to be kept because of trace references.
	c Channel
	// closeCalled is set by triggerDelete; deletion only proceeds once it is
	// true and the channel has no children left.
	closeCalled bool
	// nestedChans maps the channelz id of each nested channel child to its
	// reference string.
	nestedChans map[int64]string
	// subChans maps the channelz id of each subchannel child to its reference
	// string.
	subChans map[int64]string
	// id is the channelz id of this channel.
	id int64
	// pid is the channelz id of the parent; 0 means this is a top channel.
	pid int64
	// cm is the channelMap used to look up the parent and to delete this entry.
	cm *channelMap
	// trace holds the trace events recorded for this channel.
	trace *channelTrace
	// traceRefCount is the number of trace events that reference this channel.
	// Non-zero traceRefCount means the trace of this channel cannot be deleted.
	// It is read and written with sync/atomic operations.
	traceRefCount int32
}
210
211func (c *channel) addChild(id int64, e entry) {
212 switch v := e.(type) {
213 case *subChannel:
214 c.subChans[id] = v.refName
215 case *channel:
216 c.nestedChans[id] = v.refName
217 default:
218 grpclog.Errorf("cannot add a child (id = %d) of type %T to a channel", id, e)
219 }
220}
221
222func (c *channel) deleteChild(id int64) {
223 delete(c.subChans, id)
224 delete(c.nestedChans, id)
225 c.deleteSelfIfReady()
226}
227
228func (c *channel) triggerDelete() {
229 c.closeCalled = true
230 c.deleteSelfIfReady()
231}
232
233func (c *channel) getParentID() int64 {
234 return c.pid
235}
236
237// deleteSelfFromTree tries to delete the channel from the channelz entry relation tree, which means
238// deleting the channel reference from its parent's child list.
239//
240// In order for a channel to be deleted from the tree, it must meet the criteria that, removal of the
241// corresponding grpc object has been invoked, and the channel does not have any children left.
242//
243// The returned boolean value indicates whether the channel has been successfully deleted from tree.
244func (c *channel) deleteSelfFromTree() (deleted bool) {
245 if !c.closeCalled || len(c.subChans)+len(c.nestedChans) != 0 {
246 return false
247 }
248 // not top channel
249 if c.pid != 0 {
250 c.cm.findEntry(c.pid).deleteChild(c.id)
251 }
252 return true
253}
254
255// deleteSelfFromMap checks whether it is valid to delete the channel from the map, which means
256// deleting the channel from channelz's tracking entirely. Users can no longer use id to query the
257// channel, and its memory will be garbage collected.
258//
259// The trace reference count of the channel must be 0 in order to be deleted from the map. This is
260// specified in the channel tracing gRFC that as long as some other trace has reference to an entity,
261// the trace of the referenced entity must not be deleted. In order to release the resource allocated
262// by grpc, the reference to the grpc object is reset to a dummy object.
263//
264// deleteSelfFromMap must be called after deleteSelfFromTree returns true.
265//
266// It returns a bool to indicate whether the channel can be safely deleted from map.
267func (c *channel) deleteSelfFromMap() (delete bool) {
268 if c.getTraceRefCount() != 0 {
269 c.c = &dummyChannel{}
270 return false
271 }
272 return true
273}
274
275// deleteSelfIfReady tries to delete the channel itself from the channelz database.
276// The delete process includes two steps:
277// 1. delete the channel from the entry relation tree, i.e. delete the channel reference from its
278// parent's child list.
279// 2. delete the channel from the map, i.e. delete the channel entirely from channelz. Lookup by id
280// will return entry not found error.
281func (c *channel) deleteSelfIfReady() {
282 if !c.deleteSelfFromTree() {
283 return
284 }
285 if !c.deleteSelfFromMap() {
286 return
287 }
288 c.cm.deleteEntry(c.id)
289 c.trace.clear()
290}
291
292func (c *channel) getChannelTrace() *channelTrace {
293 return c.trace
294}
295
296func (c *channel) incrTraceRefCount() {
297 atomic.AddInt32(&c.traceRefCount, 1)
298}
299
300func (c *channel) decrTraceRefCount() {
301 atomic.AddInt32(&c.traceRefCount, -1)
302}
303
304func (c *channel) getTraceRefCount() int {
305 i := atomic.LoadInt32(&c.traceRefCount)
306 return int(i)
307}
308
309func (c *channel) getRefName() string {
310 return c.refName
311}
312
// subChannel is the channelz database entry for a subchannel. It holds the
// tracked grpc object together with the entry's socket children, its position
// in the entry tree and its trace.
type subChannel struct {
	// refName is the reference string of this subchannel.
	refName string
	// c is the tracked grpc object. deleteSelfFromMap replaces it with a
	// dummyChannel when the entry has to be kept because of trace references.
	c Channel
	// closeCalled is set by triggerDelete; deletion only proceeds once it is
	// true and the subchannel has no sockets left.
	closeCalled bool
	// sockets maps the channelz id of each socket child to its reference string.
	sockets map[int64]string
	// id is the channelz id of this subchannel.
	id int64
	// pid is the channelz id of the parent entry.
	pid int64
	// cm is the channelMap used to look up the parent and to delete this entry.
	cm *channelMap
	// trace holds the trace events recorded for this subchannel.
	trace *channelTrace
	// traceRefCount is read and written with sync/atomic operations; a non-zero
	// value keeps deleteSelfFromMap from allowing deletion.
	traceRefCount int32
}
324
325func (sc *subChannel) addChild(id int64, e entry) {
326 if v, ok := e.(*normalSocket); ok {
327 sc.sockets[id] = v.refName
328 } else {
329 grpclog.Errorf("cannot add a child (id = %d) of type %T to a subChannel", id, e)
330 }
331}
332
333func (sc *subChannel) deleteChild(id int64) {
334 delete(sc.sockets, id)
335 sc.deleteSelfIfReady()
336}
337
338func (sc *subChannel) triggerDelete() {
339 sc.closeCalled = true
340 sc.deleteSelfIfReady()
341}
342
343func (sc *subChannel) getParentID() int64 {
344 return sc.pid
345}
346
347// deleteSelfFromTree tries to delete the subchannel from the channelz entry relation tree, which
348// means deleting the subchannel reference from its parent's child list.
349//
350// In order for a subchannel to be deleted from the tree, it must meet the criteria that, removal of
351// the corresponding grpc object has been invoked, and the subchannel does not have any children left.
352//
353// The returned boolean value indicates whether the channel has been successfully deleted from tree.
354func (sc *subChannel) deleteSelfFromTree() (deleted bool) {
355 if !sc.closeCalled || len(sc.sockets) != 0 {
356 return false
357 }
358 sc.cm.findEntry(sc.pid).deleteChild(sc.id)
359 return true
360}
361
362// deleteSelfFromMap checks whether it is valid to delete the subchannel from the map, which means
363// deleting the subchannel from channelz's tracking entirely. Users can no longer use id to query
364// the subchannel, and its memory will be garbage collected.
365//
366// The trace reference count of the subchannel must be 0 in order to be deleted from the map. This is
367// specified in the channel tracing gRFC that as long as some other trace has reference to an entity,
368// the trace of the referenced entity must not be deleted. In order to release the resource allocated
369// by grpc, the reference to the grpc object is reset to a dummy object.
370//
371// deleteSelfFromMap must be called after deleteSelfFromTree returns true.
372//
373// It returns a bool to indicate whether the channel can be safely deleted from map.
374func (sc *subChannel) deleteSelfFromMap() (delete bool) {
375 if sc.getTraceRefCount() != 0 {
376 // free the grpc struct (i.e. addrConn)
377 sc.c = &dummyChannel{}
378 return false
379 }
380 return true
381}
382
383// deleteSelfIfReady tries to delete the subchannel itself from the channelz database.
384// The delete process includes two steps:
385// 1. delete the subchannel from the entry relation tree, i.e. delete the subchannel reference from
386// its parent's child list.
387// 2. delete the subchannel from the map, i.e. delete the subchannel entirely from channelz. Lookup
388// by id will return entry not found error.
389func (sc *subChannel) deleteSelfIfReady() {
390 if !sc.deleteSelfFromTree() {
391 return
392 }
393 if !sc.deleteSelfFromMap() {
394 return
395 }
396 sc.cm.deleteEntry(sc.id)
397 sc.trace.clear()
398}
399
400func (sc *subChannel) getChannelTrace() *channelTrace {
401 return sc.trace
402}
403
404func (sc *subChannel) incrTraceRefCount() {
405 atomic.AddInt32(&sc.traceRefCount, 1)
406}
407
408func (sc *subChannel) decrTraceRefCount() {
409 atomic.AddInt32(&sc.traceRefCount, -1)
410}
411
412func (sc *subChannel) getTraceRefCount() int {
413 i := atomic.LoadInt32(&sc.traceRefCount)
414 return int(i)
415}
416
417func (sc *subChannel) getRefName() string {
418 return sc.refName
419}
420
// SocketMetric defines the info channelz provides for a specific Socket, which
// includes SocketInternalMetric and channelz-specific data, such as channelz id, etc.
type SocketMetric struct {
	// ID is the channelz id of this socket.
	ID int64
	// RefName is the human readable reference string of this socket.
	RefName string
	// SocketData contains the socket internal metric reported by the socket
	// through ChannelzMetric().
	SocketData *SocketInternalMetric
}
432
// SocketInternalMetric defines the struct that the implementor of the Socket
// interface should return from ChannelzMetric().
type SocketInternalMetric struct {
	// The number of streams that have been started.
	StreamsStarted int64
	// The number of streams that have ended successfully:
	// On client side, receiving frame with eos bit set.
	// On server side, sending frame with eos bit set.
	StreamsSucceeded int64
	// The number of streams that have ended unsuccessfully:
	// On client side, termination without receiving frame with eos bit set.
	// On server side, termination without sending frame with eos bit set.
	StreamsFailed int64
	// The number of messages successfully sent on this socket.
	MessagesSent int64
	// The receive-side counterpart of MessagesSent.
	// NOTE(review): presumably the number of messages successfully received on
	// this socket; confirm against the reporting transport.
	MessagesReceived int64
	// The number of keep alives sent. This is typically implemented with HTTP/2
	// ping messages.
	KeepAlivesSent int64
	// The last time a stream was created by this endpoint. Usually unset for
	// servers.
	LastLocalStreamCreatedTimestamp time.Time
	// The last time a stream was created by the remote endpoint. Usually unset
	// for clients.
	LastRemoteStreamCreatedTimestamp time.Time
	// The last time a message was sent by this endpoint.
	LastMessageSentTimestamp time.Time
	// The last time a message was received by this endpoint.
	LastMessageReceivedTimestamp time.Time
	// The amount of window, granted to the local endpoint by the remote endpoint.
	// This may be slightly out of date due to network latency. This does NOT
	// include stream level or TCP level flow control info.
	LocalFlowControlWindow int64
	// The amount of window, granted to the remote endpoint by the local endpoint.
	// This may be slightly out of date due to network latency. This does NOT
	// include stream level or TCP level flow control info.
	RemoteFlowControlWindow int64
	// The locally bound address.
	LocalAddr net.Addr
	// The remote bound address. May be absent.
	RemoteAddr net.Addr
	// Optional, represents the name of the remote endpoint, if different than
	// the original target name.
	RemoteName string
	// Socket option data for this socket; see SocketOptionData.
	// NOTE(review): being a pointer it may be nil; confirm when it is populated.
	SocketOptions *SocketOptionData
	// The security value reported for this socket; see
	// credentials.ChannelzSecurityValue.
	Security credentials.ChannelzSecurityValue
}
480
// Socket is the interface that should be satisfied in order to be tracked by
// channelz as Socket.
type Socket interface {
	// ChannelzMetric returns the internal metric of the socket.
	ChannelzMetric() *SocketInternalMetric
}
486
// listenSocket is the channelz database entry for a listen socket. It is a
// leaf entry: its addChild and deleteChild only log an error.
type listenSocket struct {
	// refName is the reference string of this socket.
	refName string
	// s is the tracked Socket.
	s Socket
	// id is the channelz id of this socket.
	id int64
	// pid is the channelz id of the parent entry.
	pid int64
	// cm is the channelMap used to delete this entry and look up its parent.
	cm *channelMap
}
494
495func (ls *listenSocket) addChild(id int64, e entry) {
496 grpclog.Errorf("cannot add a child (id = %d) of type %T to a listen socket", id, e)
497}
498
499func (ls *listenSocket) deleteChild(id int64) {
500 grpclog.Errorf("cannot delete a child (id = %d) from a listen socket", id)
501}
502
503func (ls *listenSocket) triggerDelete() {
504 ls.cm.deleteEntry(ls.id)
505 ls.cm.findEntry(ls.pid).deleteChild(ls.id)
506}
507
508func (ls *listenSocket) deleteSelfIfReady() {
509 grpclog.Errorf("cannot call deleteSelfIfReady on a listen socket")
510}
511
512func (ls *listenSocket) getParentID() int64 {
513 return ls.pid
514}
515
// normalSocket is the channelz database entry for a normal (non-listen)
// socket. It is a leaf entry: its addChild and deleteChild only log an error.
type normalSocket struct {
	// refName is the reference string of this socket.
	refName string
	// s is the tracked Socket.
	s Socket
	// id is the channelz id of this socket.
	id int64
	// pid is the channelz id of the parent entry.
	pid int64
	// cm is the channelMap used to delete this entry and look up its parent.
	cm *channelMap
}
523
524func (ns *normalSocket) addChild(id int64, e entry) {
525 grpclog.Errorf("cannot add a child (id = %d) of type %T to a normal socket", id, e)
526}
527
528func (ns *normalSocket) deleteChild(id int64) {
529 grpclog.Errorf("cannot delete a child (id = %d) from a normal socket", id)
530}
531
532func (ns *normalSocket) triggerDelete() {
533 ns.cm.deleteEntry(ns.id)
534 ns.cm.findEntry(ns.pid).deleteChild(ns.id)
535}
536
537func (ns *normalSocket) deleteSelfIfReady() {
538 grpclog.Errorf("cannot call deleteSelfIfReady on a normal socket")
539}
540
541func (ns *normalSocket) getParentID() int64 {
542 return ns.pid
543}
544
// ServerMetric defines the info channelz provides for a specific Server, which
// includes ServerInternalMetric and channelz-specific data, such as channelz id,
// child list, etc.
type ServerMetric struct {
	// ID is the channelz id of this server.
	ID int64
	// RefName is the human readable reference string of this server.
	RefName string
	// ServerData contains the server internal metric reported by the server
	// through ChannelzMetric().
	ServerData *ServerInternalMetric
	// ListenSockets tracks the listener socket type children of this server in
	// the format of a map from socket channelz id to corresponding reference
	// string.
	ListenSockets map[int64]string
}
560
// ServerInternalMetric defines the struct that the implementor of the Server
// interface should return from ChannelzMetric().
type ServerInternalMetric struct {
	// The number of incoming calls started on the server.
	CallsStarted int64
	// The number of incoming calls that have completed with an OK status.
	CallsSucceeded int64
	// The number of incoming calls that have completed with a non-OK status.
	CallsFailed int64
	// The last time a call was started on the server.
	LastCallStartedTimestamp time.Time
}
573
// Server is the interface to be satisfied in order to be tracked by channelz as
// Server.
type Server interface {
	// ChannelzMetric returns the internal metric of the server.
	ChannelzMetric() *ServerInternalMetric
}
579
// server is the channelz database entry for a server. It has no parent
// (getParentID always returns 0) and keeps two child lists, one per socket
// kind.
type server struct {
	// refName is the reference string of this server.
	refName string
	// s is the tracked Server.
	s Server
	// closeCalled is set by triggerDelete; the server is only deleted once it is
	// true and both child lists are empty.
	closeCalled bool
	// sockets maps the channelz id of each normal socket child to its reference
	// string.
	sockets map[int64]string
	// listenSockets maps the channelz id of each listen socket child to its
	// reference string.
	listenSockets map[int64]string
	// id is the channelz id of this server.
	id int64
	// cm is the channelMap this entry is deleted from.
	cm *channelMap
}
589
590func (s *server) addChild(id int64, e entry) {
591 switch v := e.(type) {
592 case *normalSocket:
593 s.sockets[id] = v.refName
594 case *listenSocket:
595 s.listenSockets[id] = v.refName
596 default:
597 grpclog.Errorf("cannot add a child (id = %d) of type %T to a server", id, e)
598 }
599}
600
601func (s *server) deleteChild(id int64) {
602 delete(s.sockets, id)
603 delete(s.listenSockets, id)
604 s.deleteSelfIfReady()
605}
606
607func (s *server) triggerDelete() {
608 s.closeCalled = true
609 s.deleteSelfIfReady()
610}
611
612func (s *server) deleteSelfIfReady() {
613 if !s.closeCalled || len(s.sockets)+len(s.listenSockets) != 0 {
614 return
615 }
616 s.cm.deleteEntry(s.id)
617}
618
619func (s *server) getParentID() int64 {
620 return 0
621}
622
// tracedChannel is the set of trace-related methods shared by the entries that
// carry a channelTrace; in this file both channel and subChannel provide them.
type tracedChannel interface {
	// getChannelTrace returns the trace of the entry.
	getChannelTrace() *channelTrace
	// incrTraceRefCount increments the entry's trace reference count by one.
	incrTraceRefCount()
	// decrTraceRefCount decrements the entry's trace reference count by one.
	decrTraceRefCount()
	// getRefName returns the reference name of the entry.
	getRefName() string
}
629
// channelTrace is the internal, mutex-protected store of trace events for a
// channel or subchannel. dumpData exports it as a ChannelTrace.
type channelTrace struct {
	// cm is the channelMap used to decrement the trace reference count of
	// entities referenced by events that are dropped.
	cm *channelMap
	// createdTime is reported as ChannelTrace.CreationTime by dumpData.
	createdTime time.Time
	// eventCount is the number of events ever appended; it is reported as
	// ChannelTrace.EventNum by dumpData.
	eventCount int64
	// mu is held by append, clear and dumpData while they access the fields of
	// the trace.
	mu sync.Mutex
	// events holds the currently retained events, oldest first.
	events []*TraceEvent
}
637
638func (c *channelTrace) append(e *TraceEvent) {
639 c.mu.Lock()
640 if len(c.events) == getMaxTraceEntry() {
641 del := c.events[0]
642 c.events = c.events[1:]
643 if del.RefID != 0 {
644 // start recursive cleanup in a goroutine to not block the call originated from grpc.
645 go func() {
646 // need to acquire c.cm.mu lock to call the unlocked attemptCleanup func.
647 c.cm.mu.Lock()
648 c.cm.decrTraceRefCount(del.RefID)
649 c.cm.mu.Unlock()
650 }()
651 }
652 }
653 e.Timestamp = time.Now()
654 c.events = append(c.events, e)
655 c.eventCount++
656 c.mu.Unlock()
657}
658
659func (c *channelTrace) clear() {
660 c.mu.Lock()
661 for _, e := range c.events {
662 if e.RefID != 0 {
663 // caller should have already held the c.cm.mu lock.
664 c.cm.decrTraceRefCount(e.RefID)
665 }
666 }
667 c.mu.Unlock()
668}
669
// Severity is the severity level of a trace event.
// The canonical enumeration of all valid values is here:
// https://github.com/grpc/grpc-proto/blob/9b13d199cc0d4703c7ea26c9c330ba695866eb23/grpc/channelz/v1/channelz.proto#L126.
type Severity int

const (
	// CtUNKNOWN indicates unknown severity of a trace event. It is the zero
	// value of Severity.
	CtUNKNOWN Severity = iota
	// CtINFO indicates info level severity of a trace event.
	CtINFO
	// CtWarning indicates warning level severity of a trace event.
	CtWarning
	// CtError indicates error level severity of a trace event.
	CtError
)
685
// RefChannelType is the type of the entity being referenced in a trace event.
type RefChannelType int

const (
	// RefChannel indicates the referenced entity is a Channel. It is the zero
	// value of RefChannelType.
	RefChannel RefChannelType = iota
	// RefSubChannel indicates the referenced entity is a SubChannel.
	RefSubChannel
)
695
696func (c *channelTrace) dumpData() *ChannelTrace {
697 c.mu.Lock()
698 ct := &ChannelTrace{EventNum: c.eventCount, CreationTime: c.createdTime}
699 ct.Events = c.events[:len(c.events)]
700 c.mu.Unlock()
701 return ct
702}