blob: 09a319cc4443bf40af5cc63f9dc12bc384de9951 [file] [log] [blame]
Don Newton379ae252019-04-01 12:17:06 -04001// Copyright (C) MongoDB, Inc. 2017-present.
2//
3// Licensed under the Apache License, Version 2.0 (the "License"); you may
4// not use this file except in compliance with the License. You may obtain
5// a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
6
// Package topology contains types that handle the discovery, monitoring, and selection
8// of servers. This package is designed to expose enough inner workings of service discovery
9// and monitoring to allow low level applications to have fine grained control, while hiding
10// most of the detailed implementation of the algorithms.
11package topology
12
13import (
14 "context"
15 "errors"
16 "math/rand"
17 "sync"
18 "sync/atomic"
19 "time"
20
21 "fmt"
22
23 "github.com/mongodb/mongo-go-driver/bson/bsoncodec"
24 "github.com/mongodb/mongo-go-driver/x/mongo/driver/session"
25 "github.com/mongodb/mongo-go-driver/x/network/address"
26 "github.com/mongodb/mongo-go-driver/x/network/description"
27)
28
// Sentinel errors returned by Topology and its subscriptions.
var (
	// ErrSubscribeAfterClosed is returned when a user attempts to subscribe to a
	// closed Server or Topology.
	ErrSubscribeAfterClosed = errors.New("cannot subscribe after close")

	// ErrTopologyClosed is returned when a user attempts to call a method on a
	// closed Topology.
	ErrTopologyClosed = errors.New("topology is closed")

	// ErrTopologyConnected is returned when a user attempts to connect to an
	// already connected Topology.
	ErrTopologyConnected = errors.New("topology is connected or connecting")

	// ErrServerSelectionTimeout is returned from server selection when the
	// server selection process took longer than allowed by the timeout.
	ErrServerSelectionTimeout = errors.New("server selection timeout")
)
44
// MonitorMode represents the way in which a server is monitored.
type MonitorMode uint8

// The available monitoring modes.
const (
	// AutomaticMode is the zero value of MonitorMode.
	AutomaticMode = MonitorMode(iota)
	// SingleMode makes New create the topology with kind description.Single.
	SingleMode
)
53
// Topology represents a MongoDB deployment. It owns one Server per known
// address, applies the server descriptions they publish to a state machine,
// and fans the resulting topology descriptions out to subscribers.
type Topology struct {
	// NOTE(review): registry is not referenced by the code visible in this
	// file; confirm where it is used.
	registry *bsoncodec.Registry

	// connectionstate holds one of disconnected, connecting, connected or
	// disconnecting. It is only accessed through sync/atomic.
	connectionstate int32

	// cfg is the configuration produced by newConfig from the options passed
	// to New.
	cfg *config

	desc atomic.Value // holds a description.Topology

	// done is an unbuffered channel; Disconnect sends on it to stop the update
	// goroutine.
	done chan struct{}

	// fsm is the state machine that incoming server descriptions are applied
	// to in order to compute the next topology description.
	fsm *fsm
	// changes carries server descriptions from the per-server forwarding
	// goroutines (started by addServer) to the update goroutine.
	changes chan description.Server
	// changeswg tracks the update goroutine so Disconnect can wait for it.
	changeswg sync.WaitGroup

	// SessionPool is created by Connect from a subscription to this topology.
	SessionPool *session.Pool

	// This should really be encapsulated into its own type. This will likely
	// require a redesign so we can share a minimum of data between the
	// subscribers and the topology.
	//
	// subscribers, currentSubscriberID and subscriptionsClosed are guarded by
	// subLock.
	subscribers         map[uint64]chan description.Topology
	currentSubscriberID uint64
	subscriptionsClosed bool
	subLock             sync.Mutex

	// We should redesign how we connect and handle individual servers. This is
	// too difficult to maintain and it's rather easy to accidentally access
	// the servers without acquiring the lock or checking if the servers are
	// closed. This lock should also be an RWMutex.
	//
	// serversClosed and servers are guarded by serversLock.
	serversLock   sync.Mutex
	serversClosed bool
	servers       map[address.Address]*Server

	// wg tracks the per-server forwarding goroutines started by addServer.
	wg sync.WaitGroup
}
90
91// New creates a new topology.
92func New(opts ...Option) (*Topology, error) {
93 cfg, err := newConfig(opts...)
94 if err != nil {
95 return nil, err
96 }
97
98 t := &Topology{
99 cfg: cfg,
100 done: make(chan struct{}),
101 fsm: newFSM(),
102 changes: make(chan description.Server),
103 subscribers: make(map[uint64]chan description.Topology),
104 servers: make(map[address.Address]*Server),
105 }
106 t.desc.Store(description.Topology{})
107
108 if cfg.replicaSetName != "" {
109 t.fsm.SetName = cfg.replicaSetName
110 t.fsm.Kind = description.ReplicaSetNoPrimary
111 }
112
113 if cfg.mode == SingleMode {
114 t.fsm.Kind = description.Single
115 }
116
117 return t, nil
118}
119
120// Connect initializes a Topology and starts the monitoring process. This function
121// must be called to properly monitor the topology.
122func (t *Topology) Connect(ctx context.Context) error {
123 if !atomic.CompareAndSwapInt32(&t.connectionstate, disconnected, connecting) {
124 return ErrTopologyConnected
125 }
126
127 t.desc.Store(description.Topology{})
128 var err error
129 t.serversLock.Lock()
130 for _, a := range t.cfg.seedList {
131 addr := address.Address(a).Canonicalize()
132 t.fsm.Servers = append(t.fsm.Servers, description.Server{Addr: addr})
133 err = t.addServer(ctx, addr)
134 }
135 t.serversLock.Unlock()
136
137 go t.update()
138 t.changeswg.Add(1)
139
140 t.subscriptionsClosed = false // explicitly set in case topology was disconnected and then reconnected
141
142 atomic.StoreInt32(&t.connectionstate, connected)
143
144 // After connection, make a subscription to keep the pool updated
145 sub, err := t.Subscribe()
146 t.SessionPool = session.NewPool(sub.C)
147 return err
148}
149
150// Disconnect closes the topology. It stops the monitoring thread and
151// closes all open subscriptions.
152func (t *Topology) Disconnect(ctx context.Context) error {
153 if !atomic.CompareAndSwapInt32(&t.connectionstate, connected, disconnecting) {
154 return ErrTopologyClosed
155 }
156
157 t.serversLock.Lock()
158 t.serversClosed = true
159 for addr, server := range t.servers {
160 t.removeServer(ctx, addr, server)
161 }
162 t.serversLock.Unlock()
163
164 t.wg.Wait()
165 t.done <- struct{}{}
166 t.changeswg.Wait()
167
168 t.desc.Store(description.Topology{})
169
170 atomic.StoreInt32(&t.connectionstate, disconnected)
171 return nil
172}
173
174// Description returns a description of the topology.
175func (t *Topology) Description() description.Topology {
176 td, ok := t.desc.Load().(description.Topology)
177 if !ok {
178 td = description.Topology{}
179 }
180 return td
181}
182
183// Subscribe returns a Subscription on which all updated description.Topologys
184// will be sent. The channel of the subscription will have a buffer size of one,
185// and will be pre-populated with the current description.Topology.
186func (t *Topology) Subscribe() (*Subscription, error) {
187 if atomic.LoadInt32(&t.connectionstate) != connected {
188 return nil, errors.New("cannot subscribe to Topology that is not connected")
189 }
190 ch := make(chan description.Topology, 1)
191 td, ok := t.desc.Load().(description.Topology)
192 if !ok {
193 td = description.Topology{}
194 }
195 ch <- td
196
197 t.subLock.Lock()
198 defer t.subLock.Unlock()
199 if t.subscriptionsClosed {
200 return nil, ErrSubscribeAfterClosed
201 }
202 id := t.currentSubscriberID
203 t.subscribers[id] = ch
204 t.currentSubscriberID++
205
206 return &Subscription{
207 C: ch,
208 t: t,
209 id: id,
210 }, nil
211}
212
213// RequestImmediateCheck will send heartbeats to all the servers in the
214// topology right away, instead of waiting for the heartbeat timeout.
215func (t *Topology) RequestImmediateCheck() {
216 if atomic.LoadInt32(&t.connectionstate) != connected {
217 return
218 }
219 t.serversLock.Lock()
220 for _, server := range t.servers {
221 server.RequestImmediateCheck()
222 }
223 t.serversLock.Unlock()
224}
225
226// SupportsSessions returns true if the topology supports sessions.
227func (t *Topology) SupportsSessions() bool {
228 return t.Description().SessionTimeoutMinutes != 0 && t.Description().Kind != description.Single
229}
230
231// SelectServer selects a server given a selector.SelectServer complies with the
232// server selection spec, and will time out after severSelectionTimeout or when the
233// parent context is done.
234func (t *Topology) SelectServer(ctx context.Context, ss description.ServerSelector) (*SelectedServer, error) {
235 if atomic.LoadInt32(&t.connectionstate) != connected {
236 return nil, ErrTopologyClosed
237 }
238 var ssTimeoutCh <-chan time.Time
239
240 if t.cfg.serverSelectionTimeout > 0 {
241 ssTimeout := time.NewTimer(t.cfg.serverSelectionTimeout)
242 ssTimeoutCh = ssTimeout.C
243 defer ssTimeout.Stop()
244 }
245
246 sub, err := t.Subscribe()
247 if err != nil {
248 return nil, err
249 }
250 defer sub.Unsubscribe()
251
252 for {
253 suitable, err := t.selectServer(ctx, sub.C, ss, ssTimeoutCh)
254 if err != nil {
255 return nil, err
256 }
257
258 selected := suitable[rand.Intn(len(suitable))]
259 selectedS, err := t.FindServer(selected)
260 switch {
261 case err != nil:
262 return nil, err
263 case selectedS != nil:
264 return selectedS, nil
265 default:
266 // We don't have an actual server for the provided description.
267 // This could happen for a number of reasons, including that the
268 // server has since stopped being a part of this topology, or that
269 // the server selector returned no suitable servers.
270 }
271 }
272}
273
274// FindServer will attempt to find a server that fits the given server description.
275// This method will return nil, nil if a matching server could not be found.
276func (t *Topology) FindServer(selected description.Server) (*SelectedServer, error) {
277 if atomic.LoadInt32(&t.connectionstate) != connected {
278 return nil, ErrTopologyClosed
279 }
280 t.serversLock.Lock()
281 defer t.serversLock.Unlock()
282 server, ok := t.servers[selected.Addr]
283 if !ok {
284 return nil, nil
285 }
286
287 desc := t.Description()
288 return &SelectedServer{
289 Server: server,
290 Kind: desc.Kind,
291 }, nil
292}
293
294func wrapServerSelectionError(err error, t *Topology) error {
295 return fmt.Errorf("server selection error: %v\ncurrent topology: %s", err, t.String())
296}
297
298// selectServer is the core piece of server selection. It handles getting
299// topology descriptions and running sever selection on those descriptions.
300func (t *Topology) selectServer(ctx context.Context, subscriptionCh <-chan description.Topology, ss description.ServerSelector, timeoutCh <-chan time.Time) ([]description.Server, error) {
301 var current description.Topology
302 for {
303 select {
304 case <-ctx.Done():
305 return nil, ctx.Err()
306 case <-timeoutCh:
307 return nil, wrapServerSelectionError(ErrServerSelectionTimeout, t)
308 case current = <-subscriptionCh:
309 }
310
311 var allowed []description.Server
312 for _, s := range current.Servers {
313 if s.Kind != description.Unknown {
314 allowed = append(allowed, s)
315 }
316 }
317
318 suitable, err := ss.SelectServer(current, allowed)
319 if err != nil {
320 return nil, wrapServerSelectionError(err, t)
321 }
322
323 if len(suitable) > 0 {
324 return suitable, nil
325 }
326
327 t.RequestImmediateCheck()
328 }
329}
330
331func (t *Topology) update() {
332 defer t.changeswg.Done()
333 defer func() {
334 // ¯\_(ツ)_/¯
335 if r := recover(); r != nil {
336 <-t.done
337 }
338 }()
339
340 for {
341 select {
342 case change := <-t.changes:
343 current, err := t.apply(context.TODO(), change)
344 if err != nil {
345 continue
346 }
347
348 t.desc.Store(current)
349 t.subLock.Lock()
350 for _, ch := range t.subscribers {
351 // We drain the description if there's one in the channel
352 select {
353 case <-ch:
354 default:
355 }
356 ch <- current
357 }
358 t.subLock.Unlock()
359 case <-t.done:
360 t.subLock.Lock()
361 for id, ch := range t.subscribers {
362 close(ch)
363 delete(t.subscribers, id)
364 }
365 t.subscriptionsClosed = true
366 t.subLock.Unlock()
367 return
368 }
369 }
370}
371
372func (t *Topology) apply(ctx context.Context, desc description.Server) (description.Topology, error) {
373 var err error
374 prev := t.fsm.Topology
375
376 current, err := t.fsm.apply(desc)
377 if err != nil {
378 return description.Topology{}, err
379 }
380
381 diff := description.DiffTopology(prev, current)
382 t.serversLock.Lock()
383 if t.serversClosed {
384 t.serversLock.Unlock()
385 return description.Topology{}, nil
386 }
387
388 for _, removed := range diff.Removed {
389 if s, ok := t.servers[removed.Addr]; ok {
390 t.removeServer(ctx, removed.Addr, s)
391 }
392 }
393
394 for _, added := range diff.Added {
395 _ = t.addServer(ctx, added.Addr)
396 }
397 t.serversLock.Unlock()
398 return current, nil
399}
400
401func (t *Topology) addServer(ctx context.Context, addr address.Address) error {
402 if _, ok := t.servers[addr]; ok {
403 return nil
404 }
405
406 svr, err := ConnectServer(ctx, addr, t.cfg.serverOpts...)
407 if err != nil {
408 return err
409 }
410
411 t.servers[addr] = svr
412 var sub *ServerSubscription
413 sub, err = svr.Subscribe()
414 if err != nil {
415 return err
416 }
417
418 t.wg.Add(1)
419 go func() {
420 for c := range sub.C {
421 t.changes <- c
422 }
423
424 t.wg.Done()
425 }()
426
427 return nil
428}
429
430func (t *Topology) removeServer(ctx context.Context, addr address.Address, server *Server) {
431 _ = server.Disconnect(ctx)
432 delete(t.servers, addr)
433}
434
435// String implements the Stringer interface
436func (t *Topology) String() string {
437 desc := t.Description()
438 str := fmt.Sprintf("Type: %s\nServers:\n", desc.Kind)
439 for _, s := range t.servers {
440 str += s.String() + "\n"
441 }
442 return str
443}
444
// Subscription is a subscription to updates to the description of the Topology that created this
// Subscription.
type Subscription struct {
	// C receives topology descriptions. It is closed by Unsubscribe or when
	// the topology closes its subscriptions.
	C <-chan description.Topology
	// t is the Topology that created this Subscription.
	t *Topology
	// id is the key under which the channel is registered in t.subscribers.
	id uint64
}
452
453// Unsubscribe unsubscribes this Subscription from updates and closes the
454// subscription channel.
455func (s *Subscription) Unsubscribe() error {
456 s.t.subLock.Lock()
457 defer s.t.subLock.Unlock()
458 if s.t.subscriptionsClosed {
459 return nil
460 }
461
462 ch, ok := s.t.subscribers[s.id]
463 if !ok {
464 return nil
465 }
466
467 close(ch)
468 delete(s.t.subscribers, s.id)
469
470 return nil
471}