blob: 09a319cc4443bf40af5cc63f9dc12bc384de9951 [file] [log] [blame]
// Copyright (C) MongoDB, Inc. 2017-present.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
// Package topology contains types that handle the discovery, monitoring, and selection
// of servers. This package is designed to expose enough inner workings of service discovery
// and monitoring to allow low level applications to have fine grained control, while hiding
// most of the detailed implementation of the algorithms.
package topology
import (
"context"
"errors"
"math/rand"
"sync"
"sync/atomic"
"time"
"fmt"
"github.com/mongodb/mongo-go-driver/bson/bsoncodec"
"github.com/mongodb/mongo-go-driver/x/mongo/driver/session"
"github.com/mongodb/mongo-go-driver/x/network/address"
"github.com/mongodb/mongo-go-driver/x/network/description"
)
// Sentinel errors returned by this package.
var (
	// ErrSubscribeAfterClosed is returned when a user attempts to subscribe to a
	// closed Server or Topology.
	ErrSubscribeAfterClosed = errors.New("cannot subscribe after close")

	// ErrTopologyClosed is returned when a user attempts to call a method on a
	// closed Topology.
	ErrTopologyClosed = errors.New("topology is closed")

	// ErrTopologyConnected is returned when a user attempts to connect to an
	// already connected Topology.
	ErrTopologyConnected = errors.New("topology is connected or connecting")

	// ErrServerSelectionTimeout is returned from server selection when the
	// server selection process took longer than allowed by the timeout.
	ErrServerSelectionTimeout = errors.New("server selection timeout")
)
// MonitorMode represents the way in which a server is monitored.
type MonitorMode uint8

// These constants are the available monitoring modes.
const (
	// AutomaticMode is the zero value of MonitorMode.
	AutomaticMode MonitorMode = iota
	// SingleMode makes New initialize the topology kind to description.Single.
	SingleMode
)
// Topology represents a MongoDB deployment.
//
// A Topology is created with New, started with Connect, and stopped with
// Disconnect. While connected, a background goroutine (update) consumes
// server description changes, applies them to the state machine, and
// publishes the resulting description.Topology to all subscribers.
type Topology struct {
	registry *bsoncodec.Registry

	// connectionstate is accessed with sync/atomic and moves between the
	// disconnected, connecting, connected and disconnecting states.
	connectionstate int32

	cfg *config

	desc atomic.Value // holds a description.Topology

	// done is signalled by Disconnect to stop the update goroutine.
	done chan struct{}

	// fsm is the state machine that server descriptions are applied to.
	fsm *fsm

	// changes carries server descriptions from the per-server forwarding
	// goroutines started by addServer to the update goroutine.
	changes chan description.Server

	// changeswg tracks the update goroutine; Disconnect waits on it.
	changeswg sync.WaitGroup

	// SessionPool is created by Connect from a subscription to this topology.
	SessionPool *session.Pool

	// This should really be encapsulated into its own type. This will likely
	// require a redesign so we can share a minimum of data between the
	// subscribers and the topology.
	//
	// The subscriber fields below are guarded by subLock.
	subscribers         map[uint64]chan description.Topology
	currentSubscriberID uint64
	subscriptionsClosed bool
	subLock             sync.Mutex

	// We should redesign how we connect and handle individual servers. This is
	// too difficult to maintain and it's rather easy to accidentally access
	// the servers without acquiring the lock or checking if the servers are
	// closed. This lock should also be an RWMutex.
	//
	// serversClosed and servers are guarded by serversLock.
	serversLock   sync.Mutex
	serversClosed bool
	servers       map[address.Address]*Server

	// wg tracks the per-server forwarding goroutines started by addServer.
	wg sync.WaitGroup
}
// New builds a Topology from the given options. The returned Topology is not
// yet monitoring anything; call Connect to start it.
//
// When a replica set name is configured, the state machine starts out as
// ReplicaSetNoPrimary with that set name. SingleMode takes precedence and
// forces the kind to Single. An error is returned only when the options
// cannot be turned into a valid configuration.
func New(opts ...Option) (*Topology, error) {
	cfg, err := newConfig(opts...)
	if err != nil {
		return nil, err
	}

	topo := new(Topology)
	topo.cfg = cfg
	topo.done = make(chan struct{})
	topo.fsm = newFSM()
	topo.changes = make(chan description.Server)
	topo.subscribers = make(map[uint64]chan description.Topology)
	topo.servers = make(map[address.Address]*Server)
	topo.desc.Store(description.Topology{})

	if name := cfg.replicaSetName; name != "" {
		topo.fsm.SetName = name
		topo.fsm.Kind = description.ReplicaSetNoPrimary
	}

	// Applied last so that SingleMode overrides the replica set kind above.
	if cfg.mode == SingleMode {
		topo.fsm.Kind = description.Single
	}

	return topo, nil
}
// Connect initializes a Topology and starts the monitoring process. This function
// must be called to properly monitor the topology.
//
// It returns ErrTopologyConnected if the topology is not currently
// disconnected. Otherwise it connects to every seed in the configuration,
// starts the background update goroutine, marks the topology connected and
// creates the SessionPool from a fresh subscription. If that subscription
// cannot be created its error is returned and SessionPool is left untouched.
func (t *Topology) Connect(ctx context.Context) error {
	if !atomic.CompareAndSwapInt32(&t.connectionstate, disconnected, connecting) {
		return ErrTopologyConnected
	}

	t.desc.Store(description.Topology{})

	t.serversLock.Lock()
	// Disconnect leaves serversClosed set. Clear it so that apply keeps
	// adding and removing servers after a disconnect/reconnect cycle.
	t.serversClosed = false
	for _, a := range t.cfg.seedList {
		addr := address.Address(a).Canonicalize()
		t.fsm.Servers = append(t.fsm.Servers, description.Server{Addr: addr})
		// Best effort, mirroring apply: a seed that cannot be connected does
		// not prevent the remaining seeds from being monitored.
		_ = t.addServer(ctx, addr)
	}
	t.serversLock.Unlock()

	// Explicitly reset in case the topology was disconnected and then
	// reconnected. Every other access to this flag holds subLock.
	t.subLock.Lock()
	t.subscriptionsClosed = false
	t.subLock.Unlock()

	// The WaitGroup counter must be incremented before the goroutine that
	// calls Done is started.
	t.changeswg.Add(1)
	go t.update()

	atomic.StoreInt32(&t.connectionstate, connected)

	// After connection, make a subscription to keep the pool updated.
	sub, err := t.Subscribe()
	if err != nil {
		// sub is nil here; dereferencing it would panic.
		return err
	}
	t.SessionPool = session.NewPool(sub.C)
	return nil
}
// Disconnect shuts the topology down: every monitored server is disconnected
// and forgotten, the update goroutine is stopped (which closes all open
// subscriptions), and the stored description is reset.
//
// ErrTopologyClosed is returned when the topology is not in the connected
// state. Errors from disconnecting individual servers are discarded.
func (t *Topology) Disconnect(ctx context.Context) error {
	swapped := atomic.CompareAndSwapInt32(&t.connectionstate, connected, disconnecting)
	if !swapped {
		return ErrTopologyClosed
	}

	t.serversLock.Lock()
	t.serversClosed = true
	for a, srv := range t.servers {
		_ = srv.Disconnect(ctx)
		delete(t.servers, a)
	}
	t.serversLock.Unlock()

	// Let the per-server forwarding goroutines drain before stopping the
	// update goroutine that receives from them.
	t.wg.Wait()
	t.done <- struct{}{}
	t.changeswg.Wait()

	t.desc.Store(description.Topology{})
	atomic.StoreInt32(&t.connectionstate, disconnected)
	return nil
}
// Description returns the most recently stored description of the topology,
// or the zero description.Topology if none has been stored yet.
func (t *Topology) Description() description.Topology {
	if td, ok := t.desc.Load().(description.Topology); ok {
		return td
	}
	return description.Topology{}
}
// Subscribe registers a new Subscription that receives every updated
// description.Topology. The subscription channel has a buffer size of one and
// is pre-populated with the current description.
//
// An error is returned when the topology is not connected, and
// ErrSubscribeAfterClosed when subscriptions have already been closed.
func (t *Topology) Subscribe() (*Subscription, error) {
	if state := atomic.LoadInt32(&t.connectionstate); state != connected {
		return nil, errors.New("cannot subscribe to Topology that is not connected")
	}

	updates := make(chan description.Topology, 1)
	updates <- t.Description()

	t.subLock.Lock()
	defer t.subLock.Unlock()

	if t.subscriptionsClosed {
		return nil, ErrSubscribeAfterClosed
	}

	sub := &Subscription{
		C:  updates,
		t:  t,
		id: t.currentSubscriberID,
	}
	t.subscribers[sub.id] = updates
	t.currentSubscriberID++
	return sub, nil
}
// RequestImmediateCheck asks every server currently in the topology to send a
// heartbeat right away instead of waiting for the heartbeat timeout. It does
// nothing when the topology is not connected.
func (t *Topology) RequestImmediateCheck() {
	state := atomic.LoadInt32(&t.connectionstate)
	if state != connected {
		return
	}

	t.serversLock.Lock()
	for _, srv := range t.servers {
		srv.RequestImmediateCheck()
	}
	t.serversLock.Unlock()
}
// SupportsSessions returns true if the topology supports sessions, that is,
// when the current description has a non-zero SessionTimeoutMinutes and its
// kind is not Single.
func (t *Topology) SupportsSessions() bool {
	// Load the description once so both conditions are evaluated against the
	// same snapshot; the update goroutine may store a new one at any time.
	desc := t.Description()
	return desc.SessionTimeoutMinutes != 0 && desc.Kind != description.Single
}
// SelectServer selects a server given a selector. SelectServer complies with
// the server selection spec, and will time out after serverSelectionTimeout or
// when the parent context is done.
//
// One of the suitable server descriptions is picked at random and resolved to
// a live server. If no live server matches the pick, selection starts over.
// ErrTopologyClosed is returned when the topology is not connected.
func (t *Topology) SelectServer(ctx context.Context, ss description.ServerSelector) (*SelectedServer, error) {
	if atomic.LoadInt32(&t.connectionstate) != connected {
		return nil, ErrTopologyClosed
	}

	// A nil channel never fires, which disables the timeout when none is
	// configured.
	var deadline <-chan time.Time
	if timeout := t.cfg.serverSelectionTimeout; timeout > 0 {
		timer := time.NewTimer(timeout)
		deadline = timer.C
		defer timer.Stop()
	}

	sub, err := t.Subscribe()
	if err != nil {
		return nil, err
	}
	defer sub.Unsubscribe()

	for {
		candidates, err := t.selectServer(ctx, sub.C, ss, deadline)
		if err != nil {
			return nil, err
		}

		pick := candidates[rand.Intn(len(candidates))]
		found, err := t.FindServer(pick)
		if err != nil {
			return nil, err
		}
		if found != nil {
			return found, nil
		}

		// We don't have an actual server for the picked description. This
		// could happen for a number of reasons, including that the server has
		// since stopped being a part of this topology. Try again.
	}
}
// FindServer looks up the live server whose address matches the given server
// description. It returns nil, nil when no such server is tracked, and
// ErrTopologyClosed when the topology is not connected.
func (t *Topology) FindServer(selected description.Server) (*SelectedServer, error) {
	if atomic.LoadInt32(&t.connectionstate) != connected {
		return nil, ErrTopologyClosed
	}

	t.serversLock.Lock()
	defer t.serversLock.Unlock()

	if srv, found := t.servers[selected.Addr]; found {
		return &SelectedServer{
			Server: srv,
			Kind:   t.Description().Kind,
		}, nil
	}
	return nil, nil
}
// wrapServerSelectionError builds a server selection error whose message
// contains err followed by the string form of the current topology.
func wrapServerSelectionError(err error, t *Topology) error {
	const format = "server selection error: %v\ncurrent topology: %s"
	return fmt.Errorf(format, err, t.String())
}
// selectServer is the core piece of server selection. It handles getting
// topology descriptions and running server selection on those descriptions.
//
// It blocks until a description received on subscriptionCh yields at least one
// suitable server, and returns those servers. Servers of Unknown kind are
// never offered to the selector. Each time a description yields no suitable
// server an immediate check of all servers is requested.
//
// It returns ctx.Err() when ctx is done, a wrapped ErrServerSelectionTimeout
// when timeoutCh fires, a wrapped selector error when ss fails, and
// ErrTopologyClosed when subscriptionCh has been closed.
func (t *Topology) selectServer(ctx context.Context, subscriptionCh <-chan description.Topology, ss description.ServerSelector, timeoutCh <-chan time.Time) ([]description.Server, error) {
	var current description.Topology
	for {
		var open bool
		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-timeoutCh:
			return nil, wrapServerSelectionError(ErrServerSelectionTimeout, t)
		case current, open = <-subscriptionCh:
			if !open {
				// A closed channel yields zero values forever. Without this
				// check the loop would spin on empty descriptions until the
				// timeout or context fires, or indefinitely if neither is set.
				return nil, ErrTopologyClosed
			}
		}

		var allowed []description.Server
		for _, s := range current.Servers {
			if s.Kind != description.Unknown {
				allowed = append(allowed, s)
			}
		}

		suitable, err := ss.SelectServer(current, allowed)
		if err != nil {
			return nil, wrapServerSelectionError(err, t)
		}

		if len(suitable) > 0 {
			return suitable, nil
		}

		t.RequestImmediateCheck()
	}
}
// update is the body of the background goroutine started by Connect. It
// applies every server description received on t.changes, stores the
// resulting topology description and hands it to all subscribers. It exits
// when t.done is signalled, after closing and removing every subscriber
// channel and marking subscriptions closed.
func (t *Topology) update() {
	defer t.changeswg.Done()
	defer func() {
		// If the loop panicked, still consume the signal Disconnect sends on
		// t.done so that Disconnect does not block forever.
		if recovered := recover(); recovered != nil {
			<-t.done
		}
	}()

	for {
		select {
		case <-t.done:
			t.subLock.Lock()
			for id, sub := range t.subscribers {
				close(sub)
				delete(t.subscribers, id)
			}
			t.subscriptionsClosed = true
			t.subLock.Unlock()
			return

		case change := <-t.changes:
			latest, err := t.apply(context.TODO(), change)
			if err != nil {
				continue
			}
			t.desc.Store(latest)

			t.subLock.Lock()
			for _, sub := range t.subscribers {
				// Discard a description the subscriber has not read yet so
				// the send below always finds room in the one-slot buffer.
				select {
				case <-sub:
				default:
				}
				sub <- latest
			}
			t.subLock.Unlock()
		}
	}
}
// apply feeds a server description into the state machine and reconciles the
// set of live servers with the outcome: servers that left the topology are
// disconnected and forgotten, newly discovered ones are connected.
//
// It returns the new topology description. If the servers have already been
// closed, the zero description and a nil error are returned and no server is
// added or removed. Errors from adding servers are discarded.
func (t *Topology) apply(ctx context.Context, desc description.Server) (description.Topology, error) {
	before := t.fsm.Topology
	after, err := t.fsm.apply(desc)
	if err != nil {
		return description.Topology{}, err
	}

	changes := description.DiffTopology(before, after)

	t.serversLock.Lock()
	if t.serversClosed {
		t.serversLock.Unlock()
		return description.Topology{}, nil
	}

	for _, gone := range changes.Removed {
		srv, tracked := t.servers[gone.Addr]
		if !tracked {
			continue
		}
		t.removeServer(ctx, gone.Addr, srv)
	}

	for _, fresh := range changes.Added {
		_ = t.addServer(ctx, fresh.Addr)
	}
	t.serversLock.Unlock()

	return after, nil
}
// addServer connects to the server at addr, registers it in t.servers and
// starts a goroutine that forwards its description updates to t.changes. It
// is a no-op when addr is already tracked.
//
// The callers in this file hold t.serversLock while calling it. An error is
// returned when the server cannot be connected or subscribed to; in both
// cases nothing is registered.
func (t *Topology) addServer(ctx context.Context, addr address.Address) error {
	if _, ok := t.servers[addr]; ok {
		return nil
	}

	svr, err := ConnectServer(ctx, addr, t.cfg.serverOpts...)
	if err != nil {
		return err
	}

	sub, err := svr.Subscribe()
	if err != nil {
		// Without a subscription nothing would ever forward this server's
		// updates, so tear it down instead of leaving it registered.
		_ = svr.Disconnect(ctx)
		return err
	}

	t.servers[addr] = svr

	t.wg.Add(1)
	go func() {
		defer t.wg.Done()
		for c := range sub.C {
			t.changes <- c
		}
	}()

	return nil
}
// removeServer disconnects the given server and deletes its entry from
// t.servers. The error returned by server.Disconnect is discarded. The
// callers in this file hold t.serversLock while calling it.
func (t *Topology) removeServer(ctx context.Context, addr address.Address, server *Server) {
	_ = server.Disconnect(ctx)
	delete(t.servers, addr)
}
// String implements the Stringer interface. The result contains the topology
// kind followed by one line per tracked server, in map iteration order.
//
// String acquires t.serversLock, so it must not be called while that lock is
// already held.
func (t *Topology) String() string {
	desc := t.Description()
	str := fmt.Sprintf("Type: %s\nServers:\n", desc.Kind)

	// t.servers is mutated by the update goroutine; reading it without the
	// lock is a concurrent map read and write.
	t.serversLock.Lock()
	defer t.serversLock.Unlock()
	for _, s := range t.servers {
		str += s.String() + "\n"
	}
	return str
}
// Subscription is a subscription to updates to the description of the Topology that created this
// Subscription.
type Subscription struct {
	// C receives the topology descriptions. It is closed by Unsubscribe and
	// when the topology's update goroutine shuts down.
	C <-chan description.Topology
	// t is the Topology that created this Subscription.
	t *Topology
	// id is the key of this subscription in the topology's subscribers map.
	id uint64
}
// Unsubscribe removes this Subscription from its Topology and closes the
// subscription channel. It is a no-op when the topology has already closed
// its subscriptions or when this subscription is no longer registered, so
// calling it more than once is safe. It always returns nil.
func (s *Subscription) Unsubscribe() error {
	topo := s.t

	topo.subLock.Lock()
	defer topo.subLock.Unlock()

	if !topo.subscriptionsClosed {
		if ch, registered := topo.subscribers[s.id]; registered {
			close(ch)
			delete(topo.subscribers, s.id)
		}
	}
	return nil
}