blob: 09a319cc4443bf40af5cc63f9dc12bc384de9951 [file] [log] [blame]
// Copyright (C) MongoDB, Inc. 2017-present.
// Licensed under the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at
// Package topology contains types that handle the discovery, monitoring, and selection
// of servers. This package is designed to expose enough inner workings of service discovery
// and monitoring to allow low level applications to have fine grained control, while hiding
// most of the detailed implementation of the algorithms.
package topology
import (
// Sentinel errors returned by this package.
var (
	// ErrSubscribeAfterClosed is returned when a user attempts to subscribe
	// to a closed Server or Topology.
	ErrSubscribeAfterClosed = errors.New("cannot subscribe after close")

	// ErrTopologyClosed is returned when a user attempts to call a method on
	// a closed Topology.
	ErrTopologyClosed = errors.New("topology is closed")

	// ErrTopologyConnected is returned when a user attempts to connect to an
	// already connected Topology.
	ErrTopologyConnected = errors.New("topology is connected or connecting")

	// ErrServerSelectionTimeout is returned from server selection when the
	// server selection process took longer than allowed by the timeout.
	ErrServerSelectionTimeout = errors.New("server selection timeout")
)
// MonitorMode represents the way in which a server is monitored. It is stored
// as a uint8.
type MonitorMode uint8
// These constants are the available monitoring modes.
//
// NOTE(review): only AutomaticMode is visible in this excerpt. New also
// compares against SingleMode, so the remainder of this const block appears to
// be missing here — confirm against the full file.
const (
// AutomaticMode is the first constant in the block (iota value 0) and is
// therefore the zero value of MonitorMode.
AutomaticMode MonitorMode = iota
// Topology represents a MongoDB deployment.
//
// It keeps one *Server per address, feeds the server descriptions they publish
// through fsm to produce a topology description, and sends each new
// description to every subscriber.
type Topology struct {
registry *bsoncodec.Registry
// connectionstate is accessed with sync/atomic and takes the values
// disconnected, connecting, connected and disconnecting.
connectionstate int32
cfg *config
desc atomic.Value // holds a description.Topology
// done is sent on by Disconnect and received by the update loop.
done chan struct{}
fsm *fsm
// changes carries server descriptions from the per-server goroutines started
// in addServer to the update loop.
changes chan description.Server
// changeswg is marked Done when the update loop exits.
changeswg sync.WaitGroup
// SessionPool is created in Connect from a subscription to this topology.
SessionPool *session.Pool
// This should really be encapsulated into its own type. This will likely
// require a redesign so we can share a minimum of data between the
// subscribers and the topology.
subscribers map[uint64]chan description.Topology
currentSubscriberID uint64
subscriptionsClosed bool
subLock sync.Mutex
// We should redesign how we connect and handle individual servers. This is
// too difficult to maintain and it's rather easy to accidentally access
// the servers without acquiring the lock or checking if the servers are
// closed. This lock should also be an RWMutex.
serversLock sync.Mutex
serversClosed bool
servers map[address.Address]*Server
wg sync.WaitGroup
// New creates a new topology from the supplied options.
//
// If newConfig fails, its error is returned unchanged and the Topology is nil.
// New only builds the in-memory state; no servers are added and no goroutines
// are started until Connect is called.
func New(opts ...Option) (*Topology, error) {
cfg, err := newConfig(opts...)
if err != nil {
return nil, err
t := &Topology{
cfg: cfg,
// done and changes are unbuffered, so sends on them block until the
// update loop receives.
done: make(chan struct{}),
fsm: newFSM(),
changes: make(chan description.Server),
subscribers: make(map[uint64]chan description.Topology),
servers: make(map[address.Address]*Server),
// A configured replica set name seeds the state machine with that name and
// the ReplicaSetNoPrimary kind.
if cfg.replicaSetName != "" {
t.fsm.SetName = cfg.replicaSetName
t.fsm.Kind = description.ReplicaSetNoPrimary
// Single mode sets the state machine's kind to Single.
if cfg.mode == SingleMode {
t.fsm.Kind = description.Single
return t, nil
// Connect initializes a Topology and starts the monitoring process. This function
// must be called to properly monitor the topology.
//
// It returns ErrTopologyConnected unless the topology is currently
// disconnected. Otherwise it adds a server for every seed address, starts the
// update loop, marks the topology connected and creates SessionPool from a
// fresh subscription. The returned error is the one from that Subscribe call.
func (t *Topology) Connect(ctx context.Context) error {
// Only one caller can move the state from disconnected to connecting.
if !atomic.CompareAndSwapInt32(&t.connectionstate, disconnected, connecting) {
return ErrTopologyConnected
var err error
for _, a := range t.cfg.seedList {
addr := address.Address(a).Canonicalize()
t.fsm.Servers = append(t.fsm.Servers, description.Server{Addr: addr})
// NOTE(review): this error is overwritten by later iterations and by the
// Subscribe call below, so addServer failures do not appear to reach the
// caller. No serversLock acquisition is visible around this loop either —
// confirm against the full file.
err = t.addServer(ctx, addr)
// Start the loop that consumes t.changes and publishes descriptions.
go t.update()
t.subscriptionsClosed = false // explicitly set in case topology was disconnected and then reconnected
// The state must be connected before subscribing: Subscribe rejects callers
// in any other state.
atomic.StoreInt32(&t.connectionstate, connected)
// After connection, make a subscription to keep the pool updated
sub, err := t.Subscribe()
// NOTE(review): sub is dereferenced before err is checked; Subscribe returns
// a nil *Subscription on failure, which would panic here.
t.SessionPool = session.NewPool(sub.C)
return err
// Disconnect closes the topology. It disconnects every tracked server and
// signals the update loop through t.done, which then drops all subscriptions.
//
// It returns ErrTopologyClosed unless the topology is currently connected.
// Once that state check passes it always returns nil; errors from
// disconnecting individual servers are discarded by removeServer.
func (t *Topology) Disconnect(ctx context.Context) error {
// Only one caller can move the state from connected to disconnecting.
if !atomic.CompareAndSwapInt32(&t.connectionstate, connected, disconnecting) {
return ErrTopologyClosed
// apply checks this flag and stops adding or removing servers once it is set.
// NOTE(review): no serversLock acquisition is visible around the writes to
// serversClosed and servers in this excerpt — confirm against the full file.
t.serversClosed = true
for addr, server := range t.servers {
t.removeServer(ctx, addr, server)
// done is unbuffered, so this blocks until the update loop receives it.
t.done <- struct{}{}
atomic.StoreInt32(&t.connectionstate, disconnected)
return nil
// Description returns a description of the topology.
//
// The value is read from t.desc with an atomic load. If t.desc does not hold a
// description.Topology (for example because nothing has been stored yet), the
// zero description.Topology is returned.
func (t *Topology) Description() description.Topology {
td, ok := t.desc.Load().(description.Topology)
if !ok {
td = description.Topology{}
return td
// Subscribe returns a Subscription on which all updated description.Topology
// values will be sent. The channel of the subscription will have a buffer size
// of one, and will be pre-populated with the current description.Topology.
//
// It returns an error if the topology is not in the connected state, and
// ErrSubscribeAfterClosed if subscriptions have been closed.
func (t *Topology) Subscribe() (*Subscription, error) {
if atomic.LoadInt32(&t.connectionstate) != connected {
return nil, errors.New("cannot subscribe to Topology that is not connected")
ch := make(chan description.Topology, 1)
// Fall back to the zero description when nothing usable is stored in t.desc.
td, ok := t.desc.Load().(description.Topology)
if !ok {
td = description.Topology{}
// Cannot block: ch was just created with capacity one and is empty.
ch <- td
// NOTE(review): the Lock call matching this deferred Unlock is not visible in
// this excerpt — confirm against the full file.
defer t.subLock.Unlock()
if t.subscriptionsClosed {
return nil, ErrSubscribeAfterClosed
// NOTE(review): no increment of currentSubscriberID is visible here. If the
// counter is never advanced, successive subscriptions would reuse the same
// key and overwrite each other in t.subscribers — confirm against the full
// file.
id := t.currentSubscriberID
t.subscribers[id] = ch
return &Subscription{
C: ch,
t: t,
id: id,
}, nil
// RequestImmediateCheck will send heartbeats to all the servers in the
// topology right away, instead of waiting for the heartbeat timeout.
//
// NOTE(review): only the connected-state guard and the loop header over
// t.servers are visible in this excerpt. The guard's action, the per-server
// call and any serversLock acquisition are missing — confirm against the full
// file.
func (t *Topology) RequestImmediateCheck() {
if atomic.LoadInt32(&t.connectionstate) != connected {
for _, server := range t.servers {
// SupportsSessions returns true if the topology supports sessions, that is,
// when the current description has a non-zero SessionTimeoutMinutes and its
// Kind is not description.Single.
//
// Description is called once per condition, so the two checks are evaluated
// against separate loads of the stored description.
func (t *Topology) SupportsSessions() bool {
return t.Description().SessionTimeoutMinutes != 0 && t.Description().Kind != description.Single
// SelectServer selects a server given a selector. SelectServer complies with the
// server selection spec, and will time out after serverSelectionTimeout or when the
// parent context is done.
//
// It returns ErrTopologyClosed if the topology is not connected, and otherwise
// propagates errors from Subscribe, selectServer and FindServer. Among the
// suitable server descriptions one is picked at random.
func (t *Topology) SelectServer(ctx context.Context, ss description.ServerSelector) (*SelectedServer, error) {
if atomic.LoadInt32(&t.connectionstate) != connected {
return nil, ErrTopologyClosed
// ssTimeoutCh stays nil when no positive timeout is configured; receiving
// from a nil channel never proceeds, so the timeout case is then inert.
var ssTimeoutCh <-chan time.Time
if t.cfg.serverSelectionTimeout > 0 {
ssTimeout := time.NewTimer(t.cfg.serverSelectionTimeout)
ssTimeoutCh = ssTimeout.C
defer ssTimeout.Stop()
// A private subscription supplies the current description and every later
// update for the duration of this call.
sub, err := t.Subscribe()
if err != nil {
return nil, err
defer sub.Unsubscribe()
for {
suitable, err := t.selectServer(ctx, sub.C, ss, ssTimeoutCh)
if err != nil {
return nil, err
// selectServer only returns without error when suitable is non-empty, so
// rand.Intn is never called with zero here.
selected := suitable[rand.Intn(len(suitable))]
selectedS, err := t.FindServer(selected)
switch {
case err != nil:
return nil, err
case selectedS != nil:
return selectedS, nil
// We don't have an actual server for the provided description.
// This could happen for a number of reasons, including that the
// server has since stopped being a part of this topology, or that
// the server selector returned no suitable servers.
// FindServer will attempt to find a server that fits the given server description.
// This method will return nil, nil if a matching server could not be found.
//
// Servers are matched by address only (selected.Addr). It returns
// ErrTopologyClosed if the topology is not connected. The Kind of the returned
// SelectedServer is taken from the topology description at the time of the
// call.
func (t *Topology) FindServer(selected description.Server) (*SelectedServer, error) {
if atomic.LoadInt32(&t.connectionstate) != connected {
return nil, ErrTopologyClosed
// NOTE(review): the Lock call matching this deferred Unlock is not visible in
// this excerpt — confirm against the full file.
defer t.serversLock.Unlock()
server, ok := t.servers[selected.Addr]
if !ok {
return nil, nil
desc := t.Description()
return &SelectedServer{
Server: server,
Kind: desc.Kind,
}, nil
// wrapServerSelectionError builds a new error whose message contains err's
// text followed by the topology's String output.
//
// err is formatted with %v, so the result carries only its message: the
// returned error is a different value from err and does not wrap it.
func wrapServerSelectionError(err error, t *Topology) error {
return fmt.Errorf("server selection error: %v\ncurrent topology: %s", err, t.String())
// selectServer is the core piece of server selection. It handles getting
// topology descriptions and running server selection on those descriptions.
//
// It blocks until ctx is done (returning ctx.Err()), timeoutCh fires
// (returning a wrapped ErrServerSelectionTimeout), the selector returns an
// error (returned wrapped), or the selector returns at least one server for a
// description received on subscriptionCh. An empty selection makes it wait for
// the next description.
func (t *Topology) selectServer(ctx context.Context, subscriptionCh <-chan description.Topology, ss description.ServerSelector, timeoutCh <-chan time.Time) ([]description.Server, error) {
var current description.Topology
for {
select {
case <-ctx.Done():
return nil, ctx.Err()
case <-timeoutCh:
return nil, wrapServerSelectionError(ErrServerSelectionTimeout, t)
case current = <-subscriptionCh:
// Servers of Unknown kind are never offered to the selector.
var allowed []description.Server
for _, s := range current.Servers {
if s.Kind != description.Unknown {
allowed = append(allowed, s)
suitable, err := ss.SelectServer(current, allowed)
if err != nil {
return nil, wrapServerSelectionError(err, t)
if len(suitable) > 0 {
return suitable, nil
// update is the topology's event loop, started as a goroutine by Connect.
//
// For every server description received on t.changes it calls apply and sends
// the resulting topology description to each subscriber. When t.done is
// signalled it removes every subscriber and marks subscriptions closed.
//
// NOTE(review): several statements are not visible in this excerpt — the body
// of the recover handler, the error branch after apply, the remainder of the
// drain select, whatever is done with ch in the done case, any locking around
// t.subscribers, and the changeswg.Add matching the deferred Done — confirm
// against the full file.
func (t *Topology) update() {
defer t.changeswg.Done()
defer func() {
// ¯\_(ツ)_/¯
if r := recover(); r != nil {
for {
select {
case change := <-t.changes:
current, err := t.apply(context.TODO(), change)
if err != nil {
for _, ch := range t.subscribers {
// We drain the description if there's one in the channel
select {
case <-ch:
// Subscriber channels are created with capacity one, so after the drain
// above this send stores the newest description in the buffer.
ch <- current
case <-t.done:
for id, ch := range t.subscribers {
delete(t.subscribers, id)
t.subscriptionsClosed = true
// apply feeds one server description into the state machine and reconciles the
// set of tracked servers with the resulting topology description.
//
// It returns the new description, or a zero description plus the error if
// fsm.apply fails. Servers listed as removed in the diff are disconnected and
// dropped; servers listed as added are connected via addServer.
func (t *Topology) apply(ctx context.Context, desc description.Server) (description.Topology, error) {
var err error
// Capture the description before the state machine advances so it can be
// diffed against the new one.
prev := t.fsm.Topology
current, err := t.fsm.apply(desc)
if err != nil {
return description.Topology{}, err
diff := description.DiffTopology(prev, current)
// Disconnect sets serversClosed; after that no servers are added or removed.
// NOTE(review): this path returns an empty description with a nil error, and
// no serversLock acquisition is visible around the reads and writes of
// t.servers in this function — confirm against the full file.
if t.serversClosed {
return description.Topology{}, nil
for _, removed := range diff.Removed {
if s, ok := t.servers[removed.Addr]; ok {
t.removeServer(ctx, removed.Addr, s)
for _, added := range diff.Added {
// The addServer error is deliberately discarded here.
_ = t.addServer(ctx, added.Addr)
return current, nil
// addServer connects to addr, records the resulting Server in t.servers and
// starts a goroutine that forwards every description the server publishes to
// t.changes.
//
// It is a no-op returning nil when addr is already tracked. Errors from
// ConnectServer and from the server's Subscribe are returned as-is.
func (t *Topology) addServer(ctx context.Context, addr address.Address) error {
if _, ok := t.servers[addr]; ok {
return nil
svr, err := ConnectServer(ctx, addr, t.cfg.serverOpts...)
if err != nil {
return err
// The server is recorded before subscribing, so it stays in t.servers even if
// Subscribe fails below.
t.servers[addr] = svr
var sub *ServerSubscription
sub, err = svr.Subscribe()
if err != nil {
return err
// The forwarding loop ends when sub.C is closed. t.changes is unbuffered, so
// each send blocks until the update loop receives it.
go func() {
for c := range sub.C {
t.changes <- c
return nil
// removeServer disconnects server and deletes addr from t.servers. The error
// returned by server.Disconnect is discarded.
//
// This function takes no lock itself.
func (t *Topology) removeServer(ctx context.Context, addr address.Address, server *Server) {
_ = server.Disconnect(ctx)
delete(t.servers, addr)
// String implements the Stringer interface. The output is the topology kind
// followed by one line per tracked server.
//
// Servers are listed by ranging over the t.servers map, so their order is not
// stable between calls.
// NOTE(review): no serversLock acquisition is visible around the map iteration
// in this excerpt — confirm against the full file.
func (t *Topology) String() string {
desc := t.Description()
str := fmt.Sprintf("Type: %s\nServers:\n", desc.Kind)
for _, s := range t.servers {
str += s.String() + "\n"
return str
// Subscription is a subscription to updates to the description of the Topology that created this
// Subscription.
type Subscription struct {
// C is the receive-only end of the channel on which descriptions are
// delivered.
C <-chan description.Topology
// t is the Topology that created this Subscription.
t *Topology
// id is the key under which the channel is registered in t.subscribers.
id uint64
// Unsubscribe unsubscribes this Subscription from updates and closes the
// subscription channel.
//
// It is a no-op when subscriptions have already been closed or when this
// subscription is no longer registered. Every visible path returns nil.
//
// NOTE(review): this excerpt is missing the Lock call matching the deferred
// Unlock, the key expression inside s.t.subscribers[] (presumably s.id), and
// the statements that close the channel and delete the map entry — confirm
// against the full file.
func (s *Subscription) Unsubscribe() error {
defer s.t.subLock.Unlock()
if s.t.subscriptionsClosed {
return nil
ch, ok := s.t.subscribers[]
if !ok {
return nil
return nil