[bgpd] Peer delete can race with reconfig leading to crash

2007-02-22 Paul Jakma <paul.jakma@sun.com>

	* bgp_fsm.c: (bgp_fsm_change_status) Handle state change into
	  clearing or greater here. Simpler.
	  (bgp_event) Clearing state change work moved to previous function.
	* bgp_route.c: (bgp_clear_route_node) Clearing adj-in here
	  is too late, as it leaves a race between a peer being deleted
	  and an identical peer being configured before clearing
	  completes, leading to a crash.
	  Simplest fix is to clean peers Adj-in up-front, rather than
	  queueing such work.
	  (bgp_clear_route_table) Clear peer's Adj-In and Adj-Out
	  up-front here, rather than queueing such work.
	  Extensive comment added on the various bits of indexed data
	  that exist and how they need to be dealt with.
	  (bgp_clear_route) Update comment.
diff --git a/bgpd/ChangeLog b/bgpd/ChangeLog
index 257b0ee..7e964c9 100644
--- a/bgpd/ChangeLog
+++ b/bgpd/ChangeLog
@@ -1,3 +1,20 @@
+2007-02-22 Paul Jakma <paul.jakma@sun.com>
+
+	* bgp_fsm.c: (bgp_fsm_change_status) Handle state change into
+	  clearing or greater here. Simpler.
+	  (bgp_event) Clearing state change work moved to previous function.
+	* bgp_route.c: (bgp_clear_route_node) Clearing adj-in here
+	  is too late, as it leaves a race between a peer being deleted
+	  and an identical peer being configured before clearing
+	  completes, leading to a crash.
+	  Simplest fix is to clean peers Adj-in up-front, rather than
+	  queueing such work.
+	  (bgp_clear_route_table) Clear peer's Adj-In and Adj-Out
+	  up-front here, rather than queueing such work.
+	  Extensive comment added on the various bits of indexed data
+	  that exist and how they need to be dealt with.
+	  (bgp_clear_route) Update comment.
+
 2006-12-12 Andrew J. Schorr <ajschorr@alumni.princeton.edu>
 	
 	* bgp_nexthop.c: (bgp_connected_add, bgp_connected_delete)
diff --git a/bgpd/bgp_fsm.c b/bgpd/bgp_fsm.c
index d704c29..db7e69a 100644
--- a/bgpd/bgp_fsm.c
+++ b/bgpd/bgp_fsm.c
@@ -409,10 +409,16 @@
 {
   bgp_dump_state (peer, peer->status, status);
 
+  /* Transition into Clearing or Deleted must /always/ clear all routes..
+   * (and must do so before actually changing into Deleted..)
+   */
+  if (status >= Clearing)
+    bgp_clear_route_all (peer);
+  
   /* Preserve old status and change into new status. */
   peer->ostatus = peer->status;
   peer->status = status;
-
+  
   if (BGP_DEBUG (normal, NORMAL))
     zlog_debug ("%s went from %s to %s",
 		peer->host,
@@ -1089,13 +1095,7 @@
     {
       /* If status is changed. */
       if (next != peer->status)
-        {
-          /* Transition into Clearing must /always/ clear all routes.. */
-          if (next == Clearing)
-            bgp_clear_route_all (peer);
-          
-          bgp_fsm_change_status (peer, next);
-        }
+        bgp_fsm_change_status (peer, next);
       
       /* Make sure timer is set. */
       bgp_timer_set (peer);
diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c
index ce44842..c464cd0 100644
--- a/bgpd/bgp_route.c
+++ b/bgpd/bgp_route.c
@@ -2490,8 +2490,6 @@
 {
   struct bgp_node *rn = data;
   struct peer *peer = wq->spec.data;
-  struct bgp_adj_in *ain;
-  struct bgp_adj_out *aout;
   struct bgp_info *ri;
   afi_t afi = rn->table->afi;
   safi_t safi = rn->table->safi;
@@ -2511,20 +2509,6 @@
           bgp_rib_remove (rn, ri, peer, afi, safi);
         break;
       }
-  for (ain = rn->adj_in; ain; ain = ain->next)
-    if (ain->peer == peer)
-      {
-        bgp_adj_in_remove (rn, ain);
-        bgp_unlock_node (rn);
-        break;
-      }
-  for (aout = rn->adj_out; aout; aout = aout->next)
-    if (aout->peer == peer)
-      {
-        bgp_adj_out_remove (rn, aout, peer, afi, safi);
-        bgp_unlock_node (rn);
-        break;
-      }
   return WQ_SUCCESS;
 }
 
@@ -2577,6 +2561,7 @@
 {
   struct bgp_node *rn;
   
+  
   if (! table)
     table = (rsclient) ? rsclient->rib[afi][safi] : peer->bgp->rib[afi][safi];
   
@@ -2586,11 +2571,65 @@
   
   for (rn = bgp_table_top (table); rn; rn = bgp_route_next (rn))
     {
+      struct bgp_info *ri;
+      struct bgp_adj_in *ain;
+      struct bgp_adj_out *aout;
+      
       if (rn->info == NULL)
         continue;
-      
-      bgp_lock_node (rn); /* unlocked: bgp_clear_node_queue_del */
-      work_queue_add (peer->clear_node_queue, rn);
+
+      /* XXX:TODO: This is suboptimal, every non-empty route_node is
+       * queued for every clearing peer, regardless of whether it is
+       * relevant to the peer at hand.
+       *
+       * Overview: There are 3 different indices which need to be
+       * scrubbed, potentially, when a peer is removed:
+       *
+       * 1 peer's routes visible via the RIB (ie accepted routes)
+       * 2 peer's routes visible by the (optional) peer's adj-in index
+       * 3 other routes visible by the peer's adj-out index
+       *
+       * 3 there is no hurry in scrubbing, once the struct peer is
+       * removed from bgp->peer, we could just GC such deleted peer's
+       * adj-outs at our leisure.
+       *
+       * 1 and 2 must be 'scrubbed' in some way, at least made
+       * invisible via RIB index before peer session is allowed to be
+       * brought back up. So one needs to know when such a 'search' is
+       * complete.
+       *
+       * Ideally:
+       *
+       * - there'd be a single global queue or a single RIB walker
+       * - rather than tracking which route_nodes still need to be
+       *   examined on a peer basis, we'd track which peers still
+       *   aren't cleared
+       *
+       * Given that our per-peer prefix-counts now should be reliable,
+       * this may actually be achievable. It doesn't seem to be a huge
+       * problem at this time.
+       */
+      for (ri = rn->info; ri; ri = ri->next)
+        if (ri->peer == peer)
+          {
+            bgp_lock_node (rn); /* unlocked: bgp_clear_node_queue_del */
+            work_queue_add (peer->clear_node_queue, rn);
+          }
+
+      for (ain = rn->adj_in; ain; ain = ain->next)
+        if (ain->peer == peer)
+          {
+            bgp_adj_in_remove (rn, ain);
+            bgp_unlock_node (rn);
+            break;
+          }
+      for (aout = rn->adj_out; aout; aout = aout->next)
+        if (aout->peer == peer)
+          {
+            bgp_adj_out_remove (rn, aout, peer, afi, safi);
+            bgp_unlock_node (rn);
+            break;
+          }
     }
   return;
 }
@@ -2636,15 +2675,18 @@
     }
   
   /* If no routes were cleared, nothing was added to workqueue, the
-   * completion function won't be run by workqueue code - call it here.
+   * completion function won't be run by workqueue code - call it here. 
+   * XXX: Actually, this assumption doesn't hold, see
+   * bgp_clear_route_table(), we queue all non-empty nodes.
    *
    * Additionally, there is a presumption in FSM that clearing is only
-   * needed if peer state is Established - peers in pre-Established states
-   * shouldn't have any route-update state associated with them (in or out).
+   * really needed if peer state is Established - peers in
+   * pre-Established states shouldn't have any route-update state
+   * associated with them (in or out).
    *
-   * We still get here from FSM through bgp_stop->clear_route_all in
-   * pre-Established though, so this is a useful sanity check to ensure
-   * the assumption above holds.
+   * We still can get here in pre-Established though, through
+   * peer_delete -> bgp_fsm_change_status, so this is a useful sanity
+   * check to ensure the assumption above holds.
    *
    * At some future point, this check could be move to the top of the
    * function, and do a quick early-return when state is