[bgpd] Record afi/safi in bgp_table. Serialise peer clear with FSM.

2006-02-21 Paul Jakma <paul.jakma@sun.com>

	* bgpd.h: Move the clear_node_queue to be peer-specific.
	  Add a new peer status flag, PEER_STATUS_CLEARING.
	* bgp_table.h: (struct bgp_table) Add fields to record the afi
	  and safi of the table.
	  (bgp_table_init) Take the afi and safi of the table to create.
	* bgp_table.c: (bgp_table_init) Record the afi and safi.
	* bgp_nexthop.c: Update all calls to bgp_table_init.
	* bgp_vty.c: ditto.
	* bgpd.c: ditto.
	* bgp_fsm.c: (bgp_timer_set) Don't bring up a session which is
	  still clearing.
	* bgp_route.c: (general) Update all bgp_table_init calls.
	  (bgp_process_{rsclient,main}) clear_node is now serialised
	  via PEER_STATUS_CLEARING and the FSM.
	  (struct bgp_clear_node_queue) Can be removed: struct bgp_node
	  can be the queue item data directly, as struct peer is kept
	  in the new work-queue global user data and afi/safi can be
	  retrieved via bgp_node -> bgp_table.
	  (bgp_clear_route_node) Get the peer via wq->spec.data and
	  afi/safi via bgp_node->bgp_table.
	  (bgp_clear_node_queue_del) No more item data to delete; only
	  unlock the bgp_node.
	  (bgp_clear_node_complete) Only need to unset the CLEARING flag
	  and unlock the struct peer.
	  (bgp_clear_node_queue_init) The queue attaches to struct peer
	  now; record the peer's name as the queue name.
	  (bgp_clear_route_table) If the queue transitions to active,
	  serialise clearing by setting PEER_STATUS_CLEARING rather
	  than plugging the process queues, and lock the peer while the
	  queue is active.
	  Update to pass only the bgp_node as per-queue-item data.
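
For illustration, a minimal standalone sketch of the table change above: a table now carries its afi/safi from creation, so consumers of a bgp_node can recover them from the node's table instead of carrying them in every queue item. The names here (table_init, the trimmed-down struct bgp_table) are stand-ins for this sketch, not the real bgpd definitions.

/* Standalone sketch, not bgpd code: the table records its own afi/safi. */
#include <stdio.h>
#include <stdlib.h>

typedef unsigned char afi_t;
typedef unsigned char safi_t;

#define AFI_IP       1
#define SAFI_UNICAST 1

struct bgp_table
{
  afi_t afi;    /* address family of this table */
  safi_t safi;  /* sub-address family of this table */
};

/* Mirrors the new bgp_table_init (afi_t, safi_t) signature. */
static struct bgp_table *
table_init (afi_t afi, safi_t safi)
{
  struct bgp_table *rt = calloc (1, sizeof (struct bgp_table));
  if (rt == NULL)
    abort ();
  rt->afi = afi;
  rt->safi = safi;
  return rt;
}

int
main (void)
{
  struct bgp_table *rib = table_init (AFI_IP, SAFI_UNICAST);

  /* Previously afi/safi had to be passed alongside every node;
   * now they travel with the table itself. */
  printf ("table afi=%u safi=%u\n", rib->afi, rib->safi);
  free (rib);
  return 0;
}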
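
Likewise, a minimal standalone sketch of the clearing serialisation above, using stand-in names (clear_route_table, clear_complete, may_start_session) rather than the actual bgpd functions: the first enqueue sets PEER_STATUS_CLEARING and takes a peer reference, the queue's completion callback clears the flag and drops the reference, and the FSM check refuses to start a session while the flag is set.

/* Standalone sketch, not bgpd code: serialise clearing against the FSM
 * with a per-peer flag instead of plugging the global process queues. */
#include <stdbool.h>
#include <stdio.h>

#define PEER_STATUS_CLEARING (1 << 7)

struct peer
{
  unsigned int sflags;
  int refcnt;
  bool queue_active;
};

static void peer_lock (struct peer *p)   { p->refcnt++; }
static void peer_unlock (struct peer *p) { p->refcnt--; }

/* Analogue of bgp_clear_route_table: set the flag and take a peer
 * reference only when the queue first becomes active. */
static void
clear_route_table (struct peer *p)
{
  if (!(p->sflags & PEER_STATUS_CLEARING))
    {
      p->sflags |= PEER_STATUS_CLEARING;
      peer_lock (p);              /* released in clear_complete */
    }
  p->queue_active = true;         /* work items would be queued here */
}

/* Analogue of bgp_clear_node_complete: the queue has drained. */
static void
clear_complete (struct peer *p)
{
  p->queue_active = false;
  p->sflags &= ~PEER_STATUS_CLEARING;
  peer_unlock (p);                /* taken in clear_route_table */
}

/* Analogue of the bgp_timer_set check: don't start a CLEARING session. */
static bool
may_start_session (const struct peer *p)
{
  return !(p->sflags & PEER_STATUS_CLEARING);
}

int
main (void)
{
  struct peer p = { 0, 0, false };

  clear_route_table (&p);
  printf ("may start while clearing: %d\n", may_start_session (&p)); /* 0 */
  clear_complete (&p);
  printf ("may start after clearing: %d\n", may_start_session (&p)); /* 1 */
  return 0;
}

Serialising per peer this way means clearing one peer no longer needs to hold up the shared process queues.
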
diff --git a/bgpd/ChangeLog b/bgpd/ChangeLog
index c9d6189..9a960a3 100644
--- a/bgpd/ChangeLog
+++ b/bgpd/ChangeLog
@@ -1,3 +1,37 @@
+2006-02-21 Paul Jakma <paul.jakma@sun.com>
+
+	* bgpd.h: move the clear_node_queue to be peer specific.
+	  Add a new peer status flag, PEER_STATUS_CLEARING.
+	* bgp_table.h: (struct bgp_table) Add fields to record afi,
+          safi of the table.
+          (bgp_table_init) Take afi and safi to create table for.
+        * bgp_table.c: (bgp_table_init) record the afi and safi.
+        * bgp_nexthop.c: Update all calls to bgp_table_init.
+        * bgp_vty.c: ditto.
+        * bgpd.c: ditto.
+        * bgp_fsm.c: (bgp_timer_set) dont bring up a session which is
+	  clearing.
+        * bgp_route.c: (general) Update all bgp_table_init calls.
+          (bgp_process_{rsclient,main}) clear_node is serialised
+          via PEER_STATUS_CLEARING and fsm now.
+          (struct bgp_clear_node_queue) can be removed. struct bgp_node
+          can be the queue item data directly, as struct peer can be
+          kept in the new wq global user data and afi/safi can be
+          retrieved via bgp_node -> bgp_table.
+          (bgp_clear_route_node) fix to get peer via wq->spec.data,
+          afi/safi via bgp_node->bgp_table.
+          (bgp_clear_node_queue_del) no more item data to delete, only
+          unlock the bgp_node.
+          (bgp_clear_node_complete) only need to unset CLEARING flag
+          and unlock struct peer.
+          (bgp_clear_node_queue_init) queue attaches to struct peer
+          now. record peer name as queue name.
+          (bgp_clear_route_table) If queue transitions to active,
+          serialise clearing by setting PEER_STATUS_CLEARING rather
+          than plugging process queue, and lock peer while queue
+          active.
+          Update to pass only bgp_node as per-queue-item specific data.
+
 2006-02-18 Paul Jakma <paul.jakma@sun.com>
 
 	* bgp_routemap.c: (route_set_community) Quick, very hacky, fix
diff --git a/bgpd/bgp_fsm.c b/bgpd/bgp_fsm.c
index 220861c..f5f7892 100644
--- a/bgpd/bgp_fsm.c
+++ b/bgpd/bgp_fsm.c
@@ -84,6 +84,7 @@
 	 inactive.  All other timer must be turned off */
       if (CHECK_FLAG (peer->flags, PEER_FLAG_SHUTDOWN)
 	  || CHECK_FLAG (peer->sflags, PEER_STATUS_PREFIX_OVERFLOW)
+	  || CHECK_FLAG (peer->sflags, PEER_STATUS_CLEARING)
 	  || ! peer_active (peer))
 	{
 	  BGP_TIMER_OFF (peer->t_start);
diff --git a/bgpd/bgp_nexthop.c b/bgpd/bgp_nexthop.c
index ba1752f..ea707cc 100644
--- a/bgpd/bgp_nexthop.c
+++ b/bgpd/bgp_nexthop.c
@@ -1330,17 +1330,17 @@
   bgp_scan_interval = BGP_SCAN_INTERVAL_DEFAULT;
   bgp_import_interval = BGP_IMPORT_INTERVAL_DEFAULT;
 
-  cache1_table[AFI_IP] = bgp_table_init ();
-  cache2_table[AFI_IP] = bgp_table_init ();
+  cache1_table[AFI_IP] = bgp_table_init (AFI_IP, SAFI_UNICAST);
+  cache2_table[AFI_IP] = bgp_table_init (AFI_IP, SAFI_UNICAST);
   bgp_nexthop_cache_table[AFI_IP] = cache1_table[AFI_IP];
 
-  bgp_connected_table[AFI_IP] = bgp_table_init ();
+  bgp_connected_table[AFI_IP] = bgp_table_init (AFI_IP, SAFI_UNICAST);
 
 #ifdef HAVE_IPV6
-  cache1_table[AFI_IP6] = bgp_table_init ();
-  cache2_table[AFI_IP6] = bgp_table_init ();
+  cache1_table[AFI_IP6] = bgp_table_init (AFI_IP6, SAFI_UNICAST);
+  cache2_table[AFI_IP6] = bgp_table_init (AFI_IP6, SAFI_UNICAST);
   bgp_nexthop_cache_table[AFI_IP6] = cache1_table[AFI_IP6];
-  bgp_connected_table[AFI_IP6] = bgp_table_init ();
+  bgp_connected_table[AFI_IP6] = bgp_table_init (AFI_IP6, SAFI_UNICAST);
 #endif /* HAVE_IPV6 */
 
   /* Make BGP scan thread. */
diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c
index 6c21e3f..a73974f 100644
--- a/bgpd/bgp_route.c
+++ b/bgpd/bgp_route.c
@@ -71,7 +71,7 @@
       prn = bgp_node_get (table, (struct prefix *) prd);
 
       if (prn->info == NULL)
-	prn->info = bgp_table_init ();
+	prn->info = bgp_table_init (afi, safi);
       else
 	bgp_unlock_node (prn);
       table = prn->info;
@@ -1257,13 +1257,6 @@
   struct listnode *node, *nnode;
   struct peer *rsclient = rn->table->owner;
   
-  /* we shouldn't run if the clear_route_node queue is still running
-   * or scheduled to run, or we can race with session coming up
-   * and adding routes back before we've cleared them
-   */
-  if (bm->clear_node_queue && bm->clear_node_queue->thread)
-    return WQ_QUEUE_BLOCKED;
-  
   /* Best path selection. */
   bgp_best_selection (bgp, rn, &old_and_new);
   new_select = old_and_new.new;
@@ -1326,13 +1319,6 @@
   struct peer *peer;
   struct attr attr;
   
-  /* we shouldn't run if the clear_route_node queue is still running
-   * or scheduled to run, or we can race with session coming up
-   * and adding routes back before we've cleared them
-   */
-  if (bm->clear_node_queue && bm->clear_node_queue->thread)
-    return WQ_QUEUE_BLOCKED;
-
   /* Best path selection. */
   bgp_best_selection (bgp, rn, &old_and_new);
   old_select = old_and_new.old;
@@ -2465,54 +2451,49 @@
 	bgp_soft_reconfig_table (peer, afi, safi, table);
 }
 
-struct bgp_clear_node_queue
-{
-  struct bgp_node *rn;
-  struct peer *peer;
-  afi_t afi;
-  safi_t safi;
-};
-
 static wq_item_status
 bgp_clear_route_node (struct work_queue *wq, void *data)
 {
-  struct bgp_clear_node_queue *cq = data;
+  struct bgp_node *rn = data;
+  struct peer *peer = wq->spec.data;
   struct bgp_adj_in *ain;
   struct bgp_adj_out *aout;
   struct bgp_info *ri;
+  afi_t afi = rn->table->afi;
+  safi_t safi = rn->table->safi;
   
-  assert (cq->rn && cq->peer);
+  assert (rn && peer);
   
-  for (ri = cq->rn->info; ri; ri = ri->next)
-    if (ri->peer == cq->peer)
+  for (ri = rn->info; ri; ri = ri->next)
+    if (ri->peer == peer)
       {
         /* graceful restart STALE flag set. */
-        if (CHECK_FLAG (cq->peer->sflags, PEER_STATUS_NSF_WAIT)
-            && cq->peer->nsf[cq->afi][cq->safi]
+        if (CHECK_FLAG (peer->sflags, PEER_STATUS_NSF_WAIT)
+            && peer->nsf[afi][safi]
             && ! CHECK_FLAG (ri->flags, BGP_INFO_STALE)
             && ! CHECK_FLAG (ri->flags, BGP_INFO_HISTORY)
             && ! CHECK_FLAG (ri->flags, BGP_INFO_DAMPED)
             && ! CHECK_FLAG (ri->flags, BGP_INFO_REMOVED))
           {
-            bgp_pcount_decrement (cq->rn, ri, cq->afi, cq->safi);
+            bgp_pcount_decrement (rn, ri, afi, safi);
             SET_FLAG (ri->flags, BGP_INFO_STALE);
           }
         else
-          bgp_rib_remove (cq->rn, ri, cq->peer, cq->afi, cq->safi);
+          bgp_rib_remove (rn, ri, peer, afi, safi);
         break;
       }
-  for (ain = cq->rn->adj_in; ain; ain = ain->next)
-    if (ain->peer == cq->peer)
+  for (ain = rn->adj_in; ain; ain = ain->next)
+    if (ain->peer == peer)
       {
-        bgp_adj_in_remove (cq->rn, ain);
-        bgp_unlock_node (cq->rn);
+        bgp_adj_in_remove (rn, ain);
+        bgp_unlock_node (rn);
         break;
       }
-  for (aout = cq->rn->adj_out; aout; aout = aout->next)
-    if (aout->peer == cq->peer)
+  for (aout = rn->adj_out; aout; aout = aout->next)
+    if (aout->peer == peer)
       {
-        bgp_adj_out_remove (cq->rn, aout, cq->peer, cq->afi, cq->safi);
-        bgp_unlock_node (cq->rn);
+        bgp_adj_out_remove (rn, aout, peer, afi, safi);
+        bgp_unlock_node (rn);
         break;
       }
   return WQ_SUCCESS;
@@ -2521,78 +2502,84 @@
 static void
 bgp_clear_node_queue_del (struct work_queue *wq, void *data)
 {
-  struct bgp_clear_node_queue *cq = data;
-  bgp_unlock_node (cq->rn); 
-  peer_unlock (cq->peer); /* bgp_clear_node_queue_del */
-  XFREE (MTYPE_BGP_CLEAR_NODE_QUEUE, cq);
+  struct bgp_node *rn = data;
+  
+  bgp_unlock_node (rn); 
 }
 
 static void
 bgp_clear_node_complete (struct work_queue *wq)
 {
-  /* unplug the 2 processing queues */
-  if (bm->process_main_queue)
-    work_queue_unplug (bm->process_main_queue);
-  if (bm->process_rsclient_queue)
-    work_queue_unplug (bm->process_rsclient_queue);
+  struct peer *peer = wq->spec.data;
+  
+  UNSET_FLAG (peer->sflags, PEER_STATUS_CLEARING);
+  peer_unlock (peer); /* bgp_clear_node_complete */
 }
 
 static void
-bgp_clear_node_queue_init (void)
+bgp_clear_node_queue_init (struct peer *peer)
 {
-  if ( (bm->clear_node_queue
-          = work_queue_new (bm->master, "clear_route_node")) == NULL)
+#define CLEAR_QUEUE_NAME_LEN 26 /* "clear 2001:123:123:123::1" */
+  char wname[CLEAR_QUEUE_NAME_LEN];
+  
+  snprintf (wname, CLEAR_QUEUE_NAME_LEN, "clear %s", peer->host);
+#undef CLEAR_QUEUE_NAME_LEN
+
+  if ( (peer->clear_node_queue = work_queue_new (bm->master, wname)) == NULL)
     {
       zlog_err ("%s: Failed to allocate work queue", __func__);
       exit (1);
     }
-  bm->clear_node_queue->spec.hold = 10;
-  bm->clear_node_queue->spec.workfunc = &bgp_clear_route_node;
-  bm->clear_node_queue->spec.del_item_data = &bgp_clear_node_queue_del;
-  bm->clear_node_queue->spec.completion_func = &bgp_clear_node_complete;
-  bm->clear_node_queue->spec.max_retries = 0;
+  peer->clear_node_queue->spec.hold = 10;
+  peer->clear_node_queue->spec.workfunc = &bgp_clear_route_node;
+  peer->clear_node_queue->spec.del_item_data = &bgp_clear_node_queue_del;
+  peer->clear_node_queue->spec.completion_func = &bgp_clear_node_complete;
+  peer->clear_node_queue->spec.max_retries = 0;
+  
+  /* we only 'lock' this peer reference when the queue is actually active */
+  peer->clear_node_queue->spec.data = peer;
 }
 
 static void
 bgp_clear_route_table (struct peer *peer, afi_t afi, safi_t safi,
                       struct bgp_table *table, struct peer *rsclient)
 {
-  struct bgp_clear_node_queue *cqnode;
   struct bgp_node *rn;
   
   if (! table)
     table = (rsclient) ? rsclient->rib[afi][safi] : peer->bgp->rib[afi][safi];
-
+  
   /* If still no table => afi/safi isn't configured at all or smth. */
   if (! table)
     return;
 
-  if (bm->clear_node_queue == NULL)
-    bgp_clear_node_queue_init ();
+  if (peer->clear_node_queue == NULL)
+    bgp_clear_node_queue_init (peer);
   
-  /* plug the two bgp_process queues to avoid any chance of racing
-   * with a session coming back up and adding routes before we've
-   * cleared them all. We'll unplug them with completion callback.
+  /* bgp_fsm.c will not bring CLEARING sessions out of Idle this
+   * protects against peers which flap faster than we can we clear,
+   * which could lead to:
+   *
+   * a) race with routes from the new session being installed before
+   *    clear_route_node visits the node (to delete the route of that
+   *    peer)
+   * b) resource exhaustion, clear_route_node likely leads to an entry
+   *    on the process_main queue. Fast-flapping could cause that queue
+   *    to grow and grow.
    */
-  if (bm->process_main_queue)
-    work_queue_plug (bm->process_main_queue);
-  if (bm->process_rsclient_queue)
-    work_queue_plug (bm->process_rsclient_queue);
+  if (!CHECK_FLAG (peer->sflags, PEER_STATUS_CLEARING))
+    {
+      SET_FLAG (peer->sflags, PEER_STATUS_CLEARING);
+      peer_lock (peer); /* bgp_clear_node_complete */
+    }
   
   for (rn = bgp_table_top (table); rn; rn = bgp_route_next (rn))
     {
       if (rn->info == NULL)
         continue;
       
-      if ( (cqnode = XCALLOC (MTYPE_BGP_CLEAR_NODE_QUEUE, 
-                              sizeof (struct bgp_clear_node_queue))) == NULL)
-        continue;
-      
-      cqnode->rn = bgp_lock_node (rn); /* unlocked: bgp_clear_node_queue_del */
-      cqnode->afi = afi;
-      cqnode->safi = safi;
-      cqnode->peer = peer_lock (peer); /* bgp_clear_node_queue_del */
-      work_queue_add (bm->clear_node_queue, cqnode);
+      bgp_lock_node (rn); /* unlocked: bgp_clear_node_queue_del */
+      work_queue_add (peer->clear_node_queue, rn);
     }
   return;
 }
@@ -3524,7 +3511,7 @@
   prn = bgp_node_get (bgp->route[AFI_IP][SAFI_MPLS_VPN],
 			(struct prefix *)&prd);
   if (prn->info == NULL)
-    prn->info = bgp_table_init ();
+    prn->info = bgp_table_init (AFI_IP, SAFI_MPLS_VPN);
   else
     bgp_unlock_node (prn);
   table = prn->info;
@@ -3593,7 +3580,7 @@
   prn = bgp_node_get (bgp->route[AFI_IP][SAFI_MPLS_VPN],
 			(struct prefix *)&prd);
   if (prn->info == NULL)
-    prn->info = bgp_table_init ();
+    prn->info = bgp_table_init (AFI_IP, SAFI_MPLS_VPN);
   else
     bgp_unlock_node (prn);
   table = prn->info;
@@ -10408,7 +10395,7 @@
 bgp_route_init ()
 {
   /* Init BGP distance table. */
-  bgp_distance_table = bgp_table_init ();
+  bgp_distance_table = bgp_table_init (AFI_IP, SAFI_UNICAST);
 
   /* IPv4 BGP commands. */
   install_element (BGP_NODE, &bgp_network_cmd);
diff --git a/bgpd/bgp_table.c b/bgpd/bgp_table.c
index a9199d6..a3b489d 100644
--- a/bgpd/bgp_table.c
+++ b/bgpd/bgp_table.c
@@ -32,7 +32,7 @@
 void bgp_table_free (struct bgp_table *);
 
 struct bgp_table *
-bgp_table_init (void)
+bgp_table_init (afi_t afi, safi_t safi)
 {
   struct bgp_table *rt;
 
@@ -40,7 +40,9 @@
   memset (rt, 0, sizeof (struct bgp_table));
 
   rt->type = BGP_TABLE_MAIN;
-
+  rt->afi = afi;
+  rt->safi = safi;
+  
   return rt;
 }
 
diff --git a/bgpd/bgp_table.h b/bgpd/bgp_table.h
index d7c8272..e13022b 100644
--- a/bgpd/bgp_table.h
+++ b/bgpd/bgp_table.h
@@ -30,7 +30,11 @@
 struct bgp_table
 {
   bgp_table_t type;
-
+  
+  /* afi/safi of this table */
+  afi_t afi;
+  safi_t safi;
+  
   /* The owner of this 'bgp_table' structure. */
   void *owner;
 
@@ -63,7 +67,7 @@
 #define BGP_NODE_PROCESS_SCHEDULED	(1 << 0)
 };
 
-extern struct bgp_table *bgp_table_init (void);
+extern struct bgp_table *bgp_table_init (afi_t, safi_t);
 extern void bgp_table_finish (struct bgp_table *);
 extern void bgp_unlock_node (struct bgp_node *node);
 extern void bgp_node_delete (struct bgp_node *node);
diff --git a/bgpd/bgp_vty.c b/bgpd/bgp_vty.c
index 6ee4823..3cf4dc5 100644
--- a/bgpd/bgp_vty.c
+++ b/bgpd/bgp_vty.c
@@ -2048,7 +2048,7 @@
   if (ret < 0)
     return bgp_vty_return (vty, ret);
 
-  peer->rib[afi][safi] = bgp_table_init ();
+  peer->rib[afi][safi] = bgp_table_init (afi, safi);
   peer->rib[afi][safi]->type = BGP_TABLE_RSCLIENT;
   peer->rib[afi][safi]->owner = peer;
 
diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c
index 1f1b4fd..9f694f5 100644
--- a/bgpd/bgpd.c
+++ b/bgpd/bgpd.c
@@ -1866,9 +1866,9 @@
   for (afi = AFI_IP; afi < AFI_MAX; afi++)
     for (safi = SAFI_UNICAST; safi < SAFI_MAX; safi++)
       {
-	bgp->route[afi][safi] = bgp_table_init ();
-	bgp->aggregate[afi][safi] = bgp_table_init ();
-	bgp->rib[afi][safi] = bgp_table_init ();
+	bgp->route[afi][safi] = bgp_table_init (afi, safi);
+	bgp->aggregate[afi][safi] = bgp_table_init (afi, safi);
+	bgp->rib[afi][safi] = bgp_table_init (afi, safi);
       }
 
   bgp->default_local_pref = BGP_DEFAULT_LOCAL_PREF;
diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h
index 44045c0..9e2aa3e 100644
--- a/bgpd/bgpd.h
+++ b/bgpd/bgpd.h
@@ -40,7 +40,6 @@
   /* work queues */
   struct work_queue *process_main_queue;
   struct work_queue *process_rsclient_queue;
-  struct work_queue *clear_node_queue;
   
   /* BGP port number.  */
   u_int16_t port;
@@ -387,6 +386,7 @@
 #define PEER_STATUS_GROUP             (1 << 4) /* peer-group conf */
 #define PEER_STATUS_NSF_MODE          (1 << 5) /* NSF aware peer */
 #define PEER_STATUS_NSF_WAIT          (1 << 6) /* wait comeback peer */
+#define PEER_STATUS_CLEARING          (1 << 7) /* peers table being cleared */
 
   /* Peer status af flags (reset in bgp_stop) */
   u_int16_t af_sflags[AFI_MAX][SAFI_MAX];
@@ -398,7 +398,6 @@
 #define PEER_STATUS_EOR_SEND          (1 << 5) /* end-of-rib send to peer */
 #define PEER_STATUS_EOR_RECEIVED      (1 << 6) /* end-of-rib received from peer */
 
-
   /* Default attribute value for the peer. */
   u_int32_t config;
 #define PEER_CONFIG_WEIGHT            (1 << 0) /* Default weight. */
@@ -433,7 +432,10 @@
   struct thread *t_pmax_restart;
   struct thread *t_gr_restart;
   struct thread *t_gr_stale;
-
+  
+  /* workqueues */
+  struct work_queue *clear_node_queue;
+  
   /* Statistics field */
   u_int32_t open_in;		/* Open message input count */
   u_int32_t open_out;		/* Open message output count */