2004-09-12 Paul Jakma <paul@dishone.st>

        * ospf_packet.c: Fix bugzilla #107
          (ospf_packet_max) get rid of the magic 88 constant
          (ospf_swab_iph_ton) New function. Set IP header to network order,
          taking BSDisms into account.
          (ospf_swab_iph_toh) The inverse.
          (ospf_write) Add support for IP fragmentation; this will only work
          on Linux though, as other kernels make it impossible. Get rid of
          the magic 4 constant.
          (ospf_make_ls_upd) Bound check to end of stream, not to
          interface mtu.
          (ospf_ls_upd_packet_new) New function, allocate upd packet
          taking oversized LSAs into account.
          (ospf_ls_upd_queue_send) Use ospf_ls_upd_packet_new to allocate,
          rather than statically allocating an MTU-sized packet buffer, which
          was actually wrong - it didn't take the IP header into account,
          which should not be included in the packet buffer.
          (ospf_ls_upd_send_queue_event) minor tweaks and remove
          TODO comment.
diff --git a/ospfd/ospf_packet.c b/ospfd/ospf_packet.c
index 9afd929..a41627f 100644
--- a/ospfd/ospf_packet.c
+++ b/ospfd/ospf_packet.c
@@ -247,9 +247,11 @@
   int max;
 
   if ( ospf_auth_type (oi) == OSPF_AUTH_CRYPTOGRAPHIC)
-    max = oi->ifp->mtu - OSPF_AUTH_MD5_SIZE - 88;
+    max = oi->ifp->mtu - OSPF_AUTH_MD5_SIZE;
   else
-    max = oi->ifp->mtu - 88;
+    max = oi->ifp->mtu;
+  
+  max -= (OSPF_HEADER_SIZE + sizeof (struct ip));
 
   return max;
 }
@@ -480,6 +482,31 @@
   return 0;
 }
 
+/* swab ip header fields to required order for sendmsg */
+void
+ospf_swab_iph_ton (struct ip *iph)
+{
+  /* BSD and derived kernels take iph in network order, except for
+   * ip_len and ip_off, which are left in host order
+   */
+#ifdef GNU_LINUX
+  iph->ip_len = htons(iph->ip_len);
+  iph->ip_off = htons(iph->ip_off);
+#endif
+  iph->ip_id = htons(iph->ip_id);
+}
+
+/* swab ip header fields to host order, as required */
+void
+ospf_swab_iph_toh (struct ip *iph)
+{
+#ifdef GNU_LINUX
+  iph->ip_len = ntohs(iph->ip_len);
+  iph->ip_off = ntohs(iph->ip_off);
+#endif  
+  iph->ip_id = ntohs(iph->ip_id);
+}
+
 int
 ospf_write (struct thread *thread)
 {
@@ -494,6 +521,10 @@
   int ret;
   int flags = 0;
   listnode node;
+  static u_int16_t ipid = 0;
+  u_int16_t maxdatasize, offset;
+#define OSPF_WRITE_IPHL_SHIFT 2
+#define OSPF_WRITE_FRAG_SHIFT 3
   
   ospf->t_write = NULL;
 
@@ -502,6 +533,13 @@
   oi = getdata (node);
   assert (oi);
   
+  /* seed ipid static with low order bits of time */
+  if (ipid == 0)
+    ipid = (time(NULL) & 0xffff);
+  
+  /* convenience - max OSPF data per packet */
+  maxdatasize = oi->ifp->mtu - sizeof (struct ip);
+  
   /* Get one packet from queue. */
   op = ospf_fifo_head (oi->obuf);
   assert (op);
@@ -509,12 +547,17 @@
 
   if (op->dst.s_addr == htonl (OSPF_ALLSPFROUTERS)
       || op->dst.s_addr == htonl (OSPF_ALLDROUTERS))
-    ospf_if_ipmulticast (ospf, oi->address, oi->ifp->ifindex);
-
+      ospf_if_ipmulticast (ospf, oi->address, oi->ifp->ifindex);
+    
   /* Rewrite the md5 signature & update the seq */
   ospf_make_md5_digest (oi, op);
 
+  /* reset get pointer */
+  stream_set_getp (op->s, 0);
+
+  memset (&iph, 0, sizeof (struct ip));
   memset (&sa_dst, 0, sizeof (sa_dst));
+  
   sa_dst.sin_family = AF_INET;
 #ifdef HAVE_SIN_LEN
   sa_dst.sin_len = sizeof(sa_dst);
@@ -527,15 +570,22 @@
     if (!IN_MULTICAST (htonl (op->dst.s_addr)))
       flags = MSG_DONTROUTE;
 
-  iph.ip_hl = sizeof (struct ip) >> 2;
+  iph.ip_hl = sizeof (struct ip) >> OSPF_WRITE_IPHL_SHIFT;
+  /* it'd be very strange for header to not be 4byte-word aligned but.. */
+  if ( sizeof (struct ip) > (iph.ip_hl << OSPF_WRITE_IPHL_SHIFT) )
+    iph.ip_hl++; /* we presume sizeof struct ip cant overflow ip_hl.. */
+  
   iph.ip_v = IPVERSION;
   iph.ip_tos = IPTOS_PREC_INTERNETCONTROL;
-#if defined(__NetBSD__) || defined(__FreeBSD__)
-  iph.ip_len = iph.ip_hl*4 + op->length;
-#else
-  iph.ip_len = htons (iph.ip_hl*4 + op->length);
-#endif
+  iph.ip_len = (iph.ip_hl << OSPF_WRITE_IPHL_SHIFT) + op->length;
   iph.ip_id = 0;
+
+  /* XXX-MT: not thread-safe at all..
+   * XXX: this presumes this is the only programme sending OSPF packets;
+   * otherwise, there is no guarantee ipid will be unique
+   */
+  iph.ip_id = ++ipid;
+  
   iph.ip_off = 0;
   if (oi->type == OSPF_IFTYPE_VIRTUALLINK)
     iph.ip_ttl = OSPF_VL_IP_TTL;
@@ -552,15 +602,79 @@
   msg.msg_iov = iov;
   msg.msg_iovlen = 2;
   iov[0].iov_base = (char*)&iph;
-  iov[0].iov_len = iph.ip_hl*4;
-  iov[1].iov_base = STREAM_DATA (op->s);
+  iov[0].iov_len = iph.ip_hl << OSPF_WRITE_IPHL_SHIFT;
+  iov[1].iov_base = STREAM_PNT (op->s);
   iov[1].iov_len = op->length;
+  
+  /* Sadly we can not rely on kernels to fragment packets because of either
+   * IP_HDRINCL and/or multicast destination being set.
+   */
+  if ( op->length > maxdatasize )
+    {
+      assert ( op->length == stream_get_endp(op->s) );
 
+      /* we can but try.
+       *
+       * SunOS, BSD and BSD derived kernels likely will clear ip_id, as
+       * well as the IP_MF flag, making this all quite pointless.
+       *
+       * However, for a system on which IP_MF is left alone, and ip_id left
+       * alone or else which sets same ip_id for each fragment this might
+       * work, eg linux.
+       *
+       * XXX-TODO: It would be much nicer to have the kernels use their
+       * existing fragmentation support to do this for us. Bugs/RFEs need to
+       * be raised against the various kernels.
+       */
+      
+      /* set More Frag */
+      iph.ip_off |= IP_MF;
+      
+      /* ip frag offset is expressed in units of 8byte words */
+      offset = maxdatasize >> OSPF_WRITE_FRAG_SHIFT;      
+      
+      while ( (stream_get_endp(op->s) - stream_get_getp (op->s)) 
+             > maxdatasize )
+        {
+          /* data length of this frag is to next offset value */
+          iov[1].iov_len = offset << OSPF_WRITE_FRAG_SHIFT;
+          iph.ip_len = iov[1].iov_len + sizeof (struct ip);
+          assert (iph.ip_len <= oi->ifp->mtu);
+
+          ospf_swab_iph_ton (&iph);
+
+          ret = sendmsg (ospf->fd, &msg, flags);
+          
+          ospf_swab_iph_toh (&iph);
+          
+          if (ret < 0)
+            zlog_warn ("*** sendmsg in ospf_write to %s,"
+                       " id %d, off %d, len %d failed with %s",
+                       inet_ntoa (iph.ip_dst),
+                       iph.ip_id,
+                       iph.ip_off,
+                       iph.ip_len,
+                       strerror (errno));
+          
+          iph.ip_off += offset;
+          stream_forward (op->s, iov[1].iov_len);
+          iov[1].iov_base = STREAM_PNT (op->s); 
+        }
+        
+      /* setup for final fragment */
+      iov[1].iov_len = stream_get_endp(op->s) - stream_get_getp (op->s);
+      iph.ip_len = iov[1].iov_len + sizeof (struct ip);
+      iph.ip_off &= (~IP_MF);
+    }
+
+  /* send final fragment (could be first) */
+  ospf_swab_iph_ton (&iph);
   ret = sendmsg (ospf->fd, &msg, flags);
+  ospf_swab_iph_toh (&iph);
   
   if (ret < 0)
-    zlog_warn ("*** sendmsg in ospf_write to %s failed with %s", 
-               inet_ntoa (iph.ip_dst), strerror (errno));
+    zlog_warn ("*** sendmsg in ospf_write to %s failed with %s",
+      inet_ntoa (iph.ip_dst), strerror (errno));
 
   /* Retrieve OSPF packet type. */
   stream_set_getp (op->s, 1);
@@ -2685,7 +2799,7 @@
     zlog_info ("ospf_make_ls_upd: Start");
 
   pp = stream_get_putp (s);
-  ospf_output_forward (s, 4);
+  ospf_output_forward (s, OSPF_LS_UPD_MIN_SIZE);
 
   while ((node = listhead (update)) != NULL)
     {
@@ -2699,12 +2813,8 @@
       assert (lsa);
       assert (lsa->data);
 
-      /* Check packet size. */
-      /* XXX: LSA can be > packet-headers, eg router-lsas for machines
-       * with hundreds of interfaces, received as several
-       * fragmented packets.
-       */
-      if (length + delta + ntohs (lsa->data->length) > OSPF_PACKET_MAX (oi))
+      /* Will it fit? */
+      if (length + delta + ntohs (lsa->data->length) > stream_get_size (s))
         break;
 
       /* Keep pointer to LS age. */
@@ -3069,6 +3179,68 @@
   list_delete (update);
 }
 
+/* Determine size for packet. Must be at least big enough to accommodate next
+ * LSA on list, which may be bigger than MTU size.
+ *
+ * Return pointer to new ospf_packet
+ * NULL if we can not allocate, eg because LSA is bigger than imposed limit
+ * on packet sizes (in which case offending LSA is deleted from update list)
+ */
+static struct ospf_packet *
+ospf_ls_upd_packet_new (struct list *update, struct ospf_interface *oi)
+{
+  struct ospf_lsa *lsa;
+  struct listnode *ln;
+  size_t size;
+  static char warned = 0;
+
+  ln = listhead (update);
+  lsa = getdata (ln);
+  assert (lsa);
+  assert (lsa->data);
+
+  if ((OSPF_LS_UPD_MIN_SIZE + ntohs (lsa->data->length))
+      > ospf_packet_max (oi))
+    {
+      if (!warned)
+        {
+          zlog_warn ("ospf_ls_upd_packet_new: oversized LSA encountered!"
+                     "will need to fragment. Not optimal. Try divide up"
+                     " your network with areas. Use 'debug ospf packet send'"
+                     " to see details, or look at 'show ip ospf database ..'");
+          warned = 1;
+        }
+
+      if (IS_DEBUG_OSPF_PACKET (0, SEND))
+        zlog_warn ("ospf_ls_upd_packet_new: oversized LSA id:%s,"
+                   " %d bytes originated by %s, will be fragmented!",
+                   inet_ntoa (lsa->data->id),
+                   ntohs (lsa->data->length),
+                   inet_ntoa (lsa->data->adv_router));
+
+      /* 
+       * Allocate just enough to fit this LSA only, to avoid including other
+       * LSAs in fragmented LSA Updates.
+       */
+      size = ntohs (lsa->data->length) + (oi->ifp->mtu - ospf_packet_max (oi))
+             + OSPF_LS_UPD_MIN_SIZE;
+    }
+  else
+    size = oi->ifp->mtu;
+
+  if (size > OSPF_MAX_PACKET_SIZE)
+    {
+      zlog_warn ("ospf_ls_upd_packet_new: oversized LSA id:%s too big,"
+                 " %d bytes, dropping it completely."
+                 " OSPF routing is broken!",
+                 inet_ntoa (lsa->data->id), ntohs (lsa->data->length));
+      list_delete_node (update, ln);
+      return NULL;
+    }
+
+  return ospf_packet_new (size);
+}
+
 static void
 ospf_ls_upd_queue_send (struct ospf_interface *oi, list update,
 			struct in_addr addr)
@@ -3078,8 +3250,8 @@
 
   if (IS_DEBUG_OSPF_EVENT)
     zlog_info ("listcount = %d, dst %s", listcount (update), inet_ntoa(addr));
-
-  op = ospf_packet_new (oi->ifp->mtu);
+  
+  op = ospf_ls_upd_packet_new (update, oi);
 
   /* Prepare OSPF common header. */
   ospf_make_header (OSPF_MSG_LS_UPD, oi, op->s);
@@ -3112,8 +3284,7 @@
   struct route_node *rn;
   struct route_node *rnext;
   struct list *update;
-  struct listnode *tn, *nn;
-  unsigned int again = 0;
+  char again = 0;
   
   oi->t_ls_upd_event = NULL;
 
@@ -3122,17 +3293,16 @@
 
   for (rn = route_top (oi->ls_upd_queue); rn; rn = rnext)
     {
-      update = (struct list *)rn->info;
       rnext = route_next (rn);
       
       if (rn->info == NULL)
         continue;
+      
+      update = (struct list *)rn->info;
 
       ospf_ls_upd_queue_send (oi, update, rn->p.u.prefix4);
       
-      /* list might not be empty.
-       * TODO: work out what to do about oversized LSAs.
-       */
+      /* list might not be empty. */
       if (listcount(update) == 0)
         {
           list_delete (rn->info);
@@ -3140,7 +3310,7 @@
           route_unlock_node (rn);
         }
       else
-        again++;
+        again = 1;
     }
 
   if (again != 0)