Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

===================================================================
RCS file: /ftp/cvs/cvsroot/src/sys/netinet/ip_input.c,v
rcsdiff: /ftp/cvs/cvsroot/src/sys/netinet/ip_input.c,v: warning: Unknown phrases like `commitid ...;' are present.
retrieving revision 1.268.2.6
retrieving revision 1.272.6.1
diff -u -p -r1.268.2.6 -r1.272.6.1
--- src/sys/netinet/ip_input.c	2010/08/11 22:54:56	1.268.2.6
+++ src/sys/netinet/ip_input.c	2008/10/19 22:17:46	1.272.6.1
@@ -1,4 +1,4 @@
-/*	$NetBSD: ip_input.c,v 1.268.2.6 2010/08/11 22:54:56 yamt Exp $	*/
+/*	$NetBSD: ip_input.c,v 1.272.6.1 2008/10/19 22:17:46 haad Exp $	*/
 
 /*
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
@@ -91,10 +91,9 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ip_input.c,v 1.268.2.6 2010/08/11 22:54:56 yamt Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ip_input.c,v 1.272.6.1 2008/10/19 22:17:46 haad Exp $");
 
 #include "opt_inet.h"
-#include "opt_compat_netbsd.h"
 #include "opt_gateway.h"
 #include "opt_pfil_hooks.h"
 #include "opt_ipsec.h"
@@ -104,6 +103,7 @@ __KERNEL_RCSID(0, "$NetBSD: ip_input.c,v
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
@@ -173,11 +173,6 @@ __KERNEL_RCSID(0, "$NetBSD: ip_input.c,v
 #define IPMTUDISCTIMEOUT (10 * 60)	/* as per RFC 1191 */
 #endif
 
-#ifdef COMPAT_50
-#include <compat/sys/time.h>
-#include <compat/sys/socket.h>
-#endif
-
 /*
  * Note: DIRECTED_BROADCAST is handled this way so that previous
  * configuration using this option will Just Work.
@@ -230,7 +225,6 @@ u_long	in_multihash;		/* size of hash
 int	in_multientries;		/* total number of addrs */
 struct	in_multihashhead *in_multihashtbl;
 struct	ifqueue	ipintrq;
-
 uint16_t ip_id;
 
 percpu_t *ipstat_percpu;
@@ -239,7 +233,105 @@ percpu_t *ipstat_percpu;
 struct pfil_head inet_pfil_hook;
 #endif
 
+/*
+ * Cached copy of nmbclusters.  If nmbclusters is different,
+ * recalculate IP parameters derived from nmbclusters.
+ */
+static int	ip_nmbclusters;			/* copy of nmbclusters */
+static void	ip_nmbclusters_changed(void);	/* recalc limits */
+
+#define CHECK_NMBCLUSTER_PARAMS()				\
+do {								\
+	if (__predict_false(ip_nmbclusters != nmbclusters))	\
+		ip_nmbclusters_changed();			\
+} while (/*CONSTCOND*/0)
+
+/* IP datagram reassembly queues (hashed) */
+#define IPREASS_NHASH_LOG2	6
+#define IPREASS_NHASH		(1 << IPREASS_NHASH_LOG2)
+#define IPREASS_HMASK		(IPREASS_NHASH - 1)
+#define IPREASS_HASH(x,y)	\
+	(((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)
+struct ipqhead ipq[IPREASS_NHASH];
+int	ipq_locked;
+static int	ip_nfragpackets;	/* packets in reass queue */
+static int	ip_nfrags;		/* total fragments in reass queues */
+
+int	ip_maxfragpackets = 200;	/* limit on packets. XXX sysctl */
+int	ip_maxfrags;			/* limit on fragments. XXX sysctl */
+
+
+/*
+ * Additive-Increase/Multiplicative-Decrease (AIMD) strategy for
+ * IP reassembly queue buffer management.
+ *
+ * We keep a count of total IP fragments (NB: not fragmented packets!)
+ * awaiting reassembly (ip_nfrags) and a limit (ip_maxfrags) on fragments.
+ * If ip_nfrags exceeds the limit ip_maxfrags, we drop half the
+ * total fragments in reassembly queues.  This AIMD policy avoids
+ * repeatedly deleting single packets under heavy fragmentation load
+ * (e.g., from lossy NFS peers).
+ */
+static u_int	ip_reass_ttl_decr(u_int ticks);
+static void	ip_reass_drophalf(void);
+
+
+static inline int ipq_lock_try(void);
+static inline void ipq_unlock(void);
+
+static inline int
+ipq_lock_try(void)
+{
+	int s;
+
+	/*
+	 * Use splvm() -- we're blocking things that would cause
+	 * mbuf allocation.
+	 */
+	s = splvm();
+	if (ipq_locked) {
+		splx(s);
+		return (0);
+	}
+	ipq_locked = 1;
+	splx(s);
+	return (1);
+}
+
+static inline void
+ipq_unlock(void)
+{
+	int s;
+
+	s = splvm();
+	ipq_locked = 0;
+	splx(s);
+}
+
+#ifdef DIAGNOSTIC
+#define IPQ_LOCK()							\
+do {									\
+	if (ipq_lock_try() == 0) {					\
+		printf("%s:%d: ipq already locked\n", __FILE__, __LINE__); \
+		panic("ipq_lock");					\
+	}								\
+} while (/*CONSTCOND*/ 0)
+#define IPQ_LOCK_CHECK()						\
+do {									\
+	if (ipq_locked == 0) {						\
+		printf("%s:%d: ipq lock not held\n", __FILE__, __LINE__); \
+		panic("ipq lock check");				\
+	}								\
+} while (/*CONSTCOND*/ 0)
+#else
+#define IPQ_LOCK()		(void) ipq_lock_try()
+#define IPQ_LOCK_CHECK()	/* nothing */
+#endif
+
+#define IPQ_UNLOCK()		ipq_unlock()
+
 struct pool inmulti_pool;
+struct pool ipqent_pool;
 
 #ifdef INET_CSUM_COUNTERS
 #include <sys/device.h>
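
The IPREASS_HASH() macro added above spreads reassembly state across 64 buckets by packing two low nibbles of the source address into a byte and XORing that with the datagram ID. A minimal userland sketch of the bucket computation follows; the constants are copied from the diff, while the sample address and ID are illustrative only:

/* Standalone sketch of the IPREASS_HASH bucket selection above. */
#include <stdio.h>
#include <stdint.h>

#define IPREASS_NHASH_LOG2 6
#define IPREASS_NHASH      (1 << IPREASS_NHASH_LOG2)
#define IPREASS_HMASK      (IPREASS_NHASH - 1)
#define IPREASS_HASH(x, y) \
	(((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)

int main(void)
{
	uint32_t src = 0xc0a80164;	/* 192.168.1.100, illustrative */
	uint16_t id  = 0x1234;		/* IP ID of the fragmented datagram */

	/*
	 * Low nibbles of two address bytes are packed into 8 bits,
	 * XORed with the IP ID, then masked down to one of 64 buckets.
	 */
	printf("bucket = %u of %d\n",
	    (unsigned)IPREASS_HASH(src, id), IPREASS_NHASH);
	return 0;
}
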
@@ -285,7 +377,15 @@ struct mowner ip_rx_mowner = MOWNER_INIT
 struct mowner ip_tx_mowner = MOWNER_INIT("internet", "tx");
 #endif
 
-static void	sysctl_net_inet_ip_setup(struct sysctllog **);
+/*
+ * Compute IP limits derived from the value of nmbclusters.
+ */
+static void
+ip_nmbclusters_changed(void)
+{
+	ip_maxfrags = nmbclusters / 4;
+	ip_nmbclusters = nmbclusters;
+}
 
 /*
  * IP initialization: fill in IP protocol switch table.
@@ -297,10 +397,10 @@ ip_init(void)
 	const struct protosw *pr;
 	int i;
 
-	sysctl_net_inet_ip_setup(NULL);
-
 	pool_init(&inmulti_pool, sizeof(struct in_multi), 0, 0, 0, "inmltpl",
 	    NULL, IPL_SOFTNET);
+	pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl",
+	    NULL, IPL_VM);
 
 	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
 	if (pr == 0)
@@ -313,12 +413,14 @@ ip_init(void)
 	    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW)
 		ip_protox[pr->pr_protocol] = pr - inetsw;
 
-	ip_reass_init();
+	for (i = 0; i < IPREASS_NHASH; i++)
+		LIST_INIT(&ipq[i]);
 
 	ip_initid();
 	ip_id = time_second & 0xfffff;
 
 	ipintrq.ifq_maxlen = ipqmaxlen;
+	ip_nmbclusters_changed();
 
 	TAILQ_INIT(&in_ifaddrhead);
 	in_ifaddrhashtbl = hashinit(IN_IFADDR_HASH_SIZE, HASH_LIST, true,
@@ -362,35 +464,18 @@ ipintr(void)
 {
 	int s;
 	struct mbuf *m;
-	struct ifqueue lcl_intrq;
-
-	memset(&lcl_intrq, 0, sizeof(lcl_intrq));
-	ipintrq.ifq_maxlen = ipqmaxlen;
 
 	mutex_enter(softnet_lock);
 	KERNEL_LOCK(1, NULL);
-	if (!IF_IS_EMPTY(&ipintrq)) {
+	while (!IF_IS_EMPTY(&ipintrq)) {
 		s = splnet();
-
-		/* Take existing queue onto stack */
-		lcl_intrq = ipintrq;
-
-		/* Zero out global queue, preserving maxlen and drops */
-		ipintrq.ifq_head = NULL;
-		ipintrq.ifq_tail = NULL;
-		ipintrq.ifq_len = 0;
-		ipintrq.ifq_maxlen = lcl_intrq.ifq_maxlen;
-		ipintrq.ifq_drops = lcl_intrq.ifq_drops;
-
+		IF_DEQUEUE(&ipintrq, m);
 		splx(s);
-	}
-	KERNEL_UNLOCK_ONE(NULL);
-	while (!IF_IS_EMPTY(&lcl_intrq)) {
-		IF_DEQUEUE(&lcl_intrq, m);
 		if (m == NULL)
 			break;
 		ip_input(m);
 	}
+	KERNEL_UNLOCK_ONE(NULL);
 	mutex_exit(softnet_lock);
 }
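
ip_nmbclusters_changed() recomputes the fragment limit whenever the cached copy of nmbclusters goes stale, and CHECK_NMBCLUSTER_PARAMS() is the cheap staleness test run on hot paths. A standalone sketch of this cache-and-recalculate pattern; the kernel's __predict_false() branch hint is omitted and the driver program is hypothetical:

/* Sketch of the pattern behind CHECK_NMBCLUSTER_PARAMS() above. */
#include <stdio.h>

static int nmbclusters = 1024;	/* authoritative tunable (stand-in) */
static int ip_nmbclusters;	/* cached copy */
static int ip_maxfrags;		/* limit derived from nmbclusters */

static void
ip_nmbclusters_changed(void)
{
	ip_maxfrags = nmbclusters / 4;
	ip_nmbclusters = nmbclusters;
}

#define CHECK_NMBCLUSTER_PARAMS()		\
do {						\
	if (ip_nmbclusters != nmbclusters)	\
		ip_nmbclusters_changed();	\
} while (0)

int main(void)
{
	CHECK_NMBCLUSTER_PARAMS();	/* first use populates the cache */
	printf("maxfrags=%d\n", ip_maxfrags);

	nmbclusters = 4096;		/* tunable raised at runtime */
	CHECK_NMBCLUSTER_PARAMS();	/* stale cache detected, recalc */
	printf("maxfrags=%d\n", ip_maxfrags);
	return 0;
}
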
@@ -402,21 +487,28 @@ void
 ip_input(struct mbuf *m)
 {
 	struct ip *ip = NULL;
+	struct ipq *fp;
 	struct in_ifaddr *ia;
 	struct ifaddr *ifa;
-	int hlen = 0, len;
+	struct ipqent *ipqe;
+	int hlen = 0, mff, len;
 	int downmatch;
 	int checkif;
 	int srcrt = 0;
+	int s;
+	u_int hash;
 #ifdef FAST_IPSEC
 	struct m_tag *mtag;
 	struct tdb_ident *tdbi;
 	struct secpolicy *sp;
-	int error, s;
+	int error;
 #endif /* FAST_IPSEC */
 
 	MCLAIM(m, &ip_rx_mowner);
-	KASSERT((m->m_flags & M_PKTHDR) != 0);
+#ifdef DIAGNOSTIC
+	if ((m->m_flags & M_PKTHDR) == 0)
+		panic("ipintr no HDR");
+#endif
 
 	/*
 	 * If no IP addresses have been set yet but the interfaces
@@ -804,12 +896,13 @@ ip_input(struct mbuf *m)
 ours:
 	/*
 	 * If offset or IP_MF are set, must reassemble.
+	 * Otherwise, nothing need be done.
+	 * (We could look in the reassembly queue to see
+	 * if the packet was previously fragmented,
+	 * but it's not worth the time; just let them time out.)
 	 */
 	if (ip->ip_off & ~htons(IP_DF|IP_RF)) {
-		struct mbuf *m_final;
-		u_int off, flen;
-		bool mff;
-
+		uint16_t off;
 		/*
 		 * Prevent TCP blind data attacks by not allowing non-initial
 		 * fragments to start at less than 68 bytes (minimal fragment
 		 * size) and making sure the first fragment is at least 68
 		 * bytes.
@@ -821,45 +914,83 @@ ours:
 			IP_STATINC(IP_STAT_BADFRAGS);
 			goto bad;
 		}
+		/*
+		 * Look for queue of fragments
+		 * of this datagram.
+		 */
+		IPQ_LOCK();
+		hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
+		LIST_FOREACH(fp, &ipq[hash], ipq_q) {
+			if (ip->ip_id == fp->ipq_id &&
+			    in_hosteq(ip->ip_src, fp->ipq_src) &&
+			    in_hosteq(ip->ip_dst, fp->ipq_dst) &&
+			    ip->ip_p == fp->ipq_p) {
+				/*
+				 * Make sure the TOS matches previous
+				 * fragments.
+				 */
+				if (ip->ip_tos != fp->ipq_tos) {
+					IP_STATINC(IP_STAT_BADFRAGS);
+					goto bad;
+				}
+				goto found;
+			}
+		}
+		fp = 0;
+found:
 
-		/* Fragment length and MF flag. */
-		flen = ntohs(ip->ip_len) - hlen;
+		/*
+		 * Adjust ip_len to not reflect header,
+		 * set ipqe_mff if more fragments are expected,
+		 * convert offset of this to bytes.
+		 */
+		ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
 		mff = (ip->ip_off & htons(IP_MF)) != 0;
 		if (mff) {
-			/*
-			 * Make sure that fragments have a data length
-			 * which is non-zero and multiple of 8 bytes.
-			 */
-			if (flen == 0 || (flen & 0x7) != 0) {
+			/*
+			 * Make sure that fragments have a data length
+			 * that's a non-zero multiple of 8 bytes.
+			 */
+			if (ntohs(ip->ip_len) == 0 ||
+			    (ntohs(ip->ip_len) & 0x7) != 0) {
 				IP_STATINC(IP_STAT_BADFRAGS);
+				IPQ_UNLOCK();
 				goto bad;
 			}
 		}
+		ip->ip_off = htons((ntohs(ip->ip_off) & IP_OFFMASK) << 3);
 
 		/*
-		 * Adjust total IP length to not reflect header and convert
-		 * offset of this to bytes.  XXX: clobbers struct ip.
+		 * If datagram marked as having more fragments
+		 * or if this is not the first fragment,
+		 * attempt reassembly; if it succeeds, proceed.
 		 */
-		ip->ip_len = htons(flen);
-		ip->ip_off = htons(off);
-
-		/*
-		 * Pass to IP reassembly mechanism.
-		 */
-		if (ip_reass_packet(m, ip, mff, &m_final) != 0) {
-			/* Failed; invalid fragment(s) or packet. */
-			goto bad;
-		}
-		if (m_final == NULL) {
-			/* More fragments should come; silently return. */
-			return;
-		}
-		/* Reassembly is done, we have the final packet. */
-		m = m_final;
-
-		/* Updated local variable(s). */
-		ip = mtod(m, struct ip *);
-		hlen = ip->ip_hl << 2;
+		if (mff || ip->ip_off != htons(0)) {
+			IP_STATINC(IP_STAT_FRAGMENTS);
+			s = splvm();
+			ipqe = pool_get(&ipqent_pool, PR_NOWAIT);
+			splx(s);
+			if (ipqe == NULL) {
+				IP_STATINC(IP_STAT_RCVMEMDROP);
+				IPQ_UNLOCK();
+				goto bad;
+			}
+			ipqe->ipqe_mff = mff;
+			ipqe->ipqe_m = m;
+			ipqe->ipqe_ip = ip;
+			m = ip_reass(ipqe, fp, &ipq[hash]);
+			if (m == 0) {
+				IPQ_UNLOCK();
+				return;
+			}
+			IP_STATINC(IP_STAT_REASSEMBLED);
+			ip = mtod(m, struct ip *);
+			hlen = ip->ip_hl << 2;
+			ip->ip_len = htons(ntohs(ip->ip_len) + hlen);
+		} else
+			if (fp)
+				ip_freef(fp);
+		IPQ_UNLOCK();
 	}
 
 #if defined(IPSEC)
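
The fragment path above subtracts the header from ip_len, requires non-final fragments to carry a non-zero multiple of 8 payload bytes, and converts the 13-bit wire offset from 8-byte units to a byte offset. A userland sketch of that validation arithmetic, using host-order integers rather than mbufs; the struct and function names are illustrative:

/* Sketch of the per-fragment checks performed above. */
#include <stdio.h>
#include <stdint.h>

#define IP_MF      0x2000	/* more-fragments flag (host order here) */
#define IP_OFFMASK 0x1fff	/* fragment offset field */

struct frag {
	uint16_t len;	/* payload bytes, IP header already subtracted */
	uint16_t off;	/* ip_off field, host byte order */
};

/* Returns the byte offset of the fragment, or -1 if it must be dropped. */
static int
frag_check(const struct frag *f)
{
	int mff = (f->off & IP_MF) != 0;

	if (mff && (f->len == 0 || (f->len & 0x7) != 0))
		return -1;			/* IP_STAT_BADFRAGS case */
	return (f->off & IP_OFFMASK) << 3;	/* 8-byte units -> bytes */
}

int main(void)
{
	struct frag ok  = { 1480, IP_MF | 185 };	/* 185*8 = offset 1480 */
	struct frag bad = { 1481, IP_MF | 370 };	/* not a multiple of 8 */

	printf("ok:  offset %d\n", frag_check(&ok));
	printf("bad: offset %d\n", frag_check(&bad));
	return 0;
}
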
@@ -937,30 +1068,399 @@ badcsum:
 }
 
 /*
- * IP timer processing.
+ * Take incoming datagram fragment and try to
+ * reassemble it into whole datagram.  If a chain for
+ * reassembly of this datagram already exists, then it
+ * is given as fp; otherwise have to make a chain.
+ */
+struct mbuf *
+ip_reass(struct ipqent *ipqe, struct ipq *fp, struct ipqhead *ipqhead)
+{
+	struct mbuf *m = ipqe->ipqe_m;
+	struct ipqent *nq, *p, *q;
+	struct ip *ip;
+	struct mbuf *t;
+	int hlen = ipqe->ipqe_ip->ip_hl << 2;
+	int i, next, s;
+
+	IPQ_LOCK_CHECK();
+
+	/*
+	 * Presence of header sizes in mbufs
+	 * would confuse code below.
+	 */
+	m->m_data += hlen;
+	m->m_len -= hlen;
+
+#ifdef notyet
+	/* make sure fragment limit is up-to-date */
+	CHECK_NMBCLUSTER_PARAMS();
+
+	/* If we have too many fragments, drop the older half. */
+	if (ip_nfrags >= ip_maxfrags)
+		ip_reass_drophalf();
+#endif
+
+	/*
+	 * We are about to add a fragment; increment frag count.
+	 */
+	ip_nfrags++;
+
+	/*
+	 * If first fragment to arrive, create a reassembly queue.
+	 */
+	if (fp == 0) {
+		/*
+		 * Enforce upper bound on number of fragmented packets
+		 * for which we attempt reassembly;
+		 * If maxfrag is 0, never accept fragments.
+		 * If maxfrag is -1, accept all fragments without limitation.
+		 */
+		if (ip_maxfragpackets < 0)
+			;
+		else if (ip_nfragpackets >= ip_maxfragpackets)
+			goto dropfrag;
+		ip_nfragpackets++;
+		MALLOC(fp, struct ipq *, sizeof (struct ipq),
+		    M_FTABLE, M_NOWAIT);
+		if (fp == NULL)
+			goto dropfrag;
+		LIST_INSERT_HEAD(ipqhead, fp, ipq_q);
+		fp->ipq_nfrags = 1;
+		fp->ipq_ttl = IPFRAGTTL;
+		fp->ipq_p = ipqe->ipqe_ip->ip_p;
+		fp->ipq_id = ipqe->ipqe_ip->ip_id;
+		fp->ipq_tos = ipqe->ipqe_ip->ip_tos;
+		TAILQ_INIT(&fp->ipq_fragq);
+		fp->ipq_src = ipqe->ipqe_ip->ip_src;
+		fp->ipq_dst = ipqe->ipqe_ip->ip_dst;
+		p = NULL;
+		goto insert;
+	} else {
+		fp->ipq_nfrags++;
+	}
+
+	/*
+	 * Find a segment which begins after this one does.
+	 */
+	for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL;
+	    p = q, q = TAILQ_NEXT(q, ipqe_q))
+		if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off))
+			break;
+
+	/*
+	 * If there is a preceding segment, it may provide some of
+	 * our data already.  If so, drop the data from the incoming
+	 * segment.  If it provides all of our data, drop us.
+	 */
+	if (p != NULL) {
+		i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) -
+		    ntohs(ipqe->ipqe_ip->ip_off);
+		if (i > 0) {
+			if (i >= ntohs(ipqe->ipqe_ip->ip_len))
+				goto dropfrag;
+			m_adj(ipqe->ipqe_m, i);
+			ipqe->ipqe_ip->ip_off =
+			    htons(ntohs(ipqe->ipqe_ip->ip_off) + i);
+			ipqe->ipqe_ip->ip_len =
+			    htons(ntohs(ipqe->ipqe_ip->ip_len) - i);
+		}
+	}
+
+	/*
+	 * While we overlap succeeding segments trim them or,
+	 * if they are completely covered, dequeue them.
+	 */
+	for (; q != NULL &&
+	    ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) >
+	    ntohs(q->ipqe_ip->ip_off); q = nq) {
+		i = (ntohs(ipqe->ipqe_ip->ip_off) +
+		    ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off);
+		if (i < ntohs(q->ipqe_ip->ip_len)) {
+			q->ipqe_ip->ip_len =
+			    htons(ntohs(q->ipqe_ip->ip_len) - i);
+			q->ipqe_ip->ip_off =
+			    htons(ntohs(q->ipqe_ip->ip_off) + i);
+			m_adj(q->ipqe_m, i);
+			break;
+		}
+		nq = TAILQ_NEXT(q, ipqe_q);
+		m_freem(q->ipqe_m);
+		TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q);
+		s = splvm();
+		pool_put(&ipqent_pool, q);
+		splx(s);
+		fp->ipq_nfrags--;
+		ip_nfrags--;
+	}
+
+insert:
+	/*
+	 * Stick new segment in its place;
+	 * check for complete reassembly.
+	 */
+	if (p == NULL) {
+		TAILQ_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q);
+	} else {
+		TAILQ_INSERT_AFTER(&fp->ipq_fragq, p, ipqe, ipqe_q);
+	}
+	next = 0;
+	for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL;
+	    p = q, q = TAILQ_NEXT(q, ipqe_q)) {
+		if (ntohs(q->ipqe_ip->ip_off) != next)
+			return (0);
+		next += ntohs(q->ipqe_ip->ip_len);
+	}
+	if (p->ipqe_mff)
+		return (0);
+
+	/*
+	 * Reassembly is complete.  Check for a bogus message size and
+	 * concatenate fragments.
+	 */
+	q = TAILQ_FIRST(&fp->ipq_fragq);
+	ip = q->ipqe_ip;
+	if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) {
+		IP_STATINC(IP_STAT_TOOLONG);
+		ip_freef(fp);
+		return (0);
+	}
+	m = q->ipqe_m;
+	t = m->m_next;
+	m->m_next = 0;
+	m_cat(m, t);
+	nq = TAILQ_NEXT(q, ipqe_q);
+	s = splvm();
+	pool_put(&ipqent_pool, q);
+	splx(s);
+	for (q = nq; q != NULL; q = nq) {
+		t = q->ipqe_m;
+		nq = TAILQ_NEXT(q, ipqe_q);
+		s = splvm();
+		pool_put(&ipqent_pool, q);
+		splx(s);
+		m_cat(m, t);
+	}
+	ip_nfrags -= fp->ipq_nfrags;
+
+	/*
+	 * Create header for new ip packet by
+	 * modifying header of first packet;
+	 * dequeue and discard fragment reassembly header.
+	 * Make header visible.
+	 */
+	ip->ip_len = htons(next);
+	ip->ip_src = fp->ipq_src;
+	ip->ip_dst = fp->ipq_dst;
+	LIST_REMOVE(fp, ipq_q);
+	FREE(fp, M_FTABLE);
+	ip_nfragpackets--;
+	m->m_len += (ip->ip_hl << 2);
+	m->m_data -= (ip->ip_hl << 2);
+	/* some debugging cruft by sklower, below, will go away soon */
+	if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
+		int plen = 0;
+		for (t = m; t; t = t->m_next)
+			plen += t->m_len;
+		m->m_pkthdr.len = plen;
+		m->m_pkthdr.csum_flags = 0;
+	}
+	return (m);
+
+dropfrag:
+	if (fp != 0)
+		fp->ipq_nfrags--;
+	ip_nfrags--;
+	IP_STATINC(IP_STAT_FRAGDROPPED);
+	m_freem(m);
+	s = splvm();
+	pool_put(&ipqent_pool, ipqe);
+	splx(s);
+	return (0);
+}
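
The two loops in ip_reass() above trim overlap in both directions: a preceding fragment may already hold the head of the new one, and the new fragment may cover queued successors. A sketch of the front-trim interval arithmetic, with plain host-order integers standing in for the ipqent fields:

/* Sketch of the overlap trimming performed in ip_reass() above. */
#include <stdio.h>

struct seg { int off, len; };

/*
 * Trim the front of 'n' where the preceding segment 'p' already
 * provides the data.  Returns 0 if 'n' is entirely covered.
 */
static int
trim_front(const struct seg *p, struct seg *n)
{
	int i = p->off + p->len - n->off;	/* bytes of overlap */

	if (i > 0) {
		if (i >= n->len)
			return 0;	/* fully covered: drop the fragment */
		n->off += i;		/* like m_adj(m, i) plus field fixup */
		n->len -= i;
	}
	return 1;
}

int main(void)
{
	struct seg prev = { 0, 1000 };
	struct seg next = { 800, 600 };	/* first 200 bytes already held */

	if (trim_front(&prev, &next))
		printf("kept: off=%d len=%d\n", next.off, next.len);
	return 0;
}
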
+
+/*
+ * Free a fragment reassembly header and all
+ * associated datagrams.
+ */
+void
+ip_freef(struct ipq *fp)
+{
+	struct ipqent *q, *p;
+	u_int nfrags = 0;
+	int s;
+
+	IPQ_LOCK_CHECK();
+
+	for (q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; q = p) {
+		p = TAILQ_NEXT(q, ipqe_q);
+		m_freem(q->ipqe_m);
+		nfrags++;
+		TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q);
+		s = splvm();
+		pool_put(&ipqent_pool, q);
+		splx(s);
+	}
+
+	if (nfrags != fp->ipq_nfrags)
+		printf("ip_freef: nfrags %d != %d\n", fp->ipq_nfrags, nfrags);
+	ip_nfrags -= nfrags;
+	LIST_REMOVE(fp, ipq_q);
+	FREE(fp, M_FTABLE);
+	ip_nfragpackets--;
+}
+
+/*
+ * IP reassembly TTL machinery for multiplicative drop.
+ */
+static u_int fragttl_histo[(IPFRAGTTL+1)];
+
+
+/*
+ * Decrement TTL of all reassembly queue entries by `ticks'.
+ * Count number of distinct fragments (as opposed to partial, fragmented
+ * datagrams) in the reassembly queue.  While we traverse the entire
+ * reassembly queue, compute and return the median TTL over all fragments.
+ */
+static u_int
+ip_reass_ttl_decr(u_int ticks)
+{
+	u_int nfrags, median, dropfraction, keepfraction;
+	struct ipq *fp, *nfp;
+	int i;
+
+	nfrags = 0;
+	memset(fragttl_histo, 0, sizeof fragttl_histo);
+
+	for (i = 0; i < IPREASS_NHASH; i++) {
+		for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) {
+			fp->ipq_ttl = ((fp->ipq_ttl <= ticks) ?
+			    0 : fp->ipq_ttl - ticks);
+			nfp = LIST_NEXT(fp, ipq_q);
+			if (fp->ipq_ttl == 0) {
+				IP_STATINC(IP_STAT_FRAGTIMEOUT);
+				ip_freef(fp);
+			} else {
+				nfrags += fp->ipq_nfrags;
+				fragttl_histo[fp->ipq_ttl] += fp->ipq_nfrags;
+			}
+		}
+	}
+
+	KASSERT(ip_nfrags == nfrags);
+
+	/* Find median (or other drop fraction) in histogram. */
+	dropfraction = (ip_nfrags / 2);
+	keepfraction = ip_nfrags - dropfraction;
+	for (i = IPFRAGTTL, median = 0; i >= 0; i--) {
+		median += fragttl_histo[i];
+		if (median >= keepfraction)
+			break;
+	}
+
+	/* Return TTL of median (or other fraction). */
+	return (u_int)i;
+}
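
ip_reass_ttl_decr() above bins fragments by remaining TTL and then walks the histogram from the highest TTL downward until roughly half the fragments are accounted for; the TTL where the walk stops is the "median" returned to the caller. A self-contained sketch with made-up sample counts, assuming the traditional IPFRAGTTL of 60 slowtimo ticks:

/* Sketch of the median-TTL selection in ip_reass_ttl_decr() above. */
#include <stdio.h>

#define IPFRAGTTL 60	/* reassembly TTL, in slowtimo ticks (assumed) */

static unsigned fragttl_histo[IPFRAGTTL + 1];

static unsigned
median_ttl(unsigned nfrags)
{
	unsigned dropfraction = nfrags / 2;
	unsigned keepfraction = nfrags - dropfraction;
	unsigned acc = 0;
	int i;

	/* Walk from freshest to oldest until half the frags are covered. */
	for (i = IPFRAGTTL; i >= 0; i--) {
		acc += fragttl_histo[i];
		if (acc >= keepfraction)
			break;
	}
	return (unsigned)i;	/* dropping TTLs <= i sheds ~half the frags */
}

int main(void)
{
	fragttl_histo[50] = 40;	/* 40 fresh fragments */
	fragttl_histo[10] = 60;	/* 60 old fragments */

	printf("median ttl = %u\n", median_ttl(100));
	return 0;
}
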
+
+void
+ip_reass_drophalf(void)
+{
+
+	u_int median_ticks;
+	/*
+	 * Compute median TTL of all fragments, and count frags
+	 * with that TTL or lower (roughly half of all fragments).
+	 */
+	median_ticks = ip_reass_ttl_decr(0);
+
+	/* Drop half. */
+	median_ticks = ip_reass_ttl_decr(median_ticks);
+
+}
+
 /*
- * IP timer processing.
+ * IP timer processing;
+ * if a timer expires on a reassembly
+ * queue, discard it.
  */
 void
 ip_slowtimo(void)
 {
+	static u_int dropscanidx = 0;
+	u_int i;
+	u_int median_ttl;
 
 	mutex_enter(softnet_lock);
 	KERNEL_LOCK(1, NULL);
 
-	ip_reass_slowtimo();
+	IPQ_LOCK();
+
+	/* Age TTL of all fragments by 1 tick. */
+	median_ttl = ip_reass_ttl_decr(1);
+
+	/* make sure fragment limit is up-to-date */
+	CHECK_NMBCLUSTER_PARAMS();
+
+	/* If we have too many fragments, drop the older half. */
+	if (ip_nfrags > ip_maxfrags)
+		ip_reass_ttl_decr(median_ttl);
+
+	/*
+	 * If we are over the maximum number of fragmented packets
+	 * (due to the limit being lowered), drain off
+	 * enough to get down to the new limit.  Start draining
+	 * from the reassembly hashqueue most recently drained.
+	 */
+	if (ip_maxfragpackets < 0)
+		;
+	else {
+		int wrapped = 0;
+
+		i = dropscanidx;
+		while (ip_nfragpackets > ip_maxfragpackets && wrapped == 0) {
+			while (LIST_FIRST(&ipq[i]) != NULL)
+				ip_freef(LIST_FIRST(&ipq[i]));
+			if (++i >= IPREASS_NHASH) {
+				i = 0;
+			}
+			/*
+			 * Don't scan forever even if fragment counters are
+			 * wrong: stop after scanning entire reassembly queue.
+			 */
+			if (i == dropscanidx)
+				wrapped = 1;
+		}
+		dropscanidx = i;
+	}
+	IPQ_UNLOCK();
 
 	KERNEL_UNLOCK_ONE(NULL);
 	mutex_exit(softnet_lock);
 }
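
The drain loop in ip_slowtimo() above resumes at the bucket recorded by dropscanidx and uses a wrap flag so that inconsistent packet counters cannot make it spin forever. A simulation of that rotating scan, with plain counters standing in for the hash chains and ip_freef():

/* Sketch of the rotating bucket drain in ip_slowtimo() above. */
#include <stdio.h>

#define NHASH 64

static int bucket[NHASH];	/* packets per reassembly bucket */
static int npackets;		/* total packets across all buckets */
static unsigned dropscanidx;	/* persists between scans */

static void
drain_to_limit(int maxpackets)
{
	unsigned i = dropscanidx;
	int wrapped = 0;

	while (npackets > maxpackets && !wrapped) {
		npackets -= bucket[i];	/* like ip_freef() on each entry */
		bucket[i] = 0;
		if (++i >= NHASH)
			i = 0;
		if (i == dropscanidx)	/* scanned every bucket once */
			wrapped = 1;
	}
	dropscanidx = i;		/* next scan resumes here */
}

int main(void)
{
	bucket[3] = 120;
	bucket[40] = 150;
	npackets = 270;

	drain_to_limit(200);
	printf("left=%d, next scan at %u\n", npackets, dropscanidx);
	return 0;
}
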
+ */ + ip_reass_drophalf(); + IPQ_UNLOCK(); + } + KERNEL_UNLOCK_ONE(NULL); } @@ -1056,7 +1556,7 @@ ip_dooptions(struct mbuf *m) /* * locate outgoing interface */ - memcpy((void *)&ipaddr.sin_addr, (void *)(cp + off), + bcopy((void *)(cp + off), (void *)&ipaddr.sin_addr, sizeof(ipaddr.sin_addr)); if (opt == IPOPT_SSRR) ia = ifatoia(ifa_ifwithladdr(sintosa(&ipaddr))); @@ -1092,7 +1592,7 @@ ip_dooptions(struct mbuf *m) off--; /* 0 origin */ if ((off + sizeof(struct in_addr)) > optlen) break; - memcpy((void *)&ipaddr.sin_addr, (void *)(&ip->ip_dst), + bcopy((void *)(&ip->ip_dst), (void *)&ipaddr.sin_addr, sizeof(ipaddr.sin_addr)); /* * locate outgoing interface; if we're the destination, @@ -1159,7 +1659,7 @@ ip_dooptions(struct mbuf *m) (u_char *)ip; goto bad; } - memcpy(&ipaddr.sin_addr, cp0, + bcopy(cp0, &ipaddr.sin_addr, sizeof(struct in_addr)); if (ifatoia(ifa_ifwithaddr(sintosa(&ipaddr))) == NULL) @@ -1233,7 +1733,7 @@ save_rte(u_char *option, struct in_addr #endif /* 0 */ if (olen > sizeof(ip_srcrt) - (1 + sizeof(dst))) return; - memcpy((void *)ip_srcrt.srcopt, (void *)option, olen); + bcopy((void *)option, (void *)ip_srcrt.srcopt, olen); ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); ip_srcrt.dst = dst; } @@ -1551,22 +2051,10 @@ ip_savecontrol(struct inpcb *inp, struct struct mbuf *m) { - if (inp->inp_socket->so_options & SO_TIMESTAMP -#ifdef SO_OTIMESTAMP - || inp->inp_socket->so_options & SO_OTIMESTAMP -#endif - ) { + if (inp->inp_socket->so_options & SO_TIMESTAMP) { struct timeval tv; microtime(&tv); -#ifdef SO_OTIMESTAMP - if (inp->inp_socket->so_options & SO_OTIMESTAMP) { - struct timeval50 tv50; - timeval_to_timeval50(&tv, &tv50); - *mp = sbcreatecontrol((void *) &tv50, sizeof(tv50), - SCM_OTIMESTAMP, SOL_SOCKET); - } else -#endif *mp = sbcreatecontrol((void *) &tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp) @@ -1612,12 +2100,6 @@ ip_savecontrol(struct inpcb *inp, struct if (*mp) mp = &(*mp)->m_next; } - if (inp->inp_flags & INP_RECVTTL) { - *mp = sbcreatecontrol((void *) &ip->ip_ttl, - sizeof(uint8_t), IP_RECVTTL, IPPROTO_IP); - if (*mp) - mp = &(*mp)->m_next; - } } /* @@ -1636,10 +2118,9 @@ sysctl_net_inet_ip_forwsrcrt(SYSCTLFN_AR if (error || newp == NULL) return (error); - error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_FORWSRCRT, - 0, NULL, NULL, NULL); - if (error) - return (error); + if (kauth_authorize_network(l->l_cred, KAUTH_NETWORK_FORWSRCRT, + 0, NULL, NULL, NULL)) + return (EPERM); ip_forwsrcrt = tmp; @@ -1701,7 +2182,7 @@ sysctl_net_inet_ip_maxflows(SYSCTLFN_ARG static int sysctl_net_inet_ip_hashsize(SYSCTLFN_ARGS) -{ +{ int error, tmp; struct sysctlnode node; @@ -1729,7 +2210,7 @@ sysctl_net_inet_ip_hashsize(SYSCTLFN_ARG * EINVAL if not a power of 2 */ error = EINVAL; - } + } return error; } @@ -1742,8 +2223,7 @@ sysctl_net_inet_ip_stats(SYSCTLFN_ARGS) return (NETSTAT_SYSCTL(ipstat_percpu, IP_NSTATS)); } -static void -sysctl_net_inet_ip_setup(struct sysctllog **clog) +SYSCTL_SETUP(sysctl_net_inet_ip_setup, "sysctl net.inet.ip subtree setup") { extern int subnetsarelocal, hostzeroisbroadcast; @@ -1903,6 +2383,14 @@ sysctl_net_inet_ip_setup(struct sysctllo CTL_NET, PF_INET, IPPROTO_IP, IPCTL_LOWPORTMAX, CTL_EOL); #endif /* IPNOPRIVPORTS */ + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "maxfragpackets", + SYSCTL_DESCR("Maximum number of fragments to retain for " + "possible reassembly"), + NULL, 0, &ip_maxfragpackets, 0, + CTL_NET, PF_INET, IPPROTO_IP, + IPCTL_MAXFRAGPACKETS, CTL_EOL); #if 
@@ -1551,22 +2051,10 @@ ip_savecontrol(struct inpcb *inp, struct
     struct mbuf *m)
 {
 
-	if (inp->inp_socket->so_options & SO_TIMESTAMP
-#ifdef SO_OTIMESTAMP
-	    || inp->inp_socket->so_options & SO_OTIMESTAMP
-#endif
-	    ) {
+	if (inp->inp_socket->so_options & SO_TIMESTAMP) {
 		struct timeval tv;
 
 		microtime(&tv);
-#ifdef SO_OTIMESTAMP
-		if (inp->inp_socket->so_options & SO_OTIMESTAMP) {
-			struct timeval50 tv50;
-			timeval_to_timeval50(&tv, &tv50);
-			*mp = sbcreatecontrol((void *) &tv50, sizeof(tv50),
-			    SCM_OTIMESTAMP, SOL_SOCKET);
-		} else
-#endif
 		*mp = sbcreatecontrol((void *) &tv, sizeof(tv),
 		    SCM_TIMESTAMP, SOL_SOCKET);
 		if (*mp)
@@ -1612,12 +2100,6 @@ ip_savecontrol(struct inpcb *inp, struct
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
-	if (inp->inp_flags & INP_RECVTTL) {
-		*mp = sbcreatecontrol((void *) &ip->ip_ttl,
-		    sizeof(uint8_t), IP_RECVTTL, IPPROTO_IP);
-		if (*mp)
-			mp = &(*mp)->m_next;
-	}
 }
 
 /*
@@ -1636,10 +2118,9 @@ sysctl_net_inet_ip_forwsrcrt(SYSCTLFN_AR
 	if (error || newp == NULL)
 		return (error);
 
-	error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_FORWSRCRT,
-	    0, NULL, NULL, NULL);
-	if (error)
-		return (error);
+	if (kauth_authorize_network(l->l_cred, KAUTH_NETWORK_FORWSRCRT,
+	    0, NULL, NULL, NULL))
+		return (EPERM);
 
 	ip_forwsrcrt = tmp;
 
@@ -1701,7 +2182,7 @@ sysctl_net_inet_ip_maxflows(SYSCTLFN_ARG
 
 static int
 sysctl_net_inet_ip_hashsize(SYSCTLFN_ARGS)
-{
+{ 
 	int error, tmp;
 	struct sysctlnode node;
 
@@ -1729,7 +2210,7 @@ sysctl_net_inet_ip_hashsize(SYSCTLFN_ARG
 		 * EINVAL if not a power of 2
 		 */
 		error = EINVAL;
-	}
+	} 
 
 	return error;
 }
@@ -1742,8 +2223,7 @@ sysctl_net_inet_ip_stats(SYSCTLFN_ARGS)
 	return (NETSTAT_SYSCTL(ipstat_percpu, IP_NSTATS));
 }
 
-static void
-sysctl_net_inet_ip_setup(struct sysctllog **clog)
+SYSCTL_SETUP(sysctl_net_inet_ip_setup, "sysctl net.inet.ip subtree setup")
 {
 	extern int subnetsarelocal, hostzeroisbroadcast;
 
@@ -1903,6 +2383,14 @@ sysctl_net_inet_ip_setup(struct sysctllo
 		       CTL_NET, PF_INET, IPPROTO_IP,
 		       IPCTL_LOWPORTMAX, CTL_EOL);
 #endif /* IPNOPRIVPORTS */
+	sysctl_createv(clog, 0, NULL, NULL,
+		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+		       CTLTYPE_INT, "maxfragpackets",
+		       SYSCTL_DESCR("Maximum number of fragments to retain for "
+				    "possible reassembly"),
+		       NULL, 0, &ip_maxfragpackets, 0,
+		       CTL_NET, PF_INET, IPPROTO_IP,
+		       IPCTL_MAXFRAGPACKETS, CTL_EOL);
 #if NGRE > 0
 	sysctl_createv(clog, 0, NULL, NULL,
 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,