Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. =================================================================== RCS file: /ftp/cvs/cvsroot/src/sys/netinet/ip_input.c,v rcsdiff: /ftp/cvs/cvsroot/src/sys/netinet/ip_input.c,v: warning: Unknown phrases like `commitid ...;' are present. retrieving revision 1.57 retrieving revision 1.78 diff -u -p -r1.57 -r1.78 --- src/sys/netinet/ip_input.c 1998/02/13 18:21:44 1.57 +++ src/sys/netinet/ip_input.c 1999/01/19 21:58:40 1.78 @@ -1,4 +1,41 @@ -/* $NetBSD: ip_input.c,v 1.57 1998/02/13 18:21:44 tls Exp $ */ +/* $NetBSD: ip_input.c,v 1.78 1999/01/19 21:58:40 mycroft Exp $ */ + +/*- + * Copyright (c) 1998 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Public Access Networks Corporation ("Panix"). It was developed under + * contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ /* * Copyright (c) 1982, 1986, 1988, 1993 @@ -35,6 +72,8 @@ * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ +#include "opt_gateway.h" +#include "opt_pfil_hooks.h" #include "opt_mrouting.h" #include @@ -49,6 +88,7 @@ #include #include #include +#include #include #include @@ -85,6 +125,9 @@ #ifndef IPMTUDISC #define IPMTUDISC 0 #endif +#ifndef IPMTUDISCTIMEOUT +#define IPMTUDISCTIMEOUT (10 * 60) /* as per RFC 1191 */ +#endif /* * Note: DIRECTED_BROADCAST is handled this way so that previous @@ -104,10 +147,13 @@ int ip_forwsrcrt = IPFORWSRCRT; int ip_directedbcast = IPDIRECTEDBCAST; int ip_allowsrcrt = IPALLOWSRCRT; int ip_mtudisc = IPMTUDISC; +u_int ip_mtudisc_timeout = IPMTUDISCTIMEOUT; #ifdef DIAGNOSTIC int ipprintfs = 0; #endif +struct rttimer_queue *ip_mtudisc_timeout_q = NULL; + extern struct domain inetdomain; extern struct protosw inetsw[]; u_char ip_protox[IPPROTO_MAX]; @@ -115,6 +161,64 @@ int ipqmaxlen = IFQ_MAXLEN; struct in_ifaddrhead in_ifaddr; struct in_ifaddrhashhead *in_ifaddrhashtbl; struct ifqueue ipintrq; +struct ipstat ipstat; +u_int16_t ip_id; +int ip_defttl; + +struct ipqhead ipq; +int ipq_locked; + +static __inline int ipq_lock_try __P((void)); +static __inline void ipq_unlock __P((void)); + +static __inline int +ipq_lock_try() +{ + int s; + + s = splimp(); + if (ipq_locked) { + splx(s); + return (0); + } + ipq_locked = 1; + splx(s); + return (1); +} + +static __inline void +ipq_unlock() +{ + int s; + + s = splimp(); + ipq_locked = 0; + splx(s); +} + +#ifdef DIAGNOSTIC +#define IPQ_LOCK() \ +do { \ + if (ipq_lock_try() == 0) { \ + printf("%s:%d: ipq already locked\n", __FILE__, __LINE__); \ + panic("ipq_lock"); \ + } \ +} while (0) +#define IPQ_LOCK_CHECK() \ +do { \ + if (ipq_locked == 0) { \ + printf("%s:%d: ipq lock not held\n", __FILE__, __LINE__); \ + panic("ipq lock check"); \ + } \ +} while (0) +#else +#define IPQ_LOCK() (void) ipq_lock_try() +#define IPQ_LOCK_CHECK() /* nothing */ +#endif + +#define IPQ_UNLOCK() ipq_unlock() + +struct pool ipqent_pool; /* * We need to save the IP options in case a protocol wants to respond @@ -143,6 +247,9 @@ ip_init() register struct protosw *pr; register int i; + pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl", + 0, NULL, NULL, M_IPQ); + pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == 0) panic("ip_init"); @@ -159,6 +266,12 @@ ip_init() TAILQ_INIT(&in_ifaddr); in_ifaddrhashtbl = hashinit(IN_IFADDR_HASH_SIZE, M_IFADDR, M_WAITOK, &in_ifaddrhash); + if (ip_mtudisc != 0) + ip_mtudisc_timeout_q = + rt_timer_queue_create(ip_mtudisc_timeout); +#ifdef GATEWAY + ipflow_init(); +#endif } struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET }; @@ -227,7 +340,7 @@ next: } ip = mtod(m, struct ip *); } - if ((ip->ip_sum = in_cksum(m, hlen)) != 0) { + if (in_cksum(m, hlen) != 0) { ipstat.ips_badsum++; goto bad; } @@ -258,17 +371,30 @@ next: m_adj(m, len - m->m_pkthdr.len); } + /* + * Assume that we can create a fast-forward IP flow entry + * based on this packet. + */ + m->m_flags |= M_CANFASTFWD; + #ifdef PFIL_HOOKS /* - * Run through list of hooks for input packets. + * Run through list of hooks for input packets. If there are any + * filters which require that additional packets in the flow are + * not fast-forwarded, they must clear the M_CANFASTFWD flag. + * Note that filters must _never_ set this flag, as another filter + * in the list may have previously cleared it. */ m0 = m; - for (pfh = pfil_hook_get(PFIL_IN); pfh; pfh = pfh->pfil_link.le_next) + for (pfh = pfil_hook_get(PFIL_IN); pfh; pfh = pfh->pfil_link.tqe_next) if (pfh->pfil_func) { rv = pfh->pfil_func(ip, hlen, m->m_pkthdr.rcvif, 0, &m0); if (rv) goto next; - ip = mtod(m = m0, struct ip *); + m = m0; + if (m == NULL) + goto next; + ip = mtod(m, struct ip *); } #endif /* PFIL_HOOKS */ @@ -392,6 +518,7 @@ ours: * Look for queue of fragments * of this datagram. */ + IPQ_LOCK(); for (fp = ipq.lh_first; fp != NULL; fp = fp->ipq_q.le_next) if (ip->ip_id == fp->ipq_id && in_hosteq(ip->ip_src, fp->ipq_src) && @@ -415,6 +542,7 @@ found: */ if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) { ipstat.ips_badfrags++; + IPQ_UNLOCK(); goto bad; } } @@ -427,23 +555,27 @@ found: */ if (mff || ip->ip_off) { ipstat.ips_fragments++; - MALLOC(ipqe, struct ipqent *, sizeof (struct ipqent), - M_IPQ, M_NOWAIT); + ipqe = pool_get(&ipqent_pool, PR_NOWAIT); if (ipqe == NULL) { ipstat.ips_rcvmemdrop++; + IPQ_UNLOCK(); goto bad; } ipqe->ipqe_mff = mff; ipqe->ipqe_m = m; ipqe->ipqe_ip = ip; m = ip_reass(ipqe, fp); - if (m == 0) + if (m == 0) { + IPQ_UNLOCK(); goto next; + } ipstat.ips_reassembled++; ip = mtod(m, struct ip *); + hlen = ip->ip_hl << 2; } else if (fp) ip_freef(fp); + IPQ_UNLOCK(); } else ip->ip_len -= hlen; @@ -476,6 +608,8 @@ ip_reass(ipqe, fp) int hlen = ipqe->ipqe_ip->ip_hl << 2; int i, next; + IPQ_LOCK_CHECK(); + /* * Presence of header sizes in mbufs * would confuse code below. @@ -544,7 +678,7 @@ ip_reass(ipqe, fp) nq = q->ipqe_q.le_next; m_freem(q->ipqe_m); LIST_REMOVE(q, ipqe_q); - FREE(q, M_IPQ); + pool_put(&ipqent_pool, q); } insert: @@ -583,11 +717,11 @@ insert: m->m_next = 0; m_cat(m, t); nq = q->ipqe_q.le_next; - FREE(q, M_IPQ); + pool_put(&ipqent_pool, q); for (q = nq; q != NULL; q = nq) { t = q->ipqe_m; nq = q->ipqe_q.le_next; - FREE(q, M_IPQ); + pool_put(&ipqent_pool, q); m_cat(m, t); } @@ -616,7 +750,7 @@ insert: dropfrag: ipstat.ips_fragdropped++; m_freem(m); - FREE(ipqe, M_IPQ); + pool_put(&ipqent_pool, ipqe); return (0); } @@ -630,11 +764,13 @@ ip_freef(fp) { register struct ipqent *q, *p; + IPQ_LOCK_CHECK(); + for (q = fp->ipq_fragq.lh_first; q != NULL; q = p) { p = q->ipqe_q.le_next; m_freem(q->ipqe_m); LIST_REMOVE(q, ipqe_q); - FREE(q, M_IPQ); + pool_put(&ipqent_pool, q); } LIST_REMOVE(fp, ipq_q); FREE(fp, M_FTABLE); @@ -651,6 +787,7 @@ ip_slowtimo() register struct ipq *fp, *nfp; int s = splsoftnet(); + IPQ_LOCK(); for (fp = ipq.lh_first; fp != NULL; fp = nfp) { nfp = fp->ipq_q.le_next; if (--fp->ipq_ttl == 0) { @@ -658,6 +795,10 @@ ip_slowtimo() ip_freef(fp); } } + IPQ_UNLOCK(); +#ifdef GATEWAY + ipflow_slowtimo(); +#endif splx(s); } @@ -668,10 +809,19 @@ void ip_drain() { + /* + * We may be called from a device's interrupt context. If + * the ipq is already busy, just bail out now. + */ + if (ipq_lock_try() == 0) + return; + while (ipq.lh_first != NULL) { ipstat.ips_fragdropped++; ip_freef(ipq.lh_first); } + + IPQ_UNLOCK(); } /* @@ -827,7 +977,7 @@ ip_dooptions(m) break; case IPOPT_TS_TSANDADDR: - if (ipt->ipt_ptr + sizeof(n_time) + + if (ipt->ipt_ptr - 1 + sizeof(n_time) + sizeof(struct in_addr) > ipt->ipt_len) goto bad; ipaddr.sin_addr = dst; @@ -841,7 +991,7 @@ ip_dooptions(m) break; case IPOPT_TS_PRESPEC: - if (ipt->ipt_ptr + sizeof(n_time) + + if (ipt->ipt_ptr - 1 + sizeof(n_time) + sizeof(struct in_addr) > ipt->ipt_len) goto bad; bcopy((caddr_t)sin, (caddr_t)&ipaddr.sin_addr, @@ -1063,15 +1213,15 @@ ip_forward(m, srcrt) dest = 0; #ifdef DIAGNOSTIC if (ipprintfs) - printf("forward: src %x dst %x ttl %x\n", - ip->ip_src.s_addr, ip->ip_dst.s_addr, ip->ip_ttl); + printf("forward: src %2.2x dst %2.2x ttl %x\n", + ntohl(ip->ip_src.s_addr), + ntohl(ip->ip_dst.s_addr), ip->ip_ttl); #endif if (m->m_flags & M_BCAST || in_canforward(ip->ip_dst) == 0) { ipstat.ips_cantforward++; m_freem(m); return; } - HTONS(ip->ip_id); if (ip->ip_ttl <= IPTTLDEC) { icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0); return; @@ -1118,16 +1268,20 @@ ip_forward(m, srcrt) if (rt->rt_ifa && (ip->ip_src.s_addr & ifatoia(rt->rt_ifa)->ia_subnetmask) == ifatoia(rt->rt_ifa)->ia_subnet) { - if (rt->rt_flags & RTF_GATEWAY) - dest = satosin(rt->rt_gateway)->sin_addr.s_addr; - else - dest = ip->ip_dst.s_addr; - /* Router requirements says to only send host redirects */ - type = ICMP_REDIRECT; - code = ICMP_REDIRECT_HOST; + if (rt->rt_flags & RTF_GATEWAY) + dest = satosin(rt->rt_gateway)->sin_addr.s_addr; + else + dest = ip->ip_dst.s_addr; + /* + * Router requirements says to only send host + * redirects. + */ + type = ICMP_REDIRECT; + code = ICMP_REDIRECT_HOST; #ifdef DIAGNOSTIC - if (ipprintfs) - printf("redirect (%d) to %x\n", code, (u_int32_t)dest); + if (ipprintfs) + printf("redirect (%d) to %x\n", code, + (u_int32_t)dest); #endif } } @@ -1141,8 +1295,13 @@ ip_forward(m, srcrt) if (type) ipstat.ips_redirectsent++; else { - if (mcopy) + if (mcopy) { +#ifdef GATEWAY + if (mcopy->m_flags & M_CANFASTFWD) + ipflow_create(&ipforward_rt, mcopy); +#endif m_freem(mcopy); + } return; } } @@ -1288,8 +1447,16 @@ ip_sysctl(name, namelen, oldp, oldlenp, return (sysctl_int(oldp, oldlenp, newp, newlen, &subnetsarelocal)); case IPCTL_MTUDISC: - return (sysctl_int(oldp, oldlenp, newp, newlen, - &ip_mtudisc)); + error = sysctl_int(oldp, oldlenp, newp, newlen, + &ip_mtudisc); + if (ip_mtudisc != 0 && ip_mtudisc_timeout_q == NULL) { + ip_mtudisc_timeout_q = + rt_timer_queue_create(ip_mtudisc_timeout); + } else if (ip_mtudisc == 0 && ip_mtudisc_timeout_q != NULL) { + rt_timer_queue_destroy(ip_mtudisc_timeout_q, TRUE); + ip_mtudisc_timeout_q = NULL; + } + return error; case IPCTL_ANONPORTMIN: old = anonportmin; error = sysctl_int(oldp, oldlenp, newp, newlen, &anonportmin); @@ -1314,6 +1481,26 @@ ip_sysctl(name, namelen, oldp, oldlenp, return (EINVAL); } return (error); + case IPCTL_MTUDISCTIMEOUT: + error = sysctl_int(oldp, oldlenp, newp, newlen, + &ip_mtudisc_timeout); + if (ip_mtudisc_timeout_q != NULL) + rt_timer_queue_change(ip_mtudisc_timeout_q, + ip_mtudisc_timeout); + return (error); +#ifdef GATEWAY + case IPCTL_MAXFLOWS: + { + int s; + + error = sysctl_int(oldp, oldlenp, newp, newlen, + &ip_maxflows); + s = splsoftnet(); + ipflow_reap(0); + splx(s); + return (error); + } +#endif default: return (EOPNOTSUPP); }