[BACK]Return to raw_ip.c CVS log [TXT][DIR] Up to [cvs.NetBSD.org] / src / sys / netinet

File: [cvs.NetBSD.org] / src / sys / netinet / raw_ip.c (download)

Revision 1.116.2.1, Wed Jul 17 03:16:31 2013 UTC (10 years, 8 months ago) by rmind
Branch: rmind-smpnet
Changes since 1.116: +265 -257 lines

Checkpoint work in progress:
- Move PCB structures under __INPCB_PRIVATE, adjust most of the callers
  and thus make IPv4 PCB structures mostly opaque.  Any volunteers for
  merging in6pcb with inpcb (see rpaulo-netinet-merge-pcb branch)?
- Move various global vars to the modules where they belong, make them static.
- Some preliminary work for IPv4 PCB locking scheme.
- Make raw IP code mostly MP-safe.  Simplify some of it.
- Rework "fast" IP forwarding (ipflow) code to be mostly MP-safe.  It should
  run from a software interrupt, rather than hard.
- Rework tun(4) pseudo interface to be MP-safe.
- Work towards making some other interfaces more strict.

/*	$NetBSD: raw_ip.c,v 1.116.2.1 2013/07/17 03:16:31 rmind Exp $	*/

/*
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
 */

/*
 * Raw interface to IP protocol.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: raw_ip.c,v 1.116.2.1 2013/07/17 03:16:31 rmind Exp $");

#include "opt_inet.h"
#include "opt_compat_netbsd.h"
#include "opt_ipsec.h"
#include "opt_mrouting.h"

#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/socketvar.h>
#include <sys/systm.h>
#include <sys/kauth.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_private.h>
#include <netinet/ip_mroute.h>
#include <netinet/ip_icmp.h>
#include <netinet/in_pcb.h>
#include <netinet/in_proto.h>
#include <netinet/in_var.h>

#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec_var.h>
#include <netipsec/ipsec_private.h>
#endif

#ifdef COMPAT_50
#include <compat/sys/socket.h>
#endif

static inpcbtable_t *	rawcbtable __read_mostly;

static void		sysctl_net_inet_raw_setup(struct sysctllog **);

/*
 * Nominal space allocated to a raw ip socket.
 */
#define	RIPSNDQ		8192
#define	RIPRCVQ		8192

static u_long		rip_sendspace = RIPSNDQ;
static u_long		rip_recvspace = RIPRCVQ;

struct rip_input_ctx {
	struct mbuf *		mbuf;
	struct ip *		ip;
	struct sockaddr_in	src;
	unsigned		hlen;
	unsigned		nfound;
};

struct rip_ctlinput_ctx {
	struct ip *		ip;
	struct in_addr		addr;
	int			errno;
};

void
rip_init(void)
{
	rawcbtable = inpcb_init(1, 1, 0);
	sysctl_net_inet_raw_setup(NULL);
}

/*
 * rip_append: pass the received datagram to the process.
 */
static void
rip_append(inpcb_t *inp, struct rip_input_ctx *rctx)
{
	struct socket *so = inpcb_get_socket(inp);
	int inpflags = inpcb_get_flags(inp);
	struct mbuf *n, *opts = NULL;

	/* XXX: Might optimise this, but not with a silly loop! */
	if ((n = m_copypacket(rctx->mbuf, M_DONTWAIT)) == NULL) {
		return;
	}

	if (inpflags & INP_NOHEADER) {
		m_adj(n, rctx->hlen);
	}

	if ((inpflags & INP_CONTROLOPTS) != 0
#ifdef SO_OTIMESTAMP
	    || (so->so_options & SO_OTIMESTAMP) != 0
#endif
	    || (so->so_options & SO_TIMESTAMP) != 0) {
		struct ip *ip = rctx->ip;
		ip_savecontrol(inp, &opts, ip, n);
	}

	if (sbappendaddr(&so->so_rcv, sintosa(&rctx->src), n, opts) == 0) {
		/* Should notify about lost packet. */
		if (opts) {
			m_freem(opts);
		}
		m_freem(n);
	} else {
		sorwakeup(so);
	}
}

static int
rip_pcb_process(inpcb_t *inp, void *arg)
{
	struct rip_input_ctx *rctx = arg;
	const struct ip *ip = rctx->ip;
	struct ip *inp_ip = in_getiphdr(inp);
	struct in_addr laddr, faddr;

	if (inp_ip->ip_p && inp_ip->ip_p != ip->ip_p) {
		return 0;
	}
	inpcb_get_addrs(inp, &laddr, &faddr);

	if (!in_nullhost(laddr) && !in_hosteq(laddr, ip->ip_dst)) {
		return 0;
	}
	if (!in_nullhost(faddr) && !in_hosteq(faddr, ip->ip_src)) {
		return 0;
	}

#if defined(IPSEC)
	/* Check AH/ESP integrity. */
	if (ipsec4_in_reject_so(rctx->mbuf, inpcb_get_socket(inp))) {
		/* Do not inject data into PCB. */
		IPSEC_STATINC(IPSEC_STAT_IN_POLVIO);
		return 0;
	}
#endif
	rip_append(inp, rctx);
	rctx->nfound++;
	return 0;
}

void
rip_input(struct mbuf *m, ...)
{
	struct ip *ip = mtod(m, struct ip *);
	int error, hlen, proto;
	va_list ap;

	va_start(ap, m);
	(void)va_arg(ap, int);		/* ignore value, advance ap */
	proto = va_arg(ap, int);
	va_end(ap);

	KASSERTMSG((proto == ip->ip_p), "%s: protocol mismatch", __func__);

	/*
	 * Compatibility: programs using raw IP expect ip_len field to have
	 * the header length subtracted.  Also, ip_len and ip_off fields are
	 * expected to be in host order.
	 */
	hlen = ip->ip_hl << 2;
	ip->ip_len = ntohs(ip->ip_len) - hlen;
	NTOHS(ip->ip_off);

	/* Save some context for the iterator. */
	struct rip_input_ctx rctx = {
		.mbuf = m, .ip = ip, .hlen = hlen, .nfound = 0
	};
	sockaddr_in_init(&rctx.src, &ip->ip_src, 0);

	/* Scan all raw IP PCBs for matching entries. */
	error = inpcb_foreach(rawcbtable, AF_INET, rip_pcb_process, &rctx);
	KASSERT(error == 0);

	/* Done, if found any. */
	if (rctx.nfound) {
		return;
	}

	if (inetsw[ip_protox[ip->ip_p]].pr_input == rip_input) {
		uint64_t *ips;

		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0);
		ips = IP_STAT_GETREF();
		ips[IP_STAT_NOPROTO]++;
		ips[IP_STAT_DELIVERED]--;
		IP_STAT_PUTREF();
	} else {
		m_freem(m);
	}
}

static int
rip_pcbnotify(inpcb_t *inp, void *arg)
{
	struct rip_ctlinput_ctx *rctx = arg;
	const struct ip *ip = rctx->ip;
	struct ip *inp_ip = in_getiphdr(inp);
	struct in_addr laddr, faddr;

	if (inp_ip->ip_p && inp_ip->ip_p != ip->ip_p) {
		return 0;
	}
	inpcb_get_addrs(inp, &laddr, &faddr);

	if (in_hosteq(faddr, rctx->addr) && in_hosteq(laddr, ip->ip_src)) {
		inpcb_rtchange(inp, rctx->errno);
	}
	return 0;
}

void *
rip_ctlinput(int cmd, const struct sockaddr *sa, void *v)
{
	struct ip *ip = v;
	int errno;

	if (sa->sa_family != AF_INET ||
	    sa->sa_len != sizeof(struct sockaddr_in))
		return NULL;
	if ((unsigned)cmd >= PRC_NCMDS)
		return NULL;
	errno = inetctlerrmap[cmd];

	if (PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD || ip == NULL) {
		inpcb_notifyall(rawcbtable, satocsin(sa)->sin_addr,
		    errno, inpcb_rtchange);
		return NULL;
	} else if (errno == 0) {
		return NULL;
	}

	/* Note: mapped address case. */
	struct rip_ctlinput_ctx rctx = {
		.ip = ip, .addr = satocsin(sa)->sin_addr, .errno = errno
	};
	(void)inpcb_foreach(rawcbtable, AF_INET, rip_pcbnotify, &rctx);

	return NULL;
}

/*
 * Generate IP header and pass packet to the IP output routine.
 * Tack on options user may have setup with control call.
 */
int
rip_output(struct mbuf *m, ...)
{
	inpcb_t *inp;
	struct socket *so;
	struct ip *ip;
	struct mbuf *opts;
	int flags, inpflags;
	va_list ap;

	va_start(ap, m);
	inp = va_arg(ap, inpcb_t *);
	va_end(ap);

	so = inpcb_get_socket(inp);
	KASSERT(solocked(so));

	flags = (so->so_options & SO_DONTROUTE) |
	    IP_ALLOWBROADCAST | IP_RETURNMTU;
	inpflags = inpcb_get_flags(inp);

	/*
	 * If the user handed us a complete IP packet, use it.
	 * Otherwise, allocate an mbuf for a header and fill it in.
	 */
	if ((inpflags & INP_HDRINCL) == 0) {
		struct ip *inp_ip = in_getiphdr(inp);

		if ((m->m_pkthdr.len + sizeof(struct ip)) > IP_MAXPACKET) {
			m_freem(m);
			return EMSGSIZE;
		}
		M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
		if (m == NULL) {
			return ENOBUFS;
		}
		ip = mtod(m, struct ip *);
		ip->ip_tos = 0;
		ip->ip_off = htons(0);
		ip->ip_p = inp_ip->ip_p;
		ip->ip_len = htons(m->m_pkthdr.len);
		inpcb_get_addrs(inp, &ip->ip_src, &ip->ip_dst);

		ip->ip_ttl = MAXTTL;
		opts = inpcb_get_options(inp);
	} else {
		if (m->m_pkthdr.len > IP_MAXPACKET) {
			m_freem(m);
			return EMSGSIZE;
		}
		ip = mtod(m, struct ip *);

		/*
		 * If the mbuf is read-only, we need to allocate a new mbuf
		 * for the header, since we need to modify the header.
		 */
		if (M_READONLY(m)) {
			const int hlen = ip->ip_hl << 2;

			m = m_copyup(m, hlen, (max_linkhdr + 3) & ~3);
			if (m == NULL) {
				return ENOMEM;	/* XXX */
			}
			ip = mtod(m, struct ip *);
		}

		/*
		 * Applications on raw sockets pass us packets
		 * in host byte order.
		 */
		if (m->m_pkthdr.len != ip->ip_len) {
			m_freem(m);
			return (EINVAL);
		}
		HTONS(ip->ip_len);
		HTONS(ip->ip_off);
		if (ip->ip_id || m->m_pkthdr.len < IP_MINFRAGSIZE) {
			flags |= IP_NOIPNEWID;
		}
		opts = NULL;

		/*
		 * Note: prevent IP output from overwriting header fields.
		 */
		flags |= IP_RAWOUTPUT;
		IP_STATINC(IP_STAT_RAWOUT);
	}

	return ip_output(m, opts, inpcb_get_route(inp), flags,
	    inpcb_get_moptions(inp), so);
}

/*
 * Raw IP socket option processing.
 */
int
rip_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
	inpcb_t *inp = sotoinpcb(so);
	int inpflags = inpcb_get_flags(inp);
	int error = 0, optval;

	KASSERT(solocked(so));

	if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_NOHEADER) {
		if (op == PRCO_GETOPT) {
			optval = (inpflags & INP_NOHEADER) ? 1 : 0;
			error = sockopt_set(sopt, &optval, sizeof(optval));
		} else if (op == PRCO_SETOPT) {
			error = sockopt_getint(sopt, &optval);
			if (error)
				goto out;
			if (optval) {
				inpflags &= ~INP_HDRINCL;
				inpflags |= INP_NOHEADER;
			} else
				inpflags &= ~INP_NOHEADER;
		}
		goto out;
	}

	if (sopt->sopt_level != IPPROTO_IP) {
		return ip_ctloutput(op, so, sopt);
	}

	switch (op) {
	case PRCO_SETOPT:
		switch (sopt->sopt_name) {
		case IP_HDRINCL:
			error = sockopt_getint(sopt, &optval);
			if (error)
				break;
			if (optval)
				inpflags |= INP_HDRINCL;
			else
				inpflags &= ~INP_HDRINCL;
			break;

#ifdef MROUTING
		case MRT_INIT:
		case MRT_DONE:
		case MRT_ADD_VIF:
		case MRT_DEL_VIF:
		case MRT_ADD_MFC:
		case MRT_DEL_MFC:
		case MRT_ASSERT:
		case MRT_API_CONFIG:
		case MRT_ADD_BW_UPCALL:
		case MRT_DEL_BW_UPCALL:
			error = ip_mrouter_set(so, sopt);
			break;
#endif

		default:
			error = ip_ctloutput(op, so, sopt);
			break;
		}
		break;

	case PRCO_GETOPT:
		switch (sopt->sopt_name) {
		case IP_HDRINCL:
			optval = inpflags & INP_HDRINCL;
			error = sockopt_set(sopt, &optval, sizeof(optval));
			break;

#ifdef MROUTING
		case MRT_VERSION:
		case MRT_ASSERT:
		case MRT_API_SUPPORT:
		case MRT_API_CONFIG:
			error = ip_mrouter_get(so, sopt);
			break;
#endif

		default:
			error = ip_ctloutput(op, so, sopt);
			break;
		}
		break;
	}
 out:
	if (!error) {
		inpcb_set_flags(inp, inpflags);
	}
	return error;
}

static int
rip_bind(inpcb_t *inp, struct mbuf *nam)
{
	struct sockaddr_in *addr = mtod(nam, struct sockaddr_in *);

	if (nam->m_len != sizeof(*addr))
		return EINVAL;
	if (!IFNET_FIRST())
		return EADDRNOTAVAIL;
	if (addr->sin_family != AF_INET)
		return EAFNOSUPPORT;
	if (!in_nullhost(addr->sin_addr) && !ifa_ifwithaddr(sintosa(addr)))
		return EADDRNOTAVAIL;

	inpcb_set_addrs(inp, &addr->sin_addr, NULL);
	return 0;
}

static int
rip_connect(inpcb_t *inp, struct mbuf *nam)
{
	struct sockaddr_in *addr = mtod(nam, struct sockaddr_in *);

	if (nam->m_len != sizeof(*addr))
		return EINVAL;
	if (!IFNET_FIRST())
		return EADDRNOTAVAIL;
	if (addr->sin_family != AF_INET)
		return EAFNOSUPPORT;

	inpcb_set_addrs(inp, NULL, &addr->sin_addr);
	return 0;
}

static void
rip_disconnect(inpcb_t *inp)
{
	inpcb_set_addrs(inp, NULL, &zeroin_addr);
}

int
rip_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control, struct lwp *l)
{
	inpcb_t *inp;
	struct ip *ip;
	int error = 0;
#ifdef MROUTING
	extern struct socket *ip_mrouter;
#endif

	if (req == PRU_CONTROL) {
		return in_control(so, (long)m, nam, (ifnet_t *)control, l);
	}
	if (req == PRU_PURGEIF) {
		int s = splsoftnet();
		mutex_enter(softnet_lock);
		inpcb_purgeif0(rawcbtable, (ifnet_t *)control);
		in_purgeif((ifnet_t *)control);
		inpcb_purgeif(rawcbtable, (ifnet_t *)control);
		mutex_exit(softnet_lock);
		splx(s);
		return 0;
	}

	KASSERT(req == PRU_ATTACH || solocked(so));
	inp = sotoinpcb(so);

	KASSERT(!control || (req == PRU_SEND || req == PRU_SENDOOB));
	if (inp == NULL && req != PRU_ATTACH) {
		return EINVAL;
	}

	switch (req) {
	case PRU_ATTACH:
		sosetlock(so);
		if (inp) {
			error = EISCONN;
			break;
		}

		/* XXX: raw socket permissions are checked in socreate() */

		if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
			error = soreserve(so, rip_sendspace, rip_recvspace);
			if (error)
				break;
		}
		error = inpcb_create(so, rawcbtable);
		if (error)
			break;
		inp = sotoinpcb(so);
		ip = in_getiphdr(inp);
		ip->ip_p = (long)nam;
		break;

	case PRU_DETACH:
#ifdef MROUTING
		if (so == ip_mrouter)
			ip_mrouter_done();
#endif
		inpcb_destroy(inp);
		break;

	case PRU_BIND:
		error = rip_bind(inp, nam);
		break;

	case PRU_LISTEN:
		error = EOPNOTSUPP;
		break;

	case PRU_CONNECT:
		error = rip_connect(inp, nam);
		if (error)
			break;
		soisconnected(so);
		break;

	case PRU_CONNECT2:
		error = EOPNOTSUPP;
		break;

	case PRU_DISCONNECT:
		soisdisconnected(so);
		rip_disconnect(inp);
		break;

	/*
	 * Mark the connection as being incapable of further input.
	 */
	case PRU_SHUTDOWN:
		socantsendmore(so);
		break;

	case PRU_RCVD:
		error = EOPNOTSUPP;
		break;

	/*
	 * Ship a packet out.  The appropriate raw output
	 * routine handles any massaging necessary.
	 */
	case PRU_SEND:
		if (control && control->m_len) {
			m_freem(control);
			m_freem(m);
			error = EINVAL;
			break;
		}
		if ((so->so_state & SS_ISCONNECTED) != 0) {
			error = nam ? EISCONN : ENOTCONN;
			m_freem(m);
			break;
		}
		if (nam && (error = rip_connect(inp, nam)) != 0) {
			m_freem(m);
			break;
		}
		error = rip_output(m, inp);
		if (nam) {
			rip_disconnect(inp);
		}
		break;

	case PRU_SENSE:
		/*
		 * Stat: do not bother with a blocksize.
		 */
		return 0;

	case PRU_RCVOOB:
		error = EOPNOTSUPP;
		break;

	case PRU_SENDOOB:
		m_freem(control);
		m_freem(m);
		error = EOPNOTSUPP;
		break;

	case PRU_SOCKADDR:
		inpcb_fetch_sockaddr(inp, nam);
		break;

	case PRU_PEERADDR:
		inpcb_fetch_peeraddr(inp, nam);
		break;

	default:
		KASSERT(false);
	}

	return error;
}

static void
sysctl_net_inet_raw_setup(struct sysctllog **clog)
{
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "net", NULL,
		       NULL, 0, NULL, 0,
		       CTL_NET, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "inet", NULL,
		       NULL, 0, NULL, 0,
		       CTL_NET, PF_INET, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "raw",
		       SYSCTL_DESCR("Raw IPv4 settings"),
		       NULL, 0, NULL, 0,
		       CTL_NET, PF_INET, IPPROTO_RAW, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRUCT, "pcblist",
		       SYSCTL_DESCR("Raw IPv4 control block list"),
		       sysctl_inpcblist, 0, rawcbtable, 0,
		       CTL_NET, PF_INET, IPPROTO_RAW,
		       CTL_CREATE, CTL_EOL);
}