Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. =================================================================== RCS file: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v rcsdiff: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v: warning: Unknown phrases like `commitid ...;' are present. retrieving revision 1.96 retrieving revision 1.205.6.2 diff -u -p -r1.96 -r1.205.6.2 --- src/sys/kern/uipc_socket.c 2004/03/21 00:54:46 1.96 +++ src/sys/kern/uipc_socket.c 2012/04/05 21:33:40 1.205.6.2 @@ -1,11 +1,11 @@ -/* $NetBSD: uipc_socket.c,v 1.96 2004/03/21 00:54:46 mycroft Exp $ */ +/* $NetBSD: uipc_socket.c,v 1.205.6.2 2012/04/05 21:33:40 mrg Exp $ */ /*- - * Copyright (c) 2002 The NetBSD Foundation, Inc. + * Copyright (c) 2002, 2007, 2008, 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation - * by Jason R. Thorpe of Wasabi Systems, Inc. + * by Jason R. Thorpe of Wasabi Systems, Inc, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -15,13 +15,6 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the NetBSD - * Foundation, Inc. and its contributors. - * 4. Neither the name of The NetBSD Foundation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED @@ -37,6 +30,8 @@ */ /* + * Copyright (c) 2004 The FreeBSD Foundation + * Copyright (c) 2004 Robert Watson * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * @@ -68,18 +63,21 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.96 2004/03/21 00:54:46 mycroft Exp $"); +__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.205.6.2 2012/04/05 21:33:40 mrg Exp $"); +#include "opt_compat_netbsd.h" #include "opt_sock_counters.h" #include "opt_sosend_loan.h" #include "opt_mbuftrace.h" #include "opt_somaxkva.h" +#include "opt_multiprocessor.h" /* XXX */ #include #include #include #include -#include +#include +#include #include #include #include @@ -88,128 +86,110 @@ __KERNEL_RCSID(0, "$NetBSD: uipc_socket. #include #include #include -#include +#include #include #include +#include +#include +#include +#include + +#ifdef COMPAT_50 +#include +#include +#endif -#include - -struct pool socket_pool; +#include +#include +#include MALLOC_DEFINE(M_SOOPTS, "soopts", "socket options"); MALLOC_DEFINE(M_SONAME, "soname", "socket name"); +extern const struct fileops socketops; + extern int somaxconn; /* patchable (XXX sysctl) */ int somaxconn = SOMAXCONN; +kmutex_t *softnet_lock; #ifdef SOSEND_COUNTERS #include -struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, +static struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "sosend", "loan big"); -struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, +static struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "sosend", "copy big"); -struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, +static struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "sosend", "copy small"); -struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, +static struct evcnt sosend_kvalimit = 
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "sosend", "kva limit"); #define SOSEND_COUNTER_INCR(ev) (ev)->ev_count++ +EVCNT_ATTACH_STATIC(sosend_loan_big); +EVCNT_ATTACH_STATIC(sosend_copy_big); +EVCNT_ATTACH_STATIC(sosend_copy_small); +EVCNT_ATTACH_STATIC(sosend_kvalimit); #else #define SOSEND_COUNTER_INCR(ev) /* nothing */ #endif /* SOSEND_COUNTERS */ -void -soinit(void) -{ - - /* Set the initial adjusted socket buffer size. */ - if (sb_max_set(sb_max)) - panic("bad initial sb_max value: %lu\n", sb_max); - - pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0, - "sockpl", NULL); - -#ifdef SOSEND_COUNTERS - evcnt_attach_static(&sosend_loan_big); - evcnt_attach_static(&sosend_copy_big); - evcnt_attach_static(&sosend_copy_small); - evcnt_attach_static(&sosend_kvalimit); -#endif /* SOSEND_COUNTERS */ -} - -#ifdef SOSEND_NO_LOAN -int use_sosend_loan = 0; +#if defined(SOSEND_NO_LOAN) || defined(MULTIPROCESSOR) +int sock_loan_thresh = -1; #else -int use_sosend_loan = 1; +int sock_loan_thresh = 4096; #endif -struct simplelock so_pendfree_slock = SIMPLELOCK_INITIALIZER; -struct mbuf *so_pendfree; +static kmutex_t so_pendfree_lock; +static struct mbuf *so_pendfree = NULL; #ifndef SOMAXKVA #define SOMAXKVA (16 * 1024 * 1024) #endif int somaxkva = SOMAXKVA; -int socurkva; -int sokvawaiters; +static int socurkva; +static kcondvar_t socurkva_cv; + +static kauth_listener_t socket_listener; -#define SOCK_LOAN_THRESH 4096 #define SOCK_LOAN_CHUNK 65536 -static size_t sodopendfree(struct socket *); -static size_t sodopendfreel(struct socket *); -static __inline void sokvareserve(struct socket *, vsize_t); -static __inline void sokvaunreserve(vsize_t); +static void sopendfree_thread(void *); +static kcondvar_t pendfree_thread_cv; +static lwp_t *sopendfree_lwp; -static __inline void +static void sysctl_kern_somaxkva_setup(void); +static struct sysctllog *socket_sysctllog; + +static vsize_t sokvareserve(struct socket *so, vsize_t len) { - int s; + int error; - s = splvm(); - 
simple_lock(&so_pendfree_slock); + mutex_enter(&so_pendfree_lock); while (socurkva + len > somaxkva) { - size_t freed; - - /* - * try to do pendfree. - */ - - freed = sodopendfreel(so); - - /* - * if some kva was freed, try again. - */ - - if (freed) - continue; - SOSEND_COUNTER_INCR(&sosend_kvalimit); - sokvawaiters++; - (void) ltsleep(&socurkva, PVM, "sokva", 0, &so_pendfree_slock); - sokvawaiters--; + error = cv_wait_sig(&socurkva_cv, &so_pendfree_lock); + if (error) { + len = 0; + break; + } } socurkva += len; - simple_unlock(&so_pendfree_slock); - splx(s); + mutex_exit(&so_pendfree_lock); + return len; } -static __inline void +static void sokvaunreserve(vsize_t len) { - int s; - s = splvm(); - simple_lock(&so_pendfree_slock); + mutex_enter(&so_pendfree_lock); socurkva -= len; - if (sokvawaiters) - wakeup(&socurkva); - simple_unlock(&so_pendfree_slock); - splx(s); + cv_broadcast(&socurkva_cv); + mutex_exit(&so_pendfree_lock); } /* @@ -217,7 +197,7 @@ sokvaunreserve(vsize_t len) */ vaddr_t -sokvaalloc(vsize_t len, struct socket *so) +sokvaalloc(vaddr_t sva, vsize_t len, struct socket *so) { vaddr_t lva; @@ -225,13 +205,15 @@ sokvaalloc(vsize_t len, struct socket *s * reserve kva. */ - sokvareserve(so, len); + if (sokvareserve(so, len) == 0) + return 0; /* * allocate kva. */ - lva = uvm_km_valloc_wait(kernel_map, len); + lva = uvm_km_alloc(kernel_map, len, atop(sva) & uvmexp.colormask, + UVM_KMF_COLORMATCH | UVM_KMF_VAONLY | UVM_KMF_WAITVA); if (lva == 0) { sokvaunreserve(len); return (0); @@ -252,7 +234,7 @@ sokvafree(vaddr_t sva, vsize_t len) * free kva. */ - uvm_km_free(kernel_map, sva, len); + uvm_km_free(kernel_map, sva, len, UVM_KMF_VAONLY); /* * unreserve kva. 
@@ -262,106 +244,71 @@ sokvafree(vaddr_t sva, vsize_t len) } static void -sodoloanfree(struct vm_page **pgs, caddr_t buf, size_t size) +sodoloanfree(struct vm_page **pgs, void *buf, size_t size) { - vaddr_t va, sva, eva; + vaddr_t sva, eva; vsize_t len; - paddr_t pa; - int i, npgs; + int npgs; + + KASSERT(pgs != NULL); eva = round_page((vaddr_t) buf + size); sva = trunc_page((vaddr_t) buf); len = eva - sva; npgs = len >> PAGE_SHIFT; - if (__predict_false(pgs == NULL)) { - pgs = alloca(npgs * sizeof(*pgs)); - - for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) { - if (pmap_extract(pmap_kernel(), va, &pa) == FALSE) - panic("sodoloanfree: va 0x%lx not mapped", va); - pgs[i] = PHYS_TO_VM_PAGE(pa); - } - } - pmap_kremove(sva, len); pmap_update(pmap_kernel()); uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE); sokvafree(sva, len); } -static size_t -sodopendfree(struct socket *so) -{ - int s; - size_t rv; - - s = splvm(); - simple_lock(&so_pendfree_slock); - rv = sodopendfreel(so); - simple_unlock(&so_pendfree_slock); - splx(s); - - return rv; -} - /* - * sodopendfreel: free mbufs on "pendfree" list. - * unlock and relock so_pendfree_slock when freeing mbufs. - * - * => called with so_pendfree_slock held. - * => called at splvm. + * sopendfree_thread: free mbufs on "pendfree" list. + * unlock and relock so_pendfree_lock when freeing mbufs. 
*/ -static size_t -sodopendfreel(struct socket *so) +static void +sopendfree_thread(void *v) { - size_t rv = 0; + struct mbuf *m, *next; + size_t rv; - LOCK_ASSERT(simple_lock_held(&so_pendfree_slock)); + mutex_enter(&so_pendfree_lock); for (;;) { - struct mbuf *m; - struct mbuf *next; - - m = so_pendfree; - if (m == NULL) - break; - so_pendfree = NULL; - simple_unlock(&so_pendfree_slock); - /* XXX splx */ - - for (; m != NULL; m = next) { - next = m->m_next; + rv = 0; + while (so_pendfree != NULL) { + m = so_pendfree; + so_pendfree = NULL; + mutex_exit(&so_pendfree_lock); + + for (; m != NULL; m = next) { + next = m->m_next; + KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) == 0); + KASSERT(m->m_ext.ext_refcnt == 0); + + rv += m->m_ext.ext_size; + sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf, + m->m_ext.ext_size); + pool_cache_put(mb_cache, m); + } - rv += m->m_ext.ext_size; - sodoloanfree((m->m_flags & M_EXT_PAGES) ? - m->m_ext.ext_pgs : NULL, m->m_ext.ext_buf, - m->m_ext.ext_size); - pool_cache_put(&mbpool_cache, m); + mutex_enter(&so_pendfree_lock); } - - /* XXX splvm */ - simple_lock(&so_pendfree_slock); + if (rv) + cv_broadcast(&socurkva_cv); + cv_wait(&pendfree_thread_cv, &so_pendfree_lock); } - - return (rv); + panic("sopendfree_thread"); + /* NOTREACHED */ } void -soloanfree(struct mbuf *m, caddr_t buf, size_t size, void *arg) +soloanfree(struct mbuf *m, void *buf, size_t size, void *arg) { - int s; - - if (m == NULL) { - /* - * called from MEXTREMOVE. - */ - - sodoloanfree(NULL, buf, size); - return; - } + KASSERT(m != NULL); /* * postpone freeing mbuf. @@ -370,14 +317,11 @@ soloanfree(struct mbuf *m, caddr_t buf, * because we need to put kva back to kernel_map. 
*/ - s = splvm(); - simple_lock(&so_pendfree_slock); + mutex_enter(&so_pendfree_lock); m->m_next = so_pendfree; so_pendfree = m; - if (sokvawaiters) - wakeup(&socurkva); - simple_unlock(&so_pendfree_slock); - splx(s); + cv_signal(&pendfree_thread_cv); + mutex_exit(&so_pendfree_lock); } static long @@ -386,10 +330,12 @@ sosend_loan(struct socket *so, struct ui struct iovec *iov = uio->uio_iov; vaddr_t sva, eva; vsize_t len; - vaddr_t lva, va; - int npgs, i, error; + vaddr_t lva; + int npgs, error; + vaddr_t va; + int i; - if (uio->uio_segflg != UIO_USERSPACE) + if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) return (0); if (iov->iov_len < (size_t) space) @@ -402,14 +348,13 @@ sosend_loan(struct socket *so, struct ui len = eva - sva; npgs = len >> PAGE_SHIFT; - /* XXX KDASSERT */ KASSERT(npgs <= M_EXT_MAXPAGES); - lva = sokvaalloc(len, so); + lva = sokvaalloc(sva, len, so); if (lva == 0) return 0; - error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, sva, len, + error = uvm_loan(&uio->uio_vmspace->vm_map, sva, len, m->m_ext.ext_pgs, UVM_LOAN_TOPAGE); if (error) { sokvafree(lva, len); @@ -418,17 +363,17 @@ sosend_loan(struct socket *so, struct ui for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE) pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]), - VM_PROT_READ); + VM_PROT_READ, 0); pmap_update(pmap_kernel()); lva += (vaddr_t) iov->iov_base & PAGE_MASK; - MEXTADD(m, (caddr_t) lva, space, M_MBUF, soloanfree, so); + MEXTADD(m, (void *) lva, space, M_MBUF, soloanfree, so); m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP; uio->uio_resid -= space; /* uio_offset not updated, not set/used for write(2) */ - uio->uio_iov->iov_base = (caddr_t) uio->uio_iov->iov_base + space; + uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + space; uio->uio_iov->iov_len -= space; if (uio->uio_iov->iov_len == 0) { uio->uio_iov++; @@ -438,6 +383,101 @@ sosend_loan(struct socket *so, struct ui return (space); } +struct mbuf * +getsombuf(struct socket *so, int type) +{ + struct mbuf *m; + + m 
= m_get(M_WAIT, type); + MCLAIM(m, so->so_mowner); + return m; +} + +static int +socket_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, + void *arg0, void *arg1, void *arg2, void *arg3) +{ + int result; + enum kauth_network_req req; + + result = KAUTH_RESULT_DEFER; + req = (enum kauth_network_req)arg0; + + if ((action != KAUTH_NETWORK_SOCKET) && + (action != KAUTH_NETWORK_BIND)) + return result; + + switch (req) { + case KAUTH_REQ_NETWORK_BIND_PORT: + result = KAUTH_RESULT_ALLOW; + break; + + case KAUTH_REQ_NETWORK_SOCKET_DROP: { + /* Normal users can only drop their own connections. */ + struct socket *so = (struct socket *)arg1; + + if (proc_uidmatch(cred, so->so_cred)) + result = KAUTH_RESULT_ALLOW; + + break; + } + + case KAUTH_REQ_NETWORK_SOCKET_OPEN: + /* We allow "raw" routing/bluetooth sockets to anyone. */ + if ((u_long)arg1 == PF_ROUTE || (u_long)arg1 == PF_OROUTE + || (u_long)arg1 == PF_BLUETOOTH) { + result = KAUTH_RESULT_ALLOW; + } else { + /* Privileged, let secmodel handle this. */ + if ((u_long)arg2 == SOCK_RAW) + break; + } + + result = KAUTH_RESULT_ALLOW; + + break; + + case KAUTH_REQ_NETWORK_SOCKET_CANSEE: + result = KAUTH_RESULT_ALLOW; + + break; + + default: + break; + } + + return result; +} + +void +soinit(void) +{ + + sysctl_kern_somaxkva_setup(); + + mutex_init(&so_pendfree_lock, MUTEX_DEFAULT, IPL_VM); + softnet_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); + cv_init(&socurkva_cv, "sokva"); + cv_init(&pendfree_thread_cv, "sopendfr"); + soinit2(); + + /* Set the initial adjusted socket buffer size. */ + if (sb_max_set(sb_max)) + panic("bad initial sb_max value: %lu", sb_max); + + socket_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK, + socket_listener_cb, NULL); +} + +void +soinit1(void) +{ + int error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, + sopendfree_thread, NULL, &sopendfree_lwp, "sopendfree"); + if (error) + panic("soinit1 %d", error); +} + /* * Socket operation routines. 
* These routines are called by the routines in @@ -447,27 +487,40 @@ sosend_loan(struct socket *so, struct ui */ /*ARGSUSED*/ int -socreate(int dom, struct socket **aso, int type, int proto) +socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l, + struct socket *lockso) { - struct proc *p; - struct protosw *prp; + const struct protosw *prp; struct socket *so; - int error, s; + uid_t uid; + int error; + kmutex_t *lock; + + error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET, + KAUTH_REQ_NETWORK_SOCKET_OPEN, KAUTH_ARG(dom), KAUTH_ARG(type), + KAUTH_ARG(proto)); + if (error != 0) + return error; - p = curproc; /* XXX */ if (proto) prp = pffindproto(dom, proto, type); else prp = pffindtype(dom, type); - if (prp == 0 || prp->pr_usrreq == 0) - return (EPROTONOSUPPORT); + if (prp == NULL) { + /* no support for domain */ + if (pffinddomain(dom) == 0) + return EAFNOSUPPORT; + /* no support for socket type */ + if (proto == 0 && type != 0) + return EPROTOTYPE; + return EPROTONOSUPPORT; + } + if (prp->pr_usrreq == NULL) + return EPROTONOSUPPORT; if (prp->pr_type != type) - return (EPROTOTYPE); - s = splsoftnet(); - so = pool_get(&socket_pool, PR_WAITOK); - memset((caddr_t)so, 0, sizeof(*so)); - TAILQ_INIT(&so->so_q0); - TAILQ_INIT(&so->so_q); + return EPROTOTYPE; + + so = soget(true); so->so_type = type; so->so_proto = prp; so->so_send = sosend; @@ -477,72 +530,155 @@ socreate(int dom, struct socket **aso, i so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner; so->so_mowner = &prp->pr_domain->dom_mowner; #endif - if (p != 0) - so->so_uid = p->p_ucred->cr_uid; - error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0, - (struct mbuf *)(long)proto, (struct mbuf *)0, p); - if (error) { + uid = kauth_cred_geteuid(l->l_cred); + so->so_uidinfo = uid_find(uid); + so->so_cpid = l->l_proc->p_pid; + if (lockso != NULL) { + /* Caller wants us to share a lock. 
*/ + lock = lockso->so_lock; + so->so_lock = lock; + mutex_obj_hold(lock); + mutex_enter(lock); + } else { + /* Lock assigned and taken during PRU_ATTACH. */ + } + error = (*prp->pr_usrreq)(so, PRU_ATTACH, NULL, + (struct mbuf *)(long)proto, NULL, l); + KASSERT(solocked(so)); + if (error != 0) { so->so_state |= SS_NOFDREF; sofree(so); - splx(s); - return (error); + return error; } - splx(s); + so->so_cred = kauth_cred_dup(l->l_cred); + sounlock(so); *aso = so; - return (0); + return 0; } +/* On success, write file descriptor to fdout and return zero. On + * failure, return non-zero; *fdout will be undefined. + */ int -sobind(struct socket *so, struct mbuf *nam, struct proc *p) +fsocreate(int domain, struct socket **sop, int type, int protocol, + struct lwp *l, int *fdout) { - int s, error; + struct socket *so; + struct file *fp; + int fd, error; + int flags = type & SOCK_FLAGS_MASK; + + type &= ~SOCK_FLAGS_MASK; + if ((error = fd_allocfile(&fp, &fd)) != 0) + return error; + fd_set_exclose(l, fd, (flags & SOCK_CLOEXEC) != 0); + fp->f_flag = FREAD|FWRITE|((flags & SOCK_NONBLOCK) ? FNONBLOCK : 0)| + ((flags & SOCK_NOSIGPIPE) ? 
FNOSIGPIPE : 0); + fp->f_type = DTYPE_SOCKET; + fp->f_ops = &socketops; + error = socreate(domain, &so, type, protocol, l, NULL); + if (error != 0) { + fd_abort(curproc, fp, fd); + } else { + if (sop != NULL) + *sop = so; + fp->f_data = so; + fd_affix(curproc, fp, fd); + *fdout = fd; + } + return error; +} - s = splsoftnet(); - error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, (struct mbuf *)0, - nam, (struct mbuf *)0, p); - splx(s); - return (error); +int +sofamily(const struct socket *so) +{ + const struct protosw *pr; + const struct domain *dom; + + if ((pr = so->so_proto) == NULL) + return AF_UNSPEC; + if ((dom = pr->pr_domain) == NULL) + return AF_UNSPEC; + return dom->dom_family; } int -solisten(struct socket *so, int backlog) +sobind(struct socket *so, struct mbuf *nam, struct lwp *l) { - int s, error; + int error; - s = splsoftnet(); - error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, (struct mbuf *)0, - (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); - if (error) { - splx(s); - return (error); + solock(so); + error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL, l); + sounlock(so); + return error; +} + +int +solisten(struct socket *so, int backlog, struct lwp *l) +{ + int error; + + solock(so); + if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | + SS_ISDISCONNECTING)) != 0) { + sounlock(so); + return (EINVAL); + } + error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, + NULL, NULL, l); + if (error != 0) { + sounlock(so); + return error; } if (TAILQ_EMPTY(&so->so_q)) so->so_options |= SO_ACCEPTCONN; if (backlog < 0) backlog = 0; so->so_qlimit = min(backlog, somaxconn); - splx(s); - return (0); + sounlock(so); + return 0; } void sofree(struct socket *so) { + u_int refs; - if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) + KASSERT(solocked(so)); + + if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) { + sounlock(so); return; + } if (so->so_head) { /* * We must not decommission a socket that's on the accept(2) * queue. 
If we do, then accept(2) may hang after select(2) * indicated that the listening socket was ready. */ - if (!soqremque(so, 0)) + if (!soqremque(so, 0)) { + sounlock(so); return; + } } - sbrelease(&so->so_snd); + if (so->so_rcv.sb_hiwat) + (void)chgsbsize(so->so_uidinfo, &so->so_rcv.sb_hiwat, 0, + RLIM_INFINITY); + if (so->so_snd.sb_hiwat) + (void)chgsbsize(so->so_uidinfo, &so->so_snd.sb_hiwat, 0, + RLIM_INFINITY); + sbrelease(&so->so_snd, so); + KASSERT(!cv_has_waiters(&so->so_cv)); + KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv)); + KASSERT(!cv_has_waiters(&so->so_snd.sb_cv)); sorflush(so); - pool_put(&socket_pool, so); + refs = so->so_aborting; /* XXX */ + /* Remove acccept filter if one is present. */ + if (so->so_accf != NULL) + (void)accept_filt_clear(so); + sounlock(so); + if (refs == 0) /* XXX */ + soput(so); } /* @@ -554,18 +690,30 @@ int soclose(struct socket *so) { struct socket *so2; - int s, error; + int error; + int error2; error = 0; - s = splsoftnet(); /* conservative */ + solock(so); if (so->so_options & SO_ACCEPTCONN) { - while ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) { - (void) soqremque(so2, 0); - (void) soabort(so2); - } - while ((so2 = TAILQ_FIRST(&so->so_q)) != 0) { - (void) soqremque(so2, 1); - (void) soabort(so2); + for (;;) { + if ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) { + KASSERT(solocked2(so, so2)); + (void) soqremque(so2, 0); + /* soabort drops the lock. */ + (void) soabort(so2); + solock(so); + continue; + } + if ((so2 = TAILQ_FIRST(&so->so_q)) != 0) { + KASSERT(solocked2(so, so2)); + (void) soqremque(so2, 1); + /* soabort drops the lock. 
*/ + (void) soabort(so2); + solock(so); + continue; + } + break; } } if (so->so_pcb == 0) @@ -577,13 +725,11 @@ soclose(struct socket *so) goto drop; } if (so->so_options & SO_LINGER) { - if ((so->so_state & SS_ISDISCONNECTING) && - (so->so_state & SS_NBIO)) + if ((so->so_state & (SS_ISDISCONNECTING|SS_NBIO)) == + (SS_ISDISCONNECTING|SS_NBIO)) goto drop; while (so->so_state & SS_ISCONNECTED) { - error = tsleep((caddr_t)&so->so_timeo, - PSOCK | PCATCH, netcls, - so->so_linger * hz); + error = sowait(so, true, so->so_linger * hz); if (error) break; } @@ -591,63 +737,74 @@ soclose(struct socket *so) } drop: if (so->so_pcb) { - int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0, - (struct proc *)0); + error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH, + NULL, NULL, NULL, NULL); if (error == 0) error = error2; } discard: if (so->so_state & SS_NOFDREF) panic("soclose: NOFDREF"); + kauth_cred_free(so->so_cred); so->so_state |= SS_NOFDREF; sofree(so); - splx(s); return (error); } /* - * Must be called at splsoftnet... + * Must be called with the socket locked.. Will return with it unlocked. 
*/ int soabort(struct socket *so) { - - return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, (struct mbuf *)0, - (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); + u_int refs; + int error; + + KASSERT(solocked(so)); + KASSERT(so->so_head == NULL); + + so->so_aborting++; /* XXX */ + error = (*so->so_proto->pr_usrreq)(so, PRU_ABORT, NULL, + NULL, NULL, NULL); + refs = --so->so_aborting; /* XXX */ + if (error || (refs == 0)) { + sofree(so); + } else { + sounlock(so); + } + return error; } int soaccept(struct socket *so, struct mbuf *nam) { - int s, error; + int error; + + KASSERT(solocked(so)); error = 0; - s = splsoftnet(); if ((so->so_state & SS_NOFDREF) == 0) panic("soaccept: !NOFDREF"); so->so_state &= ~SS_NOFDREF; if ((so->so_state & SS_ISDISCONNECTED) == 0 || (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0) error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, - (struct mbuf *)0, nam, (struct mbuf *)0, (struct proc *)0); + NULL, nam, NULL, NULL); else error = ECONNABORTED; - splx(s); return (error); } int -soconnect(struct socket *so, struct mbuf *nam) +soconnect(struct socket *so, struct mbuf *nam, struct lwp *l) { - struct proc *p; - int s, error; + int error; + + KASSERT(solocked(so)); - p = curproc; /* XXX */ if (so->so_options & SO_ACCEPTCONN) return (EOPNOTSUPP); - s = splsoftnet(); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. 
@@ -660,44 +817,37 @@ soconnect(struct socket *so, struct mbuf error = EISCONN; else error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT, - (struct mbuf *)0, nam, (struct mbuf *)0, p); - splx(s); + NULL, nam, NULL, l); return (error); } int soconnect2(struct socket *so1, struct socket *so2) { - int s, error; + int error; + + KASSERT(solocked2(so1, so2)); - s = splsoftnet(); error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, - (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0, - (struct proc *)0); - splx(s); + NULL, (struct mbuf *)so2, NULL, NULL); return (error); } int sodisconnect(struct socket *so) { - int s, error; + int error; + + KASSERT(solocked(so)); - s = splsoftnet(); if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; - goto bad; - } - if (so->so_state & SS_ISDISCONNECTING) { + } else if (so->so_state & SS_ISDISCONNECTING) { error = EALREADY; - goto bad; + } else { + error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, + NULL, NULL, NULL, NULL); } - error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0, - (struct proc *)0); - bad: - splx(s); - sodopendfree(so); return (error); } @@ -721,17 +871,24 @@ sodisconnect(struct socket *so) */ int sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top, - struct mbuf *control, int flags) + struct mbuf *control, int flags, struct lwp *l) { - struct proc *p; struct mbuf **mp, *m; + struct proc *p; long space, len, resid, clen, mlen; int error, s, dontroute, atomic; + short wakeup_state = 0; - sodopendfree(so); - - p = curproc; /* XXX */ + p = l->l_proc; clen = 0; + + /* + * solock() provides atomicity of access. splsoftnet() prevents + * protocol processing soft interrupts from interrupting us and + * blocking (expensive). 
+ */ + s = splsoftnet(); + solock(so); atomic = sosendallatonce(so) || top; if (uio) resid = uio->uio_resid; @@ -751,50 +908,60 @@ sosend(struct socket *so, struct mbuf *a dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); - p->p_stats->p_ru.ru_msgsnd++; + l->l_ru.ru_msgsnd++; if (control) clen = control->m_len; -#define snderr(errno) { error = errno; splx(s); goto release; } - restart: if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0) goto out; do { - s = splsoftnet(); - if (so->so_state & SS_CANTSENDMORE) - snderr(EPIPE); + if (so->so_state & SS_CANTSENDMORE) { + error = EPIPE; + goto release; + } if (so->so_error) { error = so->so_error; so->so_error = 0; - splx(s); goto release; } if ((so->so_state & SS_ISCONNECTED) == 0) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if ((so->so_state & SS_ISCONFIRMING) == 0 && - !(resid == 0 && clen != 0)) - snderr(ENOTCONN); - } else if (addr == 0) - snderr(EDESTADDRREQ); + !(resid == 0 && clen != 0)) { + error = ENOTCONN; + goto release; + } + } else if (addr == 0) { + error = EDESTADDRREQ; + goto release; + } } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat) || - clen > so->so_snd.sb_hiwat) - snderr(EMSGSIZE); + clen > so->so_snd.sb_hiwat) { + error = EMSGSIZE; + goto release; + } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { - if (so->so_state & SS_NBIO) - snderr(EWOULDBLOCK); + if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) { + error = EWOULDBLOCK; + goto release; + } sbunlock(&so->so_snd); + if (wakeup_state & SS_RESTARTSYS) { + error = ERESTART; + goto out; + } error = sbwait(&so->so_snd); - splx(s); if (error) goto out; + wakeup_state = so->so_state; goto restart; } - splx(s); + wakeup_state = 0; mp = ⊤ space -= clen; do { @@ -806,19 +973,21 @@ sosend(struct socket *so, struct mbuf *a if (flags & MSG_EOR) top->m_flags |= M_EOR; } else 
do { - if (top == 0) { + sounlock(so); + splx(s); + if (top == NULL) { m = m_gethdr(M_WAIT, MT_DATA); mlen = MHLEN; m->m_pkthdr.len = 0; - m->m_pkthdr.rcvif = (struct ifnet *)0; + m->m_pkthdr.rcvif = NULL; } else { m = m_get(M_WAIT, MT_DATA); mlen = MLEN; } MCLAIM(m, so->so_snd.sb_mowner); - if (use_sosend_loan && - uio->uio_iov->iov_len >= SOCK_LOAN_THRESH && - space >= SOCK_LOAN_THRESH && + if (sock_loan_thresh >= 0 && + uio->uio_iov->iov_len >= sock_loan_thresh && + space >= sock_loan_thresh && (len = sosend_loan(so, uio, m, space)) != 0) { SOSEND_COUNTER_INCR(&sosend_loan_big); @@ -827,7 +996,7 @@ sosend(struct socket *so, struct mbuf *a } if (resid >= MINCLSIZE && space >= MCLBYTES) { SOSEND_COUNTER_INCR(&sosend_copy_big); - m_clget(m, M_WAIT); + m_clget(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) goto nopages; mlen = MCLBYTES; @@ -850,14 +1019,15 @@ sosend(struct socket *so, struct mbuf *a if (atomic && top == 0 && len < mlen) MH_ALIGN(m, len); } - error = uiomove(mtod(m, caddr_t), (int)len, - uio); + error = uiomove(mtod(m, void *), (int)len, uio); have_data: resid = uio->uio_resid; m->m_len = len; *mp = m; top->m_pkthdr.len += len; - if (error) + s = splsoftnet(); + solock(so); + if (error != 0) goto release; mp = &m->m_next; if (resid <= 0) { @@ -866,30 +1036,27 @@ sosend(struct socket *so, struct mbuf *a break; } } while (space > 0 && atomic); - - s = splsoftnet(); - - if (so->so_state & SS_CANTSENDMORE) - snderr(EPIPE); + if (so->so_state & SS_CANTSENDMORE) { + error = EPIPE; + goto release; + } if (dontroute) so->so_options |= SO_DONTROUTE; if (resid > 0) so->so_state |= SS_MORETOCOME; error = (*so->so_proto->pr_usrreq)(so, (flags & MSG_OOB) ? 
PRU_SENDOOB : PRU_SEND, - top, addr, control, p); + top, addr, control, curlwp); if (dontroute) so->so_options &= ~SO_DONTROUTE; if (resid > 0) so->so_state &= ~SS_MORETOCOME; - splx(s); - clen = 0; - control = 0; - top = 0; + control = NULL; + top = NULL; mp = ⊤ - if (error) + if (error != 0) goto release; } while (resid && space > 0); } while (resid); @@ -897,6 +1064,8 @@ sosend(struct socket *so, struct mbuf *a release: sbunlock(&so->so_snd); out: + sounlock(so); + splx(s); if (top) m_freem(top); if (control) @@ -905,6 +1074,43 @@ sosend(struct socket *so, struct mbuf *a } /* + * Following replacement or removal of the first mbuf on the first + * mbuf chain of a socket buffer, push necessary state changes back + * into the socket buffer so that other consumers see the values + * consistently. 'nextrecord' is the callers locally stored value of + * the original value of sb->sb_mb->m_nextpkt which must be restored + * when the lead mbuf changes. NOTE: 'nextrecord' may be NULL. + */ +static void +sbsync(struct sockbuf *sb, struct mbuf *nextrecord) +{ + + KASSERT(solocked(sb->sb_so)); + + /* + * First, update for the new value of nextrecord. If necessary, + * make it the first record. + */ + if (sb->sb_mb != NULL) + sb->sb_mb->m_nextpkt = nextrecord; + else + sb->sb_mb = nextrecord; + + /* + * Now update any dependent socket buffer fields to reflect + * the new state. This is an inline of SB_EMPTY_FIXUP, with + * the addition of a second clause that takes care of the + * case where sb_mb has been updated, but remains the last + * record. + */ + if (sb->sb_mb == NULL) { + sb->sb_mbtail = NULL; + sb->sb_lastrecord = NULL; + } else if (sb->sb_mb->m_nextpkt == NULL) + sb->sb_lastrecord = sb->sb_mb; +} + +/* * Implement receive operations on a socket. * We depend on the way that records are added to the sockbuf * by sbappend*. 
In particular, each record (mbufs linked through m_next) @@ -924,55 +1130,68 @@ int soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { - struct mbuf *m, **mp; - int flags, len, error, s, offset, moff, type, orig_resid; - struct protosw *pr; + struct lwp *l = curlwp; + struct mbuf *m, **mp, *mt; + int atomic, flags, len, error, s, offset, moff, type, orig_resid; + const struct protosw *pr; struct mbuf *nextrecord; int mbuf_removed = 0; + const struct domain *dom; + short wakeup_state = 0; pr = so->so_proto; + atomic = pr->pr_flags & PR_ATOMIC; + dom = pr->pr_domain; mp = mp0; type = 0; orig_resid = uio->uio_resid; - if (paddr) - *paddr = 0; - if (controlp) - *controlp = 0; - if (flagsp) + + if (paddr != NULL) + *paddr = NULL; + if (controlp != NULL) + *controlp = NULL; + if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; - if ((flags & MSG_DONTWAIT) == 0) - sodopendfree(so); - if (flags & MSG_OOB) { m = m_get(M_WAIT, MT_DATA); + solock(so); error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m, - (struct mbuf *)(long)(flags & MSG_PEEK), (struct mbuf *)0, - (struct proc *)0); + (struct mbuf *)(long)(flags & MSG_PEEK), NULL, l); + sounlock(so); if (error) goto bad; do { - error = uiomove(mtod(m, caddr_t), + error = uiomove(mtod(m, void *), (int) min(uio->uio_resid, m->m_len), uio); m = m_free(m); - } while (uio->uio_resid && error == 0 && m); + } while (uio->uio_resid > 0 && error == 0 && m); bad: - if (m) + if (m != NULL) m_freem(m); - return (error); + return error; } - if (mp) - *mp = (struct mbuf *)0; + if (mp != NULL) + *mp = NULL; + + /* + * solock() provides atomicity of access. splsoftnet() prevents + * protocol processing soft interrupts from interrupting us and + * blocking (expensive). 
+ */ + s = splsoftnet(); + solock(so); if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) - (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, - (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); + (*pr->pr_usrreq)(so, PRU_RCVD, NULL, NULL, NULL, l); restart: - if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) - return (error); - s = splsoftnet(); + if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) { + sounlock(so); + splx(s); + return error; + } m = so->so_rcv.sb_mb; /* @@ -986,17 +1205,19 @@ soreceive(struct socket *so, struct mbuf * we have to do the receive in sections, and thus risk returning * a short count if a timeout or signal occurs after we start. */ - if (m == 0 || (((flags & MSG_DONTWAIT) == 0 && - so->so_rcv.sb_cc < uio->uio_resid) && - (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || - ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && - m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) { + if (m == NULL || + ((flags & MSG_DONTWAIT) == 0 && + so->so_rcv.sb_cc < uio->uio_resid && + (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || + ((flags & MSG_WAITALL) && + uio->uio_resid <= so->so_rcv.sb_hiwat)) && + m->m_nextpkt == NULL && !atomic)) { #ifdef DIAGNOSTIC - if (m == 0 && so->so_rcv.sb_cc) + if (m == NULL && so->so_rcv.sb_cc) panic("receive 1"); #endif if (so->so_error) { - if (m) + if (m != NULL) goto dontblock; error = so->so_error; if ((flags & MSG_PEEK) == 0) @@ -1004,12 +1225,12 @@ soreceive(struct socket *so, struct mbuf goto release; } if (so->so_state & SS_CANTRCVMORE) { - if (m) + if (m != NULL) goto dontblock; else goto release; } - for (; m; m = m->m_next) + for (; m != NULL; m = m->m_next) if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { m = so->so_rcv.sb_mb; goto dontblock; @@ -1021,29 +1242,45 @@ soreceive(struct socket *so, struct mbuf } if (uio->uio_resid == 0) goto release; - if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) { + if ((so->so_state & SS_NBIO) || + (flags & 
(MSG_DONTWAIT|MSG_NBIO))) { error = EWOULDBLOCK; goto release; } SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); sbunlock(&so->so_rcv); - error = sbwait(&so->so_rcv); - splx(s); - if (error) - return (error); + if (wakeup_state & SS_RESTARTSYS) + error = ERESTART; + else + error = sbwait(&so->so_rcv); + if (error != 0) { + sounlock(so); + splx(s); + return error; + } + wakeup_state = so->so_state; goto restart; } dontblock: /* * On entry here, m points to the first record of the socket buffer. - * While we process the initial mbufs containing address and control - * info, we save a copy of m->m_nextpkt into nextrecord. + * From this point onward, we maintain 'nextrecord' as a cache of the + * pointer to the next record in the socket buffer. We must keep the + * various socket buffer pointers and local stack versions of the + * pointers in sync, pushing out modifications before dropping the + * socket lock, and re-reading them when picking it up. + * + * Otherwise, we will race with the network stack appending new data + * or records onto the socket buffer by using inconsistent/stale + * versions of the field, possibly resulting in socket buffer + * corruption. + * + * By holding the high-level sblock(), we prevent simultaneous + * readers from pulling off the front of the socket buffer. 
*/ -#ifdef notyet /* XXXX */ - if (uio->uio_procp) - uio->uio_procp->p_stats->p_ru.ru_msgrcv++; -#endif + if (l != NULL) + l->l_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb); SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); @@ -1061,80 +1298,98 @@ soreceive(struct socket *so, struct mbuf } else { sbfree(&so->so_rcv, m); mbuf_removed = 1; - if (paddr) { + if (paddr != NULL) { *paddr = m; so->so_rcv.sb_mb = m->m_next; - m->m_next = 0; + m->m_next = NULL; m = so->so_rcv.sb_mb; } else { MFREE(m, so->so_rcv.sb_mb); m = so->so_rcv.sb_mb; } - } - } - while (m && m->m_type == MT_CONTROL && error == 0) { - if (flags & MSG_PEEK) { - if (controlp) - *controlp = m_copy(m, 0, m->m_len); - m = m->m_next; - } else { - sbfree(&so->so_rcv, m); - mbuf_removed = 1; - if (controlp) { - if (pr->pr_domain->dom_externalize && - mtod(m, struct cmsghdr *)->cmsg_type == - SCM_RIGHTS) - error = (*pr->pr_domain->dom_externalize)(m); - *controlp = m; - so->so_rcv.sb_mb = m->m_next; - m->m_next = 0; - m = so->so_rcv.sb_mb; - } else { - MFREE(m, so->so_rcv.sb_mb); - m = so->so_rcv.sb_mb; - } - } - if (controlp) { - orig_resid = 0; - controlp = &(*controlp)->m_next; + sbsync(&so->so_rcv, nextrecord); } } /* - * If m is non-NULL, we have some data to read. From now on, - * make sure to keep sb_lastrecord consistent when working on - * the last packet on the chain (nextrecord == NULL) and we - * change m->m_nextpkt. + * Process one or more MT_CONTROL mbufs present before any data mbufs + * in the first mbuf chain on the socket buffer. If MSG_PEEK, we + * just copy the data; if !MSG_PEEK, we call into the protocol to + * perform externalization (or freeing if controlp == NULL). */ - if (m) { - if ((flags & MSG_PEEK) == 0) { - m->m_nextpkt = nextrecord; - /* - * If nextrecord == NULL (this is a single chain), - * then sb_lastrecord may not be valid here if m - * was changed earlier. 
- */ - if (nextrecord == NULL) { - KASSERT(so->so_rcv.sb_mb == m); - so->so_rcv.sb_lastrecord = m; + if (__predict_false(m != NULL && m->m_type == MT_CONTROL)) { + struct mbuf *cm = NULL, *cmn; + struct mbuf **cme = &cm; + + do { + if (flags & MSG_PEEK) { + if (controlp != NULL) { + *controlp = m_copy(m, 0, m->m_len); + controlp = &(*controlp)->m_next; + } + m = m->m_next; + } else { + sbfree(&so->so_rcv, m); + so->so_rcv.sb_mb = m->m_next; + m->m_next = NULL; + *cme = m; + cme = &(*cme)->m_next; + m = so->so_rcv.sb_mb; + } + } while (m != NULL && m->m_type == MT_CONTROL); + if ((flags & MSG_PEEK) == 0) + sbsync(&so->so_rcv, nextrecord); + for (; cm != NULL; cm = cmn) { + cmn = cm->m_next; + cm->m_next = NULL; + type = mtod(cm, struct cmsghdr *)->cmsg_type; + if (controlp != NULL) { + if (dom->dom_externalize != NULL && + type == SCM_RIGHTS) { + sounlock(so); + splx(s); + error = (*dom->dom_externalize)(cm, l, + (flags & MSG_CMSG_CLOEXEC) ? + O_CLOEXEC : 0); + s = splsoftnet(); + solock(so); + } + *controlp = cm; + while (*controlp != NULL) + controlp = &(*controlp)->m_next; + } else { + /* + * Dispose of any SCM_RIGHTS message that went + * through the read path rather than recv. + */ + if (dom->dom_dispose != NULL && + type == SCM_RIGHTS) { + sounlock(so); + (*dom->dom_dispose)(cm); + solock(so); + } + m_freem(cm); } } + if (m != NULL) + nextrecord = so->so_rcv.sb_mb->m_nextpkt; + else + nextrecord = so->so_rcv.sb_mb; + orig_resid = 0; + } + + /* If m is non-NULL, we have some data to read. 
*/ + if (__predict_true(m != NULL)) { type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; - } else { - if ((flags & MSG_PEEK) == 0) { - KASSERT(so->so_rcv.sb_mb == m); - so->so_rcv.sb_mb = nextrecord; - SB_EMPTY_FIXUP(&so->so_rcv); - } } SBLASTRECORDCHK(&so->so_rcv, "soreceive 2"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 2"); moff = 0; offset = 0; - while (m && uio->uio_resid > 0 && error == 0) { + while (m != NULL && uio->uio_resid > 0 && error == 0) { if (m->m_type == MT_OOBDATA) { if (type != MT_OOBDATA) break; @@ -1145,6 +1400,7 @@ soreceive(struct socket *so, struct mbuf panic("receive 3"); #endif so->so_state &= ~SS_RCVATMARK; + wakeup_state = 0; len = uio->uio_resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; @@ -1158,13 +1414,15 @@ soreceive(struct socket *so, struct mbuf * we must note any additions to the sockbuf when we * block interrupts again. */ - if (mp == 0) { + if (mp == NULL) { SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); + sounlock(so); splx(s); - error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio); + error = uiomove(mtod(m, char *) + moff, (int)len, uio); s = splsoftnet(); - if (error) { + solock(so); + if (error != 0) { /* * If any part of the record has been removed * (such as the MT_SONAME mbuf, which will @@ -1176,8 +1434,7 @@ soreceive(struct socket *so, struct mbuf * This avoids a later panic("receive 1a") * when compiled with DIAGNOSTIC. 
*/ - if (m && mbuf_removed - && (pr->pr_flags & PR_ATOMIC)) + if (m && mbuf_removed && atomic) (void) sbdroprecord(&so->so_rcv); goto release; @@ -1197,7 +1454,7 @@ soreceive(struct socket *so, struct mbuf *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; - *mp = (struct mbuf *)0; + *mp = NULL; } else { MFREE(m, so->so_rcv.sb_mb); m = so->so_rcv.sb_mb; @@ -1218,16 +1475,21 @@ soreceive(struct socket *so, struct mbuf SBLASTRECORDCHK(&so->so_rcv, "soreceive 3"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 3"); } - } else { - if (flags & MSG_PEEK) - moff += len; - else { - if (mp) - *mp = m_copym(m, 0, len, M_WAIT); - m->m_data += len; - m->m_len -= len; - so->so_rcv.sb_cc -= len; + } else if (flags & MSG_PEEK) + moff += len; + else { + if (mp != NULL) { + mt = m_copym(m, 0, len, M_NOWAIT); + if (__predict_false(mt == NULL)) { + sounlock(so); + mt = m_copym(m, 0, len, M_WAIT); + solock(so); + } + *mp = mt; } + m->m_data += len; + m->m_len -= len; + so->so_rcv.sb_cc -= len; } if (so->so_oobmark) { if ((flags & MSG_PEEK) == 0) { @@ -1251,7 +1513,7 @@ soreceive(struct socket *so, struct mbuf * with a short count but without error. * Keep sockbuf locked against other readers. 
*/ - while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 && + while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && !nextrecord) { if (so->so_error || so->so_state & SS_CANTRCVMORE) break; @@ -1268,30 +1530,32 @@ soreceive(struct socket *so, struct mbuf */ if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb) (*pr->pr_usrreq)(so, PRU_RCVD, - (struct mbuf *)0, - (struct mbuf *)(long)flags, - (struct mbuf *)0, - (struct proc *)0); + NULL, (struct mbuf *)(long)flags, NULL, l); SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); - error = sbwait(&so->so_rcv); - if (error) { + if (wakeup_state & SS_RESTARTSYS) + error = ERESTART; + else + error = sbwait(&so->so_rcv); + if (error != 0) { sbunlock(&so->so_rcv); + sounlock(so); splx(s); - return (0); + return 0; } if ((m = so->so_rcv.sb_mb) != NULL) nextrecord = m->m_nextpkt; + wakeup_state = so->so_state; } } - if (m && pr->pr_flags & PR_ATOMIC) { + if (m && atomic) { flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) (void) sbdroprecord(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { - if (m == 0) { + if (m == NULL) { /* * First part is an inline SB_EMPTY_FIXUP(). 
Second * part makes sure sb_lastrecord is up-to-date if @@ -1307,55 +1571,79 @@ soreceive(struct socket *so, struct mbuf SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) - (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, - (struct mbuf *)(long)flags, (struct mbuf *)0, - (struct proc *)0); + (*pr->pr_usrreq)(so, PRU_RCVD, NULL, + (struct mbuf *)(long)flags, NULL, l); } if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { sbunlock(&so->so_rcv); - splx(s); goto restart; } - - if (flagsp) + + if (flagsp != NULL) *flagsp |= flags; release: sbunlock(&so->so_rcv); + sounlock(so); splx(s); - return (error); + return error; } int soshutdown(struct socket *so, int how) { - struct protosw *pr; + const struct protosw *pr; + int error; + + KASSERT(solocked(so)); pr = so->so_proto; if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) return (EINVAL); - if (how == SHUT_RD || how == SHUT_RDWR) + if (how == SHUT_RD || how == SHUT_RDWR) { sorflush(so); + error = 0; + } if (how == SHUT_WR || how == SHUT_RDWR) - return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, (struct mbuf *)0, - (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); - return (0); + error = (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL, + NULL, NULL, NULL); + + return error; +} + +void +sorestart(struct socket *so) +{ + /* + * An application has called close() on an fd on which another + * of its threads has called a socket system call. + * Mark this and wake everyone up, and code that would block again + * instead returns ERESTART. + * On system call re-entry the fd is validated and EBADF returned. + * Any other fd will block again on the 2nd syscall. 
+ */ + solock(so); + so->so_state |= SS_RESTARTSYS; + cv_broadcast(&so->so_cv); + cv_broadcast(&so->so_snd.sb_cv); + cv_broadcast(&so->so_rcv.sb_cv); + sounlock(so); } void sorflush(struct socket *so) { struct sockbuf *sb, asb; - struct protosw *pr; - int s; + const struct protosw *pr; + + KASSERT(solocked(so)); sb = &so->so_rcv; pr = so->so_proto; - sb->sb_flags |= SB_NOINTR; - (void) sblock(sb, M_WAITOK); - s = splnet(); socantrcvmore(so); + sb->sb_flags |= SB_NOINTR; + (void )sblock(sb, M_WAITOK); sbunlock(sb); asb = *sb; /* @@ -1364,242 +1652,519 @@ sorflush(struct socket *so) */ memset(&sb->sb_startzero, 0, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); - splx(s); - if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) + if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) { + sounlock(so); (*pr->pr_domain->dom_dispose)(asb.sb_mb); - sbrelease(&asb); + solock(so); + } + sbrelease(&asb, so); } -int -sosetopt(struct socket *so, int level, int optname, struct mbuf *m0) +/* + * internal set SOL_SOCKET options + */ +static int +sosetopt1(struct socket *so, const struct sockopt *sopt) { - int error; - struct mbuf *m; + int error = EINVAL, optval, opt; + struct linger l; + struct timeval tv; - error = 0; - m = m0; - if (level != SOL_SOCKET) { - if (so->so_proto && so->so_proto->pr_ctloutput) - return ((*so->so_proto->pr_ctloutput) - (PRCO_SETOPT, so, level, optname, &m0)); - error = ENOPROTOOPT; - } else { - switch (optname) { + switch ((opt = sopt->sopt_name)) { - case SO_LINGER: - if (m == NULL || m->m_len != sizeof(struct linger)) { - error = EINVAL; - goto bad; - } - so->so_linger = mtod(m, struct linger *)->l_linger; - /* fall thru... 
*/ + case SO_ACCEPTFILTER: + error = accept_filt_setopt(so, sopt); + KASSERT(solocked(so)); + break; - case SO_DEBUG: - case SO_KEEPALIVE: - case SO_DONTROUTE: - case SO_USELOOPBACK: - case SO_BROADCAST: - case SO_REUSEADDR: - case SO_REUSEPORT: - case SO_OOBINLINE: - case SO_TIMESTAMP: - if (m == NULL || m->m_len < sizeof(int)) { - error = EINVAL; - goto bad; - } - if (*mtod(m, int *)) - so->so_options |= optname; - else - so->so_options &= ~optname; + case SO_LINGER: + error = sockopt_get(sopt, &l, sizeof(l)); + solock(so); + if (error) + break; + if (l.l_linger < 0 || l.l_linger > USHRT_MAX || + l.l_linger > (INT_MAX / hz)) { + error = EDOM; break; + } + so->so_linger = l.l_linger; + if (l.l_onoff) + so->so_options |= SO_LINGER; + else + so->so_options &= ~SO_LINGER; + break; + + case SO_DEBUG: + case SO_KEEPALIVE: + case SO_DONTROUTE: + case SO_USELOOPBACK: + case SO_BROADCAST: + case SO_REUSEADDR: + case SO_REUSEPORT: + case SO_OOBINLINE: + case SO_TIMESTAMP: + case SO_NOSIGPIPE: +#ifdef SO_OTIMESTAMP + case SO_OTIMESTAMP: +#endif + error = sockopt_getint(sopt, &optval); + solock(so); + if (error) + break; + if (optval) + so->so_options |= opt; + else + so->so_options &= ~opt; + break; - case SO_SNDBUF: - case SO_RCVBUF: - case SO_SNDLOWAT: - case SO_RCVLOWAT: - { - int optval; - - if (m == NULL || m->m_len < sizeof(int)) { - error = EINVAL; - goto bad; - } - - /* - * Values < 1 make no sense for any of these - * options, so disallow them. - */ - optval = *mtod(m, int *); - if (optval < 1) { - error = EINVAL; - goto bad; - } + case SO_SNDBUF: + case SO_RCVBUF: + case SO_SNDLOWAT: + case SO_RCVLOWAT: + error = sockopt_getint(sopt, &optval); + solock(so); + if (error) + break; - switch (optname) { + /* + * Values < 1 make no sense for any of these + * options, so disallow them. + */ + if (optval < 1) { + error = EINVAL; + break; + } - case SO_SNDBUF: - case SO_RCVBUF: - if (sbreserve(optname == SO_SNDBUF ? 
- &so->so_snd : &so->so_rcv, - (u_long) optval) == 0) { - error = ENOBUFS; - goto bad; - } + switch (opt) { + case SO_SNDBUF: + if (sbreserve(&so->so_snd, (u_long)optval, so) == 0) { + error = ENOBUFS; break; + } + so->so_snd.sb_flags &= ~SB_AUTOSIZE; + break; - /* - * Make sure the low-water is never greater than - * the high-water. - */ - case SO_SNDLOWAT: - so->so_snd.sb_lowat = - (optval > so->so_snd.sb_hiwat) ? - so->so_snd.sb_hiwat : optval; - break; - case SO_RCVLOWAT: - so->so_rcv.sb_lowat = - (optval > so->so_rcv.sb_hiwat) ? - so->so_rcv.sb_hiwat : optval; + case SO_RCVBUF: + if (sbreserve(&so->so_rcv, (u_long)optval, so) == 0) { + error = ENOBUFS; break; } + so->so_rcv.sb_flags &= ~SB_AUTOSIZE; break; - } - case SO_SNDTIMEO: - case SO_RCVTIMEO: - { - struct timeval *tv; - short val; - - if (m == NULL || m->m_len < sizeof(*tv)) { - error = EINVAL; - goto bad; - } - tv = mtod(m, struct timeval *); - if (tv->tv_sec > (SHRT_MAX - tv->tv_usec / tick) / hz) { - error = EDOM; - goto bad; - } - val = tv->tv_sec * hz + tv->tv_usec / tick; - if (val == 0 && tv->tv_usec != 0) - val = 1; + /* + * Make sure the low-water is never greater than + * the high-water. 
+ */ + case SO_SNDLOWAT: + if (optval > so->so_snd.sb_hiwat) + optval = so->so_snd.sb_hiwat; - switch (optname) { + so->so_snd.sb_lowat = optval; + break; - case SO_SNDTIMEO: - so->so_snd.sb_timeo = val; - break; - case SO_RCVTIMEO: - so->so_rcv.sb_timeo = val; - break; - } + case SO_RCVLOWAT: + if (optval > so->so_rcv.sb_hiwat) + optval = so->so_rcv.sb_hiwat; + + so->so_rcv.sb_lowat = optval; break; - } + } + break; - default: - error = ENOPROTOOPT; +#ifdef COMPAT_50 + case SO_OSNDTIMEO: + case SO_ORCVTIMEO: { + struct timeval50 otv; + error = sockopt_get(sopt, &otv, sizeof(otv)); + if (error) { + solock(so); break; } - if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) { - (void) ((*so->so_proto->pr_ctloutput) - (PRCO_SETOPT, so, level, optname, &m0)); - m = NULL; /* freed by protocol */ + timeval50_to_timeval(&otv, &tv); + opt = opt == SO_OSNDTIMEO ? SO_SNDTIMEO : SO_RCVTIMEO; + error = 0; + /*FALLTHROUGH*/ + } +#endif /* COMPAT_50 */ + + case SO_SNDTIMEO: + case SO_RCVTIMEO: + if (error) + error = sockopt_get(sopt, &tv, sizeof(tv)); + solock(so); + if (error) + break; + + if (tv.tv_sec > (INT_MAX - tv.tv_usec / tick) / hz) { + error = EDOM; + break; + } + + optval = tv.tv_sec * hz + tv.tv_usec / tick; + if (optval == 0 && tv.tv_usec != 0) + optval = 1; + + switch (opt) { + case SO_SNDTIMEO: + so->so_snd.sb_timeo = optval; + break; + case SO_RCVTIMEO: + so->so_rcv.sb_timeo = optval; + break; } + break; + + default: + solock(so); + error = ENOPROTOOPT; + break; } - bad: - if (m) - (void) m_free(m); + KASSERT(solocked(so)); + return error; +} + +int +sosetopt(struct socket *so, struct sockopt *sopt) +{ + int error, prerr; + + if (sopt->sopt_level == SOL_SOCKET) { + error = sosetopt1(so, sopt); + KASSERT(solocked(so)); + } else { + error = ENOPROTOOPT; + solock(so); + } + + if ((error == 0 || error == ENOPROTOOPT) && + so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) { + /* give the protocol stack a shot */ + prerr = 
(*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, sopt); + if (prerr == 0) + error = 0; + else if (prerr != ENOPROTOOPT) + error = prerr; + } + sounlock(so); + return error; +} + +/* + * so_setsockopt() is a wrapper providing a sockopt structure for sosetopt() + */ +int +so_setsockopt(struct lwp *l, struct socket *so, int level, int name, + const void *val, size_t valsize) +{ + struct sockopt sopt; + int error; + + KASSERT(valsize == 0 || val != NULL); + + sockopt_init(&sopt, level, name, valsize); + sockopt_set(&sopt, val, valsize); + + error = sosetopt(so, &sopt); + + sockopt_destroy(&sopt); + + return error; +} + +/* + * internal get SOL_SOCKET options + */ +static int +sogetopt1(struct socket *so, struct sockopt *sopt) +{ + int error, optval, opt; + struct linger l; + struct timeval tv; + + switch ((opt = sopt->sopt_name)) { + + case SO_ACCEPTFILTER: + error = accept_filt_getopt(so, sopt); + break; + + case SO_LINGER: + l.l_onoff = (so->so_options & SO_LINGER) ? 1 : 0; + l.l_linger = so->so_linger; + + error = sockopt_set(sopt, &l, sizeof(l)); + break; + + case SO_USELOOPBACK: + case SO_DONTROUTE: + case SO_DEBUG: + case SO_KEEPALIVE: + case SO_REUSEADDR: + case SO_REUSEPORT: + case SO_BROADCAST: + case SO_OOBINLINE: + case SO_TIMESTAMP: + case SO_NOSIGPIPE: +#ifdef SO_OTIMESTAMP + case SO_OTIMESTAMP: +#endif + error = sockopt_setint(sopt, (so->so_options & opt) ? 
1 : 0); + break; + + case SO_TYPE: + error = sockopt_setint(sopt, so->so_type); + break; + + case SO_ERROR: + error = sockopt_setint(sopt, so->so_error); + so->so_error = 0; + break; + + case SO_SNDBUF: + error = sockopt_setint(sopt, so->so_snd.sb_hiwat); + break; + + case SO_RCVBUF: + error = sockopt_setint(sopt, so->so_rcv.sb_hiwat); + break; + + case SO_SNDLOWAT: + error = sockopt_setint(sopt, so->so_snd.sb_lowat); + break; + + case SO_RCVLOWAT: + error = sockopt_setint(sopt, so->so_rcv.sb_lowat); + break; + +#ifdef COMPAT_50 + case SO_OSNDTIMEO: + case SO_ORCVTIMEO: { + struct timeval50 otv; + + optval = (opt == SO_OSNDTIMEO ? + so->so_snd.sb_timeo : so->so_rcv.sb_timeo); + + otv.tv_sec = optval / hz; + otv.tv_usec = (optval % hz) * tick; + + error = sockopt_set(sopt, &otv, sizeof(otv)); + break; + } +#endif /* COMPAT_50 */ + + case SO_SNDTIMEO: + case SO_RCVTIMEO: + optval = (opt == SO_SNDTIMEO ? + so->so_snd.sb_timeo : so->so_rcv.sb_timeo); + + tv.tv_sec = optval / hz; + tv.tv_usec = (optval % hz) * tick; + + error = sockopt_set(sopt, &tv, sizeof(tv)); + break; + + case SO_OVERFLOWED: + error = sockopt_setint(sopt, so->so_rcv.sb_overflowed); + break; + + default: + error = ENOPROTOOPT; + break; + } + return (error); } int -sogetopt(struct socket *so, int level, int optname, struct mbuf **mp) +sogetopt(struct socket *so, struct sockopt *sopt) { - struct mbuf *m; + int error; - if (level != SOL_SOCKET) { + solock(so); + if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto && so->so_proto->pr_ctloutput) { - return ((*so->so_proto->pr_ctloutput) - (PRCO_GETOPT, so, level, optname, mp)); + error = ((*so->so_proto->pr_ctloutput) + (PRCO_GETOPT, so, sopt)); } else - return (ENOPROTOOPT); + error = (ENOPROTOOPT); } else { - m = m_get(M_WAIT, MT_SOOPTS); - m->m_len = sizeof(int); + error = sogetopt1(so, sopt); + } + sounlock(so); + return (error); +} + +/* + * alloc sockopt data buffer buffer + * - will be released at destroy + */ +static int +sockopt_alloc(struct 
sockopt *sopt, size_t len, km_flag_t kmflag) +{ - switch (optname) { + KASSERT(sopt->sopt_size == 0); - case SO_LINGER: - m->m_len = sizeof(struct linger); - mtod(m, struct linger *)->l_onoff = - so->so_options & SO_LINGER; - mtod(m, struct linger *)->l_linger = so->so_linger; - break; + if (len > sizeof(sopt->sopt_buf)) { + sopt->sopt_data = kmem_zalloc(len, kmflag); + if (sopt->sopt_data == NULL) + return ENOMEM; + } else + sopt->sopt_data = sopt->sopt_buf; - case SO_USELOOPBACK: - case SO_DONTROUTE: - case SO_DEBUG: - case SO_KEEPALIVE: - case SO_REUSEADDR: - case SO_REUSEPORT: - case SO_BROADCAST: - case SO_OOBINLINE: - case SO_TIMESTAMP: - *mtod(m, int *) = so->so_options & optname; - break; + sopt->sopt_size = len; + return 0; +} - case SO_TYPE: - *mtod(m, int *) = so->so_type; - break; +/* + * initialise sockopt storage + * - MAY sleep during allocation + */ +void +sockopt_init(struct sockopt *sopt, int level, int name, size_t size) +{ - case SO_ERROR: - *mtod(m, int *) = so->so_error; - so->so_error = 0; - break; + memset(sopt, 0, sizeof(*sopt)); - case SO_SNDBUF: - *mtod(m, int *) = so->so_snd.sb_hiwat; - break; + sopt->sopt_level = level; + sopt->sopt_name = name; + (void)sockopt_alloc(sopt, size, KM_SLEEP); +} - case SO_RCVBUF: - *mtod(m, int *) = so->so_rcv.sb_hiwat; - break; +/* + * destroy sockopt storage + * - will release any held memory references + */ +void +sockopt_destroy(struct sockopt *sopt) +{ - case SO_SNDLOWAT: - *mtod(m, int *) = so->so_snd.sb_lowat; - break; + if (sopt->sopt_data != sopt->sopt_buf) + kmem_free(sopt->sopt_data, sopt->sopt_size); - case SO_RCVLOWAT: - *mtod(m, int *) = so->so_rcv.sb_lowat; - break; + memset(sopt, 0, sizeof(*sopt)); +} - case SO_SNDTIMEO: - case SO_RCVTIMEO: - { - int val = (optname == SO_SNDTIMEO ? 
- so->so_snd.sb_timeo : so->so_rcv.sb_timeo); - - m->m_len = sizeof(struct timeval); - mtod(m, struct timeval *)->tv_sec = val / hz; - mtod(m, struct timeval *)->tv_usec = - (val % hz) * tick; - break; - } +/* + * set sockopt value + * - value is copied into sockopt + * - memory is allocated when necessary, will not sleep + */ +int +sockopt_set(struct sockopt *sopt, const void *buf, size_t len) +{ + int error; + + if (sopt->sopt_size == 0) { + error = sockopt_alloc(sopt, len, KM_NOSLEEP); + if (error) + return error; + } + + KASSERT(sopt->sopt_size == len); + memcpy(sopt->sopt_data, buf, len); + return 0; +} + +/* + * common case of set sockopt integer value + */ +int +sockopt_setint(struct sockopt *sopt, int val) +{ + + return sockopt_set(sopt, &val, sizeof(int)); +} + +/* + * get sockopt value + * - correct size must be given + */ +int +sockopt_get(const struct sockopt *sopt, void *buf, size_t len) +{ + + if (sopt->sopt_size != len) + return EINVAL; + + memcpy(buf, sopt->sopt_data, len); + return 0; +} + +/* + * common case of get sockopt integer value + */ +int +sockopt_getint(const struct sockopt *sopt, int *valp) +{ + + return sockopt_get(sopt, valp, sizeof(int)); +} + +/* + * set sockopt value from mbuf + * - ONLY for legacy code + * - mbuf is released by sockopt + * - will not sleep + */ +int +sockopt_setmbuf(struct sockopt *sopt, struct mbuf *m) +{ + size_t len; + int error; + + len = m_length(m); + + if (sopt->sopt_size == 0) { + error = sockopt_alloc(sopt, len, KM_NOSLEEP); + if (error) + return error; + } + + KASSERT(sopt->sopt_size == len); + m_copydata(m, 0, len, sopt->sopt_data); + m_freem(m); + + return 0; +} + +/* + * get sockopt value into mbuf + * - ONLY for legacy code + * - mbuf to be released by the caller + * - will not sleep + */ +struct mbuf * +sockopt_getmbuf(const struct sockopt *sopt) +{ + struct mbuf *m; + + if (sopt->sopt_size > MCLBYTES) + return NULL; + + m = m_get(M_DONTWAIT, MT_SOOPTS); + if (m == NULL) + return NULL; - default: - 
(void)m_free(m); - return (ENOPROTOOPT); + if (sopt->sopt_size > MLEN) { + MCLGET(m, M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + m_free(m); + return NULL; } - *mp = m; - return (0); } + + memcpy(mtod(m, void *), sopt->sopt_data, sopt->sopt_size); + m->m_len = sopt->sopt_size; + + return m; } void sohasoutofband(struct socket *so) { + fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so); - selwakeup(&so->so_rcv.sb_sel); + selnotify(&so->so_rcv.sb_sel, POLLPRI | POLLRDBAND, NOTE_SUBMIT); } static void @@ -1607,10 +2172,12 @@ filt_sordetach(struct knote *kn) { struct socket *so; - so = (struct socket *)kn->kn_fp->f_data; + so = ((file_t *)kn->kn_obj)->f_data; + solock(so); SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext); if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist)) so->so_rcv.sb_flags &= ~SB_KNOTE; + sounlock(so); } /*ARGSUSED*/ @@ -1618,19 +2185,25 @@ static int filt_soread(struct knote *kn, long hint) { struct socket *so; + int rv; - so = (struct socket *)kn->kn_fp->f_data; + so = ((file_t *)kn->kn_obj)->f_data; + if (hint != NOTE_SUBMIT) + solock(so); kn->kn_data = so->so_rcv.sb_cc; if (so->so_state & SS_CANTRCVMORE) { - kn->kn_flags |= EV_EOF; + kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; - return (1); - } - if (so->so_error) /* temporary udp error */ - return (1); - if (kn->kn_sfflags & NOTE_LOWAT) - return (kn->kn_data >= kn->kn_sdata); - return (kn->kn_data >= so->so_rcv.sb_lowat); + rv = 1; + } else if (so->so_error) /* temporary udp error */ + rv = 1; + else if (kn->kn_sfflags & NOTE_LOWAT) + rv = (kn->kn_data >= kn->kn_sdata); + else + rv = (kn->kn_data >= so->so_rcv.sb_lowat); + if (hint != NOTE_SUBMIT) + sounlock(so); + return rv; } static void @@ -1638,10 +2211,12 @@ filt_sowdetach(struct knote *kn) { struct socket *so; - so = (struct socket *)kn->kn_fp->f_data; + so = ((file_t *)kn->kn_obj)->f_data; + solock(so); SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext); if 
(SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist)) so->so_snd.sb_flags &= ~SB_KNOTE; + sounlock(so); } /*ARGSUSED*/ @@ -1649,22 +2224,28 @@ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so; + int rv; - so = (struct socket *)kn->kn_fp->f_data; + so = ((file_t *)kn->kn_obj)->f_data; + if (hint != NOTE_SUBMIT) + solock(so); kn->kn_data = sbspace(&so->so_snd); if (so->so_state & SS_CANTSENDMORE) { - kn->kn_flags |= EV_EOF; + kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; - return (1); - } - if (so->so_error) /* temporary udp error */ - return (1); - if (((so->so_state & SS_ISCONNECTED) == 0) && + rv = 1; + } else if (so->so_error) /* temporary udp error */ + rv = 1; + else if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) - return (0); - if (kn->kn_sfflags & NOTE_LOWAT) - return (kn->kn_data >= kn->kn_sdata); - return (kn->kn_data >= so->so_snd.sb_lowat); + rv = 0; + else if (kn->kn_sfflags & NOTE_LOWAT) + rv = (kn->kn_data >= kn->kn_sdata); + else + rv = (kn->kn_data >= so->so_snd.sb_lowat); + if (hint != NOTE_SUBMIT) + sounlock(so); + return rv; } /*ARGSUSED*/ @@ -1672,15 +2253,21 @@ static int filt_solisten(struct knote *kn, long hint) { struct socket *so; + int rv; - so = (struct socket *)kn->kn_fp->f_data; + so = ((file_t *)kn->kn_obj)->f_data; /* * Set kn_data to number of incoming connections, not * counting partial (incomplete) connections. 
- */ + */ + if (hint != NOTE_SUBMIT) + solock(so); kn->kn_data = so->so_qlen; - return (kn->kn_data > 0); + rv = (kn->kn_data > 0); + if (hint != NOTE_SUBMIT) + sounlock(so); + return rv; } static const struct filterops solisten_filtops = @@ -1696,7 +2283,8 @@ soo_kqfilter(struct file *fp, struct kno struct socket *so; struct sockbuf *sb; - so = (struct socket *)kn->kn_fp->f_data; + so = ((file_t *)kn->kn_obj)->f_data; + solock(so); switch (kn->kn_filter) { case EVFILT_READ: if (so->so_options & SO_ACCEPTCONN) @@ -1710,13 +2298,70 @@ soo_kqfilter(struct file *fp, struct kno sb = &so->so_snd; break; default: - return (1); + sounlock(so); + return (EINVAL); } SLIST_INSERT_HEAD(&sb->sb_sel.sel_klist, kn, kn_selnext); sb->sb_flags |= SB_KNOTE; + sounlock(so); return (0); } +static int +sodopoll(struct socket *so, int events) +{ + int revents; + + revents = 0; + + if (events & (POLLIN | POLLRDNORM)) + if (soreadable(so)) + revents |= events & (POLLIN | POLLRDNORM); + + if (events & (POLLOUT | POLLWRNORM)) + if (sowritable(so)) + revents |= events & (POLLOUT | POLLWRNORM); + + if (events & (POLLPRI | POLLRDBAND)) + if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) + revents |= events & (POLLPRI | POLLRDBAND); + + return revents; +} + +int +sopoll(struct socket *so, int events) +{ + int revents = 0; + +#ifndef DIAGNOSTIC + /* + * Do a quick, unlocked check in expectation that the socket + * will be ready for I/O. Don't do this check if DIAGNOSTIC, + * as the solocked() assertions will fail. 
+ */ + if ((revents = sodopoll(so, events)) != 0) + return revents; +#endif + + solock(so); + if ((revents = sodopoll(so, events)) == 0) { + if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { + selrecord(curlwp, &so->so_rcv.sb_sel); + so->so_rcv.sb_flags |= SB_NOTIFY; + } + + if (events & (POLLOUT | POLLWRNORM)) { + selrecord(curlwp, &so->so_snd.sb_sel); + so->so_snd.sb_flags |= SB_NOTIFY; + } + } + sounlock(so); + + return revents; +} + + #include <sys/sysctl.h> static int sysctl_kern_somaxkva(SYSCTLFN_PROTO); @@ -1731,7 +2376,6 @@ sysctl_kern_somaxkva(SYSCTLFN_ARGS) { int error, new_somaxkva; struct sysctlnode node; - int s; new_somaxkva = somaxkva; node = *rnode; @@ -1743,21 +2387,30 @@ sysctl_kern_somaxkva(SYSCTLFN_ARGS) if (new_somaxkva < (16 * 1024 * 1024)) /* sanity */ return (EINVAL); - s = splvm(); - simple_lock(&so_pendfree_slock); + mutex_enter(&so_pendfree_lock); somaxkva = new_somaxkva; - wakeup(&socurkva); - simple_unlock(&so_pendfree_slock); - splx(s); + cv_broadcast(&socurkva_cv); + mutex_exit(&so_pendfree_lock); return (error); } -SYSCTL_SETUP(sysctl_kern_somaxkva_setup, "sysctl kern.somaxkva setup") +static void +sysctl_kern_somaxkva_setup(void) { - sysctl_createv(SYSCTL_PERMANENT|SYSCTL_READWRITE, - CTLTYPE_INT, "somaxkva", NULL, + KASSERT(socket_sysctllog == NULL); + sysctl_createv(&socket_sysctllog, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "kern", NULL, + NULL, 0, NULL, 0, + CTL_KERN, CTL_EOL); + + sysctl_createv(&socket_sysctllog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "somaxkva", + SYSCTL_DESCR("Maximum amount of kernel memory to be " + "used for socket buffers"), sysctl_kern_somaxkva, 0, NULL, 0, CTL_KERN, KERN_SOMAXKVA, CTL_EOL); }