Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. =================================================================== RCS file: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v rcsdiff: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v: warning: Unknown phrases like `commitid ...;' are present. retrieving revision 1.55 retrieving revision 1.140.6.4 diff -u -p -r1.55 -r1.140.6.4 --- src/sys/kern/uipc_socket.c 2001/03/21 19:22:29 1.55 +++ src/sys/kern/uipc_socket.c 2007/11/11 16:48:17 1.140.6.4 @@ -1,4 +1,40 @@ -/* $NetBSD: uipc_socket.c,v 1.55 2001/03/21 19:22:29 thorpej Exp $ */ +/* $NetBSD: uipc_socket.c,v 1.140.6.4 2007/11/11 16:48:17 joerg Exp $ */ + +/*- + * Copyright (c) 2002, 2007 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Jason R. Thorpe of Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993 @@ -12,11 +48,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* @@ -35,12 +67,19 @@ * @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95 */ -#include "opt_compat_sunos.h" +#include +__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.140.6.4 2007/11/11 16:48:17 joerg Exp $"); + +#include "opt_sock_counters.h" +#include "opt_sosend_loan.h" +#include "opt_mbuftrace.h" +#include "opt_somaxkva.h" #include #include #include #include +#include #include #include #include @@ -51,18 +90,377 @@ #include #include #include +#include +#include +#include +#include +#include -struct pool socket_pool; +#include + +POOL_INIT(socket_pool, sizeof(struct socket), 0, 0, 0, "sockpl", NULL, + IPL_SOFTNET); + +MALLOC_DEFINE(M_SOOPTS, "soopts", "socket options"); +MALLOC_DEFINE(M_SONAME, "soname", "socket name"); + +extern const struct fileops socketops; extern int somaxconn; /* patchable (XXX sysctl) */ int somaxconn = SOMAXCONN; +#ifdef SOSEND_COUNTERS +#include + +static struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, + NULL, "sosend", "loan big"); +static struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, + NULL, "sosend", "copy big"); +static struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, + NULL, "sosend", "copy small"); +static struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, + NULL, "sosend", "kva limit"); + +#define SOSEND_COUNTER_INCR(ev) (ev)->ev_count++ + +EVCNT_ATTACH_STATIC(sosend_loan_big); +EVCNT_ATTACH_STATIC(sosend_copy_big); +EVCNT_ATTACH_STATIC(sosend_copy_small); +EVCNT_ATTACH_STATIC(sosend_kvalimit); +#else + +#define SOSEND_COUNTER_INCR(ev) /* nothing */ + +#endif /* SOSEND_COUNTERS */ + +static struct callback_entry sokva_reclaimerentry; + +#ifdef SOSEND_NO_LOAN +int sock_loan_thresh = -1; +#else +int sock_loan_thresh = 4096; +#endif + +static kmutex_t so_pendfree_lock; +static struct mbuf *so_pendfree; + +#ifndef SOMAXKVA +#define SOMAXKVA (16 * 1024 * 1024) +#endif +int somaxkva = SOMAXKVA; +static int socurkva; +static kcondvar_t socurkva_cv; + +#define SOCK_LOAN_CHUNK 
65536 + +static size_t sodopendfree(void); +static size_t sodopendfreel(void); + +static vsize_t +sokvareserve(struct socket *so, vsize_t len) +{ + int error; + + mutex_enter(&so_pendfree_lock); + while (socurkva + len > somaxkva) { + size_t freed; + + /* + * try to do pendfree. + */ + + freed = sodopendfreel(); + + /* + * if some kva was freed, try again. + */ + + if (freed) + continue; + + SOSEND_COUNTER_INCR(&sosend_kvalimit); + error = cv_wait_sig(&socurkva_cv, &so_pendfree_lock); + if (error) { + len = 0; + break; + } + } + socurkva += len; + mutex_exit(&so_pendfree_lock); + return len; +} + +static void +sokvaunreserve(vsize_t len) +{ + + mutex_enter(&so_pendfree_lock); + socurkva -= len; + cv_broadcast(&socurkva_cv); + mutex_exit(&so_pendfree_lock); +} + +/* + * sokvaalloc: allocate kva for loan. + */ + +vaddr_t +sokvaalloc(vsize_t len, struct socket *so) +{ + vaddr_t lva; + + /* + * reserve kva. + */ + + if (sokvareserve(so, len) == 0) + return 0; + + /* + * allocate kva. + */ + + lva = uvm_km_alloc(kernel_map, len, 0, UVM_KMF_VAONLY | UVM_KMF_WAITVA); + if (lva == 0) { + sokvaunreserve(len); + return (0); + } + + return lva; +} + +/* + * sokvafree: free kva for loan. + */ + +void +sokvafree(vaddr_t sva, vsize_t len) +{ + + /* + * free kva. + */ + + uvm_km_free(kernel_map, sva, len, UVM_KMF_VAONLY); + + /* + * unreserve kva. 
+ */ + + sokvaunreserve(len); +} + +static void +sodoloanfree(struct vm_page **pgs, void *buf, size_t size) +{ + vaddr_t va, sva, eva; + vsize_t len; + paddr_t pa; + int i, npgs; + + eva = round_page((vaddr_t) buf + size); + sva = trunc_page((vaddr_t) buf); + len = eva - sva; + npgs = len >> PAGE_SHIFT; + + if (__predict_false(pgs == NULL)) { + pgs = alloca(npgs * sizeof(*pgs)); + + for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) { + if (pmap_extract(pmap_kernel(), va, &pa) == false) + panic("sodoloanfree: va 0x%lx not mapped", va); + pgs[i] = PHYS_TO_VM_PAGE(pa); + } + } + + pmap_kremove(sva, len); + pmap_update(pmap_kernel()); + uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE); + sokvafree(sva, len); +} + +static size_t +sodopendfree() +{ + size_t rv; + + mutex_enter(&so_pendfree_lock); + rv = sodopendfreel(); + mutex_exit(&so_pendfree_lock); + + return rv; +} + +/* + * sodopendfreel: free mbufs on "pendfree" list. + * unlock and relock so_pendfree_lock when freeing mbufs. + * + * => called with so_pendfree_lock held. + */ + +static size_t +sodopendfreel() +{ + struct mbuf *m, *next; + size_t rv = 0; + + KASSERT(mutex_owned(&so_pendfree_lock)); + + while (so_pendfree != NULL) { + m = so_pendfree; + so_pendfree = NULL; + mutex_exit(&so_pendfree_lock); + + for (; m != NULL; m = next) { + next = m->m_next; + + rv += m->m_ext.ext_size; + sodoloanfree((m->m_flags & M_EXT_PAGES) ? + m->m_ext.ext_pgs : NULL, m->m_ext.ext_buf, + m->m_ext.ext_size); + pool_cache_put(mb_cache, m); + } + + mutex_enter(&so_pendfree_lock); + } + + return (rv); +} + +void +soloanfree(struct mbuf *m, void *buf, size_t size, void *arg) +{ + + if (m == NULL) { + + /* + * called from MEXTREMOVE. + */ + + sodoloanfree(NULL, buf, size); + return; + } + + /* + * postpone freeing mbuf. + * + * we can't do it in interrupt context + * because we need to put kva back to kernel_map. 
+ */ + + mutex_enter(&so_pendfree_lock); + m->m_next = so_pendfree; + so_pendfree = m; + cv_broadcast(&socurkva_cv); + mutex_exit(&so_pendfree_lock); +} + +static long +sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space) +{ + struct iovec *iov = uio->uio_iov; + vaddr_t sva, eva; + vsize_t len; + vaddr_t lva, va; + int npgs, i, error; + + if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) + return (0); + + if (iov->iov_len < (size_t) space) + space = iov->iov_len; + if (space > SOCK_LOAN_CHUNK) + space = SOCK_LOAN_CHUNK; + + eva = round_page((vaddr_t) iov->iov_base + space); + sva = trunc_page((vaddr_t) iov->iov_base); + len = eva - sva; + npgs = len >> PAGE_SHIFT; + + /* XXX KDASSERT */ + KASSERT(npgs <= M_EXT_MAXPAGES); + + lva = sokvaalloc(len, so); + if (lva == 0) + return 0; + + error = uvm_loan(&uio->uio_vmspace->vm_map, sva, len, + m->m_ext.ext_pgs, UVM_LOAN_TOPAGE); + if (error) { + sokvafree(lva, len); + return (0); + } + + for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE) + pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]), + VM_PROT_READ); + pmap_update(pmap_kernel()); + + lva += (vaddr_t) iov->iov_base & PAGE_MASK; + + MEXTADD(m, (void *) lva, space, M_MBUF, soloanfree, so); + m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP; + + uio->uio_resid -= space; + /* uio_offset not updated, not set/used for write(2) */ + uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + space; + uio->uio_iov->iov_len -= space; + if (uio->uio_iov->iov_len == 0) { + uio->uio_iov++; + uio->uio_iovcnt--; + } + + return (space); +} + +static int +sokva_reclaim_callback(struct callback_entry *ce, void *obj, void *arg) +{ + + KASSERT(ce == &sokva_reclaimerentry); + KASSERT(obj == NULL); + + sodopendfree(); + if (!vm_map_starved_p(kernel_map)) { + return CALLBACK_CHAIN_ABORT; + } + return CALLBACK_CHAIN_CONTINUE; +} + +struct mbuf * +getsombuf(struct socket *so) +{ + struct mbuf *m; + + m = m_get(M_WAIT, MT_SONAME); + MCLAIM(m, so->so_mowner); + return m; +} + 
+struct mbuf * +m_intopt(struct socket *so, int val) +{ + struct mbuf *m; + + m = getsombuf(so); + m->m_len = sizeof(int); + *mtod(m, int *) = val; + return m; +} + void soinit(void) { - pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0, - "sockpl", 0, NULL, NULL, M_SOCKET); + mutex_init(&so_pendfree_lock, MUTEX_DRIVER, IPL_VM); + cv_init(&socurkva_cv, "sokva"); + + /* Set the initial adjusted socket buffer size. */ + if (sb_max_set(sb_max)) + panic("bad initial sb_max value: %lu", sb_max); + + callback_register(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback, + &sokva_reclaimerentry, NULL, sokva_reclaim_callback); } /* @@ -74,63 +472,111 @@ soinit(void) */ /*ARGSUSED*/ int -socreate(int dom, struct socket **aso, int type, int proto) +socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l) { - struct proc *p; - struct protosw *prp; + const struct protosw *prp; struct socket *so; + uid_t uid; int error, s; - p = curproc; /* XXX */ + error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET, + KAUTH_REQ_NETWORK_SOCKET_OPEN, KAUTH_ARG(dom), KAUTH_ARG(type), + KAUTH_ARG(proto)); + if (error != 0) + return error; + if (proto) prp = pffindproto(dom, proto, type); else prp = pffindtype(dom, type); - if (prp == 0 || prp->pr_usrreq == 0) - return (EPROTONOSUPPORT); + if (prp == NULL) { + /* no support for domain */ + if (pffinddomain(dom) == 0) + return EAFNOSUPPORT; + /* no support for socket type */ + if (proto == 0 && type != 0) + return EPROTOTYPE; + return EPROTONOSUPPORT; + } + if (prp->pr_usrreq == NULL) + return EPROTONOSUPPORT; if (prp->pr_type != type) - return (EPROTOTYPE); + return EPROTOTYPE; s = splsoftnet(); so = pool_get(&socket_pool, PR_WAITOK); - memset((caddr_t)so, 0, sizeof(*so)); + memset(so, 0, sizeof(*so)); TAILQ_INIT(&so->so_q0); TAILQ_INIT(&so->so_q); so->so_type = type; so->so_proto = prp; so->so_send = sosend; so->so_receive = soreceive; - if (p != 0) - so->so_uid = p->p_ucred->cr_uid; - error = 
(*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0, - (struct mbuf *)(long)proto, (struct mbuf *)0, p); - if (error) { +#ifdef MBUFTRACE + so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner; + so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner; + so->so_mowner = &prp->pr_domain->dom_mowner; +#endif + selinit(&so->so_rcv.sb_sel); + selinit(&so->so_snd.sb_sel); + uid = kauth_cred_geteuid(l->l_cred); + so->so_uidinfo = uid_find(uid); + error = (*prp->pr_usrreq)(so, PRU_ATTACH, NULL, + (struct mbuf *)(long)proto, NULL, l); + if (error != 0) { so->so_state |= SS_NOFDREF; sofree(so); splx(s); - return (error); - } -#ifdef COMPAT_SUNOS - { - extern struct emul emul_sunos; - if (p->p_emul == &emul_sunos && type == SOCK_DGRAM) - so->so_options |= SO_BROADCAST; + return error; } -#endif splx(s); *aso = so; - return (0); + return 0; } +/* On success, write file descriptor to fdout and return zero. On + * failure, return non-zero; *fdout will be undefined. + */ int -sobind(struct socket *so, struct mbuf *nam, struct proc *p) +fsocreate(int domain, struct socket **sop, int type, int protocol, + struct lwp *l, int *fdout) +{ + struct filedesc *fdp; + struct socket *so; + struct file *fp; + int fd, error; + + fdp = l->l_proc->p_fd; + /* falloc() will use the desciptor for us */ + if ((error = falloc(l, &fp, &fd)) != 0) + return (error); + fp->f_flag = FREAD|FWRITE; + fp->f_type = DTYPE_SOCKET; + fp->f_ops = &socketops; + error = socreate(domain, &so, type, protocol, l); + if (error != 0) { + FILE_UNUSE(fp, l); + fdremove(fdp, fd); + ffree(fp); + } else { + if (sop != NULL) + *sop = so; + fp->f_data = so; + FILE_SET_MATURE(fp); + FILE_UNUSE(fp, l); + *fdout = fd; + } + return error; +} + +int +sobind(struct socket *so, struct mbuf *nam, struct lwp *l) { int s, error; s = splsoftnet(); - error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, (struct mbuf *)0, - nam, (struct mbuf *)0, p); + error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL, l); splx(s); - return (error); + 
return error; } int @@ -139,19 +585,19 @@ solisten(struct socket *so, int backlog) int s, error; s = splsoftnet(); - error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, (struct mbuf *)0, - (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); - if (error) { + error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, + NULL, NULL, NULL); + if (error != 0) { splx(s); - return (error); + return error; } - if (so->so_q.tqh_first == NULL) + if (TAILQ_EMPTY(&so->so_q)) so->so_options |= SO_ACCEPTCONN; if (backlog < 0) backlog = 0; so->so_qlimit = min(backlog, somaxconn); splx(s); - return (0); + return 0; } void @@ -169,8 +615,16 @@ sofree(struct socket *so) if (!soqremque(so, 0)) return; } - sbrelease(&so->so_snd); + if (so->so_rcv.sb_hiwat) + (void)chgsbsize(so->so_uidinfo, &so->so_rcv.sb_hiwat, 0, + RLIM_INFINITY); + if (so->so_snd.sb_hiwat) + (void)chgsbsize(so->so_uidinfo, &so->so_snd.sb_hiwat, 0, + RLIM_INFINITY); + sbrelease(&so->so_snd, so); sorflush(so); + seldestroy(&so->so_rcv.sb_sel); + seldestroy(&so->so_snd.sb_sel); pool_put(&socket_pool, so); } @@ -188,11 +642,11 @@ soclose(struct socket *so) error = 0; s = splsoftnet(); /* conservative */ if (so->so_options & SO_ACCEPTCONN) { - while ((so2 = so->so_q0.tqh_first) != 0) { + while ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) { (void) soqremque(so2, 0); (void) soabort(so2); } - while ((so2 = so->so_q.tqh_first) != 0) { + while ((so2 = TAILQ_FIRST(&so->so_q)) != 0) { (void) soqremque(so2, 1); (void) soabort(so2); } @@ -210,7 +664,7 @@ soclose(struct socket *so) (so->so_state & SS_NBIO)) goto drop; while (so->so_state & SS_ISCONNECTED) { - error = tsleep((caddr_t)&so->so_timeo, + error = tsleep((void *)&so->so_timeo, PSOCK | PCATCH, netcls, so->so_linger * hz); if (error) @@ -221,8 +675,7 @@ soclose(struct socket *so) drop: if (so->so_pcb) { int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0, - (struct proc *)0); + NULL, NULL, NULL, NULL); if (error == 0) 
error = error2; } @@ -241,9 +694,15 @@ soclose(struct socket *so) int soabort(struct socket *so) { + int error; - return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, (struct mbuf *)0, - (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); + KASSERT(so->so_head == NULL); + error = (*so->so_proto->pr_usrreq)(so, PRU_ABORT, NULL, + NULL, NULL, NULL); + if (error) { + sofree(so); + } + return error; } int @@ -259,7 +718,7 @@ soaccept(struct socket *so, struct mbuf if ((so->so_state & SS_ISDISCONNECTED) == 0 || (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0) error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, - (struct mbuf *)0, nam, (struct mbuf *)0, (struct proc *)0); + NULL, nam, NULL, NULL); else error = ECONNABORTED; @@ -268,12 +727,10 @@ soaccept(struct socket *so, struct mbuf } int -soconnect(struct socket *so, struct mbuf *nam) +soconnect(struct socket *so, struct mbuf *nam, struct lwp *l) { - struct proc *p; int s, error; - p = curproc; /* XXX */ if (so->so_options & SO_ACCEPTCONN) return (EOPNOTSUPP); s = splsoftnet(); @@ -289,7 +746,7 @@ soconnect(struct socket *so, struct mbuf error = EISCONN; else error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT, - (struct mbuf *)0, nam, (struct mbuf *)0, p); + NULL, nam, NULL, l); splx(s); return (error); } @@ -301,8 +758,7 @@ soconnect2(struct socket *so1, struct so s = splsoftnet(); error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, - (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0, - (struct proc *)0); + NULL, (struct mbuf *)so2, NULL, NULL); splx(s); return (error); } @@ -322,10 +778,10 @@ sodisconnect(struct socket *so) goto bad; } error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0, - (struct proc *)0); + NULL, NULL, NULL, NULL); bad: splx(s); + sodopendfree(); return (error); } @@ -349,14 +805,16 @@ sodisconnect(struct socket *so) */ int sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top, - struct mbuf *control, int 
flags) + struct mbuf *control, int flags, struct lwp *l) { - struct proc *p; struct mbuf **mp, *m; - long space, len, resid; - int clen, error, s, dontroute, mlen, atomic; + struct proc *p; + long space, len, resid, clen, mlen; + int error, s, dontroute, atomic; + + p = l->l_proc; + sodopendfree(); - p = curproc; /* XXX */ clen = 0; atomic = sosendallatonce(so) || top; if (uio) @@ -377,7 +835,8 @@ sosend(struct socket *so, struct mbuf *a dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); - p->p_stats->p_ru.ru_msgsnd++; + if (p) + p->p_stats->p_ru.ru_msgsnd++; if (control) clen = control->m_len; #define snderr(errno) { error = errno; splx(s); goto release; } @@ -409,7 +868,7 @@ sosend(struct socket *so, struct mbuf *a if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) snderr(EMSGSIZE); - if (space < resid + clen && uio && + if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { if (so->so_state & SS_NBIO) snderr(EWOULDBLOCK); @@ -432,34 +891,42 @@ sosend(struct socket *so, struct mbuf *a if (flags & MSG_EOR) top->m_flags |= M_EOR; } else do { - if (top == 0) { - MGETHDR(m, M_WAIT, MT_DATA); + if (top == NULL) { + m = m_gethdr(M_WAIT, MT_DATA); mlen = MHLEN; m->m_pkthdr.len = 0; - m->m_pkthdr.rcvif = (struct ifnet *)0; + m->m_pkthdr.rcvif = NULL; } else { - MGET(m, M_WAIT, MT_DATA); + m = m_get(M_WAIT, MT_DATA); mlen = MLEN; } + MCLAIM(m, so->so_snd.sb_mowner); + if (sock_loan_thresh >= 0 && + uio->uio_iov->iov_len >= sock_loan_thresh && + space >= sock_loan_thresh && + (len = sosend_loan(so, uio, m, + space)) != 0) { + SOSEND_COUNTER_INCR(&sosend_loan_big); + space -= len; + goto have_data; + } if (resid >= MINCLSIZE && space >= MCLBYTES) { - MCLGET(m, M_WAIT); + SOSEND_COUNTER_INCR(&sosend_copy_big); + m_clget(m, M_WAIT); if ((m->m_flags & M_EXT) == 0) goto nopages; mlen = MCLBYTES; -#ifdef MAPPED_MBUFS - len = min(MCLBYTES, resid); -#else if 
(atomic && top == 0) { - len = min(MCLBYTES - max_hdr, + len = lmin(MCLBYTES - max_hdr, resid); m->m_data += max_hdr; } else - len = min(MCLBYTES, resid); -#endif + len = lmin(MCLBYTES, resid); space -= len; } else { -nopages: - len = min(min(mlen, resid), space); + nopages: + SOSEND_COUNTER_INCR(&sosend_copy_small); + len = lmin(lmin(mlen, resid), space); space -= len; /* * For datagram protocols, leave room @@ -468,13 +935,13 @@ nopages: if (atomic && top == 0 && len < mlen) MH_ALIGN(m, len); } - error = uiomove(mtod(m, caddr_t), (int)len, - uio); + error = uiomove(mtod(m, void *), (int)len, uio); + have_data: resid = uio->uio_resid; m->m_len = len; *mp = m; top->m_pkthdr.len += len; - if (error) + if (error != 0) goto release; mp = &m->m_next; if (resid <= 0) { @@ -483,7 +950,7 @@ nopages: break; } } while (space > 0 && atomic); - + s = splsoftnet(); if (so->so_state & SS_CANTSENDMORE) @@ -495,7 +962,7 @@ nopages: so->so_state |= SS_MORETOCOME; error = (*so->so_proto->pr_usrreq)(so, (flags & MSG_OOB) ? 
PRU_SENDOOB : PRU_SEND, - top, addr, control, p); + top, addr, control, curlwp); /* XXX */ if (dontroute) so->so_options &= ~SO_DONTROUTE; if (resid > 0) @@ -503,10 +970,10 @@ nopages: splx(s); clen = 0; - control = 0; - top = 0; + control = NULL; + top = NULL; mp = &top; - if (error) + if (error != 0) goto release; } while (resid && space > 0); } while (resid); @@ -541,49 +1008,54 @@ int soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { + struct lwp *l = curlwp; struct mbuf *m, **mp; int flags, len, error, s, offset, moff, type, orig_resid; - struct protosw *pr; + const struct protosw *pr; struct mbuf *nextrecord; + int mbuf_removed = 0; pr = so->so_proto; mp = mp0; type = 0; orig_resid = uio->uio_resid; - if (paddr) - *paddr = 0; - if (controlp) - *controlp = 0; - if (flagsp) + + if (paddr != NULL) + *paddr = NULL; + if (controlp != NULL) + *controlp = NULL; + if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; + + if ((flags & MSG_DONTWAIT) == 0) + sodopendfree(); + if (flags & MSG_OOB) { m = m_get(M_WAIT, MT_DATA); error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m, - (struct mbuf *)(long)(flags & MSG_PEEK), (struct mbuf *)0, - (struct proc *)0); + (struct mbuf *)(long)(flags & MSG_PEEK), NULL, l); if (error) goto bad; do { - error = uiomove(mtod(m, caddr_t), + error = uiomove(mtod(m, void *), (int) min(uio->uio_resid, m->m_len), uio); m = m_free(m); - } while (uio->uio_resid && error == 0 && m); + } while (uio->uio_resid > 0 && error == 0 && m); bad: - if (m) + if (m != NULL) m_freem(m); - return (error); + return error; } - if (mp) - *mp = (struct mbuf *)0; + if (mp != NULL) + *mp = NULL; if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) - (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, - (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); + (*pr->pr_usrreq)(so, PRU_RCVD, NULL, NULL, NULL, l); restart: if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) - return (error); 
+ return error; s = splsoftnet(); m = so->so_rcv.sb_mb; @@ -598,17 +1070,20 @@ soreceive(struct socket *so, struct mbuf * we have to do the receive in sections, and thus risk returning * a short count if a timeout or signal occurs after we start. */ - if (m == 0 || (((flags & MSG_DONTWAIT) == 0 && - so->so_rcv.sb_cc < uio->uio_resid) && - (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || - ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && - m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) { + if (m == NULL || + ((flags & MSG_DONTWAIT) == 0 && + so->so_rcv.sb_cc < uio->uio_resid && + (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || + ((flags & MSG_WAITALL) && + uio->uio_resid <= so->so_rcv.sb_hiwat)) && + m->m_nextpkt == NULL && + (pr->pr_flags & PR_ATOMIC) == 0)) { #ifdef DIAGNOSTIC - if (m == 0 && so->so_rcv.sb_cc) + if (m == NULL && so->so_rcv.sb_cc) panic("receive 1"); #endif if (so->so_error) { - if (m) + if (m != NULL) goto dontblock; error = so->so_error; if ((flags & MSG_PEEK) == 0) @@ -616,12 +1091,12 @@ soreceive(struct socket *so, struct mbuf goto release; } if (so->so_state & SS_CANTRCVMORE) { - if (m) + if (m != NULL) goto dontblock; else goto release; } - for (; m; m = m->m_next) + for (; m != NULL; m = m->m_next) if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { m = so->so_rcv.sb_mb; goto dontblock; @@ -637,18 +1112,26 @@ soreceive(struct socket *so, struct mbuf error = EWOULDBLOCK; goto release; } + SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); sbunlock(&so->so_rcv); error = sbwait(&so->so_rcv); splx(s); - if (error) - return (error); + if (error != 0) + return error; goto restart; } dontblock: -#ifdef notyet /* XXXX */ - if (uio->uio_procp) - uio->uio_procp->p_stats->p_ru.ru_msgrcv++; -#endif + /* + * On entry here, m points to the first record of the socket buffer. 
+ * While we process the initial mbufs containing address and control + * info, we save a copy of m->m_nextpkt into nextrecord. + */ + if (l != NULL) + l->l_proc->p_stats->p_ru.ru_msgrcv++; + KASSERT(m == so->so_rcv.sb_mb); + SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); nextrecord = m->m_nextpkt; if (pr->pr_flags & PR_ADDR) { #ifdef DIAGNOSTIC @@ -662,10 +1145,11 @@ soreceive(struct socket *so, struct mbuf m = m->m_next; } else { sbfree(&so->so_rcv, m); - if (paddr) { + mbuf_removed = 1; + if (paddr != NULL) { *paddr = m; so->so_rcv.sb_mb = m->m_next; - m->m_next = 0; + m->m_next = NULL; m = so->so_rcv.sb_mb; } else { MFREE(m, so->so_rcv.sb_mb); @@ -673,42 +1157,77 @@ soreceive(struct socket *so, struct mbuf } } } - while (m && m->m_type == MT_CONTROL && error == 0) { + while (m != NULL && m->m_type == MT_CONTROL && error == 0) { if (flags & MSG_PEEK) { - if (controlp) + if (controlp != NULL) *controlp = m_copy(m, 0, m->m_len); m = m->m_next; } else { sbfree(&so->so_rcv, m); - if (controlp) { - if (pr->pr_domain->dom_externalize && + mbuf_removed = 1; + if (controlp != NULL) { + struct domain *dom = pr->pr_domain; + if (dom->dom_externalize && l && mtod(m, struct cmsghdr *)->cmsg_type == SCM_RIGHTS) - error = (*pr->pr_domain->dom_externalize)(m); + error = (*dom->dom_externalize)(m, l); *controlp = m; so->so_rcv.sb_mb = m->m_next; - m->m_next = 0; + m->m_next = NULL; m = so->so_rcv.sb_mb; } else { + /* + * Dispose of any SCM_RIGHTS message that went + * through the read path rather than recv. + */ + if (pr->pr_domain->dom_dispose && + mtod(m, struct cmsghdr *)->cmsg_type == SCM_RIGHTS) + (*pr->pr_domain->dom_dispose)(m); MFREE(m, so->so_rcv.sb_mb); m = so->so_rcv.sb_mb; } } - if (controlp) { + if (controlp != NULL) { orig_resid = 0; controlp = &(*controlp)->m_next; } } - if (m) { - if ((flags & MSG_PEEK) == 0) + + /* + * If m is non-NULL, we have some data to read. 
From now on, + * make sure to keep sb_lastrecord consistent when working on + * the last packet on the chain (nextrecord == NULL) and we + * change m->m_nextpkt. + */ + if (m != NULL) { + if ((flags & MSG_PEEK) == 0) { m->m_nextpkt = nextrecord; + /* + * If nextrecord == NULL (this is a single chain), + * then sb_lastrecord may not be valid here if m + * was changed earlier. + */ + if (nextrecord == NULL) { + KASSERT(so->so_rcv.sb_mb == m); + so->so_rcv.sb_lastrecord = m; + } + } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; + } else { + if ((flags & MSG_PEEK) == 0) { + KASSERT(so->so_rcv.sb_mb == m); + so->so_rcv.sb_mb = nextrecord; + SB_EMPTY_FIXUP(&so->so_rcv); + } } + SBLASTRECORDCHK(&so->so_rcv, "soreceive 2"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive 2"); + moff = 0; offset = 0; - while (m && uio->uio_resid > 0 && error == 0) { + while (m != NULL && uio->uio_resid > 0 && error == 0) { if (m->m_type == MT_OOBDATA) { if (type != MT_OOBDATA) break; @@ -732,10 +1251,30 @@ soreceive(struct socket *so, struct mbuf * we must note any additions to the sockbuf when we * block interrupts again. */ - if (mp == 0) { + if (mp == NULL) { + SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); splx(s); - error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio); + error = uiomove(mtod(m, char *) + moff, (int)len, uio); s = splsoftnet(); + if (error != 0) { + /* + * If any part of the record has been removed + * (such as the MT_SONAME mbuf, which will + * happen when PR_ADDR, and thus also + * PR_ATOMIC, is set), then drop the entire + * record to maintain the atomicity of the + * receive operation. + * + * This avoids a later panic("receive 1a") + * when compiled with DIAGNOSTIC. 
+ */ + if (m && mbuf_removed + && (pr->pr_flags & PR_ATOMIC)) + (void) sbdroprecord(&so->so_rcv); + + goto release; + } } else uio->uio_resid -= len; if (len == m->m_len - moff) { @@ -751,24 +1290,35 @@ soreceive(struct socket *so, struct mbuf *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; - *mp = (struct mbuf *)0; + *mp = NULL; } else { MFREE(m, so->so_rcv.sb_mb); m = so->so_rcv.sb_mb; } - if (m) + /* + * If m != NULL, we also know that + * so->so_rcv.sb_mb != NULL. + */ + KASSERT(so->so_rcv.sb_mb == m); + if (m) { m->m_nextpkt = nextrecord; + if (nextrecord == NULL) + so->so_rcv.sb_lastrecord = m; + } else { + so->so_rcv.sb_mb = nextrecord; + SB_EMPTY_FIXUP(&so->so_rcv); + } + SBLASTRECORDCHK(&so->so_rcv, "soreceive 3"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive 3"); } - } else { - if (flags & MSG_PEEK) - moff += len; - else { - if (mp) - *mp = m_copym(m, 0, len, M_WAIT); - m->m_data += len; - m->m_len -= len; - so->so_rcv.sb_cc -= len; - } + } else if (flags & MSG_PEEK) + moff += len; + else { + if (mp != NULL) + *mp = m_copym(m, 0, len, M_WAIT); + m->m_data += len; + m->m_len -= len; + so->so_rcv.sb_cc -= len; } if (so->so_oobmark) { if ((flags & MSG_PEEK) == 0) { @@ -792,15 +1342,31 @@ soreceive(struct socket *so, struct mbuf * with a short count but without error. * Keep sockbuf locked against other readers. */ - while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 && + while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && !nextrecord) { if (so->so_error || so->so_state & SS_CANTRCVMORE) break; + /* + * If we are peeking and the socket receive buffer is + * full, stop since we can't get more data to peek at. + */ + if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0) + break; + /* + * If we've drained the socket buffer, tell the + * protocol in case it needs to do something to + * get it filled again. 
+ */ + if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb) + (*pr->pr_usrreq)(so, PRU_RCVD, + NULL, (struct mbuf *)(long)flags, NULL, l); + SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); error = sbwait(&so->so_rcv); - if (error) { + if (error != 0) { sbunlock(&so->so_rcv); splx(s); - return (0); + return 0; } if ((m = so->so_rcv.sb_mb) != NULL) nextrecord = m->m_nextpkt; @@ -813,12 +1379,24 @@ soreceive(struct socket *so, struct mbuf (void) sbdroprecord(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { - if (m == 0) + if (m == NULL) { + /* + * First part is an inline SB_EMPTY_FIXUP(). Second + * part makes sure sb_lastrecord is up-to-date if + * there is still data in the socket buffer. + */ so->so_rcv.sb_mb = nextrecord; + if (so->so_rcv.sb_mb == NULL) { + so->so_rcv.sb_mbtail = NULL; + so->so_rcv.sb_lastrecord = NULL; + } else if (nextrecord->m_nextpkt == NULL) + so->so_rcv.sb_lastrecord = nextrecord; + } + SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) - (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, - (struct mbuf *)(long)flags, (struct mbuf *)0, - (struct proc *)0); + (*pr->pr_usrreq)(so, PRU_RCVD, NULL, + (struct mbuf *)(long)flags, NULL, l); } if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { @@ -826,19 +1404,19 @@ soreceive(struct socket *so, struct mbuf splx(s); goto restart; } - - if (flagsp) + + if (flagsp != NULL) *flagsp |= flags; release: sbunlock(&so->so_rcv); splx(s); - return (error); + return error; } int soshutdown(struct socket *so, int how) { - struct protosw *pr; + const struct protosw *pr; pr = so->so_proto; if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) @@ -847,171 +1425,172 @@ soshutdown(struct socket *so, int how) if (how == SHUT_RD || how == SHUT_RDWR) sorflush(so); if (how == SHUT_WR || how == SHUT_RDWR) - return 
(*pr->pr_usrreq)(so, PRU_SHUTDOWN, (struct mbuf *)0, - (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); - return (0); + return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL, + NULL, NULL, NULL); + return 0; } void sorflush(struct socket *so) { struct sockbuf *sb, asb; - struct protosw *pr; + const struct protosw *pr; int s; sb = &so->so_rcv; pr = so->so_proto; sb->sb_flags |= SB_NOINTR; (void) sblock(sb, M_WAITOK); - s = splimp(); + s = splnet(); socantrcvmore(so); sbunlock(sb); asb = *sb; - memset((caddr_t)sb, 0, sizeof(*sb)); + /* + * Clear most of the sockbuf structure, but leave some of the + * fields valid. + */ + memset(&sb->sb_startzero, 0, + sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); splx(s); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) (*pr->pr_domain->dom_dispose)(asb.sb_mb); - sbrelease(&asb); + sbrelease(&asb, so); } -int -sosetopt(struct socket *so, int level, int optname, struct mbuf *m0) +static int +sosetopt1(struct socket *so, int level, int optname, struct mbuf *m) { - int error; - struct mbuf *m; - - error = 0; - m = m0; - if (level != SOL_SOCKET) { - if (so->so_proto && so->so_proto->pr_ctloutput) - return ((*so->so_proto->pr_ctloutput) - (PRCO_SETOPT, so, level, optname, &m0)); - error = ENOPROTOOPT; - } else { - switch (optname) { + int optval, val; + struct linger *l; + struct sockbuf *sb; + struct timeval *tv; + + switch (optname) { + + case SO_LINGER: + if (m == NULL || m->m_len != sizeof(struct linger)) + return EINVAL; + l = mtod(m, struct linger *); + if (l->l_linger < 0 || l->l_linger > USHRT_MAX || + l->l_linger > (INT_MAX / hz)) + return EDOM; + so->so_linger = l->l_linger; + if (l->l_onoff) + so->so_options |= SO_LINGER; + else + so->so_options &= ~SO_LINGER; + break; + + case SO_DEBUG: + case SO_KEEPALIVE: + case SO_DONTROUTE: + case SO_USELOOPBACK: + case SO_BROADCAST: + case SO_REUSEADDR: + case SO_REUSEPORT: + case SO_OOBINLINE: + case SO_TIMESTAMP: + if (m == NULL || m->m_len < sizeof(int)) + return 
EINVAL; + if (*mtod(m, int *)) + so->so_options |= optname; + else + so->so_options &= ~optname; + break; + + case SO_SNDBUF: + case SO_RCVBUF: + case SO_SNDLOWAT: + case SO_RCVLOWAT: + if (m == NULL || m->m_len < sizeof(int)) + return EINVAL; - case SO_LINGER: - if (m == NULL || m->m_len != sizeof(struct linger)) { - error = EINVAL; - goto bad; - } - so->so_linger = mtod(m, struct linger *)->l_linger; - /* fall thru... */ + /* + * Values < 1 make no sense for any of these + * options, so disallow them. + */ + optval = *mtod(m, int *); + if (optval < 1) + return EINVAL; - case SO_DEBUG: - case SO_KEEPALIVE: - case SO_DONTROUTE: - case SO_USELOOPBACK: - case SO_BROADCAST: - case SO_REUSEADDR: - case SO_REUSEPORT: - case SO_OOBINLINE: - case SO_TIMESTAMP: - if (m == NULL || m->m_len < sizeof(int)) { - error = EINVAL; - goto bad; - } - if (*mtod(m, int *)) - so->so_options |= optname; - else - so->so_options &= ~optname; - break; + switch (optname) { case SO_SNDBUF: case SO_RCVBUF: + sb = (optname == SO_SNDBUF) ? + &so->so_snd : &so->so_rcv; + if (sbreserve(sb, (u_long)optval, so) == 0) + return ENOBUFS; + sb->sb_flags &= ~SB_AUTOSIZE; + break; + + /* + * Make sure the low-water is never greater than + * the high-water. + */ case SO_SNDLOWAT: + so->so_snd.sb_lowat = + (optval > so->so_snd.sb_hiwat) ? + so->so_snd.sb_hiwat : optval; + break; case SO_RCVLOWAT: - { - int optval; - - if (m == NULL || m->m_len < sizeof(int)) { - error = EINVAL; - goto bad; - } - - /* - * Values < 1 make no sense for any of these - * options, so disallow them. - */ - optval = *mtod(m, int *); - if (optval < 1) { - error = EINVAL; - goto bad; - } - - switch (optname) { + so->so_rcv.sb_lowat = + (optval > so->so_rcv.sb_hiwat) ? + so->so_rcv.sb_hiwat : optval; + break; + } + break; - case SO_SNDBUF: - case SO_RCVBUF: - if (sbreserve(optname == SO_SNDBUF ? 
- &so->so_snd : &so->so_rcv, - (u_long) optval) == 0) { - error = ENOBUFS; - goto bad; - } - break; + case SO_SNDTIMEO: + case SO_RCVTIMEO: + if (m == NULL || m->m_len < sizeof(*tv)) + return EINVAL; + tv = mtod(m, struct timeval *); + if (tv->tv_sec > (INT_MAX - tv->tv_usec / tick) / hz) + return EDOM; + val = tv->tv_sec * hz + tv->tv_usec / tick; + if (val == 0 && tv->tv_usec != 0) + val = 1; - /* - * Make sure the low-water is never greater than - * the high-water. - */ - case SO_SNDLOWAT: - so->so_snd.sb_lowat = - (optval > so->so_snd.sb_hiwat) ? - so->so_snd.sb_hiwat : optval; - break; - case SO_RCVLOWAT: - so->so_rcv.sb_lowat = - (optval > so->so_rcv.sb_hiwat) ? - so->so_rcv.sb_hiwat : optval; - break; - } - break; - } + switch (optname) { case SO_SNDTIMEO: + so->so_snd.sb_timeo = val; + break; case SO_RCVTIMEO: - { - struct timeval *tv; - short val; + so->so_rcv.sb_timeo = val; + break; + } + break; - if (m == NULL || m->m_len < sizeof(*tv)) { - error = EINVAL; - goto bad; - } - tv = mtod(m, struct timeval *); - if (tv->tv_sec * hz + tv->tv_usec / tick > SHRT_MAX) { - error = EDOM; - goto bad; - } - val = tv->tv_sec * hz + tv->tv_usec / tick; + default: + return ENOPROTOOPT; + } + return 0; +} - switch (optname) { +int +sosetopt(struct socket *so, int level, int optname, struct mbuf *m) +{ + int error, prerr; - case SO_SNDTIMEO: - so->so_snd.sb_timeo = val; - break; - case SO_RCVTIMEO: - so->so_rcv.sb_timeo = val; - break; - } - break; - } + if (level == SOL_SOCKET) + error = sosetopt1(so, level, optname, m); + else + error = ENOPROTOOPT; - default: - error = ENOPROTOOPT; - break; - } - if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) { - (void) ((*so->so_proto->pr_ctloutput) - (PRCO_SETOPT, so, level, optname, &m0)); - m = NULL; /* freed by protocol */ - } - } - bad: - if (m) - (void) m_free(m); - return (error); + if ((error == 0 || error == ENOPROTOOPT) && + so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) { + /* give the protocol 
stack a shot */ + prerr = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, level, + optname, &m); + if (prerr == 0) + error = 0; + else if (prerr != ENOPROTOOPT) + error = prerr; + } else if (m != NULL) + (void)m_free(m); + return error; } int @@ -1034,7 +1613,7 @@ sogetopt(struct socket *so, int level, i case SO_LINGER: m->m_len = sizeof(struct linger); mtod(m, struct linger *)->l_onoff = - so->so_options & SO_LINGER; + (so->so_options & SO_LINGER) ? 1 : 0; mtod(m, struct linger *)->l_linger = so->so_linger; break; @@ -1047,7 +1626,7 @@ sogetopt(struct socket *so, int level, i case SO_BROADCAST: case SO_OOBINLINE: case SO_TIMESTAMP: - *mtod(m, int *) = so->so_options & optname; + *mtod(m, int *) = (so->so_options & optname) ? 1 : 0; break; case SO_TYPE: @@ -1088,6 +1667,10 @@ sogetopt(struct socket *so, int level, i break; } + case SO_OVERFLOWED: + *mtod(m, int *) = so->so_rcv.sb_overflowed; + break; + default: (void)m_free(m); return (ENOPROTOOPT); @@ -1100,11 +1683,172 @@ sogetopt(struct socket *so, int level, i void sohasoutofband(struct socket *so) { - struct proc *p; - - if (so->so_pgid < 0) - gsignal(-so->so_pgid, SIGURG); - else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0) - psignal(p, SIGURG); + fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so); selwakeup(&so->so_rcv.sb_sel); } + +static void +filt_sordetach(struct knote *kn) +{ + struct socket *so; + + so = (struct socket *)kn->kn_fp->f_data; + SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext); + if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist)) + so->so_rcv.sb_flags &= ~SB_KNOTE; +} + +/*ARGSUSED*/ +static int +filt_soread(struct knote *kn, long hint) +{ + struct socket *so; + + so = (struct socket *)kn->kn_fp->f_data; + kn->kn_data = so->so_rcv.sb_cc; + if (so->so_state & SS_CANTRCVMORE) { + kn->kn_flags |= EV_EOF; + kn->kn_fflags = so->so_error; + return (1); + } + if (so->so_error) /* temporary udp error */ + return (1); + if (kn->kn_sfflags & NOTE_LOWAT) + return 
(kn->kn_data >= kn->kn_sdata); + return (kn->kn_data >= so->so_rcv.sb_lowat); +} + +static void +filt_sowdetach(struct knote *kn) +{ + struct socket *so; + + so = (struct socket *)kn->kn_fp->f_data; + SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext); + if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist)) + so->so_snd.sb_flags &= ~SB_KNOTE; +} + +/*ARGSUSED*/ +static int +filt_sowrite(struct knote *kn, long hint) +{ + struct socket *so; + + so = (struct socket *)kn->kn_fp->f_data; + kn->kn_data = sbspace(&so->so_snd); + if (so->so_state & SS_CANTSENDMORE) { + kn->kn_flags |= EV_EOF; + kn->kn_fflags = so->so_error; + return (1); + } + if (so->so_error) /* temporary udp error */ + return (1); + if (((so->so_state & SS_ISCONNECTED) == 0) && + (so->so_proto->pr_flags & PR_CONNREQUIRED)) + return (0); + if (kn->kn_sfflags & NOTE_LOWAT) + return (kn->kn_data >= kn->kn_sdata); + return (kn->kn_data >= so->so_snd.sb_lowat); +} + +/*ARGSUSED*/ +static int +filt_solisten(struct knote *kn, long hint) +{ + struct socket *so; + + so = (struct socket *)kn->kn_fp->f_data; + + /* + * Set kn_data to number of incoming connections, not + * counting partial (incomplete) connections. 
+ */ + kn->kn_data = so->so_qlen; + return (kn->kn_data > 0); +} + +static const struct filterops solisten_filtops = + { 1, NULL, filt_sordetach, filt_solisten }; +static const struct filterops soread_filtops = + { 1, NULL, filt_sordetach, filt_soread }; +static const struct filterops sowrite_filtops = + { 1, NULL, filt_sowdetach, filt_sowrite }; + +int +soo_kqfilter(struct file *fp, struct knote *kn) +{ + struct socket *so; + struct sockbuf *sb; + + so = (struct socket *)kn->kn_fp->f_data; + switch (kn->kn_filter) { + case EVFILT_READ: + if (so->so_options & SO_ACCEPTCONN) + kn->kn_fop = &solisten_filtops; + else + kn->kn_fop = &soread_filtops; + sb = &so->so_rcv; + break; + case EVFILT_WRITE: + kn->kn_fop = &sowrite_filtops; + sb = &so->so_snd; + break; + default: + return (1); + } + SLIST_INSERT_HEAD(&sb->sb_sel.sel_klist, kn, kn_selnext); + sb->sb_flags |= SB_KNOTE; + return (0); +} + +#include + +static int sysctl_kern_somaxkva(SYSCTLFN_PROTO); + +/* + * sysctl helper routine for kern.somaxkva. ensures that the given + * value is not too small. + * (XXX should we maybe make sure it's not too large as well?) 
+ */ +static int +sysctl_kern_somaxkva(SYSCTLFN_ARGS) +{ + int error, new_somaxkva; + struct sysctlnode node; + + new_somaxkva = somaxkva; + node = *rnode; + node.sysctl_data = &new_somaxkva; + error = sysctl_lookup(SYSCTLFN_CALL(&node)); + if (error || newp == NULL) + return (error); + + if (new_somaxkva < (16 * 1024 * 1024)) /* sanity */ + return (EINVAL); + + mutex_enter(&so_pendfree_lock); + somaxkva = new_somaxkva; + cv_broadcast(&socurkva_cv); + mutex_exit(&so_pendfree_lock); + + return (error); +} + +SYSCTL_SETUP(sysctl_kern_somaxkva_setup, "sysctl kern.somaxkva setup") +{ + + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "kern", NULL, + NULL, 0, NULL, 0, + CTL_KERN, CTL_EOL); + + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "somaxkva", + SYSCTL_DESCR("Maximum amount of kernel memory to be " + "used for socket buffers"), + sysctl_kern_somaxkva, 0, NULL, 0, + CTL_KERN, KERN_SOMAXKVA, CTL_EOL); +}