Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. =================================================================== RCS file: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v rcsdiff: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v: warning: Unknown phrases like `commitid ...;' are present. retrieving revision 1.66.2.3 retrieving revision 1.120.2.1 diff -u -p -r1.66.2.3 -r1.120.2.1 --- src/sys/kern/uipc_socket.c 2002/08/29 05:23:14 1.66.2.3 +++ src/sys/kern/uipc_socket.c 2006/07/13 17:49:51 1.120.2.1 @@ -1,4 +1,4 @@ -/* $NetBSD: uipc_socket.c,v 1.66.2.3 2002/08/29 05:23:14 gehenna Exp $ */ +/* $NetBSD: uipc_socket.c,v 1.120.2.1 2006/07/13 17:49:51 gdamore Exp $ */ /*- * Copyright (c) 2002 The NetBSD Foundation, Inc. @@ -48,11 +48,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * @@ -72,10 +68,12 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.66.2.3 2002/08/29 05:23:14 gehenna Exp $"); +__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.120.2.1 2006/07/13 17:49:51 gdamore Exp $"); #include "opt_sock_counters.h" #include "opt_sosend_loan.h" +#include "opt_mbuftrace.h" +#include "opt_somaxkva.h" #include #include @@ -91,10 +89,16 @@ __KERNEL_RCSID(0, "$NetBSD: uipc_socket. #include #include #include +#include +#include +#include #include -struct pool socket_pool; +POOL_INIT(socket_pool, sizeof(struct socket), 0, 0, 0, "sockpl", NULL); + +MALLOC_DEFINE(M_SOOPTS, "soopts", "socket options"); +MALLOC_DEFINE(M_SONAME, "soname", "socket name"); extern int somaxconn; /* patchable (XXX sysctl) */ int somaxconn = SOMAXCONN; @@ -102,57 +106,157 @@ int somaxconn = SOMAXCONN; #ifdef SOSEND_COUNTERS #include -struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, +static struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "sosend", "loan big"); -struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, +static struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "sosend", "copy big"); -struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, +static struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "sosend", "copy small"); -struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, +static struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "sosend", "kva limit"); #define SOSEND_COUNTER_INCR(ev) (ev)->ev_count++ +EVCNT_ATTACH_STATIC(sosend_loan_big); +EVCNT_ATTACH_STATIC(sosend_copy_big); +EVCNT_ATTACH_STATIC(sosend_copy_small); +EVCNT_ATTACH_STATIC(sosend_kvalimit); #else #define SOSEND_COUNTER_INCR(ev) /* nothing */ #endif /* SOSEND_COUNTERS */ -void -soinit(void) -{ - - pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0, - "sockpl", NULL); - -#ifdef SOSEND_COUNTERS - evcnt_attach_static(&sosend_loan_big); - evcnt_attach_static(&sosend_copy_big); - evcnt_attach_static(&sosend_copy_small); - evcnt_attach_static(&sosend_kvalimit); -#endif /* SOSEND_COUNTERS */ -} +static struct callback_entry sokva_reclaimerentry; #ifdef SOSEND_NO_LOAN -int use_sosend_loan = 0; +int sock_loan_thresh = -1; #else -int use_sosend_loan = 1; +int sock_loan_thresh = 4096; #endif -struct mbuf *so_pendfree; +static struct simplelock so_pendfree_slock = SIMPLELOCK_INITIALIZER; +static struct mbuf *so_pendfree; -int somaxkva = 16 * 1024 * 1024; -int socurkva; -int sokvawaiters; +#ifndef SOMAXKVA +#define SOMAXKVA (16 * 1024 * 1024) +#endif +int somaxkva = SOMAXKVA; +static int socurkva; +static int sokvawaiters; -#define SOCK_LOAN_THRESH 4096 #define SOCK_LOAN_CHUNK 65536 +static size_t sodopendfree(void); +static size_t sodopendfreel(void); + +static vsize_t +sokvareserve(struct socket *so, vsize_t len) +{ + int s; + int error; + + s = splvm(); + simple_lock(&so_pendfree_slock); + while (socurkva + len > somaxkva) { + size_t freed; + + /* + * try to do pendfree. + */ + + freed = sodopendfreel(); + + /* + * if some kva was freed, try again. + */ + + if (freed) + continue; + + SOSEND_COUNTER_INCR(&sosend_kvalimit); + sokvawaiters++; + error = ltsleep(&socurkva, PVM | PCATCH, "sokva", 0, + &so_pendfree_slock); + sokvawaiters--; + if (error) { + len = 0; + break; + } + } + socurkva += len; + simple_unlock(&so_pendfree_slock); + splx(s); + return len; +} + static void -sodoloanfree(caddr_t buf, u_int size) +sokvaunreserve(vsize_t len) +{ + int s; + + s = splvm(); + simple_lock(&so_pendfree_slock); + socurkva -= len; + if (sokvawaiters) + wakeup(&socurkva); + simple_unlock(&so_pendfree_slock); + splx(s); +} + +/* + * sokvaalloc: allocate kva for loan. + */ + +vaddr_t +sokvaalloc(vsize_t len, struct socket *so) +{ + vaddr_t lva; + + /* + * reserve kva. + */ + + if (sokvareserve(so, len) == 0) + return 0; + + /* + * allocate kva. + */ + + lva = uvm_km_alloc(kernel_map, len, 0, UVM_KMF_VAONLY | UVM_KMF_WAITVA); + if (lva == 0) { + sokvaunreserve(len); + return (0); + } + + return lva; +} + +/* + * sokvafree: free kva for loan. + */ + +void +sokvafree(vaddr_t sva, vsize_t len) +{ + + /* + * free kva. + */ + + uvm_km_free(kernel_map, sva, len, UVM_KMF_VAONLY); + + /* + * unreserve kva. + */ + + sokvaunreserve(len); +} + +static void +sodoloanfree(struct vm_page **pgs, caddr_t buf, size_t size) { - struct vm_page **pgs; vaddr_t va, sva, eva; vsize_t len; paddr_t pa; @@ -163,79 +267,110 @@ sodoloanfree(caddr_t buf, u_int size) len = eva - sva; npgs = len >> PAGE_SHIFT; - pgs = alloca(npgs * sizeof(*pgs)); + if (__predict_false(pgs == NULL)) { + pgs = alloca(npgs * sizeof(*pgs)); - for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) { - if (pmap_extract(pmap_kernel(), va, &pa) == FALSE) - panic("sodoloanfree: va 0x%lx not mapped", va); - pgs[i] = PHYS_TO_VM_PAGE(pa); + for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) { + if (pmap_extract(pmap_kernel(), va, &pa) == FALSE) + panic("sodoloanfree: va 0x%lx not mapped", va); + pgs[i] = PHYS_TO_VM_PAGE(pa); + } } pmap_kremove(sva, len); pmap_update(pmap_kernel()); uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE); - uvm_km_free(kernel_map, sva, len); - socurkva -= len; - if (sokvawaiters) - wakeup(&socurkva); + sokvafree(sva, len); } static size_t -sodopendfree(struct socket *so) +sodopendfree() { - struct mbuf *m; - size_t rv = 0; int s; + size_t rv; s = splvm(); + simple_lock(&so_pendfree_slock); + rv = sodopendfreel(); + simple_unlock(&so_pendfree_slock); + splx(s); - for (;;) { - m = so_pendfree; - if (m == NULL) - break; - so_pendfree = m->m_next; - splx(s); + return rv; +} - rv += m->m_ext.ext_size; - sodoloanfree(m->m_ext.ext_buf, m->m_ext.ext_size); - s = splvm(); - pool_cache_put(&mbpool_cache, m); - } +/* + * sodopendfreel: free mbufs on "pendfree" list. + * unlock and relock so_pendfree_slock when freeing mbufs. + * + * => called with so_pendfree_slock held. + * => called at splvm. + */ + +static size_t +sodopendfreel() +{ + size_t rv = 0; + + LOCK_ASSERT(simple_lock_held(&so_pendfree_slock)); for (;;) { - m = so->so_pendfree; + struct mbuf *m; + struct mbuf *next; + + m = so_pendfree; if (m == NULL) break; - so->so_pendfree = m->m_next; - splx(s); + so_pendfree = NULL; + simple_unlock(&so_pendfree_slock); + /* XXX splx */ + + for (; m != NULL; m = next) { + next = m->m_next; + + rv += m->m_ext.ext_size; + sodoloanfree((m->m_flags & M_EXT_PAGES) ? + m->m_ext.ext_pgs : NULL, m->m_ext.ext_buf, + m->m_ext.ext_size); + pool_cache_put(&mbpool_cache, m); + } - rv += m->m_ext.ext_size; - sodoloanfree(m->m_ext.ext_buf, m->m_ext.ext_size); - s = splvm(); - pool_cache_put(&mbpool_cache, m); + /* XXX splvm */ + simple_lock(&so_pendfree_slock); } - splx(s); return (rv); } -static void -soloanfree(struct mbuf *m, caddr_t buf, u_int size, void *arg) +void +soloanfree(struct mbuf *m, caddr_t buf, size_t size, void *arg) { - struct socket *so = arg; int s; if (m == NULL) { - sodoloanfree(buf, size); + + /* + * called from MEXTREMOVE. + */ + + sodoloanfree(NULL, buf, size); return; } + /* + * postpone freeing mbuf. + * + * we can't do it in interrupt context + * because we need to put kva back to kernel_map. + */ + s = splvm(); - m->m_next = so->so_pendfree; - so->so_pendfree = m; - splx(s); + simple_lock(&so_pendfree_slock); + m->m_next = so_pendfree; + so_pendfree = m; if (sokvawaiters) wakeup(&socurkva); + simple_unlock(&so_pendfree_slock); + splx(s); } static long @@ -244,11 +379,10 @@ sosend_loan(struct socket *so, struct ui struct iovec *iov = uio->uio_iov; vaddr_t sva, eva; vsize_t len; - struct vm_page **pgs; vaddr_t lva, va; - int npgs, s, i, error; + int npgs, i, error; - if (uio->uio_segflg != UIO_USERSPACE) + if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) return (0); if (iov->iov_len < (size_t) space) @@ -261,39 +395,29 @@ sosend_loan(struct socket *so, struct ui len = eva - sva; npgs = len >> PAGE_SHIFT; - while (socurkva + len > somaxkva) { - if (sodopendfree(so)) - continue; - SOSEND_COUNTER_INCR(&sosend_kvalimit); - s = splvm(); - sokvawaiters++; - (void) tsleep(&socurkva, PVM, "sokva", 0); - sokvawaiters--; - splx(s); - } + /* XXX KDASSERT */ + KASSERT(npgs <= M_EXT_MAXPAGES); - lva = uvm_km_valloc_wait(kernel_map, len); + lva = sokvaalloc(len, so); if (lva == 0) - return (0); - socurkva += len; + return 0; - pgs = alloca(npgs * sizeof(*pgs)); - - error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, sva, len, - pgs, UVM_LOAN_TOPAGE); + error = uvm_loan(&uio->uio_vmspace->vm_map, sva, len, + m->m_ext.ext_pgs, UVM_LOAN_TOPAGE); if (error) { - uvm_km_free(kernel_map, lva, len); - socurkva -= len; + sokvafree(lva, len); return (0); } for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE) - pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pgs[i]), VM_PROT_READ); + pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]), + VM_PROT_READ); pmap_update(pmap_kernel()); lva += (vaddr_t) iov->iov_base & PAGE_MASK; MEXTADD(m, (caddr_t) lva, space, M_MBUF, soloanfree, so); + m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP; uio->uio_resid -= space; /* uio_offset not updated, not set/used for write(2) */ @@ -307,6 +431,32 @@ sosend_loan(struct socket *so, struct ui return (space); } +static int +sokva_reclaim_callback(struct callback_entry *ce, void *obj, void *arg) +{ + + KASSERT(ce == &sokva_reclaimerentry); + KASSERT(obj == NULL); + + sodopendfree(); + if (!vm_map_starved_p(kernel_map)) { + return CALLBACK_CHAIN_ABORT; + } + return CALLBACK_CHAIN_CONTINUE; +} + +void +soinit(void) +{ + + /* Set the initial adjusted socket buffer size. */ + if (sb_max_set(sb_max)) + panic("bad initial sb_max value: %lu", sb_max); + + callback_register(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback, + &sokva_reclaimerentry, NULL, sokva_reclaim_callback); +} + /* * Socket operation routines. * These routines are called by the routines in @@ -316,19 +466,27 @@ sosend_loan(struct socket *so, struct ui */ /*ARGSUSED*/ int -socreate(int dom, struct socket **aso, int type, int proto) +socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l) { - struct proc *p; - struct protosw *prp; + const struct protosw *prp; struct socket *so; + uid_t uid; int error, s; - p = curproc; /* XXX */ if (proto) prp = pffindproto(dom, proto, type); else prp = pffindtype(dom, type); - if (prp == 0 || prp->pr_usrreq == 0) + if (prp == 0) { + /* no support for domain */ + if (pffinddomain(dom) == 0) + return (EAFNOSUPPORT); + /* no support for socket type */ + if (proto == 0 && type != 0) + return (EPROTOTYPE); + return (EPROTONOSUPPORT); + } + if (prp->pr_usrreq == 0) return (EPROTONOSUPPORT); if (prp->pr_type != type) return (EPROTOTYPE); @@ -341,10 +499,19 @@ socreate(int dom, struct socket **aso, i so->so_proto = prp; so->so_send = sosend; so->so_receive = soreceive; - if (p != 0) - so->so_uid = p->p_ucred->cr_uid; +#ifdef MBUFTRACE + so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner; + so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner; + so->so_mowner = &prp->pr_domain->dom_mowner; +#endif + if (l != NULL) { + uid = kauth_cred_geteuid(l->l_proc->p_cred); + } else { + uid = 0; + } + so->so_uidinfo = uid_find(uid); error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0, - (struct mbuf *)(long)proto, (struct mbuf *)0, p); + (struct mbuf *)(long)proto, (struct mbuf *)0, l); if (error) { so->so_state |= SS_NOFDREF; sofree(so); @@ -357,13 +524,13 @@ socreate(int dom, struct socket **aso, i } int -sobind(struct socket *so, struct mbuf *nam, struct proc *p) +sobind(struct socket *so, struct mbuf *nam, struct lwp *l) { int s, error; s = splsoftnet(); error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, (struct mbuf *)0, - nam, (struct mbuf *)0, p); + nam, (struct mbuf *)0, l); splx(s); return (error); } @@ -375,7 +542,7 @@ solisten(struct socket *so, int backlog) s = splsoftnet(); error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, (struct mbuf *)0, - (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); + (struct mbuf *)0, (struct mbuf *)0, (struct lwp *)0); if (error) { splx(s); return (error); @@ -392,7 +559,6 @@ solisten(struct socket *so, int backlog) void sofree(struct socket *so) { - struct mbuf *m; if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) return; @@ -405,13 +571,14 @@ sofree(struct socket *so) if (!soqremque(so, 0)) return; } - sbrelease(&so->so_snd); + if (so->so_rcv.sb_hiwat) + (void)chgsbsize(so->so_uidinfo, &so->so_rcv.sb_hiwat, 0, + RLIM_INFINITY); + if (so->so_snd.sb_hiwat) + (void)chgsbsize(so->so_uidinfo, &so->so_snd.sb_hiwat, 0, + RLIM_INFINITY); + sbrelease(&so->so_snd, so); sorflush(so); - while ((m = so->so_pendfree) != NULL) { - so->so_pendfree = m->m_next; - m->m_next = so_pendfree; - so_pendfree = m; - } pool_put(&socket_pool, so); } @@ -463,7 +630,7 @@ soclose(struct socket *so) if (so->so_pcb) { int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH, (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0, - (struct proc *)0); + (struct lwp *)0); if (error == 0) error = error2; } @@ -484,7 +651,7 @@ soabort(struct socket *so) { return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, (struct mbuf *)0, - (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); + (struct mbuf *)0, (struct mbuf *)0, (struct lwp *)0); } int @@ -500,7 +667,7 @@ soaccept(struct socket *so, struct mbuf if ((so->so_state & SS_ISDISCONNECTED) == 0 || (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0) error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, - (struct mbuf *)0, nam, (struct mbuf *)0, (struct proc *)0); + (struct mbuf *)0, nam, (struct mbuf *)0, (struct lwp *)0); else error = ECONNABORTED; @@ -509,12 +676,10 @@ soaccept(struct socket *so, struct mbuf } int -soconnect(struct socket *so, struct mbuf *nam) +soconnect(struct socket *so, struct mbuf *nam, struct lwp *l) { - struct proc *p; int s, error; - p = curproc; /* XXX */ if (so->so_options & SO_ACCEPTCONN) return (EOPNOTSUPP); s = splsoftnet(); @@ -530,7 +695,7 @@ soconnect(struct socket *so, struct mbuf error = EISCONN; else error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT, - (struct mbuf *)0, nam, (struct mbuf *)0, p); + (struct mbuf *)0, nam, (struct mbuf *)0, l); splx(s); return (error); } @@ -543,7 +708,7 @@ soconnect2(struct socket *so1, struct so s = splsoftnet(); error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0, - (struct proc *)0); + (struct lwp *)0); splx(s); return (error); } @@ -564,10 +729,10 @@ sodisconnect(struct socket *so) } error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0, - (struct proc *)0); + (struct lwp *)0); bad: splx(s); - sodopendfree(so); + sodopendfree(); return (error); } @@ -591,16 +756,16 @@ sodisconnect(struct socket *so) */ int sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top, - struct mbuf *control, int flags) + struct mbuf *control, int flags, struct lwp *l) { - struct proc *p; struct mbuf **mp, *m; + struct proc *p; long space, len, resid, clen, mlen; int error, s, dontroute, atomic; - sodopendfree(so); + p = l->l_proc; + sodopendfree(); - p = curproc; /* XXX */ clen = 0; atomic = sosendallatonce(so) || top; if (uio) @@ -621,7 +786,8 @@ sosend(struct socket *so, struct mbuf *a dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); - p->p_stats->p_ru.ru_msgsnd++; + if (p) + p->p_stats->p_ru.ru_msgsnd++; if (control) clen = control->m_len; #define snderr(errno) { error = errno; splx(s); goto release; } @@ -653,7 +819,7 @@ sosend(struct socket *so, struct mbuf *a if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) snderr(EMSGSIZE); - if (space < resid + clen && uio && + if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { if (so->so_state & SS_NBIO) snderr(EWOULDBLOCK); @@ -677,17 +843,18 @@ sosend(struct socket *so, struct mbuf *a top->m_flags |= M_EOR; } else do { if (top == 0) { - MGETHDR(m, M_WAIT, MT_DATA); + m = m_gethdr(M_WAIT, MT_DATA); mlen = MHLEN; m->m_pkthdr.len = 0; m->m_pkthdr.rcvif = (struct ifnet *)0; } else { - MGET(m, M_WAIT, MT_DATA); + m = m_get(M_WAIT, MT_DATA); mlen = MLEN; } - if (use_sosend_loan && - uio->uio_iov->iov_len >= SOCK_LOAN_THRESH && - space >= SOCK_LOAN_THRESH && + MCLAIM(m, so->so_snd.sb_mowner); + if (sock_loan_thresh >= 0 && + uio->uio_iov->iov_len >= sock_loan_thresh && + space >= sock_loan_thresh && (len = sosend_loan(so, uio, m, space)) != 0) { SOSEND_COUNTER_INCR(&sosend_loan_big); @@ -696,7 +863,7 @@ sosend(struct socket *so, struct mbuf *a } if (resid >= MINCLSIZE && space >= MCLBYTES) { SOSEND_COUNTER_INCR(&sosend_copy_big); - MCLGET(m, M_WAIT); + m_clget(m, M_WAIT); if ((m->m_flags & M_EXT) == 0) goto nopages; mlen = MCLBYTES; @@ -735,7 +902,7 @@ sosend(struct socket *so, struct mbuf *a break; } } while (space > 0 && atomic); - + s = splsoftnet(); if (so->so_state & SS_CANTSENDMORE) @@ -747,7 +914,7 @@ sosend(struct socket *so, struct mbuf *a so->so_state |= SS_MORETOCOME; error = (*so->so_proto->pr_usrreq)(so, (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND, - top, addr, control, p); + top, addr, control, curlwp); /* XXX */ if (dontroute) so->so_options &= ~SO_DONTROUTE; if (resid > 0) @@ -793,9 +960,10 @@ int soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { + struct lwp *l = curlwp; struct mbuf *m, **mp; int flags, len, error, s, offset, moff, type, orig_resid; - struct protosw *pr; + const struct protosw *pr; struct mbuf *nextrecord; int mbuf_removed = 0; @@ -803,6 +971,7 @@ soreceive(struct socket *so, struct mbuf mp = mp0; type = 0; orig_resid = uio->uio_resid; + if (paddr) *paddr = 0; if (controlp) @@ -813,13 +982,13 @@ soreceive(struct socket *so, struct mbuf flags = 0; if ((flags & MSG_DONTWAIT) == 0) - sodopendfree(so); + sodopendfree(); if (flags & MSG_OOB) { m = m_get(M_WAIT, MT_DATA); error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m, - (struct mbuf *)(long)(flags & MSG_PEEK), (struct mbuf *)0, - (struct proc *)0); + (struct mbuf *)(long)(flags & MSG_PEEK), + (struct mbuf *)0, l); if (error) goto bad; do { @@ -836,7 +1005,7 @@ soreceive(struct socket *so, struct mbuf *mp = (struct mbuf *)0; if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, - (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); + (struct mbuf *)0, (struct mbuf *)0, l); restart: if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) @@ -909,10 +1078,8 @@ soreceive(struct socket *so, struct mbuf * While we process the initial mbufs containing address and control * info, we save a copy of m->m_nextpkt into nextrecord. */ -#ifdef notyet /* XXXX */ - if (uio->uio_procp) - uio->uio_procp->p_stats->p_ru.ru_msgrcv++; -#endif + if (l) + l->l_proc->p_stats->p_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb); SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); @@ -950,15 +1117,23 @@ soreceive(struct socket *so, struct mbuf sbfree(&so->so_rcv, m); mbuf_removed = 1; if (controlp) { - if (pr->pr_domain->dom_externalize && + struct domain *dom = pr->pr_domain; + if (dom->dom_externalize && l && mtod(m, struct cmsghdr *)->cmsg_type == SCM_RIGHTS) - error = (*pr->pr_domain->dom_externalize)(m); + error = (*dom->dom_externalize)(m, l); *controlp = m; so->so_rcv.sb_mb = m->m_next; m->m_next = 0; m = so->so_rcv.sb_mb; } else { + /* + * Dispose of any SCM_RIGHTS message that went + * through the read path rather than recv. + */ + if (pr->pr_domain->dom_dispose && + mtod(m, struct cmsghdr *)->cmsg_type == SCM_RIGHTS) + (*pr->pr_domain->dom_dispose)(m); MFREE(m, so->so_rcv.sb_mb); m = so->so_rcv.sb_mb; } @@ -1139,8 +1314,7 @@ soreceive(struct socket *so, struct mbuf (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, (struct mbuf *)(long)flags, - (struct mbuf *)0, - (struct proc *)0); + (struct mbuf *)0, l); SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); error = sbwait(&so->so_rcv); @@ -1177,8 +1351,7 @@ soreceive(struct socket *so, struct mbuf SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, - (struct mbuf *)(long)flags, (struct mbuf *)0, - (struct proc *)0); + (struct mbuf *)(long)flags, (struct mbuf *)0, l); } if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { @@ -1186,7 +1359,7 @@ soreceive(struct socket *so, struct mbuf splx(s); goto restart; } - + if (flagsp) *flagsp |= flags; release: @@ -1198,7 +1371,7 @@ soreceive(struct socket *so, struct mbuf int soshutdown(struct socket *so, int how) { - struct protosw *pr; + const struct protosw *pr; pr = so->so_proto; if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) @@ -1208,7 +1381,7 @@ soshutdown(struct socket *so, int how) sorflush(so); if (how == SHUT_WR || how == SHUT_RDWR) return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, (struct mbuf *)0, - (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); + (struct mbuf *)0, (struct mbuf *)0, (struct lwp *)0); return (0); } @@ -1216,7 +1389,7 @@ void sorflush(struct socket *so) { struct sockbuf *sb, asb; - struct protosw *pr; + const struct protosw *pr; int s; sb = &so->so_rcv; @@ -1227,11 +1400,16 @@ sorflush(struct socket *so) socantrcvmore(so); sbunlock(sb); asb = *sb; - memset((caddr_t)sb, 0, sizeof(*sb)); + /* + * Clear most of the sockbuf structure, but leave some of the + * fields valid. + */ + memset(&sb->sb_startzero, 0, + sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); splx(s); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) (*pr->pr_domain->dom_dispose)(asb.sb_mb); - sbrelease(&asb); + sbrelease(&asb, so); } int @@ -1255,6 +1433,11 @@ sosetopt(struct socket *so, int level, i error = EINVAL; goto bad; } + if (mtod(m, struct linger *)->l_linger < 0 || + mtod(m, struct linger *)->l_linger > (INT_MAX / hz)) { + error = EDOM; + goto bad; + } so->so_linger = mtod(m, struct linger *)->l_linger; /* fall thru... */ @@ -1305,7 +1488,7 @@ sosetopt(struct socket *so, int level, i case SO_RCVBUF: if (sbreserve(optname == SO_SNDBUF ? &so->so_snd : &so->so_rcv, - (u_long) optval) == 0) { + (u_long) optval, so) == 0) { error = ENOBUFS; goto bad; } @@ -1333,18 +1516,20 @@ sosetopt(struct socket *so, int level, i case SO_RCVTIMEO: { struct timeval *tv; - short val; + int val; if (m == NULL || m->m_len < sizeof(*tv)) { error = EINVAL; goto bad; } tv = mtod(m, struct timeval *); - if (tv->tv_sec * hz + tv->tv_usec / tick > SHRT_MAX) { + if (tv->tv_sec > (INT_MAX - tv->tv_usec / tick) / hz) { error = EDOM; goto bad; } val = tv->tv_sec * hz + tv->tv_usec / tick; + if (val == 0 && tv->tv_usec != 0) + val = 1; switch (optname) { @@ -1448,6 +1633,10 @@ sogetopt(struct socket *so, int level, i break; } + case SO_OVERFLOWED: + *mtod(m, int *) = so->so_rcv.sb_overflowed; + break; + default: (void)m_free(m); return (ENOPROTOOPT); @@ -1460,11 +1649,175 @@ sogetopt(struct socket *so, int level, i void sohasoutofband(struct socket *so) { - struct proc *p; - - if (so->so_pgid < 0) - gsignal(-so->so_pgid, SIGURG); - else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0) - psignal(p, SIGURG); + fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so); selwakeup(&so->so_rcv.sb_sel); } + +static void +filt_sordetach(struct knote *kn) +{ + struct socket *so; + + so = (struct socket *)kn->kn_fp->f_data; + SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext); + if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist)) + so->so_rcv.sb_flags &= ~SB_KNOTE; +} + +/*ARGSUSED*/ +static int +filt_soread(struct knote *kn, long hint) +{ + struct socket *so; + + so = (struct socket *)kn->kn_fp->f_data; + kn->kn_data = so->so_rcv.sb_cc; + if (so->so_state & SS_CANTRCVMORE) { + kn->kn_flags |= EV_EOF; + kn->kn_fflags = so->so_error; + return (1); + } + if (so->so_error) /* temporary udp error */ + return (1); + if (kn->kn_sfflags & NOTE_LOWAT) + return (kn->kn_data >= kn->kn_sdata); + return (kn->kn_data >= so->so_rcv.sb_lowat); +} + +static void +filt_sowdetach(struct knote *kn) +{ + struct socket *so; + + so = (struct socket *)kn->kn_fp->f_data; + SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext); + if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist)) + so->so_snd.sb_flags &= ~SB_KNOTE; +} + +/*ARGSUSED*/ +static int +filt_sowrite(struct knote *kn, long hint) +{ + struct socket *so; + + so = (struct socket *)kn->kn_fp->f_data; + kn->kn_data = sbspace(&so->so_snd); + if (so->so_state & SS_CANTSENDMORE) { + kn->kn_flags |= EV_EOF; + kn->kn_fflags = so->so_error; + return (1); + } + if (so->so_error) /* temporary udp error */ + return (1); + if (((so->so_state & SS_ISCONNECTED) == 0) && + (so->so_proto->pr_flags & PR_CONNREQUIRED)) + return (0); + if (kn->kn_sfflags & NOTE_LOWAT) + return (kn->kn_data >= kn->kn_sdata); + return (kn->kn_data >= so->so_snd.sb_lowat); +} + +/*ARGSUSED*/ +static int +filt_solisten(struct knote *kn, long hint) +{ + struct socket *so; + + so = (struct socket *)kn->kn_fp->f_data; + + /* + * Set kn_data to number of incoming connections, not + * counting partial (incomplete) connections. + */ + kn->kn_data = so->so_qlen; + return (kn->kn_data > 0); +} + +static const struct filterops solisten_filtops = + { 1, NULL, filt_sordetach, filt_solisten }; +static const struct filterops soread_filtops = + { 1, NULL, filt_sordetach, filt_soread }; +static const struct filterops sowrite_filtops = + { 1, NULL, filt_sowdetach, filt_sowrite }; + +int +soo_kqfilter(struct file *fp, struct knote *kn) +{ + struct socket *so; + struct sockbuf *sb; + + so = (struct socket *)kn->kn_fp->f_data; + switch (kn->kn_filter) { + case EVFILT_READ: + if (so->so_options & SO_ACCEPTCONN) + kn->kn_fop = &solisten_filtops; + else + kn->kn_fop = &soread_filtops; + sb = &so->so_rcv; + break; + case EVFILT_WRITE: + kn->kn_fop = &sowrite_filtops; + sb = &so->so_snd; + break; + default: + return (1); + } + SLIST_INSERT_HEAD(&sb->sb_sel.sel_klist, kn, kn_selnext); + sb->sb_flags |= SB_KNOTE; + return (0); +} + +#include + +static int sysctl_kern_somaxkva(SYSCTLFN_PROTO); + +/* + * sysctl helper routine for kern.somaxkva. ensures that the given + * value is not too small. + * (XXX should we maybe make sure it's not too large as well?) + */ +static int +sysctl_kern_somaxkva(SYSCTLFN_ARGS) +{ + int error, new_somaxkva; + struct sysctlnode node; + int s; + + new_somaxkva = somaxkva; + node = *rnode; + node.sysctl_data = &new_somaxkva; + error = sysctl_lookup(SYSCTLFN_CALL(&node)); + if (error || newp == NULL) + return (error); + + if (new_somaxkva < (16 * 1024 * 1024)) /* sanity */ + return (EINVAL); + + s = splvm(); + simple_lock(&so_pendfree_slock); + somaxkva = new_somaxkva; + wakeup(&socurkva); + simple_unlock(&so_pendfree_slock); + splx(s); + + return (error); +} + +SYSCTL_SETUP(sysctl_kern_somaxkva_setup, "sysctl kern.somaxkva setup") +{ + + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "kern", NULL, + NULL, 0, NULL, 0, + CTL_KERN, CTL_EOL); + + sysctl_createv(clog, 0, NULL, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "somaxkva", + SYSCTL_DESCR("Maximum amount of kernel memory to be " + "used for socket buffers"), + sysctl_kern_somaxkva, 0, NULL, 0, + CTL_KERN, KERN_SOMAXKVA, CTL_EOL); +}