Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. =================================================================== RCS file: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v rcsdiff: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v: warning: Unknown phrases like `commitid ...;' are present. retrieving revision 1.108 retrieving revision 1.111.2.6 diff -u -p -r1.108 -r1.111.2.6 --- src/sys/kern/uipc_socket.c 2005/02/26 21:34:55 1.108 +++ src/sys/kern/uipc_socket.c 2007/02/26 09:11:20 1.111.2.6 @@ -1,4 +1,4 @@ -/* $NetBSD: uipc_socket.c,v 1.108 2005/02/26 21:34:55 perry Exp $ */ +/* $NetBSD: uipc_socket.c,v 1.111.2.6 2007/02/26 09:11:20 yamt Exp $ */ /*- * Copyright (c) 2002 The NetBSD Foundation, Inc. @@ -68,7 +68,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.108 2005/02/26 21:34:55 perry Exp $"); +__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.111.2.6 2007/02/26 09:11:20 yamt Exp $"); #include "opt_sock_counters.h" #include "opt_sosend_loan.h" @@ -91,6 +91,7 @@ __KERNEL_RCSID(0, "$NetBSD: uipc_socket. #include #include #include +#include #include @@ -105,13 +106,13 @@ int somaxconn = SOMAXCONN; #ifdef SOSEND_COUNTERS #include -struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, +static struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "sosend", "loan big"); -struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, +static struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "sosend", "copy big"); -struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, +static struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "sosend", "copy small"); -struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, +static struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "sosend", "kva limit"); #define SOSEND_COUNTER_INCR(ev) (ev)->ev_count++ @@ -126,41 +127,30 @@ EVCNT_ATTACH_STATIC(sosend_kvalimit); #endif /* SOSEND_COUNTERS */ -void -soinit(void) -{ - - /* Set the initial adjusted socket buffer size. */ - if (sb_max_set(sb_max)) - panic("bad initial sb_max value: %lu\n", sb_max); - -} +static struct callback_entry sokva_reclaimerentry; #ifdef SOSEND_NO_LOAN -int use_sosend_loan = 0; +int sock_loan_thresh = -1; #else -int use_sosend_loan = 1; +int sock_loan_thresh = 4096; #endif -struct simplelock so_pendfree_slock = SIMPLELOCK_INITIALIZER; -struct mbuf *so_pendfree; +static struct simplelock so_pendfree_slock = SIMPLELOCK_INITIALIZER; +static struct mbuf *so_pendfree; #ifndef SOMAXKVA #define SOMAXKVA (16 * 1024 * 1024) #endif int somaxkva = SOMAXKVA; -int socurkva; -int sokvawaiters; +static int socurkva; +static int sokvawaiters; -#define SOCK_LOAN_THRESH 4096 #define SOCK_LOAN_CHUNK 65536 -static size_t sodopendfree(struct socket *); -static size_t sodopendfreel(struct socket *); -static __inline vsize_t sokvareserve(struct socket *, vsize_t); -static __inline void sokvaunreserve(vsize_t); +static size_t sodopendfree(void); +static size_t sodopendfreel(void); -static __inline vsize_t +static vsize_t sokvareserve(struct socket *so, vsize_t len) { int s; @@ -175,7 +165,7 @@ sokvareserve(struct socket *so, vsize_t * try to do pendfree. */ - freed = sodopendfreel(so); + freed = sodopendfreel(); /* * if some kva was freed, try again. @@ -200,7 +190,7 @@ sokvareserve(struct socket *so, vsize_t return len; } -static __inline void +static void sokvaunreserve(vsize_t len) { int s; @@ -234,7 +224,7 @@ sokvaalloc(vsize_t len, struct socket *s * allocate kva. */ - lva = uvm_km_valloc_wait(kernel_map, len); + lva = uvm_km_alloc(kernel_map, len, 0, UVM_KMF_VAONLY | UVM_KMF_WAITVA); if (lva == 0) { sokvaunreserve(len); return (0); @@ -255,7 +245,7 @@ sokvafree(vaddr_t sva, vsize_t len) * free kva. */ - uvm_km_free(kernel_map, sva, len); + uvm_km_free(kernel_map, sva, len, UVM_KMF_VAONLY); /* * unreserve kva. @@ -265,43 +255,36 @@ sokvafree(vaddr_t sva, vsize_t len) } static void -sodoloanfree(struct vm_page **pgs, caddr_t buf, size_t size) +sodoloanfree(struct vm_page **pgs, caddr_t buf, size_t size, bool mapped) { - vaddr_t va, sva, eva; + vaddr_t sva, eva; vsize_t len; - paddr_t pa; - int i, npgs; + int npgs; + + KASSERT(pgs != NULL); eva = round_page((vaddr_t) buf + size); sva = trunc_page((vaddr_t) buf); len = eva - sva; npgs = len >> PAGE_SHIFT; - if (__predict_false(pgs == NULL)) { - pgs = alloca(npgs * sizeof(*pgs)); - - for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) { - if (pmap_extract(pmap_kernel(), va, &pa) == FALSE) - panic("sodoloanfree: va 0x%lx not mapped", va); - pgs[i] = PHYS_TO_VM_PAGE(pa); - } + if (mapped) { + pmap_kremove(sva, len); + pmap_update(pmap_kernel()); } - - pmap_kremove(sva, len); - pmap_update(pmap_kernel()); uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE); sokvafree(sva, len); } static size_t -sodopendfree(struct socket *so) +sodopendfree() { int s; size_t rv; s = splvm(); simple_lock(&so_pendfree_slock); - rv = sodopendfreel(so); + rv = sodopendfreel(); simple_unlock(&so_pendfree_slock); splx(s); @@ -317,7 +300,7 @@ sodopendfree(struct socket *so) */ static size_t -sodopendfreel(struct socket *so) +sodopendfreel() { size_t rv = 0; @@ -336,11 +319,13 @@ sodopendfreel(struct socket *so) for (; m != NULL; m = next) { next = m->m_next; + KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) == 0); + KASSERT(m->m_ext.ext_refcnt == 0); rv += m->m_ext.ext_size; - sodoloanfree((m->m_flags & M_EXT_PAGES) ? - m->m_ext.ext_pgs : NULL, m->m_ext.ext_buf, - m->m_ext.ext_size); + sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf, + m->m_ext.ext_size, + (m->m_ext.ext_flags & M_EXT_LAZY) == 0); pool_cache_put(&mbpool_cache, m); } @@ -356,15 +341,7 @@ soloanfree(struct mbuf *m, caddr_t buf, { int s; - if (m == NULL) { - - /* - * called from MEXTREMOVE. - */ - - sodoloanfree(NULL, buf, size); - return; - } + KASSERT(m != NULL); /* * postpone freeing mbuf. @@ -389,10 +366,14 @@ sosend_loan(struct socket *so, struct ui struct iovec *iov = uio->uio_iov; vaddr_t sva, eva; vsize_t len; - vaddr_t lva, va; - int npgs, i, error; + vaddr_t lva; + int npgs, error; +#if !defined(__HAVE_LAZY_MBUF) + vaddr_t va; + int i; +#endif /* !defined(__HAVE_LAZY_MBUF) */ - if (uio->uio_segflg != UIO_USERSPACE) + if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) return (0); if (iov->iov_len < (size_t) space) @@ -407,29 +388,35 @@ sosend_loan(struct socket *so, struct ui /* XXX KDASSERT */ KASSERT(npgs <= M_EXT_MAXPAGES); - KASSERT(uio->uio_procp != NULL); lva = sokvaalloc(len, so); if (lva == 0) return 0; - error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, sva, len, + error = uvm_loan(&uio->uio_vmspace->vm_map, sva, len, m->m_ext.ext_pgs, UVM_LOAN_TOPAGE); if (error) { sokvafree(lva, len); return (0); } +#if !defined(__HAVE_LAZY_MBUF) for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE) pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]), VM_PROT_READ); pmap_update(pmap_kernel()); +#endif /* !defined(__HAVE_LAZY_MBUF) */ lva += (vaddr_t) iov->iov_base & PAGE_MASK; MEXTADD(m, (caddr_t) lva, space, M_MBUF, soloanfree, so); m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP; +#if defined(__HAVE_LAZY_MBUF) + m->m_flags |= M_EXT_LAZY; + m->m_ext.ext_flags |= M_EXT_LAZY; +#endif /* defined(__HAVE_LAZY_MBUF) */ + uio->uio_resid -= space; /* uio_offset not updated, not set/used for write(2) */ uio->uio_iov->iov_base = (caddr_t) uio->uio_iov->iov_base + space; @@ -442,6 +429,32 @@ sosend_loan(struct socket *so, struct ui return (space); } +static int +sokva_reclaim_callback(struct callback_entry *ce, void *obj, void *arg) +{ + + KASSERT(ce == &sokva_reclaimerentry); + KASSERT(obj == NULL); + + sodopendfree(); + if (!vm_map_starved_p(kernel_map)) { + return CALLBACK_CHAIN_ABORT; + } + return CALLBACK_CHAIN_CONTINUE; +} + +void +soinit(void) +{ + + /* Set the initial adjusted socket buffer size. */ + if (sb_max_set(sb_max)) + panic("bad initial sb_max value: %lu", sb_max); + + callback_register(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback, + &sokva_reclaimerentry, NULL, sokva_reclaim_callback); +} + /* * Socket operation routines. * These routines are called by the routines in @@ -451,17 +464,33 @@ sosend_loan(struct socket *so, struct ui */ /*ARGSUSED*/ int -socreate(int dom, struct socket **aso, int type, int proto, struct proc *p) +socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l) { const struct protosw *prp; struct socket *so; + uid_t uid; int error, s; + error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET, + KAUTH_REQ_NETWORK_SOCKET_OPEN, KAUTH_ARG(dom), KAUTH_ARG(type), + KAUTH_ARG(proto)); + if (error) + return (error); + if (proto) prp = pffindproto(dom, proto, type); else prp = pffindtype(dom, type); - if (prp == 0 || prp->pr_usrreq == 0) + if (prp == 0) { + /* no support for domain */ + if (pffinddomain(dom) == 0) + return (EAFNOSUPPORT); + /* no support for socket type */ + if (proto == 0 && type != 0) + return (EPROTOTYPE); + return (EPROTONOSUPPORT); + } + if (prp->pr_usrreq == 0) return (EPROTONOSUPPORT); if (prp->pr_type != type) return (EPROTOTYPE); @@ -479,12 +508,14 @@ socreate(int dom, struct socket **aso, i so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner; so->so_mowner = &prp->pr_domain->dom_mowner; #endif - if (p != 0) - so->so_uid = p->p_ucred->cr_uid; - else - so->so_uid = UID_MAX; + if (l != NULL) { + uid = kauth_cred_geteuid(l->l_cred); + } else { + uid = 0; + } + so->so_uidinfo = uid_find(uid); error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0, - (struct mbuf *)(long)proto, (struct mbuf *)0, p); + (struct mbuf *)(long)proto, (struct mbuf *)0, l); if (error) { so->so_state |= SS_NOFDREF; sofree(so); @@ -497,13 +528,13 @@ socreate(int dom, struct socket **aso, i } int -sobind(struct socket *so, struct mbuf *nam, struct proc *p) +sobind(struct socket *so, struct mbuf *nam, struct lwp *l) { int s, error; s = splsoftnet(); error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, (struct mbuf *)0, - nam, (struct mbuf *)0, p); + nam, (struct mbuf *)0, l); splx(s); return (error); } @@ -515,7 +546,7 @@ solisten(struct socket *so, int backlog) s = splsoftnet(); error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, (struct mbuf *)0, - (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); + (struct mbuf *)0, (struct mbuf *)0, (struct lwp *)0); if (error) { splx(s); return (error); @@ -545,10 +576,10 @@ sofree(struct socket *so) return; } if (so->so_rcv.sb_hiwat) - (void)chgsbsize(so->so_uid, &so->so_rcv.sb_hiwat, 0, + (void)chgsbsize(so->so_uidinfo, &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); if (so->so_snd.sb_hiwat) - (void)chgsbsize(so->so_uid, &so->so_snd.sb_hiwat, 0, + (void)chgsbsize(so->so_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); sbrelease(&so->so_snd, so); sorflush(so); @@ -603,7 +634,7 @@ soclose(struct socket *so) if (so->so_pcb) { int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH, (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0, - (struct proc *)0); + (struct lwp *)0); if (error == 0) error = error2; } @@ -624,7 +655,7 @@ soabort(struct socket *so) { return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, (struct mbuf *)0, - (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); + (struct mbuf *)0, (struct mbuf *)0, (struct lwp *)0); } int @@ -640,7 +671,7 @@ soaccept(struct socket *so, struct mbuf if ((so->so_state & SS_ISDISCONNECTED) == 0 || (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0) error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, - (struct mbuf *)0, nam, (struct mbuf *)0, (struct proc *)0); + (struct mbuf *)0, nam, (struct mbuf *)0, (struct lwp *)0); else error = ECONNABORTED; @@ -649,7 +680,7 @@ soaccept(struct socket *so, struct mbuf } int -soconnect(struct socket *so, struct mbuf *nam, struct proc *p) +soconnect(struct socket *so, struct mbuf *nam, struct lwp *l) { int s, error; @@ -668,7 +699,7 @@ soconnect(struct socket *so, struct mbuf error = EISCONN; else error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT, - (struct mbuf *)0, nam, (struct mbuf *)0, p); + (struct mbuf *)0, nam, (struct mbuf *)0, l); splx(s); return (error); } @@ -681,7 +712,7 @@ soconnect2(struct socket *so1, struct so s = splsoftnet(); error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0, - (struct proc *)0); + (struct lwp *)0); splx(s); return (error); } @@ -702,10 +733,10 @@ sodisconnect(struct socket *so) } error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0, - (struct proc *)0); + (struct lwp *)0); bad: splx(s); - sodopendfree(so); + sodopendfree(); return (error); } @@ -729,13 +760,15 @@ sodisconnect(struct socket *so) */ int sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top, - struct mbuf *control, int flags, struct proc *p) + struct mbuf *control, int flags, struct lwp *l) { struct mbuf **mp, *m; + struct proc *p; long space, len, resid, clen, mlen; int error, s, dontroute, atomic; - sodopendfree(so); + p = l->l_proc; + sodopendfree(); clen = 0; atomic = sosendallatonce(so) || top; @@ -823,9 +856,9 @@ sosend(struct socket *so, struct mbuf *a mlen = MLEN; } MCLAIM(m, so->so_snd.sb_mowner); - if (use_sosend_loan && - uio->uio_iov->iov_len >= SOCK_LOAN_THRESH && - space >= SOCK_LOAN_THRESH && + if (sock_loan_thresh >= 0 && + uio->uio_iov->iov_len >= sock_loan_thresh && + space >= sock_loan_thresh && (len = sosend_loan(so, uio, m, space)) != 0) { SOSEND_COUNTER_INCR(&sosend_loan_big); @@ -885,7 +918,7 @@ sosend(struct socket *so, struct mbuf *a so->so_state |= SS_MORETOCOME; error = (*so->so_proto->pr_usrreq)(so, (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND, - top, addr, control, p); + top, addr, control, curlwp); /* XXX */ if (dontroute) so->so_options &= ~SO_DONTROUTE; if (resid > 0) @@ -931,7 +964,7 @@ int soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { - struct proc * p; + struct lwp *l = curlwp; struct mbuf *m, **mp; int flags, len, error, s, offset, moff, type, orig_resid; const struct protosw *pr; @@ -942,7 +975,6 @@ soreceive(struct socket *so, struct mbuf mp = mp0; type = 0; orig_resid = uio->uio_resid; - p = uio->uio_procp; if (paddr) *paddr = 0; @@ -954,13 +986,13 @@ soreceive(struct socket *so, struct mbuf flags = 0; if ((flags & MSG_DONTWAIT) == 0) - sodopendfree(so); + sodopendfree(); if (flags & MSG_OOB) { m = m_get(M_WAIT, MT_DATA); error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m, (struct mbuf *)(long)(flags & MSG_PEEK), - (struct mbuf *)0, p); + (struct mbuf *)0, l); if (error) goto bad; do { @@ -977,7 +1009,7 @@ soreceive(struct socket *so, struct mbuf *mp = (struct mbuf *)0; if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, - (struct mbuf *)0, (struct mbuf *)0, p); + (struct mbuf *)0, (struct mbuf *)0, l); restart: if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) @@ -1050,8 +1082,8 @@ soreceive(struct socket *so, struct mbuf * While we process the initial mbufs containing address and control * info, we save a copy of m->m_nextpkt into nextrecord. */ - if (p) - p->p_stats->p_ru.ru_msgrcv++; + if (l) + l->l_proc->p_stats->p_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb); SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); @@ -1090,10 +1122,10 @@ soreceive(struct socket *so, struct mbuf mbuf_removed = 1; if (controlp) { struct domain *dom = pr->pr_domain; - if (dom->dom_externalize && p && + if (dom->dom_externalize && l && mtod(m, struct cmsghdr *)->cmsg_type == SCM_RIGHTS) - error = (*dom->dom_externalize)(m, p); + error = (*dom->dom_externalize)(m, l); *controlp = m; so->so_rcv.sb_mb = m->m_next; m->m_next = 0; @@ -1286,7 +1318,7 @@ soreceive(struct socket *so, struct mbuf (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, (struct mbuf *)(long)flags, - (struct mbuf *)0, p); + (struct mbuf *)0, l); SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); error = sbwait(&so->so_rcv); @@ -1323,7 +1355,7 @@ soreceive(struct socket *so, struct mbuf SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, - (struct mbuf *)(long)flags, (struct mbuf *)0, p); + (struct mbuf *)(long)flags, (struct mbuf *)0, l); } if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { @@ -1353,7 +1385,7 @@ soshutdown(struct socket *so, int how) sorflush(so); if (how == SHUT_WR || how == SHUT_RDWR) return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, (struct mbuf *)0, - (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); + (struct mbuf *)0, (struct mbuf *)0, (struct lwp *)0); return (0); } @@ -1389,6 +1421,7 @@ sosetopt(struct socket *so, int level, i { int error; struct mbuf *m; + struct linger *l; error = 0; m = m0; @@ -1405,8 +1438,18 @@ sosetopt(struct socket *so, int level, i error = EINVAL; goto bad; } - so->so_linger = mtod(m, struct linger *)->l_linger; - /* fall thru... */ + l = mtod(m, struct linger *); + if (l->l_linger < 0 || l->l_linger > USHRT_MAX || + l->l_linger > (INT_MAX / hz)) { + error = EDOM; + goto bad; + } + so->so_linger = l->l_linger; + if (l->l_onoff) + so->so_options |= SO_LINGER; + else + so->so_options &= ~SO_LINGER; + break; case SO_DEBUG: case SO_KEEPALIVE: @@ -1546,7 +1589,7 @@ sogetopt(struct socket *so, int level, i case SO_LINGER: m->m_len = sizeof(struct linger); mtod(m, struct linger *)->l_onoff = - so->so_options & SO_LINGER; + (so->so_options & SO_LINGER) ? 1 : 0; mtod(m, struct linger *)->l_linger = so->so_linger; break; @@ -1559,7 +1602,7 @@ sogetopt(struct socket *so, int level, i case SO_BROADCAST: case SO_OOBINLINE: case SO_TIMESTAMP: - *mtod(m, int *) = so->so_options & optname; + *mtod(m, int *) = (so->so_options & optname) ? 1 : 0; break; case SO_TYPE: