Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.

===================================================================
RCS file: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v
rcsdiff: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v: warning: Unknown phrases like `commitid ...;' are present.
retrieving revision 1.54.2.3
retrieving revision 1.68
diff -u -p -r1.54.2.3 -r1.68
--- src/sys/kern/uipc_socket.c	2001/06/21 20:07:07	1.54.2.3
+++ src/sys/kern/uipc_socket.c	2002/06/11 00:21:33	1.68
@@ -1,4 +1,40 @@
-/*	$NetBSD: uipc_socket.c,v 1.54.2.3 2001/06/21 20:07:07 nathanw Exp $	*/
+/*	$NetBSD: uipc_socket.c,v 1.68 2002/06/11 00:21:33 matt Exp $	*/
+
+/*-
+ * Copyright (c) 2002 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Jason R. Thorpe of Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *        This product includes software developed by the NetBSD
+ *        Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
 
 /*
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
@@ -35,11 +71,14 @@
  *	@(#)uipc_socket.c	8.6 (Berkeley) 5/2/95
  */
 
-#include "opt_compat_sunos.h"
+#include
+__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.68 2002/06/11 00:21:33 matt Exp $");
+
+#include "opt_sock_counters.h"
+#include "opt_sosend_loan.h"
 
 #include
 #include
-#include
 #include
 #include
 #include
@@ -53,17 +92,219 @@
 #include
 #include
+#include
+
 struct pool	socket_pool;
 
 extern int	somaxconn;		/* patchable (XXX sysctl) */
 int		somaxconn = SOMAXCONN;
 
+#ifdef SOSEND_COUNTERS
+#include
+
+struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
+    NULL, "sosend", "loan big");
+struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
+    NULL, "sosend", "copy big");
+struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
+    NULL, "sosend", "copy small");
+struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
+    NULL, "sosend", "kva limit");
+
+#define	SOSEND_COUNTER_INCR(ev)		(ev)->ev_count++
+
+#else
+
+#define	SOSEND_COUNTER_INCR(ev)		/* nothing */
+
+#endif /* SOSEND_COUNTERS */
+
 void
 soinit(void)
 {
 
 	pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0,
-	    "sockpl", 0, NULL, NULL, M_SOCKET);
+	    "sockpl", NULL);
+
+#ifdef SOSEND_COUNTERS
+	evcnt_attach_static(&sosend_loan_big);
+	evcnt_attach_static(&sosend_copy_big);
+	evcnt_attach_static(&sosend_copy_small);
+	evcnt_attach_static(&sosend_kvalimit);
+#endif /* SOSEND_COUNTERS */
+}
+
+#ifdef SOSEND_LOAN
+int use_sosend_loan = 1;
+#else
+int use_sosend_loan = 0;
+#endif
+
+struct mbuf *so_pendfree;
+
+int somaxkva = 16 * 1024 * 1024;
+int socurkva;
+int sokvawaiters;
+
+#define	SOCK_LOAN_THRESH	4096
+#define	SOCK_LOAN_CHUNK		65536
+
+static void
+sodoloanfree(caddr_t buf, u_int size)
+{
+	struct vm_page **pgs;
+	vaddr_t va, sva, eva;
+	vsize_t len;
+	paddr_t pa;
+	int i, npgs;
+
+	eva = round_page((vaddr_t) buf + size);
+	sva = trunc_page((vaddr_t) buf);
+	len = eva - sva;
+	npgs = len >> PAGE_SHIFT;
+
+	pgs = alloca(npgs * sizeof(*pgs));
+
+	for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) {
+		if (pmap_extract(pmap_kernel(), va, &pa) == FALSE)
+			panic("sodoloanfree: va 0x%lx not mapped", va);
+		pgs[i] = PHYS_TO_VM_PAGE(pa);
+	}
+
+	pmap_kremove(sva, len);
+	pmap_update(pmap_kernel());
+	uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE);
+	uvm_km_free(kernel_map, sva, len);
+	socurkva -= len;
+	if (sokvawaiters)
+		wakeup(&socurkva);
+}
+
+static size_t
+sodopendfree(struct socket *so)
+{
+	struct mbuf *m;
+	size_t rv = 0;
+	int s;
+
+	s = splvm();
+
+	for (;;) {
+		m = so_pendfree;
+		if (m == NULL)
+			break;
+		so_pendfree = m->m_next;
+		splx(s);
+
+		rv += m->m_ext.ext_size;
+		sodoloanfree(m->m_ext.ext_buf, m->m_ext.ext_size);
+		s = splvm();
+		pool_cache_put(&mbpool_cache, m);
+	}
+
+	for (;;) {
+		m = so->so_pendfree;
+		if (m == NULL)
+			break;
+		so->so_pendfree = m->m_next;
+		splx(s);
+
+		rv += m->m_ext.ext_size;
+		sodoloanfree(m->m_ext.ext_buf, m->m_ext.ext_size);
+		s = splvm();
+		pool_cache_put(&mbpool_cache, m);
+	}
+
+	splx(s);
+	return (rv);
+}
+
+static void
+soloanfree(struct mbuf *m, caddr_t buf, u_int size, void *arg)
+{
+	struct socket *so = arg;
+	int s;
+
+	if (m == NULL) {
+		sodoloanfree(buf, size);
+		return;
+	}
+
+	s = splvm();
+	m->m_next = so->so_pendfree;
+	so->so_pendfree = m;
+	splx(s);
+	if (sokvawaiters)
+		wakeup(&socurkva);
+}
+
+static long
+sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space)
+{
+	struct iovec *iov = uio->uio_iov;
+	vaddr_t sva, eva;
+	vsize_t len;
+	struct vm_page **pgs;
+	vaddr_t lva, va;
+	int npgs, s, i, error;
+
+	if (uio->uio_segflg != UIO_USERSPACE)
+		return (0);
+
+	if (iov->iov_len < (size_t) space)
+		space = iov->iov_len;
+	if (space > SOCK_LOAN_CHUNK)
+		space = SOCK_LOAN_CHUNK;
+
+	eva = round_page((vaddr_t) iov->iov_base + space);
+	sva = trunc_page((vaddr_t) iov->iov_base);
+	len = eva - sva;
+	npgs = len >> PAGE_SHIFT;
+
+	while (socurkva + len > somaxkva) {
+		if (sodopendfree(so))
+			continue;
+		SOSEND_COUNTER_INCR(&sosend_kvalimit);
+		s = splvm();
+		sokvawaiters++;
+		(void) tsleep(&socurkva, PVM, "sokva", 0);
+		sokvawaiters--;
+		splx(s);
+	}
+
+	lva = uvm_km_valloc_wait(kernel_map, len);
+	if (lva == 0)
+		return (0);
+	socurkva += len;
+
+	pgs = alloca(npgs * sizeof(*pgs));
+
+	error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, sva, len,
+	    pgs, UVM_LOAN_TOPAGE);
+	if (error) {
+		uvm_km_free(kernel_map, lva, len);
+		socurkva -= len;
+		return (0);
+	}
+
+	for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE)
+		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pgs[i]), VM_PROT_READ);
+	pmap_update(pmap_kernel());
+
+	lva += (vaddr_t) iov->iov_base & PAGE_MASK;
+
+	MEXTADD(m, (caddr_t) lva, space, M_MBUF, soloanfree, so);
+
+	uio->uio_resid -= space;
+	/* uio_offset not updated, not set/used for write(2) */
+	uio->uio_iov->iov_base = (caddr_t) uio->uio_iov->iov_base + space;
+	uio->uio_iov->iov_len -= space;
+	if (uio->uio_iov->iov_len == 0) {
+		uio->uio_iov++;
+		uio->uio_iovcnt--;
+	}
+
+	return (space);
 }
 
 /*
@@ -82,7 +323,7 @@ socreate(int dom, struct socket **aso, i
 	struct socket *so;
 	int error, s;
 
-	p = curproc->l_proc;		/* XXX */
+	p = curproc;		/* XXX */
 	if (proto)
 		prp = pffindproto(dom, proto, type);
 	else
@@ -110,13 +351,6 @@ socreate(int dom, struct socket **aso, i
 		splx(s);
 		return (error);
 	}
-#ifdef COMPAT_SUNOS
-	{
-		extern struct emul emul_sunos;
-		if (p->p_emul == &emul_sunos && type == SOCK_DGRAM)
-			so->so_options |= SO_BROADCAST;
-	}
-#endif
 	splx(s);
 	*aso = so;
 	return (0);
@@ -146,7 +380,7 @@ solisten(struct socket *so, int backlog)
 		splx(s);
 		return (error);
 	}
-	if (so->so_q.tqh_first == NULL)
+	if (TAILQ_EMPTY(&so->so_q))
 		so->so_options |= SO_ACCEPTCONN;
 	if (backlog < 0)
 		backlog = 0;
@@ -158,6 +392,7 @@ solisten(struct socket *so, int backlog)
 void
 sofree(struct socket *so)
 {
+	struct mbuf *m;
 
 	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
 		return;
@@ -172,6 +407,11 @@ sofree(struct socket *so)
 	}
 	sbrelease(&so->so_snd);
 	sorflush(so);
+	while ((m = so->so_pendfree) != NULL) {
+		so->so_pendfree = m->m_next;
+		m->m_next = so_pendfree;
+		so_pendfree = m;
+	}
 	pool_put(&socket_pool, so);
 }
@@ -189,11 +429,11 @@ soclose(struct socket *so)
 	error = 0;
 	s = splsoftnet();		/* conservative */
 	if (so->so_options & SO_ACCEPTCONN) {
-		while ((so2 = so->so_q0.tqh_first) != 0) {
+		while ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) {
 			(void) soqremque(so2, 0);
 			(void) soabort(so2);
 		}
-		while ((so2 = so->so_q.tqh_first) != 0) {
+		while ((so2 = TAILQ_FIRST(&so->so_q)) != 0) {
 			(void) soqremque(so2, 1);
 			(void) soabort(so2);
 		}
@@ -274,7 +514,7 @@ soconnect(struct socket *so, struct mbuf
 	struct proc *p;
 	int s, error;
 
-	p = curproc->l_proc;		/* XXX */
+	p = curproc;		/* XXX */
 	if (so->so_options & SO_ACCEPTCONN)
 		return (EOPNOTSUPP);
 	s = splsoftnet();
@@ -327,6 +567,7 @@ sodisconnect(struct socket *so)
 	    (struct proc *)0);
 bad:
 	splx(s);
+	sodopendfree(so);
 	return (error);
 }
@@ -354,10 +595,12 @@ sosend(struct socket *so, struct mbuf *a
 {
 	struct proc	*p;
 	struct mbuf	**mp, *m;
-	long		space, len, resid;
-	int		clen, error, s, dontroute, mlen, atomic;
+	long		space, len, resid, clen, mlen;
+	int		error, s, dontroute, atomic;
 
-	p = curproc->l_proc;		/* XXX */
+	sodopendfree(so);
+
+	p = curproc;		/* XXX */
 	clen = 0;
 	atomic = sosendallatonce(so) || top;
 	if (uio)
@@ -442,25 +685,32 @@ sosend(struct socket *so, struct mbuf *a
 				MGET(m, M_WAIT, MT_DATA);
 				mlen = MLEN;
 			}
+			if (use_sosend_loan &&
+			    uio->uio_iov->iov_len >= SOCK_LOAN_THRESH &&
+			    space >= SOCK_LOAN_THRESH &&
+			    (len = sosend_loan(so, uio, m,
+			    space)) != 0) {
+				SOSEND_COUNTER_INCR(&sosend_loan_big);
+				space -= len;
+				goto have_data;
+			}
 			if (resid >= MINCLSIZE && space >= MCLBYTES) {
+				SOSEND_COUNTER_INCR(&sosend_copy_big);
 				MCLGET(m, M_WAIT);
 				if ((m->m_flags & M_EXT) == 0)
 					goto nopages;
 				mlen = MCLBYTES;
-#ifdef MAPPED_MBUFS
-				len = min(MCLBYTES, resid);
-#else
 				if (atomic && top == 0) {
-					len = min(MCLBYTES - max_hdr,
+					len = lmin(MCLBYTES - max_hdr,
 					    resid);
 					m->m_data += max_hdr;
 				} else
-					len = min(MCLBYTES, resid);
-#endif
+					len = lmin(MCLBYTES, resid);
 				space -= len;
 			} else {
-nopages:
-				len = min(min(mlen, resid), space);
+ nopages:
+				SOSEND_COUNTER_INCR(&sosend_copy_small);
+				len = lmin(lmin(mlen, resid), space);
 				space -= len;
 				/*
 				 * For datagram protocols, leave room
@@ -471,6 +721,7 @@ nopages:
 			}
 			error = uiomove(mtod(m, caddr_t), (int)len, uio);
+ have_data:
 			resid = uio->uio_resid;
 			m->m_len = len;
 			*mp = m;
@@ -546,6 +797,7 @@ soreceive(struct socket *so, struct mbuf
 	int flags, len, error, s, offset, moff, type, orig_resid;
 	struct protosw *pr;
 	struct mbuf *nextrecord;
+	int mbuf_removed = 0;
 
 	pr = so->so_proto;
 	mp = mp0;
@@ -559,6 +811,10 @@ soreceive(struct socket *so, struct mbuf
 		flags = *flagsp &~ MSG_EOR;
 	else
 		flags = 0;
+
+	if ((flags & MSG_DONTWAIT) == 0)
+		sodopendfree(so);
+
 	if (flags & MSG_OOB) {
 		m = m_get(M_WAIT, MT_DATA);
 		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
@@ -663,6 +919,7 @@ soreceive(struct socket *so, struct mbuf
 			m = m->m_next;
 		} else {
 			sbfree(&so->so_rcv, m);
+			mbuf_removed = 1;
 			if (paddr) {
 				*paddr = m;
 				so->so_rcv.sb_mb = m->m_next;
@@ -681,6 +938,7 @@ soreceive(struct socket *so, struct mbuf
 				m = m->m_next;
 			} else {
 				sbfree(&so->so_rcv, m);
+				mbuf_removed = 1;
 				if (controlp) {
 					if (pr->pr_domain->dom_externalize &&
 					    mtod(m, struct cmsghdr *)->cmsg_type ==
@@ -737,6 +995,24 @@ soreceive(struct socket *so, struct mbuf
 			splx(s);
 			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
 			s = splsoftnet();
+			if (error) {
+				/*
+				 * If any part of the record has been removed
+				 * (such as the MT_SONAME mbuf, which will
+				 * happen when PR_ADDR, and thus also
+				 * PR_ATOMIC, is set), then drop the entire
+				 * record to maintain the atomicity of the
+				 * receive operation.
+				 *
+				 * This avoids a later panic("receive 1a")
+				 * when compiled with DIAGNOSTIC.
+				 */
+				if (m && mbuf_removed
+				    && (pr->pr_flags & PR_ATOMIC))
+					(void) sbdroprecord(&so->so_rcv);
+
+				goto release;
+			}
 		} else
 			uio->uio_resid -= len;
 		if (len == m->m_len - moff) {
@@ -797,6 +1073,23 @@ soreceive(struct socket *so, struct mbuf
 		    !sosendallatonce(so) && !nextrecord) {
 			if (so->so_error || so->so_state & SS_CANTRCVMORE)
 				break;
+			/*
+			 * If we are peeking and the socket receive buffer is
+			 * full, stop since we can't get more data to peek at.
+			 */
+			if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0)
+				break;
+			/*
+			 * If we've drained the socket buffer, tell the
+			 * protocol in case it needs to do something to
+			 * get it filled again.
+			 */
+			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
+				(*pr->pr_usrreq)(so, PRU_RCVD,
+				    (struct mbuf *)0,
+				    (struct mbuf *)(long)flags,
+				    (struct mbuf *)0,
+				    (struct proc *)0);
 			error = sbwait(&so->so_rcv);
 			if (error) {
 				sbunlock(&so->so_rcv);