Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. =================================================================== RCS file: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v rcsdiff: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v: warning: Unknown phrases like `commitid ...;' are present. retrieving revision 1.1 retrieving revision 1.78 diff -u -p -r1.1 -r1.78 --- src/sys/kern/uipc_socket.c 1993/03/21 09:45:37 1.1 +++ src/sys/kern/uipc_socket.c 2003/02/26 06:31:11 1.78 @@ -1,7 +1,45 @@ -/* - * Copyright (c) 1982, 1986, 1988, 1990 Regents of the University of California. +/* $NetBSD: uipc_socket.c,v 1.78 2003/02/26 06:31:11 matt Exp $ */ + +/*- + * Copyright (c) 2002 The NetBSD Foundation, Inc. * All rights reserved. * + * This code is derived from software contributed to The NetBSD Foundation + * by Jason R. Thorpe of Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -30,20 +68,248 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)uipc_socket.c 7.28 (Berkeley) 5/4/91 + * @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95 */ -#include "param.h" -#include "proc.h" -#include "file.h" -#include "malloc.h" -#include "mbuf.h" -#include "domain.h" -#include "kernel.h" -#include "protosw.h" -#include "socket.h" -#include "socketvar.h" -#include "resourcevar.h" +#include +__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.78 2003/02/26 06:31:11 matt Exp $"); + +#include "opt_sock_counters.h" +#include "opt_sosend_loan.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct pool socket_pool; + +MALLOC_DEFINE(M_SOOPTS, "soopts", "socket options"); +MALLOC_DEFINE(M_SONAME, "soname", "socket name"); + +extern int somaxconn; /* patchable (XXX sysctl) */ +int somaxconn = SOMAXCONN; + +#ifdef SOSEND_COUNTERS +#include + +struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, + NULL, "sosend", "loan big"); +struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, + NULL, "sosend", "copy big"); +struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, + NULL, "sosend", "copy small"); +struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, + NULL, "sosend", "kva limit"); + +#define SOSEND_COUNTER_INCR(ev) (ev)->ev_count++ + +#else + +#define SOSEND_COUNTER_INCR(ev) /* nothing */ + +#endif /* SOSEND_COUNTERS */ + +void +soinit(void) +{ + + pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0, + "sockpl", NULL); + +#ifdef SOSEND_COUNTERS + evcnt_attach_static(&sosend_loan_big); + evcnt_attach_static(&sosend_copy_big); + evcnt_attach_static(&sosend_copy_small); + evcnt_attach_static(&sosend_kvalimit); +#endif /* SOSEND_COUNTERS */ +} + +#ifdef SOSEND_NO_LOAN +int use_sosend_loan = 0; +#else +int use_sosend_loan = 1; +#endif + +struct mbuf *so_pendfree; + +int somaxkva = 16 * 1024 * 1024; +int socurkva; +int sokvawaiters; + +#define SOCK_LOAN_THRESH 4096 
+#define SOCK_LOAN_CHUNK 65536 + +static void +sodoloanfree(caddr_t buf, size_t size) +{ + struct vm_page **pgs; + vaddr_t va, sva, eva; + vsize_t len; + paddr_t pa; + int i, npgs; + + eva = round_page((vaddr_t) buf + size); + sva = trunc_page((vaddr_t) buf); + len = eva - sva; + npgs = len >> PAGE_SHIFT; + + pgs = alloca(npgs * sizeof(*pgs)); + + for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) { + if (pmap_extract(pmap_kernel(), va, &pa) == FALSE) + panic("sodoloanfree: va 0x%lx not mapped", va); + pgs[i] = PHYS_TO_VM_PAGE(pa); + } + + pmap_kremove(sva, len); + pmap_update(pmap_kernel()); + uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE); + uvm_km_free(kernel_map, sva, len); + socurkva -= len; + if (sokvawaiters) + wakeup(&socurkva); +} + +static size_t +sodopendfree(struct socket *so) +{ + struct mbuf *m; + size_t rv = 0; + int s; + + s = splvm(); + + for (;;) { + m = so_pendfree; + if (m == NULL) + break; + so_pendfree = m->m_next; + splx(s); + + rv += m->m_ext.ext_size; + sodoloanfree(m->m_ext.ext_buf, m->m_ext.ext_size); + s = splvm(); + pool_cache_put(&mbpool_cache, m); + } + + for (;;) { + m = so->so_pendfree; + if (m == NULL) + break; + so->so_pendfree = m->m_next; + splx(s); + + rv += m->m_ext.ext_size; + sodoloanfree(m->m_ext.ext_buf, m->m_ext.ext_size); + s = splvm(); + pool_cache_put(&mbpool_cache, m); + } + + splx(s); + return (rv); +} + +static void +soloanfree(struct mbuf *m, caddr_t buf, size_t size, void *arg) +{ + struct socket *so = arg; + int s; + + if (m == NULL) { + sodoloanfree(buf, size); + return; + } + + s = splvm(); + m->m_next = so->so_pendfree; + so->so_pendfree = m; + splx(s); + if (sokvawaiters) + wakeup(&socurkva); +} + +static long +sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space) +{ + struct iovec *iov = uio->uio_iov; + vaddr_t sva, eva; + vsize_t len; + struct vm_page **pgs; + vaddr_t lva, va; + int npgs, s, i, error; + + if (uio->uio_segflg != UIO_USERSPACE) + return (0); + + if (iov->iov_len < (size_t) 
space) + space = iov->iov_len; + if (space > SOCK_LOAN_CHUNK) + space = SOCK_LOAN_CHUNK; + + eva = round_page((vaddr_t) iov->iov_base + space); + sva = trunc_page((vaddr_t) iov->iov_base); + len = eva - sva; + npgs = len >> PAGE_SHIFT; + + while (socurkva + len > somaxkva) { + if (sodopendfree(so)) + continue; + SOSEND_COUNTER_INCR(&sosend_kvalimit); + s = splvm(); + sokvawaiters++; + (void) tsleep(&socurkva, PVM, "sokva", 0); + sokvawaiters--; + splx(s); + } + + lva = uvm_km_valloc_wait(kernel_map, len); + if (lva == 0) + return (0); + socurkva += len; + + pgs = alloca(npgs * sizeof(*pgs)); + + error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, sva, len, + pgs, UVM_LOAN_TOPAGE); + if (error) { + uvm_km_free(kernel_map, lva, len); + socurkva -= len; + return (0); + } + + for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE) + pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pgs[i]), VM_PROT_READ); + pmap_update(pmap_kernel()); + + lva += (vaddr_t) iov->iov_base & PAGE_MASK; + + MEXTADD(m, (caddr_t) lva, space, M_MBUF, soloanfree, so); + + uio->uio_resid -= space; + /* uio_offset not updated, not set/used for write(2) */ + uio->uio_iov->iov_base = (caddr_t) uio->uio_iov->iov_base + space; + uio->uio_iov->iov_len -= space; + if (uio->uio_iov->iov_len == 0) { + uio->uio_iov++; + uio->uio_iovcnt--; + } + + return (space); +} /* * Socket operation routines. @@ -53,92 +319,109 @@ * switching out to the protocol specific routines. 
*/ /*ARGSUSED*/ -socreate(dom, aso, type, proto) - struct socket **aso; - register int type; - int proto; -{ - struct proc *p = curproc; /* XXX */ - register struct protosw *prp; - register struct socket *so; - register int error; +int +socreate(int dom, struct socket **aso, int type, int proto) +{ + struct proc *p; + struct protosw *prp; + struct socket *so; + int error, s; + p = curproc; /* XXX */ if (proto) prp = pffindproto(dom, proto, type); else prp = pffindtype(dom, type); - if (prp == 0) + if (prp == 0 || prp->pr_usrreq == 0) return (EPROTONOSUPPORT); if (prp->pr_type != type) return (EPROTOTYPE); - MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT); - bzero((caddr_t)so, sizeof(*so)); + s = splsoftnet(); + so = pool_get(&socket_pool, PR_WAITOK); + memset((caddr_t)so, 0, sizeof(*so)); + TAILQ_INIT(&so->so_q0); + TAILQ_INIT(&so->so_q); so->so_type = type; - if (p->p_ucred->cr_uid == 0) - so->so_state = SS_PRIV; so->so_proto = prp; - error = - (*prp->pr_usrreq)(so, PRU_ATTACH, - (struct mbuf *)0, (struct mbuf *)proto, (struct mbuf *)0); + so->so_send = sosend; + so->so_receive = soreceive; +#ifdef MBUFTRACE + so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner; + so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner; + so->so_mowner = &prp->pr_domain->dom_mowner; +#endif + if (p != 0) + so->so_uid = p->p_ucred->cr_uid; + error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0, + (struct mbuf *)(long)proto, (struct mbuf *)0, p); if (error) { so->so_state |= SS_NOFDREF; sofree(so); + splx(s); return (error); } + splx(s); *aso = so; return (0); } -sobind(so, nam) - struct socket *so; - struct mbuf *nam; -{ - int s = splnet(); - int error; - - error = - (*so->so_proto->pr_usrreq)(so, PRU_BIND, - (struct mbuf *)0, nam, (struct mbuf *)0); +int +sobind(struct socket *so, struct mbuf *nam, struct proc *p) +{ + int s, error; + + s = splsoftnet(); + error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, (struct mbuf *)0, + nam, (struct mbuf *)0, p); splx(s); return 
(error); } -solisten(so, backlog) - register struct socket *so; - int backlog; -{ - int s = splnet(), error; - - error = - (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0); +int +solisten(struct socket *so, int backlog) +{ + int s, error; + + s = splsoftnet(); + error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, (struct mbuf *)0, + (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); if (error) { splx(s); return (error); } - if (so->so_q == 0) + if (TAILQ_EMPTY(&so->so_q)) so->so_options |= SO_ACCEPTCONN; if (backlog < 0) backlog = 0; - so->so_qlimit = min(backlog, SOMAXCONN); + so->so_qlimit = min(backlog, somaxconn); splx(s); return (0); } -sofree(so) - register struct socket *so; +void +sofree(struct socket *so) { + struct mbuf *m; if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) return; if (so->so_head) { - if (!soqremque(so, 0) && !soqremque(so, 1)) - panic("sofree dq"); - so->so_head = 0; + /* + * We must not decommission a socket that's on the accept(2) + * queue. If we do, then accept(2) may hang after select(2) + * indicated that the listening socket was ready. + */ + if (!soqremque(so, 0)) + return; } sbrelease(&so->so_snd); sorflush(so); - FREE(so, M_SOCKET); + while ((m = so->so_pendfree) != NULL) { + so->so_pendfree = m->m_next; + m->m_next = so_pendfree; + so_pendfree = m; + } + pool_put(&socket_pool, so); } /* @@ -146,17 +429,23 @@ sofree(so) * Initiate disconnect if connected. * Free socket when disconnect complete. 
*/ -soclose(so) - register struct socket *so; +int +soclose(struct socket *so) { - int s = splnet(); /* conservative */ - int error = 0; + struct socket *so2; + int s, error; + error = 0; + s = splsoftnet(); /* conservative */ if (so->so_options & SO_ACCEPTCONN) { - while (so->so_q0) - (void) soabort(so->so_q0); - while (so->so_q) - (void) soabort(so->so_q); + while ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) { + (void) soqremque(so2, 0); + (void) soabort(so2); + } + while ((so2 = TAILQ_FIRST(&so->so_q)) != 0) { + (void) soqremque(so2, 1); + (void) soabort(so2); + } } if (so->so_pcb == 0) goto discard; @@ -170,21 +459,24 @@ soclose(so) if ((so->so_state & SS_ISDISCONNECTING) && (so->so_state & SS_NBIO)) goto drop; - while (so->so_state & SS_ISCONNECTED) - if (error = tsleep((caddr_t)&so->so_timeo, - PSOCK | PCATCH, netcls, so->so_linger)) + while (so->so_state & SS_ISCONNECTED) { + error = tsleep((caddr_t)&so->so_timeo, + PSOCK | PCATCH, netcls, + so->so_linger * hz); + if (error) break; + } } } -drop: + drop: if (so->so_pcb) { - int error2 = - (*so->so_proto->pr_usrreq)(so, PRU_DETACH, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0); + int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH, + (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0, + (struct proc *)0); if (error == 0) error = error2; } -discard: + discard: if (so->so_state & SS_NOFDREF) panic("soclose: NOFDREF"); so->so_state |= SS_NOFDREF; @@ -194,43 +486,47 @@ discard: } /* - * Must be called at splnet... + * Must be called at splsoftnet... 
*/ -soabort(so) - struct socket *so; +int +soabort(struct socket *so) { - return ( - (*so->so_proto->pr_usrreq)(so, PRU_ABORT, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0)); + return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, (struct mbuf *)0, + (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); } -soaccept(so, nam) - register struct socket *so; - struct mbuf *nam; +int +soaccept(struct socket *so, struct mbuf *nam) { - int s = splnet(); - int error; + int s, error; + error = 0; + s = splsoftnet(); if ((so->so_state & SS_NOFDREF) == 0) panic("soaccept: !NOFDREF"); so->so_state &= ~SS_NOFDREF; - error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, - (struct mbuf *)0, nam, (struct mbuf *)0); + if ((so->so_state & SS_ISDISCONNECTED) == 0 || + (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0) + error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, + (struct mbuf *)0, nam, (struct mbuf *)0, (struct proc *)0); + else + error = ECONNABORTED; + splx(s); return (error); } -soconnect(so, nam) - register struct socket *so; - struct mbuf *nam; +int +soconnect(struct socket *so, struct mbuf *nam) { - int s; - int error; + struct proc *p; + int s, error; + p = curproc; /* XXX */ if (so->so_options & SO_ACCEPTCONN) return (EOPNOTSUPP); - s = splnet(); + s = splsoftnet(); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. 
@@ -243,30 +539,30 @@ soconnect(so, nam) error = EISCONN; else error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT, - (struct mbuf *)0, nam, (struct mbuf *)0); + (struct mbuf *)0, nam, (struct mbuf *)0, p); splx(s); return (error); } -soconnect2(so1, so2) - register struct socket *so1; - struct socket *so2; +int +soconnect2(struct socket *so1, struct socket *so2) { - int s = splnet(); - int error; + int s, error; + s = splsoftnet(); error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, - (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0); + (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0, + (struct proc *)0); splx(s); return (error); } -sodisconnect(so) - register struct socket *so; +int +sodisconnect(struct socket *so) { - int s = splnet(); - int error; + int s, error; + s = splsoftnet(); if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; goto bad; @@ -276,12 +572,15 @@ sodisconnect(so) goto bad; } error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0); -bad: + (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0, + (struct proc *)0); + bad: splx(s); + sodopendfree(so); return (error); } +#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) /* * Send on a socket. * If send must go all at once and message is larger than @@ -299,25 +598,35 @@ bad: * must check for short counts if EINTR/ERESTART are returned. * Data and control buffers are freed on return. 
*/ -sosend(so, addr, uio, top, control, flags) - register struct socket *so; - struct mbuf *addr; - struct uio *uio; - struct mbuf *top; - struct mbuf *control; - int flags; -{ - struct proc *p = curproc; /* XXX */ - struct mbuf **mp; - register struct mbuf *m; - register long space, len, resid; - int clen = 0, error, s, dontroute, mlen; - int atomic = sosendallatonce(so) || top; - +int +sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top, + struct mbuf *control, int flags) +{ + struct proc *p; + struct mbuf **mp, *m; + long space, len, resid, clen, mlen; + int error, s, dontroute, atomic; + + sodopendfree(so); + + p = curproc; /* XXX */ + clen = 0; + atomic = sosendallatonce(so) || top; if (uio) resid = uio->uio_resid; else resid = top->m_pkthdr.len; + /* + * In theory resid should be unsigned. + * However, space must be signed, as it might be less than 0 + * if we over-committed, and we must use a signed comparison + * of space and resid. On the other hand, a negative resid + * causes us to loop sending 0-length segments to the protocol. 
+ */ + if (resid < 0) { + error = EINVAL; + goto out; + } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); @@ -326,15 +635,19 @@ sosend(so, addr, uio, top, control, flag clen = control->m_len; #define snderr(errno) { error = errno; splx(s); goto release; } -restart: - if (error = sblock(&so->so_snd)) + restart: + if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0) goto out; do { - s = splnet(); + s = splsoftnet(); if (so->so_state & SS_CANTSENDMORE) snderr(EPIPE); - if (so->so_error) - snderr(so->so_error); + if (so->so_error) { + error = so->so_error; + so->so_error = 0; + splx(s); + goto release; + } if ((so->so_state & SS_ISCONNECTED) == 0) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if ((so->so_state & SS_ISCONFIRMING) == 0 && @@ -346,11 +659,11 @@ restart: space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; - if (space < resid + clen && + if ((atomic && resid > so->so_snd.sb_hiwat) || + clen > so->so_snd.sb_hiwat) + snderr(EMSGSIZE); + if (space < resid + clen && uio && (atomic || space < so->so_snd.sb_lowat || space < clen)) { - if (atomic && resid > so->so_snd.sb_hiwat || - clen > so->so_snd.sb_hiwat) - snderr(EMSGSIZE); if (so->so_state & SS_NBIO) snderr(EWOULDBLOCK); sbunlock(&so->so_snd); @@ -364,84 +677,105 @@ restart: mp = &top; space -= clen; do { - if (uio == NULL) { - /* - * Data is prepackaged in "top". 
- */ - resid = 0; - if (flags & MSG_EOR) - top->m_flags |= M_EOR; - } else do { - if (top == 0) { - MGETHDR(m, M_WAIT, MT_DATA); - mlen = MHLEN; - m->m_pkthdr.len = 0; - m->m_pkthdr.rcvif = (struct ifnet *)0; - } else { - MGET(m, M_WAIT, MT_DATA); - mlen = MLEN; - } - if (resid >= MINCLSIZE && space >= MCLBYTES) { - MCLGET(m, M_WAIT); - if ((m->m_flags & M_EXT) == 0) - goto nopages; - mlen = MCLBYTES; -#ifdef MAPPED_MBUFS - len = min(MCLBYTES, resid); -#else - if (top == 0) { - len = min(MCLBYTES - max_hdr, resid); - m->m_data += max_hdr; - } else - len = min(MCLBYTES, resid); -#endif - space -= MCLBYTES; - } else { -nopages: - len = min(min(mlen, resid), space); - space -= len; + if (uio == NULL) { /* - * For datagram protocols, leave room - * for protocol headers in first mbuf. + * Data is prepackaged in "top". */ - if (atomic && top == 0 && len < mlen) - MH_ALIGN(m, len); - } - error = uiomove(mtod(m, caddr_t), (int)len, uio); - resid = uio->uio_resid; - m->m_len = len; - *mp = m; - top->m_pkthdr.len += len; - if (error) - goto release; - mp = &m->m_next; - if (resid <= 0) { + resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; - break; - } - } while (space > 0 && atomic); - if (dontroute) - so->so_options |= SO_DONTROUTE; - s = splnet(); /* XXX */ - error = (*so->so_proto->pr_usrreq)(so, - (flags & MSG_OOB) ? 
PRU_SENDOOB : PRU_SEND, - top, addr, control); - splx(s); - if (dontroute) - so->so_options &= ~SO_DONTROUTE; - clen = 0; - control = 0; - top = 0; - mp = &top; - if (error) - goto release; + } else do { + if (top == 0) { + m = m_gethdr(M_WAIT, MT_DATA); + mlen = MHLEN; + m->m_pkthdr.len = 0; + m->m_pkthdr.rcvif = (struct ifnet *)0; + } else { + m = m_get(M_WAIT, MT_DATA); + mlen = MLEN; + } + MCLAIM(m, so->so_snd.sb_mowner); + if (use_sosend_loan && + uio->uio_iov->iov_len >= SOCK_LOAN_THRESH && + space >= SOCK_LOAN_THRESH && + (len = sosend_loan(so, uio, m, + space)) != 0) { + SOSEND_COUNTER_INCR(&sosend_loan_big); + space -= len; + goto have_data; + } + if (resid >= MINCLSIZE && space >= MCLBYTES) { + SOSEND_COUNTER_INCR(&sosend_copy_big); + m_clget(m, M_WAIT); + if ((m->m_flags & M_EXT) == 0) + goto nopages; + mlen = MCLBYTES; + if (atomic && top == 0) { + len = lmin(MCLBYTES - max_hdr, + resid); + m->m_data += max_hdr; + } else + len = lmin(MCLBYTES, resid); + space -= len; + } else { + nopages: + SOSEND_COUNTER_INCR(&sosend_copy_small); + len = lmin(lmin(mlen, resid), space); + space -= len; + /* + * For datagram protocols, leave room + * for protocol headers in first mbuf. + */ + if (atomic && top == 0 && len < mlen) + MH_ALIGN(m, len); + } + error = uiomove(mtod(m, caddr_t), (int)len, + uio); + have_data: + resid = uio->uio_resid; + m->m_len = len; + *mp = m; + top->m_pkthdr.len += len; + if (error) + goto release; + mp = &m->m_next; + if (resid <= 0) { + if (flags & MSG_EOR) + top->m_flags |= M_EOR; + break; + } + } while (space > 0 && atomic); + + s = splsoftnet(); + + if (so->so_state & SS_CANTSENDMORE) + snderr(EPIPE); + + if (dontroute) + so->so_options |= SO_DONTROUTE; + if (resid > 0) + so->so_state |= SS_MORETOCOME; + error = (*so->so_proto->pr_usrreq)(so, + (flags & MSG_OOB) ? 
PRU_SENDOOB : PRU_SEND, + top, addr, control, p); + if (dontroute) + so->so_options &= ~SO_DONTROUTE; + if (resid > 0) + so->so_state &= ~SS_MORETOCOME; + splx(s); + + clen = 0; + control = 0; + top = 0; + mp = &top; + if (error) + goto release; } while (resid && space > 0); } while (resid); -release: + release: sbunlock(&so->so_snd); -out: + out: if (top) m_freem(top); if (control) @@ -465,22 +799,20 @@ out: * an mbuf **mp0 for use in returning the chain. The uio is then used * only for the count in uio_resid. */ -soreceive(so, paddr, uio, mp0, controlp, flagsp) - register struct socket *so; - struct mbuf **paddr; - struct uio *uio; - struct mbuf **mp0; - struct mbuf **controlp; - int *flagsp; -{ - struct proc *p = curproc; /* XXX */ - register struct mbuf *m, **mp; - register int flags, len, error, s, offset; - struct protosw *pr = so->so_proto; - struct mbuf *nextrecord; - int moff, type; +int +soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio, + struct mbuf **mp0, struct mbuf **controlp, int *flagsp) +{ + struct mbuf *m, **mp; + int flags, len, error, s, offset, moff, type, orig_resid; + struct protosw *pr; + struct mbuf *nextrecord; + int mbuf_removed = 0; + pr = so->so_proto; mp = mp0; + type = 0; + orig_resid = uio->uio_resid; if (paddr) *paddr = 0; if (controlp) @@ -489,10 +821,15 @@ soreceive(so, paddr, uio, mp0, controlp, flags = *flagsp &~ MSG_EOR; else flags = 0; + + if ((flags & MSG_DONTWAIT) == 0) + sodopendfree(so); + if (flags & MSG_OOB) { m = m_get(M_WAIT, MT_DATA); - error = (*pr->pr_usrreq)(so, PRU_RCVOOB, - m, (struct mbuf *)(flags & MSG_PEEK), (struct mbuf *)0); + error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m, + (struct mbuf *)(long)(flags & MSG_PEEK), (struct mbuf *)0, + (struct proc *)0); if (error) goto bad; do { @@ -500,7 +837,7 @@ soreceive(so, paddr, uio, mp0, controlp, (int) min(uio->uio_resid, m->m_len), uio); m = m_free(m); } while (uio->uio_resid && error == 0 && m); -bad: + bad: if (m) m_freem(m); return (error); @@ -509,35 
+846,37 @@ bad: *mp = (struct mbuf *)0; if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, - (struct mbuf *)0, (struct mbuf *)0); + (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); -restart: - if (error = sblock(&so->so_rcv)) + restart: + if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) return (error); - s = splnet(); + s = splsoftnet(); m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more * (subject to any timeout) if: - * 1. the current count is less than the low water mark, or + * 1. the current count is less than the low water mark, * 2. MSG_WAITALL is set, and it is possible to do the entire - * receive operation at once if we block (resid <= hiwat). + * receive operation at once if we block (resid <= hiwat), or + * 3. MSG_DONTWAIT is not set. * If MSG_WAITALL is set but resid is larger than the receive buffer, * we have to do the receive in sections, and thus risk returning * a short count if a timeout or signal occurs after we start. 
*/ - while (m == 0 || so->so_rcv.sb_cc < uio->uio_resid && + if (m == 0 || (((flags & MSG_DONTWAIT) == 0 && + so->so_rcv.sb_cc < uio->uio_resid) && (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && - m->m_nextpkt == 0) { + m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) { #ifdef DIAGNOSTIC if (m == 0 && so->so_rcv.sb_cc) panic("receive 1"); #endif if (so->so_error) { if (m) - break; + goto dontblock; error = so->so_error; if ((flags & MSG_PEEK) == 0) so->so_error = 0; @@ -545,7 +884,7 @@ restart: } if (so->so_state & SS_CANTRCVMORE) { if (m) - break; + goto dontblock; else goto release; } @@ -561,10 +900,12 @@ restart: } if (uio->uio_resid == 0) goto release; - if (so->so_state & SS_NBIO) { + if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) { error = EWOULDBLOCK; goto release; } + SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); sbunlock(&so->so_rcv); error = sbwait(&so->so_rcv); splx(s); @@ -572,20 +913,33 @@ restart: return (error); goto restart; } -dontblock: - p->p_stats->p_ru.ru_msgrcv++; + dontblock: + /* + * On entry here, m points to the first record of the socket buffer. + * While we process the initial mbufs containing address and control + * info, we save a copy of m->m_nextpkt into nextrecord. 
+ */ +#ifdef notyet /* XXXX */ + if (uio->uio_procp) + uio->uio_procp->p_stats->p_ru.ru_msgrcv++; +#endif + KASSERT(m == so->so_rcv.sb_mb); + SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); nextrecord = m->m_nextpkt; if (pr->pr_flags & PR_ADDR) { #ifdef DIAGNOSTIC if (m->m_type != MT_SONAME) panic("receive 1a"); #endif + orig_resid = 0; if (flags & MSG_PEEK) { if (paddr) *paddr = m_copy(m, 0, m->m_len); m = m->m_next; } else { sbfree(&so->so_rcv, m); + mbuf_removed = 1; if (paddr) { *paddr = m; so->so_rcv.sb_mb = m->m_next; @@ -604,11 +958,12 @@ dontblock: m = m->m_next; } else { sbfree(&so->so_rcv, m); + mbuf_removed = 1; if (controlp) { if (pr->pr_domain->dom_externalize && mtod(m, struct cmsghdr *)->cmsg_type == SCM_RIGHTS) - error = (*pr->pr_domain->dom_externalize)(m); + error = (*pr->pr_domain->dom_externalize)(m); *controlp = m; so->so_rcv.sb_mb = m->m_next; m->m_next = 0; @@ -618,16 +973,44 @@ dontblock: m = so->so_rcv.sb_mb; } } - if (controlp) + if (controlp) { + orig_resid = 0; controlp = &(*controlp)->m_next; + } } + + /* + * If m is non-NULL, we have some data to read. From now on, + * make sure to keep sb_lastrecord consistent when working on + * the last packet on the chain (nextrecord == NULL) and we + * change m->m_nextpkt. + */ if (m) { - if ((flags & MSG_PEEK) == 0) + if ((flags & MSG_PEEK) == 0) { m->m_nextpkt = nextrecord; + /* + * If nextrecord == NULL (this is a single chain), + * then sb_lastrecord may not be valid here if m + * was changed earlier. 
+ */ + if (nextrecord == NULL) { + KASSERT(so->so_rcv.sb_mb == m); + so->so_rcv.sb_lastrecord = m; + } + } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; + } else { + if ((flags & MSG_PEEK) == 0) { + KASSERT(so->so_rcv.sb_mb == m); + so->so_rcv.sb_mb = nextrecord; + SB_EMPTY_FIXUP(&so->so_rcv); + } } + SBLASTRECORDCHK(&so->so_rcv, "soreceive 2"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive 2"); + moff = 0; offset = 0; while (m && uio->uio_resid > 0 && error == 0) { @@ -655,9 +1038,29 @@ dontblock: * block interrupts again. */ if (mp == 0) { + SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); splx(s); error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio); - s = splnet(); + s = splsoftnet(); + if (error) { + /* + * If any part of the record has been removed + * (such as the MT_SONAME mbuf, which will + * happen when PR_ADDR, and thus also + * PR_ATOMIC, is set), then drop the entire + * record to maintain the atomicity of the + * receive operation. + * + * This avoids a later panic("receive 1a") + * when compiled with DIAGNOSTIC. + */ + if (m && mbuf_removed + && (pr->pr_flags & PR_ATOMIC)) + (void) sbdroprecord(&so->so_rcv); + + goto release; + } } else uio->uio_resid -= len; if (len == m->m_len - moff) { @@ -678,8 +1081,21 @@ dontblock: MFREE(m, so->so_rcv.sb_mb); m = so->so_rcv.sb_mb; } - if (m) + /* + * If m != NULL, we also know that + * so->so_rcv.sb_mb != NULL. 
+ */ + KASSERT(so->so_rcv.sb_mb == m); + if (m) { m->m_nextpkt = nextrecord; + if (nextrecord == NULL) + so->so_rcv.sb_lastrecord = m; + } else { + so->so_rcv.sb_mb = nextrecord; + SB_EMPTY_FIXUP(&so->so_rcv); + } + SBLASTRECORDCHK(&so->so_rcv, "soreceive 3"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive 3"); } } else { if (flags & MSG_PEEK) @@ -699,8 +1115,11 @@ dontblock: so->so_state |= SS_RCVATMARK; break; } - } else + } else { offset += len; + if (offset == so->so_oobmark) + break; + } } if (flags & MSG_EOR) break; @@ -712,83 +1131,127 @@ dontblock: * Keep sockbuf locked against other readers. */ while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 && - !sosendallatonce(so)) { + !sosendallatonce(so) && !nextrecord) { if (so->so_error || so->so_state & SS_CANTRCVMORE) break; + /* + * If we are peeking and the socket receive buffer is + * full, stop since we can't get more data to peek at. + */ + if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0) + break; + /* + * If we've drained the socket buffer, tell the + * protocol in case it needs to do something to + * get it filled again. + */ + if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb) + (*pr->pr_usrreq)(so, PRU_RCVD, + (struct mbuf *)0, + (struct mbuf *)(long)flags, + (struct mbuf *)0, + (struct proc *)0); + SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); error = sbwait(&so->so_rcv); if (error) { sbunlock(&so->so_rcv); splx(s); return (0); } - if (m = so->so_rcv.sb_mb) + if ((m = so->so_rcv.sb_mb) != NULL) nextrecord = m->m_nextpkt; } } + + if (m && pr->pr_flags & PR_ATOMIC) { + flags |= MSG_TRUNC; + if ((flags & MSG_PEEK) == 0) + (void) sbdroprecord(&so->so_rcv); + } if ((flags & MSG_PEEK) == 0) { - if (m == 0) + if (m == 0) { + /* + * First part is an inline SB_EMPTY_FIXUP(). Second + * part makes sure sb_lastrecord is up-to-date if + * there is still data in the socket buffer. 
+ */ so->so_rcv.sb_mb = nextrecord; - else if (pr->pr_flags & PR_ATOMIC) { - flags |= MSG_TRUNC; - (void) sbdroprecord(&so->so_rcv); + if (so->so_rcv.sb_mb == NULL) { + so->so_rcv.sb_mbtail = NULL; + so->so_rcv.sb_lastrecord = NULL; + } else if (nextrecord->m_nextpkt == NULL) + so->so_rcv.sb_lastrecord = nextrecord; } + SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, - (struct mbuf *)flags, (struct mbuf *)0, - (struct mbuf *)0); + (struct mbuf *)(long)flags, (struct mbuf *)0, + (struct proc *)0); + } + if (orig_resid == uio->uio_resid && orig_resid && + (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { + sbunlock(&so->so_rcv); + splx(s); + goto restart; } + if (flagsp) *flagsp |= flags; -release: + release: sbunlock(&so->so_rcv); splx(s); return (error); } -soshutdown(so, how) - register struct socket *so; - register int how; +int +soshutdown(struct socket *so, int how) { - register struct protosw *pr = so->so_proto; + struct protosw *pr; - how++; - if (how & FREAD) + pr = so->so_proto; + if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) + return (EINVAL); + + if (how == SHUT_RD || how == SHUT_RDWR) sorflush(so); - if (how & FWRITE) - return ((*pr->pr_usrreq)(so, PRU_SHUTDOWN, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0)); + if (how == SHUT_WR || how == SHUT_RDWR) + return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, (struct mbuf *)0, + (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); return (0); } -sorflush(so) - register struct socket *so; +void +sorflush(struct socket *so) { - register struct sockbuf *sb = &so->so_rcv; - register struct protosw *pr = so->so_proto; - register int s; - struct sockbuf asb; + struct sockbuf *sb, asb; + struct protosw *pr; + int s; + sb = &so->so_rcv; + pr = so->so_proto; sb->sb_flags |= SB_NOINTR; - (void) sblock(sb); - s = splimp(); + (void) sblock(sb, M_WAITOK); 
+ s = splnet(); socantrcvmore(so); sbunlock(sb); asb = *sb; - bzero((caddr_t)sb, sizeof (*sb)); + memset((caddr_t)sb, 0, sizeof(*sb)); splx(s); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) (*pr->pr_domain->dom_dispose)(asb.sb_mb); sbrelease(&asb); } -sosetopt(so, level, optname, m0) - register struct socket *so; - int level, optname; - struct mbuf *m0; +int +sosetopt(struct socket *so, int level, int optname, struct mbuf *m0) { - int error = 0; - register struct mbuf *m = m0; + int error; + struct mbuf *m; + error = 0; + m = m0; if (level != SOL_SOCKET) { if (so->so_proto && so->so_proto->pr_ctloutput) return ((*so->so_proto->pr_ctloutput) @@ -798,7 +1261,7 @@ sosetopt(so, level, optname, m0) switch (optname) { case SO_LINGER: - if (m == NULL || m->m_len != sizeof (struct linger)) { + if (m == NULL || m->m_len != sizeof(struct linger)) { error = EINVAL; goto bad; } @@ -811,8 +1274,10 @@ sosetopt(so, level, optname, m0) case SO_USELOOPBACK: case SO_BROADCAST: case SO_REUSEADDR: + case SO_REUSEPORT: case SO_OOBINLINE: - if (m == NULL || m->m_len < sizeof (int)) { + case SO_TIMESTAMP: + if (m == NULL || m->m_len < sizeof(int)) { error = EINVAL; goto bad; } @@ -826,30 +1291,53 @@ sosetopt(so, level, optname, m0) case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: - if (m == NULL || m->m_len < sizeof (int)) { + { + int optval; + + if (m == NULL || m->m_len < sizeof(int)) { error = EINVAL; goto bad; } + + /* + * Values < 1 make no sense for any of these + * options, so disallow them. + */ + optval = *mtod(m, int *); + if (optval < 1) { + error = EINVAL; + goto bad; + } + switch (optname) { case SO_SNDBUF: case SO_RCVBUF: if (sbreserve(optname == SO_SNDBUF ? &so->so_snd : &so->so_rcv, - (u_long) *mtod(m, int *)) == 0) { + (u_long) optval) == 0) { error = ENOBUFS; goto bad; } break; + /* + * Make sure the low-water is never greater than + * the high-water. 
+ */ case SO_SNDLOWAT: - so->so_snd.sb_lowat = *mtod(m, int *); + so->so_snd.sb_lowat = + (optval > so->so_snd.sb_hiwat) ? + so->so_snd.sb_hiwat : optval; break; case SO_RCVLOWAT: - so->so_rcv.sb_lowat = *mtod(m, int *); + so->so_rcv.sb_lowat = + (optval > so->so_rcv.sb_hiwat) ? + so->so_rcv.sb_hiwat : optval; break; } break; + } case SO_SNDTIMEO: case SO_RCVTIMEO: @@ -857,16 +1345,18 @@ sosetopt(so, level, optname, m0) struct timeval *tv; short val; - if (m == NULL || m->m_len < sizeof (*tv)) { + if (m == NULL || m->m_len < sizeof(*tv)) { error = EINVAL; goto bad; } tv = mtod(m, struct timeval *); - if (tv->tv_sec > SHRT_MAX / hz - hz) { + if (tv->tv_sec > (SHRT_MAX - tv->tv_usec / tick) / hz) { error = EDOM; goto bad; } val = tv->tv_sec * hz + tv->tv_usec / tick; + if (val == 0 && tv->tv_usec != 0) + val = 1; switch (optname) { @@ -884,19 +1374,22 @@ sosetopt(so, level, optname, m0) error = ENOPROTOOPT; break; } + if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) { + (void) ((*so->so_proto->pr_ctloutput) + (PRCO_SETOPT, so, level, optname, &m0)); + m = NULL; /* freed by protocol */ + } } -bad: + bad: if (m) (void) m_free(m); return (error); } -sogetopt(so, level, optname, mp) - register struct socket *so; - int level, optname; - struct mbuf **mp; +int +sogetopt(struct socket *so, int level, int optname, struct mbuf **mp) { - register struct mbuf *m; + struct mbuf *m; if (level != SOL_SOCKET) { if (so->so_proto && so->so_proto->pr_ctloutput) { @@ -906,12 +1399,12 @@ sogetopt(so, level, optname, mp) return (ENOPROTOOPT); } else { m = m_get(M_WAIT, MT_SOOPTS); - m->m_len = sizeof (int); + m->m_len = sizeof(int); switch (optname) { case SO_LINGER: - m->m_len = sizeof (struct linger); + m->m_len = sizeof(struct linger); mtod(m, struct linger *)->l_onoff = so->so_options & SO_LINGER; mtod(m, struct linger *)->l_linger = so->so_linger; @@ -922,8 +1415,10 @@ sogetopt(so, level, optname, mp) case SO_DEBUG: case SO_KEEPALIVE: case SO_REUSEADDR: + case 
SO_REUSEPORT: case SO_BROADCAST: case SO_OOBINLINE: + case SO_TIMESTAMP: *mtod(m, int *) = so->so_options & optname; break; @@ -961,7 +1456,7 @@ sogetopt(so, level, optname, mp) m->m_len = sizeof(struct timeval); mtod(m, struct timeval *)->tv_sec = val / hz; mtod(m, struct timeval *)->tv_usec = - (val % hz) / tick; + (val % hz) * tick; break; } @@ -974,8 +1469,8 @@ sogetopt(so, level, optname, mp) } } -sohasoutofband(so) - register struct socket *so; +void +sohasoutofband(struct socket *so) { struct proc *p; @@ -983,9 +1478,121 @@ sohasoutofband(so) gsignal(-so->so_pgid, SIGURG); else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0) psignal(p, SIGURG); - if (so->so_rcv.sb_sel) { - selwakeup(so->so_rcv.sb_sel, so->so_rcv.sb_flags & SB_COLL); - so->so_rcv.sb_sel = 0; - so->so_rcv.sb_flags &= ~SB_COLL; + selwakeup(&so->so_rcv.sb_sel); +} + +static void +filt_sordetach(struct knote *kn) +{ + struct socket *so; + + so = (struct socket *)kn->kn_fp->f_data; + SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext); + if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist)) + so->so_rcv.sb_flags &= ~SB_KNOTE; +} + +/*ARGSUSED*/ +static int +filt_soread(struct knote *kn, long hint) +{ + struct socket *so; + + so = (struct socket *)kn->kn_fp->f_data; + kn->kn_data = so->so_rcv.sb_cc; + if (so->so_state & SS_CANTRCVMORE) { + kn->kn_flags |= EV_EOF; + kn->kn_fflags = so->so_error; + return (1); + } + if (so->so_error) /* temporary udp error */ + return (1); + if (kn->kn_sfflags & NOTE_LOWAT) + return (kn->kn_data >= kn->kn_sdata); + return (kn->kn_data >= so->so_rcv.sb_lowat); +} + +static void +filt_sowdetach(struct knote *kn) +{ + struct socket *so; + + so = (struct socket *)kn->kn_fp->f_data; + SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext); + if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist)) + so->so_snd.sb_flags &= ~SB_KNOTE; +} + +/*ARGSUSED*/ +static int +filt_sowrite(struct knote *kn, long hint) +{ + struct socket *so; + + so = (struct socket 
*)kn->kn_fp->f_data; + kn->kn_data = sbspace(&so->so_snd); + if (so->so_state & SS_CANTSENDMORE) { + kn->kn_flags |= EV_EOF; + kn->kn_fflags = so->so_error; + return (1); + } + if (so->so_error) /* temporary udp error */ + return (1); + if (((so->so_state & SS_ISCONNECTED) == 0) && + (so->so_proto->pr_flags & PR_CONNREQUIRED)) + return (0); + if (kn->kn_sfflags & NOTE_LOWAT) + return (kn->kn_data >= kn->kn_sdata); + return (kn->kn_data >= so->so_snd.sb_lowat); +} + +/*ARGSUSED*/ +static int +filt_solisten(struct knote *kn, long hint) +{ + struct socket *so; + + so = (struct socket *)kn->kn_fp->f_data; + + /* + * Set kn_data to number of incoming connections, not + * counting partial (incomplete) connections. + */ + kn->kn_data = so->so_qlen; + return (kn->kn_data > 0); +} + +static const struct filterops solisten_filtops = + { 1, NULL, filt_sordetach, filt_solisten }; +static const struct filterops soread_filtops = + { 1, NULL, filt_sordetach, filt_soread }; +static const struct filterops sowrite_filtops = + { 1, NULL, filt_sowdetach, filt_sowrite }; + +int +soo_kqfilter(struct file *fp, struct knote *kn) +{ + struct socket *so; + struct sockbuf *sb; + + so = (struct socket *)kn->kn_fp->f_data; + switch (kn->kn_filter) { + case EVFILT_READ: + if (so->so_options & SO_ACCEPTCONN) + kn->kn_fop = &solisten_filtops; + else + kn->kn_fop = &soread_filtops; + sb = &so->so_rcv; + break; + case EVFILT_WRITE: + kn->kn_fop = &sowrite_filtops; + sb = &so->so_snd; + break; + default: + return (1); } + SLIST_INSERT_HEAD(&sb->sb_sel.sel_klist, kn, kn_selnext); + sb->sb_flags |= SB_KNOTE; + return (0); } +