Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. =================================================================== RCS file: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v rcsdiff: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v: warning: Unknown phrases like `commitid ...;' are present. retrieving revision 1.56.2.5 retrieving revision 1.84 diff -u -p -r1.56.2.5 -r1.84 --- src/sys/kern/uipc_socket.c 2002/03/16 16:01:52 1.56.2.5 +++ src/sys/kern/uipc_socket.c 2003/07/02 20:07:45 1.84 @@ -1,4 +1,40 @@ -/* $NetBSD: uipc_socket.c,v 1.56.2.5 2002/03/16 16:01:52 jdolecek Exp $ */ +/* $NetBSD: uipc_socket.c,v 1.84 2003/07/02 20:07:45 ragge Exp $ */ + +/*- + * Copyright (c) 2002 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Jason R. Thorpe of Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993 @@ -36,7 +72,12 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.56.2.5 2002/03/16 16:01:52 jdolecek Exp $"); +__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.84 2003/07/02 20:07:45 ragge Exp $"); + +#include "opt_sock_counters.h" +#include "opt_sosend_loan.h" +#include "opt_mbuftrace.h" +#include "opt_somaxkva.h" #include #include @@ -54,30 +95,253 @@ __KERNEL_RCSID(0, "$NetBSD: uipc_socket. #include #include -static void filt_sordetach(struct knote *kn); -static int filt_soread(struct knote *kn, long hint); -static void filt_sowdetach(struct knote *kn); -static int filt_sowrite(struct knote *kn, long hint); -static int filt_solisten(struct knote *kn, long hint); - -static const struct filterops solisten_filtops = - { 1, NULL, filt_sordetach, filt_solisten }; -const struct filterops soread_filtops = - { 1, NULL, filt_sordetach, filt_soread }; -const struct filterops sowrite_filtops = - { 1, NULL, filt_sowdetach, filt_sowrite }; +#include struct pool socket_pool; +MALLOC_DEFINE(M_SOOPTS, "soopts", "socket options"); +MALLOC_DEFINE(M_SONAME, "soname", "socket name"); + extern int somaxconn; /* patchable (XXX sysctl) */ int somaxconn = SOMAXCONN; +#ifdef SOSEND_COUNTERS +#include + +struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, + NULL, "sosend", "loan big"); +struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, + NULL, "sosend", "copy big"); +struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, + NULL, "sosend", "copy small"); +struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, + NULL, "sosend", "kva limit"); + +#define SOSEND_COUNTER_INCR(ev) (ev)->ev_count++ + +#else + +#define SOSEND_COUNTER_INCR(ev) /* nothing */ + +#endif /* SOSEND_COUNTERS */ + void soinit(void) { pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0, "sockpl", NULL); + +#ifdef SOSEND_COUNTERS + evcnt_attach_static(&sosend_loan_big); + evcnt_attach_static(&sosend_copy_big); + evcnt_attach_static(&sosend_copy_small); + evcnt_attach_static(&sosend_kvalimit); +#endif /* SOSEND_COUNTERS */ +} + +#ifdef SOSEND_NO_LOAN +int use_sosend_loan = 0; +#else +int use_sosend_loan = 1; +#endif + +struct mbuf *so_pendfree; + +#ifndef SOMAXKVA +#define SOMAXKVA (16 * 1024 * 1024) +#endif +int somaxkva = SOMAXKVA; +int socurkva; +int sokvawaiters; + +#define SOCK_LOAN_THRESH 4096 +#define SOCK_LOAN_CHUNK 65536 + +static size_t sodopendfree(struct socket *); + +vaddr_t +sokvaalloc(vsize_t len, struct socket *so) +{ + vaddr_t lva; + int s; + + while (socurkva + len > somaxkva) { + if (sodopendfree(so)) + continue; + SOSEND_COUNTER_INCR(&sosend_kvalimit); + s = splvm(); + sokvawaiters++; + (void) tsleep(&socurkva, PVM, "sokva", 0); + sokvawaiters--; + splx(s); + } + + lva = uvm_km_valloc_wait(kernel_map, len); + if (lva == 0) + return (0); + socurkva += len; + + return lva; +} + +void +sokvafree(vaddr_t sva, vsize_t len) +{ + + uvm_km_free(kernel_map, sva, len); + socurkva -= len; + if (sokvawaiters) + wakeup(&socurkva); +} + +static void +sodoloanfree(struct vm_page **pgs, caddr_t buf, size_t size) +{ + vaddr_t va, sva, eva; + vsize_t len; + paddr_t pa; + int i, npgs; + + eva = round_page((vaddr_t) buf + size); + sva = trunc_page((vaddr_t) buf); + len = eva - sva; + npgs = len >> PAGE_SHIFT; + + if (__predict_false(pgs == NULL)) { + pgs = alloca(npgs * sizeof(*pgs)); + + for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) { + if (pmap_extract(pmap_kernel(), va, &pa) == FALSE) + panic("sodoloanfree: va 0x%lx not mapped", va); + pgs[i] = PHYS_TO_VM_PAGE(pa); + } + } + + pmap_kremove(sva, len); + pmap_update(pmap_kernel()); + uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE); + sokvafree(sva, len); +} + +static size_t +sodopendfree(struct socket *so) +{ + struct mbuf *m; + size_t rv = 0; + int s; + + s = splvm(); + + for (;;) { + m = so_pendfree; + if (m == NULL) + break; + so_pendfree = m->m_next; + splx(s); + + rv += m->m_ext.ext_size; + sodoloanfree((m->m_flags & M_EXT_PAGES) ? + m->m_ext.ext_pgs : NULL, m->m_ext.ext_buf, + m->m_ext.ext_size); + s = splvm(); + pool_cache_put(&mbpool_cache, m); + } + + for (;;) { + m = so->so_pendfree; + if (m == NULL) + break; + so->so_pendfree = m->m_next; + splx(s); + + rv += m->m_ext.ext_size; + sodoloanfree((m->m_flags & M_EXT_PAGES) ? + m->m_ext.ext_pgs : NULL, m->m_ext.ext_buf, + m->m_ext.ext_size); + s = splvm(); + pool_cache_put(&mbpool_cache, m); + } + + splx(s); + return (rv); +} + +void +soloanfree(struct mbuf *m, caddr_t buf, size_t size, void *arg) +{ + struct socket *so = arg; + int s; + + if (m == NULL) { + sodoloanfree(NULL, buf, size); + return; + } + + s = splvm(); + m->m_next = so->so_pendfree; + so->so_pendfree = m; + splx(s); + if (sokvawaiters) + wakeup(&socurkva); +} + +static long +sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space) +{ + struct iovec *iov = uio->uio_iov; + vaddr_t sva, eva; + vsize_t len; + vaddr_t lva, va; + int npgs, i, error; + + if (uio->uio_segflg != UIO_USERSPACE) + return (0); + + if (iov->iov_len < (size_t) space) + space = iov->iov_len; + if (space > SOCK_LOAN_CHUNK) + space = SOCK_LOAN_CHUNK; + + eva = round_page((vaddr_t) iov->iov_base + space); + sva = trunc_page((vaddr_t) iov->iov_base); + len = eva - sva; + npgs = len >> PAGE_SHIFT; + + /* XXX KDASSERT */ + KASSERT(npgs <= M_EXT_MAXPAGES); + + lva = sokvaalloc(len, so); + if (lva == 0) + return 0; + + error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, sva, len, + m->m_ext.ext_pgs, UVM_LOAN_TOPAGE); + if (error) { + sokvafree(lva, len); + return (0); + } + + for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE) + pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]), + VM_PROT_READ); + pmap_update(pmap_kernel()); + + lva += (vaddr_t) iov->iov_base & PAGE_MASK; + + MEXTADD(m, (caddr_t) lva, space, M_MBUF, soloanfree, so); + m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP; + + uio->uio_resid -= space; + /* uio_offset not updated, not set/used for write(2) */ + uio->uio_iov->iov_base = (caddr_t) uio->uio_iov->iov_base + space; + uio->uio_iov->iov_len -= space; + if (uio->uio_iov->iov_len == 0) { + uio->uio_iov++; + uio->uio_iovcnt--; + } + + return (space); } /* @@ -114,6 +378,11 @@ socreate(int dom, struct socket **aso, i so->so_proto = prp; so->so_send = sosend; so->so_receive = soreceive; +#ifdef MBUFTRACE + so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner; + so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner; + so->so_mowner = &prp->pr_domain->dom_mowner; +#endif if (p != 0) so->so_uid = p->p_ucred->cr_uid; error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0, @@ -153,7 +422,7 @@ solisten(struct socket *so, int backlog) splx(s); return (error); } - if (so->so_q.tqh_first == NULL) + if (TAILQ_EMPTY(&so->so_q)) so->so_options |= SO_ACCEPTCONN; if (backlog < 0) backlog = 0; @@ -165,6 +434,7 @@ solisten(struct socket *so, int backlog) void sofree(struct socket *so) { + struct mbuf *m; if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) return; @@ -179,6 +449,11 @@ sofree(struct socket *so) } sbrelease(&so->so_snd); sorflush(so); + while ((m = so->so_pendfree) != NULL) { + so->so_pendfree = m->m_next; + m->m_next = so_pendfree; + so_pendfree = m; + } pool_put(&socket_pool, so); } @@ -196,11 +471,11 @@ soclose(struct socket *so) error = 0; s = splsoftnet(); /* conservative */ if (so->so_options & SO_ACCEPTCONN) { - while ((so2 = so->so_q0.tqh_first) != 0) { + while ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) { (void) soqremque(so2, 0); (void) soabort(so2); } - while ((so2 = so->so_q.tqh_first) != 0) { + while ((so2 = TAILQ_FIRST(&so->so_q)) != 0) { (void) soqremque(so2, 1); (void) soabort(so2); } @@ -334,6 +609,7 @@ sodisconnect(struct socket *so) (struct proc *)0); bad: splx(s); + sodopendfree(so); return (error); } @@ -364,6 +640,8 @@ sosend(struct socket *so, struct mbuf *a long space, len, resid, clen, mlen; int error, s, dontroute, atomic; + sodopendfree(so); + p = curproc; /* XXX */ clen = 0; atomic = sosendallatonce(so) || top; @@ -441,32 +719,40 @@ sosend(struct socket *so, struct mbuf *a top->m_flags |= M_EOR; } else do { if (top == 0) { - MGETHDR(m, M_WAIT, MT_DATA); + m = m_gethdr(M_WAIT, MT_DATA); mlen = MHLEN; m->m_pkthdr.len = 0; m->m_pkthdr.rcvif = (struct ifnet *)0; } else { - MGET(m, M_WAIT, MT_DATA); + m = m_get(M_WAIT, MT_DATA); mlen = MLEN; } + MCLAIM(m, so->so_snd.sb_mowner); + if (use_sosend_loan && + uio->uio_iov->iov_len >= SOCK_LOAN_THRESH && + space >= SOCK_LOAN_THRESH && + (len = sosend_loan(so, uio, m, + space)) != 0) { + SOSEND_COUNTER_INCR(&sosend_loan_big); + space -= len; + goto have_data; + } if (resid >= MINCLSIZE && space >= MCLBYTES) { - MCLGET(m, M_WAIT); + SOSEND_COUNTER_INCR(&sosend_copy_big); + m_clget(m, M_WAIT); if ((m->m_flags & M_EXT) == 0) goto nopages; mlen = MCLBYTES; -#ifdef MAPPED_MBUFS - len = lmin(MCLBYTES, resid); -#else if (atomic && top == 0) { len = lmin(MCLBYTES - max_hdr, resid); m->m_data += max_hdr; } else len = lmin(MCLBYTES, resid); -#endif space -= len; } else { -nopages: + nopages: + SOSEND_COUNTER_INCR(&sosend_copy_small); len = lmin(lmin(mlen, resid), space); space -= len; /* @@ -478,6 +764,7 @@ nopages: } error = uiomove(mtod(m, caddr_t), (int)len, uio); + have_data: resid = uio->uio_resid; m->m_len = len; *mp = m; @@ -553,6 +840,7 @@ soreceive(struct socket *so, struct mbuf int flags, len, error, s, offset, moff, type, orig_resid; struct protosw *pr; struct mbuf *nextrecord; + int mbuf_removed = 0; pr = so->so_proto; mp = mp0; @@ -566,6 +854,10 @@ soreceive(struct socket *so, struct mbuf flags = *flagsp &~ MSG_EOR; else flags = 0; + + if ((flags & MSG_DONTWAIT) == 0) + sodopendfree(so); + if (flags & MSG_OOB) { m = m_get(M_WAIT, MT_DATA); error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m, @@ -645,6 +937,8 @@ soreceive(struct socket *so, struct mbuf error = EWOULDBLOCK; goto release; } + SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); sbunlock(&so->so_rcv); error = sbwait(&so->so_rcv); splx(s); @@ -653,10 +947,18 @@ soreceive(struct socket *so, struct mbuf goto restart; } dontblock: + /* + * On entry here, m points to the first record of the socket buffer. + * While we process the initial mbufs containing address and control + * info, we save a copy of m->m_nextpkt into nextrecord. + */ #ifdef notyet /* XXXX */ if (uio->uio_procp) uio->uio_procp->p_stats->p_ru.ru_msgrcv++; #endif + KASSERT(m == so->so_rcv.sb_mb); + SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); nextrecord = m->m_nextpkt; if (pr->pr_flags & PR_ADDR) { #ifdef DIAGNOSTIC @@ -670,6 +972,7 @@ soreceive(struct socket *so, struct mbuf m = m->m_next; } else { sbfree(&so->so_rcv, m); + mbuf_removed = 1; if (paddr) { *paddr = m; so->so_rcv.sb_mb = m->m_next; @@ -688,6 +991,7 @@ soreceive(struct socket *so, struct mbuf m = m->m_next; } else { sbfree(&so->so_rcv, m); + mbuf_removed = 1; if (controlp) { if (pr->pr_domain->dom_externalize && mtod(m, struct cmsghdr *)->cmsg_type == @@ -707,13 +1011,39 @@ soreceive(struct socket *so, struct mbuf controlp = &(*controlp)->m_next; } } + + /* + * If m is non-NULL, we have some data to read. From now on, + * make sure to keep sb_lastrecord consistent when working on + * the last packet on the chain (nextrecord == NULL) and we + * change m->m_nextpkt. + */ if (m) { - if ((flags & MSG_PEEK) == 0) + if ((flags & MSG_PEEK) == 0) { m->m_nextpkt = nextrecord; + /* + * If nextrecord == NULL (this is a single chain), + * then sb_lastrecord may not be valid here if m + * was changed earlier. + */ + if (nextrecord == NULL) { + KASSERT(so->so_rcv.sb_mb == m); + so->so_rcv.sb_lastrecord = m; + } + } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; + } else { + if ((flags & MSG_PEEK) == 0) { + KASSERT(so->so_rcv.sb_mb == m); + so->so_rcv.sb_mb = nextrecord; + SB_EMPTY_FIXUP(&so->so_rcv); + } } + SBLASTRECORDCHK(&so->so_rcv, "soreceive 2"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive 2"); + moff = 0; offset = 0; while (m && uio->uio_resid > 0 && error == 0) { @@ -741,11 +1071,29 @@ soreceive(struct socket *so, struct mbuf * block interrupts again. */ if (mp == 0) { + SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); splx(s); error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio); s = splsoftnet(); - if (error) + if (error) { + /* + * If any part of the record has been removed + * (such as the MT_SONAME mbuf, which will + * happen when PR_ADDR, and thus also + * PR_ATOMIC, is set), then drop the entire + * record to maintain the atomicity of the + * receive operation. + * + * This avoids a later panic("receive 1a") + * when compiled with DIAGNOSTIC. + */ + if (m && mbuf_removed + && (pr->pr_flags & PR_ATOMIC)) + (void) sbdroprecord(&so->so_rcv); + goto release; + } } else uio->uio_resid -= len; if (len == m->m_len - moff) { @@ -766,8 +1114,21 @@ soreceive(struct socket *so, struct mbuf MFREE(m, so->so_rcv.sb_mb); m = so->so_rcv.sb_mb; } - if (m) + /* + * If m != NULL, we also know that + * so->so_rcv.sb_mb != NULL. + */ + KASSERT(so->so_rcv.sb_mb == m); + if (m) { m->m_nextpkt = nextrecord; + if (nextrecord == NULL) + so->so_rcv.sb_lastrecord = m; + } else { + so->so_rcv.sb_mb = nextrecord; + SB_EMPTY_FIXUP(&so->so_rcv); + } + SBLASTRECORDCHK(&so->so_rcv, "soreceive 3"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive 3"); } } else { if (flags & MSG_PEEK) @@ -806,6 +1167,25 @@ soreceive(struct socket *so, struct mbuf !sosendallatonce(so) && !nextrecord) { if (so->so_error || so->so_state & SS_CANTRCVMORE) break; + /* + * If we are peeking and the socket receive buffer is + * full, stop since we can't get more data to peek at. + */ + if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0) + break; + /* + * If we've drained the socket buffer, tell the + * protocol in case it needs to do something to + * get it filled again. + */ + if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb) + (*pr->pr_usrreq)(so, PRU_RCVD, + (struct mbuf *)0, + (struct mbuf *)(long)flags, + (struct mbuf *)0, + (struct proc *)0); + SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); error = sbwait(&so->so_rcv); if (error) { sbunlock(&so->so_rcv); @@ -823,8 +1203,21 @@ soreceive(struct socket *so, struct mbuf (void) sbdroprecord(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { - if (m == 0) + if (m == 0) { + /* + * First part is an inline SB_EMPTY_FIXUP(). Second + * part makes sure sb_lastrecord is up-to-date if + * there is still data in the socket buffer. + */ so->so_rcv.sb_mb = nextrecord; + if (so->so_rcv.sb_mb == NULL) { + so->so_rcv.sb_mbtail = NULL; + so->so_rcv.sb_lastrecord = NULL; + } else if (nextrecord->m_nextpkt == NULL) + so->so_rcv.sb_lastrecord = nextrecord; + } + SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); + SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, (struct mbuf *)(long)flags, (struct mbuf *)0, @@ -990,11 +1383,13 @@ sosetopt(struct socket *so, int level, i goto bad; } tv = mtod(m, struct timeval *); - if (tv->tv_sec * hz + tv->tv_usec / tick > SHRT_MAX) { + if (tv->tv_sec > (SHRT_MAX - tv->tv_usec / tick) / hz) { error = EDOM; goto bad; } val = tv->tv_sec * hz + tv->tv_usec / tick; + if (val == 0 && tv->tv_usec != 0) + val = 1; switch (optname) { @@ -1119,42 +1514,14 @@ sohasoutofband(struct socket *so) selwakeup(&so->so_rcv.sb_sel); } - -int -soo_kqfilter(struct file *fp, struct knote *kn) -{ - struct socket *so; - struct sockbuf *sb; - - so = (struct socket *)kn->kn_fp->f_data; - switch (kn->kn_filter) { - case EVFILT_READ: - if (so->so_options & SO_ACCEPTCONN) - kn->kn_fop = &solisten_filtops; - else - kn->kn_fop = &soread_filtops; - sb = &so->so_rcv; - break; - case EVFILT_WRITE: - kn->kn_fop = &sowrite_filtops; - sb = &so->so_snd; - break; - default: - return (1); - } - SLIST_INSERT_HEAD(&sb->sb_sel.si_klist, kn, kn_selnext); - sb->sb_flags |= SB_KNOTE; - return (0); -} - static void filt_sordetach(struct knote *kn) { struct socket *so; so = (struct socket *)kn->kn_fp->f_data; - SLIST_REMOVE(&so->so_rcv.sb_sel.si_klist, kn, knote, kn_selnext); - if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_klist)) + SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext); + if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist)) so->so_rcv.sb_flags &= ~SB_KNOTE; } @@ -1184,8 +1551,8 @@ filt_sowdetach(struct knote *kn) struct socket *so; so = (struct socket *)kn->kn_fp->f_data; - SLIST_REMOVE(&so->so_snd.sb_sel.si_klist, kn, knote, kn_selnext); - if (SLIST_EMPTY(&so->so_snd.sb_sel.si_klist)) + SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext); + if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist)) so->so_snd.sb_flags &= ~SB_KNOTE; } @@ -1227,3 +1594,38 @@ filt_solisten(struct knote *kn, long hin kn->kn_data = so->so_qlen; return (kn->kn_data > 0); } + +static const struct filterops solisten_filtops = + { 1, NULL, filt_sordetach, filt_solisten }; +static const struct filterops soread_filtops = + { 1, NULL, filt_sordetach, filt_soread }; +static const struct filterops sowrite_filtops = + { 1, NULL, filt_sowdetach, filt_sowrite }; + +int +soo_kqfilter(struct file *fp, struct knote *kn) +{ + struct socket *so; + struct sockbuf *sb; + + so = (struct socket *)kn->kn_fp->f_data; + switch (kn->kn_filter) { + case EVFILT_READ: + if (so->so_options & SO_ACCEPTCONN) + kn->kn_fop = &solisten_filtops; + else + kn->kn_fop = &soread_filtops; + sb = &so->so_rcv; + break; + case EVFILT_WRITE: + kn->kn_fop = &sowrite_filtops; + sb = &so->so_snd; + break; + default: + return (1); + } + SLIST_INSERT_HEAD(&sb->sb_sel.sel_klist, kn, kn_selnext); + sb->sb_flags |= SB_KNOTE; + return (0); +} +