Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. =================================================================== RCS file: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v rcsdiff: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v: warning: Unknown phrases like `commitid ...;' are present. retrieving revision 1.144.4.3 retrieving revision 1.167.2.2 diff -u -p -r1.144.4.3 -r1.167.2.2 --- src/sys/kern/uipc_socket.c 2007/12/27 00:46:16 1.144.4.3 +++ src/sys/kern/uipc_socket.c 2008/07/28 14:37:36 1.167.2.2 @@ -1,7 +1,7 @@ -/* $NetBSD: uipc_socket.c,v 1.144.4.3 2007/12/27 00:46:16 mjf Exp $ */ +/* $NetBSD: uipc_socket.c,v 1.167.2.2 2008/07/28 14:37:36 simonb Exp $ */ /*- - * Copyright (c) 2002, 2007 The NetBSD Foundation, Inc. + * Copyright (c) 2002, 2007, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -15,13 +15,6 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the NetBSD - * Foundation, Inc. and its contributors. - * 4. Neither the name of The NetBSD Foundation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED @@ -37,6 +30,8 @@ */ /* + * Copyright (c) 2004 The FreeBSD Foundation + * Copyright (c) 2004 Robert Watson * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. 
* @@ -68,12 +63,13 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.144.4.3 2007/12/27 00:46:16 mjf Exp $"); +__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.167.2.2 2008/07/28 14:37:36 simonb Exp $"); #include "opt_sock_counters.h" #include "opt_sosend_loan.h" #include "opt_mbuftrace.h" #include "opt_somaxkva.h" +#include "opt_multiprocessor.h" /* XXX */ #include #include @@ -89,7 +85,6 @@ __KERNEL_RCSID(0, "$NetBSD: uipc_socket. #include #include #include -#include #include #include #include @@ -98,9 +93,6 @@ __KERNEL_RCSID(0, "$NetBSD: uipc_socket. #include -POOL_INIT(socket_pool, sizeof(struct socket), 0, 0, 0, "sockpl", NULL, - IPL_SOFTNET); - MALLOC_DEFINE(M_SOOPTS, "soopts", "socket options"); MALLOC_DEFINE(M_SONAME, "soname", "socket name"); @@ -108,6 +100,7 @@ extern const struct fileops socketops; extern int somaxconn; /* patchable (XXX sysctl) */ int somaxconn = SOMAXCONN; +kmutex_t *softnet_lock; #ifdef SOSEND_COUNTERS #include @@ -135,7 +128,7 @@ EVCNT_ATTACH_STATIC(sosend_kvalimit); static struct callback_entry sokva_reclaimerentry; -#ifdef SOSEND_NO_LOAN +#if defined(SOSEND_NO_LOAN) || defined(MULTIPROCESSOR) int sock_loan_thresh = -1; #else int sock_loan_thresh = 4096; @@ -253,26 +246,17 @@ sokvafree(vaddr_t sva, vsize_t len) static void sodoloanfree(struct vm_page **pgs, void *buf, size_t size) { - vaddr_t va, sva, eva; + vaddr_t sva, eva; vsize_t len; - paddr_t pa; - int i, npgs; + int npgs; + + KASSERT(pgs != NULL); eva = round_page((vaddr_t) buf + size); sva = trunc_page((vaddr_t) buf); len = eva - sva; npgs = len >> PAGE_SHIFT; - if (__predict_false(pgs == NULL)) { - pgs = alloca(npgs * sizeof(*pgs)); - - for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) { - if (pmap_extract(pmap_kernel(), va, &pa) == false) - panic("sodoloanfree: va 0x%lx not mapped", va); - pgs[i] = PHYS_TO_VM_PAGE(pa); - } - } - pmap_kremove(sva, len); pmap_update(pmap_kernel()); uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE); @@ -280,10 +264,13 @@ sodoloanfree(struct 
vm_page **pgs, void } static size_t -sodopendfree() +sodopendfree(void) { size_t rv; + if (__predict_true(so_pendfree == NULL)) + return 0; + mutex_enter(&so_pendfree_lock); rv = sodopendfreel(); mutex_exit(&so_pendfree_lock); @@ -299,7 +286,7 @@ sodopendfree() */ static size_t -sodopendfreel() +sodopendfreel(void) { struct mbuf *m, *next; size_t rv = 0; @@ -313,10 +300,11 @@ sodopendfreel() for (; m != NULL; m = next) { next = m->m_next; + KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) == 0); + KASSERT(m->m_ext.ext_refcnt == 0); rv += m->m_ext.ext_size; - sodoloanfree((m->m_flags & M_EXT_PAGES) ? - m->m_ext.ext_pgs : NULL, m->m_ext.ext_buf, + sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf, m->m_ext.ext_size); pool_cache_put(mb_cache, m); } @@ -331,15 +319,7 @@ void soloanfree(struct mbuf *m, void *buf, size_t size, void *arg) { - if (m == NULL) { - - /* - * called from MEXTREMOVE. - */ - - sodoloanfree(NULL, buf, size); - return; - } + KASSERT(m != NULL); /* * postpone freeing mbuf. @@ -361,8 +341,10 @@ sosend_loan(struct socket *so, struct ui struct iovec *iov = uio->uio_iov; vaddr_t sva, eva; vsize_t len; - vaddr_t lva, va; - int npgs, i, error; + vaddr_t lva; + int npgs, error; + vaddr_t va; + int i; if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) return (0); @@ -377,7 +359,6 @@ sosend_loan(struct socket *so, struct ui len = eva - sva; npgs = len >> PAGE_SHIFT; - /* XXX KDASSERT */ KASSERT(npgs <= M_EXT_MAXPAGES); lva = sokvaalloc(len, so); @@ -453,7 +434,9 @@ soinit(void) { mutex_init(&so_pendfree_lock, MUTEX_DEFAULT, IPL_VM); + softnet_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); cv_init(&socurkva_cv, "sokva"); + soinit2(); /* Set the initial adjusted socket buffer size. 
*/ if (sb_max_set(sb_max)) @@ -472,12 +455,14 @@ soinit(void) */ /*ARGSUSED*/ int -socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l) +socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l, + struct socket *lockso) { const struct protosw *prp; struct socket *so; uid_t uid; - int error, s; + int error; + kmutex_t *lock; error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET, KAUTH_REQ_NETWORK_SOCKET_OPEN, KAUTH_ARG(dom), KAUTH_ARG(type), @@ -502,11 +487,8 @@ socreate(int dom, struct socket **aso, i return EPROTONOSUPPORT; if (prp->pr_type != type) return EPROTOTYPE; - s = splsoftnet(); - so = pool_get(&socket_pool, PR_WAITOK); - memset(so, 0, sizeof(*so)); - TAILQ_INIT(&so->so_q0); - TAILQ_INIT(&so->so_q); + + so = soget(true); so->so_type = type; so->so_proto = prp; so->so_send = sosend; @@ -516,19 +498,28 @@ socreate(int dom, struct socket **aso, i so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner; so->so_mowner = &prp->pr_domain->dom_mowner; #endif - selinit(&so->so_rcv.sb_sel); - selinit(&so->so_snd.sb_sel); uid = kauth_cred_geteuid(l->l_cred); so->so_uidinfo = uid_find(uid); + so->so_egid = kauth_cred_getegid(l->l_cred); + so->so_cpid = l->l_proc->p_pid; + if (lockso != NULL) { + /* Caller wants us to share a lock. */ + lock = lockso->so_lock; + so->so_lock = lock; + mutex_obj_hold(lock); + mutex_enter(lock); + } else { + /* Lock assigned and taken during PRU_ATTACH. 
*/ + } error = (*prp->pr_usrreq)(so, PRU_ATTACH, NULL, (struct mbuf *)(long)proto, NULL, l); + KASSERT(solocked(so)); if (error != 0) { so->so_state |= SS_NOFDREF; sofree(so); - splx(s); return error; } - splx(s); + sounlock(so); *aso = so; return 0; } @@ -540,29 +531,23 @@ int fsocreate(int domain, struct socket **sop, int type, int protocol, struct lwp *l, int *fdout) { - struct filedesc *fdp; struct socket *so; struct file *fp; int fd, error; - fdp = l->l_proc->p_fd; - /* falloc() will use the desciptor for us */ - if ((error = falloc(l, &fp, &fd)) != 0) + if ((error = fd_allocfile(&fp, &fd)) != 0) return (error); fp->f_flag = FREAD|FWRITE; fp->f_type = DTYPE_SOCKET; fp->f_ops = &socketops; - error = socreate(domain, &so, type, protocol, l); + error = socreate(domain, &so, type, protocol, l, NULL); if (error != 0) { - FILE_UNUSE(fp, l); - fdremove(fdp, fd); - ffree(fp); + fd_abort(curproc, fp, fd); } else { if (sop != NULL) *sop = so; fp->f_data = so; - FILE_SET_MATURE(fp); - FILE_UNUSE(fp, l); + fd_affix(curproc, fp, fd); *fdout = fd; } return error; @@ -571,24 +556,29 @@ fsocreate(int domain, struct socket **so int sobind(struct socket *so, struct mbuf *nam, struct lwp *l) { - int s, error; + int error; - s = splsoftnet(); + solock(so); error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL, l); - splx(s); + sounlock(so); return error; } int solisten(struct socket *so, int backlog, struct lwp *l) { - int s, error; + int error; - s = splsoftnet(); + solock(so); + if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | + SS_ISDISCONNECTING)) != 0) { + sounlock(so); + return (EOPNOTSUPP); + } error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL, l); if (error != 0) { - splx(s); + sounlock(so); return error; } if (TAILQ_EMPTY(&so->so_q)) @@ -596,24 +586,31 @@ solisten(struct socket *so, int backlog, if (backlog < 0) backlog = 0; so->so_qlimit = min(backlog, somaxconn); - splx(s); + sounlock(so); return 0; } void sofree(struct socket *so) { 
+ u_int refs; + + KASSERT(solocked(so)); - if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) + if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) { + sounlock(so); return; + } if (so->so_head) { /* * We must not decommission a socket that's on the accept(2) * queue. If we do, then accept(2) may hang after select(2) * indicated that the listening socket was ready. */ - if (!soqremque(so, 0)) + if (!soqremque(so, 0)) { + sounlock(so); return; + } } if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_uidinfo, &so->so_rcv.sb_hiwat, 0, @@ -622,10 +619,14 @@ sofree(struct socket *so) (void)chgsbsize(so->so_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); sbrelease(&so->so_snd, so); + KASSERT(!cv_has_waiters(&so->so_cv)); + KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv)); + KASSERT(!cv_has_waiters(&so->so_snd.sb_cv)); sorflush(so); - seldestroy(&so->so_rcv.sb_sel); - seldestroy(&so->so_snd.sb_sel); - pool_put(&socket_pool, so); + refs = so->so_aborting; /* XXX */ + sounlock(so); + if (refs == 0) /* XXX */ + soput(so); } /* @@ -637,19 +638,28 @@ int soclose(struct socket *so) { struct socket *so2; - int s, error; + int error; + int error2; error = 0; - s = splsoftnet(); /* conservative */ + solock(so); if (so->so_options & SO_ACCEPTCONN) { - while ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) { - (void) soqremque(so2, 0); - (void) soabort(so2); - } - while ((so2 = TAILQ_FIRST(&so->so_q)) != 0) { - (void) soqremque(so2, 1); - (void) soabort(so2); - } + do { + while ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) { + KASSERT(solocked2(so, so2)); + (void) soqremque(so2, 0); + /* soabort drops the lock. */ + (void) soabort(so2); + solock(so); + } + while ((so2 = TAILQ_FIRST(&so->so_q)) != 0) { + KASSERT(solocked2(so, so2)); + (void) soqremque(so2, 1); + /* soabort drops the lock. 
*/ + (void) soabort(so2); + solock(so); + } + } while (!TAILQ_EMPTY(&so->so_q0)); } if (so->so_pcb == 0) goto discard; @@ -660,13 +670,10 @@ soclose(struct socket *so) goto drop; } if (so->so_options & SO_LINGER) { - if ((so->so_state & SS_ISDISCONNECTING) && - (so->so_state & SS_NBIO)) + if ((so->so_state & SS_ISDISCONNECTING) && so->so_nbio) goto drop; while (so->so_state & SS_ISCONNECTED) { - error = tsleep((void *)&so->so_timeo, - PSOCK | PCATCH, netcls, - so->so_linger * hz); + error = sowait(so, so->so_linger * hz); if (error) break; } @@ -674,7 +681,7 @@ soclose(struct socket *so) } drop: if (so->so_pcb) { - int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH, + error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH, NULL, NULL, NULL, NULL); if (error == 0) error = error2; @@ -684,23 +691,29 @@ soclose(struct socket *so) panic("soclose: NOFDREF"); so->so_state |= SS_NOFDREF; sofree(so); - splx(s); return (error); } /* - * Must be called at splsoftnet... + * Must be called with the socket locked.. Will return with it unlocked. 
*/ int soabort(struct socket *so) { + u_int refs; int error; - + + KASSERT(solocked(so)); KASSERT(so->so_head == NULL); + + so->so_aborting++; /* XXX */ error = (*so->so_proto->pr_usrreq)(so, PRU_ABORT, NULL, NULL, NULL, NULL); - if (error) { + refs = --so->so_aborting; /* XXX */ + if (error || (refs == 0)) { sofree(so); + } else { + sounlock(so); } return error; } @@ -708,10 +721,11 @@ soabort(struct socket *so) int soaccept(struct socket *so, struct mbuf *nam) { - int s, error; + int error; + + KASSERT(solocked(so)); error = 0; - s = splsoftnet(); if ((so->so_state & SS_NOFDREF) == 0) panic("soaccept: !NOFDREF"); so->so_state &= ~SS_NOFDREF; @@ -722,18 +736,18 @@ soaccept(struct socket *so, struct mbuf else error = ECONNABORTED; - splx(s); return (error); } int soconnect(struct socket *so, struct mbuf *nam, struct lwp *l) { - int s, error; + int error; + + KASSERT(solocked(so)); if (so->so_options & SO_ACCEPTCONN) return (EOPNOTSUPP); - s = splsoftnet(); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. 
@@ -747,40 +761,36 @@ soconnect(struct socket *so, struct mbuf else error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT, NULL, nam, NULL, l); - splx(s); return (error); } int soconnect2(struct socket *so1, struct socket *so2) { - int s, error; + int error; + + KASSERT(solocked2(so1, so2)); - s = splsoftnet(); error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL, (struct mbuf *)so2, NULL, NULL); - splx(s); return (error); } int sodisconnect(struct socket *so) { - int s, error; + int error; + + KASSERT(solocked(so)); - s = splsoftnet(); if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; - goto bad; - } - if (so->so_state & SS_ISDISCONNECTING) { + } else if (so->so_state & SS_ISDISCONNECTING) { error = EALREADY; - goto bad; + } else { + error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, + NULL, NULL, NULL, NULL); } - error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, - NULL, NULL, NULL, NULL); - bad: - splx(s); sodopendfree(); return (error); } @@ -814,8 +824,15 @@ sosend(struct socket *so, struct mbuf *a p = l->l_proc; sodopendfree(); - clen = 0; + + /* + * solock() provides atomicity of access. splsoftnet() prevents + * protocol processing soft interrupts from interrupting us and + * blocking (expensive). 
+ */ + s = splsoftnet(); + solock(so); atomic = sosendallatonce(so) || top; if (uio) resid = uio->uio_resid; @@ -835,51 +852,54 @@ sosend(struct socket *so, struct mbuf *a dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); - if (p) - p->p_stats->p_ru.ru_msgsnd++; + l->l_ru.ru_msgsnd++; if (control) clen = control->m_len; -#define snderr(errno) { error = errno; splx(s); goto release; } - restart: if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0) goto out; do { - s = splsoftnet(); - if (so->so_state & SS_CANTSENDMORE) - snderr(EPIPE); + if (so->so_state & SS_CANTSENDMORE) { + error = EPIPE; + goto release; + } if (so->so_error) { error = so->so_error; so->so_error = 0; - splx(s); goto release; } if ((so->so_state & SS_ISCONNECTED) == 0) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if ((so->so_state & SS_ISCONFIRMING) == 0 && - !(resid == 0 && clen != 0)) - snderr(ENOTCONN); - } else if (addr == 0) - snderr(EDESTADDRREQ); + !(resid == 0 && clen != 0)) { + error = ENOTCONN; + goto release; + } + } else if (addr == 0) { + error = EDESTADDRREQ; + goto release; + } } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat) || - clen > so->so_snd.sb_hiwat) - snderr(EMSGSIZE); + clen > so->so_snd.sb_hiwat) { + error = EMSGSIZE; + goto release; + } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { - if (so->so_state & SS_NBIO) - snderr(EWOULDBLOCK); + if (so->so_nbio) { + error = EWOULDBLOCK; + goto release; + } sbunlock(&so->so_snd); error = sbwait(&so->so_snd); - splx(s); if (error) goto out; goto restart; } - splx(s); mp = &top; space -= clen; do { @@ -891,6 +911,8 @@ sosend(struct socket *so, struct mbuf *a if (flags & MSG_EOR) top->m_flags |= M_EOR; } else do { + sounlock(so); + splx(s); if (top == NULL) { m = m_gethdr(M_WAIT, MT_DATA); mlen = MHLEN; @@ -941,6 +963,8 @@ sosend(struct socket *so, struct mbuf 
*a m->m_len = len; *mp = m; top->m_pkthdr.len += len; + s = splsoftnet(); + solock(so); if (error != 0) goto release; mp = &m->m_next; @@ -951,24 +975,21 @@ sosend(struct socket *so, struct mbuf *a } } while (space > 0 && atomic); - s = splsoftnet(); - - if (so->so_state & SS_CANTSENDMORE) - snderr(EPIPE); - + if (so->so_state & SS_CANTSENDMORE) { + error = EPIPE; + goto release; + } if (dontroute) so->so_options |= SO_DONTROUTE; if (resid > 0) so->so_state |= SS_MORETOCOME; error = (*so->so_proto->pr_usrreq)(so, (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND, - top, addr, control, curlwp); /* XXX */ + top, addr, control, curlwp); if (dontroute) so->so_options &= ~SO_DONTROUTE; if (resid > 0) so->so_state &= ~SS_MORETOCOME; - splx(s); - clen = 0; control = NULL; top = NULL; @@ -981,6 +1002,8 @@ sosend(struct socket *so, struct mbuf *a release: sbunlock(&so->so_snd); out: + sounlock(so); + splx(s); if (top) m_freem(top); if (control) @@ -989,6 +1012,43 @@ sosend(struct socket *so, struct mbuf *a } /* + * Following replacement or removal of the first mbuf on the first + * mbuf chain of a socket buffer, push necessary state changes back + * into the socket buffer so that other consumers see the values + * consistently. 'nextrecord' is the callers locally stored value of + * the original value of sb->sb_mb->m_nextpkt which must be restored + * when the lead mbuf changes. NOTE: 'nextrecord' may be NULL. + */ +static void +sbsync(struct sockbuf *sb, struct mbuf *nextrecord) +{ + + KASSERT(solocked(sb->sb_so)); + + /* + * First, update for the new value of nextrecord. If necessary, + * make it the first record. + */ + if (sb->sb_mb != NULL) + sb->sb_mb->m_nextpkt = nextrecord; + else + sb->sb_mb = nextrecord; + + /* + * Now update any dependent socket buffer fields to reflect + * the new state. This is an inline of SB_EMPTY_FIXUP, with + * the addition of a second clause that takes care of the + * case where sb_mb has been updated, but remains the last + * record. 
+ */ + if (sb->sb_mb == NULL) { + sb->sb_mbtail = NULL; + sb->sb_lastrecord = NULL; + } else if (sb->sb_mb->m_nextpkt == NULL) + sb->sb_lastrecord = sb->sb_mb; +} + +/* * Implement receive operations on a socket. * We depend on the way that records are added to the sockbuf * by sbappend*. In particular, each record (mbufs linked through m_next) @@ -1009,7 +1069,7 @@ soreceive(struct socket *so, struct mbuf struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct lwp *l = curlwp; - struct mbuf *m, **mp; + struct mbuf *m, **mp, *mt; int atomic, flags, len, error, s, offset, moff, type, orig_resid; const struct protosw *pr; struct mbuf *nextrecord; @@ -1037,8 +1097,10 @@ soreceive(struct socket *so, struct mbuf if (flags & MSG_OOB) { m = m_get(M_WAIT, MT_DATA); + solock(so); error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m, (struct mbuf *)(long)(flags & MSG_PEEK), NULL, l); + sounlock(so); if (error) goto bad; do { @@ -1053,13 +1115,23 @@ soreceive(struct socket *so, struct mbuf } if (mp != NULL) *mp = NULL; + + /* + * solock() provides atomicity of access. splsoftnet() prevents + * protocol processing soft interrupts from interrupting us and + * blocking (expensive). 
+ */ + s = splsoftnet(); + solock(so); if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) (*pr->pr_usrreq)(so, PRU_RCVD, NULL, NULL, NULL, l); restart: - if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) + if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) { + sounlock(so); + splx(s); return error; - s = splsoftnet(); + } m = so->so_rcv.sb_mb; /* @@ -1110,7 +1182,7 @@ soreceive(struct socket *so, struct mbuf } if (uio->uio_resid == 0) goto release; - if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) { + if (so->so_nbio || (flags & MSG_DONTWAIT)) { error = EWOULDBLOCK; goto release; } @@ -1118,19 +1190,32 @@ soreceive(struct socket *so, struct mbuf SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); sbunlock(&so->so_rcv); error = sbwait(&so->so_rcv); - splx(s); - if (error != 0) + if (error != 0) { + sounlock(so); + splx(s); return error; + } goto restart; } dontblock: /* * On entry here, m points to the first record of the socket buffer. - * While we process the initial mbufs containing address and control - * info, we save a copy of m->m_nextpkt into nextrecord. + * From this point onward, we maintain 'nextrecord' as a cache of the + * pointer to the next record in the socket buffer. We must keep the + * various socket buffer pointers and local stack versions of the + * pointers in sync, pushing out modifications before dropping the + * socket lock, and re-reading them when picking it up. + * + * Otherwise, we will race with the network stack appending new data + * or records onto the socket buffer by using inconsistent/stale + * versions of the field, possibly resulting in socket buffer + * corruption. + * + * By holding the high-level sblock(), we prevent simultaneous + * readers from pulling off the front of the socket buffer. 
*/ if (l != NULL) - l->l_proc->p_stats->p_ru.ru_msgrcv++; + l->l_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb); SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); @@ -1157,71 +1242,80 @@ soreceive(struct socket *so, struct mbuf MFREE(m, so->so_rcv.sb_mb); m = so->so_rcv.sb_mb; } + sbsync(&so->so_rcv, nextrecord); } } - while (m != NULL && m->m_type == MT_CONTROL && error == 0) { - if (flags & MSG_PEEK) { - if (controlp != NULL) - *controlp = m_copy(m, 0, m->m_len); - m = m->m_next; - } else { - sbfree(&so->so_rcv, m); - mbuf_removed = 1; - if (controlp != NULL) { - if (dom->dom_externalize && l && - mtod(m, struct cmsghdr *)->cmsg_type == - SCM_RIGHTS) - error = (*dom->dom_externalize)(m, l); - *controlp = m; + + /* + * Process one or more MT_CONTROL mbufs present before any data mbufs + * in the first mbuf chain on the socket buffer. If MSG_PEEK, we + * just copy the data; if !MSG_PEEK, we call into the protocol to + * perform externalization (or freeing if controlp == NULL). 
+ */ + if (__predict_false(m != NULL && m->m_type == MT_CONTROL)) { + struct mbuf *cm = NULL, *cmn; + struct mbuf **cme = &cm; + + do { + if (flags & MSG_PEEK) { + if (controlp != NULL) { + *controlp = m_copy(m, 0, m->m_len); + controlp = &(*controlp)->m_next; + } + m = m->m_next; + } else { + sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m->m_next; m->m_next = NULL; + *cme = m; + cme = &(*cme)->m_next; m = so->so_rcv.sb_mb; + } + } while (m != NULL && m->m_type == MT_CONTROL); + if ((flags & MSG_PEEK) == 0) + sbsync(&so->so_rcv, nextrecord); + for (; cm != NULL; cm = cmn) { + cmn = cm->m_next; + cm->m_next = NULL; + type = mtod(cm, struct cmsghdr *)->cmsg_type; + if (controlp != NULL) { + if (dom->dom_externalize != NULL && + type == SCM_RIGHTS) { + sounlock(so); + splx(s); + error = (*dom->dom_externalize)(cm, l); + s = splsoftnet(); + solock(so); + } + *controlp = cm; + while (*controlp != NULL) + controlp = &(*controlp)->m_next; } else { /* * Dispose of any SCM_RIGHTS message that went * through the read path rather than recv. */ - if (dom->dom_dispose && - mtod(m, struct cmsghdr *)->cmsg_type == SCM_RIGHTS) - (*dom->dom_dispose)(m); - MFREE(m, so->so_rcv.sb_mb); - m = so->so_rcv.sb_mb; + if (dom->dom_dispose != NULL && + type == SCM_RIGHTS) { + sounlock(so); + (*dom->dom_dispose)(cm); + solock(so); + } + m_freem(cm); } } - if (controlp != NULL) { - orig_resid = 0; - controlp = &(*controlp)->m_next; - } + if (m != NULL) + nextrecord = so->so_rcv.sb_mb->m_nextpkt; + else + nextrecord = so->so_rcv.sb_mb; + orig_resid = 0; } - /* - * If m is non-NULL, we have some data to read. From now on, - * make sure to keep sb_lastrecord consistent when working on - * the last packet on the chain (nextrecord == NULL) and we - * change m->m_nextpkt. - */ - if (m != NULL) { - if ((flags & MSG_PEEK) == 0) { - m->m_nextpkt = nextrecord; - /* - * If nextrecord == NULL (this is a single chain), - * then sb_lastrecord may not be valid here if m - * was changed earlier. 
- */ - if (nextrecord == NULL) { - KASSERT(so->so_rcv.sb_mb == m); - so->so_rcv.sb_lastrecord = m; - } - } + /* If m is non-NULL, we have some data to read. */ + if (__predict_true(m != NULL)) { type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; - } else { - if ((flags & MSG_PEEK) == 0) { - KASSERT(so->so_rcv.sb_mb == m); - so->so_rcv.sb_mb = nextrecord; - SB_EMPTY_FIXUP(&so->so_rcv); - } } SBLASTRECORDCHK(&so->so_rcv, "soreceive 2"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 2"); @@ -1255,9 +1349,11 @@ soreceive(struct socket *so, struct mbuf if (mp == NULL) { SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); + sounlock(so); splx(s); error = uiomove(mtod(m, char *) + moff, (int)len, uio); s = splsoftnet(); + solock(so); if (error != 0) { /* * If any part of the record has been removed @@ -1314,8 +1410,15 @@ soreceive(struct socket *so, struct mbuf } else if (flags & MSG_PEEK) moff += len; else { - if (mp != NULL) - *mp = m_copym(m, 0, len, M_WAIT); + if (mp != NULL) { + mt = m_copym(m, 0, len, M_NOWAIT); + if (__predict_false(mt == NULL)) { + sounlock(so); + mt = m_copym(m, 0, len, M_WAIT); + solock(so); + } + *mp = mt; + } m->m_data += len; m->m_len -= len; so->so_rcv.sb_cc -= len; @@ -1365,6 +1468,7 @@ soreceive(struct socket *so, struct mbuf error = sbwait(&so->so_rcv); if (error != 0) { sbunlock(&so->so_rcv); + sounlock(so); splx(s); return 0; } @@ -1401,7 +1505,6 @@ soreceive(struct socket *so, struct mbuf if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { sbunlock(&so->so_rcv); - splx(s); goto restart; } @@ -1409,6 +1512,7 @@ soreceive(struct socket *so, struct mbuf *flagsp |= flags; release: sbunlock(&so->so_rcv); + sounlock(so); splx(s); return error; } @@ -1417,17 +1521,23 @@ int soshutdown(struct socket *so, int how) { const struct protosw *pr; + int error; + + KASSERT(solocked(so)); pr = so->so_proto; if (!(how == SHUT_RD || how == 
SHUT_WR || how == SHUT_RDWR)) return (EINVAL); - if (how == SHUT_RD || how == SHUT_RDWR) + if (how == SHUT_RD || how == SHUT_RDWR) { sorflush(so); + error = 0; + } if (how == SHUT_WR || how == SHUT_RDWR) - return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL, + error = (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL, NULL, NULL, NULL); - return 0; + + return error; } void @@ -1435,14 +1545,14 @@ sorflush(struct socket *so) { struct sockbuf *sb, asb; const struct protosw *pr; - int s; + + KASSERT(solocked(so)); sb = &so->so_rcv; pr = so->so_proto; - sb->sb_flags |= SB_NOINTR; - (void) sblock(sb, M_WAITOK); - s = splnet(); socantrcvmore(so); + sb->sb_flags |= SB_NOINTR; + (void )sblock(sb, M_WAITOK); sbunlock(sb); asb = *sb; /* @@ -1451,9 +1561,11 @@ sorflush(struct socket *so) */ memset(&sb->sb_startzero, 0, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); - splx(s); - if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) + if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) { + sounlock(so); (*pr->pr_domain->dom_dispose)(asb.sb_mb); + solock(so); + } sbrelease(&asb, so); } @@ -1574,6 +1686,7 @@ sosetopt(struct socket *so, int level, i { int error, prerr; + solock(so); if (level == SOL_SOCKET) error = sosetopt1(so, level, optname, m); else @@ -1590,6 +1703,7 @@ sosetopt(struct socket *so, int level, i error = prerr; } else if (m != NULL) (void)m_free(m); + sounlock(so); return error; } @@ -1597,13 +1711,15 @@ int sogetopt(struct socket *so, int level, int optname, struct mbuf **mp) { struct mbuf *m; + int error; + solock(so); if (level != SOL_SOCKET) { if (so->so_proto && so->so_proto->pr_ctloutput) { - return ((*so->so_proto->pr_ctloutput) + error = ((*so->so_proto->pr_ctloutput) (PRCO_GETOPT, so, level, optname, mp)); } else - return (ENOPROTOOPT); + error = (ENOPROTOOPT); } else { m = m_get(M_WAIT, MT_SOOPTS); m->m_len = sizeof(int); @@ -1672,19 +1788,24 @@ sogetopt(struct socket *so, int level, i break; default: + sounlock(so); (void)m_free(m); return 
(ENOPROTOOPT); } *mp = m; - return (0); + error = 0; } + + sounlock(so); + return (error); } void sohasoutofband(struct socket *so) { + fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so); - selwakeup(&so->so_rcv.sb_sel); + selnotify(&so->so_rcv.sb_sel, POLLPRI | POLLRDBAND, 0); } static void @@ -1692,10 +1813,12 @@ filt_sordetach(struct knote *kn) { struct socket *so; - so = (struct socket *)kn->kn_fp->f_data; + so = ((file_t *)kn->kn_obj)->f_data; + solock(so); SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext); if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist)) so->so_rcv.sb_flags &= ~SB_KNOTE; + sounlock(so); } /*ARGSUSED*/ @@ -1703,19 +1826,25 @@ static int filt_soread(struct knote *kn, long hint) { struct socket *so; + int rv; - so = (struct socket *)kn->kn_fp->f_data; + so = ((file_t *)kn->kn_obj)->f_data; + if (hint != NOTE_SUBMIT) + solock(so); kn->kn_data = so->so_rcv.sb_cc; if (so->so_state & SS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; - return (1); - } - if (so->so_error) /* temporary udp error */ - return (1); - if (kn->kn_sfflags & NOTE_LOWAT) - return (kn->kn_data >= kn->kn_sdata); - return (kn->kn_data >= so->so_rcv.sb_lowat); + rv = 1; + } else if (so->so_error) /* temporary udp error */ + rv = 1; + else if (kn->kn_sfflags & NOTE_LOWAT) + rv = (kn->kn_data >= kn->kn_sdata); + else + rv = (kn->kn_data >= so->so_rcv.sb_lowat); + if (hint != NOTE_SUBMIT) + sounlock(so); + return rv; } static void @@ -1723,10 +1852,12 @@ filt_sowdetach(struct knote *kn) { struct socket *so; - so = (struct socket *)kn->kn_fp->f_data; + so = ((file_t *)kn->kn_obj)->f_data; + solock(so); SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext); if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist)) so->so_snd.sb_flags &= ~SB_KNOTE; + sounlock(so); } /*ARGSUSED*/ @@ -1734,22 +1865,28 @@ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so; + int rv; - so = (struct socket *)kn->kn_fp->f_data; + so = 
((file_t *)kn->kn_obj)->f_data; + if (hint != NOTE_SUBMIT) + solock(so); kn->kn_data = sbspace(&so->so_snd); if (so->so_state & SS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; - return (1); - } - if (so->so_error) /* temporary udp error */ - return (1); - if (((so->so_state & SS_ISCONNECTED) == 0) && + rv = 1; + } else if (so->so_error) /* temporary udp error */ + rv = 1; + else if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) - return (0); - if (kn->kn_sfflags & NOTE_LOWAT) - return (kn->kn_data >= kn->kn_sdata); - return (kn->kn_data >= so->so_snd.sb_lowat); + rv = 0; + else if (kn->kn_sfflags & NOTE_LOWAT) + rv = (kn->kn_data >= kn->kn_sdata); + else + rv = (kn->kn_data >= so->so_snd.sb_lowat); + if (hint != NOTE_SUBMIT) + sounlock(so); + return rv; } /*ARGSUSED*/ @@ -1757,15 +1894,21 @@ static int filt_solisten(struct knote *kn, long hint) { struct socket *so; + int rv; - so = (struct socket *)kn->kn_fp->f_data; + so = ((file_t *)kn->kn_obj)->f_data; /* * Set kn_data to number of incoming connections, not * counting partial (incomplete) connections. 
*/ + if (hint != NOTE_SUBMIT) + solock(so); kn->kn_data = so->so_qlen; - return (kn->kn_data > 0); + rv = (kn->kn_data > 0); + if (hint != NOTE_SUBMIT) + sounlock(so); + return rv; } static const struct filterops solisten_filtops = @@ -1781,7 +1924,8 @@ soo_kqfilter(struct file *fp, struct kno struct socket *so; struct sockbuf *sb; - so = (struct socket *)kn->kn_fp->f_data; + so = ((file_t *)kn->kn_obj)->f_data; + solock(so); switch (kn->kn_filter) { case EVFILT_READ: if (so->so_options & SO_ACCEPTCONN) @@ -1795,13 +1939,70 @@ soo_kqfilter(struct file *fp, struct kno sb = &so->so_snd; break; default: + sounlock(so); return (EINVAL); } SLIST_INSERT_HEAD(&sb->sb_sel.sel_klist, kn, kn_selnext); sb->sb_flags |= SB_KNOTE; + sounlock(so); return (0); } +static int +sodopoll(struct socket *so, int events) +{ + int revents; + + revents = 0; + + if (events & (POLLIN | POLLRDNORM)) + if (soreadable(so)) + revents |= events & (POLLIN | POLLRDNORM); + + if (events & (POLLOUT | POLLWRNORM)) + if (sowritable(so)) + revents |= events & (POLLOUT | POLLWRNORM); + + if (events & (POLLPRI | POLLRDBAND)) + if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) + revents |= events & (POLLPRI | POLLRDBAND); + + return revents; +} + +int +sopoll(struct socket *so, int events) +{ + int revents = 0; + +#ifndef DIAGNOSTIC + /* + * Do a quick, unlocked check in expectation that the socket + * will be ready for I/O. Don't do this check if DIAGNOSTIC, + * as the solocked() assertions will fail. + */ + if ((revents = sodopoll(so, events)) != 0) + return revents; +#endif + + solock(so); + if ((revents = sodopoll(so, events)) == 0) { + if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { + selrecord(curlwp, &so->so_rcv.sb_sel); + so->so_rcv.sb_flags |= SB_NOTIFY; + } + + if (events & (POLLOUT | POLLWRNORM)) { + selrecord(curlwp, &so->so_snd.sb_sel); + so->so_snd.sb_flags |= SB_NOTIFY; + } + } + sounlock(so); + + return revents; +} + + #include static int sysctl_kern_somaxkva(SYSCTLFN_PROTO);