Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. =================================================================== RCS file: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v retrieving revision 1.151.6.5 retrieving revision 1.152 diff -u -p -r1.151.6.5 -r1.152 --- src/sys/kern/uipc_socket.c 2009/01/17 13:29:20 1.151.6.5 +++ src/sys/kern/uipc_socket.c 2008/02/27 19:55:59 1.152 @@ -1,7 +1,7 @@ -/* $NetBSD: uipc_socket.c,v 1.151.6.5 2009/01/17 13:29:20 mjf Exp $ */ +/* $NetBSD: uipc_socket.c,v 1.152 2008/02/27 19:55:59 matt Exp $ */ /*- - * Copyright (c) 2002, 2007, 2008 The NetBSD Foundation, Inc. + * Copyright (c) 2002, 2007 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -15,6 +15,13 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED @@ -30,8 +37,6 @@ */ /* - * Copyright (c) 2004 The FreeBSD Foundation - * Copyright (c) 2004 Robert Watson * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * @@ -63,21 +68,19 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.151.6.5 2009/01/17 13:29:20 mjf Exp $"); +__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.152 2008/02/27 19:55:59 matt Exp $"); -#include "opt_compat_netbsd.h" #include "opt_sock_counters.h" #include "opt_sosend_loan.h" #include "opt_mbuftrace.h" #include "opt_somaxkva.h" -#include "opt_multiprocessor.h" /* XXX */ #include #include #include #include #include -#include +#include #include #include #include @@ -86,21 +89,18 @@ __KERNEL_RCSID(0, "$NetBSD: uipc_socket. #include #include #include -#include +#include #include #include #include #include #include -#ifdef COMPAT_50 -#include -#define SO_OSNDTIMEO 0x1005 -#define SO_ORCVTIMEO 0x1006 -#endif - #include +POOL_INIT(socket_pool, sizeof(struct socket), 0, 0, 0, "sockpl", NULL, + IPL_SOFTNET); + MALLOC_DEFINE(M_SOOPTS, "soopts", "socket options"); MALLOC_DEFINE(M_SONAME, "soname", "socket name"); @@ -108,7 +108,6 @@ extern const struct fileops socketops; extern int somaxconn; /* patchable (XXX sysctl) */ int somaxconn = SOMAXCONN; -kmutex_t *softnet_lock; #ifdef SOSEND_COUNTERS #include @@ -136,7 +135,7 @@ EVCNT_ATTACH_STATIC(sosend_kvalimit); static struct callback_entry sokva_reclaimerentry; -#if defined(SOSEND_NO_LOAN) || defined(MULTIPROCESSOR) +#ifdef SOSEND_NO_LOAN int sock_loan_thresh = -1; #else int sock_loan_thresh = 4096; @@ -157,9 +156,6 @@ static kcondvar_t socurkva_cv; static size_t sodopendfree(void); static size_t sodopendfreel(void); -static void sysctl_kern_somaxkva_setup(void); -static struct sysctllog *socket_sysctllog; - static vsize_t sokvareserve(struct socket *so, vsize_t len) { @@ -257,17 +253,26 @@ sokvafree(vaddr_t sva, vsize_t len) static void sodoloanfree(struct vm_page **pgs, void *buf, size_t size) { - vaddr_t sva, eva; + vaddr_t va, sva, eva; vsize_t len; - int npgs; - - KASSERT(pgs != NULL); + paddr_t pa; + int i, npgs; eva = round_page((vaddr_t) buf + size); sva = trunc_page((vaddr_t) buf); len = eva - sva; npgs = len >> PAGE_SHIFT; + if (__predict_false(pgs == NULL)) { + pgs = alloca(npgs * sizeof(*pgs)); + + for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) { + if (pmap_extract(pmap_kernel(), va, &pa) == false) + panic("sodoloanfree: va 0x%lx not mapped", va); + pgs[i] = PHYS_TO_VM_PAGE(pa); + } + } + pmap_kremove(sva, len); pmap_update(pmap_kernel()); uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE); @@ -279,9 +284,6 @@ sodopendfree(void) { size_t rv; - if (__predict_true(so_pendfree == NULL)) - return 0; - mutex_enter(&so_pendfree_lock); rv = sodopendfreel(); mutex_exit(&so_pendfree_lock); @@ -311,11 +313,10 @@ sodopendfreel(void) for (; m != NULL; m = next) { next = m->m_next; - KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) == 0); - KASSERT(m->m_ext.ext_refcnt == 0); rv += m->m_ext.ext_size; - sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf, + sodoloanfree((m->m_flags & M_EXT_PAGES) ? + m->m_ext.ext_pgs : NULL, m->m_ext.ext_buf, m->m_ext.ext_size); pool_cache_put(mb_cache, m); } @@ -330,7 +331,15 @@ void soloanfree(struct mbuf *m, void *buf, size_t size, void *arg) { - KASSERT(m != NULL); + if (m == NULL) { + + /* + * called from MEXTREMOVE. + */ + + sodoloanfree(NULL, buf, size); + return; + } /* * postpone freeing mbuf. @@ -352,10 +361,8 @@ sosend_loan(struct socket *so, struct ui struct iovec *iov = uio->uio_iov; vaddr_t sva, eva; vsize_t len; - vaddr_t lva; - int npgs, error; - vaddr_t va; - int i; + vaddr_t lva, va; + int npgs, i, error; if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) return (0); @@ -370,6 +377,7 @@ sosend_loan(struct socket *so, struct ui len = eva - sva; npgs = len >> PAGE_SHIFT; + /* XXX KDASSERT */ KASSERT(npgs <= M_EXT_MAXPAGES); lva = sokvaalloc(len, so); @@ -429,16 +437,23 @@ getsombuf(struct socket *so, int type) return m; } +struct mbuf * +m_intopt(struct socket *so, int val) +{ + struct mbuf *m; + + m = getsombuf(so, MT_SOOPTS); + m->m_len = sizeof(int); + *mtod(m, int *) = val; + return m; +} + void soinit(void) { - sysctl_kern_somaxkva_setup(); - mutex_init(&so_pendfree_lock, MUTEX_DEFAULT, IPL_VM); - softnet_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); cv_init(&socurkva_cv, "sokva"); - soinit2(); /* Set the initial adjusted socket buffer size. */ if (sb_max_set(sb_max)) @@ -457,14 +472,12 @@ soinit(void) */ /*ARGSUSED*/ int -socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l, - struct socket *lockso) +socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l) { const struct protosw *prp; struct socket *so; uid_t uid; - int error; - kmutex_t *lock; + int error, s; error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET, KAUTH_REQ_NETWORK_SOCKET_OPEN, KAUTH_ARG(dom), KAUTH_ARG(type), @@ -489,8 +502,11 @@ socreate(int dom, struct socket **aso, i return EPROTONOSUPPORT; if (prp->pr_type != type) return EPROTOTYPE; - - so = soget(true); + s = splsoftnet(); + so = pool_get(&socket_pool, PR_WAITOK); + memset(so, 0, sizeof(*so)); + TAILQ_INIT(&so->so_q0); + TAILQ_INIT(&so->so_q); so->so_type = type; so->so_proto = prp; so->so_send = sosend; @@ -500,28 +516,19 @@ socreate(int dom, struct socket **aso, i so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner; so->so_mowner = &prp->pr_domain->dom_mowner; #endif + selinit(&so->so_rcv.sb_sel); + selinit(&so->so_snd.sb_sel); uid = kauth_cred_geteuid(l->l_cred); so->so_uidinfo = uid_find(uid); - so->so_egid = kauth_cred_getegid(l->l_cred); - so->so_cpid = l->l_proc->p_pid; - if (lockso != NULL) { - /* Caller wants us to share a lock. */ - lock = lockso->so_lock; - so->so_lock = lock; - mutex_obj_hold(lock); - mutex_enter(lock); - } else { - /* Lock assigned and taken during PRU_ATTACH. */ - } error = (*prp->pr_usrreq)(so, PRU_ATTACH, NULL, (struct mbuf *)(long)proto, NULL, l); - KASSERT(solocked(so)); if (error != 0) { so->so_state |= SS_NOFDREF; sofree(so); + splx(s); return error; } - sounlock(so); + splx(s); *aso = so; return 0; } @@ -533,23 +540,29 @@ int fsocreate(int domain, struct socket **sop, int type, int protocol, struct lwp *l, int *fdout) { + struct filedesc *fdp; struct socket *so; struct file *fp; int fd, error; - if ((error = fd_allocfile(&fp, &fd)) != 0) + fdp = l->l_proc->p_fd; + /* falloc() will use the desciptor for us */ + if ((error = falloc(l, &fp, &fd)) != 0) return (error); fp->f_flag = FREAD|FWRITE; fp->f_type = DTYPE_SOCKET; fp->f_ops = &socketops; - error = socreate(domain, &so, type, protocol, l, NULL); + error = socreate(domain, &so, type, protocol, l); if (error != 0) { - fd_abort(curproc, fp, fd); + FILE_UNUSE(fp, l); + fdremove(fdp, fd); + ffree(fp); } else { if (sop != NULL) *sop = so; fp->f_data = so; - fd_affix(curproc, fp, fd); + FILE_SET_MATURE(fp); + FILE_UNUSE(fp, l); *fdout = fd; } return error; @@ -558,29 +571,24 @@ fsocreate(int domain, struct socket **so int sobind(struct socket *so, struct mbuf *nam, struct lwp *l) { - int error; + int s, error; - solock(so); + s = splsoftnet(); error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL, l); - sounlock(so); + splx(s); return error; } int solisten(struct socket *so, int backlog, struct lwp *l) { - int error; + int s, error; - solock(so); - if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | - SS_ISDISCONNECTING)) != 0) { - sounlock(so); - return (EOPNOTSUPP); - } + s = splsoftnet(); error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL, l); if (error != 0) { - sounlock(so); + splx(s); return error; } if (TAILQ_EMPTY(&so->so_q)) @@ -588,31 +596,24 @@ solisten(struct socket *so, int backlog, if (backlog < 0) backlog = 0; so->so_qlimit = min(backlog, somaxconn); - sounlock(so); + splx(s); return 0; } void sofree(struct socket *so) { - u_int refs; - - KASSERT(solocked(so)); - if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) { - sounlock(so); + if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) return; - } if (so->so_head) { /* * We must not decommission a socket that's on the accept(2) * queue. If we do, then accept(2) may hang after select(2) * indicated that the listening socket was ready. */ - if (!soqremque(so, 0)) { - sounlock(so); + if (!soqremque(so, 0)) return; - } } if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_uidinfo, &so->so_rcv.sb_hiwat, 0, @@ -621,17 +622,10 @@ sofree(struct socket *so) (void)chgsbsize(so->so_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); sbrelease(&so->so_snd, so); - KASSERT(!cv_has_waiters(&so->so_cv)); - KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv)); - KASSERT(!cv_has_waiters(&so->so_snd.sb_cv)); sorflush(so); - refs = so->so_aborting; /* XXX */ - /* Remove acccept filter if one is present. */ - if (so->so_accf != NULL) - (void)accept_filt_clear(so); - sounlock(so); - if (refs == 0) /* XXX */ - soput(so); + seldestroy(&so->so_rcv.sb_sel); + seldestroy(&so->so_snd.sb_sel); + pool_put(&socket_pool, so); } /* @@ -643,30 +637,18 @@ int soclose(struct socket *so) { struct socket *so2; - int error; - int error2; + int s, error; error = 0; - solock(so); + s = splsoftnet(); /* conservative */ if (so->so_options & SO_ACCEPTCONN) { - for (;;) { - if ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) { - KASSERT(solocked2(so, so2)); - (void) soqremque(so2, 0); - /* soabort drops the lock. */ - (void) soabort(so2); - solock(so); - continue; - } - if ((so2 = TAILQ_FIRST(&so->so_q)) != 0) { - KASSERT(solocked2(so, so2)); - (void) soqremque(so2, 1); - /* soabort drops the lock. */ - (void) soabort(so2); - solock(so); - continue; - } - break; + while ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) { + (void) soqremque(so2, 0); + (void) soabort(so2); + } + while ((so2 = TAILQ_FIRST(&so->so_q)) != 0) { + (void) soqremque(so2, 1); + (void) soabort(so2); } } if (so->so_pcb == 0) @@ -681,7 +663,9 @@ soclose(struct socket *so) if ((so->so_state & SS_ISDISCONNECTING) && so->so_nbio) goto drop; while (so->so_state & SS_ISCONNECTED) { - error = sowait(so, so->so_linger * hz); + error = tsleep((void *)&so->so_timeo, + PSOCK | PCATCH, netcls, + so->so_linger * hz); if (error) break; } @@ -689,7 +673,7 @@ soclose(struct socket *so) } drop: if (so->so_pcb) { - error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH, + int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH, NULL, NULL, NULL, NULL); if (error == 0) error = error2; @@ -699,29 +683,23 @@ soclose(struct socket *so) panic("soclose: NOFDREF"); so->so_state |= SS_NOFDREF; sofree(so); + splx(s); return (error); } /* - * Must be called with the socket locked.. Will return with it unlocked. + * Must be called at splsoftnet... */ int soabort(struct socket *so) { - u_int refs; int error; - - KASSERT(solocked(so)); - KASSERT(so->so_head == NULL); - so->so_aborting++; /* XXX */ + KASSERT(so->so_head == NULL); error = (*so->so_proto->pr_usrreq)(so, PRU_ABORT, NULL, NULL, NULL, NULL); - refs = --so->so_aborting; /* XXX */ - if (error || (refs == 0)) { + if (error) { sofree(so); - } else { - sounlock(so); } return error; } @@ -729,11 +707,10 @@ soabort(struct socket *so) int soaccept(struct socket *so, struct mbuf *nam) { - int error; - - KASSERT(solocked(so)); + int s, error; error = 0; + s = splsoftnet(); if ((so->so_state & SS_NOFDREF) == 0) panic("soaccept: !NOFDREF"); so->so_state &= ~SS_NOFDREF; @@ -744,18 +721,18 @@ soaccept(struct socket *so, struct mbuf else error = ECONNABORTED; + splx(s); return (error); } int soconnect(struct socket *so, struct mbuf *nam, struct lwp *l) { - int error; - - KASSERT(solocked(so)); + int s, error; if (so->so_options & SO_ACCEPTCONN) return (EOPNOTSUPP); + s = splsoftnet(); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. @@ -769,36 +746,40 @@ soconnect(struct socket *so, struct mbuf else error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT, NULL, nam, NULL, l); + splx(s); return (error); } int soconnect2(struct socket *so1, struct socket *so2) { - int error; - - KASSERT(solocked2(so1, so2)); + int s, error; + s = splsoftnet(); error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL, (struct mbuf *)so2, NULL, NULL); + splx(s); return (error); } int sodisconnect(struct socket *so) { - int error; - - KASSERT(solocked(so)); + int s, error; + s = splsoftnet(); if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; - } else if (so->so_state & SS_ISDISCONNECTING) { + goto bad; + } + if (so->so_state & SS_ISDISCONNECTING) { error = EALREADY; - } else { - error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, - NULL, NULL, NULL, NULL); + goto bad; } + error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, + NULL, NULL, NULL, NULL); + bad: + splx(s); sodopendfree(); return (error); } @@ -832,15 +813,8 @@ sosend(struct socket *so, struct mbuf *a p = l->l_proc; sodopendfree(); - clen = 0; - /* - * solock() provides atomicity of access. splsoftnet() prevents - * protocol processing soft interrupts from interrupting us and - * blocking (expensive). - */ - s = splsoftnet(); - solock(so); + clen = 0; atomic = sosendallatonce(so) || top; if (uio) resid = uio->uio_resid; @@ -860,54 +834,51 @@ sosend(struct socket *so, struct mbuf *a dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); - l->l_ru.ru_msgsnd++; + if (p) + p->p_stats->p_ru.ru_msgsnd++; if (control) clen = control->m_len; +#define snderr(errno) { error = errno; splx(s); goto release; } + restart: if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0) goto out; do { - if (so->so_state & SS_CANTSENDMORE) { - error = EPIPE; - goto release; - } + s = splsoftnet(); + if (so->so_state & SS_CANTSENDMORE) + snderr(EPIPE); if (so->so_error) { error = so->so_error; so->so_error = 0; + splx(s); goto release; } if ((so->so_state & SS_ISCONNECTED) == 0) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if ((so->so_state & SS_ISCONFIRMING) == 0 && - !(resid == 0 && clen != 0)) { - error = ENOTCONN; - goto release; - } - } else if (addr == 0) { - error = EDESTADDRREQ; - goto release; - } + !(resid == 0 && clen != 0)) + snderr(ENOTCONN); + } else if (addr == 0) + snderr(EDESTADDRREQ); } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat) || - clen > so->so_snd.sb_hiwat) { - error = EMSGSIZE; - goto release; - } + clen > so->so_snd.sb_hiwat) + snderr(EMSGSIZE); if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { - if (so->so_nbio) { - error = EWOULDBLOCK; - goto release; - } + if (so->so_nbio) + snderr(EWOULDBLOCK); sbunlock(&so->so_snd); error = sbwait(&so->so_snd); + splx(s); if (error) goto out; goto restart; } + splx(s); mp = ⊤ space -= clen; do { @@ -919,8 +890,6 @@ sosend(struct socket *so, struct mbuf *a if (flags & MSG_EOR) top->m_flags |= M_EOR; } else do { - sounlock(so); - splx(s); if (top == NULL) { m = m_gethdr(M_WAIT, MT_DATA); mlen = MHLEN; @@ -971,8 +940,6 @@ sosend(struct socket *so, struct mbuf *a m->m_len = len; *mp = m; top->m_pkthdr.len += len; - s = splsoftnet(); - solock(so); if (error != 0) goto release; mp = &m->m_next; @@ -983,21 +950,24 @@ sosend(struct socket *so, struct mbuf *a } } while (space > 0 && atomic); - if (so->so_state & SS_CANTSENDMORE) { - error = EPIPE; - goto release; - } + s = splsoftnet(); + + if (so->so_state & SS_CANTSENDMORE) + snderr(EPIPE); + if (dontroute) so->so_options |= SO_DONTROUTE; if (resid > 0) so->so_state |= SS_MORETOCOME; error = (*so->so_proto->pr_usrreq)(so, (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND, - top, addr, control, curlwp); + top, addr, control, curlwp); /* XXX */ if (dontroute) so->so_options &= ~SO_DONTROUTE; if (resid > 0) so->so_state &= ~SS_MORETOCOME; + splx(s); + clen = 0; control = NULL; top = NULL; @@ -1010,8 +980,6 @@ sosend(struct socket *so, struct mbuf *a release: sbunlock(&so->so_snd); out: - sounlock(so); - splx(s); if (top) m_freem(top); if (control) @@ -1020,43 +988,6 @@ sosend(struct socket *so, struct mbuf *a } /* - * Following replacement or removal of the first mbuf on the first - * mbuf chain of a socket buffer, push necessary state changes back - * into the socket buffer so that other consumers see the values - * consistently. 'nextrecord' is the callers locally stored value of - * the original value of sb->sb_mb->m_nextpkt which must be restored - * when the lead mbuf changes. NOTE: 'nextrecord' may be NULL. - */ -static void -sbsync(struct sockbuf *sb, struct mbuf *nextrecord) -{ - - KASSERT(solocked(sb->sb_so)); - - /* - * First, update for the new value of nextrecord. If necessary, - * make it the first record. - */ - if (sb->sb_mb != NULL) - sb->sb_mb->m_nextpkt = nextrecord; - else - sb->sb_mb = nextrecord; - - /* - * Now update any dependent socket buffer fields to reflect - * the new state. This is an inline of SB_EMPTY_FIXUP, with - * the addition of a second clause that takes care of the - * case where sb_mb has been updated, but remains the last - * record. - */ - if (sb->sb_mb == NULL) { - sb->sb_mbtail = NULL; - sb->sb_lastrecord = NULL; - } else if (sb->sb_mb->m_nextpkt == NULL) - sb->sb_lastrecord = sb->sb_mb; -} - -/* * Implement receive operations on a socket. * We depend on the way that records are added to the sockbuf * by sbappend*. In particular, each record (mbufs linked through m_next) @@ -1077,7 +1008,7 @@ soreceive(struct socket *so, struct mbuf struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct lwp *l = curlwp; - struct mbuf *m, **mp, *mt; + struct mbuf *m, **mp; int atomic, flags, len, error, s, offset, moff, type, orig_resid; const struct protosw *pr; struct mbuf *nextrecord; @@ -1105,10 +1036,8 @@ soreceive(struct socket *so, struct mbuf if (flags & MSG_OOB) { m = m_get(M_WAIT, MT_DATA); - solock(so); error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m, (struct mbuf *)(long)(flags & MSG_PEEK), NULL, l); - sounlock(so); if (error) goto bad; do { @@ -1123,23 +1052,13 @@ soreceive(struct socket *so, struct mbuf } if (mp != NULL) *mp = NULL; - - /* - * solock() provides atomicity of access. splsoftnet() prevents - * protocol processing soft interrupts from interrupting us and - * blocking (expensive). - */ - s = splsoftnet(); - solock(so); if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) (*pr->pr_usrreq)(so, PRU_RCVD, NULL, NULL, NULL, l); restart: - if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) { - sounlock(so); - splx(s); + if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) return error; - } + s = splsoftnet(); m = so->so_rcv.sb_mb; /* @@ -1198,32 +1117,19 @@ soreceive(struct socket *so, struct mbuf SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); sbunlock(&so->so_rcv); error = sbwait(&so->so_rcv); - if (error != 0) { - sounlock(so); - splx(s); + splx(s); + if (error != 0) return error; - } goto restart; } dontblock: /* * On entry here, m points to the first record of the socket buffer. - * From this point onward, we maintain 'nextrecord' as a cache of the - * pointer to the next record in the socket buffer. We must keep the - * various socket buffer pointers and local stack versions of the - * pointers in sync, pushing out modifications before dropping the - * socket lock, and re-reading them when picking it up. - * - * Otherwise, we will race with the network stack appending new data - * or records onto the socket buffer by using inconsistent/stale - * versions of the field, possibly resulting in socket buffer - * corruption. - * - * By holding the high-level sblock(), we prevent simultaneous - * readers from pulling off the front of the socket buffer. + * While we process the initial mbufs containing address and control + * info, we save a copy of m->m_nextpkt into nextrecord. */ if (l != NULL) - l->l_ru.ru_msgrcv++; + l->l_proc->p_stats->p_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb); SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); @@ -1250,80 +1156,71 @@ soreceive(struct socket *so, struct mbuf MFREE(m, so->so_rcv.sb_mb); m = so->so_rcv.sb_mb; } - sbsync(&so->so_rcv, nextrecord); } } - - /* - * Process one or more MT_CONTROL mbufs present before any data mbufs - * in the first mbuf chain on the socket buffer. If MSG_PEEK, we - * just copy the data; if !MSG_PEEK, we call into the protocol to - * perform externalization (or freeing if controlp == NULL). - */ - if (__predict_false(m != NULL && m->m_type == MT_CONTROL)) { - struct mbuf *cm = NULL, *cmn; - struct mbuf **cme = &cm; - - do { - if (flags & MSG_PEEK) { - if (controlp != NULL) { - *controlp = m_copy(m, 0, m->m_len); - controlp = &(*controlp)->m_next; - } - m = m->m_next; - } else { - sbfree(&so->so_rcv, m); + while (m != NULL && m->m_type == MT_CONTROL && error == 0) { + if (flags & MSG_PEEK) { + if (controlp != NULL) + *controlp = m_copy(m, 0, m->m_len); + m = m->m_next; + } else { + sbfree(&so->so_rcv, m); + mbuf_removed = 1; + if (controlp != NULL) { + if (dom->dom_externalize && l && + mtod(m, struct cmsghdr *)->cmsg_type == + SCM_RIGHTS) + error = (*dom->dom_externalize)(m, l); + *controlp = m; so->so_rcv.sb_mb = m->m_next; m->m_next = NULL; - *cme = m; - cme = &(*cme)->m_next; m = so->so_rcv.sb_mb; - } - } while (m != NULL && m->m_type == MT_CONTROL); - if ((flags & MSG_PEEK) == 0) - sbsync(&so->so_rcv, nextrecord); - for (; cm != NULL; cm = cmn) { - cmn = cm->m_next; - cm->m_next = NULL; - type = mtod(cm, struct cmsghdr *)->cmsg_type; - if (controlp != NULL) { - if (dom->dom_externalize != NULL && - type == SCM_RIGHTS) { - sounlock(so); - splx(s); - error = (*dom->dom_externalize)(cm, l); - s = splsoftnet(); - solock(so); - } - *controlp = cm; - while (*controlp != NULL) - controlp = &(*controlp)->m_next; } else { /* * Dispose of any SCM_RIGHTS message that went * through the read path rather than recv. */ - if (dom->dom_dispose != NULL && - type == SCM_RIGHTS) { - sounlock(so); - (*dom->dom_dispose)(cm); - solock(so); - } - m_freem(cm); + if (dom->dom_dispose && + mtod(m, struct cmsghdr *)->cmsg_type == SCM_RIGHTS) + (*dom->dom_dispose)(m); + MFREE(m, so->so_rcv.sb_mb); + m = so->so_rcv.sb_mb; } } - if (m != NULL) - nextrecord = so->so_rcv.sb_mb->m_nextpkt; - else - nextrecord = so->so_rcv.sb_mb; - orig_resid = 0; + if (controlp != NULL) { + orig_resid = 0; + controlp = &(*controlp)->m_next; + } } - /* If m is non-NULL, we have some data to read. */ - if (__predict_true(m != NULL)) { + /* + * If m is non-NULL, we have some data to read. From now on, + * make sure to keep sb_lastrecord consistent when working on + * the last packet on the chain (nextrecord == NULL) and we + * change m->m_nextpkt. + */ + if (m != NULL) { + if ((flags & MSG_PEEK) == 0) { + m->m_nextpkt = nextrecord; + /* + * If nextrecord == NULL (this is a single chain), + * then sb_lastrecord may not be valid here if m + * was changed earlier. + */ + if (nextrecord == NULL) { + KASSERT(so->so_rcv.sb_mb == m); + so->so_rcv.sb_lastrecord = m; + } + } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; + } else { + if ((flags & MSG_PEEK) == 0) { + KASSERT(so->so_rcv.sb_mb == m); + so->so_rcv.sb_mb = nextrecord; + SB_EMPTY_FIXUP(&so->so_rcv); + } } SBLASTRECORDCHK(&so->so_rcv, "soreceive 2"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 2"); @@ -1357,11 +1254,9 @@ soreceive(struct socket *so, struct mbuf if (mp == NULL) { SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); - sounlock(so); splx(s); error = uiomove(mtod(m, char *) + moff, (int)len, uio); s = splsoftnet(); - solock(so); if (error != 0) { /* * If any part of the record has been removed @@ -1418,15 +1313,8 @@ soreceive(struct socket *so, struct mbuf } else if (flags & MSG_PEEK) moff += len; else { - if (mp != NULL) { - mt = m_copym(m, 0, len, M_NOWAIT); - if (__predict_false(mt == NULL)) { - sounlock(so); - mt = m_copym(m, 0, len, M_WAIT); - solock(so); - } - *mp = mt; - } + if (mp != NULL) + *mp = m_copym(m, 0, len, M_WAIT); m->m_data += len; m->m_len -= len; so->so_rcv.sb_cc -= len; @@ -1476,7 +1364,6 @@ soreceive(struct socket *so, struct mbuf error = sbwait(&so->so_rcv); if (error != 0) { sbunlock(&so->so_rcv); - sounlock(so); splx(s); return 0; } @@ -1513,6 +1400,7 @@ soreceive(struct socket *so, struct mbuf if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { sbunlock(&so->so_rcv); + splx(s); goto restart; } @@ -1520,7 +1408,6 @@ soreceive(struct socket *so, struct mbuf *flagsp |= flags; release: sbunlock(&so->so_rcv); - sounlock(so); splx(s); return error; } @@ -1529,23 +1416,17 @@ int soshutdown(struct socket *so, int how) { const struct protosw *pr; - int error; - - KASSERT(solocked(so)); pr = so->so_proto; if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) return (EINVAL); - if (how == SHUT_RD || how == SHUT_RDWR) { + if (how == SHUT_RD || how == SHUT_RDWR) sorflush(so); - error = 0; - } if (how == SHUT_WR || how == SHUT_RDWR) - error = (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL, + return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL, NULL, NULL, NULL); - - return error; + return 0; } void @@ -1553,14 +1434,14 @@ sorflush(struct socket *so) { struct sockbuf *sb, asb; const struct protosw *pr; - - KASSERT(solocked(so)); + int s; sb = &so->so_rcv; pr = so->so_proto; - socantrcvmore(so); sb->sb_flags |= SB_NOINTR; - (void )sblock(sb, M_WAITOK); + (void) sblock(sb, M_WAITOK); + s = splnet(); + socantrcvmore(so); sbunlock(sb); asb = *sb; /* @@ -1569,48 +1450,36 @@ sorflush(struct socket *so) */ memset(&sb->sb_startzero, 0, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); - if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) { - sounlock(so); + splx(s); + if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) (*pr->pr_domain->dom_dispose)(asb.sb_mb); - solock(so); - } sbrelease(&asb, so); } -/* - * internal set SOL_SOCKET options - */ static int -sosetopt1(struct socket *so, const struct sockopt *sopt) +sosetopt1(struct socket *so, int level, int optname, struct mbuf *m) { - int error = EINVAL, optval, opt; - struct linger l; - struct timeval tv; + int optval, val; + struct linger *l; + struct sockbuf *sb; + struct timeval *tv; - switch ((opt = sopt->sopt_name)) { + switch (optname) { - case SO_ACCEPTFILTER: - error = accept_filt_setopt(so, sopt); - KASSERT(solocked(so)); + case SO_LINGER: + if (m == NULL || m->m_len != sizeof(struct linger)) + return EINVAL; + l = mtod(m, struct linger *); + if (l->l_linger < 0 || l->l_linger > USHRT_MAX || + l->l_linger > (INT_MAX / hz)) + return EDOM; + so->so_linger = l->l_linger; + if (l->l_onoff) + so->so_options |= SO_LINGER; + else + so->so_options &= ~SO_LINGER; break; - case SO_LINGER: - error = sockopt_get(sopt, &l, sizeof(l)); - solock(so); - if (error) - break; - if (l.l_linger < 0 || l.l_linger > USHRT_MAX || - l.l_linger > (INT_MAX / hz)) { - error = EDOM; - break; - } - so->so_linger = l.l_linger; - if (l.l_onoff) - so->so_options |= SO_LINGER; - else - so->so_options &= ~SO_LINGER; - break; - case SO_DEBUG: case SO_KEEPALIVE: case SO_DONTROUTE: @@ -1620,49 +1489,38 @@ sosetopt1(struct socket *so, const struc case SO_REUSEPORT: case SO_OOBINLINE: case SO_TIMESTAMP: - error = sockopt_getint(sopt, &optval); - solock(so); - if (error) - break; - if (optval) - so->so_options |= opt; + if (m == NULL || m->m_len < sizeof(int)) + return EINVAL; + if (*mtod(m, int *)) + so->so_options |= optname; else - so->so_options &= ~opt; + so->so_options &= ~optname; break; case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: - error = sockopt_getint(sopt, &optval); - solock(so); - if (error) - break; + if (m == NULL || m->m_len < sizeof(int)) + return EINVAL; /* * Values < 1 make no sense for any of these * options, so disallow them. */ - if (optval < 1) { - error = EINVAL; - break; - } + optval = *mtod(m, int *); + if (optval < 1) + return EINVAL; - switch (opt) { - case SO_SNDBUF: - if (sbreserve(&so->so_snd, (u_long)optval, so) == 0) { - error = ENOBUFS; - break; - } - so->so_snd.sb_flags &= ~SB_AUTOSIZE; - break; + switch (optname) { + case SO_SNDBUF: case SO_RCVBUF: - if (sbreserve(&so->so_rcv, (u_long)optval, so) == 0) { - error = ENOBUFS; - break; - } - so->so_rcv.sb_flags &= ~SB_AUTOSIZE; + sb = (optname == SO_SNDBUF) ? + &so->so_snd : &so->so_rcv; + if (sbreserve(sb, (u_long)optval, so) == 0) + return ENOBUFS; + sb->sb_flags &= ~SB_AUTOSIZE; break; /* @@ -1670,408 +1528,162 @@ sosetopt1(struct socket *so, const struc * the high-water. */ case SO_SNDLOWAT: - if (optval > so->so_snd.sb_hiwat) - optval = so->so_snd.sb_hiwat; - - so->so_snd.sb_lowat = optval; + so->so_snd.sb_lowat = + (optval > so->so_snd.sb_hiwat) ? + so->so_snd.sb_hiwat : optval; break; - case SO_RCVLOWAT: - if (optval > so->so_rcv.sb_hiwat) - optval = so->so_rcv.sb_hiwat; - - so->so_rcv.sb_lowat = optval; + so->so_rcv.sb_lowat = + (optval > so->so_rcv.sb_hiwat) ? + so->so_rcv.sb_hiwat : optval; break; } break; -#ifdef COMPAT_50 - case SO_OSNDTIMEO: - case SO_ORCVTIMEO: { - struct timeval50 otv; - error = sockopt_get(sopt, &otv, sizeof(otv)); - if (error) - break; - timeval50_to_timeval(&otv, &tv); - opt = opt == SO_OSNDTIMEO ? SO_SNDTIMEO : SO_RCVTIMEO; - error = 0; - /*FALLTHROUGH*/ - } -#endif /* COMPAT_50 */ - case SO_SNDTIMEO: case SO_RCVTIMEO: - if (error) - error = sockopt_get(sopt, &tv, sizeof(tv)); - solock(so); - if (error) - break; + if (m == NULL || m->m_len < sizeof(*tv)) + return EINVAL; + tv = mtod(m, struct timeval *); + if (tv->tv_sec > (INT_MAX - tv->tv_usec / tick) / hz) + return EDOM; + val = tv->tv_sec * hz + tv->tv_usec / tick; + if (val == 0 && tv->tv_usec != 0) + val = 1; - if (tv.tv_sec > (INT_MAX - tv.tv_usec / tick) / hz) { - error = EDOM; - break; - } + switch (optname) { - optval = tv.tv_sec * hz + tv.tv_usec / tick; - if (optval == 0 && tv.tv_usec != 0) - optval = 1; - - switch (opt) { case SO_SNDTIMEO: - so->so_snd.sb_timeo = optval; + so->so_snd.sb_timeo = val; break; case SO_RCVTIMEO: - so->so_rcv.sb_timeo = optval; + so->so_rcv.sb_timeo = val; break; } break; default: - solock(so); - error = ENOPROTOOPT; - break; + return ENOPROTOOPT; } - KASSERT(solocked(so)); - return error; + return 0; } int -sosetopt(struct socket *so, struct sockopt *sopt) +sosetopt(struct socket *so, int level, int optname, struct mbuf *m) { int error, prerr; - if (sopt->sopt_level == SOL_SOCKET) { - error = sosetopt1(so, sopt); - KASSERT(solocked(so)); - } else { + if (level == SOL_SOCKET) + error = sosetopt1(so, level, optname, m); + else error = ENOPROTOOPT; - solock(so); - } if ((error == 0 || error == ENOPROTOOPT) && so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) { /* give the protocol stack a shot */ - prerr = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, sopt); + prerr = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, level, + optname, &m); if (prerr == 0) error = 0; else if (prerr != ENOPROTOOPT) error = prerr; - } - sounlock(so); - return error; -} - -/* - * so_setsockopt() is a wrapper providing a sockopt structure for sosetopt() - */ -int -so_setsockopt(struct lwp *l, struct socket *so, int level, int name, - const void *val, size_t valsize) -{ - struct sockopt sopt; - int error; - - KASSERT(valsize == 0 || val != NULL); - - sockopt_init(&sopt, level, name, valsize); - sockopt_set(&sopt, val, valsize); - - error = sosetopt(so, &sopt); - - sockopt_destroy(&sopt); - + } else if (m != NULL) + (void)m_free(m); return error; } - -/* - * internal get SOL_SOCKET options - */ -static int -sogetopt1(struct socket *so, struct sockopt *sopt) -{ - int error, optval, opt; - struct linger l; - struct timeval tv; - - switch ((opt = sopt->sopt_name)) { - - case SO_ACCEPTFILTER: - error = accept_filt_getopt(so, sopt); - break; - - case SO_LINGER: - l.l_onoff = (so->so_options & SO_LINGER) ? 1 : 0; - l.l_linger = so->so_linger; - - error = sockopt_set(sopt, &l, sizeof(l)); - break; - - case SO_USELOOPBACK: - case SO_DONTROUTE: - case SO_DEBUG: - case SO_KEEPALIVE: - case SO_REUSEADDR: - case SO_REUSEPORT: - case SO_BROADCAST: - case SO_OOBINLINE: - case SO_TIMESTAMP: - error = sockopt_setint(sopt, (so->so_options & opt) ? 1 : 0); - break; - - case SO_TYPE: - error = sockopt_setint(sopt, so->so_type); - break; - - case SO_ERROR: - error = sockopt_setint(sopt, so->so_error); - so->so_error = 0; - break; - - case SO_SNDBUF: - error = sockopt_setint(sopt, so->so_snd.sb_hiwat); - break; - - case SO_RCVBUF: - error = sockopt_setint(sopt, so->so_rcv.sb_hiwat); - break; - - case SO_SNDLOWAT: - error = sockopt_setint(sopt, so->so_snd.sb_lowat); - break; - - case SO_RCVLOWAT: - error = sockopt_setint(sopt, so->so_rcv.sb_lowat); - break; - -#ifdef COMPAT_50 - case SO_OSNDTIMEO: - case SO_ORCVTIMEO: { - struct timeval50 otv; - - optval = (opt == SO_OSNDTIMEO ? - so->so_snd.sb_timeo : so->so_rcv.sb_timeo); - - otv.tv_sec = optval / hz; - otv.tv_usec = (optval % hz) * tick; - - error = sockopt_set(sopt, &otv, sizeof(otv)); - break; - } -#endif /* COMPAT_50 */ - - case SO_SNDTIMEO: - case SO_RCVTIMEO: - optval = (opt == SO_SNDTIMEO ? - so->so_snd.sb_timeo : so->so_rcv.sb_timeo); - - tv.tv_sec = optval / hz; - tv.tv_usec = (optval % hz) * tick; - - error = sockopt_set(sopt, &tv, sizeof(tv)); - break; - - case SO_OVERFLOWED: - error = sockopt_setint(sopt, so->so_rcv.sb_overflowed); - break; - - default: - error = ENOPROTOOPT; - break; - } - - return (error); -} int -sogetopt(struct socket *so, struct sockopt *sopt) +sogetopt(struct socket *so, int level, int optname, struct mbuf **mp) { - int error; + struct mbuf *m; - solock(so); - if (sopt->sopt_level != SOL_SOCKET) { + if (level != SOL_SOCKET) { if (so->so_proto && so->so_proto->pr_ctloutput) { - error = ((*so->so_proto->pr_ctloutput) - (PRCO_GETOPT, so, sopt)); + return ((*so->so_proto->pr_ctloutput) + (PRCO_GETOPT, so, level, optname, mp)); } else - error = (ENOPROTOOPT); + return (ENOPROTOOPT); } else { - error = sogetopt1(so, sopt); - } - sounlock(so); - return (error); -} - -/* - * alloc sockopt data buffer buffer - * - will be released at destroy - */ -static int -sockopt_alloc(struct sockopt *sopt, size_t len, km_flag_t kmflag) -{ - - KASSERT(sopt->sopt_size == 0); - - if (len > sizeof(sopt->sopt_buf)) { - sopt->sopt_data = kmem_zalloc(len, kmflag); - if (sopt->sopt_data == NULL) - return ENOMEM; - } else - sopt->sopt_data = sopt->sopt_buf; - - sopt->sopt_size = len; - return 0; -} - -/* - * initialise sockopt storage - * - MAY sleep during allocation - */ -void -sockopt_init(struct sockopt *sopt, int level, int name, size_t size) -{ - - memset(sopt, 0, sizeof(*sopt)); + m = m_get(M_WAIT, MT_SOOPTS); + m->m_len = sizeof(int); - sopt->sopt_level = level; - sopt->sopt_name = name; - (void)sockopt_alloc(sopt, size, KM_SLEEP); -} - -/* - * destroy sockopt storage - * - will release any held memory references - */ -void -sockopt_destroy(struct sockopt *sopt) -{ - - if (sopt->sopt_data != sopt->sopt_buf) - kmem_free(sopt->sopt_data, sopt->sopt_size); - - memset(sopt, 0, sizeof(*sopt)); -} + switch (optname) { -/* - * set sockopt value - * - value is copied into sockopt - * - memory is allocated when necessary, will not sleep - */ -int -sockopt_set(struct sockopt *sopt, const void *buf, size_t len) -{ - int error; - - if (sopt->sopt_size == 0) { - error = sockopt_alloc(sopt, len, KM_NOSLEEP); - if (error) - return error; - } - - KASSERT(sopt->sopt_size == len); - memcpy(sopt->sopt_data, buf, len); - return 0; -} - -/* - * common case of set sockopt integer value - */ -int -sockopt_setint(struct sockopt *sopt, int val) -{ - - return sockopt_set(sopt, &val, sizeof(int)); -} - -/* - * get sockopt value - * - correct size must be given - */ -int -sockopt_get(const struct sockopt *sopt, void *buf, size_t len) -{ - - if (sopt->sopt_size != len) - return EINVAL; - - memcpy(buf, sopt->sopt_data, len); - return 0; -} - -/* - * common case of get sockopt integer value - */ -int -sockopt_getint(const struct sockopt *sopt, int *valp) -{ + case SO_LINGER: + m->m_len = sizeof(struct linger); + mtod(m, struct linger *)->l_onoff = + (so->so_options & SO_LINGER) ? 1 : 0; + mtod(m, struct linger *)->l_linger = so->so_linger; + break; - return sockopt_get(sopt, valp, sizeof(int)); -} + case SO_USELOOPBACK: + case SO_DONTROUTE: + case SO_DEBUG: + case SO_KEEPALIVE: + case SO_REUSEADDR: + case SO_REUSEPORT: + case SO_BROADCAST: + case SO_OOBINLINE: + case SO_TIMESTAMP: + *mtod(m, int *) = (so->so_options & optname) ? 1 : 0; + break; -/* - * set sockopt value from mbuf - * - ONLY for legacy code - * - mbuf is released by sockopt - * - will not sleep - */ -int -sockopt_setmbuf(struct sockopt *sopt, struct mbuf *m) -{ - size_t len; - int error; + case SO_TYPE: + *mtod(m, int *) = so->so_type; + break; - len = m_length(m); + case SO_ERROR: + *mtod(m, int *) = so->so_error; + so->so_error = 0; + break; - if (sopt->sopt_size == 0) { - error = sockopt_alloc(sopt, len, KM_NOSLEEP); - if (error) - return error; - } + case SO_SNDBUF: + *mtod(m, int *) = so->so_snd.sb_hiwat; + break; - KASSERT(sopt->sopt_size == len); - m_copydata(m, 0, len, sopt->sopt_data); - m_freem(m); + case SO_RCVBUF: + *mtod(m, int *) = so->so_rcv.sb_hiwat; + break; - return 0; -} + case SO_SNDLOWAT: + *mtod(m, int *) = so->so_snd.sb_lowat; + break; -/* - * get sockopt value into mbuf - * - ONLY for legacy code - * - mbuf to be released by the caller - * - will not sleep - */ -struct mbuf * -sockopt_getmbuf(const struct sockopt *sopt) -{ - struct mbuf *m; + case SO_RCVLOWAT: + *mtod(m, int *) = so->so_rcv.sb_lowat; + break; - if (sopt->sopt_size > MCLBYTES) - return NULL; + case SO_SNDTIMEO: + case SO_RCVTIMEO: + { + int val = (optname == SO_SNDTIMEO ? + so->so_snd.sb_timeo : so->so_rcv.sb_timeo); + + m->m_len = sizeof(struct timeval); + mtod(m, struct timeval *)->tv_sec = val / hz; + mtod(m, struct timeval *)->tv_usec = + (val % hz) * tick; + break; + } - m = m_get(M_DONTWAIT, MT_SOOPTS); - if (m == NULL) - return NULL; + case SO_OVERFLOWED: + *mtod(m, int *) = so->so_rcv.sb_overflowed; + break; - if (sopt->sopt_size > MLEN) { - MCLGET(m, M_DONTWAIT); - if ((m->m_flags & M_EXT) == 0) { - m_free(m); - return NULL; + default: + (void)m_free(m); + return (ENOPROTOOPT); } + *mp = m; + return (0); } - - memcpy(mtod(m, void *), sopt->sopt_data, sopt->sopt_size); - m->m_len = sopt->sopt_size; - - return m; } void sohasoutofband(struct socket *so) { - fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so); - selnotify(&so->so_rcv.sb_sel, POLLPRI | POLLRDBAND, 0); + selwakeup(&so->so_rcv.sb_sel); } static void @@ -2079,12 +1691,10 @@ filt_sordetach(struct knote *kn) { struct socket *so; - so = ((file_t *)kn->kn_obj)->f_data; - solock(so); + so = (struct socket *)kn->kn_fp->f_data; SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext); if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist)) so->so_rcv.sb_flags &= ~SB_KNOTE; - sounlock(so); } /*ARGSUSED*/ @@ -2092,25 +1702,19 @@ static int filt_soread(struct knote *kn, long hint) { struct socket *so; - int rv; - so = ((file_t *)kn->kn_obj)->f_data; - if (hint != NOTE_SUBMIT) - solock(so); + so = (struct socket *)kn->kn_fp->f_data; kn->kn_data = so->so_rcv.sb_cc; if (so->so_state & SS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; - rv = 1; - } else if (so->so_error) /* temporary udp error */ - rv = 1; - else if (kn->kn_sfflags & NOTE_LOWAT) - rv = (kn->kn_data >= kn->kn_sdata); - else - rv = (kn->kn_data >= so->so_rcv.sb_lowat); - if (hint != NOTE_SUBMIT) - sounlock(so); - return rv; + return (1); + } + if (so->so_error) /* temporary udp error */ + return (1); + if (kn->kn_sfflags & NOTE_LOWAT) + return (kn->kn_data >= kn->kn_sdata); + return (kn->kn_data >= so->so_rcv.sb_lowat); } static void @@ -2118,12 +1722,10 @@ filt_sowdetach(struct knote *kn) { struct socket *so; - so = ((file_t *)kn->kn_obj)->f_data; - solock(so); + so = (struct socket *)kn->kn_fp->f_data; SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext); if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist)) so->so_snd.sb_flags &= ~SB_KNOTE; - sounlock(so); } /*ARGSUSED*/ @@ -2131,28 +1733,22 @@ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so; - int rv; - so = ((file_t *)kn->kn_obj)->f_data; - if (hint != NOTE_SUBMIT) - solock(so); + so = (struct socket *)kn->kn_fp->f_data; kn->kn_data = sbspace(&so->so_snd); if (so->so_state & SS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; - rv = 1; - } else if (so->so_error) /* temporary udp error */ - rv = 1; - else if (((so->so_state & SS_ISCONNECTED) == 0) && + return (1); + } + if (so->so_error) /* temporary udp error */ + return (1); + if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) - rv = 0; - else if (kn->kn_sfflags & NOTE_LOWAT) - rv = (kn->kn_data >= kn->kn_sdata); - else - rv = (kn->kn_data >= so->so_snd.sb_lowat); - if (hint != NOTE_SUBMIT) - sounlock(so); - return rv; + return (0); + if (kn->kn_sfflags & NOTE_LOWAT) + return (kn->kn_data >= kn->kn_sdata); + return (kn->kn_data >= so->so_snd.sb_lowat); } /*ARGSUSED*/ @@ -2160,21 +1756,15 @@ static int filt_solisten(struct knote *kn, long hint) { struct socket *so; - int rv; - so = ((file_t *)kn->kn_obj)->f_data; + so = (struct socket *)kn->kn_fp->f_data; /* * Set kn_data to number of incoming connections, not * counting partial (incomplete) connections. */ - if (hint != NOTE_SUBMIT) - solock(so); kn->kn_data = so->so_qlen; - rv = (kn->kn_data > 0); - if (hint != NOTE_SUBMIT) - sounlock(so); - return rv; + return (kn->kn_data > 0); } static const struct filterops solisten_filtops = @@ -2190,8 +1780,7 @@ soo_kqfilter(struct file *fp, struct kno struct socket *so; struct sockbuf *sb; - so = ((file_t *)kn->kn_obj)->f_data; - solock(so); + so = (struct socket *)kn->kn_fp->f_data; switch (kn->kn_filter) { case EVFILT_READ: if (so->so_options & SO_ACCEPTCONN) @@ -2205,70 +1794,13 @@ soo_kqfilter(struct file *fp, struct kno sb = &so->so_snd; break; default: - sounlock(so); return (EINVAL); } SLIST_INSERT_HEAD(&sb->sb_sel.sel_klist, kn, kn_selnext); sb->sb_flags |= SB_KNOTE; - sounlock(so); return (0); } -static int -sodopoll(struct socket *so, int events) -{ - int revents; - - revents = 0; - - if (events & (POLLIN | POLLRDNORM)) - if (soreadable(so)) - revents |= events & (POLLIN | POLLRDNORM); - - if (events & (POLLOUT | POLLWRNORM)) - if (sowritable(so)) - revents |= events & (POLLOUT | POLLWRNORM); - - if (events & (POLLPRI | POLLRDBAND)) - if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) - revents |= events & (POLLPRI | POLLRDBAND); - - return revents; -} - -int -sopoll(struct socket *so, int events) -{ - int revents = 0; - -#ifndef DIAGNOSTIC - /* - * Do a quick, unlocked check in expectation that the socket - * will be ready for I/O. Don't do this check if DIAGNOSTIC, - * as the solocked() assertions will fail. - */ - if ((revents = sodopoll(so, events)) != 0) - return revents; -#endif - - solock(so); - if ((revents = sodopoll(so, events)) == 0) { - if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { - selrecord(curlwp, &so->so_rcv.sb_sel); - so->so_rcv.sb_flags |= SB_NOTIFY; - } - - if (events & (POLLOUT | POLLWRNORM)) { - selrecord(curlwp, &so->so_snd.sb_sel); - so->so_snd.sb_flags |= SB_NOTIFY; - } - } - sounlock(so); - - return revents; -} - - #include static int sysctl_kern_somaxkva(SYSCTLFN_PROTO); @@ -2302,18 +1834,16 @@ sysctl_kern_somaxkva(SYSCTLFN_ARGS) return (error); } -static void -sysctl_kern_somaxkva_setup() +SYSCTL_SETUP(sysctl_kern_somaxkva_setup, "sysctl kern.somaxkva setup") { - KASSERT(socket_sysctllog == NULL); - sysctl_createv(&socket_sysctllog, 0, NULL, NULL, + sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "kern", NULL, NULL, 0, NULL, 0, CTL_KERN, CTL_EOL); - sysctl_createv(&socket_sysctllog, 0, NULL, NULL, + sysctl_createv(clog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "somaxkva", SYSCTL_DESCR("Maximum amount of kernel memory to be "