Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. =================================================================== RCS file: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v rcsdiff: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v: warning: Unknown phrases like `commitid ...;' are present. retrieving revision 1.140.6.4 retrieving revision 1.159 diff -u -p -r1.140.6.4 -r1.159 --- src/sys/kern/uipc_socket.c 2007/11/11 16:48:17 1.140.6.4 +++ src/sys/kern/uipc_socket.c 2008/04/14 15:42:20 1.159 @@ -1,7 +1,7 @@ -/* $NetBSD: uipc_socket.c,v 1.140.6.4 2007/11/11 16:48:17 joerg Exp $ */ +/* $NetBSD: uipc_socket.c,v 1.159 2008/04/14 15:42:20 ad Exp $ */ /*- - * Copyright (c) 2002, 2007 The NetBSD Foundation, Inc. + * Copyright (c) 2002, 2007, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -37,6 +37,8 @@ */ /* + * Copyright (c) 2004 The FreeBSD Foundation + * Copyright (c) 2004 Robert Watson * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * @@ -68,7 +70,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.140.6.4 2007/11/11 16:48:17 joerg Exp $"); +__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.159 2008/04/14 15:42:20 ad Exp $"); #include "opt_sock_counters.h" #include "opt_sosend_loan.h" @@ -253,26 +255,17 @@ sokvafree(vaddr_t sva, vsize_t len) static void sodoloanfree(struct vm_page **pgs, void *buf, size_t size) { - vaddr_t va, sva, eva; + vaddr_t sva, eva; vsize_t len; - paddr_t pa; - int i, npgs; + int npgs; + + KASSERT(pgs != NULL); eva = round_page((vaddr_t) buf + size); sva = trunc_page((vaddr_t) buf); len = eva - sva; npgs = len >> PAGE_SHIFT; - if (__predict_false(pgs == NULL)) { - pgs = alloca(npgs * sizeof(*pgs)); - - for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) { - if (pmap_extract(pmap_kernel(), va, &pa) == false) - panic("sodoloanfree: va 0x%lx not mapped", va); - pgs[i] = PHYS_TO_VM_PAGE(pa); - } - } - pmap_kremove(sva, len); pmap_update(pmap_kernel()); uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE); @@ -280,7 +273,7 @@ sodoloanfree(struct vm_page **pgs, void } static size_t -sodopendfree() +sodopendfree(void) { size_t rv; @@ -299,7 +292,7 @@ sodopendfree() */ static size_t -sodopendfreel() +sodopendfreel(void) { struct mbuf *m, *next; size_t rv = 0; @@ -313,10 +306,11 @@ sodopendfreel() for (; m != NULL; m = next) { next = m->m_next; + KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) == 0); + KASSERT(m->m_ext.ext_refcnt == 0); rv += m->m_ext.ext_size; - sodoloanfree((m->m_flags & M_EXT_PAGES) ? - m->m_ext.ext_pgs : NULL, m->m_ext.ext_buf, + sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf, m->m_ext.ext_size); pool_cache_put(mb_cache, m); } @@ -331,15 +325,7 @@ void soloanfree(struct mbuf *m, void *buf, size_t size, void *arg) { - if (m == NULL) { - - /* - * called from MEXTREMOVE. - */ - - sodoloanfree(NULL, buf, size); - return; - } + KASSERT(m != NULL); /* * postpone freeing mbuf. @@ -361,8 +347,10 @@ sosend_loan(struct socket *so, struct ui struct iovec *iov = uio->uio_iov; vaddr_t sva, eva; vsize_t len; - vaddr_t lva, va; - int npgs, i, error; + vaddr_t lva; + int npgs, error; + vaddr_t va; + int i; if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) return (0); @@ -428,11 +416,11 @@ sokva_reclaim_callback(struct callback_e } struct mbuf * -getsombuf(struct socket *so) +getsombuf(struct socket *so, int type) { struct mbuf *m; - m = m_get(M_WAIT, MT_SONAME); + m = m_get(M_WAIT, type); MCLAIM(m, so->so_mowner); return m; } @@ -442,7 +430,7 @@ m_intopt(struct socket *so, int val) { struct mbuf *m; - m = getsombuf(so); + m = getsombuf(so, MT_SOOPTS); m->m_len = sizeof(int); *mtod(m, int *) = val; return m; @@ -452,7 +440,7 @@ void soinit(void) { - mutex_init(&so_pendfree_lock, MUTEX_DRIVER, IPL_VM); + mutex_init(&so_pendfree_lock, MUTEX_DEFAULT, IPL_VM); cv_init(&socurkva_cv, "sokva"); /* Set the initial adjusted socket buffer size. */ @@ -540,29 +528,23 @@ int fsocreate(int domain, struct socket **sop, int type, int protocol, struct lwp *l, int *fdout) { - struct filedesc *fdp; struct socket *so; struct file *fp; int fd, error; - fdp = l->l_proc->p_fd; - /* falloc() will use the desciptor for us */ - if ((error = falloc(l, &fp, &fd)) != 0) + if ((error = fd_allocfile(&fp, &fd)) != 0) return (error); fp->f_flag = FREAD|FWRITE; fp->f_type = DTYPE_SOCKET; fp->f_ops = &socketops; error = socreate(domain, &so, type, protocol, l); if (error != 0) { - FILE_UNUSE(fp, l); - fdremove(fdp, fd); - ffree(fp); + fd_abort(curproc, fp, fd); } else { if (sop != NULL) *sop = so; fp->f_data = so; - FILE_SET_MATURE(fp); - FILE_UNUSE(fp, l); + fd_affix(curproc, fp, fd); *fdout = fd; } return error; @@ -580,13 +562,16 @@ sobind(struct socket *so, struct mbuf *n } int -solisten(struct socket *so, int backlog) +solisten(struct socket *so, int backlog, struct lwp *l) { int s, error; s = splsoftnet(); + if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | + SS_ISDISCONNECTING)) != 0) + return (EOPNOTSUPP); error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, - NULL, NULL, NULL); + NULL, NULL, l); if (error != 0) { splx(s); return error; @@ -660,8 +645,7 @@ soclose(struct socket *so) goto drop; } if (so->so_options & SO_LINGER) { - if ((so->so_state & SS_ISDISCONNECTING) && - (so->so_state & SS_NBIO)) + if ((so->so_state & SS_ISDISCONNECTING) && so->so_nbio) goto drop; while (so->so_state & SS_ISCONNECTED) { error = tsleep((void *)&so->so_timeo, @@ -835,8 +819,8 @@ sosend(struct socket *so, struct mbuf *a dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); - if (p) - p->p_stats->p_ru.ru_msgsnd++; + if (l) + l->l_ru.ru_msgsnd++; if (control) clen = control->m_len; #define snderr(errno) { error = errno; splx(s); goto release; } @@ -870,7 +854,7 @@ sosend(struct socket *so, struct mbuf *a snderr(EMSGSIZE); if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { - if (so->so_state & SS_NBIO) + if (so->so_nbio) snderr(EWOULDBLOCK); sbunlock(&so->so_snd); error = sbwait(&so->so_snd); @@ -989,6 +973,41 @@ sosend(struct socket *so, struct mbuf *a } /* + * Following replacement or removal of the first mbuf on the first + * mbuf chain of a socket buffer, push necessary state changes back + * into the socket buffer so that other consumers see the values + * consistently. 'nextrecord' is the callers locally stored value of + * the original value of sb->sb_mb->m_nextpkt which must be restored + * when the lead mbuf changes. NOTE: 'nextrecord' may be NULL. + */ +static void +sbsync(struct sockbuf *sb, struct mbuf *nextrecord) +{ + + /* + * First, update for the new value of nextrecord. If necessary, + * make it the first record. + */ + if (sb->sb_mb != NULL) + sb->sb_mb->m_nextpkt = nextrecord; + else + sb->sb_mb = nextrecord; + + /* + * Now update any dependent socket buffer fields to reflect + * the new state. This is an inline of SB_EMPTY_FIXUP, with + * the addition of a second clause that takes care of the + * case where sb_mb has been updated, but remains the last + * record. + */ + if (sb->sb_mb == NULL) { + sb->sb_mbtail = NULL; + sb->sb_lastrecord = NULL; + } else if (sb->sb_mb->m_nextpkt == NULL) + sb->sb_lastrecord = sb->sb_mb; +} + +/* * Implement receive operations on a socket. * We depend on the way that records are added to the sockbuf * by sbappend*. In particular, each record (mbufs linked through m_next) @@ -1010,12 +1029,15 @@ soreceive(struct socket *so, struct mbuf { struct lwp *l = curlwp; struct mbuf *m, **mp; - int flags, len, error, s, offset, moff, type, orig_resid; + int atomic, flags, len, error, s, offset, moff, type, orig_resid; const struct protosw *pr; struct mbuf *nextrecord; int mbuf_removed = 0; + const struct domain *dom; pr = so->so_proto; + atomic = pr->pr_flags & PR_ATOMIC; + dom = pr->pr_domain; mp = mp0; type = 0; orig_resid = uio->uio_resid; @@ -1076,8 +1098,7 @@ soreceive(struct socket *so, struct mbuf (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && - m->m_nextpkt == NULL && - (pr->pr_flags & PR_ATOMIC) == 0)) { + m->m_nextpkt == NULL && !atomic)) { #ifdef DIAGNOSTIC if (m == NULL && so->so_rcv.sb_cc) panic("receive 1"); @@ -1108,7 +1129,7 @@ soreceive(struct socket *so, struct mbuf } if (uio->uio_resid == 0) goto release; - if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) { + if (so->so_nbio || (flags & MSG_DONTWAIT)) { error = EWOULDBLOCK; goto release; } @@ -1124,11 +1145,22 @@ soreceive(struct socket *so, struct mbuf dontblock: /* * On entry here, m points to the first record of the socket buffer. - * While we process the initial mbufs containing address and control - * info, we save a copy of m->m_nextpkt into nextrecord. + * From this point onward, we maintain 'nextrecord' as a cache of the + * pointer to the next record in the socket buffer. We must keep the + * various socket buffer pointers and local stack versions of the + * pointers in sync, pushing out modifications before dropping the + * IPL, and re-reading them when picking it up. + * + * Otherwise, we will race with the network stack appending new data + * or records onto the socket buffer by using inconsistent/stale + * versions of the field, possibly resulting in socket buffer + * corruption. + * + * By holding the high-level sblock(), we prevent simultaneous + * readers from pulling off the front of the socket buffer. */ if (l != NULL) - l->l_proc->p_stats->p_ru.ru_msgrcv++; + l->l_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb); SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); @@ -1155,72 +1187,78 @@ soreceive(struct socket *so, struct mbuf MFREE(m, so->so_rcv.sb_mb); m = so->so_rcv.sb_mb; } + sbsync(&so->so_rcv, nextrecord); } } - while (m != NULL && m->m_type == MT_CONTROL && error == 0) { - if (flags & MSG_PEEK) { - if (controlp != NULL) - *controlp = m_copy(m, 0, m->m_len); - m = m->m_next; - } else { - sbfree(&so->so_rcv, m); - mbuf_removed = 1; - if (controlp != NULL) { - struct domain *dom = pr->pr_domain; - if (dom->dom_externalize && l && - mtod(m, struct cmsghdr *)->cmsg_type == - SCM_RIGHTS) - error = (*dom->dom_externalize)(m, l); - *controlp = m; + + /* + * Process one or more MT_CONTROL mbufs present before any data mbufs + * in the first mbuf chain on the socket buffer. If MSG_PEEK, we + * just copy the data; if !MSG_PEEK, we call into the protocol to + * perform externalization (or freeing if controlp == NULL). + */ + if (__predict_false(m != NULL && m->m_type == MT_CONTROL)) { + struct mbuf *cm = NULL, *cmn; + struct mbuf **cme = &cm; + + do { + if (flags & MSG_PEEK) { + if (controlp != NULL) { + *controlp = m_copy(m, 0, m->m_len); + controlp = &(*controlp)->m_next; + } + m = m->m_next; + } else { + sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m->m_next; m->m_next = NULL; + *cme = m; + cme = &(*cme)->m_next; m = so->so_rcv.sb_mb; + } + } while (m != NULL && m->m_type == MT_CONTROL); + if ((flags & MSG_PEEK) == 0) + sbsync(&so->so_rcv, nextrecord); + for (; cm != NULL; cm = cmn) { + cmn = cm->m_next; + cm->m_next = NULL; + type = mtod(cm, struct cmsghdr *)->cmsg_type; + if (controlp != NULL) { + if (dom->dom_externalize != NULL && + type == SCM_RIGHTS) { + splx(s); + error = (*dom->dom_externalize)(cm, l); + s = splsoftnet(); + } + *controlp = cm; + while (*controlp != NULL) + controlp = &(*controlp)->m_next; } else { /* * Dispose of any SCM_RIGHTS message that went * through the read path rather than recv. */ - if (pr->pr_domain->dom_dispose && - mtod(m, struct cmsghdr *)->cmsg_type == SCM_RIGHTS) - (*pr->pr_domain->dom_dispose)(m); - MFREE(m, so->so_rcv.sb_mb); - m = so->so_rcv.sb_mb; + if (dom->dom_dispose != NULL && + type == SCM_RIGHTS) { + splx(s); + (*dom->dom_dispose)(cm); + s = splsoftnet(); + } + m_freem(cm); } } - if (controlp != NULL) { - orig_resid = 0; - controlp = &(*controlp)->m_next; - } + if (m != NULL) + nextrecord = so->so_rcv.sb_mb->m_nextpkt; + else + nextrecord = so->so_rcv.sb_mb; + orig_resid = 0; } - /* - * If m is non-NULL, we have some data to read. From now on, - * make sure to keep sb_lastrecord consistent when working on - * the last packet on the chain (nextrecord == NULL) and we - * change m->m_nextpkt. - */ - if (m != NULL) { - if ((flags & MSG_PEEK) == 0) { - m->m_nextpkt = nextrecord; - /* - * If nextrecord == NULL (this is a single chain), - * then sb_lastrecord may not be valid here if m - * was changed earlier. - */ - if (nextrecord == NULL) { - KASSERT(so->so_rcv.sb_mb == m); - so->so_rcv.sb_lastrecord = m; - } - } + /* If m is non-NULL, we have some data to read. */ + if (__predict_true(m != NULL)) { type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; - } else { - if ((flags & MSG_PEEK) == 0) { - KASSERT(so->so_rcv.sb_mb == m); - so->so_rcv.sb_mb = nextrecord; - SB_EMPTY_FIXUP(&so->so_rcv); - } } SBLASTRECORDCHK(&so->so_rcv, "soreceive 2"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 2"); @@ -1269,8 +1307,7 @@ soreceive(struct socket *so, struct mbuf * This avoids a later panic("receive 1a") * when compiled with DIAGNOSTIC. */ - if (m && mbuf_removed - && (pr->pr_flags & PR_ATOMIC)) + if (m && mbuf_removed && atomic) (void) sbdroprecord(&so->so_rcv); goto release; @@ -1373,7 +1410,7 @@ soreceive(struct socket *so, struct mbuf } } - if (m && pr->pr_flags & PR_ATOMIC) { + if (m && atomic) { flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) (void) sbdroprecord(&so->so_rcv); @@ -1683,8 +1720,9 @@ sogetopt(struct socket *so, int level, i void sohasoutofband(struct socket *so) { + fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so); - selwakeup(&so->so_rcv.sb_sel); + selnotify(&so->so_rcv.sb_sel, POLLPRI | POLLRDBAND, 0); } static void @@ -1692,7 +1730,7 @@ filt_sordetach(struct knote *kn) { struct socket *so; - so = (struct socket *)kn->kn_fp->f_data; + so = ((file_t *)kn->kn_obj)->f_data; SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext); if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist)) so->so_rcv.sb_flags &= ~SB_KNOTE; @@ -1704,7 +1742,7 @@ filt_soread(struct knote *kn, long hint) { struct socket *so; - so = (struct socket *)kn->kn_fp->f_data; + so = ((file_t *)kn->kn_obj)->f_data; kn->kn_data = so->so_rcv.sb_cc; if (so->so_state & SS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; @@ -1723,7 +1761,7 @@ filt_sowdetach(struct knote *kn) { struct socket *so; - so = (struct socket *)kn->kn_fp->f_data; + so = ((file_t *)kn->kn_obj)->f_data; SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext); if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist)) so->so_snd.sb_flags &= ~SB_KNOTE; @@ -1735,7 +1773,7 @@ filt_sowrite(struct knote *kn, long hint { struct socket *so; - so = (struct socket *)kn->kn_fp->f_data; + so = ((file_t *)kn->kn_obj)->f_data; kn->kn_data = sbspace(&so->so_snd); if (so->so_state & SS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; @@ -1758,7 +1796,7 @@ filt_solisten(struct knote *kn, long hin { struct socket *so; - so = (struct socket *)kn->kn_fp->f_data; + so = ((file_t *)kn->kn_obj)->f_data; /* * Set kn_data to number of incoming connections, not @@ -1781,7 +1819,7 @@ soo_kqfilter(struct file *fp, struct kno struct socket *so; struct sockbuf *sb; - so = (struct socket *)kn->kn_fp->f_data; + so = ((file_t *)kn->kn_obj)->f_data; switch (kn->kn_filter) { case EVFILT_READ: if (so->so_options & SO_ACCEPTCONN) @@ -1795,13 +1833,66 @@ soo_kqfilter(struct file *fp, struct kno sb = &so->so_snd; break; default: - return (1); + return (EINVAL); } SLIST_INSERT_HEAD(&sb->sb_sel.sel_klist, kn, kn_selnext); sb->sb_flags |= SB_KNOTE; return (0); } +static int +sodopoll(struct socket *so, int events) +{ + int revents; + + revents = 0; + + if (events & (POLLIN | POLLRDNORM)) + if (soreadable(so)) + revents |= events & (POLLIN | POLLRDNORM); + + if (events & (POLLOUT | POLLWRNORM)) + if (sowritable(so)) + revents |= events & (POLLOUT | POLLWRNORM); + + if (events & (POLLPRI | POLLRDBAND)) + if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) + revents |= events & (POLLPRI | POLLRDBAND); + + return revents; +} + +int +sopoll(struct socket *so, int events) +{ + int revents = 0; + int s; + + if ((revents = sodopoll(so, events)) != 0) + return revents; + + KERNEL_LOCK(1, curlwp); + s = splsoftnet(); + + if ((revents = sodopoll(so, events)) == 0) { + if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { + selrecord(curlwp, &so->so_rcv.sb_sel); + so->so_rcv.sb_flags |= SB_SEL; + } + + if (events & (POLLOUT | POLLWRNORM)) { + selrecord(curlwp, &so->so_snd.sb_sel); + so->so_snd.sb_flags |= SB_SEL; + } + } + + splx(s); + KERNEL_UNLOCK_ONE(curlwp); + + return revents; +} + + #include static int sysctl_kern_somaxkva(SYSCTLFN_PROTO);