Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

===================================================================
RCS file: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket2.c,v
rcsdiff: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket2.c,v: warning: Unknown phrases like `commitid ...;' are present.
retrieving revision 1.89
retrieving revision 1.89.6.2
diff -u -p -r1.89 -r1.89.6.2
--- src/sys/kern/uipc_socket2.c	2008/02/07 12:14:43	1.89
+++ src/sys/kern/uipc_socket2.c	2008/06/02 13:24:13	1.89.6.2
@@ -1,4 +1,30 @@
-/*	$NetBSD: uipc_socket2.c,v 1.89 2008/02/07 12:14:43 ad Exp $	*/
+/*	$NetBSD: uipc_socket2.c,v 1.89.6.2 2008/06/02 13:24:13 mjf Exp $	*/
+
+/*-
+ * Copyright (c) 2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
 
 /*
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
@@ -32,7 +58,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.89 2008/02/07 12:14:43 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.89.6.2 2008/06/02 13:24:13 mjf Exp $");
 
 #include "opt_mbuftrace.h"
 #include "opt_sb_max.h"
@@ -45,21 +71,56 @@ __KERNEL_RCSID(0, "$NetBSD: uipc_socket2
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
 
 /*
- * Primitive routines for operating on sockets and socket buffers
+ * Primitive routines for operating on sockets and socket buffers.
+ *
+ * Locking rules and assumptions:
+ *
+ * o socket::so_lock can change on the fly.  The low level routines used
+ *   to lock sockets are aware of this.  When so_lock is acquired, the
+ *   routine locking must check to see if so_lock still points to the
+ *   lock that was acquired.  If so_lock has changed in the meantime, the
+ *   now irrelevant lock that was acquired must be dropped and the lock
+ *   operation retried.  Although not proven here, this is completely safe
+ *   on a multiprocessor system, even with relaxed memory ordering, given
+ *   the next two rules:
+ *
+ * o In order to mutate so_lock, the lock pointed to by the current value
+ *   of so_lock must be held: i.e., the socket must be held locked by the
+ *   changing thread.  The thread must issue membar_exit() to prevent
+ *   memory accesses being reordered, and can set so_lock to the desired
+ *   value.  If the lock pointed to by the new value of so_lock is not
+ *   held by the changing thread, the socket must then be considered
+ *   unlocked.
+ *
+ * o If so_lock is mutated, and the previous lock referred to by so_lock
+ *   could still be visible to other threads in the system (e.g. via file
+ *   descriptor or protocol-internal reference), then the old lock must
+ *   remain valid until the socket and/or protocol control block has been
+ *   torn down.
+ *
+ * o If a socket has a non-NULL so_head value (i.e. is in the process of
+ *   connecting), then locking the socket must also lock the socket pointed
+ *   to by so_head: their lock pointers must match.
+ *
+ * o If a socket has connections in progress (so_q, so_q0 not empty) then
+ *   locking the socket must also lock the sockets attached to both queues.
+ *   Again, their lock pointers must match.
+ *
+ * o Beyond the initial lock assignment in socreate(), assigning locks to
+ *   sockets is the responsibility of the individual protocols / protocol
+ *   domains.
  */
 
-/* strings for sleep message: */
-const char	netcon[] = "netcon";
-const char	netcls[] = "netcls";
-const char	netio[] = "netio";
-const char	netlck[] = "netlck";
+static pool_cache_t	socket_cache;
 
 u_long	sb_max = SB_MAX;	/* maximum socket buffer size */
 static u_long	sb_max_adj;	/* adjusted sb_max */
@@ -98,6 +159,8 @@ void
 soisconnecting(struct socket *so)
 {
 
+	KASSERT(solocked(so));
+
 	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
 	so->so_state |= SS_ISCONNECTING;
 }
@@ -108,14 +171,18 @@ soisconnected(struct socket *so)
 {
 	struct socket *head;
 
 	head = so->so_head;
+
+	KASSERT(solocked(so));
+	KASSERT(head == NULL || solocked2(so, head));
+
 	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
 	so->so_state |= SS_ISCONNECTED;
 	if (head && soqremque(so, 0)) {
 		soqinsque(head, so, 1);
 		sorwakeup(head);
-		wakeup((void *)&head->so_timeo);
+		cv_broadcast(&head->so_cv);
 	} else {
-		wakeup((void *)&so->so_timeo);
+		cv_broadcast(&so->so_cv);
 		sorwakeup(so);
 		sowwakeup(so);
 	}
@@ -125,9 +192,11 @@ void
 soisdisconnecting(struct socket *so)
 {
 
+	KASSERT(solocked(so));
+
 	so->so_state &= ~SS_ISCONNECTING;
 	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
-	wakeup((void *)&so->so_timeo);
+	cv_broadcast(&so->so_cv);
 	sowwakeup(so);
 	sorwakeup(so);
 }
@@ -136,13 +205,23 @@ void
 soisdisconnected(struct socket *so)
 {
 
+	KASSERT(solocked(so));
+
 	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
 	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
-	wakeup((void *)&so->so_timeo);
+	cv_broadcast(&so->so_cv);
 	sowwakeup(so);
 	sorwakeup(so);
 }
 
+void
+soinit2(void)
+{
+
+	socket_cache = pool_cache_init(sizeof(struct socket), 0, 0, 0,
+	    "socket", NULL, IPL_SOFTNET, NULL, NULL, NULL);
+}
+
 /*
  * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
@@ -155,15 +234,18 @@ struct socket *
 sonewconn(struct socket *head, int connstatus)
 {
 	struct socket *so;
-	int soqueue;
+	int soqueue, error;
+
+	KASSERT(solocked(head));
 
 	soqueue = connstatus ? 1 : 0;
 	if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2)
 		return ((struct socket *)0);
-	so = pool_get(&socket_pool, PR_NOWAIT);
+	so = soget(false);
 	if (so == NULL)
 		return (NULL);
-	memset((void *)so, 0, sizeof(*so));
+	mutex_obj_hold(head->so_lock);
+	so->so_lock = head->so_lock;
 	so->so_type = head->so_type;
 	so->so_options = head->so_options &~ SO_ACCEPTCONN;
 	so->so_linger = head->so_linger;
@@ -180,8 +262,6 @@ sonewconn(struct socket *head, int conns
 	so->so_rcv.sb_mowner = head->so_rcv.sb_mowner;
 	so->so_snd.sb_mowner = head->so_snd.sb_mowner;
 #endif
-	selinit(&so->so_rcv.sb_sel);
-	selinit(&so->so_snd.sb_sel);
 	(void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat);
 	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
 	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
@@ -190,27 +270,65 @@ sonewconn(struct socket *head, int conns
 	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
 	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
 	soqinsque(head, so, soqueue);
-	if ((*so->so_proto->pr_usrreq)(so, PRU_ATTACH,
-	    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
-	    (struct lwp *)0)) {
+	error = (*so->so_proto->pr_usrreq)(so, PRU_ATTACH, NULL, NULL,
+	    NULL, NULL);
+	KASSERT(solocked(so));
+	if (error != 0) {
 		(void) soqremque(so, soqueue);
-		seldestroy(&so->so_rcv.sb_sel);
-		seldestroy(&so->so_snd.sb_sel);
-		pool_put(&socket_pool, so);
+		soput(so);
 		return (NULL);
 	}
 	if (connstatus) {
 		sorwakeup(head);
-		wakeup((void *)&head->so_timeo);
+		cv_broadcast(&head->so_cv);
 		so->so_state |= connstatus;
 	}
 	return (so);
 }
 
+struct socket *
+soget(bool waitok)
+{
+	struct socket *so;
+
+	so = pool_cache_get(socket_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
+	if (__predict_false(so == NULL))
+		return (NULL);
+	memset(so, 0, sizeof(*so));
+	TAILQ_INIT(&so->so_q0);
+	TAILQ_INIT(&so->so_q);
+	cv_init(&so->so_cv, "socket");
+	cv_init(&so->so_rcv.sb_cv, "netio");
+	cv_init(&so->so_snd.sb_cv, "netio");
+	selinit(&so->so_rcv.sb_sel);
+	selinit(&so->so_snd.sb_sel);
+	so->so_rcv.sb_so = so;
+	so->so_snd.sb_so = so;
+	return so;
+}
+
+void
+soput(struct socket *so)
+{
+
+	KASSERT(!cv_has_waiters(&so->so_cv));
+	KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
+	KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
+	seldestroy(&so->so_rcv.sb_sel);
+	seldestroy(&so->so_snd.sb_sel);
+	mutex_obj_free(so->so_lock);
+	cv_destroy(&so->so_cv);
+	cv_destroy(&so->so_rcv.sb_cv);
+	cv_destroy(&so->so_snd.sb_cv);
+	pool_cache_put(socket_cache, so);
+}
+
 void
 soqinsque(struct socket *head, struct socket *so, int q)
 {
 
+	KASSERT(solocked2(head, so));
+
 #ifdef DIAGNOSTIC
 	if (so->so_onq != NULL)
 		panic("soqinsque");
@@ -233,6 +351,8 @@ soqremque(struct socket *so, int q)
 	struct socket *head;
 
 	head = so->so_head;
+
+	KASSERT(solocked(so));
 	if (q == 0) {
 		if (so->so_onq != &head->so_q0)
 			return (0);
@@ -242,6 +362,7 @@ soqremque(struct socket *so, int q)
 			return (0);
 		head->so_qlen--;
 	}
+	KASSERT(solocked2(so, head));
 	TAILQ_REMOVE(so->so_onq, so, so_qe);
 	so->so_onq = NULL;
 	so->so_head = NULL;
@@ -262,6 +383,8 @@ void
 socantsendmore(struct socket *so)
 {
 
+	KASSERT(solocked(so));
+
 	so->so_state |= SS_CANTSENDMORE;
 	sowwakeup(so);
 }
@@ -270,6 +393,8 @@ void
 socantrcvmore(struct socket *so)
 {
 
+	KASSERT(solocked(so));
+
 	so->so_state |= SS_CANTRCVMORE;
 	sorwakeup(so);
 }
@@ -280,32 +405,23 @@ socantrcvmore(struct socket *so)
 int
 sbwait(struct sockbuf *sb)
 {
+	struct socket *so;
+	kmutex_t *lock;
+	int error;
 
-	sb->sb_flags |= SB_WAIT;
-	return (tsleep((void *)&sb->sb_cc,
-	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, netio,
-	    sb->sb_timeo));
-}
+	so = sb->sb_so;
 
-/*
- * Lock a sockbuf already known to be locked;
- * return any error returned from sleep (EINTR).
- */
-int
-sb_lock(struct sockbuf *sb)
-{
-	int error;
+	KASSERT(solocked(so));
 
-	while (sb->sb_flags & SB_LOCK) {
-		sb->sb_flags |= SB_WANT;
-		error = tsleep((void *)&sb->sb_flags,
-		    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH,
-		    netlck, 0);
-		if (error)
-			return (error);
-	}
-	sb->sb_flags |= SB_LOCK;
-	return (0);
+	sb->sb_flags |= SB_NOTIFY;
+	lock = so->so_lock;
+	if ((sb->sb_flags & SB_NOINTR) != 0)
+		error = cv_timedwait(&sb->sb_cv, lock, sb->sb_timeo);
+	else
+		error = cv_timedwait_sig(&sb->sb_cv, lock, sb->sb_timeo);
+	if (__predict_false(lock != so->so_lock))
+		solockretry(so, lock);
+	return error;
 }
 
 /*
@@ -316,20 +432,20 @@ sb_lock(struct sockbuf *sb)
 void
 sowakeup(struct socket *so, struct sockbuf *sb, int code)
 {
-	selnotify(&sb->sb_sel, 0);
-	sb->sb_flags &= ~SB_SEL;
-	if (sb->sb_flags & SB_WAIT) {
-		sb->sb_flags &= ~SB_WAIT;
-		wakeup((void *)&sb->sb_cc);
-	}
-	if (sb->sb_flags & SB_ASYNC) {
-		int band;
-		if (code == POLL_IN)
-			band = POLLIN|POLLRDNORM;
-		else
-			band = POLLOUT|POLLWRNORM;
+	int band;
+
+	KASSERT(solocked(so));
+	KASSERT(sb->sb_so == so);
+
+	if (code == POLL_IN)
+		band = POLLIN|POLLRDNORM;
+	else
+		band = POLLOUT|POLLWRNORM;
+	sb->sb_flags &= ~SB_NOTIFY;
+	selnotify(&sb->sb_sel, band, NOTE_SUBMIT);
+	cv_broadcast(&sb->sb_cv);
+	if (sb->sb_flags & SB_ASYNC)
 		fownsignal(so->so_pgid, SIGIO, code, band, so);
-	}
 	if (sb->sb_flags & SB_UPCALL)
 		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
 }
@@ -385,6 +501,9 @@ sb_max_set(u_long new_sbmax)
 int
 soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
 {
+
+	KASSERT(so->so_lock == NULL || solocked(so));
+
 	/*
	 * there's at least one application (a configure script of screen)
	 * which expects a fifo is writable even if it has "some" bytes
@@ -428,19 +547,19 @@ sbreserve(struct sockbuf *sb, u_long cc,
 	rlim_t maxcc;
 	struct uidinfo *uidinfo;
 
-	KDASSERT(sb_max_adj != 0);
+	KASSERT(so->so_lock == NULL || solocked(so));
+	KASSERT(sb->sb_so == so);
+	KASSERT(sb_max_adj != 0);
+
 	if (cc == 0 || cc > sb_max_adj)
 		return (0);
-	if (so) {
-		if (kauth_cred_geteuid(l->l_cred) == so->so_uidinfo->ui_uid)
-			maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur;
-		else
-			maxcc = RLIM_INFINITY;
-		uidinfo = so->so_uidinfo;
-	} else {
-		uidinfo = uid_find(0);	/* XXX: nothing better */
+
+	if (kauth_cred_geteuid(l->l_cred) == so->so_uidinfo->ui_uid)
+		maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur;
+	else
 		maxcc = RLIM_INFINITY;
-	}
+
+	uidinfo = so->so_uidinfo;
 	if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc))
 		return 0;
 	sb->sb_mbmax = min(cc * 2, sb_max);
@@ -450,12 +569,15 @@ sbreserve(struct sockbuf *sb, u_long cc,
 }
 
 /*
- * Free mbufs held by a socket, and reserved mbuf space.
+ * Free mbufs held by a socket, and reserved mbuf space.  We do not assert
+ * that the socket is held locked here: see sorflush().
  */
 void
 sbrelease(struct sockbuf *sb, struct socket *so)
 {
 
+	KASSERT(sb->sb_so == so);
+
 	sbflush(sb);
 	(void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY);
 	sb->sb_mbmax = 0;
@@ -492,6 +614,8 @@ sblastrecordchk(struct sockbuf *sb, cons
 {
 	struct mbuf *m = sb->sb_mb;
 
+	KASSERT(solocked(sb->sb_so));
+
 	while (m && m->m_nextpkt)
 		m = m->m_nextpkt;
@@ -511,6 +635,8 @@ sblastmbufchk(struct sockbuf *sb, const
 	struct mbuf *m = sb->sb_mb;
 	struct mbuf *n;
 
+	KASSERT(solocked(sb->sb_so));
+
 	while (m && m->m_nextpkt)
 		m = m->m_nextpkt;
@@ -559,6 +685,8 @@ sbappend(struct sockbuf *sb, struct mbuf
 {
 	struct mbuf *n;
 
+	KASSERT(solocked(sb->sb_so));
+
 	if (m == 0)
 		return;
@@ -600,6 +728,7 @@ void
 sbappendstream(struct sockbuf *sb, struct mbuf *m)
 {
 
+	KASSERT(solocked(sb->sb_so));
 	KDASSERT(m->m_nextpkt == NULL);
 	KASSERT(sb->sb_mb == sb->sb_lastrecord);
@@ -619,18 +748,22 @@ sbappendstream(struct sockbuf *sb, struc
 void
 sbcheck(struct sockbuf *sb)
 {
-	struct mbuf *m;
+	struct mbuf *m, *m2;
 	u_long len, mbcnt;
 
+	KASSERT(solocked(sb->sb_so));
+
 	len = 0;
 	mbcnt = 0;
-	for (m = sb->sb_mb; m; m = m->m_next) {
-		len += m->m_len;
-		mbcnt += MSIZE;
-		if (m->m_flags & M_EXT)
-			mbcnt += m->m_ext.ext_size;
-		if (m->m_nextpkt)
-			panic("sbcheck nextpkt");
+	for (m = sb->sb_mb; m; m = m->m_nextpkt) {
+		for (m2 = m; m2 != NULL; m2 = m2->m_next) {
+			len += m2->m_len;
+			mbcnt += MSIZE;
+			if (m2->m_flags & M_EXT)
+				mbcnt += m2->m_ext.ext_size;
+			if (m2->m_nextpkt != NULL)
+				panic("sbcheck nextpkt");
+		}
 	}
 	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
 		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
@@ -649,6 +782,8 @@ sbappendrecord(struct sockbuf *sb, struc
 {
 	struct mbuf *m;
 
+	KASSERT(solocked(sb->sb_so));
+
 	if (m0 == 0)
 		return;
@@ -682,6 +817,8 @@ sbinsertoob(struct sockbuf *sb, struct m
 {
 	struct mbuf *m, **mp;
 
+	KASSERT(solocked(sb->sb_so));
+
 	if (m0 == 0)
 		return;
@@ -734,6 +871,8 @@ sbappendaddr(struct sockbuf *sb, const s
 	struct mbuf *m, *n, *nlast;
 	int space, len;
 
+	KASSERT(solocked(sb->sb_so));
+
 	space = asa->sa_len;
 
 	if (m0 != NULL) {
@@ -786,7 +925,6 @@ sbappendaddr(struct sockbuf *sb, const s
 	sb->sb_mbtail = nlast;
 	SBLASTMBUFCHK(sb, "sbappendaddr");
-
 	SBLASTRECORDCHK(sb, "sbappendaddr 2");
 
 	return (1);
@@ -803,6 +941,8 @@ m_prepend_sockaddr(struct sockbuf *sb, s
 	struct mbuf *m;
 	const int salen = asa->sa_len;
 
+	KASSERT(solocked(sb->sb_so));
+
 	/* only the first in each chain need be a pkthdr */
 	MGETHDR(m, M_DONTWAIT, MT_SONAME);
 	if (m == 0)
@@ -835,6 +975,8 @@ sbappendaddrchain(struct sockbuf *sb, co
 	struct mbuf *m, *n, *n0, *nlast;
 	int error;
 
+	KASSERT(solocked(sb->sb_so));
+
 	/*
	 * XXX sbprio reserved for encoding priority of this* request:
	 *  SB_PRIO_NONE --> honour normal sb limits
@@ -930,6 +1072,8 @@ sbappendcontrol(struct sockbuf *sb, stru
 	struct mbuf *m, *mlast, *n;
 	int space;
 
+	KASSERT(solocked(sb->sb_so));
+
 	space = 0;
 	if (control == 0)
 		panic("sbappendcontrol");
@@ -958,7 +1102,6 @@ sbappendcontrol(struct sockbuf *sb, stru
 	sb->sb_mbtail = mlast;
 	SBLASTMBUFCHK(sb, "sbappendcontrol");
-
 	SBLASTRECORDCHK(sb, "sbappendcontrol 2");
 
 	return (1);
@@ -975,6 +1118,8 @@ sbcompress(struct sockbuf *sb, struct mb
 	int eor;
 	struct mbuf *o;
 
+	KASSERT(solocked(sb->sb_so));
+
 	eor = 0;
 	while (m) {
 		eor |= m->m_flags & M_EOR;
@@ -1027,6 +1172,7 @@ void
 sbflush(struct sockbuf *sb)
 {
 
+	KASSERT(solocked(sb->sb_so));
 	KASSERT((sb->sb_flags & SB_LOCK) == 0);
 
 	while (sb->sb_mbcnt)
@@ -1046,6 +1192,8 @@ sbdrop(struct sockbuf *sb, int len)
 {
 	struct mbuf *m, *mn, *next;
 
+	KASSERT(solocked(sb->sb_so));
+
 	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
 	while (len > 0) {
 		if (m == 0) {
@@ -1098,6 +1246,8 @@ sbdroprecord(struct sockbuf *sb)
 {
 	struct mbuf *m, *mn;
 
+	KASSERT(solocked(sb->sb_so));
+
 	m = sb->sb_mb;
 	if (m) {
 		sb->sb_mb = m->m_nextpkt;
@@ -1141,3 +1291,116 @@ sbcreatecontrol(void *p, int size, int t
 	cp->cmsg_type = type;
 	return (m);
 }
+
+void
+solockretry(struct socket *so, kmutex_t *lock)
+{
+
+	while (lock != so->so_lock) {
+		mutex_exit(lock);
+		lock = so->so_lock;
+		mutex_enter(lock);
+	}
+}
+
+bool
+solocked(struct socket *so)
+{
+
+	return mutex_owned(so->so_lock);
+}
+
+bool
+solocked2(struct socket *so1, struct socket *so2)
+{
+	kmutex_t *lock;
+
+	lock = so1->so_lock;
+	if (lock != so2->so_lock)
+		return false;
+	return mutex_owned(lock);
+}
+
+/*
+ * Assign a default lock to a new socket.  For PRU_ATTACH, and done by
+ * protocols that do not have special locking requirements.
+ */
+void
+sosetlock(struct socket *so)
+{
+	kmutex_t *lock;
+
+	if (so->so_lock == NULL) {
+		lock = softnet_lock;
+		so->so_lock = lock;
+		mutex_obj_hold(lock);
+		mutex_enter(lock);
+	}
+
+	/* In all cases, lock must be held on return from PRU_ATTACH. */
+	KASSERT(solocked(so));
+}
+
+/*
+ * Set lock on sockbuf sb; sleep if lock is already held.
+ * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
+ * Returns error without lock if sleep is interrupted.
+ */
+int
+sblock(struct sockbuf *sb, int wf)
+{
+	struct socket *so;
+	kmutex_t *lock;
+	int error;
+
+	KASSERT(solocked(sb->sb_so));
+
+	for (;;) {
+		if (__predict_true((sb->sb_flags & SB_LOCK) == 0)) {
+			sb->sb_flags |= SB_LOCK;
+			return 0;
+		}
+		if (wf != M_WAITOK)
+			return EWOULDBLOCK;
+		so = sb->sb_so;
+		lock = so->so_lock;
+		if ((sb->sb_flags & SB_NOINTR) != 0) {
+			cv_wait(&so->so_cv, lock);
+			error = 0;
+		} else
+			error = cv_wait_sig(&so->so_cv, lock);
+		if (__predict_false(lock != so->so_lock))
+			solockretry(so, lock);
+		if (error != 0)
+			return error;
+	}
+}
+
+void
+sbunlock(struct sockbuf *sb)
+{
+	struct socket *so;
+
+	so = sb->sb_so;
+
+	KASSERT(solocked(so));
+	KASSERT((sb->sb_flags & SB_LOCK) != 0);
+
+	sb->sb_flags &= ~SB_LOCK;
+	cv_broadcast(&so->so_cv);
+}
+
+int
+sowait(struct socket *so, int timo)
+{
+	kmutex_t *lock;
+	int error;
+
+	KASSERT(solocked(so));
+
+	lock = so->so_lock;
+	error = cv_timedwait_sig(&so->so_cv, lock, timo);
+	if (__predict_false(lock != so->so_lock))
+		solockretry(so, lock);
+	return error;
+}
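
----------------------------------------------------------------------------
Note (not part of the diff above): the locking rules in the new header
comment and the solockretry() routine combine into one idiom: take the
mutex currently named by so_lock, then re-check so_lock, because another
thread may have re-pointed it while this thread slept.  The sketch below
is a hypothetical illustration of that idiom only; solock_sketch is not a
function in this diff (the real lock-entry routine lives outside this
file), and only solockretry(), mutex_enter()/mutex_exit() and the so_lock
field are taken from the change itself.

	/* Hypothetical sketch; not part of the diff above. */
	static void
	solock_sketch(struct socket *so)
	{
		kmutex_t *lock;

		lock = so->so_lock;	/* snapshot the current pointer */
		mutex_enter(lock);
		if (__predict_false(lock != so->so_lock)) {
			/*
			 * so_lock was re-pointed while we waited: drop
			 * the now irrelevant mutex and chase the new
			 * pointer until it holds still.
			 */
			solockretry(so, lock);
		}
	}

sbwait(), sblock() and sowait() in the diff apply the same re-check and
retry after each cv_wait()/cv_timedwait() call, since sleeping on the
condition variable releases and reacquires the (possibly re-pointed)
so_lock.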