Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. =================================================================== RCS file: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v rcsdiff: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v: warning: Unknown phrases like `commitid ...;' are present. retrieving revision 1.177.4.4 retrieving revision 1.196 diff -u -p -r1.177.4.4 -r1.196 --- src/sys/kern/uipc_socket.c 2011/08/08 19:45:57 1.177.4.4 +++ src/sys/kern/uipc_socket.c 2009/12/20 09:36:06 1.196 @@ -1,4 +1,4 @@ -/* $NetBSD: uipc_socket.c,v 1.177.4.4 2011/08/08 19:45:57 riz Exp $ */ +/* $NetBSD: uipc_socket.c,v 1.196 2009/12/20 09:36:06 dsl Exp $ */ /*- * Copyright (c) 2002, 2007, 2008, 2009 The NetBSD Foundation, Inc. @@ -63,8 +63,9 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.177.4.4 2011/08/08 19:45:57 riz Exp $"); +__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.196 2009/12/20 09:36:06 dsl Exp $"); +#include "opt_compat_netbsd.h" #include "opt_sock_counters.h" #include "opt_sosend_loan.h" #include "opt_mbuftrace.h" @@ -91,7 +92,11 @@ __KERNEL_RCSID(0, "$NetBSD: uipc_socket. 
#include #include #include -#include + +#ifdef COMPAT_50 +#include +#include +#endif #include @@ -137,7 +142,7 @@ int sock_loan_thresh = 4096; #endif static kmutex_t so_pendfree_lock; -static struct mbuf *so_pendfree = NULL; +static struct mbuf *so_pendfree; #ifndef SOMAXKVA #define SOMAXKVA (16 * 1024 * 1024) @@ -146,11 +151,15 @@ int somaxkva = SOMAXKVA; static int socurkva; static kcondvar_t socurkva_cv; +static kauth_listener_t socket_listener; + #define SOCK_LOAN_CHUNK 65536 -static void sopendfree_thread(void *); -static kcondvar_t pendfree_thread_cv; -static lwp_t *sopendfree_lwp; +static size_t sodopendfree(void); +static size_t sodopendfreel(void); + +static void sysctl_kern_somaxkva_setup(void); +static struct sysctllog *socket_sysctllog; static vsize_t sokvareserve(struct socket *so, vsize_t len) @@ -159,6 +168,21 @@ sokvareserve(struct socket *so, vsize_t mutex_enter(&so_pendfree_lock); while (socurkva + len > somaxkva) { + size_t freed; + + /* + * try to do pendfree. + */ + + freed = sodopendfreel(); + + /* + * if some kva was freed, try again. + */ + + if (freed) + continue; + SOSEND_COUNTER_INCR(&sosend_kvalimit); error = cv_wait_sig(&socurkva_cv, &so_pendfree_lock); if (error) { @@ -251,45 +275,56 @@ sodoloanfree(struct vm_page **pgs, void sokvafree(sva, len); } +static size_t +sodopendfree(void) +{ + size_t rv; + + if (__predict_true(so_pendfree == NULL)) + return 0; + + mutex_enter(&so_pendfree_lock); + rv = sodopendfreel(); + mutex_exit(&so_pendfree_lock); + + return rv; +} + /* - * sopendfree_thread: free mbufs on "pendfree" list. + * sodopendfreel: free mbufs on "pendfree" list. * unlock and relock so_pendfree_lock when freeing mbufs. + * + * => called with so_pendfree_lock held. 
*/ -static void -sopendfree_thread(void *v) +static size_t +sodopendfreel(void) { struct mbuf *m, *next; - size_t rv; + size_t rv = 0; - mutex_enter(&so_pendfree_lock); + KASSERT(mutex_owned(&so_pendfree_lock)); - for (;;) { - rv = 0; - while (so_pendfree != NULL) { - m = so_pendfree; - so_pendfree = NULL; - mutex_exit(&so_pendfree_lock); - - for (; m != NULL; m = next) { - next = m->m_next; - KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) == 0); - KASSERT(m->m_ext.ext_refcnt == 0); - - rv += m->m_ext.ext_size; - sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf, - m->m_ext.ext_size); - pool_cache_put(mb_cache, m); - } + while (so_pendfree != NULL) { + m = so_pendfree; + so_pendfree = NULL; + mutex_exit(&so_pendfree_lock); + + for (; m != NULL; m = next) { + next = m->m_next; + KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) == 0); + KASSERT(m->m_ext.ext_refcnt == 0); - mutex_enter(&so_pendfree_lock); + rv += m->m_ext.ext_size; + sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf, + m->m_ext.ext_size); + pool_cache_put(mb_cache, m); } - if (rv) - cv_broadcast(&socurkva_cv); - cv_wait(&pendfree_thread_cv, &so_pendfree_lock); + + mutex_enter(&so_pendfree_lock); } - panic("sopendfree_thread"); - /* NOTREACHED */ + + return (rv); } void @@ -308,7 +343,7 @@ soloanfree(struct mbuf *m, void *buf, si mutex_enter(&so_pendfree_lock); m->m_next = so_pendfree; so_pendfree = m; - cv_signal(&pendfree_thread_cv); + cv_broadcast(&socurkva_cv); mutex_exit(&so_pendfree_lock); } @@ -351,7 +386,7 @@ sosend_loan(struct socket *so, struct ui for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE) pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]), - VM_PROT_READ); + VM_PROT_READ, 0); pmap_update(pmap_kernel()); lva += (vaddr_t) iov->iov_base & PAGE_MASK; @@ -378,6 +413,7 @@ sokva_reclaim_callback(struct callback_e KASSERT(ce == &sokva_reclaimerentry); KASSERT(obj == NULL); + sodopendfree(); if (!vm_map_starved_p(kernel_map)) { return CALLBACK_CHAIN_ABORT; } @@ -394,31 +430,83 @@ getsombuf(struct 
socket *so, int type) return m; } +static int +socket_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, + void *arg0, void *arg1, void *arg2, void *arg3) +{ + int result; + enum kauth_network_req req; + + result = KAUTH_RESULT_DEFER; + req = (enum kauth_network_req)arg0; + + if ((action != KAUTH_NETWORK_SOCKET) && + (action != KAUTH_NETWORK_BIND)) + return result; + + switch (req) { + case KAUTH_REQ_NETWORK_BIND_PORT: + result = KAUTH_RESULT_ALLOW; + break; + + case KAUTH_REQ_NETWORK_SOCKET_DROP: { + /* Normal users can only drop their own connections. */ + struct socket *so = (struct socket *)arg1; + uid_t sockuid = so->so_uidinfo->ui_uid; + + if (sockuid == kauth_cred_getuid(cred) || + sockuid == kauth_cred_geteuid(cred)) + result = KAUTH_RESULT_ALLOW; + + break; + } + + case KAUTH_REQ_NETWORK_SOCKET_OPEN: + /* We allow "raw" routing/bluetooth sockets to anyone. */ + if ((u_long)arg1 == PF_ROUTE || (u_long)arg1 == PF_BLUETOOTH) + result = KAUTH_RESULT_ALLOW; + else { + /* Privileged, let secmodel handle this. */ + if ((u_long)arg2 == SOCK_RAW) + break; + } + + result = KAUTH_RESULT_ALLOW; + + break; + + case KAUTH_REQ_NETWORK_SOCKET_CANSEE: + result = KAUTH_RESULT_ALLOW; + + break; + + default: + break; + } + + return result; +} + void -soinit() +soinit(void) { + + sysctl_kern_somaxkva_setup(); + mutex_init(&so_pendfree_lock, MUTEX_DEFAULT, IPL_VM); softnet_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); cv_init(&socurkva_cv, "sokva"); - cv_init(&pendfree_thread_cv, "sopendfr"); soinit2(); - /* Set the initial adjusted socket buffer size. 
*/ if (sb_max_set(sb_max)) panic("bad initial sb_max value: %lu", sb_max); callback_register(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback, &sokva_reclaimerentry, NULL, sokva_reclaim_callback); -} -void -soinit1(void) -{ - int error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, - sopendfree_thread, NULL, &sopendfree_lwp, "sopendfree"); - if (error) - panic("soinit1 %d", error); + socket_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK, + socket_listener_cb, NULL); } /* @@ -473,6 +561,7 @@ socreate(int dom, struct socket **aso, i so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner; so->so_mowner = &prp->pr_domain->dom_mowner; #endif + /* so->so_cred = kauth_cred_dup(l->l_cred); */ uid = kauth_cred_geteuid(l->l_cred); so->so_uidinfo = uid_find(uid); so->so_egid = kauth_cred_getegid(l->l_cred); @@ -529,6 +618,19 @@ fsocreate(int domain, struct socket **so } int +sofamily(const struct socket *so) +{ + const struct protosw *pr; + const struct domain *dom; + + if ((pr = so->so_proto) == NULL) + return AF_UNSPEC; + if ((dom = pr->pr_domain) == NULL) + return AF_UNSPEC; + return dom->dom_family; +} + +int sobind(struct socket *so, struct mbuf *nam, struct lwp *l) { int error; @@ -602,6 +704,7 @@ sofree(struct socket *so) /* Remove acccept filter if one is present. 
*/ if (so->so_accf != NULL) (void)accept_filt_clear(so); + /* kauth_cred_free(so->so_cred); */ sounlock(so); if (refs == 0) /* XXX */ soput(so); @@ -772,6 +875,7 @@ sodisconnect(struct socket *so) error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, NULL, NULL, NULL, NULL); } + sodopendfree(); return (error); } @@ -801,8 +905,10 @@ sosend(struct socket *so, struct mbuf *a struct proc *p; long space, len, resid, clen, mlen; int error, s, dontroute, atomic; + short wakeup_state = 0; p = l->l_proc; + sodopendfree(); clen = 0; /* @@ -874,11 +980,17 @@ sosend(struct socket *so, struct mbuf *a goto release; } sbunlock(&so->so_snd); + if (wakeup_state & SS_RESTARTSYS) { + error = ERESTART; + goto out; + } error = sbwait(&so->so_snd); if (error) goto out; + wakeup_state = so->so_state; goto restart; } + wakeup_state = 0; mp = &top; space -= clen; do { @@ -1054,6 +1166,7 @@ soreceive(struct socket *so, struct mbuf struct mbuf *nextrecord; int mbuf_removed = 0; const struct domain *dom; + short wakeup_state = 0; pr = so->so_proto; atomic = pr->pr_flags & PR_ATOMIC; @@ -1071,6 +1184,9 @@ soreceive(struct socket *so, struct mbuf else flags = 0; + if ((flags & MSG_DONTWAIT) == 0) + sodopendfree(); + if (flags & MSG_OOB) { m = m_get(M_WAIT, MT_DATA); solock(so); @@ -1165,12 +1281,16 @@ soreceive(struct socket *so, struct mbuf SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); sbunlock(&so->so_rcv); - error = sbwait(&so->so_rcv); + if (wakeup_state & SS_RESTARTSYS) + error = ERESTART; + else + error = sbwait(&so->so_rcv); if (error != 0) { sounlock(so); splx(s); return error; } + wakeup_state = so->so_state; goto restart; } dontblock: @@ -1309,6 +1429,7 @@ soreceive(struct socket *so, struct mbuf panic("receive 3"); #endif so->so_state &= ~SS_RCVATMARK; + wakeup_state = 0; len = uio->uio_resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; @@ -1441,7 +1562,10 @@ soreceive(struct socket *so, 
struct mbuf NULL, (struct mbuf *)(long)flags, NULL, l); SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); - error = sbwait(&so->so_rcv); + if (wakeup_state & SS_RESTARTSYS) + error = ERESTART; + else + error = sbwait(&so->so_rcv); if (error != 0) { sbunlock(&so->so_rcv); sounlock(so); @@ -1450,6 +1574,7 @@ soreceive(struct socket *so, struct mbuf } if ((m = so->so_rcv.sb_mb) != NULL) nextrecord = m->m_nextpkt; + wakeup_state = so->so_state; } } @@ -1516,18 +1641,23 @@ soshutdown(struct socket *so, int how) return error; } -int -sodrain(struct socket *so) +void +sorestart(struct socket *so) { - int error; - + /* + * An application has called close() on an fd on which another + * of its threads has called a socket system call. + * Mark this and wake everyone up, and code that would block again + * instead returns ERESTART. + * On system call re-entry the fd is validated and EBADF returned. + * Any other fd will block again on the 2nd syscall. 
+ */ solock(so); - so->so_state |= SS_ISDRAINING; + so->so_state |= SS_RESTARTSYS; cv_broadcast(&so->so_cv); - error = soshutdown(so, SHUT_RDWR); + cv_broadcast(&so->so_snd.sb_cv); + cv_broadcast(&so->so_rcv.sb_cv); sounlock(so); - - return error; } void @@ -1565,11 +1695,11 @@ sorflush(struct socket *so) static int sosetopt1(struct socket *so, const struct sockopt *sopt) { - int error, optval; + int error = EINVAL, optval, opt; struct linger l; struct timeval tv; - switch (sopt->sopt_name) { + switch ((opt = sopt->sopt_name)) { case SO_ACCEPTFILTER: error = accept_filt_setopt(so, sopt); @@ -1602,14 +1732,17 @@ sosetopt1(struct socket *so, const struc case SO_REUSEPORT: case SO_OOBINLINE: case SO_TIMESTAMP: +#ifdef SO_OTIMESTAMP + case SO_OTIMESTAMP: +#endif error = sockopt_getint(sopt, &optval); solock(so); if (error) break; if (optval) - so->so_options |= sopt->sopt_name; + so->so_options |= opt; else - so->so_options &= ~sopt->sopt_name; + so->so_options &= ~opt; break; case SO_SNDBUF: @@ -1630,7 +1763,7 @@ sosetopt1(struct socket *so, const struc break; } - switch (sopt->sopt_name) { + switch (opt) { case SO_SNDBUF: if (sbreserve(&so->so_snd, (u_long)optval, so) == 0) { error = ENOBUFS; @@ -1667,9 +1800,26 @@ sosetopt1(struct socket *so, const struc } break; +#ifdef COMPAT_50 + case SO_OSNDTIMEO: + case SO_ORCVTIMEO: { + struct timeval50 otv; + error = sockopt_get(sopt, &otv, sizeof(otv)); + if (error) { + solock(so); + break; + } + timeval50_to_timeval(&otv, &tv); + opt = opt == SO_OSNDTIMEO ? 
SO_SNDTIMEO : SO_RCVTIMEO; + error = 0; + /*FALLTHROUGH*/ + } +#endif /* COMPAT_50 */ + case SO_SNDTIMEO: case SO_RCVTIMEO: - error = sockopt_get(sopt, &tv, sizeof(tv)); + if (error) + error = sockopt_get(sopt, &tv, sizeof(tv)); solock(so); if (error) break; @@ -1683,7 +1833,7 @@ sosetopt1(struct socket *so, const struc if (optval == 0 && tv.tv_usec != 0) optval = 1; - switch (sopt->sopt_name) { + switch (opt) { case SO_SNDTIMEO: so->so_snd.sb_timeo = optval; break; @@ -1756,11 +1906,11 @@ so_setsockopt(struct lwp *l, struct sock static int sogetopt1(struct socket *so, struct sockopt *sopt) { - int error, optval; + int error, optval, opt; struct linger l; struct timeval tv; - switch (sopt->sopt_name) { + switch ((opt = sopt->sopt_name)) { case SO_ACCEPTFILTER: error = accept_filt_getopt(so, sopt); @@ -1782,8 +1932,10 @@ sogetopt1(struct socket *so, struct sock case SO_BROADCAST: case SO_OOBINLINE: case SO_TIMESTAMP: - error = sockopt_setint(sopt, - (so->so_options & sopt->sopt_name) ? 1 : 0); +#ifdef SO_OTIMESTAMP + case SO_OTIMESTAMP: +#endif + error = sockopt_setint(sopt, (so->so_options & opt) ? 1 : 0); break; case SO_TYPE: @@ -1811,9 +1963,25 @@ sogetopt1(struct socket *so, struct sock error = sockopt_setint(sopt, so->so_rcv.sb_lowat); break; +#ifdef COMPAT_50 + case SO_OSNDTIMEO: + case SO_ORCVTIMEO: { + struct timeval50 otv; + + optval = (opt == SO_OSNDTIMEO ? + so->so_snd.sb_timeo : so->so_rcv.sb_timeo); + + otv.tv_sec = optval / hz; + otv.tv_usec = (optval % hz) * tick; + + error = sockopt_set(sopt, &otv, sizeof(otv)); + break; + } +#endif /* COMPAT_50 */ + case SO_SNDTIMEO: case SO_RCVTIMEO: - optval = (sopt->sopt_name == SO_SNDTIMEO ? + optval = (opt == SO_SNDTIMEO ? 
so->so_snd.sb_timeo : so->so_rcv.sb_timeo); tv.tv_sec = optval / hz; @@ -2254,16 +2422,18 @@ sysctl_kern_somaxkva(SYSCTLFN_ARGS) return (error); } -SYSCTL_SETUP(sysctl_kern_somaxkva_setup, "sysctl kern.somaxkva setup") +static void +sysctl_kern_somaxkva_setup(void) { - sysctl_createv(clog, 0, NULL, NULL, + KASSERT(socket_sysctllog == NULL); + sysctl_createv(&socket_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "kern", NULL, NULL, 0, NULL, 0, CTL_KERN, CTL_EOL); - sysctl_createv(clog, 0, NULL, NULL, + sysctl_createv(&socket_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "somaxkva", SYSCTL_DESCR("Maximum amount of kernel memory to be "