Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files. =================================================================== RCS file: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v rcsdiff: /ftp/cvs/cvsroot/src/sys/kern/uipc_socket.c,v: warning: Unknown phrases like `commitid ...;' are present. retrieving revision 1.165 retrieving revision 1.205.2.1 diff -u -p -r1.165 -r1.205.2.1 --- src/sys/kern/uipc_socket.c 2008/05/24 18:43:02 1.165 +++ src/sys/kern/uipc_socket.c 2012/04/17 00:08:30 1.205.2.1 @@ -1,11 +1,11 @@ -/* $NetBSD: uipc_socket.c,v 1.165 2008/05/24 18:43:02 christos Exp $ */ +/* $NetBSD: uipc_socket.c,v 1.205.2.1 2012/04/17 00:08:30 yamt Exp $ */ /*- - * Copyright (c) 2002, 2007, 2008 The NetBSD Foundation, Inc. + * Copyright (c) 2002, 2007, 2008, 2009 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation - * by Jason R. Thorpe of Wasabi Systems, Inc. + * by Jason R. Thorpe of Wasabi Systems, Inc, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -63,19 +63,21 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.165 2008/05/24 18:43:02 christos Exp $"); +__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.205.2.1 2012/04/17 00:08:30 yamt Exp $"); +#include "opt_compat_netbsd.h" #include "opt_sock_counters.h" #include "opt_sosend_loan.h" #include "opt_mbuftrace.h" #include "opt_somaxkva.h" +#include "opt_multiprocessor.h" /* XXX */ #include #include #include #include #include -#include +#include #include #include #include @@ -84,13 +86,22 @@ __KERNEL_RCSID(0, "$NetBSD: uipc_socket. 
#include #include #include +#include #include #include #include #include #include +#include -#include +#ifdef COMPAT_50 +#include +#include +#endif + +#include +#include +#include MALLOC_DEFINE(M_SOOPTS, "soopts", "socket options"); MALLOC_DEFINE(M_SONAME, "soname", "socket name"); @@ -125,16 +136,14 @@ EVCNT_ATTACH_STATIC(sosend_kvalimit); #endif /* SOSEND_COUNTERS */ -static struct callback_entry sokva_reclaimerentry; - -#ifdef SOSEND_NO_LOAN +#if defined(SOSEND_NO_LOAN) || defined(MULTIPROCESSOR) int sock_loan_thresh = -1; #else int sock_loan_thresh = 4096; #endif static kmutex_t so_pendfree_lock; -static struct mbuf *so_pendfree; +static struct mbuf *so_pendfree = NULL; #ifndef SOMAXKVA #define SOMAXKVA (16 * 1024 * 1024) @@ -143,10 +152,16 @@ int somaxkva = SOMAXKVA; static int socurkva; static kcondvar_t socurkva_cv; +static kauth_listener_t socket_listener; + #define SOCK_LOAN_CHUNK 65536 -static size_t sodopendfree(void); -static size_t sodopendfreel(void); +static void sopendfree_thread(void *); +static kcondvar_t pendfree_thread_cv; +static lwp_t *sopendfree_lwp; + +static void sysctl_kern_somaxkva_setup(void); +static struct sysctllog *socket_sysctllog; static vsize_t sokvareserve(struct socket *so, vsize_t len) @@ -155,21 +170,6 @@ sokvareserve(struct socket *so, vsize_t mutex_enter(&so_pendfree_lock); while (socurkva + len > somaxkva) { - size_t freed; - - /* - * try to do pendfree. - */ - - freed = sodopendfreel(); - - /* - * if some kva was freed, try again. - */ - - if (freed) - continue; - SOSEND_COUNTER_INCR(&sosend_kvalimit); error = cv_wait_sig(&socurkva_cv, &so_pendfree_lock); if (error) { @@ -197,7 +197,7 @@ sokvaunreserve(vsize_t len) */ vaddr_t -sokvaalloc(vsize_t len, struct socket *so) +sokvaalloc(vaddr_t sva, vsize_t len, struct socket *so) { vaddr_t lva; @@ -212,7 +212,8 @@ sokvaalloc(vsize_t len, struct socket *s * allocate kva. 
*/ - lva = uvm_km_alloc(kernel_map, len, 0, UVM_KMF_VAONLY | UVM_KMF_WAITVA); + lva = uvm_km_alloc(kernel_map, len, atop(sva) & uvmexp.colormask, + UVM_KMF_COLORMATCH | UVM_KMF_VAONLY | UVM_KMF_WAITVA); if (lva == 0) { sokvaunreserve(len); return (0); @@ -262,56 +263,45 @@ sodoloanfree(struct vm_page **pgs, void sokvafree(sva, len); } -static size_t -sodopendfree(void) -{ - size_t rv; - - if (__predict_true(so_pendfree == NULL)) - return 0; - - mutex_enter(&so_pendfree_lock); - rv = sodopendfreel(); - mutex_exit(&so_pendfree_lock); - - return rv; -} - /* - * sodopendfreel: free mbufs on "pendfree" list. + * sopendfree_thread: free mbufs on "pendfree" list. * unlock and relock so_pendfree_lock when freeing mbufs. - * - * => called with so_pendfree_lock held. */ -static size_t -sodopendfreel(void) +static void +sopendfree_thread(void *v) { struct mbuf *m, *next; - size_t rv = 0; - - KASSERT(mutex_owned(&so_pendfree_lock)); + size_t rv; - while (so_pendfree != NULL) { - m = so_pendfree; - so_pendfree = NULL; - mutex_exit(&so_pendfree_lock); + mutex_enter(&so_pendfree_lock); - for (; m != NULL; m = next) { - next = m->m_next; - KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) == 0); - KASSERT(m->m_ext.ext_refcnt == 0); + for (;;) { + rv = 0; + while (so_pendfree != NULL) { + m = so_pendfree; + so_pendfree = NULL; + mutex_exit(&so_pendfree_lock); + + for (; m != NULL; m = next) { + next = m->m_next; + KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) == 0); + KASSERT(m->m_ext.ext_refcnt == 0); + + rv += m->m_ext.ext_size; + sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf, + m->m_ext.ext_size); + pool_cache_put(mb_cache, m); + } - rv += m->m_ext.ext_size; - sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf, - m->m_ext.ext_size); - pool_cache_put(mb_cache, m); + mutex_enter(&so_pendfree_lock); } - - mutex_enter(&so_pendfree_lock); + if (rv) + cv_broadcast(&socurkva_cv); + cv_wait(&pendfree_thread_cv, &so_pendfree_lock); } - - return (rv); + panic("sopendfree_thread"); + /* NOTREACHED 
*/ } void @@ -330,7 +320,7 @@ soloanfree(struct mbuf *m, void *buf, si mutex_enter(&so_pendfree_lock); m->m_next = so_pendfree; so_pendfree = m; - cv_broadcast(&socurkva_cv); + cv_signal(&pendfree_thread_cv); mutex_exit(&so_pendfree_lock); } @@ -360,7 +350,7 @@ sosend_loan(struct socket *so, struct ui KASSERT(npgs <= M_EXT_MAXPAGES); - lva = sokvaalloc(len, so); + lva = sokvaalloc(sva, len, so); if (lva == 0) return 0; @@ -373,7 +363,7 @@ sosend_loan(struct socket *so, struct ui for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE) pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]), - VM_PROT_READ); + VM_PROT_READ, 0); pmap_update(pmap_kernel()); lva += (vaddr_t) iov->iov_base & PAGE_MASK; @@ -393,20 +383,6 @@ sosend_loan(struct socket *so, struct ui return (space); } -static int -sokva_reclaim_callback(struct callback_entry *ce, void *obj, void *arg) -{ - - KASSERT(ce == &sokva_reclaimerentry); - KASSERT(obj == NULL); - - sodopendfree(); - if (!vm_map_starved_p(kernel_map)) { - return CALLBACK_CHAIN_ABORT; - } - return CALLBACK_CHAIN_CONTINUE; -} - struct mbuf * getsombuf(struct socket *so, int type) { @@ -417,31 +393,89 @@ getsombuf(struct socket *so, int type) return m; } -struct mbuf * -m_intopt(struct socket *so, int val) +static int +socket_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, + void *arg0, void *arg1, void *arg2, void *arg3) { - struct mbuf *m; + int result; + enum kauth_network_req req; - m = getsombuf(so, MT_SOOPTS); - m->m_len = sizeof(int); - *mtod(m, int *) = val; - return m; + result = KAUTH_RESULT_DEFER; + req = (enum kauth_network_req)arg0; + + if ((action != KAUTH_NETWORK_SOCKET) && + (action != KAUTH_NETWORK_BIND)) + return result; + + switch (req) { + case KAUTH_REQ_NETWORK_BIND_PORT: + result = KAUTH_RESULT_ALLOW; + break; + + case KAUTH_REQ_NETWORK_SOCKET_DROP: { + /* Normal users can only drop their own connections. 
*/ + struct socket *so = (struct socket *)arg1; + + if (proc_uidmatch(cred, so->so_cred)) + result = KAUTH_RESULT_ALLOW; + + break; + } + + case KAUTH_REQ_NETWORK_SOCKET_OPEN: + /* We allow "raw" routing/bluetooth sockets to anyone. */ + if ((u_long)arg1 == PF_ROUTE || (u_long)arg1 == PF_OROUTE + || (u_long)arg1 == PF_BLUETOOTH) { + result = KAUTH_RESULT_ALLOW; + } else { + /* Privileged, let secmodel handle this. */ + if ((u_long)arg2 == SOCK_RAW) + break; + } + + result = KAUTH_RESULT_ALLOW; + + break; + + case KAUTH_REQ_NETWORK_SOCKET_CANSEE: + result = KAUTH_RESULT_ALLOW; + + break; + + default: + break; + } + + return result; } void soinit(void) { + sysctl_kern_somaxkva_setup(); + mutex_init(&so_pendfree_lock, MUTEX_DEFAULT, IPL_VM); softnet_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); cv_init(&socurkva_cv, "sokva"); + cv_init(&pendfree_thread_cv, "sopendfr"); + soinit2(); /* Set the initial adjusted socket buffer size. */ if (sb_max_set(sb_max)) panic("bad initial sb_max value: %lu", sb_max); - callback_register(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback, - &sokva_reclaimerentry, NULL, sokva_reclaim_callback); + socket_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK, + socket_listener_cb, NULL); +} + +void +soinit1(void) +{ + int error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, + sopendfree_thread, NULL, &sopendfree_lwp, "sopendfree"); + if (error) + panic("soinit1 %d", error); } /* @@ -498,6 +532,7 @@ socreate(int dom, struct socket **aso, i #endif uid = kauth_cred_geteuid(l->l_cred); so->so_uidinfo = uid_find(uid); + so->so_cpid = l->l_proc->p_pid; if (lockso != NULL) { /* Caller wants us to share a lock. 
*/ lock = lockso->so_lock; @@ -515,6 +550,7 @@ socreate(int dom, struct socket **aso, i sofree(so); return error; } + so->so_cred = kauth_cred_dup(l->l_cred); sounlock(so); *aso = so; return 0; @@ -530,10 +566,14 @@ fsocreate(int domain, struct socket **so struct socket *so; struct file *fp; int fd, error; + int flags = type & SOCK_FLAGS_MASK; + type &= ~SOCK_FLAGS_MASK; if ((error = fd_allocfile(&fp, &fd)) != 0) - return (error); - fp->f_flag = FREAD|FWRITE; + return error; + fd_set_exclose(l, fd, (flags & SOCK_CLOEXEC) != 0); + fp->f_flag = FREAD|FWRITE|((flags & SOCK_NONBLOCK) ? FNONBLOCK : 0)| + ((flags & SOCK_NOSIGPIPE) ? FNOSIGPIPE : 0); fp->f_type = DTYPE_SOCKET; fp->f_ops = &socketops; error = socreate(domain, &so, type, protocol, l, NULL); @@ -550,6 +590,19 @@ fsocreate(int domain, struct socket **so } int +sofamily(const struct socket *so) +{ + const struct protosw *pr; + const struct domain *dom; + + if ((pr = so->so_proto) == NULL) + return AF_UNSPEC; + if ((dom = pr->pr_domain) == NULL) + return AF_UNSPEC; + return dom->dom_family; +} + +int sobind(struct socket *so, struct mbuf *nam, struct lwp *l) { int error; @@ -569,7 +622,7 @@ solisten(struct socket *so, int backlog, if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) != 0) { sounlock(so); - return (EOPNOTSUPP); + return (EINVAL); } error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL, l); @@ -620,6 +673,9 @@ sofree(struct socket *so) KASSERT(!cv_has_waiters(&so->so_snd.sb_cv)); sorflush(so); refs = so->so_aborting; /* XXX */ + /* Remove acccept filter if one is present. 
*/ + if (so->so_accf != NULL) + (void)accept_filt_clear(so); sounlock(so); if (refs == 0) /* XXX */ soput(so); @@ -640,7 +696,7 @@ soclose(struct socket *so) error = 0; solock(so); if (so->so_options & SO_ACCEPTCONN) { - do { + for (;;) { if ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) { KASSERT(solocked2(so, so2)); (void) soqremque(so2, 0); @@ -657,7 +713,8 @@ soclose(struct socket *so) solock(so); continue; } - } while (0); + break; + } } if (so->so_pcb == 0) goto discard; @@ -668,10 +725,11 @@ soclose(struct socket *so) goto drop; } if (so->so_options & SO_LINGER) { - if ((so->so_state & SS_ISDISCONNECTING) && so->so_nbio) + if ((so->so_state & (SS_ISDISCONNECTING|SS_NBIO)) == + (SS_ISDISCONNECTING|SS_NBIO)) goto drop; while (so->so_state & SS_ISCONNECTED) { - error = sowait(so, so->so_linger * hz); + error = sowait(so, true, so->so_linger * hz); if (error) break; } @@ -687,6 +745,7 @@ soclose(struct socket *so) discard: if (so->so_state & SS_NOFDREF) panic("soclose: NOFDREF"); + kauth_cred_free(so->so_cred); so->so_state |= SS_NOFDREF; sofree(so); return (error); @@ -789,7 +848,6 @@ sodisconnect(struct socket *so) error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, NULL, NULL, NULL, NULL); } - sodopendfree(); return (error); } @@ -819,9 +877,9 @@ sosend(struct socket *so, struct mbuf *a struct proc *p; long space, len, resid, clen, mlen; int error, s, dontroute, atomic; + short wakeup_state = 0; p = l->l_proc; - sodopendfree(); clen = 0; /* @@ -888,16 +946,22 @@ sosend(struct socket *so, struct mbuf *a } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { - if (so->so_nbio) { + if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) { error = EWOULDBLOCK; goto release; } sbunlock(&so->so_snd); + if (wakeup_state & SS_RESTARTSYS) { + error = ERESTART; + goto out; + } error = sbwait(&so->so_snd); if (error) goto out; + wakeup_state = so->so_state; goto restart; } + wakeup_state = 0; mp = &top; space -= clen; do { @@ -932,7 +996,7 @@ 
sosend(struct socket *so, struct mbuf *a } if (resid >= MINCLSIZE && space >= MCLBYTES) { SOSEND_COUNTER_INCR(&sosend_copy_big); - m_clget(m, M_WAIT); + m_clget(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) goto nopages; mlen = MCLBYTES; @@ -1073,6 +1137,7 @@ soreceive(struct socket *so, struct mbuf struct mbuf *nextrecord; int mbuf_removed = 0; const struct domain *dom; + short wakeup_state = 0; pr = so->so_proto; atomic = pr->pr_flags & PR_ATOMIC; @@ -1090,9 +1155,6 @@ soreceive(struct socket *so, struct mbuf else flags = 0; - if ((flags & MSG_DONTWAIT) == 0) - sodopendfree(); - if (flags & MSG_OOB) { m = m_get(M_WAIT, MT_DATA); solock(so); @@ -1180,19 +1242,24 @@ soreceive(struct socket *so, struct mbuf } if (uio->uio_resid == 0) goto release; - if (so->so_nbio || (flags & MSG_DONTWAIT)) { + if ((so->so_state & SS_NBIO) || + (flags & (MSG_DONTWAIT|MSG_NBIO))) { error = EWOULDBLOCK; goto release; } SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); sbunlock(&so->so_rcv); - error = sbwait(&so->so_rcv); + if (wakeup_state & SS_RESTARTSYS) + error = ERESTART; + else + error = sbwait(&so->so_rcv); if (error != 0) { sounlock(so); splx(s); return error; } + wakeup_state = so->so_state; goto restart; } dontblock: @@ -1281,7 +1348,9 @@ soreceive(struct socket *so, struct mbuf type == SCM_RIGHTS) { sounlock(so); splx(s); - error = (*dom->dom_externalize)(cm, l); + error = (*dom->dom_externalize)(cm, l, + (flags & MSG_CMSG_CLOEXEC) ? 
+ O_CLOEXEC : 0); s = splsoftnet(); solock(so); } @@ -1331,6 +1400,7 @@ soreceive(struct socket *so, struct mbuf panic("receive 3"); #endif so->so_state &= ~SS_RCVATMARK; + wakeup_state = 0; len = uio->uio_resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; @@ -1463,7 +1533,10 @@ soreceive(struct socket *so, struct mbuf NULL, (struct mbuf *)(long)flags, NULL, l); SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); - error = sbwait(&so->so_rcv); + if (wakeup_state & SS_RESTARTSYS) + error = ERESTART; + else + error = sbwait(&so->so_rcv); if (error != 0) { sbunlock(&so->so_rcv); sounlock(so); @@ -1472,6 +1545,7 @@ soreceive(struct socket *so, struct mbuf } if ((m = so->so_rcv.sb_mb) != NULL) nextrecord = m->m_nextpkt; + wakeup_state = so->so_state; } } @@ -1539,6 +1613,25 @@ soshutdown(struct socket *so, int how) } void +sorestart(struct socket *so) +{ + /* + * An application has called close() on an fd on which another + * of its threads has called a socket system call. + * Mark this and wake everyone up, and code that would block again + * instead returns ERESTART. + * On system call re-entry the fd is validated and EBADF returned. + * Any other fd will block again on the 2nd syscall. 
+ */ + solock(so); + so->so_state |= SS_RESTARTSYS; + cv_broadcast(&so->so_cv); + cv_broadcast(&so->so_snd.sb_cv); + cv_broadcast(&so->so_rcv.sb_cv); + sounlock(so); +} + +void sorflush(struct socket *so) { struct sockbuf *sb, asb; @@ -1567,30 +1660,40 @@ sorflush(struct socket *so) sbrelease(&asb, so); } +/* + * internal set SOL_SOCKET options + */ static int -sosetopt1(struct socket *so, int level, int optname, struct mbuf *m) +sosetopt1(struct socket *so, const struct sockopt *sopt) { - int optval, val; - struct linger *l; - struct sockbuf *sb; - struct timeval *tv; + int error = EINVAL, optval, opt; + struct linger l; + struct timeval tv; - switch (optname) { + switch ((opt = sopt->sopt_name)) { - case SO_LINGER: - if (m == NULL || m->m_len != sizeof(struct linger)) - return EINVAL; - l = mtod(m, struct linger *); - if (l->l_linger < 0 || l->l_linger > USHRT_MAX || - l->l_linger > (INT_MAX / hz)) - return EDOM; - so->so_linger = l->l_linger; - if (l->l_onoff) - so->so_options |= SO_LINGER; - else - so->so_options &= ~SO_LINGER; + case SO_ACCEPTFILTER: + error = accept_filt_setopt(so, sopt); + KASSERT(solocked(so)); break; + case SO_LINGER: + error = sockopt_get(sopt, &l, sizeof(l)); + solock(so); + if (error) + break; + if (l.l_linger < 0 || l.l_linger > USHRT_MAX || + l.l_linger > (INT_MAX / hz)) { + error = EDOM; + break; + } + so->so_linger = l.l_linger; + if (l.l_onoff) + so->so_options |= SO_LINGER; + else + so->so_options &= ~SO_LINGER; + break; + case SO_DEBUG: case SO_KEEPALIVE: case SO_DONTROUTE: @@ -1600,38 +1703,53 @@ sosetopt1(struct socket *so, int level, case SO_REUSEPORT: case SO_OOBINLINE: case SO_TIMESTAMP: - if (m == NULL || m->m_len < sizeof(int)) - return EINVAL; - if (*mtod(m, int *)) - so->so_options |= optname; + case SO_NOSIGPIPE: +#ifdef SO_OTIMESTAMP + case SO_OTIMESTAMP: +#endif + error = sockopt_getint(sopt, &optval); + solock(so); + if (error) + break; + if (optval) + so->so_options |= opt; else - so->so_options &= ~optname; + 
so->so_options &= ~opt; break; case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: - if (m == NULL || m->m_len < sizeof(int)) - return EINVAL; + error = sockopt_getint(sopt, &optval); + solock(so); + if (error) + break; /* * Values < 1 make no sense for any of these * options, so disallow them. */ - optval = *mtod(m, int *); - if (optval < 1) - return EINVAL; - - switch (optname) { + if (optval < 1) { + error = EINVAL; + break; + } + switch (opt) { case SO_SNDBUF: + if (sbreserve(&so->so_snd, (u_long)optval, so) == 0) { + error = ENOBUFS; + break; + } + so->so_snd.sb_flags &= ~SB_AUTOSIZE; + break; + case SO_RCVBUF: - sb = (optname == SO_SNDBUF) ? - &so->so_snd : &so->so_rcv; - if (sbreserve(sb, (u_long)optval, so) == 0) - return ENOBUFS; - sb->sb_flags &= ~SB_AUTOSIZE; + if (sbreserve(&so->so_rcv, (u_long)optval, so) == 0) { + error = ENOBUFS; + break; + } + so->so_rcv.sb_flags &= ~SB_AUTOSIZE; break; /* @@ -1639,163 +1757,406 @@ sosetopt1(struct socket *so, int level, * the high-water. */ case SO_SNDLOWAT: - so->so_snd.sb_lowat = - (optval > so->so_snd.sb_hiwat) ? - so->so_snd.sb_hiwat : optval; + if (optval > so->so_snd.sb_hiwat) + optval = so->so_snd.sb_hiwat; + + so->so_snd.sb_lowat = optval; break; + case SO_RCVLOWAT: - so->so_rcv.sb_lowat = - (optval > so->so_rcv.sb_hiwat) ? - so->so_rcv.sb_hiwat : optval; + if (optval > so->so_rcv.sb_hiwat) + optval = so->so_rcv.sb_hiwat; + + so->so_rcv.sb_lowat = optval; break; } break; +#ifdef COMPAT_50 + case SO_OSNDTIMEO: + case SO_ORCVTIMEO: { + struct timeval50 otv; + error = sockopt_get(sopt, &otv, sizeof(otv)); + if (error) { + solock(so); + break; + } + timeval50_to_timeval(&otv, &tv); + opt = opt == SO_OSNDTIMEO ? 
SO_SNDTIMEO : SO_RCVTIMEO; + error = 0; + /*FALLTHROUGH*/ + } +#endif /* COMPAT_50 */ + case SO_SNDTIMEO: case SO_RCVTIMEO: - if (m == NULL || m->m_len < sizeof(*tv)) - return EINVAL; - tv = mtod(m, struct timeval *); - if (tv->tv_sec > (INT_MAX - tv->tv_usec / tick) / hz) - return EDOM; - val = tv->tv_sec * hz + tv->tv_usec / tick; - if (val == 0 && tv->tv_usec != 0) - val = 1; + if (error) + error = sockopt_get(sopt, &tv, sizeof(tv)); + solock(so); + if (error) + break; + + if (tv.tv_sec > (INT_MAX - tv.tv_usec / tick) / hz) { + error = EDOM; + break; + } - switch (optname) { + optval = tv.tv_sec * hz + tv.tv_usec / tick; + if (optval == 0 && tv.tv_usec != 0) + optval = 1; + switch (opt) { case SO_SNDTIMEO: - so->so_snd.sb_timeo = val; + so->so_snd.sb_timeo = optval; break; case SO_RCVTIMEO: - so->so_rcv.sb_timeo = val; + so->so_rcv.sb_timeo = optval; break; } break; default: - return ENOPROTOOPT; + solock(so); + error = ENOPROTOOPT; + break; } - return 0; + KASSERT(solocked(so)); + return error; } int -sosetopt(struct socket *so, int level, int optname, struct mbuf *m) +sosetopt(struct socket *so, struct sockopt *sopt) { int error, prerr; - solock(so); - if (level == SOL_SOCKET) - error = sosetopt1(so, level, optname, m); - else + if (sopt->sopt_level == SOL_SOCKET) { + error = sosetopt1(so, sopt); + KASSERT(solocked(so)); + } else { error = ENOPROTOOPT; + solock(so); + } if ((error == 0 || error == ENOPROTOOPT) && so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) { /* give the protocol stack a shot */ - prerr = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, level, - optname, &m); + prerr = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, sopt); if (prerr == 0) error = 0; else if (prerr != ENOPROTOOPT) error = prerr; - } else if (m != NULL) - (void)m_free(m); + } sounlock(so); return error; } +/* + * so_setsockopt() is a wrapper providing a sockopt structure for sosetopt() + */ int -sogetopt(struct socket *so, int level, int optname, struct mbuf **mp) 
+so_setsockopt(struct lwp *l, struct socket *so, int level, int name, + const void *val, size_t valsize) +{ + struct sockopt sopt; + int error; + + KASSERT(valsize == 0 || val != NULL); + + sockopt_init(&sopt, level, name, valsize); + sockopt_set(&sopt, val, valsize); + + error = sosetopt(so, &sopt); + + sockopt_destroy(&sopt); + + return error; +} + +/* + * internal get SOL_SOCKET options + */ +static int +sogetopt1(struct socket *so, struct sockopt *sopt) +{ + int error, optval, opt; + struct linger l; + struct timeval tv; + + switch ((opt = sopt->sopt_name)) { + + case SO_ACCEPTFILTER: + error = accept_filt_getopt(so, sopt); + break; + + case SO_LINGER: + l.l_onoff = (so->so_options & SO_LINGER) ? 1 : 0; + l.l_linger = so->so_linger; + + error = sockopt_set(sopt, &l, sizeof(l)); + break; + + case SO_USELOOPBACK: + case SO_DONTROUTE: + case SO_DEBUG: + case SO_KEEPALIVE: + case SO_REUSEADDR: + case SO_REUSEPORT: + case SO_BROADCAST: + case SO_OOBINLINE: + case SO_TIMESTAMP: + case SO_NOSIGPIPE: +#ifdef SO_OTIMESTAMP + case SO_OTIMESTAMP: +#endif + error = sockopt_setint(sopt, (so->so_options & opt) ? 1 : 0); + break; + + case SO_TYPE: + error = sockopt_setint(sopt, so->so_type); + break; + + case SO_ERROR: + error = sockopt_setint(sopt, so->so_error); + so->so_error = 0; + break; + + case SO_SNDBUF: + error = sockopt_setint(sopt, so->so_snd.sb_hiwat); + break; + + case SO_RCVBUF: + error = sockopt_setint(sopt, so->so_rcv.sb_hiwat); + break; + + case SO_SNDLOWAT: + error = sockopt_setint(sopt, so->so_snd.sb_lowat); + break; + + case SO_RCVLOWAT: + error = sockopt_setint(sopt, so->so_rcv.sb_lowat); + break; + +#ifdef COMPAT_50 + case SO_OSNDTIMEO: + case SO_ORCVTIMEO: { + struct timeval50 otv; + + optval = (opt == SO_OSNDTIMEO ? 
+ so->so_snd.sb_timeo : so->so_rcv.sb_timeo); + + otv.tv_sec = optval / hz; + otv.tv_usec = (optval % hz) * tick; + + error = sockopt_set(sopt, &otv, sizeof(otv)); + break; + } +#endif /* COMPAT_50 */ + + case SO_SNDTIMEO: + case SO_RCVTIMEO: + optval = (opt == SO_SNDTIMEO ? + so->so_snd.sb_timeo : so->so_rcv.sb_timeo); + + tv.tv_sec = optval / hz; + tv.tv_usec = (optval % hz) * tick; + + error = sockopt_set(sopt, &tv, sizeof(tv)); + break; + + case SO_OVERFLOWED: + error = sockopt_setint(sopt, so->so_rcv.sb_overflowed); + break; + + default: + error = ENOPROTOOPT; + break; + } + + return (error); +} + +int +sogetopt(struct socket *so, struct sockopt *sopt) { - struct mbuf *m; int error; solock(so); - if (level != SOL_SOCKET) { + if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto && so->so_proto->pr_ctloutput) { error = ((*so->so_proto->pr_ctloutput) - (PRCO_GETOPT, so, level, optname, mp)); + (PRCO_GETOPT, so, sopt)); } else error = (ENOPROTOOPT); } else { - m = m_get(M_WAIT, MT_SOOPTS); - m->m_len = sizeof(int); + error = sogetopt1(so, sopt); + } + sounlock(so); + return (error); +} - switch (optname) { +/* + * alloc sockopt data buffer buffer + * - will be released at destroy + */ +static int +sockopt_alloc(struct sockopt *sopt, size_t len, km_flag_t kmflag) +{ - case SO_LINGER: - m->m_len = sizeof(struct linger); - mtod(m, struct linger *)->l_onoff = - (so->so_options & SO_LINGER) ? 1 : 0; - mtod(m, struct linger *)->l_linger = so->so_linger; - break; + KASSERT(sopt->sopt_size == 0); - case SO_USELOOPBACK: - case SO_DONTROUTE: - case SO_DEBUG: - case SO_KEEPALIVE: - case SO_REUSEADDR: - case SO_REUSEPORT: - case SO_BROADCAST: - case SO_OOBINLINE: - case SO_TIMESTAMP: - *mtod(m, int *) = (so->so_options & optname) ? 
1 : 0; - break; + if (len > sizeof(sopt->sopt_buf)) { + sopt->sopt_data = kmem_zalloc(len, kmflag); + if (sopt->sopt_data == NULL) + return ENOMEM; + } else + sopt->sopt_data = sopt->sopt_buf; - case SO_TYPE: - *mtod(m, int *) = so->so_type; - break; + sopt->sopt_size = len; + return 0; +} - case SO_ERROR: - *mtod(m, int *) = so->so_error; - so->so_error = 0; - break; +/* + * initialise sockopt storage + * - MAY sleep during allocation + */ +void +sockopt_init(struct sockopt *sopt, int level, int name, size_t size) +{ - case SO_SNDBUF: - *mtod(m, int *) = so->so_snd.sb_hiwat; - break; + memset(sopt, 0, sizeof(*sopt)); - case SO_RCVBUF: - *mtod(m, int *) = so->so_rcv.sb_hiwat; - break; + sopt->sopt_level = level; + sopt->sopt_name = name; + (void)sockopt_alloc(sopt, size, KM_SLEEP); +} - case SO_SNDLOWAT: - *mtod(m, int *) = so->so_snd.sb_lowat; - break; +/* + * destroy sockopt storage + * - will release any held memory references + */ +void +sockopt_destroy(struct sockopt *sopt) +{ - case SO_RCVLOWAT: - *mtod(m, int *) = so->so_rcv.sb_lowat; - break; + if (sopt->sopt_data != sopt->sopt_buf) + kmem_free(sopt->sopt_data, sopt->sopt_size); - case SO_SNDTIMEO: - case SO_RCVTIMEO: - { - int val = (optname == SO_SNDTIMEO ? 
- so->so_snd.sb_timeo : so->so_rcv.sb_timeo); - - m->m_len = sizeof(struct timeval); - mtod(m, struct timeval *)->tv_sec = val / hz; - mtod(m, struct timeval *)->tv_usec = - (val % hz) * tick; - break; - } + memset(sopt, 0, sizeof(*sopt)); +} - case SO_OVERFLOWED: - *mtod(m, int *) = so->so_rcv.sb_overflowed; - break; +/* + * set sockopt value + * - value is copied into sockopt + * - memory is allocated when necessary, will not sleep + */ +int +sockopt_set(struct sockopt *sopt, const void *buf, size_t len) +{ + int error; - default: - sounlock(so); - (void)m_free(m); - return (ENOPROTOOPT); + if (sopt->sopt_size == 0) { + error = sockopt_alloc(sopt, len, KM_NOSLEEP); + if (error) + return error; + } + + KASSERT(sopt->sopt_size == len); + memcpy(sopt->sopt_data, buf, len); + return 0; +} + +/* + * common case of set sockopt integer value + */ +int +sockopt_setint(struct sockopt *sopt, int val) +{ + + return sockopt_set(sopt, &val, sizeof(int)); +} + +/* + * get sockopt value + * - correct size must be given + */ +int +sockopt_get(const struct sockopt *sopt, void *buf, size_t len) +{ + + if (sopt->sopt_size != len) + return EINVAL; + + memcpy(buf, sopt->sopt_data, len); + return 0; +} + +/* + * common case of get sockopt integer value + */ +int +sockopt_getint(const struct sockopt *sopt, int *valp) +{ + + return sockopt_get(sopt, valp, sizeof(int)); +} + +/* + * set sockopt value from mbuf + * - ONLY for legacy code + * - mbuf is released by sockopt + * - will not sleep + */ +int +sockopt_setmbuf(struct sockopt *sopt, struct mbuf *m) +{ + size_t len; + int error; + + len = m_length(m); + + if (sopt->sopt_size == 0) { + error = sockopt_alloc(sopt, len, KM_NOSLEEP); + if (error) + return error; + } + + KASSERT(sopt->sopt_size == len); + m_copydata(m, 0, len, sopt->sopt_data); + m_freem(m); + + return 0; +} + +/* + * get sockopt value into mbuf + * - ONLY for legacy code + * - mbuf to be released by the caller + * - will not sleep + */ +struct mbuf * 
+sockopt_getmbuf(const struct sockopt *sopt) +{ + struct mbuf *m; + + if (sopt->sopt_size > MCLBYTES) + return NULL; + + m = m_get(M_DONTWAIT, MT_SOOPTS); + if (m == NULL) + return NULL; + + if (sopt->sopt_size > MLEN) { + MCLGET(m, M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + m_free(m); + return NULL; } - *mp = m; - error = 0; } - sounlock(so); - return (error); + memcpy(mtod(m, void *), sopt->sopt_data, sopt->sopt_size); + m->m_len = sopt->sopt_size; + + return m; } void @@ -1803,7 +2164,7 @@ sohasoutofband(struct socket *so) { fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so); - selnotify(&so->so_rcv.sb_sel, POLLPRI | POLLRDBAND, 0); + selnotify(&so->so_rcv.sb_sel, POLLPRI | POLLRDBAND, NOTE_SUBMIT); } static void @@ -2034,16 +2395,18 @@ sysctl_kern_somaxkva(SYSCTLFN_ARGS) return (error); } -SYSCTL_SETUP(sysctl_kern_somaxkva_setup, "sysctl kern.somaxkva setup") +static void +sysctl_kern_somaxkva_setup(void) { - sysctl_createv(clog, 0, NULL, NULL, + KASSERT(socket_sysctllog == NULL); + sysctl_createv(&socket_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "kern", NULL, NULL, 0, NULL, 0, CTL_KERN, CTL_EOL); - sysctl_createv(clog, 0, NULL, NULL, + sysctl_createv(&socket_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "somaxkva", SYSCTL_DESCR("Maximum amount of kernel memory to be "