
Annotation of src/sys/kern/uipc_socket2.c, Revision 1.109.2.2

1.109.2.2! yamt        1: /*     $NetBSD: uipc_socket2.c,v 1.109.2.1 2012/04/17 00:08:31 yamt Exp $      */
1.91      ad          2:
                      3: /*-
                      4:  * Copyright (c) 2008 The NetBSD Foundation, Inc.
                      5:  * All rights reserved.
                      6:  *
                      7:  * Redistribution and use in source and binary forms, with or without
                      8:  * modification, are permitted provided that the following conditions
                      9:  * are met:
                     10:  * 1. Redistributions of source code must retain the above copyright
                     11:  *    notice, this list of conditions and the following disclaimer.
                     12:  * 2. Redistributions in binary form must reproduce the above copyright
                     13:  *    notice, this list of conditions and the following disclaimer in the
                     14:  *    documentation and/or other materials provided with the distribution.
                     15:  *
                     16:  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
                     17:  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
                     18:  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
                     19:  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
                     20:  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     21:  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     22:  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     23:  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     24:  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     25:  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     26:  * POSSIBILITY OF SUCH DAMAGE.
                     27:  */
1.9       cgd        28:
1.1       cgd        29: /*
1.7       mycroft    30:  * Copyright (c) 1982, 1986, 1988, 1990, 1993
                     31:  *     The Regents of the University of California.  All rights reserved.
1.1       cgd        32:  *
                     33:  * Redistribution and use in source and binary forms, with or without
                     34:  * modification, are permitted provided that the following conditions
                     35:  * are met:
                     36:  * 1. Redistributions of source code must retain the above copyright
                     37:  *    notice, this list of conditions and the following disclaimer.
                     38:  * 2. Redistributions in binary form must reproduce the above copyright
                     39:  *    notice, this list of conditions and the following disclaimer in the
                     40:  *    documentation and/or other materials provided with the distribution.
1.54      agc        41:  * 3. Neither the name of the University nor the names of its contributors
1.1       cgd        42:  *    may be used to endorse or promote products derived from this software
                     43:  *    without specific prior written permission.
                     44:  *
                     45:  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
                     46:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     47:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     48:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
                     49:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
                     50:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
                     51:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
                     52:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
                     53:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
                     54:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
                     55:  * SUCH DAMAGE.
                     56:  *
1.23      fvdl       57:  *     @(#)uipc_socket2.c      8.2 (Berkeley) 2/14/95
1.1       cgd        58:  */
1.42      lukem      59:
                     60: #include <sys/cdefs.h>
1.109.2.2! yamt       61: __KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.109.2.1 2012/04/17 00:08:31 yamt Exp $");
1.51      martin     62:
                     63: #include "opt_mbuftrace.h"
1.58      thorpej    64: #include "opt_sb_max.h"
1.1       cgd        65:
1.5       mycroft    66: #include <sys/param.h>
                     67: #include <sys/systm.h>
                     68: #include <sys/proc.h>
                     69: #include <sys/file.h>
                     70: #include <sys/buf.h>
                     71: #include <sys/mbuf.h>
                     72: #include <sys/protosw.h>
1.91      ad         73: #include <sys/domain.h>
1.55      christos   74: #include <sys/poll.h>
1.5       mycroft    75: #include <sys/socket.h>
                     76: #include <sys/socketvar.h>
1.11      christos   77: #include <sys/signalvar.h>
1.71      elad       78: #include <sys/kauth.h>
1.91      ad         79: #include <sys/pool.h>
1.98      pooka      80: #include <sys/uidinfo.h>
1.1       cgd        81:
                     82: /*
1.91      ad         83:  * Primitive routines for operating on sockets and socket buffers.
                     84:  *
                     85:  * Locking rules and assumptions:
                     86:  *
                     87:  * o socket::so_lock can change on the fly.  The low level routines used
                     88:  *   to lock sockets are aware of this.  When so_lock is acquired, the
                      89:  *   locking routine must check to see if so_lock still points to the
                     90:  *   lock that was acquired.  If so_lock has changed in the meantime, the
                      91:  *   now irrelevant lock that was acquired must be dropped and the lock
                     92:  *   operation retried.  Although not proven here, this is completely safe
                     93:  *   on a multiprocessor system, even with relaxed memory ordering, given
                     94:  *   the next two rules:
                     95:  *
                     96:  * o In order to mutate so_lock, the lock pointed to by the current value
                     97:  *   of so_lock must be held: i.e., the socket must be held locked by the
                     98:  *   changing thread.  The thread must issue membar_exit() to prevent
                     99:  *   memory accesses being reordered, and can set so_lock to the desired
                    100:  *   value.  If the lock pointed to by the new value of so_lock is not
                    101:  *   held by the changing thread, the socket must then be considered
                    102:  *   unlocked.
                    103:  *
                    104:  * o If so_lock is mutated, and the previous lock referred to by so_lock
                    105:  *   could still be visible to other threads in the system (e.g. via file
                    106:  *   descriptor or protocol-internal reference), then the old lock must
                    107:  *   remain valid until the socket and/or protocol control block has been
                    108:  *   torn down.
                    109:  *
                    110:  * o If a socket has a non-NULL so_head value (i.e. is in the process of
                    111:  *   connecting), then locking the socket must also lock the socket pointed
                    112:  *   to by so_head: their lock pointers must match.
                    113:  *
                    114:  * o If a socket has connections in progress (so_q, so_q0 not empty) then
                    115:  *   locking the socket must also lock the sockets attached to both queues.
                    116:  *   Again, their lock pointers must match.
                    117:  *
                     118:  * o Beyond the initial lock assignment in socreate(), assigning locks to
                    119:  *   sockets is the responsibility of the individual protocols / protocol
                    120:  *   domains.
1.1       cgd       121:  */
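/*
 * Illustrative sketch only (not part of the original file): the retry
 * idiom described by the first rule above, written out as a locking
 * routine might implement it.  The function name is hypothetical; the
 * real entry points are solock()/sounlock().
 */
#if 0
static void
example_lock_socket(struct socket *so)
{
	kmutex_t *lock;

	for (;;) {
		lock = so->so_lock;		/* sample the current pointer */
		mutex_enter(lock);
		if (__predict_true(lock == so->so_lock))
			break;			/* still the right lock: done */
		mutex_exit(lock);		/* so_lock changed: retry */
	}
}
#endif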
                    122:
1.94      ad        123: static pool_cache_t socket_cache;
1.1       cgd       124:
1.58      thorpej   125: u_long sb_max = SB_MAX;        /* maximum socket buffer size */
                    126: static u_long sb_max_adj;      /* adjusted sb_max */
                    127:
1.1       cgd       128: /*
                     129:  * Procedures to manipulate state flags of a socket
                    130:  * and do appropriate wakeups.  Normal sequence from the
                    131:  * active (originating) side is that soisconnecting() is
                    132:  * called during processing of connect() call,
                    133:  * resulting in an eventual call to soisconnected() if/when the
                    134:  * connection is established.  When the connection is torn down
                    135:  * soisdisconnecting() is called during processing of disconnect() call,
                    136:  * and soisdisconnected() is called when the connection to the peer
                    137:  * is totally severed.  The semantics of these routines are such that
                    138:  * connectionless protocols can call soisconnected() and soisdisconnected()
                    139:  * only, bypassing the in-progress calls when setting up a ``connection''
                    140:  * takes no time.
                    141:  *
                    142:  * From the passive side, a socket is created with
                    143:  * two queues of sockets: so_q0 for connections in progress
                    144:  * and so_q for connections already made and awaiting user acceptance.
                    145:  * As a protocol is preparing incoming connections, it creates a socket
                    146:  * structure queued on so_q0 by calling sonewconn().  When the connection
                    147:  * is established, soisconnected() is called, and transfers the
                    148:  * socket structure to so_q, making it available to accept().
1.66      perry     149:  *
1.1       cgd       150:  * If a socket is closed with sockets on either
                    151:  * so_q0 or so_q, these sockets are dropped.
                    152:  *
                    153:  * If higher level protocols are implemented in
                    154:  * the kernel, the wakeups done here will sometimes
                    155:  * cause software-interrupt process scheduling.
                    156:  */
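/*
 * Illustrative sketch only (hypothetical protocol code, not from this
 * file): the calling sequence described above, as seen from a
 * connection-oriented protocol; "so" is the protocol's socket.
 */
#if 0
	soisconnecting(so);	/* PRU_CONNECT: active open has started */
	/* ... handshake completes ... */
	soisconnected(so);	/* established; wakes up waiters */
	/* ... later, teardown begins ... */
	soisdisconnecting(so);	/* disconnect in progress */
	soisdisconnected(so);	/* connection to the peer fully severed */
#endif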
                    157:
1.7       mycroft   158: void
1.37      lukem     159: soisconnecting(struct socket *so)
1.1       cgd       160: {
                    161:
1.91      ad        162:        KASSERT(solocked(so));
                    163:
1.1       cgd       164:        so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
                    165:        so->so_state |= SS_ISCONNECTING;
                    166: }
                    167:
1.7       mycroft   168: void
1.37      lukem     169: soisconnected(struct socket *so)
1.1       cgd       170: {
1.37      lukem     171:        struct socket   *head;
1.1       cgd       172:
1.37      lukem     173:        head = so->so_head;
1.91      ad        174:
                    175:        KASSERT(solocked(so));
                    176:        KASSERT(head == NULL || solocked2(so, head));
                    177:
1.109.2.2! yamt      178:        so->so_state &= ~(SS_ISCONNECTING | SS_ISDISCONNECTING);
1.1       cgd       179:        so->so_state |= SS_ISCONNECTED;
1.97      tls       180:        if (head && so->so_onq == &head->so_q0) {
                    181:                if ((so->so_options & SO_ACCEPTFILTER) == 0) {
                    182:                        soqremque(so, 0);
                    183:                        soqinsque(head, so, 1);
                    184:                        sorwakeup(head);
                    185:                        cv_broadcast(&head->so_cv);
                    186:                } else {
                    187:                        so->so_upcall =
                    188:                            head->so_accf->so_accept_filter->accf_callback;
                    189:                        so->so_upcallarg = head->so_accf->so_accept_filter_arg;
                    190:                        so->so_rcv.sb_flags |= SB_UPCALL;
                    191:                        so->so_options &= ~SO_ACCEPTFILTER;
1.104     tls       192:                        (*so->so_upcall)(so, so->so_upcallarg,
                    193:                                         POLLIN|POLLRDNORM, M_DONTWAIT);
1.101     yamt      194:                }
1.1       cgd       195:        } else {
1.91      ad        196:                cv_broadcast(&so->so_cv);
1.1       cgd       197:                sorwakeup(so);
                    198:                sowwakeup(so);
                    199:        }
                    200: }
                    201:
1.7       mycroft   202: void
1.37      lukem     203: soisdisconnecting(struct socket *so)
1.1       cgd       204: {
                    205:
1.91      ad        206:        KASSERT(solocked(so));
                    207:
1.1       cgd       208:        so->so_state &= ~SS_ISCONNECTING;
                    209:        so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
1.91      ad        210:        cv_broadcast(&so->so_cv);
1.1       cgd       211:        sowwakeup(so);
                    212:        sorwakeup(so);
                    213: }
                    214:
1.7       mycroft   215: void
1.37      lukem     216: soisdisconnected(struct socket *so)
1.1       cgd       217: {
                    218:
1.91      ad        219:        KASSERT(solocked(so));
                    220:
1.1       cgd       221:        so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
1.27      mycroft   222:        so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
1.91      ad        223:        cv_broadcast(&so->so_cv);
1.1       cgd       224:        sowwakeup(so);
                    225:        sorwakeup(so);
                    226: }
                    227:
1.94      ad        228: void
                    229: soinit2(void)
                    230: {
                    231:
                    232:        socket_cache = pool_cache_init(sizeof(struct socket), 0, 0, 0,
                    233:            "socket", NULL, IPL_SOFTNET, NULL, NULL, NULL);
                    234: }
                    235:
1.1       cgd       236: /*
                    237:  * When an attempt at a new connection is noted on a socket
                    238:  * which accepts connections, sonewconn is called.  If the
                    239:  * connection is possible (subject to space constraints, etc.)
                     240:  * then we allocate a new structure, properly linked into the
                    241:  * data structure of the original socket, and return this.
                    242:  */
                    243: struct socket *
1.109.2.2! yamt      244: sonewconn(struct socket *head, bool conncomplete)
1.1       cgd       245: {
1.37      lukem     246:        struct socket   *so;
1.91      ad        247:        int             soqueue, error;
                    248:
                    249:        KASSERT(solocked(head));
1.1       cgd       250:
1.97      tls       251:        if ((head->so_options & SO_ACCEPTFILTER) != 0)
1.109.2.2! yamt      252:                conncomplete = false;
        !           253:        soqueue = conncomplete ? 1 : 0;
        !           254:
1.1       cgd       255:        if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2)
1.100     dyoung    256:                return NULL;
1.91      ad        257:        so = soget(false);
1.66      perry     258:        if (so == NULL)
1.100     dyoung    259:                return NULL;
1.91      ad        260:        mutex_obj_hold(head->so_lock);
                    261:        so->so_lock = head->so_lock;
1.1       cgd       262:        so->so_type = head->so_type;
                    263:        so->so_options = head->so_options &~ SO_ACCEPTCONN;
                    264:        so->so_linger = head->so_linger;
                    265:        so->so_state = head->so_state | SS_NOFDREF;
                    266:        so->so_proto = head->so_proto;
                    267:        so->so_timeo = head->so_timeo;
                    268:        so->so_pgid = head->so_pgid;
1.24      matt      269:        so->so_send = head->so_send;
                    270:        so->so_receive = head->so_receive;
1.67      christos  271:        so->so_uidinfo = head->so_uidinfo;
1.96      yamt      272:        so->so_cpid = head->so_cpid;
1.49      matt      273: #ifdef MBUFTRACE
                    274:        so->so_mowner = head->so_mowner;
                    275:        so->so_rcv.sb_mowner = head->so_rcv.sb_mowner;
                    276:        so->so_snd.sb_mowner = head->so_snd.sb_mowner;
                    277: #endif
1.103     christos  278:        if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) != 0)
                    279:                goto out;
1.83      tls       280:        so->so_snd.sb_lowat = head->so_snd.sb_lowat;
                    281:        so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
1.84      tls       282:        so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
                    283:        so->so_snd.sb_timeo = head->so_snd.sb_timeo;
1.107     christos  284:        so->so_rcv.sb_flags |= head->so_rcv.sb_flags & (SB_AUTOSIZE | SB_ASYNC);
                    285:        so->so_snd.sb_flags |= head->so_snd.sb_flags & (SB_AUTOSIZE | SB_ASYNC);
1.1       cgd       286:        soqinsque(head, so, soqueue);
1.91      ad        287:        error = (*so->so_proto->pr_usrreq)(so, PRU_ATTACH, NULL, NULL,
                    288:            NULL, NULL);
                    289:        KASSERT(solocked(so));
                    290:        if (error != 0) {
1.1       cgd       291:                (void) soqremque(so, soqueue);
1.103     christos  292: out:
1.99      ad        293:                /*
                     294:                 * Remove accept filter if one is present.
                    295:                 * XXX Is this really needed?
                    296:                 */
1.97      tls       297:                if (so->so_accf != NULL)
1.99      ad        298:                        (void)accept_filt_clear(so);
1.91      ad        299:                soput(so);
1.100     dyoung    300:                return NULL;
1.1       cgd       301:        }
1.109.2.2! yamt      302:        if (conncomplete) {
1.1       cgd       303:                sorwakeup(head);
1.91      ad        304:                cv_broadcast(&head->so_cv);
1.109.2.2! yamt      305:                so->so_state |= SS_ISCONNECTED;
1.1       cgd       306:        }
1.100     dyoung    307:        return so;
1.1       cgd       308: }
                    309:
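/*
 * Illustrative sketch only (hypothetical, not from this file): how a
 * protocol's input path might use sonewconn(), as described in the
 * comment above the function, when a connection request arrives on a
 * listening socket "head".
 */
#if 0
	/* Queue an embryonic connection on so_q0; it completes later. */
	so = sonewconn(head, false);
	if (so == NULL)
		return;		/* listen queue full or no resources: drop */
	/* ... when the handshake finishes, move it to so_q ... */
	soisconnected(so);
#endif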
1.91      ad        310: struct socket *
                    311: soget(bool waitok)
                    312: {
                    313:        struct socket *so;
                    314:
1.94      ad        315:        so = pool_cache_get(socket_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
1.91      ad        316:        if (__predict_false(so == NULL))
                    317:                return (NULL);
                    318:        memset(so, 0, sizeof(*so));
                    319:        TAILQ_INIT(&so->so_q0);
                    320:        TAILQ_INIT(&so->so_q);
                    321:        cv_init(&so->so_cv, "socket");
                    322:        cv_init(&so->so_rcv.sb_cv, "netio");
                    323:        cv_init(&so->so_snd.sb_cv, "netio");
                    324:        selinit(&so->so_rcv.sb_sel);
                    325:        selinit(&so->so_snd.sb_sel);
                    326:        so->so_rcv.sb_so = so;
                    327:        so->so_snd.sb_so = so;
                    328:        return so;
                    329: }
                    330:
                    331: void
                    332: soput(struct socket *so)
                    333: {
                    334:
                    335:        KASSERT(!cv_has_waiters(&so->so_cv));
                    336:        KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
                    337:        KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
                    338:        seldestroy(&so->so_rcv.sb_sel);
                    339:        seldestroy(&so->so_snd.sb_sel);
                    340:        mutex_obj_free(so->so_lock);
                    341:        cv_destroy(&so->so_cv);
                    342:        cv_destroy(&so->so_rcv.sb_cv);
                    343:        cv_destroy(&so->so_snd.sb_cv);
1.94      ad        344:        pool_cache_put(socket_cache, so);
1.91      ad        345: }
                    346:
1.7       mycroft   347: void
1.37      lukem     348: soqinsque(struct socket *head, struct socket *so, int q)
1.1       cgd       349: {
                    350:
1.91      ad        351:        KASSERT(solocked2(head, so));
                    352:
1.22      thorpej   353: #ifdef DIAGNOSTIC
                    354:        if (so->so_onq != NULL)
                    355:                panic("soqinsque");
                    356: #endif
                    357:
1.1       cgd       358:        so->so_head = head;
                    359:        if (q == 0) {
                    360:                head->so_q0len++;
1.22      thorpej   361:                so->so_onq = &head->so_q0;
1.1       cgd       362:        } else {
                    363:                head->so_qlen++;
1.22      thorpej   364:                so->so_onq = &head->so_q;
1.1       cgd       365:        }
1.22      thorpej   366:        TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
1.1       cgd       367: }
                    368:
1.7       mycroft   369: int
1.37      lukem     370: soqremque(struct socket *so, int q)
1.1       cgd       371: {
1.37      lukem     372:        struct socket   *head;
1.1       cgd       373:
1.37      lukem     374:        head = so->so_head;
1.91      ad        375:
                    376:        KASSERT(solocked(so));
1.22      thorpej   377:        if (q == 0) {
                    378:                if (so->so_onq != &head->so_q0)
1.17      thorpej   379:                        return (0);
1.1       cgd       380:                head->so_q0len--;
                    381:        } else {
1.22      thorpej   382:                if (so->so_onq != &head->so_q)
                    383:                        return (0);
1.1       cgd       384:                head->so_qlen--;
                    385:        }
1.91      ad        386:        KASSERT(solocked2(so, head));
1.22      thorpej   387:        TAILQ_REMOVE(so->so_onq, so, so_qe);
                    388:        so->so_onq = NULL;
                    389:        so->so_head = NULL;
1.1       cgd       390:        return (1);
                    391: }
                    392:
                    393: /*
                    394:  * Socantsendmore indicates that no more data will be sent on the
                    395:  * socket; it would normally be applied to a socket when the user
                    396:  * informs the system that no more data is to be sent, by the protocol
                    397:  * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
                    398:  * will be received, and will normally be applied to the socket by a
                    399:  * protocol when it detects that the peer will send no more data.
                    400:  * Data queued for reading in the socket may yet be read.
                    401:  */
                    402:
1.4       andrew    403: void
1.37      lukem     404: socantsendmore(struct socket *so)
1.1       cgd       405: {
                    406:
1.91      ad        407:        KASSERT(solocked(so));
                    408:
1.1       cgd       409:        so->so_state |= SS_CANTSENDMORE;
                    410:        sowwakeup(so);
                    411: }
                    412:
1.4       andrew    413: void
1.37      lukem     414: socantrcvmore(struct socket *so)
1.1       cgd       415: {
                    416:
1.91      ad        417:        KASSERT(solocked(so));
                    418:
1.1       cgd       419:        so->so_state |= SS_CANTRCVMORE;
                    420:        sorwakeup(so);
                    421: }
                    422:
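/*
 * Illustrative sketch only (hypothetical, not from this file): typical
 * callers of the two routines above.
 */
#if 0
	socantsendmore(so);	/* user shut down the sending side (PRU_SHUTDOWN) */
	socantrcvmore(so);	/* peer indicated it will send no more data */
#endif
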
                    423: /*
                    424:  * Wait for data to arrive at/drain from a socket buffer.
                    425:  */
1.7       mycroft   426: int
1.37      lukem     427: sbwait(struct sockbuf *sb)
1.1       cgd       428: {
1.91      ad        429:        struct socket *so;
                    430:        kmutex_t *lock;
                    431:        int error;
1.1       cgd       432:
1.91      ad        433:        so = sb->sb_so;
1.1       cgd       434:
1.91      ad        435:        KASSERT(solocked(so));
1.1       cgd       436:
1.91      ad        437:        sb->sb_flags |= SB_NOTIFY;
                    438:        lock = so->so_lock;
                    439:        if ((sb->sb_flags & SB_NOINTR) != 0)
                    440:                error = cv_timedwait(&sb->sb_cv, lock, sb->sb_timeo);
                    441:        else
                    442:                error = cv_timedwait_sig(&sb->sb_cv, lock, sb->sb_timeo);
                    443:        if (__predict_false(lock != so->so_lock))
                    444:                solockretry(so, lock);
                    445:        return error;
1.1       cgd       446: }
                    447:
                    448: /*
                    449:  * Wakeup processes waiting on a socket buffer.
                    450:  * Do asynchronous notification via SIGIO
1.39      manu      451:  * if the socket buffer has the SB_ASYNC flag set.
1.1       cgd       452:  */
1.7       mycroft   453: void
1.55      christos  454: sowakeup(struct socket *so, struct sockbuf *sb, int code)
1.1       cgd       455: {
1.90      rmind     456:        int band;
                    457:
1.91      ad        458:        KASSERT(solocked(so));
                    459:        KASSERT(sb->sb_so == so);
                    460:
1.90      rmind     461:        if (code == POLL_IN)
                    462:                band = POLLIN|POLLRDNORM;
                    463:        else
                    464:                band = POLLOUT|POLLWRNORM;
1.91      ad        465:        sb->sb_flags &= ~SB_NOTIFY;
                    466:        selnotify(&sb->sb_sel, band, NOTE_SUBMIT);
                    467:        cv_broadcast(&sb->sb_cv);
1.90      rmind     468:        if (sb->sb_flags & SB_ASYNC)
1.57      christos  469:                fownsignal(so->so_pgid, SIGIO, code, band, so);
1.24      matt      470:        if (sb->sb_flags & SB_UPCALL)
1.104     tls       471:                (*so->so_upcall)(so, so->so_upcallarg, band, M_DONTWAIT);
1.1       cgd       472: }
                    473:
                    474: /*
1.95      ad        475:  * Reset a socket's lock pointer.  Wake all threads waiting on the
                    476:  * socket's condition variables so that they can restart their waits
                    477:  * using the new lock.  The existing lock must be held.
                    478:  */
                    479: void
                    480: solockreset(struct socket *so, kmutex_t *lock)
                    481: {
                    482:
                    483:        KASSERT(solocked(so));
                    484:
                    485:        so->so_lock = lock;
                    486:        cv_broadcast(&so->so_snd.sb_cv);
                    487:        cv_broadcast(&so->so_rcv.sb_cv);
                    488:        cv_broadcast(&so->so_cv);
                    489: }
                    490:
                    491: /*
1.1       cgd       492:  * Socket buffer (struct sockbuf) utility routines.
                    493:  *
                    494:  * Each socket contains two socket buffers: one for sending data and
                    495:  * one for receiving data.  Each buffer contains a queue of mbufs,
                    496:  * information about the number of mbufs and amount of data in the
1.13      mycroft   497:  * queue, and other fields allowing poll() statements and notification
1.1       cgd       498:  * on data availability to be implemented.
                    499:  *
                    500:  * Data stored in a socket buffer is maintained as a list of records.
                    501:  * Each record is a list of mbufs chained together with the m_next
                    502:  * field.  Records are chained together with the m_nextpkt field. The upper
                    503:  * level routine soreceive() expects the following conventions to be
                    504:  * observed when placing information in the receive buffer:
                    505:  *
                    506:  * 1. If the protocol requires each message be preceded by the sender's
                    507:  *    name, then a record containing that name must be present before
                    508:  *    any associated data (mbuf's must be of type MT_SONAME).
                    509:  * 2. If the protocol supports the exchange of ``access rights'' (really
                    510:  *    just additional data associated with the message), and there are
                    511:  *    ``rights'' to be received, then a record containing this data
1.10      mycroft   512:  *    should be present (mbuf's must be of type MT_CONTROL).
1.1       cgd       513:  * 3. If a name or rights record exists, then it must be followed by
                    514:  *    a data record, perhaps of zero length.
                    515:  *
                    516:  * Before using a new socket structure it is first necessary to reserve
                    517:  * buffer space to the socket, by calling sbreserve().  This should commit
                    518:  * some of the available buffer space in the system buffer pool for the
                    519:  * socket (currently, it does nothing but enforce limits).  The space
                    520:  * should be released by calling sbrelease() when the socket is destroyed.
                    521:  */
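/*
 * Illustrative sketch only (hypothetical caller, not from this file):
 * how a datagram protocol might follow the record conventions above
 * when delivering an incoming packet "m" from sender address "from"
 * to a socket's receive buffer.
 */
#if 0
	if (sbappendaddr(&so->so_rcv, from, m, NULL) == 0)
		m_freem(m);		/* no space or no mbufs: datagram dropped */
	else
		sorwakeup(so);		/* a complete record is now readable */
#endif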
                    522:
1.7       mycroft   523: int
1.58      thorpej   524: sb_max_set(u_long new_sbmax)
                    525: {
                    526:        int s;
                    527:
                    528:        if (new_sbmax < (16 * 1024))
                    529:                return (EINVAL);
                    530:
                    531:        s = splsoftnet();
                    532:        sb_max = new_sbmax;
                    533:        sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES);
                    534:        splx(s);
                    535:
                    536:        return (0);
                    537: }
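/*
 * Worked example of the adjustment above (the constants are
 * port-dependent; this is only an illustration): with MSIZE 512 and
 * MCLBYTES 2048, sb_max_adj = sb_max * 2048 / (512 + 2048) = 0.8 * sb_max,
 * i.e. the data high-water mark is scaled down so that per-cluster mbuf
 * overhead does not make sb_mbmax the limiting factor.
 */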
                    538:
                    539: int
1.37      lukem     540: soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
1.1       cgd       541: {
1.91      ad        542:
                    543:        KASSERT(so->so_lock == NULL || solocked(so));
                    544:
1.74      christos  545:        /*
                     546:         * There's at least one application (the configure script of screen)
                     547:         * which expects a fifo to be writable even if it has "some" bytes
                     548:         * in its buffer.
                     549:         * So we want to make sure (hiwat - lowat) >= (some bytes).
                    550:         *
                    551:         * PIPE_BUF here is an arbitrary value chosen as (some bytes) above.
                    552:         * we expect it's large enough for such applications.
                    553:         */
                    554:        u_long  lowat = MAX(sock_loan_thresh, MCLBYTES);
                    555:        u_long  hiwat = lowat + PIPE_BUF;
1.1       cgd       556:
1.74      christos  557:        if (sndcc < hiwat)
                    558:                sndcc = hiwat;
1.59      christos  559:        if (sbreserve(&so->so_snd, sndcc, so) == 0)
1.1       cgd       560:                goto bad;
1.59      christos  561:        if (sbreserve(&so->so_rcv, rcvcc, so) == 0)
1.1       cgd       562:                goto bad2;
                    563:        if (so->so_rcv.sb_lowat == 0)
                    564:                so->so_rcv.sb_lowat = 1;
                    565:        if (so->so_snd.sb_lowat == 0)
1.74      christos  566:                so->so_snd.sb_lowat = lowat;
1.1       cgd       567:        if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
                    568:                so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
                    569:        return (0);
1.37      lukem     570:  bad2:
1.59      christos  571:        sbrelease(&so->so_snd, so);
1.37      lukem     572:  bad:
1.1       cgd       573:        return (ENOBUFS);
                    574: }
                    575:
                    576: /*
                    577:  * Allot mbufs to a sockbuf.
                    578:  * Attempt to scale mbmax so that mbcnt doesn't become limiting
                    579:  * if buffering efficiency is near the normal case.
                    580:  */
1.7       mycroft   581: int
1.59      christos  582: sbreserve(struct sockbuf *sb, u_long cc, struct socket *so)
1.1       cgd       583: {
1.75      ad        584:        struct lwp *l = curlwp; /* XXX */
1.62      christos  585:        rlim_t maxcc;
1.67      christos  586:        struct uidinfo *uidinfo;
1.1       cgd       587:
1.91      ad        588:        KASSERT(so->so_lock == NULL || solocked(so));
                    589:        KASSERT(sb->sb_so == so);
                    590:        KASSERT(sb_max_adj != 0);
                    591:
1.58      thorpej   592:        if (cc == 0 || cc > sb_max_adj)
1.1       cgd       593:                return (0);
1.93      christos  594:
1.105     elad      595:        maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur;
1.93      christos  596:
                    597:        uidinfo = so->so_uidinfo;
1.67      christos  598:        if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc))
1.62      christos  599:                return 0;
1.1       cgd       600:        sb->sb_mbmax = min(cc * 2, sb_max);
                    601:        if (sb->sb_lowat > sb->sb_hiwat)
                    602:                sb->sb_lowat = sb->sb_hiwat;
                    603:        return (1);
                    604: }
                    605:
                    606: /*
1.91      ad        607:  * Free mbufs held by a socket, and reserved mbuf space.  We do not assert
                    608:  * that the socket is held locked here: see sorflush().
1.1       cgd       609:  */
1.7       mycroft   610: void
1.59      christos  611: sbrelease(struct sockbuf *sb, struct socket *so)
1.1       cgd       612: {
                    613:
1.91      ad        614:        KASSERT(sb->sb_so == so);
                    615:
1.1       cgd       616:        sbflush(sb);
1.87      yamt      617:        (void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY);
1.59      christos  618:        sb->sb_mbmax = 0;
1.1       cgd       619: }
                    620:
                    621: /*
                    622:  * Routines to add and remove
                    623:  * data from an mbuf queue.
                    624:  *
                    625:  * The routines sbappend() or sbappendrecord() are normally called to
                    626:  * append new mbufs to a socket buffer, after checking that adequate
                     627:  * space is available by comparing the result of sbspace() with the amount
                    628:  * of data to be added.  sbappendrecord() differs from sbappend() in
                    629:  * that data supplied is treated as the beginning of a new record.
                    630:  * To place a sender's address, optional access rights, and data in a
                    631:  * socket receive buffer, sbappendaddr() should be used.  To place
                    632:  * access rights and data in a socket receive buffer, sbappendrights()
                    633:  * should be used.  In either case, the new data begins a new record.
                    634:  * Note that unlike sbappend() and sbappendrecord(), these routines check
                    635:  * for the caller that there will be enough space to store the data.
                    636:  * Each fails if there is not enough space, or if it cannot find mbufs
                    637:  * to store additional information in.
                    638:  *
                    639:  * Reliable protocols may use the socket send buffer to hold data
                    640:  * awaiting acknowledgement.  Data is normally copied from a socket
                    641:  * send buffer in a protocol with m_copy for output to a peer,
                     642:  * and the data is then removed from the socket buffer with sbdrop()
                    643:  * or sbdroprecord() when the data is acknowledged by the peer.
                    644:  */
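/*
 * Illustrative sketch only (hypothetical, not from this file): the
 * send-buffer pattern described above for a reliable stream protocol.
 * "len" and "acked" are assumed byte counts supplied by the protocol.
 */
#if 0
	struct mbuf *n;

	sbappendstream(&so->so_snd, m);		/* queue data awaiting acknowledgement */
	n = m_copym(so->so_snd.sb_mb, 0, len, M_DONTWAIT);
	/* ... hand "n" to the output path ... */
	sbdrop(&so->so_snd, acked);		/* peer acknowledged "acked" bytes */
#endif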
                    645:
1.43      thorpej   646: #ifdef SOCKBUF_DEBUG
                    647: void
                    648: sblastrecordchk(struct sockbuf *sb, const char *where)
                    649: {
                    650:        struct mbuf *m = sb->sb_mb;
                    651:
1.91      ad        652:        KASSERT(solocked(sb->sb_so));
                    653:
1.43      thorpej   654:        while (m && m->m_nextpkt)
                    655:                m = m->m_nextpkt;
                    656:
                    657:        if (m != sb->sb_lastrecord) {
                    658:                printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
                    659:                    sb->sb_mb, sb->sb_lastrecord, m);
                    660:                printf("packet chain:\n");
                    661:                for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
                    662:                        printf("\t%p\n", m);
1.47      provos    663:                panic("sblastrecordchk from %s", where);
1.43      thorpej   664:        }
                    665: }
                    666:
                    667: void
                    668: sblastmbufchk(struct sockbuf *sb, const char *where)
                    669: {
                    670:        struct mbuf *m = sb->sb_mb;
                    671:        struct mbuf *n;
                    672:
1.91      ad        673:        KASSERT(solocked(sb->sb_so));
                    674:
1.43      thorpej   675:        while (m && m->m_nextpkt)
                    676:                m = m->m_nextpkt;
                    677:
                    678:        while (m && m->m_next)
                    679:                m = m->m_next;
                    680:
                    681:        if (m != sb->sb_mbtail) {
                    682:                printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
                    683:                    sb->sb_mb, sb->sb_mbtail, m);
                    684:                printf("packet tree:\n");
                    685:                for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
                    686:                        printf("\t");
                    687:                        for (n = m; n != NULL; n = n->m_next)
                    688:                                printf("%p ", n);
                    689:                        printf("\n");
                    690:                }
                    691:                panic("sblastmbufchk from %s", where);
                    692:        }
                    693: }
                    694: #endif /* SOCKBUF_DEBUG */
                    695:
1.63      jonathan  696: /*
                    697:  * Link a chain of records onto a socket buffer
                    698:  */
                    699: #define        SBLINKRECORDCHAIN(sb, m0, mlast)                                \
1.43      thorpej   700: do {                                                                   \
                    701:        if ((sb)->sb_lastrecord != NULL)                                \
                    702:                (sb)->sb_lastrecord->m_nextpkt = (m0);                  \
                    703:        else                                                            \
                    704:                (sb)->sb_mb = (m0);                                     \
1.63      jonathan  705:        (sb)->sb_lastrecord = (mlast);                                  \
1.43      thorpej   706: } while (/*CONSTCOND*/0)
                    707:
1.63      jonathan  708:
                    709: #define        SBLINKRECORD(sb, m0)                                            \
                    710:     SBLINKRECORDCHAIN(sb, m0, m0)
                    711:
1.1       cgd       712: /*
                    713:  * Append mbuf chain m to the last record in the
                     714:  * socket buffer sb.  The additional space associated with
                    715:  * the mbuf chain is recorded in sb.  Empty mbufs are
                    716:  * discarded and mbufs are compacted where possible.
                    717:  */
1.7       mycroft   718: void
1.37      lukem     719: sbappend(struct sockbuf *sb, struct mbuf *m)
1.1       cgd       720: {
1.37      lukem     721:        struct mbuf     *n;
1.1       cgd       722:
1.91      ad        723:        KASSERT(solocked(sb->sb_so));
                    724:
1.109.2.2! yamt      725:        if (m == NULL)
1.1       cgd       726:                return;
1.43      thorpej   727:
1.49      matt      728: #ifdef MBUFTRACE
1.65      jonathan  729:        m_claimm(m, sb->sb_mowner);
1.49      matt      730: #endif
                    731:
1.43      thorpej   732:        SBLASTRECORDCHK(sb, "sbappend 1");
                    733:
                    734:        if ((n = sb->sb_lastrecord) != NULL) {
                    735:                /*
                    736:                 * XXX Would like to simply use sb_mbtail here, but
                    737:                 * XXX I need to verify that I won't miss an EOR that
                    738:                 * XXX way.
                    739:                 */
1.1       cgd       740:                do {
                    741:                        if (n->m_flags & M_EOR) {
                    742:                                sbappendrecord(sb, m); /* XXXXXX!!!! */
                    743:                                return;
                    744:                        }
                    745:                } while (n->m_next && (n = n->m_next));
1.43      thorpej   746:        } else {
                    747:                /*
                    748:                 * If this is the first record in the socket buffer, it's
                    749:                 * also the last record.
                    750:                 */
                    751:                sb->sb_lastrecord = m;
1.1       cgd       752:        }
                    753:        sbcompress(sb, m, n);
1.43      thorpej   754:        SBLASTRECORDCHK(sb, "sbappend 2");
                    755: }
                    756:
                    757: /*
                    758:  * This version of sbappend() should only be used when the caller
                    759:  * absolutely knows that there will never be more than one record
                    760:  * in the socket buffer, that is, a stream protocol (such as TCP).
                    761:  */
                    762: void
1.44      thorpej   763: sbappendstream(struct sockbuf *sb, struct mbuf *m)
1.43      thorpej   764: {
                    765:
1.91      ad        766:        KASSERT(solocked(sb->sb_so));
1.43      thorpej   767:        KDASSERT(m->m_nextpkt == NULL);
                    768:        KASSERT(sb->sb_mb == sb->sb_lastrecord);
                    769:
                    770:        SBLASTMBUFCHK(sb, __func__);
                    771:
1.49      matt      772: #ifdef MBUFTRACE
1.65      jonathan  773:        m_claimm(m, sb->sb_mowner);
1.49      matt      774: #endif
                    775:
1.43      thorpej   776:        sbcompress(sb, m, sb->sb_mbtail);
                    777:
                    778:        sb->sb_lastrecord = sb->sb_mb;
                    779:        SBLASTRECORDCHK(sb, __func__);
1.1       cgd       780: }
                    781:
                    782: #ifdef SOCKBUF_DEBUG
1.7       mycroft   783: void
1.37      lukem     784: sbcheck(struct sockbuf *sb)
1.1       cgd       785: {
1.91      ad        786:        struct mbuf     *m, *m2;
1.43      thorpej   787:        u_long          len, mbcnt;
1.1       cgd       788:
1.91      ad        789:        KASSERT(solocked(sb->sb_so));
                    790:
1.37      lukem     791:        len = 0;
                    792:        mbcnt = 0;
1.91      ad        793:        for (m = sb->sb_mb; m; m = m->m_nextpkt) {
                    794:                for (m2 = m; m2 != NULL; m2 = m2->m_next) {
                    795:                        len += m2->m_len;
                    796:                        mbcnt += MSIZE;
                    797:                        if (m2->m_flags & M_EXT)
                    798:                                mbcnt += m2->m_ext.ext_size;
                    799:                        if (m2->m_nextpkt != NULL)
                    800:                                panic("sbcheck nextpkt");
                    801:                }
1.1       cgd       802:        }
                    803:        if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
1.43      thorpej   804:                printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
1.1       cgd       805:                    mbcnt, sb->sb_mbcnt);
                    806:                panic("sbcheck");
                    807:        }
                    808: }
                    809: #endif
                    810:
                    811: /*
                    812:  * As above, except the mbuf chain
                    813:  * begins a new record.
                    814:  */
1.7       mycroft   815: void
1.37      lukem     816: sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
1.1       cgd       817: {
1.37      lukem     818:        struct mbuf     *m;
1.1       cgd       819:
1.91      ad        820:        KASSERT(solocked(sb->sb_so));
                    821:
1.109.2.2! yamt      822:        if (m0 == NULL)
1.1       cgd       823:                return;
1.43      thorpej   824:
1.49      matt      825: #ifdef MBUFTRACE
1.65      jonathan  826:        m_claimm(m0, sb->sb_mowner);
1.49      matt      827: #endif
1.1       cgd       828:        /*
                    829:         * Put the first mbuf on the queue.
                    830:         * Note this permits zero length records.
                    831:         */
                    832:        sballoc(sb, m0);
1.43      thorpej   833:        SBLASTRECORDCHK(sb, "sbappendrecord 1");
                    834:        SBLINKRECORD(sb, m0);
1.1       cgd       835:        m = m0->m_next;
                    836:        m0->m_next = 0;
                    837:        if (m && (m0->m_flags & M_EOR)) {
                    838:                m0->m_flags &= ~M_EOR;
                    839:                m->m_flags |= M_EOR;
                    840:        }
                    841:        sbcompress(sb, m, m0);
1.43      thorpej   842:        SBLASTRECORDCHK(sb, "sbappendrecord 2");
1.1       cgd       843: }
                    844:
                    845: /*
                    846:  * As above except that OOB data
                    847:  * is inserted at the beginning of the sockbuf,
                    848:  * but after any other OOB data.
                    849:  */
1.7       mycroft   850: void
1.37      lukem     851: sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
1.1       cgd       852: {
1.37      lukem     853:        struct mbuf     *m, **mp;
1.1       cgd       854:
1.91      ad        855:        KASSERT(solocked(sb->sb_so));
                    856:
1.109.2.2! yamt      857:        if (m0 == NULL)
1.1       cgd       858:                return;
1.43      thorpej   859:
                    860:        SBLASTRECORDCHK(sb, "sbinsertoob 1");
                    861:
1.11      christos  862:        for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
1.1       cgd       863:            again:
                    864:                switch (m->m_type) {
                    865:
                    866:                case MT_OOBDATA:
                    867:                        continue;               /* WANT next train */
                    868:
                    869:                case MT_CONTROL:
1.11      christos  870:                        if ((m = m->m_next) != NULL)
1.1       cgd       871:                                goto again;     /* inspect THIS train further */
                    872:                }
                    873:                break;
                    874:        }
                    875:        /*
                    876:         * Put the first mbuf on the queue.
                    877:         * Note this permits zero length records.
                    878:         */
                    879:        sballoc(sb, m0);
                    880:        m0->m_nextpkt = *mp;
1.43      thorpej   881:        if (*mp == NULL) {
                    882:                /* m0 is actually the new tail */
                    883:                sb->sb_lastrecord = m0;
                    884:        }
1.1       cgd       885:        *mp = m0;
                    886:        m = m0->m_next;
                    887:        m0->m_next = 0;
                    888:        if (m && (m0->m_flags & M_EOR)) {
                    889:                m0->m_flags &= ~M_EOR;
                    890:                m->m_flags |= M_EOR;
                    891:        }
                    892:        sbcompress(sb, m, m0);
1.43      thorpej   893:        SBLASTRECORDCHK(sb, "sbinsertoob 2");
1.1       cgd       894: }
                    895:
                    896: /*
                    897:  * Append address and data, and optionally, control (ancillary) data
                    898:  * to the receive queue of a socket.  If present,
                    899:  * m0 must include a packet header with total length.
                    900:  * Returns 0 if no space in sockbuf or insufficient mbufs.
                    901:  */
1.7       mycroft   902: int
1.61      matt      903: sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
1.37      lukem     904:        struct mbuf *control)
1.1       cgd       905: {
1.43      thorpej   906:        struct mbuf     *m, *n, *nlast;
1.50      fvdl      907:        int             space, len;
1.1       cgd       908:
1.91      ad        909:        KASSERT(solocked(sb->sb_so));
                    910:
1.37      lukem     911:        space = asa->sa_len;
                    912:
1.49      matt      913:        if (m0 != NULL) {
                    914:                if ((m0->m_flags & M_PKTHDR) == 0)
                    915:                        panic("sbappendaddr");
1.1       cgd       916:                space += m0->m_pkthdr.len;
1.49      matt      917: #ifdef MBUFTRACE
1.65      jonathan  918:                m_claimm(m0, sb->sb_mowner);
1.49      matt      919: #endif
                    920:        }
1.1       cgd       921:        for (n = control; n; n = n->m_next) {
                    922:                space += n->m_len;
1.49      matt      923:                MCLAIM(n, sb->sb_mowner);
1.109.2.2! yamt      924:                if (n->m_next == NULL)  /* keep pointer to last control buf */
1.1       cgd       925:                        break;
                    926:        }
                    927:        if (space > sbspace(sb))
                    928:                return (0);
1.109.2.2! yamt      929:        m = m_get(M_DONTWAIT, MT_SONAME);
        !           930:        if (m == NULL)
1.1       cgd       931:                return (0);
1.49      matt      932:        MCLAIM(m, sb->sb_mowner);
1.50      fvdl      933:        /*
                    934:         * XXX avoid 'comparison always true' warning which isn't easily
                    935:         * avoided.
                    936:         */
                    937:        len = asa->sa_len;
                    938:        if (len > MLEN) {
1.20      thorpej   939:                MEXTMALLOC(m, asa->sa_len, M_NOWAIT);
                    940:                if ((m->m_flags & M_EXT) == 0) {
                    941:                        m_free(m);
                    942:                        return (0);
                    943:                }
                    944:        }
1.1       cgd       945:        m->m_len = asa->sa_len;
1.82      christos  946:        memcpy(mtod(m, void *), asa, asa->sa_len);
1.1       cgd       947:        if (n)
                    948:                n->m_next = m0;         /* concatenate data to control */
                    949:        else
                    950:                control = m0;
                    951:        m->m_next = control;
1.43      thorpej   952:
                    953:        SBLASTRECORDCHK(sb, "sbappendaddr 1");
                    954:
                    955:        for (n = m; n->m_next != NULL; n = n->m_next)
1.1       cgd       956:                sballoc(sb, n);
1.43      thorpej   957:        sballoc(sb, n);
                    958:        nlast = n;
                    959:        SBLINKRECORD(sb, m);
                    960:
                    961:        sb->sb_mbtail = nlast;
                    962:        SBLASTMBUFCHK(sb, "sbappendaddr");
                    963:        SBLASTRECORDCHK(sb, "sbappendaddr 2");
                    964:
1.1       cgd       965:        return (1);
                    966: }
                    967:
1.63      jonathan  968: /*
                    969:  * Helper for sbappendchainaddr: prepend a struct sockaddr* to
                    970:  * an mbuf chain.
                    971:  */
1.70      perry     972: static inline struct mbuf *
1.81      yamt      973: m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0,
1.64      jonathan  974:                   const struct sockaddr *asa)
1.63      jonathan  975: {
                    976:        struct mbuf *m;
1.64      jonathan  977:        const int salen = asa->sa_len;
1.63      jonathan  978:
1.91      ad        979:        KASSERT(solocked(sb->sb_so));
                    980:
1.63      jonathan  981:        /* only the first in each chain need be a pkthdr */
1.109.2.2! yamt      982:        m = m_gethdr(M_DONTWAIT, MT_SONAME);
        !           983:        if (m == NULL)
        !           984:                return NULL;
1.63      jonathan  985:        MCLAIM(m, sb->sb_mowner);
1.64      jonathan  986: #ifdef notyet
                    987:        if (salen > MHLEN) {
                    988:                MEXTMALLOC(m, salen, M_NOWAIT);
                    989:                if ((m->m_flags & M_EXT) == 0) {
                    990:                        m_free(m);
1.109.2.2! yamt      991:                        return NULL;
1.64      jonathan  992:                }
                    993:        }
                    994: #else
                    995:        KASSERT(salen <= MHLEN);
                    996: #endif
                    997:        m->m_len = salen;
1.82      christos  998:        memcpy(mtod(m, void *), asa, salen);
1.63      jonathan  999:        m->m_next = m0;
1.64      jonathan 1000:        m->m_pkthdr.len = salen + m0->m_pkthdr.len;
1.63      jonathan 1001:
                   1002:        return m;
                   1003: }
                   1004:
                   1005: int
                   1006: sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa,
                   1007:                  struct mbuf *m0, int sbprio)
                   1008: {
                   1009:        struct mbuf *m, *n, *n0, *nlast;
                   1010:        int error;
                   1011:
1.91      ad       1012:        KASSERT(solocked(sb->sb_so));
                   1013:
1.63      jonathan 1014:        /*
                   1015:         * XXX sbprio reserved for encoding priority of this request:
                   1016:         *  SB_PRIO_NONE --> honour normal sb limits
                   1017:         *  SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space,
                   1018:         *      take whole chain. Intended for large requests
                   1019:         *      that should be delivered atomically (all, or none).
                   1020:         *  SB_PRIO_OVERDRAFT --> allow a small (2*MLEN) overflow
                   1021:         *      over normal socket limits, for messages indicating
                   1022:         *      buffer overflow in earlier normal/lower-priority messages.
                   1023:         *  SB_PRIO_BESTEFFORT --> ignore limits entirely.
                   1024:         *      Intended for kernel-generated messages only.
                   1025:         *      Up to generator to avoid total mbuf resource exhaustion.
                   1026:         */
                   1027:        (void)sbprio;
                   1028:
                   1029:        if (m0 && (m0->m_flags & M_PKTHDR) == 0)
                   1030:                panic("sbappendaddrchain");
                   1031:
1.109.2.2! yamt     1032: #ifdef notyet
1.63      jonathan 1033:        space = sbspace(sb);
1.66      perry    1034:
                   1035:        /*
1.63      jonathan 1036:         * Enforce SB_PRIO_* limits as described above.
                   1037:         */
                   1038: #endif
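
#if 0
	/*
	 * [Editor's sketch, an assumption rather than committed code.]
	 * One possible shape for the SB_PRIO_* enforcement described in the
	 * comment above.  "space" and "len" are not declared in the current
	 * function, and returning 0 on rejection (paralleling sbappendaddr())
	 * is also an assumption.
	 */
	space = sbspace(sb);
	len = 0;
	for (m = m0; m != NULL; m = m->m_nextpkt)
		len += m_length(m) + asa->sa_len;
	switch (sbprio) {
	case SB_PRIO_NONE:
		if (len > space)
			return (0);	/* honour normal sb limits */
		break;
	case SB_PRIO_ONESHOT_OVERFLOW:
		if (space <= 0)
			return (0);	/* any space at all? then take it all */
		break;
	case SB_PRIO_OVERDRAFT:
		if (len > space + 2 * MLEN)
			return (0);	/* small overflow allowed */
		break;
	case SB_PRIO_BESTEFFORT:
		break;			/* no limit check at all */
	}
#endif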
                   1039:
                   1040:        n0 = NULL;
                   1041:        nlast = NULL;
                   1042:        for (m = m0; m; m = m->m_nextpkt) {
                   1043:                struct mbuf *np;
                   1044:
1.64      jonathan 1045: #ifdef MBUFTRACE
1.65      jonathan 1046:                m_claimm(m, sb->sb_mowner);
1.64      jonathan 1047: #endif
                   1048:
1.63      jonathan 1049:                /* Prepend sockaddr to this record (m) of input chain m0 */
1.64      jonathan 1050:                n = m_prepend_sockaddr(sb, m, asa);
1.63      jonathan 1051:                if (n == NULL) {
                   1052:                        error = ENOBUFS;
                   1053:                        goto bad;
                   1054:                }
                   1055:
                   1056:                /* Append record (asa+m) to end of new chain n0 */
                   1057:                if (n0 == NULL) {
                   1058:                        n0 = n;
                   1059:                } else {
                   1060:                        nlast->m_nextpkt = n;
                   1061:                }
                   1062:                /* Keep track of last record on new chain */
                   1063:                nlast = n;
                   1064:
                   1065:                for (np = n; np; np = np->m_next)
                   1066:                        sballoc(sb, np);
                   1067:        }
                   1068:
1.64      jonathan 1069:        SBLASTRECORDCHK(sb, "sbappendaddrchain 1");
                   1070:
1.63      jonathan 1071:        /* Drop the entire chain of (asa+m) records onto the socket */
                   1072:        SBLINKRECORDCHAIN(sb, n0, nlast);
1.64      jonathan 1073:
                   1074:        SBLASTRECORDCHK(sb, "sbappendaddrchain 2");
                   1075:
1.63      jonathan 1076:        for (m = nlast; m->m_next; m = m->m_next)
                   1077:                ;
                   1078:        sb->sb_mbtail = m;
1.64      jonathan 1079:        SBLASTMBUFCHK(sb, "sbappendaddrchain");
                   1080:
1.63      jonathan 1081:        return (1);
                   1082:
                   1083: bad:
1.64      jonathan 1084:        /*
                   1085:         * On error, free the prepended addresses. For consistency
                   1086:         * with sbappendaddr(), leave it to our caller to free
                   1087:         * the input record chain passed to us as m0.
                   1088:         */
                   1089:        while ((n = n0) != NULL) {
                   1090:                struct mbuf *np;
                   1091:
                   1092:                /* Undo the sballoc() of this record */
                   1093:                for (np = n; np; np = np->m_next)
                   1094:                        sbfree(sb, np);
                   1095:
                   1096:                n0 = n->m_nextpkt;      /* advance to next prepended address */
                   1097:                MFREE(n, np);           /* free prepended address (not data) */
                   1098:        }
1.109.2.2! yamt     1099:        return error;
1.63      jonathan 1100: }
                   1101:
                   1102:
1.7       mycroft  1103: int
1.37      lukem    1104: sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
1.1       cgd      1105: {
1.43      thorpej  1106:        struct mbuf     *m, *mlast, *n;
1.37      lukem    1107:        int             space;
1.1       cgd      1108:
1.91      ad       1109:        KASSERT(solocked(sb->sb_so));
                   1110:
1.37      lukem    1111:        space = 0;
1.109.2.2! yamt     1112:        if (control == NULL)
1.1       cgd      1113:                panic("sbappendcontrol");
                   1114:        for (m = control; ; m = m->m_next) {
                   1115:                space += m->m_len;
1.49      matt     1116:                MCLAIM(m, sb->sb_mowner);
1.109.2.2! yamt     1117:                if (m->m_next == NULL)
1.1       cgd      1118:                        break;
                   1119:        }
                   1120:        n = m;                  /* save pointer to last control buffer */
1.49      matt     1121:        for (m = m0; m; m = m->m_next) {
                   1122:                MCLAIM(m, sb->sb_mowner);
1.1       cgd      1123:                space += m->m_len;
1.49      matt     1124:        }
1.1       cgd      1125:        if (space > sbspace(sb))
                   1126:                return (0);
                   1127:        n->m_next = m0;                 /* concatenate data to control */
1.43      thorpej  1128:
                   1129:        SBLASTRECORDCHK(sb, "sbappendcontrol 1");
                   1130:
                   1131:        for (m = control; m->m_next != NULL; m = m->m_next)
1.1       cgd      1132:                sballoc(sb, m);
1.43      thorpej  1133:        sballoc(sb, m);
                   1134:        mlast = m;
                   1135:        SBLINKRECORD(sb, control);
                   1136:
                   1137:        sb->sb_mbtail = mlast;
                   1138:        SBLASTMBUFCHK(sb, "sbappendcontrol");
                   1139:        SBLASTRECORDCHK(sb, "sbappendcontrol 2");
                   1140:
1.1       cgd      1141:        return (1);
                   1142: }
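
/*
 * [Editor's note: illustrative sketch, not part of the annotated file.]
 * As with sbappendaddr(), a return of 0 means nothing was appended and
 * the caller still owns both chains, e.g. (hypothetical names):
 *
 *	if (sbappendcontrol(&so->so_rcv, m, control) == 0) {
 *		m_freem(control);
 *		m_freem(m);
 *	} else
 *		sorwakeup(so);
 */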
                   1143:
                   1144: /*
                   1145:  * Compress mbuf chain m into the socket
                   1146:  * buffer sb following mbuf n.  If n
                   1147:  * is null, the buffer is presumed empty.
                   1148:  */
1.7       mycroft  1149: void
1.37      lukem    1150: sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
1.1       cgd      1151: {
1.37      lukem    1152:        int             eor;
                   1153:        struct mbuf     *o;
1.1       cgd      1154:
1.91      ad       1155:        KASSERT(solocked(sb->sb_so));
                   1156:
1.37      lukem    1157:        eor = 0;
1.1       cgd      1158:        while (m) {
                   1159:                eor |= m->m_flags & M_EOR;
                   1160:                if (m->m_len == 0 &&
                   1161:                    (eor == 0 ||
                   1162:                     (((o = m->m_next) || (o = n)) &&
                   1163:                      o->m_type == m->m_type))) {
1.46      thorpej  1164:                        if (sb->sb_lastrecord == m)
                   1165:                                sb->sb_lastrecord = m->m_next;
1.1       cgd      1166:                        m = m_free(m);
                   1167:                        continue;
                   1168:                }
1.40      thorpej  1169:                if (n && (n->m_flags & M_EOR) == 0 &&
                   1170:                    /* M_TRAILINGSPACE() checks buffer writeability */
                   1171:                    m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */
                   1172:                    m->m_len <= M_TRAILINGSPACE(n) &&
                   1173:                    n->m_type == m->m_type) {
1.82      christos 1174:                        memcpy(mtod(n, char *) + n->m_len, mtod(m, void *),
1.1       cgd      1175:                            (unsigned)m->m_len);
                   1176:                        n->m_len += m->m_len;
                   1177:                        sb->sb_cc += m->m_len;
                   1178:                        m = m_free(m);
                   1179:                        continue;
                   1180:                }
                   1181:                if (n)
                   1182:                        n->m_next = m;
                   1183:                else
                   1184:                        sb->sb_mb = m;
1.43      thorpej  1185:                sb->sb_mbtail = m;
1.1       cgd      1186:                sballoc(sb, m);
                   1187:                n = m;
                   1188:                m->m_flags &= ~M_EOR;
                   1189:                m = m->m_next;
                   1190:                n->m_next = 0;
                   1191:        }
                   1192:        if (eor) {
                   1193:                if (n)
                   1194:                        n->m_flags |= eor;
                   1195:                else
1.15      christos 1196:                        printf("semi-panic: sbcompress\n");
1.1       cgd      1197:        }
1.43      thorpej  1198:        SBLASTMBUFCHK(sb, __func__);
1.1       cgd      1199: }
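
/*
 * [Editor's note: illustrative sketch, not part of the annotated file.]
 * A stream-style append would normally pass the current buffer tail as
 * "n" so that small mbufs coalesce into the existing data, roughly:
 *
 *	sbcompress(sb, m, sb->sb_mbtail);
 */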
                   1200:
                   1201: /*
                   1202:  * Free all mbufs in a sockbuf.
                   1203:  * Check that all resources are reclaimed.
                   1204:  */
1.7       mycroft  1205: void
1.37      lukem    1206: sbflush(struct sockbuf *sb)
1.1       cgd      1207: {
                   1208:
1.91      ad       1209:        KASSERT(solocked(sb->sb_so));
1.43      thorpej  1210:        KASSERT((sb->sb_flags & SB_LOCK) == 0);
                   1211:
1.1       cgd      1212:        while (sb->sb_mbcnt)
                   1213:                sbdrop(sb, (int)sb->sb_cc);
1.43      thorpej  1214:
                   1215:        KASSERT(sb->sb_cc == 0);
                   1216:        KASSERT(sb->sb_mb == NULL);
                   1217:        KASSERT(sb->sb_mbtail == NULL);
                   1218:        KASSERT(sb->sb_lastrecord == NULL);
1.1       cgd      1219: }
                   1220:
                   1221: /*
                   1222:  * Drop data from (the front of) a sockbuf.
                   1223:  */
1.7       mycroft  1224: void
1.37      lukem    1225: sbdrop(struct sockbuf *sb, int len)
1.1       cgd      1226: {
1.37      lukem    1227:        struct mbuf     *m, *mn, *next;
1.1       cgd      1228:
1.91      ad       1229:        KASSERT(solocked(sb->sb_so));
                   1230:
1.109.2.2! yamt     1231:        next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
1.1       cgd      1232:        while (len > 0) {
1.109.2.2! yamt     1233:                if (m == NULL) {
        !          1234:                        if (next == NULL)
        !          1235:                                panic("sbdrop(%p,%d): cc=%lu",
        !          1236:                                    sb, len, sb->sb_cc);
1.1       cgd      1237:                        m = next;
                   1238:                        next = m->m_nextpkt;
                   1239:                        continue;
                   1240:                }
                   1241:                if (m->m_len > len) {
                   1242:                        m->m_len -= len;
                   1243:                        m->m_data += len;
                   1244:                        sb->sb_cc -= len;
                   1245:                        break;
                   1246:                }
                   1247:                len -= m->m_len;
                   1248:                sbfree(sb, m);
                   1249:                MFREE(m, mn);
                   1250:                m = mn;
                   1251:        }
                   1252:        while (m && m->m_len == 0) {
                   1253:                sbfree(sb, m);
                   1254:                MFREE(m, mn);
                   1255:                m = mn;
                   1256:        }
                   1257:        if (m) {
                   1258:                sb->sb_mb = m;
                   1259:                m->m_nextpkt = next;
                   1260:        } else
                   1261:                sb->sb_mb = next;
1.43      thorpej  1262:        /*
1.45      thorpej  1263:         * First part is an inline SB_EMPTY_FIXUP().  Second part
1.43      thorpej  1264:         * makes sure sb_lastrecord is up-to-date if we dropped
                   1265:         * part of the last record.
                   1266:         */
                   1267:        m = sb->sb_mb;
                   1268:        if (m == NULL) {
                   1269:                sb->sb_mbtail = NULL;
                   1270:                sb->sb_lastrecord = NULL;
                   1271:        } else if (m->m_nextpkt == NULL)
                   1272:                sb->sb_lastrecord = m;
1.1       cgd      1273: }
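
/*
 * [Editor's note: illustrative sketch, not part of the annotated file.]
 * A typical caller is a stream protocol releasing send-buffer data that
 * the peer has acknowledged ("acked" is a hypothetical byte count):
 *
 *	sbdrop(&so->so_snd, acked);
 *	sowwakeup(so);
 */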
                   1274:
                   1275: /*
                   1276:  * Drop a record off the front of a sockbuf
                   1277:  * and move the next record to the front.
                   1278:  */
1.7       mycroft  1279: void
1.37      lukem    1280: sbdroprecord(struct sockbuf *sb)
1.1       cgd      1281: {
1.37      lukem    1282:        struct mbuf     *m, *mn;
1.1       cgd      1283:
1.91      ad       1284:        KASSERT(solocked(sb->sb_so));
                   1285:
1.1       cgd      1286:        m = sb->sb_mb;
                   1287:        if (m) {
                   1288:                sb->sb_mb = m->m_nextpkt;
                   1289:                do {
                   1290:                        sbfree(sb, m);
                   1291:                        MFREE(m, mn);
1.11      christos 1292:                } while ((m = mn) != NULL);
1.1       cgd      1293:        }
1.45      thorpej  1294:        SB_EMPTY_FIXUP(sb);
1.19      thorpej  1295: }
                   1296:
                   1297: /*
                   1298:  * Create a "control" mbuf containing the specified data
                   1299:  * with the specified type for presentation on a socket buffer.
                   1300:  */
                   1301: struct mbuf *
1.109.2.2! yamt     1302: sbcreatecontrol1(void **p, int size, int type, int level, int flags)
1.19      thorpej  1303: {
1.37      lukem    1304:        struct cmsghdr  *cp;
                   1305:        struct mbuf     *m;
1.109.2.2! yamt     1306:        int space = CMSG_SPACE(size);
1.19      thorpej  1307:
1.109.2.2! yamt     1308:        if ((flags & M_DONTWAIT) && space > MCLBYTES) {
        !          1309:                printf("%s: message too large %d\n", __func__, space);
1.30      itojun   1310:                return NULL;
                   1311:        }
                   1312:
1.109.2.2! yamt     1313:        if ((m = m_get(flags, MT_CONTROL)) == NULL)
        !          1314:                return NULL;
        !          1315:        if (space > MLEN) {
        !          1316:                if (space > MCLBYTES)
        !          1317:                        MEXTMALLOC(m, space, M_WAITOK);
        !          1318:                else
        !          1319:                        MCLGET(m, flags);
1.30      itojun   1320:                if ((m->m_flags & M_EXT) == 0) {
                   1321:                        m_free(m);
                   1322:                        return NULL;
                   1323:                }
                   1324:        }
1.19      thorpej  1325:        cp = mtod(m, struct cmsghdr *);
1.109.2.2! yamt     1326:        *p = CMSG_DATA(cp);
        !          1327:        m->m_len = space;
1.35      itojun   1328:        cp->cmsg_len = CMSG_LEN(size);
1.19      thorpej  1329:        cp->cmsg_level = level;
                   1330:        cp->cmsg_type = type;
1.109.2.2! yamt     1331:        return m;
        !          1332: }
        !          1333:
        !          1334: struct mbuf *
        !          1335: sbcreatecontrol(void *p, int size, int type, int level)
        !          1336: {
        !          1337:        struct mbuf *m;
        !          1338:        void *v;
        !          1339:
        !          1340:        m = sbcreatecontrol1(&v, size, type, level, M_DONTWAIT);
        !          1341:        if (m == NULL)
        !          1342:                return NULL;
        !          1343:        memcpy(v, p, size);
        !          1344:        return m;
1.1       cgd      1345: }
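
/*
 * [Editor's note: illustrative sketch, not part of the annotated file.]
 * A protocol delivering ancillary data, such as a receive timestamp,
 * might build the control mbuf here and queue it together with the data
 * (hypothetical "tv", "src" and "so"):
 *
 *	struct mbuf *control;
 *
 *	control = sbcreatecontrol(&tv, sizeof(tv), SCM_TIMESTAMP,
 *	    SOL_SOCKET);
 *	if (control != NULL &&
 *	    sbappendaddr(&so->so_rcv, src, m, control) == 0) {
 *		m_freem(control);
 *		m_freem(m);
 *	}
 */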
1.91      ad       1346:
                   1347: void
                   1348: solockretry(struct socket *so, kmutex_t *lock)
                   1349: {
                   1350:
                   1351:        while (lock != so->so_lock) {
                   1352:                mutex_exit(lock);
                   1353:                lock = so->so_lock;
                   1354:                mutex_enter(lock);
                   1355:        }
                   1356: }
                   1357:
                   1358: bool
                   1359: solocked(struct socket *so)
                   1360: {
                   1361:
                   1362:        return mutex_owned(so->so_lock);
                   1363: }
                   1364:
                   1365: bool
                   1366: solocked2(struct socket *so1, struct socket *so2)
                   1367: {
                   1368:        kmutex_t *lock;
                   1369:
                   1370:        lock = so1->so_lock;
                   1371:        if (lock != so2->so_lock)
                   1372:                return false;
                   1373:        return mutex_owned(lock);
                   1374: }
                   1375:
                   1376: /*
                   1377:  * Assign a default lock to a new socket.  For PRU_ATTACH, and done by
                   1378:  * protocols that do not have special locking requirements.
                   1379:  */
                   1380: void
                   1381: sosetlock(struct socket *so)
                   1382: {
                   1383:        kmutex_t *lock;
                   1384:
                   1385:        if (so->so_lock == NULL) {
                   1386:                lock = softnet_lock;
                   1387:                so->so_lock = lock;
                   1388:                mutex_obj_hold(lock);
                   1389:                mutex_enter(lock);
                   1390:        }
                   1391:
                   1392:        /* In all cases, lock must be held on return from PRU_ATTACH. */
                   1393:        KASSERT(solocked(so));
                   1394: }
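
/*
 * [Editor's note: illustrative sketch, not part of the annotated file.]
 * A protocol with no special locking requirements simply calls this at
 * the top of its attach handling, e.g.:
 *
 *	case PRU_ATTACH:
 *		sosetlock(so);
 */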
                   1395:
                   1396: /*
                   1397:  * Set lock on sockbuf sb; sleep if lock is already held.
                   1398:  * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
                   1399:  * Returns error without lock if sleep is interrupted.
                   1400:  */
                   1401: int
                   1402: sblock(struct sockbuf *sb, int wf)
                   1403: {
                   1404:        struct socket *so;
                   1405:        kmutex_t *lock;
                   1406:        int error;
                   1407:
                   1408:        KASSERT(solocked(sb->sb_so));
                   1409:
                   1410:        for (;;) {
                   1411:                if (__predict_true((sb->sb_flags & SB_LOCK) == 0)) {
                   1412:                        sb->sb_flags |= SB_LOCK;
                   1413:                        return 0;
                   1414:                }
                   1415:                if (wf != M_WAITOK)
                   1416:                        return EWOULDBLOCK;
                   1417:                so = sb->sb_so;
                   1418:                lock = so->so_lock;
                   1419:                if ((sb->sb_flags & SB_NOINTR) != 0) {
                   1420:                        cv_wait(&so->so_cv, lock);
                   1421:                        error = 0;
                   1422:                } else
                   1423:                        error = cv_wait_sig(&so->so_cv, lock);
                   1424:                if (__predict_false(lock != so->so_lock))
                   1425:                        solockretry(so, lock);
                   1426:                if (error != 0)
                   1427:                        return error;
                   1428:        }
                   1429: }
                   1430:
                   1431: void
                   1432: sbunlock(struct sockbuf *sb)
                   1433: {
                   1434:        struct socket *so;
                   1435:
                   1436:        so = sb->sb_so;
                   1437:
                   1438:        KASSERT(solocked(so));
                   1439:        KASSERT((sb->sb_flags & SB_LOCK) != 0);
                   1440:
                   1441:        sb->sb_flags &= ~SB_LOCK;
                   1442:        cv_broadcast(&so->so_cv);
                   1443: }
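
/*
 * [Editor's note: illustrative sketch, not part of the annotated file.]
 * sblock()/sbunlock() bracket operations that must see a consistent
 * buffer, with the socket lock held throughout, e.g.:
 *
 *	error = sblock(&so->so_rcv, M_WAITOK);
 *	if (error != 0)
 *		return error;
 *	(... consume data from so->so_rcv ...)
 *	sbunlock(&so->so_rcv);
 */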
                   1444:
                   1445: int
1.101     yamt     1446: sowait(struct socket *so, bool catch, int timo)
1.91      ad       1447: {
                   1448:        kmutex_t *lock;
                   1449:        int error;
                   1450:
                   1451:        KASSERT(solocked(so));
1.101     yamt     1452:        KASSERT(catch || timo != 0);
1.91      ad       1453:
                   1454:        lock = so->so_lock;
1.101     yamt     1455:        if (catch)
                   1456:                error = cv_timedwait_sig(&so->so_cv, lock, timo);
                   1457:        else
                   1458:                error = cv_timedwait(&so->so_cv, lock, timo);
1.91      ad       1459:        if (__predict_false(lock != so->so_lock))
                   1460:                solockretry(so, lock);
                   1461:        return error;
                   1462: }
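
/*
 * [Editor's note: illustrative sketch, not part of the annotated file.]
 * Code blocking on a socket condition, such as waiting for an incoming
 * connection, can loop on sowait(); passing catch=true with timo=0
 * sleeps interruptibly without a timeout (hypothetical loop):
 *
 *	while (so->so_qlen == 0 && so->so_error == 0) {
 *		error = sowait(so, true, 0);
 *		if (error != 0)
 *			break;
 *	}
 */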
