
Annotation of src/sys/kern/uipc_socket2.c, Revision 1.89.6.5

1.89.6.1  mjf         1: /*     $NetBSD$        */
1.9       cgd         2:
1.89.6.2  mjf         3: /*-
                      4:  * Copyright (c) 2008 The NetBSD Foundation, Inc.
                      5:  * All rights reserved.
                      6:  *
                      7:  * Redistribution and use in source and binary forms, with or without
                      8:  * modification, are permitted provided that the following conditions
                      9:  * are met:
                     10:  * 1. Redistributions of source code must retain the above copyright
                     11:  *    notice, this list of conditions and the following disclaimer.
                     12:  * 2. Redistributions in binary form must reproduce the above copyright
                     13:  *    notice, this list of conditions and the following disclaimer in the
                     14:  *    documentation and/or other materials provided with the distribution.
                     15:  *
                     16:  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
                     17:  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
                     18:  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
                     19:  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
                     20:  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     21:  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     22:  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     23:  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     24:  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     25:  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     26:  * POSSIBILITY OF SUCH DAMAGE.
                     27:  */
                     28:
1.1       cgd        29: /*
1.7       mycroft    30:  * Copyright (c) 1982, 1986, 1988, 1990, 1993
                     31:  *     The Regents of the University of California.  All rights reserved.
1.1       cgd        32:  *
                     33:  * Redistribution and use in source and binary forms, with or without
                     34:  * modification, are permitted provided that the following conditions
                     35:  * are met:
                     36:  * 1. Redistributions of source code must retain the above copyright
                     37:  *    notice, this list of conditions and the following disclaimer.
                     38:  * 2. Redistributions in binary form must reproduce the above copyright
                     39:  *    notice, this list of conditions and the following disclaimer in the
                     40:  *    documentation and/or other materials provided with the distribution.
1.54      agc        41:  * 3. Neither the name of the University nor the names of its contributors
1.1       cgd        42:  *    may be used to endorse or promote products derived from this software
                     43:  *    without specific prior written permission.
                     44:  *
                     45:  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
                     46:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     47:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     48:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
                     49:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
                     50:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
                     51:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
                     52:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
                     53:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
                     54:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
                     55:  * SUCH DAMAGE.
                     56:  *
1.23      fvdl       57:  *     @(#)uipc_socket2.c      8.2 (Berkeley) 2/14/95
1.1       cgd        58:  */
1.42      lukem      59:
                     60: #include <sys/cdefs.h>
1.89.6.1  mjf        61: __KERNEL_RCSID(0, "$NetBSD$");
1.51      martin     62:
                     63: #include "opt_mbuftrace.h"
1.58      thorpej    64: #include "opt_sb_max.h"
1.1       cgd        65:
1.5       mycroft    66: #include <sys/param.h>
                     67: #include <sys/systm.h>
                     68: #include <sys/proc.h>
                     69: #include <sys/file.h>
                     70: #include <sys/buf.h>
                     71: #include <sys/malloc.h>
                     72: #include <sys/mbuf.h>
                     73: #include <sys/protosw.h>
1.89.6.2  mjf        74: #include <sys/domain.h>
1.55      christos   75: #include <sys/poll.h>
1.5       mycroft    76: #include <sys/socket.h>
                     77: #include <sys/socketvar.h>
1.11      christos   78: #include <sys/signalvar.h>
1.71      elad       79: #include <sys/kauth.h>
1.89.6.2  mjf        80: #include <sys/pool.h>
1.89.6.5! mjf        81: #include <sys/uidinfo.h>
1.1       cgd        82:
                     83: /*
1.89.6.2  mjf        84:  * Primitive routines for operating on sockets and socket buffers.
                     85:  *
                     86:  * Locking rules and assumptions:
                     87:  *
                     88:  * o socket::so_lock can change on the fly.  The low level routines used
                     89:  *   to lock sockets are aware of this.  When so_lock is acquired, the
                     90:  *   routine locking must check to see if so_lock still points to the
                     91:  *   lock that was acquired.  If so_lock has changed in the meantime, the
                      92:  *   now irrelevant lock that was acquired must be dropped and the lock
                     93:  *   operation retried.  Although not proven here, this is completely safe
                     94:  *   on a multiprocessor system, even with relaxed memory ordering, given
                     95:  *   the next two rules:
                     96:  *
                     97:  * o In order to mutate so_lock, the lock pointed to by the current value
                     98:  *   of so_lock must be held: i.e., the socket must be held locked by the
                     99:  *   changing thread.  The thread must issue membar_exit() to prevent
                    100:  *   memory accesses being reordered, and can set so_lock to the desired
                    101:  *   value.  If the lock pointed to by the new value of so_lock is not
                    102:  *   held by the changing thread, the socket must then be considered
                    103:  *   unlocked.
                    104:  *
                    105:  * o If so_lock is mutated, and the previous lock referred to by so_lock
                    106:  *   could still be visible to other threads in the system (e.g. via file
                    107:  *   descriptor or protocol-internal reference), then the old lock must
                    108:  *   remain valid until the socket and/or protocol control block has been
                    109:  *   torn down.
                    110:  *
                    111:  * o If a socket has a non-NULL so_head value (i.e. is in the process of
                    112:  *   connecting), then locking the socket must also lock the socket pointed
                    113:  *   to by so_head: their lock pointers must match.
                    114:  *
                    115:  * o If a socket has connections in progress (so_q, so_q0 not empty) then
                    116:  *   locking the socket must also lock the sockets attached to both queues.
                    117:  *   Again, their lock pointers must match.
                    118:  *
                     119:  * o Beyond the initial lock assignment in socreate(), assigning locks to
                    120:  *   sockets is the responsibility of the individual protocols / protocol
                    121:  *   domains.
1.1       cgd       122:  */
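/*
 * Editorial sketch, not part of this revision: the acquire-and-retry
 * pattern required by the first locking rule above.  The helper name
 * solock_sketch() is hypothetical; the real lock entry points live
 * elsewhere in the kernel.
 */
#if 0
static void
solock_sketch(struct socket *so)
{
	kmutex_t *lock;

	for (;;) {
		lock = so->so_lock;
		mutex_enter(lock);
		/* so_lock may have been changed while we waited. */
		if (__predict_true(lock == so->so_lock))
			return;
		/* The lock we hold is now irrelevant: drop it and retry. */
		mutex_exit(lock);
	}
}
#endif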
                    123:
1.89.6.2  mjf       124: static pool_cache_t socket_cache;
1.1       cgd       125:
1.58      thorpej   126: u_long sb_max = SB_MAX;        /* maximum socket buffer size */
                    127: static u_long sb_max_adj;      /* adjusted sb_max */
                    128:
1.1       cgd       129: /*
                    130:  * Procedures to manipulate state flags of socket
                    131:  * and do appropriate wakeups.  Normal sequence from the
                    132:  * active (originating) side is that soisconnecting() is
                    133:  * called during processing of connect() call,
                    134:  * resulting in an eventual call to soisconnected() if/when the
                    135:  * connection is established.  When the connection is torn down
                    136:  * soisdisconnecting() is called during processing of disconnect() call,
                    137:  * and soisdisconnected() is called when the connection to the peer
                    138:  * is totally severed.  The semantics of these routines are such that
                    139:  * connectionless protocols can call soisconnected() and soisdisconnected()
                    140:  * only, bypassing the in-progress calls when setting up a ``connection''
                    141:  * takes no time.
                    142:  *
                    143:  * From the passive side, a socket is created with
                    144:  * two queues of sockets: so_q0 for connections in progress
                    145:  * and so_q for connections already made and awaiting user acceptance.
                    146:  * As a protocol is preparing incoming connections, it creates a socket
                    147:  * structure queued on so_q0 by calling sonewconn().  When the connection
                    148:  * is established, soisconnected() is called, and transfers the
                    149:  * socket structure to so_q, making it available to accept().
1.66      perry     150:  *
1.1       cgd       151:  * If a socket is closed with sockets on either
                    152:  * so_q0 or so_q, these sockets are dropped.
                    153:  *
                    154:  * If higher level protocols are implemented in
                    155:  * the kernel, the wakeups done here will sometimes
                    156:  * cause software-interrupt process scheduling.
                    157:  */
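/*
 * Editorial sketch, not part of this revision: the call sequence a
 * connection-oriented protocol typically makes against the state-flag
 * routines below for an actively opened socket (`so' is schematic).
 */
#if 0
	soisconnecting(so);	/* connect() request issued to the peer */
	/* handshake completes */
	soisconnected(so);	/* wakes connect()/accept() waiters */
	/* later, the connection is torn down */
	soisdisconnecting(so);	/* local side starts teardown */
	soisdisconnected(so);	/* peer completely gone */
#endif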
                    158:
1.7       mycroft   159: void
1.37      lukem     160: soisconnecting(struct socket *so)
1.1       cgd       161: {
                    162:
1.89.6.2  mjf       163:        KASSERT(solocked(so));
                    164:
1.1       cgd       165:        so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
                    166:        so->so_state |= SS_ISCONNECTING;
                    167: }
                    168:
1.7       mycroft   169: void
1.37      lukem     170: soisconnected(struct socket *so)
1.1       cgd       171: {
1.37      lukem     172:        struct socket   *head;
1.1       cgd       173:
1.37      lukem     174:        head = so->so_head;
1.89.6.2  mjf       175:
                    176:        KASSERT(solocked(so));
                    177:        KASSERT(head == NULL || solocked2(so, head));
                    178:
1.1       cgd       179:        so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
                    180:        so->so_state |= SS_ISCONNECTED;
1.89.6.4  mjf       181:        if (head && so->so_onq == &head->so_q0) {
                    182:                if ((so->so_options & SO_ACCEPTFILTER) == 0) {
                    183:                        soqremque(so, 0);
                    184:                        soqinsque(head, so, 1);
                    185:                        sorwakeup(head);
                    186:                        cv_broadcast(&head->so_cv);
                    187:                } else {
                    188:                        so->so_upcall =
                    189:                            head->so_accf->so_accept_filter->accf_callback;
                    190:                        so->so_upcallarg = head->so_accf->so_accept_filter_arg;
                    191:                        so->so_rcv.sb_flags |= SB_UPCALL;
                    192:                        so->so_options &= ~SO_ACCEPTFILTER;
1.89.6.5! mjf       193:                        (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
        !                               }
1.1       cgd       194:        } else {
1.89.6.2  mjf       195:                cv_broadcast(&so->so_cv);
1.1       cgd       196:                sorwakeup(so);
                    197:                sowwakeup(so);
                    198:        }
                    199: }
                    200:
1.7       mycroft   201: void
1.37      lukem     202: soisdisconnecting(struct socket *so)
1.1       cgd       203: {
                    204:
1.89.6.2  mjf       205:        KASSERT(solocked(so));
                    206:
1.1       cgd       207:        so->so_state &= ~SS_ISCONNECTING;
                    208:        so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
1.89.6.2  mjf       209:        cv_broadcast(&so->so_cv);
1.1       cgd       210:        sowwakeup(so);
                    211:        sorwakeup(so);
                    212: }
                    213:
1.7       mycroft   214: void
1.37      lukem     215: soisdisconnected(struct socket *so)
1.1       cgd       216: {
                    217:
1.89.6.2  mjf       218:        KASSERT(solocked(so));
                    219:
1.1       cgd       220:        so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
1.27      mycroft   221:        so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
1.89.6.2  mjf       222:        cv_broadcast(&so->so_cv);
1.1       cgd       223:        sowwakeup(so);
                    224:        sorwakeup(so);
                    225: }
                    226:
1.89.6.2  mjf       227: void
                    228: soinit2(void)
                    229: {
                    230:
                    231:        socket_cache = pool_cache_init(sizeof(struct socket), 0, 0, 0,
                    232:            "socket", NULL, IPL_SOFTNET, NULL, NULL, NULL);
                    233: }
                    234:
1.1       cgd       235: /*
                    236:  * When an attempt at a new connection is noted on a socket
                    237:  * which accepts connections, sonewconn is called.  If the
                    238:  * connection is possible (subject to space constraints, etc.)
                     239:  * then we allocate a new structure, properly linked into the
                    240:  * data structure of the original socket, and return this.
1.77      plunky    241:  * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
1.1       cgd       242:  */
                    243: struct socket *
1.76      plunky    244: sonewconn(struct socket *head, int connstatus)
1.1       cgd       245: {
1.37      lukem     246:        struct socket   *so;
1.89.6.2  mjf       247:        int             soqueue, error;
                    248:
                    249:        KASSERT(solocked(head));
1.1       cgd       250:
1.89.6.4  mjf       251:        if ((head->so_options & SO_ACCEPTFILTER) != 0)
                    252:                connstatus = 0;
1.37      lukem     253:        soqueue = connstatus ? 1 : 0;
1.1       cgd       254:        if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2)
1.89.6.5! mjf       255:                return NULL;
1.89.6.2  mjf       256:        so = soget(false);
1.66      perry     257:        if (so == NULL)
1.89.6.5! mjf       258:                return NULL;
1.89.6.2  mjf       259:        mutex_obj_hold(head->so_lock);
                    260:        so->so_lock = head->so_lock;
1.1       cgd       261:        so->so_type = head->so_type;
                    262:        so->so_options = head->so_options &~ SO_ACCEPTCONN;
                    263:        so->so_linger = head->so_linger;
                    264:        so->so_state = head->so_state | SS_NOFDREF;
1.89      ad        265:        so->so_nbio = head->so_nbio;
1.1       cgd       266:        so->so_proto = head->so_proto;
                    267:        so->so_timeo = head->so_timeo;
                    268:        so->so_pgid = head->so_pgid;
1.24      matt      269:        so->so_send = head->so_send;
                    270:        so->so_receive = head->so_receive;
1.67      christos  271:        so->so_uidinfo = head->so_uidinfo;
1.89.6.3  mjf       272:        so->so_egid = head->so_egid;
                    273:        so->so_cpid = head->so_cpid;
1.49      matt      274: #ifdef MBUFTRACE
                    275:        so->so_mowner = head->so_mowner;
                    276:        so->so_rcv.sb_mowner = head->so_rcv.sb_mowner;
                    277:        so->so_snd.sb_mowner = head->so_snd.sb_mowner;
                    278: #endif
1.1       cgd       279:        (void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat);
1.83      tls       280:        so->so_snd.sb_lowat = head->so_snd.sb_lowat;
                    281:        so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
1.84      tls       282:        so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
                    283:        so->so_snd.sb_timeo = head->so_snd.sb_timeo;
1.85      rmind     284:        so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
                    285:        so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
1.1       cgd       286:        soqinsque(head, so, soqueue);
1.89.6.2  mjf       287:        error = (*so->so_proto->pr_usrreq)(so, PRU_ATTACH, NULL, NULL,
                    288:            NULL, NULL);
                    289:        KASSERT(solocked(so));
                    290:        if (error != 0) {
1.1       cgd       291:                (void) soqremque(so, soqueue);
1.89.6.5! mjf       292:                /*
        !           293:                 * Remove accept filter if one is present.
        !           294:                 * XXX Is this really needed?
        !           295:                 */
1.89.6.4  mjf       296:                if (so->so_accf != NULL)
1.89.6.5! mjf       297:                        (void)accept_filt_clear(so);
1.89.6.2  mjf       298:                soput(so);
1.89.6.5! mjf       299:                return NULL;
1.1       cgd       300:        }
                    301:        if (connstatus) {
                    302:                sorwakeup(head);
1.89.6.2  mjf       303:                cv_broadcast(&head->so_cv);
1.1       cgd       304:                so->so_state |= connstatus;
                    305:        }
1.89.6.5! mjf       306:        return so;
1.1       cgd       307: }
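/*
 * Editorial sketch, not part of this revision: a typical caller of
 * sonewconn() in a protocol's input path, spawning a socket for an
 * incoming connection request on a listening socket `head'.
 */
#if 0
	struct socket *so;

	so = sonewconn(head, SS_ISCONNECTED);
	if (so == NULL) {
		/* Listen queue full or out of memory: drop the request. */
		return;
	}
	/* so is now queued on head->so_q, awaiting accept(). */
#endif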
                    308:
1.89.6.2  mjf       309: struct socket *
                    310: soget(bool waitok)
                    311: {
                    312:        struct socket *so;
                    313:
                    314:        so = pool_cache_get(socket_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
                    315:        if (__predict_false(so == NULL))
                    316:                return (NULL);
                    317:        memset(so, 0, sizeof(*so));
                    318:        TAILQ_INIT(&so->so_q0);
                    319:        TAILQ_INIT(&so->so_q);
                    320:        cv_init(&so->so_cv, "socket");
                    321:        cv_init(&so->so_rcv.sb_cv, "netio");
                    322:        cv_init(&so->so_snd.sb_cv, "netio");
                    323:        selinit(&so->so_rcv.sb_sel);
                    324:        selinit(&so->so_snd.sb_sel);
                    325:        so->so_rcv.sb_so = so;
                    326:        so->so_snd.sb_so = so;
                    327:        return so;
                    328: }
                    329:
                    330: void
                    331: soput(struct socket *so)
                    332: {
                    333:
                    334:        KASSERT(!cv_has_waiters(&so->so_cv));
                    335:        KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
                    336:        KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
                    337:        seldestroy(&so->so_rcv.sb_sel);
                    338:        seldestroy(&so->so_snd.sb_sel);
                    339:        mutex_obj_free(so->so_lock);
                    340:        cv_destroy(&so->so_cv);
                    341:        cv_destroy(&so->so_rcv.sb_cv);
                    342:        cv_destroy(&so->so_snd.sb_cv);
                    343:        pool_cache_put(socket_cache, so);
                    344: }
                    345:
1.7       mycroft   346: void
1.37      lukem     347: soqinsque(struct socket *head, struct socket *so, int q)
1.1       cgd       348: {
                    349:
1.89.6.2  mjf       350:        KASSERT(solocked2(head, so));
                    351:
1.22      thorpej   352: #ifdef DIAGNOSTIC
                    353:        if (so->so_onq != NULL)
                    354:                panic("soqinsque");
                    355: #endif
                    356:
1.1       cgd       357:        so->so_head = head;
                    358:        if (q == 0) {
                    359:                head->so_q0len++;
1.22      thorpej   360:                so->so_onq = &head->so_q0;
1.1       cgd       361:        } else {
                    362:                head->so_qlen++;
1.22      thorpej   363:                so->so_onq = &head->so_q;
1.1       cgd       364:        }
1.22      thorpej   365:        TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
1.1       cgd       366: }
                    367:
1.7       mycroft   368: int
1.37      lukem     369: soqremque(struct socket *so, int q)
1.1       cgd       370: {
1.37      lukem     371:        struct socket   *head;
1.1       cgd       372:
1.37      lukem     373:        head = so->so_head;
1.89.6.2  mjf       374:
                    375:        KASSERT(solocked(so));
1.22      thorpej   376:        if (q == 0) {
                    377:                if (so->so_onq != &head->so_q0)
1.17      thorpej   378:                        return (0);
1.1       cgd       379:                head->so_q0len--;
                    380:        } else {
1.22      thorpej   381:                if (so->so_onq != &head->so_q)
                    382:                        return (0);
1.1       cgd       383:                head->so_qlen--;
                    384:        }
1.89.6.2  mjf       385:        KASSERT(solocked2(so, head));
1.22      thorpej   386:        TAILQ_REMOVE(so->so_onq, so, so_qe);
                    387:        so->so_onq = NULL;
                    388:        so->so_head = NULL;
1.1       cgd       389:        return (1);
                    390: }
                    391:
                    392: /*
                    393:  * Socantsendmore indicates that no more data will be sent on the
                    394:  * socket; it would normally be applied to a socket when the user
                    395:  * informs the system that no more data is to be sent, by the protocol
                    396:  * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
                    397:  * will be received, and will normally be applied to the socket by a
                    398:  * protocol when it detects that the peer will send no more data.
                    399:  * Data queued for reading in the socket may yet be read.
                    400:  */
                    401:
1.4       andrew    402: void
1.37      lukem     403: socantsendmore(struct socket *so)
1.1       cgd       404: {
                    405:
1.89.6.2  mjf       406:        KASSERT(solocked(so));
                    407:
1.1       cgd       408:        so->so_state |= SS_CANTSENDMORE;
                    409:        sowwakeup(so);
                    410: }
                    411:
1.4       andrew    412: void
1.37      lukem     413: socantrcvmore(struct socket *so)
1.1       cgd       414: {
                    415:
1.89.6.2  mjf       416:        KASSERT(solocked(so));
                    417:
1.1       cgd       418:        so->so_state |= SS_CANTRCVMORE;
                    419:        sorwakeup(so);
                    420: }
                    421:
                    422: /*
                    423:  * Wait for data to arrive at/drain from a socket buffer.
                    424:  */
1.7       mycroft   425: int
1.37      lukem     426: sbwait(struct sockbuf *sb)
1.1       cgd       427: {
1.89.6.2  mjf       428:        struct socket *so;
                    429:        kmutex_t *lock;
                    430:        int error;
1.1       cgd       431:
1.89.6.2  mjf       432:        so = sb->sb_so;
1.1       cgd       433:
1.89.6.2  mjf       434:        KASSERT(solocked(so));
1.1       cgd       435:
1.89.6.2  mjf       436:        sb->sb_flags |= SB_NOTIFY;
                    437:        lock = so->so_lock;
                    438:        if ((sb->sb_flags & SB_NOINTR) != 0)
                    439:                error = cv_timedwait(&sb->sb_cv, lock, sb->sb_timeo);
                    440:        else
                    441:                error = cv_timedwait_sig(&sb->sb_cv, lock, sb->sb_timeo);
                    442:        if (__predict_false(lock != so->so_lock))
                    443:                solockretry(so, lock);
                    444:        return error;
1.1       cgd       445: }
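/*
 * Editorial sketch, not part of this revision: a receive path blocking
 * in sbwait() until data arrives, in the style of soreceive().  The
 * socket must be held locked around both the check and the sleep.
 */
#if 0
	while (so->so_rcv.sb_cc == 0 &&
	    (so->so_state & SS_CANTRCVMORE) == 0) {
		error = sbwait(&so->so_rcv);
		if (error != 0)
			break;		/* interrupted or timed out */
	}
#endif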
                    446:
                    447: /*
                    448:  * Wakeup processes waiting on a socket buffer.
                    449:  * Do asynchronous notification via SIGIO
1.39      manu      450:  * if the socket buffer has the SB_ASYNC flag set.
1.1       cgd       451:  */
1.7       mycroft   452: void
1.55      christos  453: sowakeup(struct socket *so, struct sockbuf *sb, int code)
1.1       cgd       454: {
1.89.6.1  mjf       455:        int band;
                    456:
1.89.6.2  mjf       457:        KASSERT(solocked(so));
                    458:        KASSERT(sb->sb_so == so);
                    459:
1.89.6.1  mjf       460:        if (code == POLL_IN)
                    461:                band = POLLIN|POLLRDNORM;
                    462:        else
                    463:                band = POLLOUT|POLLWRNORM;
1.89.6.2  mjf       464:        sb->sb_flags &= ~SB_NOTIFY;
                    465:        selnotify(&sb->sb_sel, band, NOTE_SUBMIT);
                    466:        cv_broadcast(&sb->sb_cv);
1.89.6.1  mjf       467:        if (sb->sb_flags & SB_ASYNC)
1.57      christos  468:                fownsignal(so->so_pgid, SIGIO, code, band, so);
1.24      matt      469:        if (sb->sb_flags & SB_UPCALL)
                    470:                (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
1.1       cgd       471: }
                    472:
                    473: /*
1.89.6.3  mjf       474:  * Reset a socket's lock pointer.  Wake all threads waiting on the
                    475:  * socket's condition variables so that they can restart their waits
                    476:  * using the new lock.  The existing lock must be held.
                    477:  */
                    478: void
                    479: solockreset(struct socket *so, kmutex_t *lock)
                    480: {
                    481:
                    482:        KASSERT(solocked(so));
                    483:
                    484:        so->so_lock = lock;
                    485:        cv_broadcast(&so->so_snd.sb_cv);
                    486:        cv_broadcast(&so->so_rcv.sb_cv);
                    487:        cv_broadcast(&so->so_cv);
                    488: }
                    489:
                    490: /*
1.1       cgd       491:  * Socket buffer (struct sockbuf) utility routines.
                    492:  *
                    493:  * Each socket contains two socket buffers: one for sending data and
                    494:  * one for receiving data.  Each buffer contains a queue of mbufs,
                    495:  * information about the number of mbufs and amount of data in the
1.13      mycroft   496:  * queue, and other fields allowing poll() statements and notification
1.1       cgd       497:  * on data availability to be implemented.
                    498:  *
                    499:  * Data stored in a socket buffer is maintained as a list of records.
                    500:  * Each record is a list of mbufs chained together with the m_next
                    501:  * field.  Records are chained together with the m_nextpkt field. The upper
                    502:  * level routine soreceive() expects the following conventions to be
                    503:  * observed when placing information in the receive buffer:
                    504:  *
                    505:  * 1. If the protocol requires each message be preceded by the sender's
                    506:  *    name, then a record containing that name must be present before
                    507:  *    any associated data (mbuf's must be of type MT_SONAME).
                    508:  * 2. If the protocol supports the exchange of ``access rights'' (really
                    509:  *    just additional data associated with the message), and there are
                    510:  *    ``rights'' to be received, then a record containing this data
1.10      mycroft   511:  *    should be present (mbuf's must be of type MT_CONTROL).
1.1       cgd       512:  * 3. If a name or rights record exists, then it must be followed by
                    513:  *    a data record, perhaps of zero length.
                    514:  *
                    515:  * Before using a new socket structure it is first necessary to reserve
                    516:  * buffer space to the socket, by calling sbreserve().  This should commit
                    517:  * some of the available buffer space in the system buffer pool for the
                    518:  * socket (currently, it does nothing but enforce limits).  The space
                    519:  * should be released by calling sbrelease() when the socket is destroyed.
                    520:  */
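/*
 * Editorial sketch, not part of this revision: the reserve/release
 * lifecycle described in the last paragraph above.  The buffer sizes
 * are arbitrary example values.
 */
#if 0
	if (soreserve(so, 32 * 1024, 32 * 1024) != 0)
		return ENOBUFS;		/* limits exceeded */
	/* the socket is used */
	sbrelease(&so->so_snd, so);
	sbrelease(&so->so_rcv, so);
#endif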
                    521:
1.7       mycroft   522: int
1.58      thorpej   523: sb_max_set(u_long new_sbmax)
                    524: {
                    525:        int s;
                    526:
                    527:        if (new_sbmax < (16 * 1024))
                    528:                return (EINVAL);
                    529:
                    530:        s = splsoftnet();
                    531:        sb_max = new_sbmax;
                    532:        sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES);
                    533:        splx(s);
                    534:
                    535:        return (0);
                    536: }
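/*
 * Editorial note: sb_max_adj scales sb_max by MCLBYTES/(MSIZE + MCLBYTES)
 * so that the mbuf-storage limit does not become the effective limit
 * when data is stored in clusters.  For example, on a configuration
 * with MSIZE 256 and MCLBYTES 2048, the default SB_MAX of 256 kbytes
 * gives sb_max_adj = 262144 * 2048 / 2304, i.e. roughly 233 kbytes of
 * actual data.
 */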
                    537:
                    538: int
1.37      lukem     539: soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
1.1       cgd       540: {
1.89.6.2  mjf       541:
                    542:        KASSERT(so->so_lock == NULL || solocked(so));
                    543:
1.74      christos  544:        /*
                     545:         * there's at least one application (the configure script of screen)
                     546:         * which expects a fifo to be writable even if it has "some" bytes
                    547:         * in its buffer.
                    548:         * so we want to make sure (hiwat - lowat) >= (some bytes).
                    549:         *
                    550:         * PIPE_BUF here is an arbitrary value chosen as (some bytes) above.
                    551:         * we expect it's large enough for such applications.
                    552:         */
                    553:        u_long  lowat = MAX(sock_loan_thresh, MCLBYTES);
                    554:        u_long  hiwat = lowat + PIPE_BUF;
1.1       cgd       555:
1.74      christos  556:        if (sndcc < hiwat)
                    557:                sndcc = hiwat;
1.59      christos  558:        if (sbreserve(&so->so_snd, sndcc, so) == 0)
1.1       cgd       559:                goto bad;
1.59      christos  560:        if (sbreserve(&so->so_rcv, rcvcc, so) == 0)
1.1       cgd       561:                goto bad2;
                    562:        if (so->so_rcv.sb_lowat == 0)
                    563:                so->so_rcv.sb_lowat = 1;
                    564:        if (so->so_snd.sb_lowat == 0)
1.74      christos  565:                so->so_snd.sb_lowat = lowat;
1.1       cgd       566:        if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
                    567:                so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
                    568:        return (0);
1.37      lukem     569:  bad2:
1.59      christos  570:        sbrelease(&so->so_snd, so);
1.37      lukem     571:  bad:
1.1       cgd       572:        return (ENOBUFS);
                    573: }
                    574:
                    575: /*
                    576:  * Allot mbufs to a sockbuf.
                    577:  * Attempt to scale mbmax so that mbcnt doesn't become limiting
                    578:  * if buffering efficiency is near the normal case.
                    579:  */
1.7       mycroft   580: int
1.59      christos  581: sbreserve(struct sockbuf *sb, u_long cc, struct socket *so)
1.1       cgd       582: {
1.75      ad        583:        struct lwp *l = curlwp; /* XXX */
1.62      christos  584:        rlim_t maxcc;
1.67      christos  585:        struct uidinfo *uidinfo;
1.1       cgd       586:
1.89.6.2  mjf       587:        KASSERT(so->so_lock == NULL || solocked(so));
                    588:        KASSERT(sb->sb_so == so);
                    589:        KASSERT(sb_max_adj != 0);
                    590:
1.58      thorpej   591:        if (cc == 0 || cc > sb_max_adj)
1.1       cgd       592:                return (0);
1.89.6.2  mjf       593:
                    594:        if (kauth_cred_geteuid(l->l_cred) == so->so_uidinfo->ui_uid)
                    595:                maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur;
                    596:        else
1.62      christos  597:                maxcc = RLIM_INFINITY;
1.89.6.2  mjf       598:
                    599:        uidinfo = so->so_uidinfo;
1.67      christos  600:        if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc))
1.62      christos  601:                return 0;
1.1       cgd       602:        sb->sb_mbmax = min(cc * 2, sb_max);
                    603:        if (sb->sb_lowat > sb->sb_hiwat)
                    604:                sb->sb_lowat = sb->sb_hiwat;
                    605:        return (1);
                    606: }
                    607:
                    608: /*
1.89.6.2  mjf       609:  * Free mbufs held by a socket, and reserved mbuf space.  We do not assert
                    610:  * that the socket is held locked here: see sorflush().
1.1       cgd       611:  */
1.7       mycroft   612: void
1.59      christos  613: sbrelease(struct sockbuf *sb, struct socket *so)
1.1       cgd       614: {
                    615:
1.89.6.2  mjf       616:        KASSERT(sb->sb_so == so);
                    617:
1.1       cgd       618:        sbflush(sb);
1.87      yamt      619:        (void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY);
1.59      christos  620:        sb->sb_mbmax = 0;
1.1       cgd       621: }
                    622:
                    623: /*
                    624:  * Routines to add and remove
                    625:  * data from an mbuf queue.
                    626:  *
                    627:  * The routines sbappend() or sbappendrecord() are normally called to
                    628:  * append new mbufs to a socket buffer, after checking that adequate
                    629:  * space is available, comparing the function sbspace() with the amount
                    630:  * of data to be added.  sbappendrecord() differs from sbappend() in
                    631:  * that data supplied is treated as the beginning of a new record.
                    632:  * To place a sender's address, optional access rights, and data in a
                    633:  * socket receive buffer, sbappendaddr() should be used.  To place
                    634:  * access rights and data in a socket receive buffer, sbappendrights()
                    635:  * should be used.  In either case, the new data begins a new record.
                    636:  * Note that unlike sbappend() and sbappendrecord(), these routines check
                    637:  * for the caller that there will be enough space to store the data.
                    638:  * Each fails if there is not enough space, or if it cannot find mbufs
                    639:  * to store additional information in.
                    640:  *
                    641:  * Reliable protocols may use the socket send buffer to hold data
                    642:  * awaiting acknowledgement.  Data is normally copied from a socket
                    643:  * send buffer in a protocol with m_copy for output to a peer,
                    644:  * and then removing the data from the socket buffer with sbdrop()
                    645:  * or sbdroprecord() when the data is acknowledged by the peer.
                    646:  */
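/*
 * Editorial sketch, not part of this revision: typical protocol use of
 * the append routines described above.  A stream protocol appends to
 * the single record of the receive buffer; a datagram protocol starts
 * a new record carrying the sender's address.  Variable names (`so',
 * `m', `src', `control') are schematic.
 */
#if 0
	/* Stream-style (e.g. TCP) input: */
	sbappendstream(&so->so_rcv, m);
	sorwakeup(so);

	/* Datagram-style (e.g. UDP) input: */
	if (sbappendaddr(&so->so_rcv, src, m, control) == 0)
		m_freem(m);		/* no space: drop the datagram */
	else
		sorwakeup(so);
#endif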
                    647:
1.43      thorpej   648: #ifdef SOCKBUF_DEBUG
                    649: void
                    650: sblastrecordchk(struct sockbuf *sb, const char *where)
                    651: {
                    652:        struct mbuf *m = sb->sb_mb;
                    653:
1.89.6.2  mjf       654:        KASSERT(solocked(sb->sb_so));
                    655:
1.43      thorpej   656:        while (m && m->m_nextpkt)
                    657:                m = m->m_nextpkt;
                    658:
                    659:        if (m != sb->sb_lastrecord) {
                    660:                printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
                    661:                    sb->sb_mb, sb->sb_lastrecord, m);
                    662:                printf("packet chain:\n");
                    663:                for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
                    664:                        printf("\t%p\n", m);
1.47      provos    665:                panic("sblastrecordchk from %s", where);
1.43      thorpej   666:        }
                    667: }
                    668:
                    669: void
                    670: sblastmbufchk(struct sockbuf *sb, const char *where)
                    671: {
                    672:        struct mbuf *m = sb->sb_mb;
                    673:        struct mbuf *n;
                    674:
1.89.6.2  mjf       675:        KASSERT(solocked(sb->sb_so));
                    676:
1.43      thorpej   677:        while (m && m->m_nextpkt)
                    678:                m = m->m_nextpkt;
                    679:
                    680:        while (m && m->m_next)
                    681:                m = m->m_next;
                    682:
                    683:        if (m != sb->sb_mbtail) {
                    684:                printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
                    685:                    sb->sb_mb, sb->sb_mbtail, m);
                    686:                printf("packet tree:\n");
                    687:                for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
                    688:                        printf("\t");
                    689:                        for (n = m; n != NULL; n = n->m_next)
                    690:                                printf("%p ", n);
                    691:                        printf("\n");
                    692:                }
                    693:                panic("sblastmbufchk from %s", where);
                    694:        }
                    695: }
                    696: #endif /* SOCKBUF_DEBUG */
                    697:
1.63      jonathan  698: /*
                    699:  * Link a chain of records onto a socket buffer
                    700:  */
                    701: #define        SBLINKRECORDCHAIN(sb, m0, mlast)                                \
1.43      thorpej   702: do {                                                                   \
                    703:        if ((sb)->sb_lastrecord != NULL)                                \
                    704:                (sb)->sb_lastrecord->m_nextpkt = (m0);                  \
                    705:        else                                                            \
                    706:                (sb)->sb_mb = (m0);                                     \
1.63      jonathan  707:        (sb)->sb_lastrecord = (mlast);                                  \
1.43      thorpej   708: } while (/*CONSTCOND*/0)
                    709:
1.63      jonathan  710:
                    711: #define        SBLINKRECORD(sb, m0)                                            \
                    712:     SBLINKRECORDCHAIN(sb, m0, m0)
                    713:
1.1       cgd       714: /*
                    715:  * Append mbuf chain m to the last record in the
                    716:  * socket buffer sb.  The additional space associated
                    717:  * the mbuf chain is recorded in sb.  Empty mbufs are
                    718:  * discarded and mbufs are compacted where possible.
                    719:  */
1.7       mycroft   720: void
1.37      lukem     721: sbappend(struct sockbuf *sb, struct mbuf *m)
1.1       cgd       722: {
1.37      lukem     723:        struct mbuf     *n;
1.1       cgd       724:
1.89.6.2  mjf       725:        KASSERT(solocked(sb->sb_so));
                    726:
1.1       cgd       727:        if (m == 0)
                    728:                return;
1.43      thorpej   729:
1.49      matt      730: #ifdef MBUFTRACE
1.65      jonathan  731:        m_claimm(m, sb->sb_mowner);
1.49      matt      732: #endif
                    733:
1.43      thorpej   734:        SBLASTRECORDCHK(sb, "sbappend 1");
                    735:
                    736:        if ((n = sb->sb_lastrecord) != NULL) {
                    737:                /*
                    738:                 * XXX Would like to simply use sb_mbtail here, but
                    739:                 * XXX I need to verify that I won't miss an EOR that
                    740:                 * XXX way.
                    741:                 */
1.1       cgd       742:                do {
                    743:                        if (n->m_flags & M_EOR) {
                    744:                                sbappendrecord(sb, m); /* XXXXXX!!!! */
                    745:                                return;
                    746:                        }
                    747:                } while (n->m_next && (n = n->m_next));
1.43      thorpej   748:        } else {
                    749:                /*
                    750:                 * If this is the first record in the socket buffer, it's
                    751:                 * also the last record.
                    752:                 */
                    753:                sb->sb_lastrecord = m;
1.1       cgd       754:        }
                    755:        sbcompress(sb, m, n);
1.43      thorpej   756:        SBLASTRECORDCHK(sb, "sbappend 2");
                    757: }
                    758:
                    759: /*
                    760:  * This version of sbappend() should only be used when the caller
                    761:  * absolutely knows that there will never be more than one record
                    762:  * in the socket buffer, that is, a stream protocol (such as TCP).
                    763:  */
                    764: void
1.44      thorpej   765: sbappendstream(struct sockbuf *sb, struct mbuf *m)
1.43      thorpej   766: {
                    767:
1.89.6.2  mjf       768:        KASSERT(solocked(sb->sb_so));
1.43      thorpej   769:        KDASSERT(m->m_nextpkt == NULL);
                    770:        KASSERT(sb->sb_mb == sb->sb_lastrecord);
                    771:
                    772:        SBLASTMBUFCHK(sb, __func__);
                    773:
1.49      matt      774: #ifdef MBUFTRACE
1.65      jonathan  775:        m_claimm(m, sb->sb_mowner);
1.49      matt      776: #endif
                    777:
1.43      thorpej   778:        sbcompress(sb, m, sb->sb_mbtail);
                    779:
                    780:        sb->sb_lastrecord = sb->sb_mb;
                    781:        SBLASTRECORDCHK(sb, __func__);
1.1       cgd       782: }
                    783:
                    784: #ifdef SOCKBUF_DEBUG
1.7       mycroft   785: void
1.37      lukem     786: sbcheck(struct sockbuf *sb)
1.1       cgd       787: {
1.89.6.2  mjf       788:        struct mbuf     *m, *m2;
1.43      thorpej   789:        u_long          len, mbcnt;
1.1       cgd       790:
1.89.6.2  mjf       791:        KASSERT(solocked(sb->sb_so));
                    792:
1.37      lukem     793:        len = 0;
                    794:        mbcnt = 0;
1.89.6.2  mjf       795:        for (m = sb->sb_mb; m; m = m->m_nextpkt) {
                    796:                for (m2 = m; m2 != NULL; m2 = m2->m_next) {
                    797:                        len += m2->m_len;
                    798:                        mbcnt += MSIZE;
                    799:                        if (m2->m_flags & M_EXT)
                    800:                                mbcnt += m2->m_ext.ext_size;
                    801:                        if (m2->m_nextpkt != NULL)
                    802:                                panic("sbcheck nextpkt");
                    803:                }
1.1       cgd       804:        }
                    805:        if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
1.43      thorpej   806:                printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
1.1       cgd       807:                    mbcnt, sb->sb_mbcnt);
                    808:                panic("sbcheck");
                    809:        }
                    810: }
                    811: #endif
                    812:
                    813: /*
                    814:  * As above, except the mbuf chain
                    815:  * begins a new record.
                    816:  */
1.7       mycroft   817: void
1.37      lukem     818: sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
1.1       cgd       819: {
1.37      lukem     820:        struct mbuf     *m;
1.1       cgd       821:
1.89.6.2  mjf       822:        KASSERT(solocked(sb->sb_so));
                    823:
1.1       cgd       824:        if (m0 == 0)
                    825:                return;
1.43      thorpej   826:
1.49      matt      827: #ifdef MBUFTRACE
1.65      jonathan  828:        m_claimm(m0, sb->sb_mowner);
1.49      matt      829: #endif
1.1       cgd       830:        /*
                    831:         * Put the first mbuf on the queue.
                    832:         * Note this permits zero length records.
                    833:         */
                    834:        sballoc(sb, m0);
1.43      thorpej   835:        SBLASTRECORDCHK(sb, "sbappendrecord 1");
                    836:        SBLINKRECORD(sb, m0);
1.1       cgd       837:        m = m0->m_next;
                    838:        m0->m_next = 0;
                    839:        if (m && (m0->m_flags & M_EOR)) {
                    840:                m0->m_flags &= ~M_EOR;
                    841:                m->m_flags |= M_EOR;
                    842:        }
                    843:        sbcompress(sb, m, m0);
1.43      thorpej   844:        SBLASTRECORDCHK(sb, "sbappendrecord 2");
1.1       cgd       845: }
                    846:
                    847: /*
                    848:  * As above except that OOB data
                    849:  * is inserted at the beginning of the sockbuf,
                    850:  * but after any other OOB data.
                    851:  */
1.7       mycroft   852: void
1.37      lukem     853: sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
1.1       cgd       854: {
1.37      lukem     855:        struct mbuf     *m, **mp;
1.1       cgd       856:
1.89.6.2  mjf       857:        KASSERT(solocked(sb->sb_so));
                    858:
1.1       cgd       859:        if (m0 == 0)
                    860:                return;
1.43      thorpej   861:
                    862:        SBLASTRECORDCHK(sb, "sbinsertoob 1");
                    863:
1.11      christos  864:        for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
1.1       cgd       865:            again:
                    866:                switch (m->m_type) {
                    867:
                    868:                case MT_OOBDATA:
                    869:                        continue;               /* WANT next train */
                    870:
                    871:                case MT_CONTROL:
1.11      christos  872:                        if ((m = m->m_next) != NULL)
1.1       cgd       873:                                goto again;     /* inspect THIS train further */
                    874:                }
                    875:                break;
                    876:        }
                    877:        /*
                    878:         * Put the first mbuf on the queue.
                    879:         * Note this permits zero length records.
                    880:         */
                    881:        sballoc(sb, m0);
                    882:        m0->m_nextpkt = *mp;
1.43      thorpej   883:        if (*mp == NULL) {
                    884:                /* m0 is actually the new tail */
                    885:                sb->sb_lastrecord = m0;
                    886:        }
1.1       cgd       887:        *mp = m0;
                    888:        m = m0->m_next;
                    889:        m0->m_next = 0;
                    890:        if (m && (m0->m_flags & M_EOR)) {
                    891:                m0->m_flags &= ~M_EOR;
                    892:                m->m_flags |= M_EOR;
                    893:        }
                    894:        sbcompress(sb, m, m0);
1.43      thorpej   895:        SBLASTRECORDCHK(sb, "sbinsertoob 2");
1.1       cgd       896: }
                    897:
                    898: /*
                    899:  * Append address and data, and optionally, control (ancillary) data
                    900:  * to the receive queue of a socket.  If present,
                    901:  * m0 must include a packet header with total length.
                    902:  * Returns 0 if no space in sockbuf or insufficient mbufs.
                    903:  */
1.7       mycroft   904: int
1.61      matt      905: sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
1.37      lukem     906:        struct mbuf *control)
1.1       cgd       907: {
1.43      thorpej   908:        struct mbuf     *m, *n, *nlast;
1.50      fvdl      909:        int             space, len;
1.1       cgd       910:
1.89.6.2  mjf       911:        KASSERT(solocked(sb->sb_so));
                    912:
1.37      lukem     913:        space = asa->sa_len;
                    914:
1.49      matt      915:        if (m0 != NULL) {
                    916:                if ((m0->m_flags & M_PKTHDR) == 0)
                    917:                        panic("sbappendaddr");
1.1       cgd       918:                space += m0->m_pkthdr.len;
1.49      matt      919: #ifdef MBUFTRACE
1.65      jonathan  920:                m_claimm(m0, sb->sb_mowner);
1.49      matt      921: #endif
                    922:        }
1.1       cgd       923:        for (n = control; n; n = n->m_next) {
                    924:                space += n->m_len;
1.49      matt      925:                MCLAIM(n, sb->sb_mowner);
1.1       cgd       926:                if (n->m_next == 0)     /* keep pointer to last control buf */
                    927:                        break;
                    928:        }
                    929:        if (space > sbspace(sb))
                    930:                return (0);
                    931:        MGET(m, M_DONTWAIT, MT_SONAME);
                    932:        if (m == 0)
                    933:                return (0);
1.49      matt      934:        MCLAIM(m, sb->sb_mowner);
1.50      fvdl      935:        /*
                    936:         * XXX avoid 'comparison always true' warning which isn't easily
                    937:         * avoided.
                    938:         */
                    939:        len = asa->sa_len;
                    940:        if (len > MLEN) {
1.20      thorpej   941:                MEXTMALLOC(m, asa->sa_len, M_NOWAIT);
                    942:                if ((m->m_flags & M_EXT) == 0) {
                    943:                        m_free(m);
                    944:                        return (0);
                    945:                }
                    946:        }
1.1       cgd       947:        m->m_len = asa->sa_len;
1.82      christos  948:        memcpy(mtod(m, void *), asa, asa->sa_len);
1.1       cgd       949:        if (n)
                    950:                n->m_next = m0;         /* concatenate data to control */
                    951:        else
                    952:                control = m0;
                    953:        m->m_next = control;
1.43      thorpej   954:
                    955:        SBLASTRECORDCHK(sb, "sbappendaddr 1");
                    956:
                    957:        for (n = m; n->m_next != NULL; n = n->m_next)
1.1       cgd       958:                sballoc(sb, n);
1.43      thorpej   959:        sballoc(sb, n);
                    960:        nlast = n;
                    961:        SBLINKRECORD(sb, m);
                    962:
                    963:        sb->sb_mbtail = nlast;
                    964:        SBLASTMBUFCHK(sb, "sbappendaddr");
                    965:        SBLASTRECORDCHK(sb, "sbappendaddr 2");
                    966:
1.1       cgd       967:        return (1);
                    968: }
                    969:
1.63      jonathan  970: /*
                    971:  * Helper for sbappendchainaddr: prepend a struct sockaddr* to
                    972:  * an mbuf chain.
                    973:  */
1.70      perry     974: static inline struct mbuf *
1.81      yamt      975: m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0,
1.64      jonathan  976:                   const struct sockaddr *asa)
1.63      jonathan  977: {
                    978:        struct mbuf *m;
1.64      jonathan  979:        const int salen = asa->sa_len;
1.63      jonathan  980:
1.89.6.2  mjf       981:        KASSERT(solocked(sb->sb_so));
                    982:
1.63      jonathan  983:        /* only the first in each chain need be a pkthdr */
                    984:        MGETHDR(m, M_DONTWAIT, MT_SONAME);
                    985:        if (m == 0)
                    986:                return (0);
                    987:        MCLAIM(m, sb->sb_mowner);
1.64      jonathan  988: #ifdef notyet
                    989:        if (salen > MHLEN) {
                    990:                MEXTMALLOC(m, salen, M_NOWAIT);
                    991:                if ((m->m_flags & M_EXT) == 0) {
                    992:                        m_free(m);
                    993:                        return (0);
                    994:                }
                    995:        }
                    996: #else
                    997:        KASSERT(salen <= MHLEN);
                    998: #endif
                    999:        m->m_len = salen;
1.82      christos 1000:        memcpy(mtod(m, void *), asa, salen);
1.63      jonathan 1001:        m->m_next = m0;
1.64      jonathan 1002:        m->m_pkthdr.len = salen + m0->m_pkthdr.len;
1.63      jonathan 1003:
                   1004:        return m;
                   1005: }
                   1006:
                   1007: int
                   1008: sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa,
                   1009:                  struct mbuf *m0, int sbprio)
                   1010: {
                   1011:        int space;
                   1012:        struct mbuf *m, *n, *n0, *nlast;
                   1013:        int error;
                   1014:
1.89.6.2  mjf      1015:        KASSERT(solocked(sb->sb_so));
                   1016:
1.63      jonathan 1017:        /*
                    1018:         * XXX sbprio reserved for encoding the priority of this request:
                    1019:         *  SB_PRIO_NONE --> honour normal sb limits
                    1020:         *  SB_PRIO_ONESHOT_OVERFLOW --> if the socket has any space,
                    1021:         *      take the whole chain.  Intended for large requests
                    1022:         *      that should be delivered atomically (all, or none).
                    1023:         *  SB_PRIO_OVERDRAFT --> allow a small (2*MLEN) overflow
                    1024:         *      over normal socket limits, for messages indicating
                    1025:         *      buffer overflow in earlier normal/lower-priority messages
                    1026:         *  SB_PRIO_BESTEFFORT --> ignore limits entirely.
                    1027:         *      Intended for kernel-generated messages only.
                    1028:         *      Up to the generator to avoid total mbuf resource exhaustion.
                   1029:         */
                   1030:        (void)sbprio;
                   1031:
                   1032:        if (m0 && (m0->m_flags & M_PKTHDR) == 0)
                   1033:                panic("sbappendaddrchain");
                   1034:
                   1035:        space = sbspace(sb);
1.66      perry    1036:
1.63      jonathan 1037: #ifdef notyet
1.66      perry    1038:        /*
1.63      jonathan 1039:         * Enforce SB_PRIO_* limits as described above.
                   1040:         */
                   1041: #endif
                   1042:
                   1043:        n0 = NULL;
                   1044:        nlast = NULL;
                   1045:        for (m = m0; m; m = m->m_nextpkt) {
                   1046:                struct mbuf *np;
                   1047:
1.64      jonathan 1048: #ifdef MBUFTRACE
1.65      jonathan 1049:                m_claimm(m, sb->sb_mowner);
1.64      jonathan 1050: #endif
                   1051:
1.63      jonathan 1052:                /* Prepend sockaddr to this record (m) of input chain m0 */
1.64      jonathan 1053:                n = m_prepend_sockaddr(sb, m, asa);
1.63      jonathan 1054:                if (n == NULL) {
                   1055:                        error = ENOBUFS;
                   1056:                        goto bad;
                   1057:                }
                   1058:
                   1059:                /* Append record (asa+m) to end of new chain n0 */
                   1060:                if (n0 == NULL) {
                   1061:                        n0 = n;
                   1062:                } else {
                   1063:                        nlast->m_nextpkt = n;
                   1064:                }
                   1065:                /* Keep track of last record on new chain */
                   1066:                nlast = n;
                   1067:
                   1068:                for (np = n; np; np = np->m_next)
                   1069:                        sballoc(sb, np);
                   1070:        }
                   1071:
1.64      jonathan 1072:        SBLASTRECORDCHK(sb, "sbappendaddrchain 1");
                   1073:
1.63      jonathan 1074:        /* Drop the entire chain of (asa+m) records onto the socket */
                   1075:        SBLINKRECORDCHAIN(sb, n0, nlast);
1.64      jonathan 1076:
                   1077:        SBLASTRECORDCHK(sb, "sbappendaddrchain 2");
                   1078:
1.63      jonathan 1079:        for (m = nlast; m->m_next; m = m->m_next)
                   1080:                ;
                   1081:        sb->sb_mbtail = m;
1.64      jonathan 1082:        SBLASTMBUFCHK(sb, "sbappendaddrchain");
                   1083:
1.63      jonathan 1084:        return (1);
                   1085:
                   1086: bad:
1.64      jonathan 1087:        /*
                    1088:         * On error, free the prepended addresses. For consistency
                   1089:         * with sbappendaddr(), leave it to our caller to free
                   1090:         * the input record chain passed to us as m0.
                   1091:         */
                   1092:        while ((n = n0) != NULL) {
                   1093:                struct mbuf *np;
                   1094:
                   1095:                /* Undo the sballoc() of this record */
                   1096:                for (np = n; np; np = np->m_next)
                   1097:                        sbfree(sb, np);
                   1098:
                    1099:                n0 = n->m_nextpkt;      /* advance to the next prepended address */
                   1100:                MFREE(n, np);           /* free prepended address (not data) */
                   1101:        }
1.66      perry    1102:        return 0;
1.63      jonathan 1103: }
                   1104:
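/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * queueing a batch of records, each prefixed with the same source address,
 * via sbappendaddrchain().  Names are hypothetical; as noted in the error
 * path above, the caller keeps ownership of the input chain on failure.
 */
#if 0
static int
example_queue_batch(struct socket *so, const struct sockaddr *src,
    struct mbuf *pkts)		/* records linked through m_nextpkt */
{
	struct mbuf *m, *next;

	if (sbappendaddrchain(&so->so_rcv, src, pkts, SB_PRIO_NONE) != 0) {
		sorwakeup(so);
		return 0;
	}
	/* Failure: free the records ourselves, one per m_nextpkt link. */
	for (m = pkts; m != NULL; m = next) {
		next = m->m_nextpkt;
		m_freem(m);
	}
	return ENOBUFS;
}
#endif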
                   1105:
1.7       mycroft  1106: int
1.37      lukem    1107: sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
1.1       cgd      1108: {
1.43      thorpej  1109:        struct mbuf     *m, *mlast, *n;
1.37      lukem    1110:        int             space;
1.1       cgd      1111:
1.89.6.2  mjf      1112:        KASSERT(solocked(sb->sb_so));
                   1113:
1.37      lukem    1114:        space = 0;
1.1       cgd      1115:        if (control == 0)
                   1116:                panic("sbappendcontrol");
                   1117:        for (m = control; ; m = m->m_next) {
                   1118:                space += m->m_len;
1.49      matt     1119:                MCLAIM(m, sb->sb_mowner);
1.1       cgd      1120:                if (m->m_next == 0)
                   1121:                        break;
                   1122:        }
                   1123:        n = m;                  /* save pointer to last control buffer */
1.49      matt     1124:        for (m = m0; m; m = m->m_next) {
                   1125:                MCLAIM(m, sb->sb_mowner);
1.1       cgd      1126:                space += m->m_len;
1.49      matt     1127:        }
1.1       cgd      1128:        if (space > sbspace(sb))
                   1129:                return (0);
                   1130:        n->m_next = m0;                 /* concatenate data to control */
1.43      thorpej  1131:
                   1132:        SBLASTRECORDCHK(sb, "sbappendcontrol 1");
                   1133:
                   1134:        for (m = control; m->m_next != NULL; m = m->m_next)
1.1       cgd      1135:                sballoc(sb, m);
1.43      thorpej  1136:        sballoc(sb, m);
                   1137:        mlast = m;
                   1138:        SBLINKRECORD(sb, control);
                   1139:
                   1140:        sb->sb_mbtail = mlast;
                   1141:        SBLASTMBUFCHK(sb, "sbappendcontrol");
                   1142:        SBLASTRECORDCHK(sb, "sbappendcontrol 2");
                   1143:
1.1       cgd      1144:        return (1);
                   1145: }
                   1146:
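/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * attaching ancillary data to incoming data using sbcreatecontrol() (below)
 * together with sbappendcontrol().  The SCM_TIMESTAMP/SOL_SOCKET pairing
 * mirrors common usage; the function name is hypothetical and the socket
 * lock is assumed to be held.
 */
#if 0
static int
example_append_with_timestamp(struct socket *so, struct mbuf *data,
    struct timeval *tv)
{
	struct mbuf *control;

	control = sbcreatecontrol(tv, sizeof(*tv), SCM_TIMESTAMP, SOL_SOCKET);
	if (control == NULL)
		return ENOBUFS;
	if (sbappendcontrol(&so->so_rcv, data, control) == 0) {
		/* Neither the data nor the control chain was consumed. */
		m_freem(control);
		return ENOBUFS;
	}
	sorwakeup(so);
	return 0;
}
#endif
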
                   1147: /*
                   1148:  * Compress mbuf chain m into the socket
                   1149:  * buffer sb following mbuf n.  If n
                   1150:  * is null, the buffer is presumed empty.
                   1151:  */
1.7       mycroft  1152: void
1.37      lukem    1153: sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
1.1       cgd      1154: {
1.37      lukem    1155:        int             eor;
                   1156:        struct mbuf     *o;
1.1       cgd      1157:
1.89.6.2  mjf      1158:        KASSERT(solocked(sb->sb_so));
                   1159:
1.37      lukem    1160:        eor = 0;
1.1       cgd      1161:        while (m) {
                   1162:                eor |= m->m_flags & M_EOR;
                   1163:                if (m->m_len == 0 &&
                   1164:                    (eor == 0 ||
                   1165:                     (((o = m->m_next) || (o = n)) &&
                   1166:                      o->m_type == m->m_type))) {
1.46      thorpej  1167:                        if (sb->sb_lastrecord == m)
                   1168:                                sb->sb_lastrecord = m->m_next;
1.1       cgd      1169:                        m = m_free(m);
                   1170:                        continue;
                   1171:                }
1.40      thorpej  1172:                if (n && (n->m_flags & M_EOR) == 0 &&
                   1173:                    /* M_TRAILINGSPACE() checks buffer writeability */
                   1174:                    m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */
                   1175:                    m->m_len <= M_TRAILINGSPACE(n) &&
                   1176:                    n->m_type == m->m_type) {
1.82      christos 1177:                        memcpy(mtod(n, char *) + n->m_len, mtod(m, void *),
1.1       cgd      1178:                            (unsigned)m->m_len);
                   1179:                        n->m_len += m->m_len;
                   1180:                        sb->sb_cc += m->m_len;
                   1181:                        m = m_free(m);
                   1182:                        continue;
                   1183:                }
                   1184:                if (n)
                   1185:                        n->m_next = m;
                   1186:                else
                   1187:                        sb->sb_mb = m;
1.43      thorpej  1188:                sb->sb_mbtail = m;
1.1       cgd      1189:                sballoc(sb, m);
                   1190:                n = m;
                   1191:                m->m_flags &= ~M_EOR;
                   1192:                m = m->m_next;
                   1193:                n->m_next = 0;
                   1194:        }
                   1195:        if (eor) {
                   1196:                if (n)
                   1197:                        n->m_flags |= eor;
                   1198:                else
1.15      christos 1199:                        printf("semi-panic: sbcompress\n");
1.1       cgd      1200:        }
1.43      thorpej  1201:        SBLASTMBUFCHK(sb, __func__);
1.1       cgd      1202: }
                   1203:
                   1204: /*
                   1205:  * Free all mbufs in a sockbuf.
                   1206:  * Check that all resources are reclaimed.
                   1207:  */
1.7       mycroft  1208: void
1.37      lukem    1209: sbflush(struct sockbuf *sb)
1.1       cgd      1210: {
                   1211:
1.89.6.2  mjf      1212:        KASSERT(solocked(sb->sb_so));
1.43      thorpej  1213:        KASSERT((sb->sb_flags & SB_LOCK) == 0);
                   1214:
1.1       cgd      1215:        while (sb->sb_mbcnt)
                   1216:                sbdrop(sb, (int)sb->sb_cc);
1.43      thorpej  1217:
                   1218:        KASSERT(sb->sb_cc == 0);
                   1219:        KASSERT(sb->sb_mb == NULL);
                   1220:        KASSERT(sb->sb_mbtail == NULL);
                   1221:        KASSERT(sb->sb_lastrecord == NULL);
1.1       cgd      1222: }
                   1223:
                   1224: /*
                   1225:  * Drop data from (the front of) a sockbuf.
                   1226:  */
1.7       mycroft  1227: void
1.37      lukem    1228: sbdrop(struct sockbuf *sb, int len)
1.1       cgd      1229: {
1.37      lukem    1230:        struct mbuf     *m, *mn, *next;
1.1       cgd      1231:
1.89.6.2  mjf      1232:        KASSERT(solocked(sb->sb_so));
                   1233:
1.1       cgd      1234:        next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
                   1235:        while (len > 0) {
                   1236:                if (m == 0) {
                   1237:                        if (next == 0)
                   1238:                                panic("sbdrop");
                   1239:                        m = next;
                   1240:                        next = m->m_nextpkt;
                   1241:                        continue;
                   1242:                }
                   1243:                if (m->m_len > len) {
                   1244:                        m->m_len -= len;
                   1245:                        m->m_data += len;
                   1246:                        sb->sb_cc -= len;
                   1247:                        break;
                   1248:                }
                   1249:                len -= m->m_len;
                   1250:                sbfree(sb, m);
                   1251:                MFREE(m, mn);
                   1252:                m = mn;
                   1253:        }
                   1254:        while (m && m->m_len == 0) {
                   1255:                sbfree(sb, m);
                   1256:                MFREE(m, mn);
                   1257:                m = mn;
                   1258:        }
                   1259:        if (m) {
                   1260:                sb->sb_mb = m;
                   1261:                m->m_nextpkt = next;
                   1262:        } else
                   1263:                sb->sb_mb = next;
1.43      thorpej  1264:        /*
1.45      thorpej  1265:         * First part is an inline SB_EMPTY_FIXUP().  Second part
1.43      thorpej  1266:         * makes sure sb_lastrecord is up-to-date if we dropped
                   1267:         * part of the last record.
                   1268:         */
                   1269:        m = sb->sb_mb;
                   1270:        if (m == NULL) {
                   1271:                sb->sb_mbtail = NULL;
                   1272:                sb->sb_lastrecord = NULL;
                   1273:        } else if (m->m_nextpkt == NULL)
                   1274:                sb->sb_lastrecord = m;
1.1       cgd      1275: }
                   1276:
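/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a common use of sbdrop() is releasing acknowledged data from a send
 * buffer, in the style of TCP ACK processing.  Names are hypothetical;
 * the socket lock is assumed to be held.
 */
#if 0
static void
example_ack_bytes(struct socket *so, long acked)
{

	/* Never drop more than is currently buffered. */
	if (acked > (long)so->so_snd.sb_cc)
		acked = so->so_snd.sb_cc;
	sbdrop(&so->so_snd, (int)acked);
	sowwakeup(so);		/* buffer space is available again */
}
#endif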
                   1277: /*
                   1278:  * Drop a record off the front of a sockbuf
                   1279:  * and move the next record to the front.
                   1280:  */
1.7       mycroft  1281: void
1.37      lukem    1282: sbdroprecord(struct sockbuf *sb)
1.1       cgd      1283: {
1.37      lukem    1284:        struct mbuf     *m, *mn;
1.1       cgd      1285:
1.89.6.2  mjf      1286:        KASSERT(solocked(sb->sb_so));
                   1287:
1.1       cgd      1288:        m = sb->sb_mb;
                   1289:        if (m) {
                   1290:                sb->sb_mb = m->m_nextpkt;
                   1291:                do {
                   1292:                        sbfree(sb, m);
                   1293:                        MFREE(m, mn);
1.11      christos 1294:                } while ((m = mn) != NULL);
1.1       cgd      1295:        }
1.45      thorpej  1296:        SB_EMPTY_FIXUP(sb);
1.19      thorpej  1297: }
                   1298:
                   1299: /*
                   1300:  * Create a "control" mbuf containing the specified data
                   1301:  * with the specified type for presentation on a socket buffer.
                   1302:  */
                   1303: struct mbuf *
1.82      christos 1304: sbcreatecontrol(void *p, int size, int type, int level)
1.19      thorpej  1305: {
1.37      lukem    1306:        struct cmsghdr  *cp;
                   1307:        struct mbuf     *m;
1.19      thorpej  1308:
1.35      itojun   1309:        if (CMSG_SPACE(size) > MCLBYTES) {
1.30      itojun   1310:                printf("sbcreatecontrol: message too large %d\n", size);
                   1311:                return NULL;
                   1312:        }
                   1313:
1.19      thorpej  1314:        if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
                   1315:                return ((struct mbuf *) NULL);
1.35      itojun   1316:        if (CMSG_SPACE(size) > MLEN) {
1.30      itojun   1317:                MCLGET(m, M_DONTWAIT);
                   1318:                if ((m->m_flags & M_EXT) == 0) {
                   1319:                        m_free(m);
                   1320:                        return NULL;
                   1321:                }
                   1322:        }
1.19      thorpej  1323:        cp = mtod(m, struct cmsghdr *);
1.26      perry    1324:        memcpy(CMSG_DATA(cp), p, size);
1.35      itojun   1325:        m->m_len = CMSG_SPACE(size);
                   1326:        cp->cmsg_len = CMSG_LEN(size);
1.19      thorpej  1327:        cp->cmsg_level = level;
                   1328:        cp->cmsg_type = type;
                   1329:        return (m);
1.1       cgd      1330: }
1.89.6.2  mjf      1331:
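/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the mbuf returned above holds a single cmsghdr; cmsg_len carries
 * CMSG_LEN(size) while the mbuf itself is padded out to CMSG_SPACE(size).
 * A consumer reads the payload back through CMSG_DATA().  Names are
 * hypothetical.
 */
#if 0
static int
example_read_timestamp(struct mbuf *control, struct timeval *tvp)
{
	struct cmsghdr *cmsg = mtod(control, struct cmsghdr *);

	if (cmsg->cmsg_level != SOL_SOCKET ||
	    cmsg->cmsg_type != SCM_TIMESTAMP ||
	    cmsg->cmsg_len != CMSG_LEN(sizeof(*tvp)))
		return EINVAL;
	memcpy(tvp, CMSG_DATA(cmsg), sizeof(*tvp));
	return 0;
}
#endif
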
                   1332: void
                   1333: solockretry(struct socket *so, kmutex_t *lock)
                   1334: {
                   1335:
                   1336:        while (lock != so->so_lock) {
                   1337:                mutex_exit(lock);
                   1338:                lock = so->so_lock;
                   1339:                mutex_enter(lock);
                   1340:        }
                   1341: }
                   1342:
                   1343: bool
                   1344: solocked(struct socket *so)
                   1345: {
                   1346:
                   1347:        return mutex_owned(so->so_lock);
                   1348: }
                   1349:
                   1350: bool
                   1351: solocked2(struct socket *so1, struct socket *so2)
                   1352: {
                   1353:        kmutex_t *lock;
                   1354:
                   1355:        lock = so1->so_lock;
                   1356:        if (lock != so2->so_lock)
                   1357:                return false;
                   1358:        return mutex_owned(lock);
                   1359: }
                   1360:
                   1361: /*
                    1362:  * Assign a default lock to a new socket.  Called during PRU_ATTACH by
                    1363:  * protocols that do not have special locking requirements.
                   1364:  */
                   1365: void
                   1366: sosetlock(struct socket *so)
                   1367: {
                   1368:        kmutex_t *lock;
                   1369:
                   1370:        if (so->so_lock == NULL) {
                   1371:                lock = softnet_lock;
                   1372:                so->so_lock = lock;
                   1373:                mutex_obj_hold(lock);
                   1374:                mutex_enter(lock);
                   1375:        }
                   1376:
                   1377:        /* In all cases, lock must be held on return from PRU_ATTACH. */
                   1378:        KASSERT(solocked(so));
                   1379: }
                   1380:
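/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a protocol attach routine with no special locking needs calls sosetlock()
 * first, then sets up its per-socket state.  The buffer sizes and names
 * here are hypothetical.
 */
#if 0
static int
example_pru_attach(struct socket *so, int proto)
{
	int error;

	sosetlock(so);		/* adopt softnet_lock if none assigned yet */
	error = soreserve(so, 8192, 8192);
	if (error != 0)
		return error;
	/* ... allocate and link the protocol control block ... */
	return 0;
}
#endif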
                   1381: /*
                    1382:  * Acquire the lock on sockbuf sb; sleep if the lock is already held.
                    1383:  * Unless SB_NOINTR is set on the sockbuf, the sleep is interruptible.
                    1384:  * Returns an error, without taking the lock, if the sleep is interrupted.
                   1385:  */
                   1386: int
                   1387: sblock(struct sockbuf *sb, int wf)
                   1388: {
                   1389:        struct socket *so;
                   1390:        kmutex_t *lock;
                   1391:        int error;
                   1392:
                   1393:        KASSERT(solocked(sb->sb_so));
                   1394:
                   1395:        for (;;) {
                   1396:                if (__predict_true((sb->sb_flags & SB_LOCK) == 0)) {
                   1397:                        sb->sb_flags |= SB_LOCK;
                   1398:                        return 0;
                   1399:                }
                   1400:                if (wf != M_WAITOK)
                   1401:                        return EWOULDBLOCK;
                   1402:                so = sb->sb_so;
                   1403:                lock = so->so_lock;
                   1404:                if ((sb->sb_flags & SB_NOINTR) != 0) {
                   1405:                        cv_wait(&so->so_cv, lock);
                   1406:                        error = 0;
                   1407:                } else
                   1408:                        error = cv_wait_sig(&so->so_cv, lock);
                   1409:                if (__predict_false(lock != so->so_lock))
                   1410:                        solockretry(so, lock);
                   1411:                if (error != 0)
                   1412:                        return error;
                   1413:        }
                   1414: }
                   1415:
                   1416: void
                   1417: sbunlock(struct sockbuf *sb)
                   1418: {
                   1419:        struct socket *so;
                   1420:
                   1421:        so = sb->sb_so;
                   1422:
                   1423:        KASSERT(solocked(so));
                   1424:        KASSERT((sb->sb_flags & SB_LOCK) != 0);
                   1425:
                   1426:        sb->sb_flags &= ~SB_LOCK;
                   1427:        cv_broadcast(&so->so_cv);
                   1428: }
                   1429:
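/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the usual pairing of sblock() and sbunlock() to serialize users of a
 * socket buffer, here around a receive-side operation.  Names are
 * hypothetical; the socket lock is assumed to be held on entry.
 */
#if 0
static int
example_locked_receive(struct socket *so)
{
	int error;

	/* May sleep; interruptible unless SB_NOINTR is set on the buffer. */
	error = sblock(&so->so_rcv, M_WAITOK);
	if (error != 0)
		return error;
	/* ... consume data from so->so_rcv, e.g. with sbdrop() ... */
	sbunlock(&so->so_rcv);
	return 0;
}
#endif
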
                   1430: int
                   1431: sowait(struct socket *so, int timo)
                   1432: {
                   1433:        kmutex_t *lock;
                   1434:        int error;
                   1435:
                   1436:        KASSERT(solocked(so));
                   1437:
                   1438:        lock = so->so_lock;
                   1439:        error = cv_timedwait_sig(&so->so_cv, lock, timo);
                   1440:        if (__predict_false(lock != so->so_lock))
                   1441:                solockretry(so, lock);
                   1442:        return error;
                   1443: }
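
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * waiting for a socket state change with sowait(), in the style of a
 * blocking connect.  so_timeo and SS_ISCONNECTING are existing socket
 * fields/flags; the function name is hypothetical and the socket lock is
 * assumed to be held.
 */
#if 0
static int
example_wait_for_connect(struct socket *so)
{
	int error = 0;

	while ((so->so_state & SS_ISCONNECTING) != 0 && so->so_error == 0) {
		error = sowait(so, so->so_timeo);
		if (error != 0)
			break;		/* interrupted or timed out */
	}
	return error;
}
#endif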
