Annotation of src/sys/kern/uipc_socket2.c, Revision 1.84.2.1
1.84.2.1! skrll 1: /* $NetBSD: uipc_socket2.c,v 1.85 2007/08/02 02:42:40 rmind Exp $ */
1.9 cgd 2:
1.1 cgd 3: /*
1.7 mycroft 4: * Copyright (c) 1982, 1986, 1988, 1990, 1993
5: * The Regents of the University of California. All rights reserved.
1.1 cgd 6: *
7: * Redistribution and use in source and binary forms, with or without
8: * modification, are permitted provided that the following conditions
9: * are met:
10: * 1. Redistributions of source code must retain the above copyright
11: * notice, this list of conditions and the following disclaimer.
12: * 2. Redistributions in binary form must reproduce the above copyright
13: * notice, this list of conditions and the following disclaimer in the
14: * documentation and/or other materials provided with the distribution.
1.54 agc 15: * 3. Neither the name of the University nor the names of its contributors
1.1 cgd 16: * may be used to endorse or promote products derived from this software
17: * without specific prior written permission.
18: *
19: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29: * SUCH DAMAGE.
30: *
1.23 fvdl 31: * @(#)uipc_socket2.c 8.2 (Berkeley) 2/14/95
1.1 cgd 32: */
1.42 lukem 33:
34: #include <sys/cdefs.h>
1.84.2.1! skrll 35: __KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.85 2007/08/02 02:42:40 rmind Exp $");
1.51 martin 36:
37: #include "opt_mbuftrace.h"
1.58 thorpej 38: #include "opt_sb_max.h"
1.1 cgd 39:
1.5 mycroft 40: #include <sys/param.h>
41: #include <sys/systm.h>
42: #include <sys/proc.h>
43: #include <sys/file.h>
44: #include <sys/buf.h>
45: #include <sys/malloc.h>
46: #include <sys/mbuf.h>
47: #include <sys/protosw.h>
1.55 christos 48: #include <sys/poll.h>
1.5 mycroft 49: #include <sys/socket.h>
50: #include <sys/socketvar.h>
1.11 christos 51: #include <sys/signalvar.h>
1.71 elad 52: #include <sys/kauth.h>
1.1 cgd 53:
54: /*
55: * Primitive routines for operating on sockets and socket buffers
56: */
57:
58: /* strings for sleep message: */
1.21 mycroft 59: const char netcon[] = "netcon";
60: const char netcls[] = "netcls";
1.41 enami 61: const char netio[] = "netio";
62: const char netlck[] = "netlck";
1.1 cgd 63:
1.58 thorpej 64: u_long sb_max = SB_MAX; /* maximum socket buffer size */
65: static u_long sb_max_adj; /* adjusted sb_max */
66:
1.1 cgd 67: /*
68: * Procedures to manipulate state flags of socket
69: * and do appropriate wakeups. Normal sequence from the
70: * active (originating) side is that soisconnecting() is
71: * called during processing of connect() call,
72: * resulting in an eventual call to soisconnected() if/when the
73: * connection is established. When the connection is torn down
74: * soisdisconnecting() is called during processing of disconnect() call,
75: * and soisdisconnected() is called when the connection to the peer
76: * is totally severed. The semantics of these routines are such that
77: * connectionless protocols can call soisconnected() and soisdisconnected()
78: * only, bypassing the in-progress calls when setting up a ``connection''
79: * takes no time.
80: *
81: * From the passive side, a socket is created with
82: * two queues of sockets: so_q0 for connections in progress
83: * and so_q for connections already made and awaiting user acceptance.
84: * As a protocol is preparing incoming connections, it creates a socket
85: * structure queued on so_q0 by calling sonewconn(). When the connection
86: * is established, soisconnected() is called, and transfers the
87: * socket structure to so_q, making it available to accept().
1.66 perry 88: *
1.1 cgd 89: * If a socket is closed with sockets on either
90: * so_q0 or so_q, these sockets are dropped.
91: *
92: * If higher level protocols are implemented in
93: * the kernel, the wakeups done here will sometimes
94: * cause software-interrupt process scheduling.
95: */
96:
1.7 mycroft 97: void
1.37 lukem 98: soisconnecting(struct socket *so)
1.1 cgd 99: {
100:
101: so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
102: so->so_state |= SS_ISCONNECTING;
103: }
104:
1.7 mycroft 105: void
1.37 lukem 106: soisconnected(struct socket *so)
1.1 cgd 107: {
1.37 lukem 108: struct socket *head;
1.1 cgd 109:
1.37 lukem 110: head = so->so_head;
1.1 cgd 111: so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
112: so->so_state |= SS_ISCONNECTED;
113: if (head && soqremque(so, 0)) {
114: soqinsque(head, so, 1);
115: sorwakeup(head);
1.82 christos 116: wakeup((void *)&head->so_timeo);
1.1 cgd 117: } else {
1.82 christos 118: wakeup((void *)&so->so_timeo);
1.1 cgd 119: sorwakeup(so);
120: sowwakeup(so);
121: }
122: }
123:
1.7 mycroft 124: void
1.37 lukem 125: soisdisconnecting(struct socket *so)
1.1 cgd 126: {
127:
128: so->so_state &= ~SS_ISCONNECTING;
129: so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
1.82 christos 130: wakeup((void *)&so->so_timeo);
1.1 cgd 131: sowwakeup(so);
132: sorwakeup(so);
133: }
134:
1.7 mycroft 135: void
1.37 lukem 136: soisdisconnected(struct socket *so)
1.1 cgd 137: {
138:
139: so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
1.27 mycroft 140: so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
1.82 christos 141: wakeup((void *)&so->so_timeo);
1.1 cgd 142: sowwakeup(so);
143: sorwakeup(so);
144: }
145:
146: /*
147: * When an attempt at a new connection is noted on a socket
148: * which accepts connections, sonewconn is called. If the
149: * connection is possible (subject to space constraints, etc.)
150: * then we allocate a new structure, propoerly linked into the
151: * data structure of the original socket, and return this.
1.77 plunky 152: * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
1.1 cgd 153: */
154: struct socket *
1.76 plunky 155: sonewconn(struct socket *head, int connstatus)
1.1 cgd 156: {
1.37 lukem 157: struct socket *so;
158: int soqueue;
1.1 cgd 159:
1.37 lukem 160: soqueue = connstatus ? 1 : 0;
1.1 cgd 161: if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2)
162: return ((struct socket *)0);
1.25 thorpej 163: so = pool_get(&socket_pool, PR_NOWAIT);
1.66 perry 164: if (so == NULL)
1.25 thorpej 165: return (NULL);
1.82 christos 166: memset((void *)so, 0, sizeof(*so));
1.1 cgd 167: so->so_type = head->so_type;
168: so->so_options = head->so_options &~ SO_ACCEPTCONN;
169: so->so_linger = head->so_linger;
170: so->so_state = head->so_state | SS_NOFDREF;
171: so->so_proto = head->so_proto;
172: so->so_timeo = head->so_timeo;
173: so->so_pgid = head->so_pgid;
1.24 matt 174: so->so_send = head->so_send;
175: so->so_receive = head->so_receive;
1.67 christos 176: so->so_uidinfo = head->so_uidinfo;
1.49 matt 177: #ifdef MBUFTRACE
178: so->so_mowner = head->so_mowner;
179: so->so_rcv.sb_mowner = head->so_rcv.sb_mowner;
180: so->so_snd.sb_mowner = head->so_snd.sb_mowner;
181: #endif
1.1 cgd 182: (void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat);
1.83 tls 183: so->so_snd.sb_lowat = head->so_snd.sb_lowat;
184: so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
1.84 tls 185: so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
186: so->so_snd.sb_timeo = head->so_snd.sb_timeo;
1.84.2.1! skrll 187: so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
! 188: so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
1.1 cgd 189: soqinsque(head, so, soqueue);
190: if ((*so->so_proto->pr_usrreq)(so, PRU_ATTACH,
1.12 mycroft 191: (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
1.69 christos 192: (struct lwp *)0)) {
1.1 cgd 193: (void) soqremque(so, soqueue);
1.25 thorpej 194: pool_put(&socket_pool, so);
195: return (NULL);
1.1 cgd 196: }
197: if (connstatus) {
198: sorwakeup(head);
1.82 christos 199: wakeup((void *)&head->so_timeo);
1.1 cgd 200: so->so_state |= connstatus;
201: }
202: return (so);
203: }
204:
1.7 mycroft 205: void
1.37 lukem 206: soqinsque(struct socket *head, struct socket *so, int q)
1.1 cgd 207: {
208:
1.22 thorpej 209: #ifdef DIAGNOSTIC
210: if (so->so_onq != NULL)
211: panic("soqinsque");
212: #endif
213:
1.1 cgd 214: so->so_head = head;
215: if (q == 0) {
216: head->so_q0len++;
1.22 thorpej 217: so->so_onq = &head->so_q0;
1.1 cgd 218: } else {
219: head->so_qlen++;
1.22 thorpej 220: so->so_onq = &head->so_q;
1.1 cgd 221: }
1.22 thorpej 222: TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
1.1 cgd 223: }
224:
1.7 mycroft 225: int
1.37 lukem 226: soqremque(struct socket *so, int q)
1.1 cgd 227: {
1.37 lukem 228: struct socket *head;
1.1 cgd 229:
1.37 lukem 230: head = so->so_head;
1.22 thorpej 231: if (q == 0) {
232: if (so->so_onq != &head->so_q0)
1.17 thorpej 233: return (0);
1.1 cgd 234: head->so_q0len--;
235: } else {
1.22 thorpej 236: if (so->so_onq != &head->so_q)
237: return (0);
1.1 cgd 238: head->so_qlen--;
239: }
1.22 thorpej 240: TAILQ_REMOVE(so->so_onq, so, so_qe);
241: so->so_onq = NULL;
242: so->so_head = NULL;
1.1 cgd 243: return (1);
244: }
245:
246: /*
247: * Socantsendmore indicates that no more data will be sent on the
248: * socket; it would normally be applied to a socket when the user
249: * informs the system that no more data is to be sent, by the protocol
250: * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data
251: * will be received, and will normally be applied to the socket by a
252: * protocol when it detects that the peer will send no more data.
253: * Data queued for reading in the socket may yet be read.
254: */
255:
1.4 andrew 256: void
1.37 lukem 257: socantsendmore(struct socket *so)
1.1 cgd 258: {
259:
260: so->so_state |= SS_CANTSENDMORE;
261: sowwakeup(so);
262: }
263:
1.4 andrew 264: void
1.37 lukem 265: socantrcvmore(struct socket *so)
1.1 cgd 266: {
267:
268: so->so_state |= SS_CANTRCVMORE;
269: sorwakeup(so);
270: }
271:
272: /*
273: * Wait for data to arrive at/drain from a socket buffer.
274: */
1.7 mycroft 275: int
1.37 lukem 276: sbwait(struct sockbuf *sb)
1.1 cgd 277: {
278:
279: sb->sb_flags |= SB_WAIT;
1.82 christos 280: return (tsleep((void *)&sb->sb_cc,
1.1 cgd 281: (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, netio,
282: sb->sb_timeo));
283: }
284:
1.66 perry 285: /*
1.1 cgd 286: * Lock a sockbuf already known to be locked;
287: * return any error returned from sleep (EINTR).
288: */
1.7 mycroft 289: int
1.37 lukem 290: sb_lock(struct sockbuf *sb)
1.1 cgd 291: {
1.37 lukem 292: int error;
1.1 cgd 293:
294: while (sb->sb_flags & SB_LOCK) {
295: sb->sb_flags |= SB_WANT;
1.82 christos 296: error = tsleep((void *)&sb->sb_flags,
1.41 enami 297: (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH,
298: netlck, 0);
1.11 christos 299: if (error)
1.1 cgd 300: return (error);
301: }
302: sb->sb_flags |= SB_LOCK;
303: return (0);
304: }
305:
306: /*
307: * Wakeup processes waiting on a socket buffer.
308: * Do asynchronous notification via SIGIO
1.39 manu 309: * if the socket buffer has the SB_ASYNC flag set.
1.1 cgd 310: */
1.7 mycroft 311: void
1.55 christos 312: sowakeup(struct socket *so, struct sockbuf *sb, int code)
1.1 cgd 313: {
1.48 jdolecek 314: selnotify(&sb->sb_sel, 0);
1.7 mycroft 315: sb->sb_flags &= ~SB_SEL;
1.1 cgd 316: if (sb->sb_flags & SB_WAIT) {
317: sb->sb_flags &= ~SB_WAIT;
1.82 christos 318: wakeup((void *)&sb->sb_cc);
1.1 cgd 319: }
1.39 manu 320: if (sb->sb_flags & SB_ASYNC) {
1.56 jdolecek 321: int band;
1.57 christos 322: if (code == POLL_IN)
323: band = POLLIN|POLLRDNORM;
324: else
325: band = POLLOUT|POLLWRNORM;
326: fownsignal(so->so_pgid, SIGIO, code, band, so);
1.1 cgd 327: }
1.24 matt 328: if (sb->sb_flags & SB_UPCALL)
329: (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
1.1 cgd 330: }
331:
332: /*
333: * Socket buffer (struct sockbuf) utility routines.
334: *
335: * Each socket contains two socket buffers: one for sending data and
336: * one for receiving data. Each buffer contains a queue of mbufs,
337: * information about the number of mbufs and amount of data in the
1.13 mycroft 338: * queue, and other fields allowing poll() statements and notification
1.1 cgd 339: * on data availability to be implemented.
340: *
341: * Data stored in a socket buffer is maintained as a list of records.
342: * Each record is a list of mbufs chained together with the m_next
343: * field. Records are chained together with the m_nextpkt field. The upper
344: * level routine soreceive() expects the following conventions to be
345: * observed when placing information in the receive buffer:
346: *
347: * 1. If the protocol requires each message be preceded by the sender's
348: * name, then a record containing that name must be present before
349: * any associated data (mbuf's must be of type MT_SONAME).
350: * 2. If the protocol supports the exchange of ``access rights'' (really
351: * just additional data associated with the message), and there are
352: * ``rights'' to be received, then a record containing this data
1.10 mycroft 353: * should be present (mbuf's must be of type MT_CONTROL).
1.1 cgd 354: * 3. If a name or rights record exists, then it must be followed by
355: * a data record, perhaps of zero length.
356: *
357: * Before using a new socket structure it is first necessary to reserve
358: * buffer space to the socket, by calling sbreserve(). This should commit
359: * some of the available buffer space in the system buffer pool for the
360: * socket (currently, it does nothing but enforce limits). The space
361: * should be released by calling sbrelease() when the socket is destroyed.
362: */
363:
1.7 mycroft 364: int
1.58 thorpej 365: sb_max_set(u_long new_sbmax)
366: {
367: int s;
368:
369: if (new_sbmax < (16 * 1024))
370: return (EINVAL);
371:
372: s = splsoftnet();
373: sb_max = new_sbmax;
374: sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES);
375: splx(s);
376:
377: return (0);
378: }
379:
380: int
1.37 lukem 381: soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
1.1 cgd 382: {
1.74 christos 383: /*
384: * there's at least one application (a configure script of screen)
385: * which expects a fifo is writable even if it has "some" bytes
386: * in its buffer.
387: * so we want to make sure (hiwat - lowat) >= (some bytes).
388: *
389: * PIPE_BUF here is an arbitrary value chosen as (some bytes) above.
390: * we expect it's large enough for such applications.
391: */
392: u_long lowat = MAX(sock_loan_thresh, MCLBYTES);
393: u_long hiwat = lowat + PIPE_BUF;
1.1 cgd 394:
1.74 christos 395: if (sndcc < hiwat)
396: sndcc = hiwat;
1.59 christos 397: if (sbreserve(&so->so_snd, sndcc, so) == 0)
1.1 cgd 398: goto bad;
1.59 christos 399: if (sbreserve(&so->so_rcv, rcvcc, so) == 0)
1.1 cgd 400: goto bad2;
401: if (so->so_rcv.sb_lowat == 0)
402: so->so_rcv.sb_lowat = 1;
403: if (so->so_snd.sb_lowat == 0)
1.74 christos 404: so->so_snd.sb_lowat = lowat;
1.1 cgd 405: if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
406: so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
407: return (0);
1.37 lukem 408: bad2:
1.59 christos 409: sbrelease(&so->so_snd, so);
1.37 lukem 410: bad:
1.1 cgd 411: return (ENOBUFS);
412: }
413:
414: /*
415: * Allot mbufs to a sockbuf.
416: * Attempt to scale mbmax so that mbcnt doesn't become limiting
417: * if buffering efficiency is near the normal case.
418: */
1.7 mycroft 419: int
1.59 christos 420: sbreserve(struct sockbuf *sb, u_long cc, struct socket *so)
1.1 cgd 421: {
1.75 ad 422: struct lwp *l = curlwp; /* XXX */
1.62 christos 423: rlim_t maxcc;
1.67 christos 424: struct uidinfo *uidinfo;
1.1 cgd 425:
1.58 thorpej 426: KDASSERT(sb_max_adj != 0);
427: if (cc == 0 || cc > sb_max_adj)
1.1 cgd 428: return (0);
1.60 matt 429: if (so) {
1.75 ad 430: if (l && kauth_cred_geteuid(l->l_cred) == so->so_uidinfo->ui_uid)
431: maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur;
1.60 matt 432: else
433: maxcc = RLIM_INFINITY;
1.67 christos 434: uidinfo = so->so_uidinfo;
1.62 christos 435: } else {
1.67 christos 436: uidinfo = uid_find(0); /* XXX: nothing better */
1.62 christos 437: maxcc = RLIM_INFINITY;
1.60 matt 438: }
1.67 christos 439: if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc))
1.62 christos 440: return 0;
1.1 cgd 441: sb->sb_mbmax = min(cc * 2, sb_max);
442: if (sb->sb_lowat > sb->sb_hiwat)
443: sb->sb_lowat = sb->sb_hiwat;
444: return (1);
445: }
446:
447: /*
448: * Free mbufs held by a socket, and reserved mbuf space.
449: */
1.7 mycroft 450: void
1.59 christos 451: sbrelease(struct sockbuf *sb, struct socket *so)
1.1 cgd 452: {
453:
454: sbflush(sb);
1.67 christos 455: (void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0,
1.59 christos 456: RLIM_INFINITY);
457: sb->sb_mbmax = 0;
1.1 cgd 458: }
459:
460: /*
461: * Routines to add and remove
462: * data from an mbuf queue.
463: *
464: * The routines sbappend() or sbappendrecord() are normally called to
465: * append new mbufs to a socket buffer, after checking that adequate
466: * space is available, comparing the function sbspace() with the amount
467: * of data to be added. sbappendrecord() differs from sbappend() in
468: * that data supplied is treated as the beginning of a new record.
469: * To place a sender's address, optional access rights, and data in a
470: * socket receive buffer, sbappendaddr() should be used. To place
471: * access rights and data in a socket receive buffer, sbappendcontrol()
472: * should be used. In either case, the new data begins a new record.
473: * Note that unlike sbappend() and sbappendrecord(), these routines check
474: * for the caller that there will be enough space to store the data.
475: * Each fails if there is not enough space, or if it cannot find mbufs
476: * to store additional information in.
477: *
478: * Reliable protocols may use the socket send buffer to hold data
479: * awaiting acknowledgement. Data is normally copied from a socket
480: * send buffer in a protocol with m_copy for output to a peer,
481: * and then removing the data from the socket buffer with sbdrop()
482: * or sbdroprecord() when the data is acknowledged by the peer.
483: */
484:
1.43 thorpej 485: #ifdef SOCKBUF_DEBUG
486: void
487: sblastrecordchk(struct sockbuf *sb, const char *where)
488: {
489: struct mbuf *m = sb->sb_mb;
490:
491: while (m && m->m_nextpkt)
492: m = m->m_nextpkt;
493:
494: if (m != sb->sb_lastrecord) {
495: printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
496: sb->sb_mb, sb->sb_lastrecord, m);
497: printf("packet chain:\n");
498: for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
499: printf("\t%p\n", m);
1.47 provos 500: panic("sblastrecordchk from %s", where);
1.43 thorpej 501: }
502: }
503:
504: void
505: sblastmbufchk(struct sockbuf *sb, const char *where)
506: {
507: struct mbuf *m = sb->sb_mb;
508: struct mbuf *n;
509:
510: while (m && m->m_nextpkt)
511: m = m->m_nextpkt;
512:
513: while (m && m->m_next)
514: m = m->m_next;
515:
516: if (m != sb->sb_mbtail) {
517: printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
518: sb->sb_mb, sb->sb_mbtail, m);
519: printf("packet tree:\n");
520: for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
521: printf("\t");
522: for (n = m; n != NULL; n = n->m_next)
523: printf("%p ", n);
524: printf("\n");
525: }
526: panic("sblastmbufchk from %s", where);
527: }
528: }
529: #endif /* SOCKBUF_DEBUG */
530:
/*
 * Link a chain of records onto a socket buffer.  m0 is the first record
 * of the chain and mlast the last one: m0 is hooked after the current
 * last record (or becomes sb_mb when the buffer has no records) and
 * mlast becomes the new sb_lastrecord.
 */
#define	SBLINKRECORDCHAIN(sb, m0, mlast)				\
do {									\
	if ((sb)->sb_lastrecord == NULL)				\
		(sb)->sb_mb = (m0);					\
	else								\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	(sb)->sb_lastrecord = (mlast);					\
} while (/*CONSTCOND*/0)

/* Link the single record m0 onto a socket buffer. */
#define	SBLINKRECORD(sb, m0)						\
	SBLINKRECORDCHAIN(sb, m0, m0)
546:
1.1 cgd 547: /*
548: * Append mbuf chain m to the last record in the
549: * socket buffer sb. The additional space associated
550: * the mbuf chain is recorded in sb. Empty mbufs are
551: * discarded and mbufs are compacted where possible.
552: */
1.7 mycroft 553: void
1.37 lukem 554: sbappend(struct sockbuf *sb, struct mbuf *m)
1.1 cgd 555: {
1.37 lukem 556: struct mbuf *n;
1.1 cgd 557:
558: if (m == 0)
559: return;
1.43 thorpej 560:
1.49 matt 561: #ifdef MBUFTRACE
1.65 jonathan 562: m_claimm(m, sb->sb_mowner);
1.49 matt 563: #endif
564:
1.43 thorpej 565: SBLASTRECORDCHK(sb, "sbappend 1");
566:
567: if ((n = sb->sb_lastrecord) != NULL) {
568: /*
569: * XXX Would like to simply use sb_mbtail here, but
570: * XXX I need to verify that I won't miss an EOR that
571: * XXX way.
572: */
1.1 cgd 573: do {
574: if (n->m_flags & M_EOR) {
575: sbappendrecord(sb, m); /* XXXXXX!!!! */
576: return;
577: }
578: } while (n->m_next && (n = n->m_next));
1.43 thorpej 579: } else {
580: /*
581: * If this is the first record in the socket buffer, it's
582: * also the last record.
583: */
584: sb->sb_lastrecord = m;
1.1 cgd 585: }
586: sbcompress(sb, m, n);
1.43 thorpej 587: SBLASTRECORDCHK(sb, "sbappend 2");
588: }
589:
590: /*
591: * This version of sbappend() should only be used when the caller
592: * absolutely knows that there will never be more than one record
593: * in the socket buffer, that is, a stream protocol (such as TCP).
594: */
595: void
1.44 thorpej 596: sbappendstream(struct sockbuf *sb, struct mbuf *m)
1.43 thorpej 597: {
598:
599: KDASSERT(m->m_nextpkt == NULL);
600: KASSERT(sb->sb_mb == sb->sb_lastrecord);
601:
602: SBLASTMBUFCHK(sb, __func__);
603:
1.49 matt 604: #ifdef MBUFTRACE
1.65 jonathan 605: m_claimm(m, sb->sb_mowner);
1.49 matt 606: #endif
607:
1.43 thorpej 608: sbcompress(sb, m, sb->sb_mbtail);
609:
610: sb->sb_lastrecord = sb->sb_mb;
611: SBLASTRECORDCHK(sb, __func__);
1.1 cgd 612: }
613:
614: #ifdef SOCKBUF_DEBUG
1.7 mycroft 615: void
1.37 lukem 616: sbcheck(struct sockbuf *sb)
1.1 cgd 617: {
1.37 lukem 618: struct mbuf *m;
1.43 thorpej 619: u_long len, mbcnt;
1.1 cgd 620:
1.37 lukem 621: len = 0;
622: mbcnt = 0;
1.1 cgd 623: for (m = sb->sb_mb; m; m = m->m_next) {
624: len += m->m_len;
625: mbcnt += MSIZE;
626: if (m->m_flags & M_EXT)
627: mbcnt += m->m_ext.ext_size;
628: if (m->m_nextpkt)
629: panic("sbcheck nextpkt");
630: }
631: if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
1.43 thorpej 632: printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
1.1 cgd 633: mbcnt, sb->sb_mbcnt);
634: panic("sbcheck");
635: }
636: }
637: #endif
638:
639: /*
640: * As above, except the mbuf chain
641: * begins a new record.
642: */
1.7 mycroft 643: void
1.37 lukem 644: sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
1.1 cgd 645: {
1.37 lukem 646: struct mbuf *m;
1.1 cgd 647:
648: if (m0 == 0)
649: return;
1.43 thorpej 650:
1.49 matt 651: #ifdef MBUFTRACE
1.65 jonathan 652: m_claimm(m0, sb->sb_mowner);
1.49 matt 653: #endif
1.1 cgd 654: /*
655: * Put the first mbuf on the queue.
656: * Note this permits zero length records.
657: */
658: sballoc(sb, m0);
1.43 thorpej 659: SBLASTRECORDCHK(sb, "sbappendrecord 1");
660: SBLINKRECORD(sb, m0);
1.1 cgd 661: m = m0->m_next;
662: m0->m_next = 0;
663: if (m && (m0->m_flags & M_EOR)) {
664: m0->m_flags &= ~M_EOR;
665: m->m_flags |= M_EOR;
666: }
667: sbcompress(sb, m, m0);
1.43 thorpej 668: SBLASTRECORDCHK(sb, "sbappendrecord 2");
1.1 cgd 669: }
670:
671: /*
672: * As above except that OOB data
673: * is inserted at the beginning of the sockbuf,
674: * but after any other OOB data.
675: */
1.7 mycroft 676: void
1.37 lukem 677: sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
1.1 cgd 678: {
1.37 lukem 679: struct mbuf *m, **mp;
1.1 cgd 680:
681: if (m0 == 0)
682: return;
1.43 thorpej 683:
684: SBLASTRECORDCHK(sb, "sbinsertoob 1");
685:
1.11 christos 686: for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
1.1 cgd 687: again:
688: switch (m->m_type) {
689:
690: case MT_OOBDATA:
691: continue; /* WANT next train */
692:
693: case MT_CONTROL:
1.11 christos 694: if ((m = m->m_next) != NULL)
1.1 cgd 695: goto again; /* inspect THIS train further */
696: }
697: break;
698: }
699: /*
700: * Put the first mbuf on the queue.
701: * Note this permits zero length records.
702: */
703: sballoc(sb, m0);
704: m0->m_nextpkt = *mp;
1.43 thorpej 705: if (*mp == NULL) {
706: /* m0 is actually the new tail */
707: sb->sb_lastrecord = m0;
708: }
1.1 cgd 709: *mp = m0;
710: m = m0->m_next;
711: m0->m_next = 0;
712: if (m && (m0->m_flags & M_EOR)) {
713: m0->m_flags &= ~M_EOR;
714: m->m_flags |= M_EOR;
715: }
716: sbcompress(sb, m, m0);
1.43 thorpej 717: SBLASTRECORDCHK(sb, "sbinsertoob 2");
1.1 cgd 718: }
719:
720: /*
721: * Append address and data, and optionally, control (ancillary) data
722: * to the receive queue of a socket. If present,
723: * m0 must include a packet header with total length.
724: * Returns 0 if no space in sockbuf or insufficient mbufs.
725: */
1.7 mycroft 726: int
1.61 matt 727: sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
1.37 lukem 728: struct mbuf *control)
1.1 cgd 729: {
1.43 thorpej 730: struct mbuf *m, *n, *nlast;
1.50 fvdl 731: int space, len;
1.1 cgd 732:
1.37 lukem 733: space = asa->sa_len;
734:
1.49 matt 735: if (m0 != NULL) {
736: if ((m0->m_flags & M_PKTHDR) == 0)
737: panic("sbappendaddr");
1.1 cgd 738: space += m0->m_pkthdr.len;
1.49 matt 739: #ifdef MBUFTRACE
1.65 jonathan 740: m_claimm(m0, sb->sb_mowner);
1.49 matt 741: #endif
742: }
1.1 cgd 743: for (n = control; n; n = n->m_next) {
744: space += n->m_len;
1.49 matt 745: MCLAIM(n, sb->sb_mowner);
1.1 cgd 746: if (n->m_next == 0) /* keep pointer to last control buf */
747: break;
748: }
749: if (space > sbspace(sb))
750: return (0);
751: MGET(m, M_DONTWAIT, MT_SONAME);
752: if (m == 0)
753: return (0);
1.49 matt 754: MCLAIM(m, sb->sb_mowner);
1.50 fvdl 755: /*
756: * XXX avoid 'comparison always true' warning which isn't easily
757: * avoided.
758: */
759: len = asa->sa_len;
760: if (len > MLEN) {
1.20 thorpej 761: MEXTMALLOC(m, asa->sa_len, M_NOWAIT);
762: if ((m->m_flags & M_EXT) == 0) {
763: m_free(m);
764: return (0);
765: }
766: }
1.1 cgd 767: m->m_len = asa->sa_len;
1.82 christos 768: memcpy(mtod(m, void *), asa, asa->sa_len);
1.1 cgd 769: if (n)
770: n->m_next = m0; /* concatenate data to control */
771: else
772: control = m0;
773: m->m_next = control;
1.43 thorpej 774:
775: SBLASTRECORDCHK(sb, "sbappendaddr 1");
776:
777: for (n = m; n->m_next != NULL; n = n->m_next)
1.1 cgd 778: sballoc(sb, n);
1.43 thorpej 779: sballoc(sb, n);
780: nlast = n;
781: SBLINKRECORD(sb, m);
782:
783: sb->sb_mbtail = nlast;
784: SBLASTMBUFCHK(sb, "sbappendaddr");
785:
786: SBLASTRECORDCHK(sb, "sbappendaddr 2");
787:
1.1 cgd 788: return (1);
789: }
790:
1.63 jonathan 791: /*
792: * Helper for sbappendchainaddr: prepend a struct sockaddr* to
793: * an mbuf chain.
794: */
1.70 perry 795: static inline struct mbuf *
1.81 yamt 796: m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0,
1.64 jonathan 797: const struct sockaddr *asa)
1.63 jonathan 798: {
799: struct mbuf *m;
1.64 jonathan 800: const int salen = asa->sa_len;
1.63 jonathan 801:
802: /* only the first in each chain need be a pkthdr */
803: MGETHDR(m, M_DONTWAIT, MT_SONAME);
804: if (m == 0)
805: return (0);
806: MCLAIM(m, sb->sb_mowner);
1.64 jonathan 807: #ifdef notyet
808: if (salen > MHLEN) {
809: MEXTMALLOC(m, salen, M_NOWAIT);
810: if ((m->m_flags & M_EXT) == 0) {
811: m_free(m);
812: return (0);
813: }
814: }
815: #else
816: KASSERT(salen <= MHLEN);
817: #endif
818: m->m_len = salen;
1.82 christos 819: memcpy(mtod(m, void *), asa, salen);
1.63 jonathan 820: m->m_next = m0;
1.64 jonathan 821: m->m_pkthdr.len = salen + m0->m_pkthdr.len;
1.63 jonathan 822:
823: return m;
824: }
825:
826: int
827: sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa,
828: struct mbuf *m0, int sbprio)
829: {
830: int space;
831: struct mbuf *m, *n, *n0, *nlast;
832: int error;
833:
834: /*
835: * XXX sbprio reserved for encoding priority of this* request:
836: * SB_PRIO_NONE --> honour normal sb limits
837: * SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space,
838: * take whole chain. Intended for large requests
839: * that should be delivered atomically (all, or none).
840: * SB_PRIO_OVERDRAFT -- allow a small (2*MLEN) overflow
841: * over normal socket limits, for messages indicating
842: * buffer overflow in earlier normal/lower-priority messages
843: * SB_PRIO_BESTEFFORT --> ignore limits entirely.
844: * Intended for kernel-generated messages only.
845: * Up to generator to avoid total mbuf resource exhaustion.
846: */
847: (void)sbprio;
848:
849: if (m0 && (m0->m_flags & M_PKTHDR) == 0)
850: panic("sbappendaddrchain");
851:
852: space = sbspace(sb);
1.66 perry 853:
1.63 jonathan 854: #ifdef notyet
1.66 perry 855: /*
1.63 jonathan 856: * Enforce SB_PRIO_* limits as described above.
857: */
858: #endif
859:
860: n0 = NULL;
861: nlast = NULL;
862: for (m = m0; m; m = m->m_nextpkt) {
863: struct mbuf *np;
864:
1.64 jonathan 865: #ifdef MBUFTRACE
1.65 jonathan 866: m_claimm(m, sb->sb_mowner);
1.64 jonathan 867: #endif
868:
1.63 jonathan 869: /* Prepend sockaddr to this record (m) of input chain m0 */
1.64 jonathan 870: n = m_prepend_sockaddr(sb, m, asa);
1.63 jonathan 871: if (n == NULL) {
872: error = ENOBUFS;
873: goto bad;
874: }
875:
876: /* Append record (asa+m) to end of new chain n0 */
877: if (n0 == NULL) {
878: n0 = n;
879: } else {
880: nlast->m_nextpkt = n;
881: }
882: /* Keep track of last record on new chain */
883: nlast = n;
884:
885: for (np = n; np; np = np->m_next)
886: sballoc(sb, np);
887: }
888:
1.64 jonathan 889: SBLASTRECORDCHK(sb, "sbappendaddrchain 1");
890:
1.63 jonathan 891: /* Drop the entire chain of (asa+m) records onto the socket */
892: SBLINKRECORDCHAIN(sb, n0, nlast);
1.64 jonathan 893:
894: SBLASTRECORDCHK(sb, "sbappendaddrchain 2");
895:
1.63 jonathan 896: for (m = nlast; m->m_next; m = m->m_next)
897: ;
898: sb->sb_mbtail = m;
1.64 jonathan 899: SBLASTMBUFCHK(sb, "sbappendaddrchain");
900:
1.63 jonathan 901: return (1);
902:
903: bad:
1.64 jonathan 904: /*
905: * On error, free the prepended addreseses. For consistency
906: * with sbappendaddr(), leave it to our caller to free
907: * the input record chain passed to us as m0.
908: */
909: while ((n = n0) != NULL) {
910: struct mbuf *np;
911:
912: /* Undo the sballoc() of this record */
913: for (np = n; np; np = np->m_next)
914: sbfree(sb, np);
915:
916: n0 = n->m_nextpkt; /* iterate at next prepended address */
917: MFREE(n, np); /* free prepended address (not data) */
918: }
1.66 perry 919: return 0;
1.63 jonathan 920: }
921:
922:
1.7 mycroft 923: int
1.37 lukem 924: sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
1.1 cgd 925: {
1.43 thorpej 926: struct mbuf *m, *mlast, *n;
1.37 lukem 927: int space;
1.1 cgd 928:
1.37 lukem 929: space = 0;
1.1 cgd 930: if (control == 0)
931: panic("sbappendcontrol");
932: for (m = control; ; m = m->m_next) {
933: space += m->m_len;
1.49 matt 934: MCLAIM(m, sb->sb_mowner);
1.1 cgd 935: if (m->m_next == 0)
936: break;
937: }
938: n = m; /* save pointer to last control buffer */
1.49 matt 939: for (m = m0; m; m = m->m_next) {
940: MCLAIM(m, sb->sb_mowner);
1.1 cgd 941: space += m->m_len;
1.49 matt 942: }
1.1 cgd 943: if (space > sbspace(sb))
944: return (0);
945: n->m_next = m0; /* concatenate data to control */
1.43 thorpej 946:
947: SBLASTRECORDCHK(sb, "sbappendcontrol 1");
948:
949: for (m = control; m->m_next != NULL; m = m->m_next)
1.1 cgd 950: sballoc(sb, m);
1.43 thorpej 951: sballoc(sb, m);
952: mlast = m;
953: SBLINKRECORD(sb, control);
954:
955: sb->sb_mbtail = mlast;
956: SBLASTMBUFCHK(sb, "sbappendcontrol");
957:
958: SBLASTRECORDCHK(sb, "sbappendcontrol 2");
959:
1.1 cgd 960: return (1);
961: }
962:
963: /*
964: * Compress mbuf chain m into the socket
965: * buffer sb following mbuf n. If n
966: * is null, the buffer is presumed empty.
967: */
1.7 mycroft 968: void
1.37 lukem 969: sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
1.1 cgd 970: {
1.37 lukem 971: int eor;
972: struct mbuf *o;
1.1 cgd 973:
1.37 lukem 974: eor = 0;
1.1 cgd 975: while (m) {
976: eor |= m->m_flags & M_EOR;
977: if (m->m_len == 0 &&
978: (eor == 0 ||
979: (((o = m->m_next) || (o = n)) &&
980: o->m_type == m->m_type))) {
1.46 thorpej 981: if (sb->sb_lastrecord == m)
982: sb->sb_lastrecord = m->m_next;
1.1 cgd 983: m = m_free(m);
984: continue;
985: }
1.40 thorpej 986: if (n && (n->m_flags & M_EOR) == 0 &&
987: /* M_TRAILINGSPACE() checks buffer writeability */
988: m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */
989: m->m_len <= M_TRAILINGSPACE(n) &&
990: n->m_type == m->m_type) {
1.82 christos 991: memcpy(mtod(n, char *) + n->m_len, mtod(m, void *),
1.1 cgd 992: (unsigned)m->m_len);
993: n->m_len += m->m_len;
994: sb->sb_cc += m->m_len;
995: m = m_free(m);
996: continue;
997: }
998: if (n)
999: n->m_next = m;
1000: else
1001: sb->sb_mb = m;
1.43 thorpej 1002: sb->sb_mbtail = m;
1.1 cgd 1003: sballoc(sb, m);
1004: n = m;
1005: m->m_flags &= ~M_EOR;
1006: m = m->m_next;
1007: n->m_next = 0;
1008: }
1009: if (eor) {
1010: if (n)
1011: n->m_flags |= eor;
1012: else
1.15 christos 1013: printf("semi-panic: sbcompress\n");
1.1 cgd 1014: }
1.43 thorpej 1015: SBLASTMBUFCHK(sb, __func__);
1.1 cgd 1016: }
1017:
1018: /*
1019: * Free all mbufs in a sockbuf.
1020: * Check that all resources are reclaimed.
1021: */
1.7 mycroft 1022: void
1.37 lukem 1023: sbflush(struct sockbuf *sb)
1.1 cgd 1024: {
1025:
1.43 thorpej 1026: KASSERT((sb->sb_flags & SB_LOCK) == 0);
1027:
1.1 cgd 1028: while (sb->sb_mbcnt)
1029: sbdrop(sb, (int)sb->sb_cc);
1.43 thorpej 1030:
1031: KASSERT(sb->sb_cc == 0);
1032: KASSERT(sb->sb_mb == NULL);
1033: KASSERT(sb->sb_mbtail == NULL);
1034: KASSERT(sb->sb_lastrecord == NULL);
1.1 cgd 1035: }
1036:
1037: /*
1038: * Drop data from (the front of) a sockbuf.
1039: */
1.7 mycroft 1040: void
1.37 lukem 1041: sbdrop(struct sockbuf *sb, int len)
1.1 cgd 1042: {
1.37 lukem 1043: struct mbuf *m, *mn, *next;
1.1 cgd 1044:
1045: next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
1046: while (len > 0) {
1047: if (m == 0) {
1048: if (next == 0)
1049: panic("sbdrop");
1050: m = next;
1051: next = m->m_nextpkt;
1052: continue;
1053: }
1054: if (m->m_len > len) {
1055: m->m_len -= len;
1056: m->m_data += len;
1057: sb->sb_cc -= len;
1058: break;
1059: }
1060: len -= m->m_len;
1061: sbfree(sb, m);
1062: MFREE(m, mn);
1063: m = mn;
1064: }
1065: while (m && m->m_len == 0) {
1066: sbfree(sb, m);
1067: MFREE(m, mn);
1068: m = mn;
1069: }
1070: if (m) {
1071: sb->sb_mb = m;
1072: m->m_nextpkt = next;
1073: } else
1074: sb->sb_mb = next;
1.43 thorpej 1075: /*
1.45 thorpej 1076: * First part is an inline SB_EMPTY_FIXUP(). Second part
1.43 thorpej 1077: * makes sure sb_lastrecord is up-to-date if we dropped
1078: * part of the last record.
1079: */
1080: m = sb->sb_mb;
1081: if (m == NULL) {
1082: sb->sb_mbtail = NULL;
1083: sb->sb_lastrecord = NULL;
1084: } else if (m->m_nextpkt == NULL)
1085: sb->sb_lastrecord = m;
1.1 cgd 1086: }
1087:
1088: /*
1089: * Drop a record off the front of a sockbuf
1090: * and move the next record to the front.
1091: */
1.7 mycroft 1092: void
1.37 lukem 1093: sbdroprecord(struct sockbuf *sb)
1.1 cgd 1094: {
1.37 lukem 1095: struct mbuf *m, *mn;
1.1 cgd 1096:
1097: m = sb->sb_mb;
1098: if (m) {
1099: sb->sb_mb = m->m_nextpkt;
1100: do {
1101: sbfree(sb, m);
1102: MFREE(m, mn);
1.11 christos 1103: } while ((m = mn) != NULL);
1.1 cgd 1104: }
1.45 thorpej 1105: SB_EMPTY_FIXUP(sb);
1.19 thorpej 1106: }
1107:
1108: /*
1109: * Create a "control" mbuf containing the specified data
1110: * with the specified type for presentation on a socket buffer.
1111: */
1112: struct mbuf *
1.82 christos 1113: sbcreatecontrol(void *p, int size, int type, int level)
1.19 thorpej 1114: {
1.37 lukem 1115: struct cmsghdr *cp;
1116: struct mbuf *m;
1.19 thorpej 1117:
1.35 itojun 1118: if (CMSG_SPACE(size) > MCLBYTES) {
1.30 itojun 1119: printf("sbcreatecontrol: message too large %d\n", size);
1120: return NULL;
1121: }
1122:
1.19 thorpej 1123: if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
1124: return ((struct mbuf *) NULL);
1.35 itojun 1125: if (CMSG_SPACE(size) > MLEN) {
1.30 itojun 1126: MCLGET(m, M_DONTWAIT);
1127: if ((m->m_flags & M_EXT) == 0) {
1128: m_free(m);
1129: return NULL;
1130: }
1131: }
1.19 thorpej 1132: cp = mtod(m, struct cmsghdr *);
1.26 perry 1133: memcpy(CMSG_DATA(cp), p, size);
1.35 itojun 1134: m->m_len = CMSG_SPACE(size);
1135: cp->cmsg_len = CMSG_LEN(size);
1.19 thorpej 1136: cp->cmsg_level = level;
1137: cp->cmsg_type = type;
1138: return (m);
1.1 cgd 1139: }
CVSweb <webmaster@jp.NetBSD.org>