version 1.141.2.4, 2008/03/23 02:05:01 |
version 1.159, 2008/04/14 15:42:20 |
|
|
/* uipc_socket.c,v 1.141.2.3 2008/01/09 01:56:28 matt Exp */ |
/* $NetBSD$ */ |
|
|
/*- |
/*- |
* Copyright (c) 2002, 2007, 2008 The NetBSD Foundation, Inc. |
* Copyright (c) 2002, 2007, 2008 The NetBSD Foundation, Inc. |
|
|
*/ |
*/ |
|
|
/* |
/* |
|
* Copyright (c) 2004 The FreeBSD Foundation |
|
* Copyright (c) 2004 Robert Watson |
* Copyright (c) 1982, 1986, 1988, 1990, 1993 |
* Copyright (c) 1982, 1986, 1988, 1990, 1993 |
* The Regents of the University of California. All rights reserved. |
* The Regents of the University of California. All rights reserved. |
* |
* |
|
|
*/ |
*/ |
|
|
#include <sys/cdefs.h> |
#include <sys/cdefs.h> |
__KERNEL_RCSID(0, "uipc_socket.c,v 1.141.2.3 2008/01/09 01:56:28 matt Exp"); |
__KERNEL_RCSID(0, "$NetBSD$"); |
|
|
#include "opt_sock_counters.h" |
#include "opt_sock_counters.h" |
#include "opt_sosend_loan.h" |
#include "opt_sosend_loan.h" |
Line 253 sokvafree(vaddr_t sva, vsize_t len) |
|
Line 255 sokvafree(vaddr_t sva, vsize_t len) |
|
static void |
static void |
sodoloanfree(struct vm_page **pgs, void *buf, size_t size) |
sodoloanfree(struct vm_page **pgs, void *buf, size_t size) |
{ |
{ |
vaddr_t va, sva, eva; |
vaddr_t sva, eva; |
vsize_t len; |
vsize_t len; |
paddr_t pa; |
int npgs; |
int i, npgs; |
|
|
KASSERT(pgs != NULL); |
|
|
eva = round_page((vaddr_t) buf + size); |
eva = round_page((vaddr_t) buf + size); |
sva = trunc_page((vaddr_t) buf); |
sva = trunc_page((vaddr_t) buf); |
len = eva - sva; |
len = eva - sva; |
npgs = len >> PAGE_SHIFT; |
npgs = len >> PAGE_SHIFT; |
|
|
if (__predict_false(pgs == NULL)) { |
|
pgs = alloca(npgs * sizeof(*pgs)); |
|
|
|
for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) { |
|
if (pmap_extract(pmap_kernel(), va, &pa) == false) |
|
panic("sodoloanfree: va 0x%lx not mapped", va); |
|
pgs[i] = PHYS_TO_VM_PAGE(pa); |
|
} |
|
} |
|
|
|
pmap_kremove(sva, len); |
pmap_kremove(sva, len); |
pmap_update(pmap_kernel()); |
pmap_update(pmap_kernel()); |
uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE); |
uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE); |
Line 313 sodopendfreel(void) |
|
Line 306 sodopendfreel(void) |
|
|
|
for (; m != NULL; m = next) { |
for (; m != NULL; m = next) { |
next = m->m_next; |
next = m->m_next; |
|
KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) == 0); |
|
KASSERT(m->m_ext.ext_refcnt == 0); |
|
|
rv += m->m_ext.ext_size; |
rv += m->m_ext.ext_size; |
sodoloanfree((m->m_flags & M_EXT_PAGES) ? |
sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf, |
m->m_ext.ext_pgs : NULL, m->m_ext.ext_buf, |
|
m->m_ext.ext_size); |
m->m_ext.ext_size); |
pool_cache_put(mb_cache, m); |
pool_cache_put(mb_cache, m); |
} |
} |
|
|
soloanfree(struct mbuf *m, void *buf, size_t size, void *arg) |
soloanfree(struct mbuf *m, void *buf, size_t size, void *arg) |
{ |
{ |
|
|
if (m == NULL) { |
KASSERT(m != NULL); |
|
|
/* |
|
* called from MEXTREMOVE. |
|
*/ |
|
|
|
sodoloanfree(NULL, buf, size); |
|
return; |
|
} |
|
|
|
/* |
/* |
* postpone freeing mbuf. |
* postpone freeing mbuf. |
Line 361 sosend_loan(struct socket *so, struct ui |
|
Line 347 sosend_loan(struct socket *so, struct ui |
|
struct iovec *iov = uio->uio_iov; |
struct iovec *iov = uio->uio_iov; |
vaddr_t sva, eva; |
vaddr_t sva, eva; |
vsize_t len; |
vsize_t len; |
vaddr_t lva, va; |
vaddr_t lva; |
int npgs, i, error; |
int npgs, error; |
|
vaddr_t va; |
|
int i; |
|
|
if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) |
if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) |
return (0); |
return (0); |
|
|
fsocreate(int domain, struct socket **sop, int type, int protocol, |
fsocreate(int domain, struct socket **sop, int type, int protocol, |
struct lwp *l, int *fdout) |
struct lwp *l, int *fdout) |
{ |
{ |
struct filedesc *fdp; |
|
struct socket *so; |
struct socket *so; |
struct file *fp; |
struct file *fp; |
int fd, error; |
int fd, error; |
|
|
fdp = l->l_proc->p_fd; |
if ((error = fd_allocfile(&fp, &fd)) != 0) |
/* falloc() will use the descriptor for us */ |
|
if ((error = falloc(l, &fp, &fd)) != 0) |
|
return (error); |
return (error); |
fp->f_flag = FREAD|FWRITE; |
fp->f_flag = FREAD|FWRITE; |
fp->f_type = DTYPE_SOCKET; |
fp->f_type = DTYPE_SOCKET; |
fp->f_ops = &socketops; |
fp->f_ops = &socketops; |
error = socreate(domain, &so, type, protocol, l); |
error = socreate(domain, &so, type, protocol, l); |
if (error != 0) { |
if (error != 0) { |
FILE_UNUSE(fp, l); |
fd_abort(curproc, fp, fd); |
fdremove(fdp, fd); |
|
ffree(fp); |
|
} else { |
} else { |
if (sop != NULL) |
if (sop != NULL) |
*sop = so; |
*sop = so; |
fp->f_data = so; |
fp->f_data = so; |
FILE_SET_MATURE(fp); |
fd_affix(curproc, fp, fd); |
FILE_UNUSE(fp, l); |
|
*fdout = fd; |
*fdout = fd; |
} |
} |
return error; |
return error; |
Line 585 solisten(struct socket *so, int backlog, |
|
Line 567 solisten(struct socket *so, int backlog, |
|
int s, error; |
int s, error; |
|
|
s = splsoftnet(); |
s = splsoftnet(); |
|
if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | |
|
SS_ISDISCONNECTING)) != 0) |
|
return (EOPNOTSUPP); |
error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, |
error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, |
NULL, NULL, l); |
NULL, NULL, l); |
if (error != 0) { |
if (error != 0) { |
Line 834 sosend(struct socket *so, struct mbuf *a |
|
Line 819 sosend(struct socket *so, struct mbuf *a |
|
dontroute = |
dontroute = |
(flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && |
(flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && |
(so->so_proto->pr_flags & PR_ATOMIC); |
(so->so_proto->pr_flags & PR_ATOMIC); |
if (p) |
if (l) |
p->p_stats->p_ru.ru_msgsnd++; |
l->l_ru.ru_msgsnd++; |
if (control) |
if (control) |
clen = control->m_len; |
clen = control->m_len; |
#define snderr(errno) { error = errno; splx(s); goto release; } |
#define snderr(errno) { error = errno; splx(s); goto release; } |
Line 988 sosend(struct socket *so, struct mbuf *a |
|
Line 973 sosend(struct socket *so, struct mbuf *a |
|
} |
} |
|
|
/* |
/* |
|
* Following replacement or removal of the first mbuf on the first |
|
* mbuf chain of a socket buffer, push necessary state changes back |
|
* into the socket buffer so that other consumers see the values |
|
* consistently. 'nextrecord' is the caller's locally stored value of |
|
* the original value of sb->sb_mb->m_nextpkt which must be restored |
|
* when the lead mbuf changes. NOTE: 'nextrecord' may be NULL. |
|
*/ |
|
static void |
|
sbsync(struct sockbuf *sb, struct mbuf *nextrecord) |
|
{ |
|
|
|
/* |
|
* First, update for the new value of nextrecord. If necessary, |
|
* make it the first record. |
|
*/ |
|
if (sb->sb_mb != NULL) |
|
sb->sb_mb->m_nextpkt = nextrecord; |
|
else |
|
sb->sb_mb = nextrecord; |
|
|
|
/* |
|
* Now update any dependent socket buffer fields to reflect |
|
* the new state. This is an inline of SB_EMPTY_FIXUP, with |
|
* the addition of a second clause that takes care of the |
|
* case where sb_mb has been updated, but remains the last |
|
* record. |
|
*/ |
|
if (sb->sb_mb == NULL) { |
|
sb->sb_mbtail = NULL; |
|
sb->sb_lastrecord = NULL; |
|
} else if (sb->sb_mb->m_nextpkt == NULL) |
|
sb->sb_lastrecord = sb->sb_mb; |
|
} |
|
|
|
/* |
* Implement receive operations on a socket. |
* Implement receive operations on a socket. |
* We depend on the way that records are added to the sockbuf |
* We depend on the way that records are added to the sockbuf |
* by sbappend*. In particular, each record (mbufs linked through m_next) |
* by sbappend*. In particular, each record (mbufs linked through m_next) |
Line 1125 soreceive(struct socket *so, struct mbuf |
|
Line 1145 soreceive(struct socket *so, struct mbuf |
|
dontblock: |
dontblock: |
/* |
/* |
* On entry here, m points to the first record of the socket buffer. |
* On entry here, m points to the first record of the socket buffer. |
* While we process the initial mbufs containing address and control |
* From this point onward, we maintain 'nextrecord' as a cache of the |
* info, we save a copy of m->m_nextpkt into nextrecord. |
* pointer to the next record in the socket buffer. We must keep the |
|
* various socket buffer pointers and local stack versions of the |
|
* pointers in sync, pushing out modifications before dropping the |
|
* IPL, and re-reading them when picking it up. |
|
* |
|
* Otherwise, we will race with the network stack appending new data |
|
* or records onto the socket buffer by using inconsistent/stale |
|
* versions of the field, possibly resulting in socket buffer |
|
* corruption. |
|
* |
|
* By holding the high-level sblock(), we prevent simultaneous |
|
* readers from pulling off the front of the socket buffer. |
*/ |
*/ |
if (l != NULL) |
if (l != NULL) |
l->l_proc->p_stats->p_ru.ru_msgrcv++; |
l->l_ru.ru_msgrcv++; |
KASSERT(m == so->so_rcv.sb_mb); |
KASSERT(m == so->so_rcv.sb_mb); |
SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); |
SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); |
SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); |
SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); |
Line 1156 soreceive(struct socket *so, struct mbuf |
|
Line 1187 soreceive(struct socket *so, struct mbuf |
|
MFREE(m, so->so_rcv.sb_mb); |
MFREE(m, so->so_rcv.sb_mb); |
m = so->so_rcv.sb_mb; |
m = so->so_rcv.sb_mb; |
} |
} |
|
sbsync(&so->so_rcv, nextrecord); |
} |
} |
} |
} |
while (m != NULL && m->m_type == MT_CONTROL && error == 0) { |
|
if (flags & MSG_PEEK) { |
/* |
if (controlp != NULL) |
* Process one or more MT_CONTROL mbufs present before any data mbufs |
*controlp = m_copy(m, 0, m->m_len); |
* in the first mbuf chain on the socket buffer. If MSG_PEEK, we |
m = m->m_next; |
* just copy the data; if !MSG_PEEK, we call into the protocol to |
} else { |
* perform externalization (or freeing if controlp == NULL). |
sbfree(&so->so_rcv, m); |
*/ |
mbuf_removed = 1; |
if (__predict_false(m != NULL && m->m_type == MT_CONTROL)) { |
if (controlp != NULL) { |
struct mbuf *cm = NULL, *cmn; |
if (dom->dom_externalize && l && |
struct mbuf **cme = &cm; |
mtod(m, struct cmsghdr *)->cmsg_type == |
|
SCM_RIGHTS) |
do { |
error = (*dom->dom_externalize)(m, l); |
if (flags & MSG_PEEK) { |
*controlp = m; |
if (controlp != NULL) { |
|
*controlp = m_copy(m, 0, m->m_len); |
|
controlp = &(*controlp)->m_next; |
|
} |
|
m = m->m_next; |
|
} else { |
|
sbfree(&so->so_rcv, m); |
so->so_rcv.sb_mb = m->m_next; |
so->so_rcv.sb_mb = m->m_next; |
m->m_next = NULL; |
m->m_next = NULL; |
|
*cme = m; |
|
cme = &(*cme)->m_next; |
m = so->so_rcv.sb_mb; |
m = so->so_rcv.sb_mb; |
|
} |
|
} while (m != NULL && m->m_type == MT_CONTROL); |
|
if ((flags & MSG_PEEK) == 0) |
|
sbsync(&so->so_rcv, nextrecord); |
|
for (; cm != NULL; cm = cmn) { |
|
cmn = cm->m_next; |
|
cm->m_next = NULL; |
|
type = mtod(cm, struct cmsghdr *)->cmsg_type; |
|
if (controlp != NULL) { |
|
if (dom->dom_externalize != NULL && |
|
type == SCM_RIGHTS) { |
|
splx(s); |
|
error = (*dom->dom_externalize)(cm, l); |
|
s = splsoftnet(); |
|
} |
|
*controlp = cm; |
|
while (*controlp != NULL) |
|
controlp = &(*controlp)->m_next; |
} else { |
} else { |
/* |
/* |
* Dispose of any SCM_RIGHTS message that went |
* Dispose of any SCM_RIGHTS message that went |
* through the read path rather than recv. |
* through the read path rather than recv. |
*/ |
*/ |
if (dom->dom_dispose && |
if (dom->dom_dispose != NULL && |
mtod(m, struct cmsghdr *)->cmsg_type == SCM_RIGHTS) |
type == SCM_RIGHTS) { |
(*dom->dom_dispose)(m); |
splx(s); |
MFREE(m, so->so_rcv.sb_mb); |
(*dom->dom_dispose)(cm); |
m = so->so_rcv.sb_mb; |
s = splsoftnet(); |
|
} |
|
m_freem(cm); |
} |
} |
} |
} |
if (controlp != NULL) { |
if (m != NULL) |
orig_resid = 0; |
nextrecord = so->so_rcv.sb_mb->m_nextpkt; |
controlp = &(*controlp)->m_next; |
else |
} |
nextrecord = so->so_rcv.sb_mb; |
|
orig_resid = 0; |
} |
} |
|
|
/* |
/* If m is non-NULL, we have some data to read. */ |
* If m is non-NULL, we have some data to read. From now on, |
if (__predict_true(m != NULL)) { |
* make sure to keep sb_lastrecord consistent when working on |
|
* the last packet on the chain (nextrecord == NULL) and we |
|
* change m->m_nextpkt. |
|
*/ |
|
if (m != NULL) { |
|
if ((flags & MSG_PEEK) == 0) { |
|
m->m_nextpkt = nextrecord; |
|
/* |
|
* If nextrecord == NULL (this is a single chain), |
|
* then sb_lastrecord may not be valid here if m |
|
* was changed earlier. |
|
*/ |
|
if (nextrecord == NULL) { |
|
KASSERT(so->so_rcv.sb_mb == m); |
|
so->so_rcv.sb_lastrecord = m; |
|
} |
|
} |
|
type = m->m_type; |
type = m->m_type; |
if (type == MT_OOBDATA) |
if (type == MT_OOBDATA) |
flags |= MSG_OOB; |
flags |= MSG_OOB; |
} else { |
|
if ((flags & MSG_PEEK) == 0) { |
|
KASSERT(so->so_rcv.sb_mb == m); |
|
so->so_rcv.sb_mb = nextrecord; |
|
SB_EMPTY_FIXUP(&so->so_rcv); |
|
} |
|
} |
} |
SBLASTRECORDCHK(&so->so_rcv, "soreceive 2"); |
SBLASTRECORDCHK(&so->so_rcv, "soreceive 2"); |
SBLASTMBUFCHK(&so->so_rcv, "soreceive 2"); |
SBLASTMBUFCHK(&so->so_rcv, "soreceive 2"); |
Line 1692 filt_sordetach(struct knote *kn) |
|
Line 1730 filt_sordetach(struct knote *kn) |
|
{ |
{ |
struct socket *so; |
struct socket *so; |
|
|
so = (struct socket *)kn->kn_fp->f_data; |
so = ((file_t *)kn->kn_obj)->f_data; |
SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext); |
SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext); |
if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist)) |
if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist)) |
so->so_rcv.sb_flags &= ~SB_KNOTE; |
so->so_rcv.sb_flags &= ~SB_KNOTE; |
Line 1704 filt_soread(struct knote *kn, long hint) |
|
Line 1742 filt_soread(struct knote *kn, long hint) |
|
{ |
{ |
struct socket *so; |
struct socket *so; |
|
|
so = (struct socket *)kn->kn_fp->f_data; |
so = ((file_t *)kn->kn_obj)->f_data; |
kn->kn_data = so->so_rcv.sb_cc; |
kn->kn_data = so->so_rcv.sb_cc; |
if (so->so_state & SS_CANTRCVMORE) { |
if (so->so_state & SS_CANTRCVMORE) { |
kn->kn_flags |= EV_EOF; |
kn->kn_flags |= EV_EOF; |
Line 1723 filt_sowdetach(struct knote *kn) |
|
Line 1761 filt_sowdetach(struct knote *kn) |
|
{ |
{ |
struct socket *so; |
struct socket *so; |
|
|
so = (struct socket *)kn->kn_fp->f_data; |
so = ((file_t *)kn->kn_obj)->f_data; |
SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext); |
SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext); |
if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist)) |
if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist)) |
so->so_snd.sb_flags &= ~SB_KNOTE; |
so->so_snd.sb_flags &= ~SB_KNOTE; |
Line 1735 filt_sowrite(struct knote *kn, long hint |
|
Line 1773 filt_sowrite(struct knote *kn, long hint |
|
{ |
{ |
struct socket *so; |
struct socket *so; |
|
|
so = (struct socket *)kn->kn_fp->f_data; |
so = ((file_t *)kn->kn_obj)->f_data; |
kn->kn_data = sbspace(&so->so_snd); |
kn->kn_data = sbspace(&so->so_snd); |
if (so->so_state & SS_CANTSENDMORE) { |
if (so->so_state & SS_CANTSENDMORE) { |
kn->kn_flags |= EV_EOF; |
kn->kn_flags |= EV_EOF; |
Line 1758 filt_solisten(struct knote *kn, long hin |
|
Line 1796 filt_solisten(struct knote *kn, long hin |
|
{ |
{ |
struct socket *so; |
struct socket *so; |
|
|
so = (struct socket *)kn->kn_fp->f_data; |
so = ((file_t *)kn->kn_obj)->f_data; |
|
|
/* |
/* |
* Set kn_data to number of incoming connections, not |
* Set kn_data to number of incoming connections, not |
Line 1781 soo_kqfilter(struct file *fp, struct kno |
|
Line 1819 soo_kqfilter(struct file *fp, struct kno |
|
struct socket *so; |
struct socket *so; |
struct sockbuf *sb; |
struct sockbuf *sb; |
|
|
so = (struct socket *)kn->kn_fp->f_data; |
so = ((file_t *)kn->kn_obj)->f_data; |
switch (kn->kn_filter) { |
switch (kn->kn_filter) { |
case EVFILT_READ: |
case EVFILT_READ: |
if (so->so_options & SO_ACCEPTCONN) |
if (so->so_options & SO_ACCEPTCONN) |