[BACK]Return to npf_conn.c CVS log [TXT][DIR] Up to [cvs.NetBSD.org] / src / sys / net / npf

Annotation of src/sys/net/npf/npf_conn.c, Revision 1.8

1.8     ! rmind       1: /*     $NetBSD: npf_conn.c,v 1.7 2014/07/25 23:07:21 rmind Exp $       */
1.1       rmind       2:
                      3: /*-
                      4:  * Copyright (c) 2014 Mindaugas Rasiukevicius <rmind at netbsd org>
                      5:  * Copyright (c) 2010-2014 The NetBSD Foundation, Inc.
                      6:  * All rights reserved.
                      7:  *
                      8:  * This material is based upon work partially supported by The
                      9:  * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
                     10:  *
                     11:  * Redistribution and use in source and binary forms, with or without
                     12:  * modification, are permitted provided that the following conditions
                     13:  * are met:
                     14:  * 1. Redistributions of source code must retain the above copyright
                     15:  *    notice, this list of conditions and the following disclaimer.
                     16:  * 2. Redistributions in binary form must reproduce the above copyright
                     17:  *    notice, this list of conditions and the following disclaimer in the
                     18:  *    documentation and/or other materials provided with the distribution.
                     19:  *
                     20:  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
                     21:  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
                     22:  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
                     23:  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
                     24:  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     25:  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     26:  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     27:  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     28:  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     29:  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     30:  * POSSIBILITY OF SUCH DAMAGE.
                     31:  */
                     32:
                     33: /*
                     34:  * NPF connection tracking for stateful filtering and translation.
                     35:  *
                     36:  * Overview
                     37:  *
                     38:  *     Connection direction is identified by the direction of its first
                     39:  *     packet.  Packets can be incoming or outgoing with respect to an
                     40:  *     interface.  To describe the packet in the context of connection
                     41:  *     direction we will use the terms "forwards stream" and "backwards
                     42:  *     stream".  All connections have two keys and thus two entries:
                     43:  *
                     44:  *             npf_conn_t::c_forw_entry for the forwards stream and
                     45:  *             npf_conn_t::c_back_entry for the backwards stream.
                     46:  *
                     47:  *     The keys are formed from the 5-tuple (source/destination address,
                     48:  *     source/destination port and the protocol).  Additional matching
                     49:  *     is performed for the interface (a common behaviour is equivalent
                     50:  *     to the 6-tuple lookup including the interface ID).  Note that the
                     51:  *     key may be formed using translated values in a case of NAT.
                     52:  *
                     53:  *     Connections can serve two purposes: for the implicit passing or
                     54:  *     to accommodate the dynamic NAT.  Connections for the former purpose
                     55:  *     are created by the rules with "stateful" attribute and are used for
                     56:  *     stateful filtering.  Such connections indicate that the packet of
                     57:  *     the backwards stream should be passed without inspection of the
                     58:  *     ruleset.  The other purpose is to associate a dynamic NAT mechanism
                     59:  *     with a connection.  Such connections are created by the NAT policies
                     60:  *     and they have a relationship with NAT translation structure via
                     61:  *     npf_conn_t::c_nat.  A single connection can serve both purposes,
                     62:  *     which is a common case.
                     63:  *
                     64:  * Connection life-cycle
                     65:  *
                     66:  *     Connections are established when a packet matches said rule or
                     67:  *     NAT policy.  Both keys of the established connection are inserted
                     68:  *     into the connection database.  A garbage collection thread
                     69:  *     periodically scans all connections and depending on connection
                     70:  *     properties (e.g. last activity time, protocol) removes connection
                     71:  *     entries and expires the actual connections.
                     72:  *
                     73:  *     Each connection has a reference count.  The reference is acquired
                     74:  *     on lookup and should be released by the caller.  It guarantees that
                     75:  *     the connection will not be destroyed, although it may be expired.
                     76:  *
                     77:  * Synchronisation
                     78:  *
                     79:  *     Connection database is accessed in a lock-less manner by the main
                     80:  *     routines: npf_conn_inspect() and npf_conn_establish().  Since they
                     81:  *     are always called from a software interrupt, the database is
                     82:  *     protected using passive serialisation.  The main place which can
                     83:  *     destroy a connection is npf_conn_worker().  The database itself
                     84:  *     can be replaced and destroyed in npf_conn_load().
                     85:  *
                     86:  * ALG support
                     87:  *
                     88:  *     Application-level gateways (ALGs) can override generic connection
                     89:  *     inspection (npf_alg_conn() call in npf_conn_inspect() function) by
                     90:  *     performing their own lookup using different key.  Recursive call
                     91:  *     to npf_conn_inspect() is not allowed.  The ALGs ought to use the
                     92:  *     npf_conn_lookup() function for this purpose.
                     93:  *
                     94:  * Lock order
                     95:  *
1.6       rmind      96:  *     npf_config_lock ->
                     97:  *             conn_lock ->
                     98:  *                     npf_conn_t::c_lock
1.1       rmind      99:  */
                    100:
                    101: #include <sys/cdefs.h>
1.8     ! rmind     102: __KERNEL_RCSID(0, "$NetBSD: npf_conn.c,v 1.7 2014/07/25 23:07:21 rmind Exp $");
1.1       rmind     103:
                    104: #include <sys/param.h>
                    105: #include <sys/types.h>
                    106:
                    107: #include <netinet/in.h>
                    108: #include <netinet/tcp.h>
                    109:
                    110: #include <sys/atomic.h>
                    111: #include <sys/condvar.h>
                    112: #include <sys/kmem.h>
                    113: #include <sys/kthread.h>
                    114: #include <sys/mutex.h>
                    115: #include <net/pfil.h>
                    116: #include <sys/pool.h>
                    117: #include <sys/queue.h>
                    118: #include <sys/systm.h>
                    119:
                    120: #define __NPF_CONN_PRIVATE
                    121: #include "npf_conn.h"
                    122: #include "npf_impl.h"
                    123:
                    124: /*
                    125:  * Connection flags: PFIL_IN and PFIL_OUT values are reserved for direction.
                          *
                          * The compile-time assertion below checks that PFIL_ALL is exactly
                          * 0x001 | 0x002, i.e. the two lowest bits, so that the CONN_* flags
                          * defined from 0x004 upwards cannot collide with the direction bits
                          * kept in the same npf_conn_t::c_flags word.
                    126:  */
                    127: CTASSERT(PFIL_ALL == (0x001 | 0x002));
                    128: #define        CONN_ACTIVE     0x004   /* visible on inspection */
                    129: #define        CONN_PASS       0x008   /* perform implicit passing */
                    130: #define        CONN_EXPIRE     0x010   /* explicitly expire */
                    131: #define        CONN_REMOVED    0x020   /* "forw/back" entries removed */
                    132:
                    133: /*
1.6       rmind     134:  * Connection tracking state: disabled (off) or enabled (on).
                          *
                          * The state is read without locking in npf_conn_trackable_p() and
                          * is written by npf_conn_load() and npf_conn_tracking().
1.1       rmind     135:  */
1.6       rmind     136: enum { CONN_TRACKING_OFF, CONN_TRACKING_ON };
1.1       rmind     137: static volatile int    conn_tracking   __cacheline_aligned;
                    138:
                    139: /* Connection tracking database, connection cache and the lock. */
                    140: static npf_conndb_t *  conn_db         __read_mostly;
                    141: static pool_cache_t    conn_cache      __read_mostly;
                    142: static kmutex_t                conn_lock       __cacheline_aligned;
                    143:
                         /* Forward declarations of file-local routines used before definition. */
                    144: static void    npf_conn_worker(void);
                    145: static void    npf_conn_destroy(npf_conn_t *);
                    146:
                    147: /*
                    148:  * npf_conn_sys{init,fini}: initialise/destroy connection tracking.
                          *
                          * npf_conn_sysinit() creates the pool cache for npf_conn_t objects,
                          * initialises conn_lock, creates an empty connection database and
                          * registers npf_conn_worker() as a worker.  Connection tracking is
                          * left disabled.  npf_conn_sysfini() releases the same resources.
                    149:  */
                    150:
                    151: void
                    152: npf_conn_sysinit(void)
                    153: {
                                /*
                                 * Pool cache of connection structures; coherency_unit is
                                 * passed as the alignment argument (see pool_cache(9)).
                                 */
                    154:        conn_cache = pool_cache_init(sizeof(npf_conn_t), coherency_unit,
                    155:            0, 0, "npfconpl", NULL, IPL_NET, NULL, NULL, NULL);
                    156:        mutex_init(&conn_lock, MUTEX_DEFAULT, IPL_NONE);
                                /* Start with tracking off and an empty database. */
                    157:        conn_tracking = CONN_TRACKING_OFF;
1.6       rmind     158:        conn_db = npf_conndb_create();
1.1       rmind     159:
                    160:        npf_worker_register(npf_conn_worker);
                    161: }
                    162:
                    163: void
                    164: npf_conn_sysfini(void)
                    165: {
1.6       rmind     166:        /* Note: the caller should have flushed the connections. */
                    167:        KASSERT(conn_tracking == CONN_TRACKING_OFF);
                                /* Unregister the worker before releasing the resources below. */
1.1       rmind     168:        npf_worker_unregister(npf_conn_worker);
                    169:
                                /* Release the database, the pool cache and the lock. */
1.6       rmind     170:        npf_conndb_destroy(conn_db);
1.1       rmind     171:        pool_cache_destroy(conn_cache);
                    172:        mutex_destroy(&conn_lock);
                    173: }
                    174:
                    175: /*
1.6       rmind     176:  * npf_conn_load: perform the load by flushing the current connection
                    177:  * database and replacing it with the new one or just destroying.
1.1       rmind     178:  *
1.6       rmind     179:  * => The caller must disable the connection tracking and ensure that
                    180:  *    there are no connection database lookups or references in-flight.
                          * => The caller must hold the NPF configuration lock (asserted).
                          * => ndb: the database to install.  If NULL, the current database
                          *    is kept and nothing is flushed or destroyed by this function.
                          * => track: if true, connection tracking is switched on.  If false,
                          *    the tracking state is left as it is (it is not switched off).
1.1       rmind     181:  */
1.6       rmind     182: void
                    183: npf_conn_load(npf_conndb_t *ndb, bool track)
1.1       rmind     184: {
1.6       rmind     185:        npf_conndb_t *odb = NULL;
1.1       rmind     186:
1.6       rmind     187:        KASSERT(npf_config_locked_p());
1.1       rmind     188:
                    189:        /*
1.6       rmind     190:         * The connection database is in the quiescent state.
                    191:         * Prevent G/C thread from running and install a new database.
1.1       rmind     192:         */
1.6       rmind     193:        mutex_enter(&conn_lock);
                    194:        if (ndb) {
                    195:                KASSERT(conn_tracking == CONN_TRACKING_OFF);
                    196:                odb = conn_db;
                    197:                conn_db = ndb;
                                        /*
                                         * Full memory barrier: the store to conn_db above is
                                         * ordered before the store to conn_tracking below.
                                         */
                    198:                membar_sync();
                    199:        }
                    200:        if (track) {
                    201:                /* After this point lookups start flying in. */
                    202:                conn_tracking = CONN_TRACKING_ON;
1.1       rmind     203:        }
1.6       rmind     204:        mutex_exit(&conn_lock);
1.1       rmind     205:
                                /* The old database is torn down outside of conn_lock. */
                    206:        if (odb) {
1.6       rmind     207:                /*
                    208:                 * Flush all, no sync since the caller did it for us.
                    209:                 * Also, release the pool cache memory.
                                         *
                                         * NOTE(review): the two boolean arguments presumably
                                         * mean (flush, sync) -- confirm against npf_conn_gc().
                    210:                 */
                    211:                npf_conn_gc(odb, true, false);
1.1       rmind     212:                npf_conndb_destroy(odb);
1.6       rmind     213:                pool_cache_invalidate(conn_cache);
1.1       rmind     214:        }
                    215: }
                    216:
                    217: /*
                    218:  * npf_conn_tracking: enable/disable connection tracking.
                          *
                          * => The caller must hold the NPF configuration lock (asserted).
                          * => Only the conn_tracking flag is changed here; conn_lock is not
                          *    taken and the connection database is not touched.
                    219:  */
                    220: void
                    221: npf_conn_tracking(bool track)
                    222: {
1.6       rmind     223:        KASSERT(npf_config_locked_p());
                    224:        conn_tracking = track ? CONN_TRACKING_ON : CONN_TRACKING_OFF;
1.1       rmind     225: }
                    226:
                         /*
                          * npf_conn_trackable_p: check whether the packet can be tracked.
                          *
                          * => Returns true only if connection tracking is enabled and both
                          *    NPC_IP46 and NPC_LAYER4 are cached in the given packet cache.
                          */
1.6       rmind     227: static inline bool
1.1       rmind     228: npf_conn_trackable_p(const npf_cache_t *npc)
                    229: {
                    230:        /*
                    231:         * Check if connection tracking is on.  Also, if layer 3 and 4 are
                    232:         * not cached - protocol is not supported or packet is invalid.
                    233:         */
                    234:        if (conn_tracking != CONN_TRACKING_ON) {
                    235:                return false;
                    236:        }
                    237:        if (!npf_iscached(npc, NPC_IP46) || !npf_iscached(npc, NPC_LAYER4)) {
                    238:                return false;
                    239:        }
                    240:        return true;
                    241: }
                    242:
                    243: /*
                    244:  * npf_conn_conkey: construct a key for the connection lookup.
1.8     ! rmind     245:  *
        !           246:  * => Returns the key length in bytes or zero on failure.
                          * => npc: packet cache; the protocol, the address length, the
                          *    addresses and the layer 4 header are read from it.
                          * => key: output; the ck_key[] words are filled in.
                          * => forw: true to build the key of the forwards stream; false to
                          *    build it with the source and destination swapped.
                          * => Zero is returned for an unsupported protocol and for ICMP or
                          *    ICMPv6 packets which do not have NPC_ICMP_ID cached.  The key
                          *    is not written in that case.
1.1       rmind     247:  */
1.8     ! rmind     248: unsigned
1.1       rmind     249: npf_conn_conkey(const npf_cache_t *npc, npf_connkey_t *key, const bool forw)
                    250: {
                    251:        const u_int alen = npc->npc_alen;
                    252:        const struct tcphdr *th;
                    253:        const struct udphdr *uh;
                    254:        u_int keylen, isrc, idst;
                    255:        uint16_t id[2];
                    256:
                                /*
                                 * Fetch the source and destination IDs: the ports for TCP
                                 * and UDP, the ICMP ID (used for both) for ICMP and ICMPv6.
                                 * The values are copied as they are stored in the header.
                                 */
                    257:        switch (npc->npc_proto) {
                    258:        case IPPROTO_TCP:
                    259:                KASSERT(npf_iscached(npc, NPC_TCP));
                    260:                th = npc->npc_l4.tcp;
                    261:                id[NPF_SRC] = th->th_sport;
                    262:                id[NPF_DST] = th->th_dport;
                    263:                break;
                    264:        case IPPROTO_UDP:
                    265:                KASSERT(npf_iscached(npc, NPC_UDP));
                    266:                uh = npc->npc_l4.udp;
                    267:                id[NPF_SRC] = uh->uh_sport;
                    268:                id[NPF_DST] = uh->uh_dport;
                    269:                break;
                    270:        case IPPROTO_ICMP:
                    271:                if (npf_iscached(npc, NPC_ICMP_ID)) {
                    272:                        const struct icmp *ic = npc->npc_l4.icmp;
                    273:                        id[NPF_SRC] = ic->icmp_id;
                    274:                        id[NPF_DST] = ic->icmp_id;
                    275:                        break;
                    276:                }
1.8     ! rmind     277:                return 0;
1.1       rmind     278:        case IPPROTO_ICMPV6:
                    279:                if (npf_iscached(npc, NPC_ICMP_ID)) {
                    280:                        const struct icmp6_hdr *ic6 = npc->npc_l4.icmp6;
                    281:                        id[NPF_SRC] = ic6->icmp6_id;
                    282:                        id[NPF_DST] = ic6->icmp6_id;
                    283:                        break;
                    284:                }
1.8     ! rmind     285:                return 0;
1.1       rmind     286:        default:
                    287:                /* Unsupported protocol. */
1.8     ! rmind     288:                return 0;
1.1       rmind     289:        }
                    290:
                                /* Select which side goes first: swapped for the backwards key. */
                    291:        if (__predict_true(forw)) {
                    292:                isrc = NPF_SRC, idst = NPF_DST;
                    293:        } else {
                    294:                isrc = NPF_DST, idst = NPF_SRC;
                    295:        }
                    296:
1.8     ! rmind     297:        /*
        !           298:         * Construct a key formed out of 32-bit integers.  The key layout:
        !           299:         *
        !           300:         * Field: | proto |  alen | src-id | dst-id | src-addr | dst-addr |
        !           301:         *        +-------+-------+--------+--------+----------+----------+
        !           302:         * Bits:  |   16  |   16  |   16   |   16   |  32-128  |  32-128  |
        !           303:         *
        !           304:         * The source and destination are inverted if the key is for the
        !           305:         * backwards stream (forw == false).  The address length depends
        !           306:         * on the 'alen' field; it is a length in bytes, either 4 or 16.
                                 *
                                 * The protocol and 'alen' share ck_key[0] (protocol in the
                                 * upper 16 bits); the IDs share ck_key[1] (source ID in the
                                 * upper 16 bits).  The addresses start at ck_key[2].
        !           307:         */
        !           308:
1.1       rmind     309:        key->ck_key[0] = ((uint32_t)npc->npc_proto << 16) | (alen & 0xffff);
                    310:        key->ck_key[1] = ((uint32_t)id[isrc] << 16) | id[idst];
                    311:
                    312:        if (__predict_true(alen == sizeof(in_addr_t))) {
                                        /* 4-byte addresses: one word each, four words in total. */
                    313:                key->ck_key[2] = npc->npc_ips[isrc]->s6_addr32[0];
                    314:                key->ck_key[3] = npc->npc_ips[idst]->s6_addr32[0];
                    315:                keylen = 4 * sizeof(uint32_t);
                    316:        } else {
                                        /* Otherwise: alen / 4 words per address (4 when alen is 16). */
                    317:                const u_int nwords = alen >> 2;
                    318:                memcpy(&key->ck_key[2], npc->npc_ips[isrc], alen);
                    319:                memcpy(&key->ck_key[2 + nwords], npc->npc_ips[idst], alen);
                    320:                keylen = (2 + (nwords * 2)) * sizeof(uint32_t);
                    321:        }
1.8     ! rmind     322:        return keylen;
1.1       rmind     323: }
                    324:
                         /*
                          * connkey_set_addr: overwrite one of the two addresses in the key.
                          *
                          * => The address length is taken from the low 16 bits of ck_key[0].
                          * => di selects the address slot: the slot starts at word
                          *    2 + (alen / 4) * di, so 0 is the first address in the key and
                          *    1 is the second one.
                          *    NOTE(review): callers presumably pass NPF_SRC/NPF_DST, which
                          *    assumes NPF_SRC == 0 and NPF_DST == 1 -- confirm.
                          */
1.3       christos  325: static __inline void
1.1       rmind     326: connkey_set_addr(npf_connkey_t *key, const npf_addr_t *naddr, const int di)
                    327: {
                    328:        const u_int alen = key->ck_key[0] & 0xffff;
                    329:        uint32_t *addr = &key->ck_key[2 + ((alen >> 2) * di)];
                    330:
                                /* A zero length means the key was never constructed. */
                    331:        KASSERT(alen > 0);
                    332:        memcpy(addr, naddr, alen);
                    333: }
                    334:
                         /*
                          * connkey_set_id: overwrite one of the two 16-bit IDs in ck_key[1].
                          *
                          * => If di is zero, the upper 16 bits (the first ID in the key) are
                          *    replaced and the lower 16 bits are preserved.
                          * => If di is non-zero, the lower 16 bits (the second ID) are
                          *    replaced and the upper 16 bits are preserved.
                          */
1.3       christos  335: static __inline void
1.1       rmind     336: connkey_set_id(npf_connkey_t *key, const uint16_t id, const int di)
                    337: {
                    338:        const uint32_t oid = key->ck_key[1];
                                /* shift: 16 for di == 0, otherwise 0; mask keeps the other half. */
                    339:        const u_int shift = 16 * !di;
                    340:        const uint32_t mask = 0xffff0000 >> shift;
                    341:
                    342:        key->ck_key[1] = ((uint32_t)id << shift) | (oid & mask);
                    343: }
                    344:
                    345: /*
                    346:  * npf_conn_lookup: lookup if there is an established connection.
                    347:  *
                    348:  * => If found, we will hold a reference for the caller.
                          * => di: the packet direction; it is compared with the direction
                          *    bits (PFIL_ALL mask) kept in the connection flags.
                          * => forw: output, filled in by npf_conndb_lookup(); presumably it
                          *    is true when the key matched the "forwards" entry -- confirm.
                          * => Returns NULL if the key cannot be constructed, there is no
                          *    entry in the database, the connection is not active or is
                          *    expired, or the interface or the direction does not match.
                    349:  */
                    350: npf_conn_t *
1.4       rmind     351: npf_conn_lookup(const npf_cache_t *npc, const int di, bool *forw)
1.1       rmind     352: {
1.4       rmind     353:        const nbuf_t *nbuf = npc->npc_nbuf;
1.1       rmind     354:        npf_conn_t *con;
                    355:        npf_connkey_t key;
                    356:        u_int flags, cifid;
                    357:        bool ok, pforw;
                    358:
                    359:        /* Construct a key and lookup for a connection in the store. */
                    360:        if (!npf_conn_conkey(npc, &key, true)) {
                    361:                return NULL;
                    362:        }
                    363:        con = npf_conndb_lookup(conn_db, &key, forw);
                    364:        if (con == NULL) {
                    365:                return NULL;
                    366:        }
                    367:        KASSERT(npc->npc_proto == con->c_proto);
                    368:
                    369:        /* Check if connection is active and not expired. */
                    370:        flags = con->c_flags;
                    371:        ok = (flags & (CONN_ACTIVE | CONN_EXPIRE)) == CONN_ACTIVE;
                    372:
                    373:        if (__predict_false(!ok)) {
                                        /* Drop the reference (presumably taken by the lookup). */
                    374:                atomic_dec_uint(&con->c_refcnt);
                    375:                return NULL;
                    376:        }
                    377:
                    378:        /*
                    379:         * Match the interface and the direction of the connection entry
                    380:         * and the packet.
                                 *
                                 * An interface ID of zero in the connection matches any
                                 * interface.  'pforw' tells whether the packet direction is
                                 * the one stored in the connection flags; it has to agree
                                 * with the stream direction reported through 'forw'.
                    381:         */
                    382:        cifid = con->c_ifid;
                    383:        if (__predict_false(cifid && cifid != nbuf->nb_ifid)) {
                    384:                atomic_dec_uint(&con->c_refcnt);
                    385:                return NULL;
                    386:        }
                    387:        pforw = (flags & PFIL_ALL) == di;
                    388:        if (__predict_false(*forw != pforw)) {
                    389:                atomic_dec_uint(&con->c_refcnt);
                    390:                return NULL;
                    391:        }
                    392:
                    393:        /* Update the last activity time. */
                    394:        getnanouptime(&con->c_atime);
                    395:        return con;
                    396: }
                    397:
                    398: /*
                    399:  * npf_conn_inspect: lookup a connection and inspect the protocol data.
                    400:  *
                    401:  * => If found, we will hold a reference for the caller.
                          * => error: set to ENOMEM if nbuf_head_mbuf() returns NULL; it is
                          *    not written by this function in any other case.
                          * => Returns NULL if the packet is not trackable, no connection is
                          *    found or the protocol state inspection rejects the packet.
                    402:  */
                    403: npf_conn_t *
1.4       rmind     404: npf_conn_inspect(npf_cache_t *npc, const int di, int *error)
1.1       rmind     405: {
1.4       rmind     406:        nbuf_t *nbuf = npc->npc_nbuf;
1.1       rmind     407:        npf_conn_t *con;
                    408:        bool forw, ok;
                    409:
                    410:        KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
                    411:        if (!npf_conn_trackable_p(npc)) {
                    412:                return NULL;
                    413:        }
                    414:
                    415:        /* Query ALG which may lookup connection for us. */
1.4       rmind     416:        if ((con = npf_alg_conn(npc, di)) != NULL) {
1.1       rmind     417:                /* Note: reference is held. */
                    418:                return con;
                    419:        }
                                /* The nbuf must be usable again after the ALG call. */
                    420:        if (nbuf_head_mbuf(nbuf) == NULL) {
                    421:                *error = ENOMEM;
                    422:                return NULL;
                    423:        }
                    424:        KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
                    425:
                    426:        /* Main lookup of the connection. */
1.4       rmind     427:        if ((con = npf_conn_lookup(npc, di, &forw)) == NULL) {
1.1       rmind     428:                return NULL;
                    429:        }
                    430:
                    431:        /* Inspect the protocol data and handle state changes. */
                                /* The protocol state is accessed under the connection lock. */
                    432:        mutex_enter(&con->c_lock);
1.4       rmind     433:        ok = npf_state_inspect(npc, &con->c_state, forw);
1.1       rmind     434:        mutex_exit(&con->c_lock);
                    435:
                    436:        if (__predict_false(!ok)) {
                    437:                /* Invalid: let the rules deal with it. */
                                        /* The reference from the lookup is released here. */
                    438:                npf_conn_release(con);
                    439:                npf_stats_inc(NPF_STAT_INVALID_STATE);
                    440:                con = NULL;
                    441:        }
                    442:        return con;
                    443: }
                    444:
                    445: /*
                    446:  * npf_conn_establish: create a new connection, insert into the global list.
                    447:  *
                    448:  * => Connection is created with the reference held for the caller.
                    449:  * => Connection will be activated on the first reference release.
                          * => di: the packet direction; its PFIL_ALL bits are stored in the
                          *    connection flags.
                          * => per_if: if true, the connection is bound to the interface ID
                          *    of the packet; otherwise the interface ID is zero, which
                          *    npf_conn_lookup() treats as matching any interface.
                          * => Returns NULL if the packet is not trackable, the allocation
                          *    fails (PR_NOWAIT is used), the protocol state or the keys
                          *    cannot be initialised, or an insertion into the database fails.
                    450:  */
                    451: npf_conn_t *
1.4       rmind     452: npf_conn_establish(npf_cache_t *npc, int di, bool per_if)
1.1       rmind     453: {
1.4       rmind     454:        const nbuf_t *nbuf = npc->npc_nbuf;
1.1       rmind     455:        npf_conn_t *con;
                    456:
                    457:        KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
                    458:
                    459:        if (!npf_conn_trackable_p(npc)) {
                    460:                return NULL;
                    461:        }
                    462:
                    463:        /* Allocate and initialise the new connection. */
                    464:        con = pool_cache_get(conn_cache, PR_NOWAIT);
                    465:        if (__predict_false(!con)) {
                    466:                return NULL;
                    467:        }
                    468:        NPF_PRINTF(("NPF: create conn %p\n", con));
1.6       rmind     469:        npf_stats_inc(NPF_STAT_CONN_CREATE);
1.1       rmind     470:
                    471:        /* Reference count and flags (indicate direction). */
                                /*
                                 * c_rproc and c_nat are cleared so that npf_conn_destroy()
                                 * can be called on the error path below.
                                 */
                    472:        mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
                    473:        con->c_flags = (di & PFIL_ALL);
                    474:        con->c_refcnt = 1;
                    475:        con->c_rproc = NULL;
                    476:        con->c_nat = NULL;
                    477:
                    478:        /* Initialize protocol state. */
1.4       rmind     479:        if (!npf_state_init(npc, &con->c_state)) {
1.1       rmind     480:                goto err;
                    481:        }
                    482:
                    483:        KASSERT(npf_iscached(npc, NPC_IP46));
                    484:        npf_connkey_t *fw = &con->c_forw_entry;
                    485:        npf_connkey_t *bk = &con->c_back_entry;
                    486:
                    487:        /*
                    488:         * Construct "forwards" and "backwards" keys.  Also, set the
                    489:         * interface ID for this connection (unless it is global).
                    490:         */
                    491:        if (!npf_conn_conkey(npc, fw, true)) {
                    492:                goto err;
                    493:        }
                    494:        if (!npf_conn_conkey(npc, bk, false)) {
                    495:                goto err;
                    496:        }
                                /* Both entries point back to the connection which owns them. */
                    497:        fw->ck_backptr = bk->ck_backptr = con;
                    498:        con->c_ifid = per_if ? nbuf->nb_ifid : 0;
                    499:        con->c_proto = npc->npc_proto;
                    500:
                    501:        /* Set last activity time for a new connection. */
                    502:        getnanouptime(&con->c_atime);
                    503:
                    504:        /*
                    505:         * Insert both keys (entries representing directions) of the
                    506:         * connection.  At this point, it becomes visible.
                    507:         */
                    508:        if (!npf_conndb_insert(conn_db, fw, con)) {
                    509:                goto err;
                    510:        }
                    511:        if (!npf_conndb_insert(conn_db, bk, con)) {
                    512:                /* We have hit the duplicate. */
                                        /* Undo the insertion of the forwards entry. */
                    513:                npf_conndb_remove(conn_db, fw);
1.6       rmind     514:                npf_stats_inc(NPF_STAT_RACE_CONN);
1.1       rmind     515:                goto err;
                    516:        }
                    517:
                    518:        /* Finally, insert into the connection list. */
                    519:        NPF_PRINTF(("NPF: establish conn %p\n", con));
                    520:        npf_conndb_enqueue(conn_db, con);
                    521:        return con;
                    522: err:
                                /*
                                 * NOTE(review): if the second insertion failed, the forwards
                                 * entry has already been visible to the lock-less lookups, so
                                 * a concurrent npf_conn_lookup() may have found 'con' before
                                 * it is destroyed here -- confirm that this cannot happen.
                                 *
                                 * NOTE(review): this path is also taken when npf_state_init()
                                 * fails, in which case npf_conn_destroy() still calls
                                 * npf_state_destroy() -- confirm that this is safe.
                                 */
                    523:        npf_conn_destroy(con);
                    524:        return NULL;
                    525: }
                    526:
                         /*
                          * npf_conn_destroy: release the resources attached to the connection
                          * and return the structure to the pool cache.
                          *
                          * => The NAT entry and the rule procedure are released only if set.
                          * => The database entries are not touched here; the function is
                          *    also used on the error path of npf_conn_establish().
                          */
                    527: static void
                    528: npf_conn_destroy(npf_conn_t *con)
                    529: {
                    530:        if (con->c_nat) {
                    531:                /* Release any NAT structures. */
                    532:                npf_nat_destroy(con->c_nat);
                    533:        }
                    534:        if (con->c_rproc) {
                    535:                /* Release the rule procedure. */
                    536:                npf_rproc_release(con->c_rproc);
                    537:        }
                    538:
                    539:        /* Destroy the state. */
                    540:        npf_state_destroy(&con->c_state);
                    541:        mutex_destroy(&con->c_lock);
                    542:
                    543:        /* Free the structure, increase the counter. */
                    544:        pool_cache_put(conn_cache, con);
1.6       rmind     545:        npf_stats_inc(NPF_STAT_CONN_DESTROY);
                                /* Only the address is printed; 'con' is not dereferenced. */
1.1       rmind     546:        NPF_PRINTF(("NPF: conn %p destroyed\n", con));
                    547: }
                    548:
                     549: /*
                     550:  * npf_conn_setnat: associate NAT entry with the connection, update and
                     551:  * re-insert connection entry using the translation values.
                          *
                          * => Returns 0 on success; the connection then refers to the NAT entry.
                          * => Returns EINVAL if the key cannot be constructed from 'npc' or
                          *    the connection has already been marked as expired.
                          * => Returns EISCONN if a NAT entry is already associated, or if the
                          *    translated "backwards" key collides with an existing entry; in
                          *    the latter case the connection is also removed and expired.
                          * => On any failure the NAT entry is not associated with the connection.
                     552:  */
                     553: int
                     554: npf_conn_setnat(const npf_cache_t *npc, npf_conn_t *con,
                     555:     npf_nat_t *nt, u_int ntype)
                     556: {
                                /*
                                 * Which side of the "backwards" key takes the translation
                                 * values, indexed by the NAT type.
                                 */
                     557:        static const u_int nat_type_dimap[] = {
                     558:                [NPF_NATOUT] = NPF_DST,
                     559:                [NPF_NATIN] = NPF_SRC,
                     560:        };
                     561:        npf_connkey_t key, *bk;
1.2       rmind     562:        npf_conn_t *ret __diagused;
1.1       rmind     563:        npf_addr_t *taddr;
                     564:        in_port_t tport;
                     565:        u_int tidx;
                     566:
                     567:        KASSERT(con->c_refcnt > 0);
                     568:
                                /* Fetch the translation address and port from the NAT entry. */
                     569:        npf_nat_gettrans(nt, &taddr, &tport);
                     570:        KASSERT(ntype == NPF_NATOUT || ntype == NPF_NATIN);
                     571:        tidx = nat_type_dimap[ntype];
                     572:
                     573:        /* Construct a "backwards" key. */
                     574:        if (!npf_conn_conkey(npc, &key, false)) {
                     575:                return EINVAL;
                     576:        }
                     577:
                     578:        /* Acquire the lock and check for the races. */
                     579:        mutex_enter(&con->c_lock);
                     580:        if (__predict_false(con->c_flags & CONN_EXPIRE)) {
                     581:                /* The connection got expired. */
                     582:                mutex_exit(&con->c_lock);
                     583:                return EINVAL;
                     584:        }
                     585:        if (__predict_false(con->c_nat != NULL)) {
                     586:                /* Race with a duplicate packet. */
                     587:                mutex_exit(&con->c_lock);
                     588:                npf_stats_inc(NPF_STAT_RACE_NAT);
                     589:                return EISCONN;
                     590:        }
                     591:
                     592:        /* Remove the "backwards" entry. */
                     593:        ret = npf_conndb_remove(conn_db, &key);
                     594:        KASSERT(ret == con);
                     595:
                     596:        /* Set the source/destination IDs to the translation values. */
                     597:        bk = &con->c_back_entry;
                     598:        connkey_set_addr(bk, taddr, tidx);
                                /* A zero translation port leaves the ID in the key unchanged. */
                     599:        if (tport) {
                     600:                connkey_set_id(bk, tport, tidx);
                     601:        }
                     602:
                     603:        /* Finally, re-insert the "backwards" entry. */
                     604:        if (!npf_conndb_insert(conn_db, bk, con)) {
                     605:                /*
                     606:                 * Race: we have hit the duplicate, remove the "forwards"
                     607:                 * entry and expire our connection; it is no longer valid.
                                         * CONN_REMOVED tells npf_conn_gc() that both entries are
                                         * already out of the database.
                     608:                 */
                     609:                (void)npf_conndb_remove(conn_db, &con->c_forw_entry);
                     610:                atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
                     611:                mutex_exit(&con->c_lock);
                     612:
                     613:                npf_stats_inc(NPF_STAT_RACE_NAT);
                     614:                return EISCONN;
                     615:        }
                     616:
                     617:        /* Associate the NAT entry and release the lock. */
                     618:        con->c_nat = nt;
                     619:        mutex_exit(&con->c_lock);
                     620:        return 0;
                     621: }
                    622:
                     623: /*
                     624:  * npf_conn_expire: explicitly mark connection as expired.
                          *
                          * => Only sets the CONN_EXPIRE flag (atomically); the connection is
                          *    neither removed from the database nor destroyed here.
                          * => npf_conn_expired() reports a connection with this flag as
                          *    expired, which is what npf_conn_gc() acts upon.
                     625:  */
                     626: void
                     627: npf_conn_expire(npf_conn_t *con)
                     628: {
                                /* The reference assertion is disabled; see the XXX note. */
                     629:        /* KASSERT(con->c_refcnt > 0); XXX: npf_nat_freepolicy() */
                     630:        atomic_or_uint(&con->c_flags, CONN_EXPIRE);
                     631: }
                    632:
                     633: /*
                     634:  * npf_conn_pass: return true if connection is "pass" one, otherwise false.
                          *
                          * => On true, the rule procedure stored in the connection is returned
                          *    via 'rp'; the stored pointer may be NULL.
                          * => On false, the location pointed to by 'rp' is left untouched.
                          * => The caller is expected to hold a reference (asserted).
                     635:  */
                     636: bool
                     637: npf_conn_pass(const npf_conn_t *con, npf_rproc_t **rp)
                     638: {
                     639:        KASSERT(con->c_refcnt > 0);
                     640:        if (__predict_true(con->c_flags & CONN_PASS)) {
                     641:                *rp = con->c_rproc;
                     642:                return true;
                     643:        }
                     644:        return false;
                     645: }
                    646:
                     647: /*
                     648:  * npf_conn_setpass: mark connection as a "pass" one and associate the
                     649:  * rule procedure with it.
                          *
                          * => Must be called on a referenced connection which is not yet
                          *    active and has no rule procedure set (all three are asserted).
                          * => 'rp' may be NULL, in which case only the CONN_PASS flag is set.
                     650:  */
                     651: void
                     652: npf_conn_setpass(npf_conn_t *con, npf_rproc_t *rp)
                     653: {
                     654:        KASSERT((con->c_flags & CONN_ACTIVE) == 0);
                     655:        KASSERT(con->c_refcnt > 0);
                     656:        KASSERT(con->c_rproc == NULL);
                     657:
                     658:        /*
                     659:         * No need for atomic since the connection is not yet active.
                     660:         * If rproc is set, the caller transfers its reference to us,
                     661:         * which will be released on npf_conn_destroy().
                     662:         */
                     663:        con->c_flags |= CONN_PASS;
                     664:        con->c_rproc = rp;
                     665: }
                    666:
                    667: /*
                    668:  * npf_conn_release: release a reference, which might allow G/C thread
                    669:  * to destroy this connection.
                    670:  */
                    671: void
                    672: npf_conn_release(npf_conn_t *con)
                    673: {
                    674:        if ((con->c_flags & (CONN_ACTIVE | CONN_EXPIRE)) == 0) {
                    675:                /* Activate: after this, connection is globally visible. */
                    676:                con->c_flags |= CONN_ACTIVE;
                    677:        }
                    678:        KASSERT(con->c_refcnt > 0);
                    679:        atomic_dec_uint(&con->c_refcnt);
                    680: }
                    681:
                     682: /*
                     683:  * npf_conn_retnat: return associated NAT data entry and indicate
                     684:  * whether it is a "forwards" or "backwards" stream.
                          *
                          * => 'forw' is set to true when the PFIL_ALL bits stored in the
                          *    connection flags are equal to 'di', and to false otherwise.
                          * => Returns the c_nat pointer as is; it is NULL if no NAT entry
                          *    has been associated.
                     685:  */
                     686: npf_nat_t *
                     687: npf_conn_retnat(npf_conn_t *con, const int di, bool *forw)
                     688: {
                     689:        KASSERT(con->c_refcnt > 0);
                     690:        *forw = (con->c_flags & PFIL_ALL) == di;
                     691:        return con->c_nat;
                     692: }
                    693:
                     694: /*
                     695:  * npf_conn_expired: criterion to check if connection is expired.
                          *
                          * => True if CONN_EXPIRE is set on the connection.
                          * => Otherwise, true if the whole seconds elapsed between c_atime and
                          *    'tsnow' exceed the value given by npf_state_etime() for the
                          *    connection state and protocol.
                     696:  */
                     697: static inline bool
                     698: npf_conn_expired(const npf_conn_t *con, const struct timespec *tsnow)
                     699: {
                     700:        const int etime = npf_state_etime(&con->c_state, con->c_proto);
                     701:        struct timespec tsdiff;
                     702:
                     703:        if (__predict_false(con->c_flags & CONN_EXPIRE)) {
                     704:                /* Explicitly marked to be expired. */
                     705:                return true;
                     706:        }
                                /* Only the seconds part of the difference is compared. */
                     707:        timespecsub(tsnow, &con->c_atime, &tsdiff);
                     708:        return tsdiff.tv_sec > etime;
                     709: }
                    710:
                     711: /*
1.6       rmind     712:  * npf_conn_gc: garbage collect the expired connections.
                     713:  *
                     714:  * => Must run in a single-threaded manner.
                     715:  * => If it is a flush request, then destroy all connections.
                     716:  * => If 'sync' is true, then perform passive serialisation.
                          * => If 'sync' is true, conn_lock is released here and is not
                          *    re-acquired before returning.
                          *    NOTE(review): this implies the caller enters with conn_lock
                          *    held in that case, as npf_conn_worker() does - confirm for
                          *    other callers.
                          * => Does not return until every collected connection has dropped
                          *    to zero references and has been destroyed.
1.1       rmind     717:  */
1.7       rmind     718: void
1.6       rmind     719: npf_conn_gc(npf_conndb_t *cd, bool flush, bool sync)
1.1       rmind     720: {
                     721:        npf_conn_t *con, *prev, *gclist = NULL;
                     722:        struct timespec tsnow;
                     723:
                     724:        getnanouptime(&tsnow);
                     725:
                     726:        /*
                     727:         * Scan all connections and check them for expiration.
                                 * 'prev' tracks the last connection kept on the list.
                     728:         */
                     729:        prev = NULL;
                     730:        con = npf_conndb_getlist(cd);
                     731:        while (con) {
                     732:                npf_conn_t *next = con->c_next;
                     733:
                     734:                /* Expired?  Flushing all? */
1.6       rmind     735:                if (!npf_conn_expired(con, &tsnow) && !flush) {
1.1       rmind     736:                        prev = con;
                     737:                        con = next;
                     738:                        continue;
                     739:                }
                     740:
                     741:                /* Remove both entries of the connection. */
                     742:                mutex_enter(&con->c_lock);
                                        /*
                                         * Skip if the entries were already removed, e.g. on the
                                         * race path in npf_conn_setnat().
                                         */
                     743:                if ((con->c_flags & CONN_REMOVED) == 0) {
                     744:                        npf_conn_t *ret __diagused;
                     745:
                     746:                        ret = npf_conndb_remove(cd, &con->c_forw_entry);
                     747:                        KASSERT(ret == con);
                     748:                        ret = npf_conndb_remove(cd, &con->c_back_entry);
                     749:                        KASSERT(ret == con);
                     750:                }
                     751:
                     752:                /* Flag the removal and expiration. */
                     753:                atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
                     754:                mutex_exit(&con->c_lock);
                     755:
                     756:                /* Move to the G/C list. */
                     757:                npf_conndb_dequeue(cd, con, prev);
                     758:                con->c_next = gclist;
                     759:                gclist = con;
                     760:
                     761:                /* Next.. */
                     762:                con = next;
                     763:        }
                                /* The last kept connection (NULL if none) becomes the tail. */
                     764:        npf_conndb_settail(cd, prev);
1.6       rmind     765:
                     766:        /*
                     767:         * Ensure it is safe to destroy the connections.
                     768:         * Note: drop the conn_lock (see the lock order).
                                 * The synchronisation is skipped if nothing was collected.
                     769:         */
                     770:        if (sync) {
                     771:                mutex_exit(&conn_lock);
                     772:                if (gclist) {
                     773:                        npf_config_enter();
                     774:                        npf_config_sync();
                     775:                        npf_config_exit();
                     776:                }
1.1       rmind     777:        }
                     778:
                     779:        /*
                     780:         * Garbage collect all expired connections.
                     781:         * May need to wait for the references to drain.
                     782:         */
                     783:        con = gclist;
                     784:        while (con) {
                     785:                npf_conn_t *next = con->c_next;
                     786:
                     787:                /*
                     788:                 * Destroy only if removed and no references.
                     789:                 * Otherwise, wait for a tiny moment.
                                         * Note: 'con' is not advanced, so the same connection
                                         * is re-checked after the pause.
                     790:                 */
                     791:                if (__predict_false(con->c_refcnt)) {
                     792:                        kpause("npfcongc", false, 1, NULL);
                     793:                        continue;
                     794:                }
                     795:                npf_conn_destroy(con);
                     796:                con = next;
                     797:        }
                     798: }
                    799:
1.6       rmind     800: /*
                     801:  * npf_conn_worker: G/C to run from a worker thread.
                          *
                          * => Acquires conn_lock and runs a regular (non-flush) G/C pass on
                          *    the global connection database.
                          * => The lock is not released here: npf_conn_gc() drops it, since
                          *    'sync' is passed as true.
                     802:  */
                     803: static void
                     804: npf_conn_worker(void)
1.1       rmind     805: {
1.6       rmind     806:        mutex_enter(&conn_lock);
                     807:        /* Note: the conn_lock will be released (sync == true). */
                     808:        npf_conn_gc(conn_db, false, true);
1.1       rmind     809: }
                    810:
                     811: /*
1.6       rmind     812:  * npf_conn_export: construct a list of connections prepared for saving.
1.1       rmind     813:  * Note: this is expected to be an expensive operation.
                          *
                          * => Always returns 0; nothing is added to 'conlist' if connection
                          *    tracking is not on.
                          * => Only connections which are active and not expired are exported.
                          * => Each exported connection is a dictionary with the "flags",
                          *    "proto", "state", "forw-key" and "back-key" entries, plus
                          *    whatever npf_nat_export() adds when a NAT entry is associated.
                          * => The keys and the state are exported as raw data of
                          *    NPF_CONN_MAXKEYLEN and sizeof(npf_state_t) bytes respectively.
                          *
                          * NOTE(review): the results of prop_dictionary_create() and
                          * prop_data_create_data() are not checked in this function.
                     814:  */
                     815: int
1.6       rmind     816: npf_conn_export(prop_array_t conlist)
1.1       rmind     817: {
                     818:        npf_conn_t *con, *prev;
                     819:
                     820:        /*
                     821:         * Note: acquire conn_lock to prevent from the database
                     822:         * destruction and G/C thread.
                     823:         */
                     824:        mutex_enter(&conn_lock);
1.6       rmind     825:        if (conn_tracking != CONN_TRACKING_ON) {
1.1       rmind     826:                mutex_exit(&conn_lock);
                     827:                return 0;
                     828:        }
                     829:        prev = NULL;
                     830:        con = npf_conndb_getlist(conn_db);
                     831:        while (con) {
                     832:                npf_conn_t *next = con->c_next;
                     833:                prop_data_t d;
                     834:
                                        /* Export only the active and not expired connections. */
                     835:                if ((con->c_flags & (CONN_ACTIVE|CONN_EXPIRE)) != CONN_ACTIVE)
                     836:                        goto skip;
                     837:
                     838:                prop_dictionary_t cdict = prop_dictionary_create();
                     839:                prop_dictionary_set_uint32(cdict, "flags", con->c_flags);
                     840:                prop_dictionary_set_uint32(cdict, "proto", con->c_proto);
                     841:                /* FIXME: interface-id */
                     842:
                                        /* The state structure is exported as an opaque blob. */
                     843:                d = prop_data_create_data(&con->c_state, sizeof(npf_state_t));
                     844:                prop_dictionary_set_and_rel(cdict, "state", d);
                     845:
                     846:                const uint32_t *fkey = con->c_forw_entry.ck_key;
                     847:                d = prop_data_create_data(fkey, NPF_CONN_MAXKEYLEN);
                     848:                prop_dictionary_set_and_rel(cdict, "forw-key", d);
                     849:
                     850:                const uint32_t *bkey = con->c_back_entry.ck_key;
                     851:                d = prop_data_create_data(bkey, NPF_CONN_MAXKEYLEN);
                     852:                prop_dictionary_set_and_rel(cdict, "back-key", d);
                     853:
                     854:                if (con->c_nat) {
1.6       rmind     855:                        npf_nat_export(cdict, con->c_nat);
1.1       rmind     856:                }
                                        /* Drop our reference once the dictionary is in the array. */
                     857:                prop_array_add(conlist, cdict);
                     858:                prop_object_release(cdict);
                     859: skip:
                     860:                prev = con;
                     861:                con = next;
                     862:        }
                                /*
                                 * Nothing is dequeued in this function, so 'prev' is the
                                 * last connection on the list (NULL if the list is empty).
                                 */
                     863:        npf_conndb_settail(conn_db, prev);
                     864:        mutex_exit(&conn_lock);
1.5       joerg     865:        return 0;
1.1       rmind     866: }
                    867:
                     868: /*
1.6       rmind     869:  * npf_conn_import: fully reconstruct a single connection from a
                     870:  * dictionary and insert into the given database.
                          *
                          * => Returns 0 on success.
                          * => Returns EINVAL if the "state", "forw-key" or "back-key" data
                          *    cannot be fetched or has an unexpected size, or if either key
                          *    cannot be inserted into the database.
                          * => The imported flags are restricted to PFIL_ALL, CONN_ACTIVE and
                          *    CONN_PASS; c_atime is set to the current uptime.
                          * => On failure the connection is destroyed, which also releases
                          *    the NAT entry if one was already reconstructed.
                          *
                          * NOTE(review): the results of the "proto" and "flags" lookups are
                          * not checked; presumably the zeroed values stay if a lookup fails.
1.1       rmind     871:  */
                     872: int
1.6       rmind     873: npf_conn_import(npf_conndb_t *cd, prop_dictionary_t cdict,
                     874:     npf_ruleset_t *natlist)
1.1       rmind     875: {
                     876:        npf_conn_t *con;
                     877:        npf_connkey_t *fw, *bk;
                     878:        prop_object_t obj;
                     879:        const void *d;
                     880:
                     881:        /* Allocate a connection and initialise it (clear first). */
                     882:        con = pool_cache_get(conn_cache, PR_WAITOK);
                     883:        memset(con, 0, sizeof(npf_conn_t));
                     884:        mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
                     885:
                     886:        prop_dictionary_get_uint32(cdict, "proto", &con->c_proto);
                     887:        prop_dictionary_get_uint32(cdict, "flags", &con->c_flags);
                                /* Keep only the flags which are meaningful for an import. */
                     888:        con->c_flags &= PFIL_ALL | CONN_ACTIVE | CONN_PASS;
                     889:        getnanouptime(&con->c_atime);
                     890:
                                /* The state must be a blob of exactly sizeof(npf_state_t). */
                     891:        obj = prop_dictionary_get(cdict, "state");
                     892:        if ((d = prop_data_data_nocopy(obj)) == NULL ||
                     893:            prop_data_size(obj) != sizeof(npf_state_t)) {
                     894:                goto err;
                     895:        }
                     896:        memcpy(&con->c_state, d, sizeof(npf_state_t));
                     897:
                     898:        /* Reconstruct NAT association, if any, or return NULL. */
1.6       rmind     899:        con->c_nat = npf_nat_import(cdict, natlist, con);
1.1       rmind     900:
                     901:        /*
                     902:         * Fetch and copy the keys for each direction.
                     903:         */
                     904:        obj = prop_dictionary_get(cdict, "forw-key");
                     905:        if ((d = prop_data_data_nocopy(obj)) == NULL ||
                     906:            prop_data_size(obj) != NPF_CONN_MAXKEYLEN) {
                     907:                goto err;
                     908:        }
                     909:        fw = &con->c_forw_entry;
                     910:        memcpy(&fw->ck_key, d, NPF_CONN_MAXKEYLEN);
                     911:
                     912:        obj = prop_dictionary_get(cdict, "back-key");
                     913:        if ((d = prop_data_data_nocopy(obj)) == NULL ||
                     914:            prop_data_size(obj) != NPF_CONN_MAXKEYLEN) {
                     915:                goto err;
                     916:        }
                     917:        bk = &con->c_back_entry;
                     918:        memcpy(&bk->ck_key, d, NPF_CONN_MAXKEYLEN);
                     919:
                                /* Both keys point back to the connection owning them. */
                     920:        fw->ck_backptr = bk->ck_backptr = con;
                     921:
                     922:        /* Insert the entries and the connection itself. */
                     923:        if (!npf_conndb_insert(cd, fw, con)) {
                     924:                goto err;
                     925:        }
                     926:        if (!npf_conndb_insert(cd, bk, con)) {
                                        /* Undo the first insertion before destroying. */
                     927:                npf_conndb_remove(cd, fw);
                     928:                goto err;
                     929:        }
                     930:        npf_conndb_enqueue(cd, con);
                     931:        return 0;
                     932: err:
                     933:        npf_conn_destroy(con);
                     934:        return EINVAL;
                     935: }
                    936:
                    937: #if defined(DDB) || defined(_NPF_TESTING)
                     938:
                         /*
                          * npf_conn_print: print the details of the connection (debugging aid,
                          * built only with DDB or _NPF_TESTING).
                          *
                          * => Prints the protocol, flags, the seconds elapsed since c_atime
                          *    and the value of npf_state_etime(), followed by both keys, the
                          *    state and the NAT entry, if there is one.
                          * => Key layout as read here: word 1 carries two 16-bit IDs (the
                          *    upper half is printed as the source one, the lower half as the
                          *    destination one); the source address starts at word 2 and the
                          *    destination address follows it after 'alen' bytes.
                          * => The address length is taken from the "forwards" key and is
                          *    used for both keys.
                          */
                     939: void
                     940: npf_conn_print(const npf_conn_t *con)
                     941: {
                     942:        const u_int alen = NPF_CONN_GETALEN(&con->c_forw_entry);
                     943:        const uint32_t *fkey = con->c_forw_entry.ck_key;
                     944:        const uint32_t *bkey = con->c_back_entry.ck_key;
                     945:        const u_int proto = con->c_proto;
                     946:        struct timespec tsnow, tsdiff;
                     947:        const void *src, *dst;
                     948:        int etime;
                     949:
                     950:        getnanouptime(&tsnow);
                     951:        timespecsub(&tsnow, &con->c_atime, &tsdiff);
                     952:        etime = npf_state_etime(&con->c_state, proto);
                     953:
                     954:        printf("%p:\n\tproto %d flags 0x%x tsdiff %d etime %d\n",
                     955:            con, proto, con->c_flags, (int)tsdiff.tv_sec, etime);
                     956:
                                /* 'alen >> 2' converts the address length to 32-bit words. */
                     957:        src = &fkey[2], dst = &fkey[2 + (alen >> 2)];
                     958:        printf("\tforw %s:%d", npf_addr_dump(src, alen), ntohs(fkey[1] >> 16));
                     959:        printf("-> %s:%d\n", npf_addr_dump(dst, alen), ntohs(fkey[1] & 0xffff));
                     960:
                     961:        src = &bkey[2], dst = &bkey[2 + (alen >> 2)];
                     962:        printf("\tback %s:%d", npf_addr_dump(src, alen), ntohs(bkey[1] >> 16));
                     963:        printf("-> %s:%d\n", npf_addr_dump(dst, alen), ntohs(bkey[1] & 0xffff));
                     964:
                     965:        npf_state_dump(&con->c_state);
                     966:        if (con->c_nat) {
                     967:                npf_nat_dump(con->c_nat);
                     968:        }
                     969: }
                    970:
                    971: #endif

CVSweb <webmaster@jp.NetBSD.org>