
Annotation of src/sys/kern/sys_futex.c, Revision 1.12.4.5

1.12.4.5! thorpej     1: /*     $NetBSD: sys_futex.c,v 1.12.4.4 2021/08/06 23:53:53 thorpej Exp $       */
1.1       thorpej     2:
                      3: /*-
                      4:  * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
                      5:  * All rights reserved.
                      6:  *
                      7:  * This code is derived from software contributed to The NetBSD Foundation
                      8:  * by Taylor R. Campbell and Jason R. Thorpe.
                      9:  *
                     10:  * Redistribution and use in source and binary forms, with or without
                     11:  * modification, are permitted provided that the following conditions
                     12:  * are met:
                     13:  * 1. Redistributions of source code must retain the above copyright
                     14:  *    notice, this list of conditions and the following disclaimer.
                     15:  * 2. Redistributions in binary form must reproduce the above copyright
                     16:  *    notice, this list of conditions and the following disclaimer in the
                     17:  *    documentation and/or other materials provided with the distribution.
                     18:  *
                     19:  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
                     20:  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
                     21:  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
                     22:  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
                     23:  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     24:  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     25:  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     26:  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     27:  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     28:  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     29:  * POSSIBILITY OF SUCH DAMAGE.
                     30:  */
                     31:
                     32: #include <sys/cdefs.h>
1.12.4.5! thorpej    33: __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.12.4.4 2021/08/06 23:53:53 thorpej Exp $");
1.1       thorpej    34:
                     35: /*
                     36:  * Futexes
                     37:  *
                     38:  *     The futex system call coordinates notifying threads waiting for
                     39:  *     changes on a 32-bit word of memory.  The word can be managed by
                     40:  *     CPU atomic operations in userland, without system calls, as long
                     41:  *     as there is no contention.
                     42:  *
                     43:  *     The simplest use case demonstrating the utility is:
                     44:  *
                     45:  *             // 32-bit word of memory shared among threads or
                     46:  *             // processes in userland.  lock & 1 means owned;
                     47:  *             // lock & 2 means there are waiters waiting.
                     48:  *             volatile int lock = 0;
                     49:  *
                     50:  *             int v;
                     51:  *
                     52:  *             // Acquire a lock.
                     53:  *             do {
                     54:  *                     v = lock;
                     55:  *                     if (v & 1) {
                     56:  *                             // Lock is held.  Set a bit to say that
                     57:  *                             // there are waiters, and wait for lock
                     58:  *                             // to change to anything other than v;
                     59:  *                             // then retry.
                     60:  *                             if (atomic_cas_uint(&lock, v, v | 2) != v)
                     61:  *                                     continue;
                     62:  *                             futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
                     63:  *                             continue;
                     64:  *                     }
                      65:  *             } while (atomic_cas_uint(&lock, v, v | 1) != v);
                     66:  *             membar_enter();
                     67:  *
                     68:  *             ...
                     69:  *
                     70:  *             // Release the lock.  Optimistically assume there are
                     71:  *             // no waiters first until demonstrated otherwise.
                     72:  *             membar_exit();
                     73:  *             if (atomic_cas_uint(&lock, 1, 0) != 1) {
                     74:  *                     // There may be waiters.
                     75:  *                     v = atomic_swap_uint(&lock, 0);
                     76:  *                     // If there are still waiters, wake one.
                     77:  *                     if (v & 2)
                     78:  *                             futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
                     79:  *             }
                     80:  *
                     81:  *     The goal is to avoid the futex system call unless there is
                     82:  *     contention; then if there is contention, to guarantee no missed
                     83:  *     wakeups.
                     84:  *
                     85:  *     For a simple implementation, futex(FUTEX_WAIT) could queue
                     86:  *     itself to be woken, double-check the lock word, and then sleep;
                     87:  *     spurious wakeups are generally a fact of life, so any
                     88:  *     FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
                     89:  *
                     90:  *     If this were all there is to it, we could then increase
                     91:  *     parallelism by refining the approximation: partition the
                     92:  *     waiters into buckets by hashing the lock addresses to reduce
                     93:  *     the incidence of spurious wakeups.  But this is not all.
                     94:  *
                     95:  *     The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
                     96:  *     operation not only wakes n waiters on lock if lock == val, but
                     97:  *     also _transfers_ m additional waiters to lock2.  Unless wakeups
                     98:  *     on lock2 also trigger wakeups on lock, we cannot move waiters
                     99:  *     to lock2 if they merely share the same hash as waiters on lock.
                    100:  *     Thus, we can't approximately distribute waiters into queues by
                    101:  *     a hash function; we must distinguish futex queues exactly by
                    102:  *     lock address.
                    103:  *
                    104:  *     For now, we use a global red/black tree to index futexes.  This
                    105:  *     should be replaced by a lockless radix tree with a thread to
                    106:  *     free entries no longer in use once all lookups on all CPUs have
                    107:  *     completed.
                    108:  *
                    109:  *     Specifically, we maintain two maps:
                    110:  *
                    111:  *     futex_tab.va[vmspace, va] for private futexes
                    112:  *     futex_tab.oa[uvm_voaddr] for shared futexes
                    113:  *
                    114:  *     This implementation does not support priority inheritance.
                    115:  */
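
/*
 * A hedged sketch (illustrative only, not from this file) of why the
 * waiter-transfer semantics above matter: a condition-variable
 * broadcast can wake one waiter and requeue the rest onto the mutex
 * word, so they contend for the lock one at a time instead of
 * stampeding.  The names cond, seq, and lock are illustrative, and
 * the pseudo-calls follow the same informal futex(op, ...) form used
 * in the example above.
 *
 *	// Waiter: note the sequence number, drop the mutex, and
 *	// sleep until a broadcaster bumps the sequence.
 *	unsigned seq = cond;
 *	lock_release(&lock);		// release as shown above
 *	futex(FUTEX_WAIT, &cond, seq, NULL, NULL, 0);
 *	lock_acquire(&lock);		// acquire loop as shown above
 *
 *	// Broadcaster: bump the sequence, wake one waiter, and
 *	// transfer the remaining waiters to the mutex word so each
 *	// wakes only as the lock actually frees up.
 *	cond++;
 *	futex(FUTEX_CMP_REQUEUE, &cond, 1, &lock, INT_MAX, cond);
 */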
                    116:
1.12      skrll     117: #include <sys/param.h>
1.1       thorpej   118: #include <sys/types.h>
                    119: #include <sys/atomic.h>
                    120: #include <sys/condvar.h>
                    121: #include <sys/futex.h>
                    122: #include <sys/mutex.h>
                    123: #include <sys/rbtree.h>
1.12.4.1  thorpej   124: #include <sys/sleepq.h>
1.1       thorpej   125: #include <sys/queue.h>
1.12.4.1  thorpej   126: #include <sys/sdt.h>
1.1       thorpej   127:
                    128: #include <sys/syscall.h>
                    129: #include <sys/syscallargs.h>
                    130: #include <sys/syscallvar.h>
                    131:
                    132: #include <uvm/uvm_extern.h>
                    133:
                    134: /*
1.12.4.1  thorpej   135:  * DTrace probes.
                    136:  */
                    137: SDT_PROVIDER_DEFINE(futex);
                    138:
                    139: /* entry: uaddr, val, bitset, timeout, clkflags, fflags */
                    140: /* exit: error */
                    141: SDT_PROBE_DEFINE6(futex, func, wait, entry, "int *", "int", "int",
                    142:                  "struct timespec *", "int", "int");
                    143: SDT_PROBE_DEFINE1(futex, func, wait, exit, "int");
                    144:
                    145: /* entry: uaddr, nwake, bitset, fflags */
                    146: /* exit: error, nwoken */
                    147: SDT_PROBE_DEFINE4(futex, func, wake, entry, "int *", "int", "int", "int");
                    148: SDT_PROBE_DEFINE2(futex, func, wake, exit, "int", "int");
                    149:
                    150: /* entry: uaddr, nwake, uaddr2, nrequeue, fflags */
                    151: /* exit: error, nwoken */
                    152: SDT_PROBE_DEFINE5(futex, func, requeue, entry, "int *", "int", "int *", "int",
                    153:                  "int");
                    154: SDT_PROBE_DEFINE2(futex, func, requeue, exit, "int", "int");
                    155:
                    156: /* entry: uaddr, nwake, uaddr2, nrequeue, val3, fflags */
                    157: /* exit: error, nwoken */
                    158: SDT_PROBE_DEFINE6(futex, func, cmp_requeue, entry, "int *", "int", "int *",
                    159:                  "int", "int", "int");
                    160: SDT_PROBE_DEFINE2(futex, func, cmp_requeue, exit, "int", "int");
                    161:
                    162: /* entry: uaddr, nwake, uaddr2, nwake2, wakeop, fflags */
                    163: /* exit: error, nwoken */
                    164: SDT_PROBE_DEFINE6(futex, func, wake_op, entry, "int *", "int", "int *", "int",
                    165:                  "int", "int");
                    166: SDT_PROBE_DEFINE2(futex, func, wake_op, exit, "int", "int");
                    167:
                    168: SDT_PROBE_DEFINE0(futex, wait, finish, normally);
                    169: SDT_PROBE_DEFINE0(futex, wait, finish, wakerace);
                    170: SDT_PROBE_DEFINE0(futex, wait, finish, aborted);
                    171:
                    172: /* entry: timo */
                    173: /* exit: error */
                    174: SDT_PROBE_DEFINE1(futex, wait, sleepq_block, entry, "int");
                    175: SDT_PROBE_DEFINE1(futex, wait, sleepq_block, exit, "int");
                    176:
                    177: /*
1.1       thorpej   178:  * Lock order:
                    179:  *
                    180:  *     futex_tab.lock
1.12.4.1  thorpej   181:  *     futex::fx_op_lock               ordered by kva of struct futex
                    182:  *      -> futex::fx_sq_lock           ordered by kva of sleepq lock
                    183:  *
                    184:  * N.B. multiple futexes can share a single sleepq lock.
1.1       thorpej   185:  */
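
/*
 * A minimal sketch (illustrative only) of the nesting above, as it
 * appears in the operation paths below: the op lock serializes the
 * whole futex operation, and the shared sleepq lock is taken inside
 * it only around the actual sleep queue manipulation.
 *
 *	mutex_enter(&f->fx_op_lock);
 *	mutex_spin_enter(f->fx_sq_lock);
 *	... enqueue/dequeue LWPs on f->fx_sqs[] ...
 *	mutex_spin_exit(f->fx_sq_lock);
 *	mutex_exit(&f->fx_op_lock);
 */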
                    186:
                    187: /*
                    188:  * union futex_key
                    189:  *
                    190:  *     A futex is addressed either by a vmspace+va (private) or by
                    191:  *     a uvm_voaddr (shared).
                    192:  */
1.12.4.4  thorpej   193: typedef union futex_key {
1.1       thorpej   194:        struct {
                    195:                struct vmspace                  *vmspace;
                    196:                vaddr_t                         va;
                    197:        }                       fk_private;
                    198:        struct uvm_voaddr       fk_shared;
1.12.4.4  thorpej   199: } futex_key_t;
                    200:
                    201: CTASSERT(offsetof(futex_key_t, fk_private.va) ==
                    202:         offsetof(futex_key_t, fk_shared.offset));
1.1       thorpej   203:
                    204: /*
                    205:  * struct futex
                    206:  *
                    207:  *     Kernel state for a futex located at a particular address in a
                    208:  *     particular virtual address space.
                    209:  *
                    210:  *     N.B. fx_refcnt is an unsigned long because we need to be able
                    211:  *     to operate on it atomically on all systems while at the same
                    212:  *     time rendering practically impossible the chance of it reaching
                    213:  *     its max value.  In practice, we're limited by the number of LWPs
                    214:  *     that can be present on the system at any given time, and the
                    215:  *     assumption is that limit will be good enough on a 32-bit platform.
                    216:  *     See futex_wake() for why overflow needs to be avoided.
1.12.4.1  thorpej   217:  *
                    218:  *     XXX Since futex addresses must be 4-byte aligned, we could
                    219:  *     XXX squirrel away fx_shared and fx_on_tree bits in the "va"
                    220:  *     XXX field of the key.  Worth it?
1.1       thorpej   221:  */
                    222: struct futex {
1.12.4.4  thorpej   223:        futex_key_t             fx_key;
1.12.4.1  thorpej   224:        struct rb_node          fx_node;
1.1       thorpej   225:        unsigned long           fx_refcnt;
                    226:        bool                    fx_shared;
                    227:        bool                    fx_on_tree;
1.12.4.1  thorpej   228:        uint8_t                 fx_class;
1.1       thorpej   229:
1.12.4.1  thorpej   230:        kmutex_t                fx_op_lock;     /* adaptive */
                    231:        kmutex_t *              fx_sq_lock;     /* &sleepq_locks[...] */
                    232:        sleepq_t                fx_sqs[2];      /* 0=reader, 1=writer */
                    233:        unsigned int            fx_nwaiters[2];
1.1       thorpej   234: };
                    235:
                    236: /*
1.12.4.1  thorpej   237:  * futex classes: Some futex operations can only be carried out on
                    238:  * futexes that are known to be abiding by a certain protocol.  These
                    239:  * classes are assigned to a futex when created due to a wait event,
                    240:  * and when subsequent wake or requeue operations are issued, the
                    241:  * class is checked at futex lookup time.  If the class does not match,
                    242:  * EINVAL is the result.
                    243:  */
                    244: #define        FUTEX_CLASS_ANY         0               /* match any class in lookup */
                    245: #define        FUTEX_CLASS_NORMAL      1               /* normal futex */
                    246: #define        FUTEX_CLASS_RWLOCK      2               /* rwlock futex */
                    247: #define        FUTEX_CLASS_PI          3               /* for FUTEX_*_PI ops */
                    248:
                    249: /* sleepq definitions */
                    250: #define        FUTEX_READERQ           0
                    251: #define        FUTEX_WRITERQ           1
                    252:
                    253: static const char futex_wmesg[] = "futex";
                    254:
                    255: static void    futex_unsleep(lwp_t *, bool);
                    256:
                    257: static syncobj_t futex_syncobj = {
                    258:        .sobj_flag      = SOBJ_SLEEPQ_SORTED,
                    259:        .sobj_unsleep   = futex_unsleep,
                    260:        .sobj_changepri = sleepq_changepri,
                    261:        .sobj_lendpri   = sleepq_lendpri,
                    262:        .sobj_owner     = syncobj_noowner,
1.1       thorpej   263: };
                    264:
                    265: /*
                    266:  * futex_tab
                    267:  *
                    268:  *     Global trees of futexes by vmspace/va and VM object address.
                    269:  *
                    270:  *     XXX This obviously doesn't scale in parallel.  We could use a
                    271:  *     pserialize-safe data structure, but there may be a high cost to
                    272:  *     frequent deletion since we don't cache futexes after we're done
                    273:  *     with them.  We could use hashed locks.  But for now, just make
                    274:  *     sure userland can't DoS the serial performance, by using a
                    275:  *     balanced binary tree for lookup.
                    276:  *
                    277:  *     XXX We could use a per-process tree for the table indexed by
                    278:  *     virtual address to reduce contention between processes.
                    279:  */
                    280: static struct {
                    281:        kmutex_t        lock;
                    282:        struct rb_tree  va;
                    283:        struct rb_tree  oa;
                    284: } futex_tab __cacheline_aligned;
                    285:
                    286: static int
                    287: compare_futex_key(void *cookie, const void *n, const void *k)
                    288: {
                    289:        const struct futex *fa = n;
1.12.4.4  thorpej   290:        const futex_key_t *fka = &fa->fx_key;
                    291:        const futex_key_t *fkb = k;
1.1       thorpej   292:
                    293:        if ((uintptr_t)fka->fk_private.vmspace <
                    294:            (uintptr_t)fkb->fk_private.vmspace)
                    295:                return -1;
                    296:        if ((uintptr_t)fka->fk_private.vmspace >
                    297:            (uintptr_t)fkb->fk_private.vmspace)
                    298:                return +1;
                    299:        if (fka->fk_private.va < fkb->fk_private.va)
                    300:                return -1;
                    301:        if (fka->fk_private.va > fkb->fk_private.va)
                     302:                return +1;
                    303:        return 0;
                    304: }
                    305:
                    306: static int
                    307: compare_futex(void *cookie, const void *na, const void *nb)
                    308: {
                    309:        const struct futex *fa = na;
                    310:        const struct futex *fb = nb;
                    311:
                    312:        return compare_futex_key(cookie, fa, &fb->fx_key);
                    313: }
                    314:
                    315: static const rb_tree_ops_t futex_rb_ops = {
                    316:        .rbto_compare_nodes = compare_futex,
                    317:        .rbto_compare_key = compare_futex_key,
                    318:        .rbto_node_offset = offsetof(struct futex, fx_node),
                    319: };
                    320:
                    321: static int
                    322: compare_futex_shared_key(void *cookie, const void *n, const void *k)
                    323: {
                    324:        const struct futex *fa = n;
1.12.4.4  thorpej   325:        const futex_key_t *fka = &fa->fx_key;
                    326:        const futex_key_t *fkb = k;
1.1       thorpej   327:
                    328:        return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
                    329: }
                    330:
                    331: static int
                    332: compare_futex_shared(void *cookie, const void *na, const void *nb)
                    333: {
                    334:        const struct futex *fa = na;
                    335:        const struct futex *fb = nb;
                    336:
                    337:        return compare_futex_shared_key(cookie, fa, &fb->fx_key);
                    338: }
                    339:
                    340: static const rb_tree_ops_t futex_shared_rb_ops = {
                    341:        .rbto_compare_nodes = compare_futex_shared,
                    342:        .rbto_compare_key = compare_futex_shared_key,
                    343:        .rbto_node_offset = offsetof(struct futex, fx_node),
                    344: };
                    345:
1.12.4.1  thorpej   346: /*
                    347:  * futex_sq_lock2(f, f2)
                    348:  *
                    349:  *     Acquire the sleepq locks for f and f2, which may be null, or
                    350:  *     which may be the same.  If they are distinct, an arbitrary total
                    351:  *     order is chosen on the locks.
                    352:  *
                    353:  *     Callers should only ever acquire multiple sleepq locks
                    354:  *     simultaneously using futex_sq_lock2.
                    355:  */
                    356: static void
                    357: futex_sq_lock2(struct futex * const f, struct futex * const f2)
                    358: {
                    359:
                    360:        /*
                    361:         * If both are null, do nothing; if one is null and the other
                    362:         * is not, lock the other and be done with it.
                    363:         */
                    364:        if (f == NULL && f2 == NULL) {
                    365:                return;
                    366:        } else if (f == NULL) {
                    367:                mutex_spin_enter(f2->fx_sq_lock);
                    368:                return;
                    369:        } else if (f2 == NULL) {
                    370:                mutex_spin_enter(f->fx_sq_lock);
                    371:                return;
                    372:        }
                    373:
                    374:        kmutex_t * const m = f->fx_sq_lock;
                    375:        kmutex_t * const m2 = f2->fx_sq_lock;
                    376:
                    377:        /* If both locks are the same, acquire only one. */
                    378:        if (m == m2) {
                    379:                mutex_spin_enter(m);
                    380:                return;
                    381:        }
                    382:
                    383:        /* Otherwise, use the ordering on the kva of the lock pointer.  */
                    384:        if ((uintptr_t)m < (uintptr_t)m2) {
                    385:                mutex_spin_enter(m);
                    386:                mutex_spin_enter(m2);
                    387:        } else {
                    388:                mutex_spin_enter(m2);
                    389:                mutex_spin_enter(m);
                    390:        }
                    391: }
                    392:
                    393: /*
                    394:  * futex_sq_unlock2(f, f2)
                    395:  *
                    396:  *     Release the sleep queue locks for f and f2, which may be null, or
                    397:  *     which may have the same underlying lock.
                    398:  */
                    399: static void
                    400: futex_sq_unlock2(struct futex * const f, struct futex * const f2)
                    401: {
                    402:
                    403:        /*
                    404:         * If both are null, do nothing; if one is null and the other
                    405:         * is not, unlock the other and be done with it.
                    406:         */
                    407:        if (f == NULL && f2 == NULL) {
                    408:                return;
                    409:        } else if (f == NULL) {
                    410:                mutex_spin_exit(f2->fx_sq_lock);
                    411:                return;
                    412:        } else if (f2 == NULL) {
                    413:                mutex_spin_exit(f->fx_sq_lock);
                    414:                return;
                    415:        }
                    416:
                    417:        kmutex_t * const m = f->fx_sq_lock;
                    418:        kmutex_t * const m2 = f2->fx_sq_lock;
                    419:
                    420:        /* If both locks are the same, release only one. */
                    421:        if (m == m2) {
                    422:                mutex_spin_exit(m);
                    423:                return;
                    424:        }
                    425:
                    426:        /* Otherwise, use the ordering on the kva of the lock pointer.  */
                    427:        if ((uintptr_t)m < (uintptr_t)m2) {
                    428:                mutex_spin_exit(m2);
                    429:                mutex_spin_exit(m);
                    430:        } else {
                    431:                mutex_spin_exit(m);
                    432:                mutex_spin_exit(m2);
                    433:        }
                    434: }
1.1       thorpej   435:
                    436: /*
                    437:  * futex_load(uaddr, kaddr)
                    438:  *
                    439:  *     Perform a single atomic load to read *uaddr, and return the
                    440:  *     result in *kaddr.  Return 0 on success, EFAULT if uaddr is not
                    441:  *     mapped.
                    442:  */
                    443: static inline int
                    444: futex_load(int *uaddr, int *kaddr)
                    445: {
                    446:        return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
                    447: }
                    448:
                    449: /*
                    450:  * futex_test(uaddr, expected)
                    451:  *
                    452:  *     True if *uaddr == expected.  False if *uaddr != expected, or if
                    453:  *     uaddr is not mapped.
                    454:  */
                    455: static bool
                    456: futex_test(int *uaddr, int expected)
                    457: {
                    458:        int val;
                    459:        int error;
                    460:
                    461:        error = futex_load(uaddr, &val);
                    462:        if (error)
                    463:                return false;
                    464:        return val == expected;
                    465: }
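
/*
 * A simplified sketch (illustrative only) of how the wait path uses
 * futex_test() below: the futex word is re-checked while holding the
 * op lock, so a concurrent wake cannot slip in between the check and
 * going to sleep.  Error handling is condensed; a real caller would
 * also want to distinguish EFAULT from a changed value, since
 * futex_test() returns false for both.
 *
 *	mutex_enter(&f->fx_op_lock);
 *	if (!futex_test(uaddr, val)) {
 *		mutex_exit(&f->fx_op_lock);
 *		error = EAGAIN;		// value changed; don't sleep
 *	} else {
 *		// futex_wait() releases fx_op_lock itself.
 *		error = futex_wait(f, q, deadline, clkid, bitset);
 *	}
 */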
                    466:
1.12.4.1  thorpej   467: static pool_cache_t    futex_cache             __read_mostly;
                    468:
                    469: static int     futex_ctor(void *, void *, int);
                    470: static void    futex_dtor(void *, void *);
                    471:
1.1       thorpej   472: /*
                    473:  * futex_sys_init()
                    474:  *
                    475:  *     Initialize the futex subsystem.
                    476:  */
                    477: void
                    478: futex_sys_init(void)
                    479: {
                    480:
                    481:        mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
                    482:        rb_tree_init(&futex_tab.va, &futex_rb_ops);
                    483:        rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
1.12.4.1  thorpej   484:
                    485:        futex_cache = pool_cache_init(sizeof(struct futex),
                    486:            coherency_unit, 0, 0, "futex", NULL, IPL_NONE, futex_ctor,
                    487:            futex_dtor, NULL);
1.1       thorpej   488: }
                    489:
                    490: /*
                    491:  * futex_sys_fini()
                    492:  *
                    493:  *     Finalize the futex subsystem.
                    494:  */
                    495: void
                    496: futex_sys_fini(void)
                    497: {
                    498:
                    499:        KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
                    500:        KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
                    501:        mutex_destroy(&futex_tab.lock);
                    502: }
                    503:
                    504: /*
1.12.4.1  thorpej   505:  * futex_ctor()
1.1       thorpej   506:  *
1.12.4.1  thorpej   507:  *     Futex object constructor.
1.1       thorpej   508:  */
1.12.4.1  thorpej   509: static int
                    510: futex_ctor(void *arg __unused, void *obj, int flags __unused)
1.1       thorpej   511: {
1.12.4.1  thorpej   512:        extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE];
                    513:        struct futex * const f = obj;
1.1       thorpej   514:
1.12.4.1  thorpej   515:        mutex_init(&f->fx_op_lock, MUTEX_DEFAULT, IPL_NONE);
                    516:        f->fx_sq_lock = &sleepq_locks[SLEEPTAB_HASH(f)].lock;
1.1       thorpej   517:
1.12.4.1  thorpej   518:        sleepq_init(&f->fx_sqs[FUTEX_READERQ]);
                    519:        sleepq_init(&f->fx_sqs[FUTEX_WRITERQ]);
                    520:        f->fx_nwaiters[FUTEX_READERQ] = f->fx_nwaiters[FUTEX_WRITERQ] = 0;
1.1       thorpej   521:
1.12.4.1  thorpej   522:        return 0;
1.1       thorpej   523: }
                    524:
                    525: /*
1.12.4.1  thorpej   526:  * futex_dtor()
1.1       thorpej   527:  *
1.12.4.1  thorpej   528:  *     Futex object destructor.
1.1       thorpej   529:  */
                    530: static void
1.12.4.1  thorpej   531: futex_dtor(void *arg __unused, void *obj)
1.1       thorpej   532: {
1.12.4.1  thorpej   533:        struct futex * const f = obj;
1.1       thorpej   534:
1.12.4.1  thorpej   535:        mutex_destroy(&f->fx_op_lock);
                    536:        f->fx_sq_lock = NULL;
1.1       thorpej   537: }
                    538:
                    539: /*
                    540:  * futex_key_init(key, vm, va, shared)
                    541:  *
                    542:  *     Initialize a futex key for lookup, etc.
                    543:  */
                    544: static int
1.12.4.4  thorpej   545: futex_key_init(futex_key_t *fk, struct vmspace *vm, vaddr_t va, bool shared)
1.1       thorpej   546: {
                    547:        int error = 0;
                    548:
                    549:        if (__predict_false(shared)) {
                    550:                if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
                    551:                        error = EFAULT;
                    552:        } else {
                    553:                fk->fk_private.vmspace = vm;
                    554:                fk->fk_private.va = va;
                    555:        }
                    556:
                    557:        return error;
                    558: }
                    559:
                    560: /*
                    561:  * futex_key_fini(key, shared)
                    562:  *
                    563:  *     Release a futex key.
                    564:  */
                    565: static void
1.12.4.4  thorpej   566: futex_key_fini(futex_key_t *fk, bool shared)
1.1       thorpej   567: {
                    568:        if (__predict_false(shared))
                    569:                uvm_voaddr_release(&fk->fk_shared);
                    570:        memset(fk, 0, sizeof(*fk));
                    571: }
                    572:
                    573: /*
                    574:  * futex_create(fk, shared)
                    575:  *
                    576:  *     Create a futex.  Initial reference count is 1, representing the
                    577:  *     caller.  Returns NULL on failure.  Always takes ownership of the
                    578:  *     key, either transferring it to the newly-created futex, or releasing
                    579:  *     the key if creation fails.
                    580:  *
                    581:  *     Never sleeps for memory, but may sleep to acquire a lock.
                    582:  */
                    583: static struct futex *
1.12.4.4  thorpej   584: futex_create(futex_key_t *fk, bool shared, uint8_t class)
1.1       thorpej   585: {
                    586:        struct futex *f;
                    587:
1.12.4.1  thorpej   588:        f = pool_cache_get(futex_cache, PR_NOWAIT);
1.1       thorpej   589:        if (f == NULL) {
                    590:                futex_key_fini(fk, shared);
                    591:                return NULL;
                    592:        }
                    593:        f->fx_key = *fk;
                    594:        f->fx_refcnt = 1;
                    595:        f->fx_shared = shared;
                    596:        f->fx_on_tree = false;
1.12.4.1  thorpej   597:        f->fx_class = class;
1.1       thorpej   598:
                    599:        return f;
                    600: }
                    601:
                    602: /*
                    603:  * futex_destroy(f)
                    604:  *
                    605:  *     Destroy a futex created with futex_create.  Reference count
                    606:  *     must be zero.
                    607:  *
                    608:  *     May sleep.
                    609:  */
                    610: static void
                    611: futex_destroy(struct futex *f)
                    612: {
                    613:
                    614:        ASSERT_SLEEPABLE();
                    615:
                    616:        KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
                    617:        KASSERT(!f->fx_on_tree);
1.12.4.1  thorpej   618:        KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
                    619:        KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_WRITERQ]));
                    620:        KASSERT(f->fx_nwaiters[FUTEX_READERQ] == 0);
                    621:        KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] == 0);
1.1       thorpej   622:
                    623:        futex_key_fini(&f->fx_key, f->fx_shared);
                    624:
1.12.4.1  thorpej   625:        pool_cache_put(futex_cache, f);
1.1       thorpej   626: }
                    627:
                    628: /*
1.12.4.1  thorpej   629:  * futex_hold_count(f, n)
1.1       thorpej   630:  *
                    631:  *     Attempt to acquire a reference to f.  Return 0 on success,
                    632:  *     ENFILE on too many references.
                    633:  *
                    634:  *     Never sleeps.
                    635:  */
                    636: static int
1.12.4.1  thorpej   637: futex_hold_count(struct futex *f, unsigned long const count)
1.1       thorpej   638: {
                    639:        unsigned long refcnt;
                    640:
                    641:        do {
                    642:                refcnt = atomic_load_relaxed(&f->fx_refcnt);
1.12.4.1  thorpej   643:                if (ULONG_MAX - refcnt < count)
1.1       thorpej   644:                        return ENFILE;
1.12.4.1  thorpej   645:        } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
                    646:                                  refcnt + count) != refcnt);
1.1       thorpej   647:
                    648:        return 0;
                    649: }
1.12.4.1  thorpej   650: #define        futex_hold(f)   futex_hold_count(f, 1)
1.1       thorpej   651:
                    652: /*
1.12.4.1  thorpej   653:  * futex_rele_count(f, n)
1.1       thorpej   654:  *
                    655:  *     Release a reference to f acquired with futex_create or
                    656:  *     futex_hold.
                    657:  *
                    658:  *     May sleep to free f.
                    659:  */
                    660: static void
1.12.4.1  thorpej   661: futex_rele_count(struct futex *f, unsigned long const count)
1.1       thorpej   662: {
                    663:        unsigned long refcnt;
                    664:
                    665:        ASSERT_SLEEPABLE();
                    666:
                    667:        do {
                    668:                refcnt = atomic_load_relaxed(&f->fx_refcnt);
1.12.4.1  thorpej   669:                KASSERT(refcnt >= count);
                    670:                if (refcnt - count == 0)
1.1       thorpej   671:                        goto trylast;
1.12.4.1  thorpej   672:        } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
                    673:                                  refcnt - count) != refcnt);
1.1       thorpej   674:        return;
                    675:
                    676: trylast:
1.12.4.1  thorpej   677:        KASSERT(count <= LONG_MAX);
1.1       thorpej   678:        mutex_enter(&futex_tab.lock);
1.12.4.1  thorpej   679:        if (atomic_add_long_nv(&f->fx_refcnt, -(long)count) == 0) {
1.1       thorpej   680:                if (f->fx_on_tree) {
                    681:                        if (__predict_false(f->fx_shared))
                    682:                                rb_tree_remove_node(&futex_tab.oa, f);
                    683:                        else
                    684:                                rb_tree_remove_node(&futex_tab.va, f);
                    685:                        f->fx_on_tree = false;
                    686:                }
                    687:        } else {
                    688:                /* References remain -- don't destroy it.  */
                    689:                f = NULL;
                    690:        }
                    691:        mutex_exit(&futex_tab.lock);
                    692:        if (f != NULL)
                    693:                futex_destroy(f);
                    694: }
1.12.4.1  thorpej   695: #define        futex_rele(f)   futex_rele_count(f, 1)
1.1       thorpej   696:
                    697: /*
1.12.4.1  thorpej   698:  * futex_rele_count_not_last(f, n)
1.1       thorpej   699:  *
                    700:  *     Release a reference to f acquired with futex_create or
                    701:  *     futex_hold.
                    702:  *
                    703:  *     This version asserts that we are not dropping the last
                    704:  *     reference to f.
                    705:  */
                    706: static void
1.12.4.1  thorpej   707: futex_rele_count_not_last(struct futex *f, unsigned long const count)
1.1       thorpej   708: {
                    709:        unsigned long refcnt;
                    710:
                    711:        do {
                    712:                refcnt = atomic_load_relaxed(&f->fx_refcnt);
1.12.4.1  thorpej   713:                KASSERT(refcnt >= count);
                    714:        } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
                    715:                                  refcnt - count) != refcnt);
1.1       thorpej   716: }
                    717:
                    718: /*
1.12.4.1  thorpej   719:  * futex_lookup_by_key(key, shared, class, &f)
1.1       thorpej   720:  *
                     721:  *     Try to find an existing futex matching the specified key.
                     722:  *     On success, return 0, set f to the futex found or to NULL if not found,
                    723:  *     and increment f's reference count if found.
                    724:  *
                    725:  *     Return ENFILE if reference count too high.
                    726:  *
                    727:  *     Internal lookup routine shared by futex_lookup() and
1.5       riastrad  728:  *     futex_lookup_create().
1.1       thorpej   729:  */
                    730: static int
1.12.4.4  thorpej   731: futex_lookup_by_key(futex_key_t *fk, bool shared, uint8_t class,
1.12.4.1  thorpej   732:     struct futex **fp)
1.1       thorpej   733: {
                    734:        struct futex *f;
                    735:        int error = 0;
                    736:
                    737:        mutex_enter(&futex_tab.lock);
                    738:        if (__predict_false(shared)) {
                    739:                f = rb_tree_find_node(&futex_tab.oa, fk);
                    740:        } else {
                    741:                f = rb_tree_find_node(&futex_tab.va, fk);
                    742:        }
                    743:        if (f) {
1.12.4.1  thorpej   744:                if (__predict_true(f->fx_class == class ||
                    745:                                   class == FUTEX_CLASS_ANY))
                    746:                        error = futex_hold(f);
                    747:                else
                    748:                        error = EINVAL;
1.1       thorpej   749:                if (error)
                    750:                        f = NULL;
                    751:        }
                    752:        *fp = f;
                    753:        mutex_exit(&futex_tab.lock);
                    754:
                    755:        return error;
                    756: }
                    757:
                    758: /*
                    759:  * futex_insert(f, fp)
                    760:  *
                    761:  *     Try to insert the futex f into the tree by va.  If there
                    762:  *     already is a futex for its va, acquire a reference to it, and
                    763:  *     store it in *fp; otherwise store f in *fp.
                    764:  *
                    765:  *     Return 0 on success, ENFILE if there already is a futex but its
                    766:  *     reference count is too high.
                    767:  */
                    768: static int
                    769: futex_insert(struct futex *f, struct futex **fp)
                    770: {
                    771:        struct futex *f0;
                    772:        int error;
                    773:
                    774:        KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
                    775:        KASSERT(!f->fx_on_tree);
                    776:
                    777:        mutex_enter(&futex_tab.lock);
                    778:        if (__predict_false(f->fx_shared))
                    779:                f0 = rb_tree_insert_node(&futex_tab.oa, f);
                    780:        else
                    781:                f0 = rb_tree_insert_node(&futex_tab.va, f);
                    782:        if (f0 == f) {
                    783:                f->fx_on_tree = true;
                    784:                error = 0;
                    785:        } else {
                    786:                KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
                    787:                KASSERT(f0->fx_on_tree);
                    788:                error = futex_hold(f0);
                    789:                if (error)
                    790:                        goto out;
                    791:        }
                    792:        *fp = f0;
                    793: out:   mutex_exit(&futex_tab.lock);
                    794:
                    795:        return error;
                    796: }
                    797:
                    798: /*
1.12.4.1  thorpej   799:  * futex_lookup(uaddr, shared, class, &f)
1.1       thorpej   800:  *
                    801:  *     Find a futex at the userland pointer uaddr in the current
                    802:  *     process's VM space.  On success, return the futex in f and
                    803:  *     increment its reference count.
                    804:  *
1.5       riastrad  805:  *     Caller must call futex_rele when done.
1.1       thorpej   806:  */
                    807: static int
1.12.4.1  thorpej   808: futex_lookup(int *uaddr, bool shared, uint8_t class, struct futex **fp)
1.1       thorpej   809: {
1.12.4.4  thorpej   810:        futex_key_t fk;
1.1       thorpej   811:        struct vmspace *vm = curproc->p_vmspace;
                    812:        vaddr_t va = (vaddr_t)uaddr;
                    813:        int error;
                    814:
                    815:        /*
                    816:         * Reject unaligned user pointers so we don't cross page
                    817:         * boundaries and so atomics will work.
                    818:         */
1.12.4.1  thorpej   819:        if (__predict_false((va & 3) != 0))
1.1       thorpej   820:                return EINVAL;
                    821:
                    822:        /* Look it up. */
                    823:        error = futex_key_init(&fk, vm, va, shared);
                    824:        if (error)
                    825:                return error;
                    826:
1.12.4.1  thorpej   827:        error = futex_lookup_by_key(&fk, shared, class, fp);
1.1       thorpej   828:        futex_key_fini(&fk, shared);
                    829:        if (error)
                    830:                return error;
                    831:
                    832:        KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
1.12.4.1  thorpej   833:        KASSERT(*fp == NULL || (*fp)->fx_class == class);
1.1       thorpej   834:        KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
                    835:
                    836:        /*
                    837:         * Success!  (Caller must still check whether we found
                    838:         * anything, but nothing went _wrong_ like trying to use
                    839:         * unmapped memory.)
                    840:         */
                    841:        KASSERT(error == 0);
                    842:
                    843:        return error;
                    844: }
                    845:
                    846: /*
1.12.4.1  thorpej   847:  * futex_lookup_create(uaddr, shared, class, &f)
1.1       thorpej   848:  *
                    849:  *     Find or create a futex at the userland pointer uaddr in the
                    850:  *     current process's VM space.  On success, return the futex in f
                    851:  *     and increment its reference count.
                    852:  *
1.5       riastrad  853:  *     Caller must call futex_rele when done.
1.1       thorpej   854:  */
                    855: static int
1.12.4.1  thorpej   856: futex_lookup_create(int *uaddr, bool shared, uint8_t class, struct futex **fp)
1.1       thorpej   857: {
1.12.4.4  thorpej   858:        futex_key_t fk;
1.1       thorpej   859:        struct vmspace *vm = curproc->p_vmspace;
                    860:        struct futex *f = NULL;
                    861:        vaddr_t va = (vaddr_t)uaddr;
                    862:        int error;
                    863:
                    864:        /*
                    865:         * Reject unaligned user pointers so we don't cross page
                    866:         * boundaries and so atomics will work.
                    867:         */
1.12.4.1  thorpej   868:        if (__predict_false((va & 3) != 0))
                    869:                return EINVAL;
                    870:
                    871:        if (__predict_false(class == FUTEX_CLASS_ANY))
1.1       thorpej   872:                return EINVAL;
                    873:
                    874:        error = futex_key_init(&fk, vm, va, shared);
                    875:        if (error)
                    876:                return error;
                    877:
                    878:        /*
                    879:         * Optimistically assume there already is one, and try to find
                    880:         * it.
                    881:         */
1.12.4.1  thorpej   882:        error = futex_lookup_by_key(&fk, shared, class, fp);
1.1       thorpej   883:        if (error || *fp != NULL) {
                    884:                /*
                    885:                 * We either found one, or there was an error.
                    886:                 * In either case, we are done with the key.
                    887:                 */
                    888:                futex_key_fini(&fk, shared);
                    889:                goto out;
                    890:        }
                    891:
                    892:        /*
                     893:         * Create a futex record.  This transfers ownership of the key
                    894:         * in all cases.
                    895:         */
1.12.4.1  thorpej   896:        f = futex_create(&fk, shared, class);
1.1       thorpej   897:        if (f == NULL) {
                    898:                error = ENOMEM;
                    899:                goto out;
                    900:        }
                    901:
                    902:        /*
                    903:         * Insert our new futex, or use existing if someone else beat
                    904:         * us to it.
                    905:         */
                    906:        error = futex_insert(f, fp);
                    907:        if (error)
                    908:                goto out;
                    909:        if (*fp == f)
                    910:                f = NULL;       /* don't release on exit */
                    911:
                    912:        /* Success!  */
                    913:        KASSERT(error == 0);
                    914:
                    915: out:   if (f != NULL)
                    916:                futex_rele(f);
                    917:        KASSERT(error || *fp != NULL);
                    918:        KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
                    919:        return error;
                    920: }
                    921:
                    922: /*
1.12.4.1  thorpej   923:  * futex_unsleep:
1.1       thorpej   924:  *
1.12.4.1  thorpej   925:  *     Remove an LWP from the futex and sleep queue.  This is called when
                    926:  *     the LWP has not been awoken normally but instead interrupted: for
                    927:  *     example, when a signal is received.  Must be called with the LWP
                    928:  *     locked.  Will unlock if "unlock" is true.
1.1       thorpej   929:  */
                    930: static void
1.12.4.1  thorpej   931: futex_unsleep(lwp_t *l, bool unlock)
1.1       thorpej   932: {
1.12.4.1  thorpej   933:        struct futex *f __diagused;
1.1       thorpej   934:
1.12.4.1  thorpej   935:        f = (struct futex *)(uintptr_t)l->l_wchan;
1.1       thorpej   936:
1.12.4.1  thorpej   937:        KASSERT(mutex_owned(f->fx_sq_lock));
1.1       thorpej   938:
1.12.4.1  thorpej   939:        /* WRITERQ is more likely */
                    940:        if (__predict_true(l->l_sleepq == &f->fx_sqs[FUTEX_WRITERQ])) {
                    941:                KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] > 0);
                    942:                f->fx_nwaiters[FUTEX_WRITERQ]--;
                    943:        } else {
                    944:                KASSERT(l->l_sleepq == &f->fx_sqs[FUTEX_READERQ]);
                    945:                KASSERT(f->fx_nwaiters[FUTEX_READERQ] > 0);
                    946:                f->fx_nwaiters[FUTEX_READERQ]--;
                    947:        }
1.1       thorpej   948:
1.12.4.1  thorpej   949:        sleepq_unsleep(l, unlock);
1.1       thorpej   950: }
                    951:
                    952: /*
 1.12.4.1  thorpej   953:  * futex_wait(f, q, deadline, clkid, bitset)
1.1       thorpej   954:  *
1.12.4.1  thorpej   955:  *     Wait until deadline on the clock clkid, or forever if deadline
                    956:  *     is NULL, for a futex wakeup.  Return 0 on explicit wakeup,
                    957:  *     ETIMEDOUT on timeout, EINTR on signal.
1.1       thorpej   958:  */
1.12.4.1  thorpej   959: static int
                    960: futex_wait(struct futex *f, int q, const struct timespec *deadline,
                    961:     clockid_t clkid, int bitset)
1.1       thorpej   962: {
                    963:
                    964:        /*
1.12.4.1  thorpej   965:         * Some things to note about how this function works:
                    966:         *
                    967:         * ==> When we enter futex_wait(), the futex's op lock is
                    968:         * held, preventing us from missing wakes.  Once we are on
                    969:         * the futex's sleepq, it is safe to release the op lock.
                    970:         *
                    971:         * ==> We have a single early escape to avoid calling
                    972:         * sleepq_block(): our deadline already having passed.
                    973:         * If this is a no-timeout wait, or if the deadline has
                    974:         * not already passed, we are guaranteed to call sleepq_block().
                    975:         *
                    976:         * ==> sleepq_block() contains all of the necessary logic
                    977:         * for handling signals; we do not need to check for them
                    978:         * ourselves.
                    979:         *
                    980:         * ==> Once we have blocked, other futex operations will
                    981:         * be able to wake us or requeue us.  Until we have blocked,
                    982:         * those other futex operations will not be able to acquire
                    983:         * the sleepq locks in order to perform those operations,
                    984:         * even if we have dropped the op lock.  Thus, we will not
                    985:         * miss wakes.  This fundamental invariant is relied upon by
                    986:         * every other synchronization object in the kernel.
                    987:         *
                    988:         * ==> sleepq_block() returns for one of three reasons:
                    989:         *
                    990:         *      -- We were awakened.
                    991:         *      -- We were signaled.
                    992:         *      -- We timed out.
                    993:         *
                    994:         * ==> Once sleepq_block() returns, we will not call
                    995:         * sleepq_block() again.
                    996:         *
                    997:         * ==> If sleepq_block() returns due to being signaled,
                    998:         * we must check to see if we were also awakened.  This
                    999:         * is to handle the following sequence:
                   1000:         *
                   1001:         *      -- Futex wake takes us off sleepq, places us on runq.
                   1002:         *      -- We are signaled while sitting on the runq.
                   1003:         *      -- We run, and sleepq_block() notices the signal and
                   1004:         *         returns  EINTR/ERESTART.
                   1005:         *
                   1006:         * In this situation, we want to indicate a successful wake
                   1007:         * to the caller, because that wake has been reported to the
                   1008:         * thread that issued it.
                   1009:         *
                   1010:         * Note that it is NOT possible for a wake to be issued after
                   1011:         * a signal; the LWP will already have been taken off the sleepq
                    1012:         * by the signal, so the wake will never happen.  For this
                    1013:         * reason, we must *never* return ERESTART, because it could
                    1014:         * result in us waiting again with no one to awaken us.
1.1       thorpej  1015:         */
                   1016:
1.12.4.1  thorpej  1017:        struct lwp * const l = curlwp;
                   1018:        int timo;
                   1019:        int error;
1.1       thorpej  1020:
1.4       riastrad 1021:        /*
1.12.4.1  thorpej  1022:         * Compute our timeout before taking any further locks.
1.4       riastrad 1023:         */
1.12.4.1  thorpej  1024:        if (deadline) {
                   1025:                struct timespec ts;
1.4       riastrad 1026:
1.12.4.1  thorpej  1027:                /* Check our watch.  */
                   1028:                error = clock_gettime1(clkid, &ts);
                   1029:                if (error) {
                   1030:                        mutex_exit(&f->fx_op_lock);
                   1031:                        return error;
                   1032:                }
1.1       thorpej  1033:
1.12.4.1  thorpej  1034:                /*
                   1035:                 * If we're past the deadline, ETIMEDOUT.
                   1036:                 */
                   1037:                if (timespeccmp(deadline, &ts, <=)) {
                   1038:                        mutex_exit(&f->fx_op_lock);
                   1039:                        return ETIMEDOUT;
                   1040:                } else {
                   1041:                        /* Count how much time is left.  */
                   1042:                        timespecsub(deadline, &ts, &ts);
                   1043:                        timo = tstohz(&ts);
                   1044:                        if (timo == 0) {
                   1045:                                /*
                   1046:                                 * tstohz() already rounds up, and
                   1047:                                 * a return value of 0 ticks means
                   1048:                                 * "expired now or in the past".
                   1049:                                 */
                   1050:                                mutex_exit(&f->fx_op_lock);
                   1051:                                return ETIMEDOUT;
                   1052:                        }
                   1053:                }
                   1054:        } else {
                   1055:                timo = 0;
                   1056:        }
1.1       thorpej  1057:
                   1058:        /*
1.12.4.1  thorpej  1059:         * Acquire in sleepq -> lwp order.  While we're at it, ensure
                   1060:         * that there's room for us to block.
1.1       thorpej  1061:         */
1.12.4.1  thorpej  1062:        mutex_spin_enter(f->fx_sq_lock);
                   1063:        if (__predict_false(q == FUTEX_READERQ &&
                   1064:                            f->fx_nwaiters[q] == FUTEX_TID_MASK)) {
                   1065:                mutex_spin_exit(f->fx_sq_lock);
                   1066:                mutex_exit(&f->fx_op_lock);
                   1067:                return ENFILE;
                   1068:        }
                   1069:        lwp_lock(l);
1.4       riastrad 1070:
                   1071:        /*
1.12.4.1  thorpej  1072:         * No need for locks here; until we're on a futex's sleepq, these
                   1073:         * values are private to the LWP, and the locks needed to get onto
                   1074:         * the sleepq provide the memory barriers needed to ensure visibility.
1.4       riastrad 1075:         */
1.12.4.1  thorpej  1076:        l->l_futex = f;
                   1077:        l->l_futex_wakesel = bitset;
1.4       riastrad 1078:
                   1079:        /*
1.12.4.1  thorpej  1080:         * This is an inlined bit of sleepq_enter();
                   1081:         * we already hold the lwp lock.
1.4       riastrad 1082:         */
1.12.4.1  thorpej  1083:        lwp_unlock_to(l, f->fx_sq_lock);
                   1084:        KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);
                   1085:        KASSERT(lwp_locked(l, f->fx_sq_lock));
1.4       riastrad 1086:
1.12.4.1  thorpej  1087:        sleepq_enqueue(&f->fx_sqs[q], f, futex_wmesg, &futex_syncobj, true);
                   1088:        f->fx_nwaiters[q]++;
1.11      riastrad 1089:
1.12.4.1  thorpej  1090:        /* We can now safely release the op lock. */
                   1091:        mutex_exit(&f->fx_op_lock);
1.11      riastrad 1092:
1.12.4.1  thorpej  1093:        SDT_PROBE1(futex, wait, sleepq_block, entry, timo);
                   1094:        error = sleepq_block(timo, true);
                   1095:        SDT_PROBE1(futex, wait, sleepq_block, exit, error);
1.11      riastrad 1096:
1.12.4.1  thorpej  1097:        /*
                   1098:         * f is no longer valid; we may have been moved to another
                   1099:         * futex sleepq while we slept.
                   1100:         */
1.4       riastrad 1101:
                   1102:        /*
1.12.4.1  thorpej  1103:         * If something went wrong, we may still have a futex reference.
                   1104:         * Check for that here and drop it if so.  We can do this without
                   1105:         * taking any locks because l->l_futex is private to the LWP when
                   1106:         * not on any futex's sleepq, and we are not on any sleepq because
                   1107:         * we are running.
                   1108:         *
                   1109:         * Convert EWOULDBLOCK to ETIMEDOUT in case sleepq_block() returned
                   1110:         * EWOULDBLOCK, and ensure the only other error we return is EINTR.
1.4       riastrad 1111:         */
                   1112:        if (error) {
1.12.4.1  thorpej  1113:                f = l->l_futex;
                   1114:                if (f != NULL) {
                   1115:                        SDT_PROBE0(futex, wait, finish, aborted);
                   1116:                        l->l_futex = NULL;
                   1117:                        futex_rele(f);
                   1118:                } else {
                   1119:                        /* Raced with wakeup; report success. */
                   1120:                        SDT_PROBE0(futex, wait, finish, wakerace);
                   1121:                        error = 0;
                   1122:                }
1.4       riastrad 1123:                if (error == EWOULDBLOCK)
                   1124:                        error = ETIMEDOUT;
1.12.4.1  thorpej  1125:                else if (error != ETIMEDOUT)
                   1126:                        error = EINTR;
                   1127:        } else {
1.12.4.2  thorpej  1128:                f = l->l_futex;
                   1129:                if (__predict_false(f != NULL)) {
                   1130:                        /*
                   1131:                         * There are certain situations that can cause
                   1132:                         * sleepq_block() to return 0 even if we were
                   1133:                         * signalled (by e.g. SIGKILL).  In this case,
                   1134:                         * we will need to release our futex hold and
                   1135:                         * return EINTR (we're probably about to die
                   1136:                         * anyway).
                   1137:                         */
                   1138:                        SDT_PROBE0(futex, wait, finish, aborted);
                   1139:                        l->l_futex = NULL;
                   1140:                        futex_rele(f);
                   1141:                        error = EINTR;
                   1142:                } else {
                   1143:                        SDT_PROBE0(futex, wait, finish, normally);
                   1144:                }
1.4       riastrad 1145:        }
                   1146:
1.1       thorpej  1147:        return error;
                   1148: }
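
/*
 * Illustrative userland sketch (not part of this file): because
 * futex_wait() never returns ERESTART, an interrupted wait surfaces in
 * user space as EINTR, and the caller is expected to re-check the futex
 * word and retry.  This assumes SYS___futex from <sys/syscall.h> and the
 * FUTEX_* constants from <sys/futex.h> are visible to userland; the
 * wrapper name is hypothetical.
 */
#if 0	/* example only, not compiled */
#include <sys/syscall.h>
#include <sys/futex.h>
#include <errno.h>
#include <unistd.h>

static int
example_futex_wait(int *uaddr, int val)
{

	for (;;) {
		if (*uaddr != val)
			return 0;	/* word already changed */
		if (syscall(SYS___futex, uaddr,
		    FUTEX_WAIT | FUTEX_PRIVATE_FLAG, val,
		    NULL, NULL, 0, 0) == 0)
			return 0;	/* woken */
		if (errno == EINTR)
			continue;	/* signal; re-check and wait again */
		if (errno == EAGAIN)
			return 0;	/* lost a race with a wake */
		return -1;		/* e.g. EFAULT or ETIMEDOUT */
	}
}
#endif	/* example only */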
                   1149:
                   1150: /*
1.12.4.1  thorpej  1151:  * futex_wake(f, q, nwake, f2, q2, nrequeue, bitset, nrequeuedp)
1.1       thorpej  1152:  *
                    1153:  *     Wake up to nwake waiters on f matching bitset; then, if f2 is
                    1154:  *     provided, requeue up to nrequeue more matching waiters onto f2.
                    1155:  *     Return the number woken; if nrequeuedp is not NULL, store the
                    1156:  *     number requeued in it.  Caller must hold the op locks of f and f2.
                   1157:  */
                   1158: static unsigned
1.12.4.1  thorpej  1159: futex_wake(struct futex *f, int q, unsigned int const nwake,
                   1160:     struct futex *f2, int q2, unsigned int const nrequeue,
1.12.4.5! thorpej  1161:     int bitset, unsigned int *nrequeuedp)
1.1       thorpej  1162: {
1.12.4.1  thorpej  1163:        struct lwp *l, *l_next;
1.1       thorpej  1164:        unsigned nwoken = 0;
1.12.4.1  thorpej  1165:        unsigned nrequeued = 0;
1.1       thorpej  1166:
1.12.4.1  thorpej  1167:        KASSERT(mutex_owned(&f->fx_op_lock));
                   1168:        KASSERT(f2 == NULL || mutex_owned(&f2->fx_op_lock));
                   1169:
                   1170:        futex_sq_lock2(f, f2);
1.1       thorpej  1171:
                   1172:        /* Wake up to nwake waiters, and count the number woken.  */
1.12.4.1  thorpej  1173:        LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
                   1174:                if (nwoken == nwake)
1.1       thorpej  1175:                        break;
1.12.4.1  thorpej  1176:
                   1177:                KASSERT(l->l_syncobj == &futex_syncobj);
                   1178:                KASSERT(l->l_wchan == (wchan_t)f);
                   1179:                KASSERT(l->l_futex == f);
                   1180:                KASSERT(l->l_sleepq == &f->fx_sqs[q]);
                   1181:                KASSERT(l->l_mutex == f->fx_sq_lock);
                   1182:
                   1183:                if ((l->l_futex_wakesel & bitset) == 0)
                   1184:                        continue;
                   1185:
                   1186:                l->l_futex_wakesel = 0;
                   1187:                l->l_futex = NULL;
                   1188:                sleepq_remove(&f->fx_sqs[q], l);
                   1189:                f->fx_nwaiters[q]--;
                   1190:                nwoken++;
                   1191:                /* hold counts adjusted in bulk below */
1.1       thorpej  1192:        }
                   1193:
1.12.4.1  thorpej  1194:        if (nrequeue) {
                   1195:                KASSERT(f2 != NULL);
1.1       thorpej  1196:                /* Move up to nrequeue waiters from f's queue to f2's queue. */
1.12.4.1  thorpej  1197:                LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
                   1198:                        if (nrequeued == nrequeue)
1.1       thorpej  1199:                                break;
1.12.4.1  thorpej  1200:
                   1201:                        KASSERT(l->l_syncobj == &futex_syncobj);
                   1202:                        KASSERT(l->l_wchan == (wchan_t)f);
                   1203:                        KASSERT(l->l_futex == f);
                   1204:                        KASSERT(l->l_sleepq == &f->fx_sqs[q]);
                   1205:                        KASSERT(l->l_mutex == f->fx_sq_lock);
                   1206:
                   1207:                        if ((l->l_futex_wakesel & bitset) == 0)
                   1208:                                continue;
                   1209:
                   1210:                        l->l_futex = f2;
                   1211:                        sleepq_transfer(l, &f->fx_sqs[q],
                   1212:                            &f2->fx_sqs[q2], f2, futex_wmesg,
                   1213:                            &futex_syncobj, f2->fx_sq_lock, true);
                   1214:                        f->fx_nwaiters[q]--;
                   1215:                        f2->fx_nwaiters[q2]++;
                   1216:                        nrequeued++;
                   1217:                        /* hold counts adjusted in bulk below */
                   1218:                }
                   1219:                if (nrequeued) {
                   1220:                        /* XXX futex_hold() could theoretically fail here. */
                   1221:                        int hold_error __diagused =
                   1222:                            futex_hold_count(f2, nrequeued);
                   1223:                        KASSERT(hold_error == 0);
1.1       thorpej  1224:                }
                   1225:        }
                   1226:
1.12.4.1  thorpej  1227:        /*
                   1228:         * Adjust the futex reference counts for the wakes and
                   1229:         * requeues.
                   1230:         */
                   1231:        KASSERT(nwoken + nrequeued <= LONG_MAX);
                   1232:        futex_rele_count_not_last(f, nwoken + nrequeued);
                   1233:
                   1234:        futex_sq_unlock2(f, f2);
                   1235:
                   1236:        /* Return the number of waiters woken and requeued.  */
1.12.4.5! thorpej  1237:        if (nrequeuedp != NULL) {
        !          1238:                *nrequeuedp = nrequeued;
        !          1239:        }
        !          1240:        return nwoken;
1.1       thorpej  1241: }
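
/*
 * Illustrative userland sketch (not part of this file): the per-LWP
 * wake-selection mask tested above is what FUTEX_WAIT_BITSET and
 * FUTEX_WAKE_BITSET expose.  A waiter tags itself with a bitset, and a
 * waker only wakes waiters whose bitsets intersect its own.  Assumes
 * SYS___futex and the constants from <sys/futex.h>; the function names
 * and the 0x1/0x2 "channel" bits are hypothetical.
 */
#if 0	/* example only, not compiled */
#include <sys/syscall.h>
#include <sys/futex.h>
#include <unistd.h>

static int
example_wait_on_channel_a(int *uaddr, int val)
{

	/* Sleep on uaddr, but only wakes tagged with bit 0x1 apply. */
	return syscall(SYS___futex, uaddr,
	    FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG, val,
	    NULL, NULL, 0, 0x1);
}

static int
example_wake_channel_a(int *uaddr)
{

	/*
	 * Wakes up to one waiter whose bitset intersects 0x1; waking
	 * with 0x2 instead would leave the waiter above asleep.
	 */
	return syscall(SYS___futex, uaddr,
	    FUTEX_WAKE_BITSET | FUTEX_PRIVATE_FLAG, 1,
	    NULL, NULL, 0, 0x1);
}
#endif	/* example only */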
                   1242:
                   1243: /*
1.12.4.1  thorpej  1244:  * futex_op_lock(f)
1.1       thorpej  1245:  *
1.12.4.1  thorpej  1246:  *     Acquire the op lock of f.  Pair with futex_op_unlock.  Do
1.1       thorpej  1247:  *     not use if caller needs to acquire two locks; use
1.12.4.1  thorpej  1248:  *     futex_op_lock2 instead.
1.1       thorpej  1249:  */
                   1250: static void
1.12.4.1  thorpej  1251: futex_op_lock(struct futex *f)
1.1       thorpej  1252: {
1.12.4.1  thorpej  1253:        mutex_enter(&f->fx_op_lock);
1.1       thorpej  1254: }
                   1255:
                   1256: /*
1.12.4.1  thorpej  1257:  * futex_op_unlock(f)
1.1       thorpej  1258:  *
1.12.4.3  thorpej  1259:  *     Release the op lock of f.
1.1       thorpej  1260:  */
                   1261: static void
1.12.4.1  thorpej  1262: futex_op_unlock(struct futex *f)
1.1       thorpej  1263: {
1.12.4.1  thorpej  1264:        mutex_exit(&f->fx_op_lock);
1.1       thorpej  1265: }
                   1266:
                   1267: /*
1.12.4.1  thorpej  1268:  * futex_op_lock2(f, f2)
1.1       thorpej  1269:  *
1.12.4.1  thorpej  1270:  *     Acquire the op locks of both f and f2, which may be null, or
                   1271:  *     which may be the same.  If they are distinct, an arbitrary total
                   1272:  *     order is chosen on the locks.
1.1       thorpej  1273:  *
1.12.4.1  thorpej  1274:  *     Callers should only ever acquire multiple op locks
                   1275:  *     simultaneously using futex_op_lock2.
1.1       thorpej  1276:  */
                   1277: static void
1.12.4.1  thorpej  1278: futex_op_lock2(struct futex *f, struct futex *f2)
1.1       thorpej  1279: {
                   1280:
                   1281:        /*
                   1282:         * If both are null, do nothing; if one is null and the other
                   1283:         * is not, lock the other and be done with it.
                   1284:         */
                   1285:        if (f == NULL && f2 == NULL) {
                   1286:                return;
                   1287:        } else if (f == NULL) {
1.12.4.1  thorpej  1288:                mutex_enter(&f2->fx_op_lock);
1.1       thorpej  1289:                return;
                   1290:        } else if (f2 == NULL) {
1.12.4.1  thorpej  1291:                mutex_enter(&f->fx_op_lock);
1.1       thorpej  1292:                return;
                   1293:        }
                   1294:
                   1295:        /* If both futexes are the same, acquire only one. */
                   1296:        if (f == f2) {
1.12.4.1  thorpej  1297:                mutex_enter(&f->fx_op_lock);
1.1       thorpej  1298:                return;
                   1299:        }
                   1300:
                   1301:        /* Otherwise, use the ordering on the kva of the futex pointer.  */
                   1302:        if ((uintptr_t)f < (uintptr_t)f2) {
1.12.4.1  thorpej  1303:                mutex_enter(&f->fx_op_lock);
                   1304:                mutex_enter(&f2->fx_op_lock);
1.1       thorpej  1305:        } else {
1.12.4.1  thorpej  1306:                mutex_enter(&f2->fx_op_lock);
                   1307:                mutex_enter(&f->fx_op_lock);
1.1       thorpej  1308:        }
                   1309: }
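
/*
 * The order-by-address trick above is the standard way to take two
 * locks of the same type without deadlock.  A minimal userland sketch
 * of the same idiom with POSIX mutexes (the function name is
 * hypothetical):
 */
#if 0	/* example only, not compiled */
#include <pthread.h>
#include <stdint.h>

static void
example_lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{

	if (a == b) {
		pthread_mutex_lock(a);
		return;
	}
	/* Every caller locks the lower address first, so no cycles. */
	if ((uintptr_t)a < (uintptr_t)b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}
#endif	/* example only */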
                   1310:
                   1311: /*
1.12.4.1  thorpej  1312:  * futex_op_unlock2(f, f2)
1.1       thorpej  1313:  *
                   1314:  *     Release the queue locks of both f and f2, which may be null, or
                   1315:  *     which may have the same underlying queue.
                   1316:  */
                   1317: static void
1.12.4.1  thorpej  1318: futex_op_unlock2(struct futex *f, struct futex *f2)
1.1       thorpej  1319: {
                   1320:
                   1321:        /*
                   1322:         * If both are null, do nothing; if one is null and the other
                   1323:         * is not, unlock the other and be done with it.
                   1324:         */
                   1325:        if (f == NULL && f2 == NULL) {
                   1326:                return;
                   1327:        } else if (f == NULL) {
1.12.4.1  thorpej  1328:                mutex_exit(&f2->fx_op_lock);
1.1       thorpej  1329:                return;
                   1330:        } else if (f2 == NULL) {
1.12.4.1  thorpej  1331:                mutex_exit(&f->fx_op_lock);
1.1       thorpej  1332:                return;
                   1333:        }
                   1334:
                   1335:        /* If both futexes are the same, release only one. */
                   1336:        if (f == f2) {
1.12.4.1  thorpej  1337:                mutex_exit(&f->fx_op_lock);
1.1       thorpej  1338:                return;
                   1339:        }
                   1340:
                   1341:        /* Otherwise, use the ordering on the kva of the futex pointer.  */
                   1342:        if ((uintptr_t)f < (uintptr_t)f2) {
1.12.4.1  thorpej  1343:                mutex_exit(&f2->fx_op_lock);
                   1344:                mutex_exit(&f->fx_op_lock);
1.1       thorpej  1345:        } else {
1.12.4.1  thorpej  1346:                mutex_exit(&f->fx_op_lock);
                   1347:                mutex_exit(&f2->fx_op_lock);
1.1       thorpej  1348:        }
                   1349: }
                   1350:
                   1351: /*
                   1352:  * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
                   1353:  *
1.12.4.1  thorpej  1354:  *     Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET).
1.1       thorpej  1355:  */
                   1356: static int
                   1357: futex_func_wait(bool shared, int *uaddr, int val, int val3,
1.11      riastrad 1358:     const struct timespec *timeout, clockid_t clkid, int clkflags,
1.1       thorpej  1359:     register_t *retval)
                   1360: {
                   1361:        struct futex *f;
1.11      riastrad 1362:        struct timespec ts;
                   1363:        const struct timespec *deadline;
1.1       thorpej  1364:        int error;
                   1365:
1.6       riastrad 1366:        /*
                   1367:         * If there's nothing to wait for, and nobody will ever wake
                   1368:         * us, then don't set anything up to wait -- just stop here.
                   1369:         */
                   1370:        if (val3 == 0)
1.7       riastrad 1371:                return EINVAL;
1.6       riastrad 1372:
1.1       thorpej  1373:        /* Optimistically test before anything else.  */
                   1374:        if (!futex_test(uaddr, val))
                   1375:                return EAGAIN;
                   1376:
1.11      riastrad 1377:        /* Determine a deadline on the specified clock.  */
1.12.4.1  thorpej  1378:        if (timeout == NULL) {
                   1379:                deadline = timeout;
                   1380:        } else if ((clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
                   1381:                /* Sanity-check the deadline. */
                   1382:                if (timeout->tv_sec < 0 ||
                   1383:                    timeout->tv_nsec < 0 ||
                   1384:                    timeout->tv_nsec >= 1000000000L) {
                   1385:                        return EINVAL;
                   1386:                }
1.11      riastrad 1387:                deadline = timeout;
                   1388:        } else {
1.12.4.1  thorpej  1389:                struct timespec interval = *timeout;
                   1390:
                   1391:                error = itimespecfix(&interval);
                   1392:                if (error)
                   1393:                        return error;
1.11      riastrad 1394:                error = clock_gettime1(clkid, &ts);
                   1395:                if (error)
                   1396:                        return error;
1.12.4.1  thorpej  1397:                timespecadd(&ts, &interval, &ts);
1.11      riastrad 1398:                deadline = &ts;
                   1399:        }
                   1400:
1.1       thorpej  1401:        /* Get the futex, creating it if necessary.  */
1.12.4.1  thorpej  1402:        error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1.1       thorpej  1403:        if (error)
                   1404:                return error;
                   1405:        KASSERT(f);
                   1406:
                   1407:        /*
1.12.4.1  thorpej  1408:         * Under the op lock, check the value again: if it has
1.1       thorpej  1409:         * already changed, EAGAIN; otherwise enqueue the waiter.
                   1410:         * Since FUTEX_WAKE will use the same lock and be done after
                   1411:         * modifying the value, the order in which we check and enqueue
                   1412:         * is immaterial.
                   1413:         */
1.12.4.1  thorpej  1414:        futex_op_lock(f);
1.1       thorpej  1415:        if (!futex_test(uaddr, val)) {
1.12.4.1  thorpej  1416:                futex_op_unlock(f);
1.1       thorpej  1417:                error = EAGAIN;
                   1418:                goto out;
                   1419:        }
1.12.4.1  thorpej  1420:
                   1421:        /*
                   1422:         * Now wait.  futex_wait() will drop our op lock once we
                   1423:         * are entered into the sleep queue, thus ensuring atomicity
                   1424:         * of wakes with respect to waits.
                   1425:         */
                   1426:        error = futex_wait(f, FUTEX_WRITERQ, deadline, clkid, val3);
1.1       thorpej  1427:
                   1428:        /*
                   1429:         * We cannot drop our reference to the futex here, because
                   1430:         * we might be enqueued on a different one when we are awakened.
1.12.4.1  thorpej  1431:         * The references will be managed on our behalf in the requeue,
                   1432:         * wake, and error cases.
1.1       thorpej  1433:         */
                   1434:        f = NULL;
                   1435:
1.12.4.1  thorpej  1436:        if (__predict_true(error == 0)) {
                   1437:                /* Return 0 on success, error on failure. */
                   1438:                *retval = 0;
                   1439:        }
1.1       thorpej  1440:
                   1441: out:   if (f != NULL)
1.5       riastrad 1442:                futex_rele(f);
1.1       thorpej  1443:        return error;
                   1444: }
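
/*
 * Illustrative sketch (not part of this file): FUTEX_WAIT takes a
 * relative timeout while FUTEX_WAIT_BITSET takes an absolute deadline
 * (TIMER_ABSTIME above), on CLOCK_MONOTONIC unless FUTEX_CLOCK_REALTIME
 * is set.  A caller of the absolute form builds the deadline the same
 * way this function does for the relative case: sample the clock and
 * add the interval.  The helper name is hypothetical.
 */
#if 0	/* example only, not compiled */
#include <time.h>

static void
example_make_deadline(const struct timespec *rel, struct timespec *abs)
{

	clock_gettime(CLOCK_MONOTONIC, abs);
	abs->tv_sec += rel->tv_sec;
	abs->tv_nsec += rel->tv_nsec;
	if (abs->tv_nsec >= 1000000000L) {
		abs->tv_sec++;
		abs->tv_nsec -= 1000000000L;
	}
}
#endif	/* example only */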
                   1445:
                   1446: /*
                   1447:  * futex_func_wake(uaddr, val, val3, retval)
                   1448:  *
                   1449:  *     Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
                   1450:  */
                   1451: static int
                   1452: futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval)
                   1453: {
                   1454:        struct futex *f;
                   1455:        unsigned int nwoken = 0;
                   1456:        int error = 0;
                   1457:
                   1458:        /* Reject negative number of wakeups.  */
                   1459:        if (val < 0) {
                   1460:                error = EINVAL;
                   1461:                goto out;
                   1462:        }
                   1463:
                   1464:        /* Look up the futex, if any.  */
1.12.4.1  thorpej  1465:        error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1.1       thorpej  1466:        if (error)
                   1467:                goto out;
                   1468:
                   1469:        /* If there's no futex, there are no waiters to wake.  */
                   1470:        if (f == NULL)
                   1471:                goto out;
                   1472:
                   1473:        /*
1.12.4.1  thorpej  1474:         * Under f's op lock, wake the waiters and remember the
1.1       thorpej  1475:         * number woken.
                   1476:         */
1.12.4.1  thorpej  1477:        futex_op_lock(f);
                   1478:        nwoken = futex_wake(f, FUTEX_WRITERQ, val,
1.12.4.5! thorpej  1479:                            NULL, FUTEX_WRITERQ, 0, val3, NULL);
1.12.4.1  thorpej  1480:        futex_op_unlock(f);
1.1       thorpej  1481:
                   1482:        /* Release the futex.  */
1.5       riastrad 1483:        futex_rele(f);
1.1       thorpej  1484:
                   1485: out:
                   1486:        /* Return the number of waiters woken.  */
                   1487:        *retval = nwoken;
                   1488:
                    1489:        /* Return 0 on success, error on failure.  */
                   1490:        return error;
                   1491: }
                   1492:
                   1493: /*
                   1494:  * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval)
                   1495:  *
                   1496:  *     Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE).
                   1497:  */
                   1498: static int
                   1499: futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2,
                   1500:     int val2, int val3, register_t *retval)
                   1501: {
                   1502:        struct futex *f = NULL, *f2 = NULL;
                   1503:        unsigned nwoken = 0;    /* default to zero woken on early return */
1.12.4.5! thorpej  1504:        unsigned nrequeued = 0;
1.1       thorpej  1505:        int error;
                   1506:
                   1507:        /* Reject negative number of wakeups or requeues. */
                   1508:        if (val < 0 || val2 < 0) {
                   1509:                error = EINVAL;
                   1510:                goto out;
                   1511:        }
                   1512:
                   1513:        /* Look up the source futex, if any. */
1.12.4.1  thorpej  1514:        error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1.1       thorpej  1515:        if (error)
                   1516:                goto out;
                   1517:
                   1518:        /* If there is none, nothing to do. */
                   1519:        if (f == NULL)
                   1520:                goto out;
                   1521:
                   1522:        /*
                   1523:         * We may need to create the destination futex because it's
                   1524:         * entirely possible it does not currently have any waiters.
                   1525:         */
1.12.4.1  thorpej  1526:        error = futex_lookup_create(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
1.1       thorpej  1527:        if (error)
                   1528:                goto out;
                   1529:
                   1530:        /*
1.12.4.1  thorpej  1531:         * Under the futexes' op locks, check the value; if
1.1       thorpej  1532:         * unchanged from val3, wake the waiters.
                   1533:         */
1.12.4.1  thorpej  1534:        futex_op_lock2(f, f2);
1.1       thorpej  1535:        if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
                   1536:                error = EAGAIN;
                   1537:        } else {
                   1538:                error = 0;
1.12.4.1  thorpej  1539:                nwoken = futex_wake(f, FUTEX_WRITERQ, val,
                   1540:                                    f2, FUTEX_WRITERQ, val2,
1.12.4.5! thorpej  1541:                                    FUTEX_BITSET_MATCH_ANY,
        !          1542:                                    &nrequeued);
1.1       thorpej  1543:        }
1.12.4.1  thorpej  1544:        futex_op_unlock2(f, f2);
1.1       thorpej  1545:
                   1546: out:
1.12.4.5! thorpej  1547:        /*
         !          1548:         * For FUTEX_REQUEUE, return the number of waiters woken.
        !          1549:         *
        !          1550:         * For FUTEX_CMP_REQUEUE, return the number of waiters woken
        !          1551:         * **and** requeued.
        !          1552:         */
         !          1553:        *retval = nwoken + ((op == FUTEX_CMP_REQUEUE) ? nrequeued : 0);
1.1       thorpej  1554:
                   1555:        /* Release the futexes if we got them.  */
                   1556:        if (f2)
1.5       riastrad 1557:                futex_rele(f2);
1.1       thorpej  1558:        if (f)
1.5       riastrad 1559:                futex_rele(f);
1.1       thorpej  1560:        return error;
                   1561: }
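
/*
 * Illustrative userland sketch (not part of this file): the classic
 * consumer of FUTEX_CMP_REQUEUE is a condition-variable broadcast that
 * wakes one waiter and requeues the rest onto the mutex word, so they
 * are released one at a time as the mutex is handed off instead of all
 * stampeding at once.  Assumes SYS___futex and the <sys/futex.h>
 * constants; "cv" and "mtx" are hypothetical 32-bit futex words.
 */
#if 0	/* example only, not compiled */
#include <sys/syscall.h>
#include <sys/futex.h>
#include <limits.h>
#include <unistd.h>

static int
example_cv_broadcast(int *cv, int *mtx)
{
	const int expected = *cv;	/* value the waiters slept on */

	/* val = wake one, val2 = requeue the rest, val3 = expected value */
	return syscall(SYS___futex, cv,
	    FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG, 1,
	    NULL, mtx, INT_MAX, expected);
}
#endif	/* example only */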
                   1562:
                   1563: /*
                   1564:  * futex_validate_op_cmp(val3)
                   1565:  *
                   1566:  *     Validate an op/cmp argument for FUTEX_WAKE_OP.
                   1567:  */
                   1568: static int
                   1569: futex_validate_op_cmp(int val3)
                   1570: {
                   1571:        int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
                   1572:        int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
                   1573:
                   1574:        if (op & FUTEX_OP_OPARG_SHIFT) {
                   1575:                int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
                   1576:                if (oparg < 0)
                   1577:                        return EINVAL;
                   1578:                if (oparg >= 32)
                   1579:                        return EINVAL;
                   1580:                op &= ~FUTEX_OP_OPARG_SHIFT;
                   1581:        }
                   1582:
                   1583:        switch (op) {
                   1584:        case FUTEX_OP_SET:
                   1585:        case FUTEX_OP_ADD:
                   1586:        case FUTEX_OP_OR:
                   1587:        case FUTEX_OP_ANDN:
                   1588:        case FUTEX_OP_XOR:
                   1589:                break;
                   1590:        default:
                   1591:                return EINVAL;
                   1592:        }
                   1593:
                   1594:        switch (cmp) {
                   1595:        case FUTEX_OP_CMP_EQ:
                   1596:        case FUTEX_OP_CMP_NE:
                   1597:        case FUTEX_OP_CMP_LT:
                   1598:        case FUTEX_OP_CMP_LE:
                   1599:        case FUTEX_OP_CMP_GT:
                   1600:        case FUTEX_OP_CMP_GE:
                   1601:                break;
                   1602:        default:
                   1603:                return EINVAL;
                   1604:        }
                   1605:
                   1606:        return 0;
                   1607: }
                   1608:
                   1609: /*
                   1610:  * futex_compute_op(oldval, val3)
                   1611:  *
                    1612:  *     Apply a FUTEX_WAKE_OP operation to oldval.
                   1613:  */
                   1614: static int
                   1615: futex_compute_op(int oldval, int val3)
                   1616: {
                   1617:        int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
                   1618:        int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
                   1619:
                   1620:        if (op & FUTEX_OP_OPARG_SHIFT) {
                   1621:                KASSERT(oparg >= 0);
                   1622:                KASSERT(oparg < 32);
                   1623:                oparg = 1u << oparg;
                   1624:                op &= ~FUTEX_OP_OPARG_SHIFT;
                   1625:        }
                   1626:
                   1627:        switch (op) {
                   1628:        case FUTEX_OP_SET:
                   1629:                return oparg;
                   1630:
                   1631:        case FUTEX_OP_ADD:
                   1632:                /*
                   1633:                 * Avoid signed arithmetic overflow by doing
                   1634:                 * arithmetic unsigned and converting back to signed
                   1635:                 * at the end.
                   1636:                 */
                   1637:                return (int)((unsigned)oldval + (unsigned)oparg);
                   1638:
                   1639:        case FUTEX_OP_OR:
                   1640:                return oldval | oparg;
                   1641:
                   1642:        case FUTEX_OP_ANDN:
                   1643:                return oldval & ~oparg;
                   1644:
                   1645:        case FUTEX_OP_XOR:
                   1646:                return oldval ^ oparg;
                   1647:
                   1648:        default:
                   1649:                panic("invalid futex op");
                   1650:        }
                   1651: }
                   1652:
                   1653: /*
                   1654:  * futex_compute_cmp(oldval, val3)
                   1655:  *
                    1656:  *     Apply a FUTEX_WAKE_OP comparison to oldval.
                   1657:  */
                   1658: static bool
                   1659: futex_compute_cmp(int oldval, int val3)
                   1660: {
                   1661:        int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
                   1662:        int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);
                   1663:
                   1664:        switch (cmp) {
                   1665:        case FUTEX_OP_CMP_EQ:
                   1666:                return (oldval == cmparg);
                   1667:
                   1668:        case FUTEX_OP_CMP_NE:
                   1669:                return (oldval != cmparg);
                   1670:
                   1671:        case FUTEX_OP_CMP_LT:
                   1672:                return (oldval < cmparg);
                   1673:
                   1674:        case FUTEX_OP_CMP_LE:
                   1675:                return (oldval <= cmparg);
                   1676:
                   1677:        case FUTEX_OP_CMP_GT:
                   1678:                return (oldval > cmparg);
                   1679:
                   1680:        case FUTEX_OP_CMP_GE:
                   1681:                return (oldval >= cmparg);
                   1682:
                   1683:        default:
                   1684:                panic("invalid futex cmp operation");
                   1685:        }
                   1686: }
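
/*
 * Illustrative sketch (not part of this file): how a caller might build
 * the val3 word these decoders consume.  The explicit shift counts
 * assume the Linux-compatible field layout used by <sys/futex.h> (op in
 * bits 28-31, cmp in 24-27, oparg in 12-23, cmparg in 0-11); the
 * encoder name is local to the example.
 */
#if 0	/* example only, not compiled */
#include <sys/futex.h>

static inline int
example_futex_op_encode(unsigned op, unsigned oparg, unsigned cmp,
    unsigned cmparg)
{

	return (int)(((op & 0xf) << 28) | ((cmp & 0xf) << 24) |
	    ((oparg & 0xfff) << 12) | (cmparg & 0xfff));
}

/*
 * "Set *uaddr2 to 1; afterwards, wake waiters on uaddr2 if its old
 * value was 0":
 *
 *	val3 = example_futex_op_encode(FUTEX_OP_SET, 1, FUTEX_OP_CMP_EQ, 0);
 */
#endif	/* example only */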
                   1687:
                   1688: /*
                   1689:  * futex_func_wake_op(uaddr, val, uaddr2, val2, val3, retval)
                   1690:  *
                   1691:  *     Implement futex(FUTEX_WAKE_OP).
                   1692:  */
                   1693: static int
                   1694: futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
                   1695:     int val3, register_t *retval)
                   1696: {
                   1697:        struct futex *f = NULL, *f2 = NULL;
                   1698:        int oldval, newval, actual;
                   1699:        unsigned nwoken = 0;
                   1700:        int error;
                   1701:
                   1702:        /* Reject negative number of wakeups.  */
                   1703:        if (val < 0 || val2 < 0) {
                   1704:                error = EINVAL;
                   1705:                goto out;
                   1706:        }
                   1707:
                   1708:        /* Reject invalid operations before we start doing things.  */
                   1709:        if ((error = futex_validate_op_cmp(val3)) != 0)
                   1710:                goto out;
                   1711:
                   1712:        /* Look up the first futex, if any.  */
1.12.4.1  thorpej  1713:        error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1.1       thorpej  1714:        if (error)
                   1715:                goto out;
                   1716:
                   1717:        /* Look up the second futex, if any.  */
1.12.4.1  thorpej  1718:        error = futex_lookup(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
1.1       thorpej  1719:        if (error)
                   1720:                goto out;
                   1721:
                   1722:        /*
1.12.4.1  thorpej  1723:         * Under the op locks:
1.1       thorpej  1724:         *
                   1725:         * 1. Read/modify/write: *uaddr2 op= oparg.
                   1726:         * 2. Unconditionally wake uaddr.
1.12.4.1  thorpej  1727:         * 3. Conditionally wake uaddr2, if it previously matched val3.
1.1       thorpej  1728:         */
1.12.4.1  thorpej  1729:        futex_op_lock2(f, f2);
1.1       thorpej  1730:        do {
                   1731:                error = futex_load(uaddr2, &oldval);
                   1732:                if (error)
                   1733:                        goto out_unlock;
                   1734:                newval = futex_compute_op(oldval, val3);
                   1735:                error = ucas_int(uaddr2, oldval, newval, &actual);
                   1736:                if (error)
                   1737:                        goto out_unlock;
                   1738:        } while (actual != oldval);
1.12.4.1  thorpej  1739:        nwoken = (f ? futex_wake(f, FUTEX_WRITERQ, val,
                   1740:                                 NULL, FUTEX_WRITERQ, 0,
1.12.4.5! thorpej  1741:                                 FUTEX_BITSET_MATCH_ANY, NULL) : 0);
1.1       thorpej  1742:        if (f2 && futex_compute_cmp(oldval, val3))
1.12.4.1  thorpej  1743:                nwoken += futex_wake(f2, FUTEX_WRITERQ, val2,
                   1744:                                     NULL, FUTEX_WRITERQ, 0,
1.12.4.5! thorpej  1745:                                     FUTEX_BITSET_MATCH_ANY, NULL);
1.1       thorpej  1746:
                   1747:        /* Success! */
                   1748:        error = 0;
                   1749: out_unlock:
1.12.4.1  thorpej  1750:        futex_op_unlock2(f, f2);
1.1       thorpej  1751:
                   1752: out:
                   1753:        /* Return the number of waiters woken. */
                   1754:        *retval = nwoken;
                   1755:
                   1756:        /* Release the futexes, if we got them. */
                   1757:        if (f2)
1.5       riastrad 1758:                futex_rele(f2);
1.1       thorpej  1759:        if (f)
1.5       riastrad 1760:                futex_rele(f);
1.1       thorpej  1761:        return error;
                   1762: }
                   1763:
                   1764: /*
                   1765:  * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
                   1766:  *
                   1767:  *     Implement the futex system call with all the parameters
                   1768:  *     parsed out.
                   1769:  */
                   1770: int
1.11      riastrad 1771: do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
                   1772:     int *uaddr2, int val2, int val3, register_t *retval)
1.1       thorpej  1773: {
                   1774:        const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
                   1775:        const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
                   1776:                                                            : CLOCK_MONOTONIC;
1.12.4.1  thorpej  1777:        int error;
1.1       thorpej  1778:
                   1779:        op &= FUTEX_CMD_MASK;
                   1780:
                   1781:        switch (op) {
                   1782:        case FUTEX_WAIT:
1.12.4.1  thorpej  1783:                SDT_PROBE6(futex, func, wait, entry,
                   1784:                    uaddr, val, FUTEX_BITSET_MATCH_ANY, timeout,
                   1785:                    TIMER_RELTIME, op & ~FUTEX_CMD_MASK);
                   1786:                error = futex_func_wait(shared, uaddr, val,
1.1       thorpej  1787:                    FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
                   1788:                    retval);
1.12.4.1  thorpej  1789:                SDT_PROBE1(futex, func, wait, exit, error);
                   1790:                break;
                   1791:
                   1792:        case FUTEX_WAIT_BITSET:
                   1793:                SDT_PROBE6(futex, func, wait, entry,
                   1794:                    uaddr, val, val3, timeout,
                   1795:                    TIMER_ABSTIME, op & ~FUTEX_CMD_MASK);
                   1796:                error = futex_func_wait(shared, uaddr, val, val3, timeout,
                   1797:                    clkid, TIMER_ABSTIME, retval);
                   1798:                SDT_PROBE1(futex, func, wait, exit, error);
                   1799:                break;
1.1       thorpej  1800:
                   1801:        case FUTEX_WAKE:
1.12.4.1  thorpej  1802:                SDT_PROBE4(futex, func, wake, entry,
                   1803:                    uaddr, val, FUTEX_BITSET_MATCH_ANY, op & ~FUTEX_CMD_MASK);
                   1804:                error = futex_func_wake(shared, uaddr, val,
                   1805:                    FUTEX_BITSET_MATCH_ANY, retval);
                   1806:                SDT_PROBE2(futex, func, wake, exit, error, *retval);
                   1807:                break;
                   1808:
1.1       thorpej  1809:        case FUTEX_WAKE_BITSET:
1.12.4.1  thorpej  1810:                SDT_PROBE4(futex, func, wake, entry,
                   1811:                    uaddr, val, val3, op & ~FUTEX_CMD_MASK);
                   1812:                error = futex_func_wake(shared, uaddr, val, val3, retval);
                   1813:                SDT_PROBE2(futex, func, wake, exit, error, *retval);
                   1814:                break;
1.1       thorpej  1815:
                   1816:        case FUTEX_REQUEUE:
1.12.4.1  thorpej  1817:                SDT_PROBE5(futex, func, requeue, entry,
                   1818:                    uaddr, val, uaddr2, val2, op & ~FUTEX_CMD_MASK);
                   1819:                error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
                   1820:                    val2, 0, retval);
                   1821:                SDT_PROBE2(futex, func, requeue, exit, error, *retval);
                   1822:                break;
                   1823:
1.1       thorpej  1824:        case FUTEX_CMP_REQUEUE:
1.12.4.1  thorpej  1825:                SDT_PROBE6(futex, func, cmp_requeue, entry,
                   1826:                    uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
                   1827:                error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
1.1       thorpej  1828:                    val2, val3, retval);
1.12.4.1  thorpej  1829:                SDT_PROBE2(futex, func, cmp_requeue, exit, error, *retval);
                   1830:                break;
1.1       thorpej  1831:
                   1832:        case FUTEX_WAKE_OP:
1.12.4.1  thorpej  1833:                SDT_PROBE6(futex, func, wake_op, entry,
                   1834:                    uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
                   1835:                error = futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
1.1       thorpej  1836:                    val3, retval);
1.12.4.1  thorpej  1837:                SDT_PROBE2(futex, func, wake_op, exit, error, *retval);
                   1838:                break;
1.1       thorpej  1839:
                   1840:        case FUTEX_FD:
                   1841:        default:
1.12.4.1  thorpej  1842:                error = ENOSYS;
                   1843:                break;
1.1       thorpej  1844:        }
1.12.4.1  thorpej  1845:
                   1846:        return error;
1.1       thorpej  1847: }
                   1848:
                   1849: /*
                   1850:  * sys___futex(l, uap, retval)
                   1851:  *
                   1852:  *     __futex(2) system call: generic futex operations.
                   1853:  */
                   1854: int
                   1855: sys___futex(struct lwp *l, const struct sys___futex_args *uap,
                   1856:     register_t *retval)
                   1857: {
                   1858:        /* {
                   1859:                syscallarg(int *) uaddr;
                   1860:                syscallarg(int) op;
                   1861:                syscallarg(int) val;
                   1862:                syscallarg(const struct timespec *) timeout;
                   1863:                syscallarg(int *) uaddr2;
                   1864:                syscallarg(int) val2;
                   1865:                syscallarg(int) val3;
                   1866:        } */
                   1867:        struct timespec ts, *tsp;
                   1868:        int error;
                   1869:
                   1870:        /*
                   1871:         * Copy in the timeout argument, if specified.
                   1872:         */
                   1873:        if (SCARG(uap, timeout)) {
                   1874:                error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
                   1875:                if (error)
                   1876:                        return error;
                   1877:                tsp = &ts;
                   1878:        } else {
                   1879:                tsp = NULL;
                   1880:        }
                   1881:
                   1882:        return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val),
                   1883:            tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3),
                   1884:            retval);
                   1885: }
                   1886:
                   1887: /*
                   1888:  * sys___futex_set_robust_list(l, uap, retval)
                   1889:  *
                   1890:  *     __futex_set_robust_list(2) system call for robust futexes.
                   1891:  */
                   1892: int
                   1893: sys___futex_set_robust_list(struct lwp *l,
                   1894:     const struct sys___futex_set_robust_list_args *uap, register_t *retval)
                   1895: {
                   1896:        /* {
                   1897:                syscallarg(void *) head;
                   1898:                syscallarg(size_t) len;
                   1899:        } */
                   1900:        void *head = SCARG(uap, head);
                   1901:
                   1902:        if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
                   1903:                return EINVAL;
                   1904:        if ((uintptr_t)head % sizeof(u_long))
                   1905:                return EINVAL;
                   1906:
                   1907:        l->l_robust_head = (uintptr_t)head;
                   1908:
                   1909:        return 0;
                   1910: }
                   1911:
                   1912: /*
                   1913:  * sys___futex_get_robust_list(l, uap, retval)
                   1914:  *
                   1915:  *     __futex_get_robust_list(2) system call for robust futexes.
                   1916:  */
                   1917: int
                   1918: sys___futex_get_robust_list(struct lwp *l,
                   1919:     const struct sys___futex_get_robust_list_args *uap, register_t *retval)
                   1920: {
                   1921:        /* {
                   1922:                syscallarg(lwpid_t) lwpid;
                   1923:                syscallarg(void **) headp;
                   1924:                syscallarg(size_t *) lenp;
                   1925:        } */
                   1926:        void *head;
                   1927:        const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
                   1928:        int error;
                   1929:
                   1930:        error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
                   1931:        if (error)
                   1932:                return error;
                   1933:
                   1934:        /* Copy out the head pointer and the head structure length. */
                   1935:        error = copyout(&head, SCARG(uap, headp), sizeof(head));
                   1936:        if (__predict_true(error == 0)) {
                   1937:                error = copyout(&len, SCARG(uap, lenp), sizeof(len));
                   1938:        }
                   1939:
                   1940:        return error;
                   1941: }
                   1942:
                   1943: /*
                    1944:  * release_futex(uptr, tid, is_pi, is_pending)
                    1945:  *
                    1946:  *     Try to release the robust futex at uptr in the current process
                   1947:  *     on lwp exit.  If anything goes wrong, silently fail.  It is the
                   1948:  *     userland program's obligation to arrange correct behaviour.
                   1949:  */
                   1950: static void
                   1951: release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
                   1952:     bool const is_pending)
                   1953: {
                   1954:        int *uaddr;
                   1955:        struct futex *f;
                   1956:        int oldval, newval, actual;
                   1957:        int error;
1.12.4.1  thorpej  1958:        bool shared;
1.1       thorpej  1959:
                   1960:        /* If it's misaligned, tough.  */
                   1961:        if (__predict_false(uptr & 3))
                   1962:                return;
                   1963:        uaddr = (int *)uptr;
                   1964:
                   1965:        error = futex_load(uaddr, &oldval);
                   1966:        if (__predict_false(error))
                   1967:                return;
                   1968:
                   1969:        /*
                   1970:         * There are two race conditions we need to handle here:
                   1971:         *
                   1972:         * 1. User space cleared the futex word but died before
                   1973:         *    being able to issue the wakeup.  No wakeups will
                   1974:         *    ever be issued, oops!
                   1975:         *
                   1976:         * 2. Awakened waiter died before being able to acquire
                   1977:         *    the futex in user space.  Any other waiters are
                   1978:         *    now stuck, oops!
                   1979:         *
                   1980:         * In both of these cases, the futex word will be 0 (because
                   1981:         * it's updated before the wake is issued).  The best we can
                   1982:         * do is detect this situation if it's the pending futex and
                   1983:         * issue a wake without modifying the futex word.
                   1984:         *
                   1985:         * XXX eventual PI handling?
                   1986:         */
                   1987:        if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
                   1988:                register_t retval;
1.12.4.1  thorpej  1989:                error = futex_func_wake(/*shared*/true, uaddr, 1,
1.1       thorpej  1990:                    FUTEX_BITSET_MATCH_ANY, &retval);
1.12.4.1  thorpej  1991:                if (error != 0 || retval == 0)
                   1992:                        (void) futex_func_wake(/*shared*/false, uaddr, 1,
                   1993:                            FUTEX_BITSET_MATCH_ANY, &retval);
1.1       thorpej  1994:                return;
                   1995:        }
                   1996:
                   1997:        /* Optimistically test whether we need to do anything at all.  */
                   1998:        if ((oldval & FUTEX_TID_MASK) != tid)
                   1999:                return;
                   2000:
                   2001:        /*
                   2002:         * We need to handle the case where this thread owned the futex,
                   2003:         * but it was uncontended.  In this case, there won't be any
                   2004:         * kernel state to look up.  All we can do is mark the futex
                   2005:         * as a zombie to be mopped up the next time another thread
                   2006:         * attempts to acquire it.
                   2007:         *
                    2008:         * N.B. It's important to ensure FUTEX_OWNER_DIED gets set in
                    2009:         * this loop, even if waiters appear while we are doing
                    2010:         * so.  This is because FUTEX_WAITERS is set by user space
                   2011:         * before calling __futex() to wait, and the futex needs
                   2012:         * to be marked as a zombie when the new waiter gets into
                   2013:         * the kernel.
                   2014:         */
                   2015:        if ((oldval & FUTEX_WAITERS) == 0) {
                   2016:                do {
                   2017:                        error = futex_load(uaddr, &oldval);
                   2018:                        if (error)
                   2019:                                return;
                   2020:                        if ((oldval & FUTEX_TID_MASK) != tid)
                   2021:                                return;
                   2022:                        newval = oldval | FUTEX_OWNER_DIED;
                   2023:                        error = ucas_int(uaddr, oldval, newval, &actual);
                   2024:                        if (error)
                   2025:                                return;
                   2026:                } while (actual != oldval);
                   2027:
                   2028:                /*
                   2029:                 * If there is still no indication of waiters, then there is
                   2030:                 * no more work for us to do.
                   2031:                 */
                   2032:                if ((oldval & FUTEX_WAITERS) == 0)
                   2033:                        return;
                   2034:        }
                   2035:
                   2036:        /*
1.12.4.1  thorpej  2037:         * Look for a futex.  Try shared first, then private.  If we
                   2038:         * can't find one, tough.
                   2039:         *
                   2040:         * Note: the assumption here is that anyone placing a futex on
                   2041:         * the robust list is adhering to the rules, regardless of the
                   2042:         * futex class.
1.1       thorpej  2043:         */
1.12.4.1  thorpej  2044:        for (f = NULL, shared = true; f == NULL; shared = false) {
                   2045:                error = futex_lookup(uaddr, shared, FUTEX_CLASS_ANY, &f);
                   2046:                if (error)
                   2047:                        return;
                   2048:                if (f == NULL && shared == false)
                   2049:                        return;
                   2050:        }
1.1       thorpej  2051:
1.12.4.1  thorpej  2052:        /* Work under the futex op lock.  */
                   2053:        futex_op_lock(f);
1.1       thorpej  2054:
                   2055:        /*
                   2056:         * Fetch the word: if the tid doesn't match ours, skip;
                   2057:         * otherwise, set the owner-died bit, atomically.
                   2058:         */
                   2059:        do {
                   2060:                error = futex_load(uaddr, &oldval);
                   2061:                if (error)
                   2062:                        goto out;
                   2063:                if ((oldval & FUTEX_TID_MASK) != tid)
                   2064:                        goto out;
                   2065:                newval = oldval | FUTEX_OWNER_DIED;
                   2066:                error = ucas_int(uaddr, oldval, newval, &actual);
                   2067:                if (error)
                   2068:                        goto out;
                   2069:        } while (actual != oldval);
                   2070:
                   2071:        /*
                   2072:         * If there may be waiters, try to wake one.  If anything goes
                   2073:         * wrong, tough.
                   2074:         *
                   2075:         * XXX eventual PI handling?
                   2076:         */
1.12.4.1  thorpej  2077:        if (oldval & FUTEX_WAITERS) {
                   2078:                (void)futex_wake(f, FUTEX_WRITERQ, 1,
                   2079:                                 NULL, FUTEX_WRITERQ, 0,
1.12.4.5! thorpej  2080:                                 FUTEX_BITSET_MATCH_ANY, NULL);
1.12.4.1  thorpej  2081:        }
1.1       thorpej  2082:
                   2083:        /* Unlock the queue and release the futex.  */
1.12.4.1  thorpej  2084: out:   futex_op_unlock(f);
1.5       riastrad 2085:        futex_rele(f);
1.1       thorpej  2086: }
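
For context, the following is a minimal userland-side sketch of the recovery
path that release_futex() enables.  It is illustrative only: robust_trylock()
is a hypothetical helper, not an existing interface, and the flag definitions
are assumed to match the FUTEX_WAITERS / FUTEX_OWNER_DIED / FUTEX_TID_MASK
layout tested above.

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdint.h>

	/* Assumed robust-futex word layout (matching the bits used above). */
	#define FUTEX_WAITERS		0x80000000u
	#define FUTEX_OWNER_DIED	0x40000000u
	#define FUTEX_TID_MASK		0x3fffffffu

	/*
	 * Hypothetical fast path of a robust mutex: try to claim the futex
	 * word; if the previous owner died, the kernel has set
	 * FUTEX_OWNER_DIED, and the caller must recover the protected state.
	 */
	static bool
	robust_trylock(_Atomic uint32_t *futexp, uint32_t mytid, bool *recovered)
	{
		uint32_t old = 0;

		*recovered = false;
		if (atomic_compare_exchange_strong(futexp, &old, mytid))
			return true;		/* uncontended acquire */

		if (old & FUTEX_OWNER_DIED) {
			/* Claim the dead owner's lock, keeping any waiter bit. */
			uint32_t newval = mytid | (old & FUTEX_WAITERS);
			if (atomic_compare_exchange_strong(futexp, &old, newval)) {
				*recovered = true;
				return true;
			}
		}
		return false;	/* contended: caller falls back to a futex wait */
	}

In this sketch the second compare-and-swap is what mops up the zombie futex
described in the comment above; if it fails because other threads raced in,
the caller simply retries or waits as usual.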
                   2087:
                   2088: /*
                   2089:  * futex_robust_head_lookup(l, lwpid)
                   2090:  *
                   2091:  *     Helper function to look up a robust head by LWP ID.
                   2092:  */
                   2093: int
                   2094: futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
                   2095: {
                   2096:        struct proc *p = l->l_proc;
                   2097:
                   2098:        /* Find the other lwp, if requested; otherwise use our robust head.  */
                   2099:        if (lwpid) {
                   2100:                mutex_enter(p->p_lock);
                   2101:                l = lwp_find(p, lwpid);
                   2102:                if (l == NULL) {
                   2103:                        mutex_exit(p->p_lock);
                   2104:                        return ESRCH;
                   2105:                }
                   2106:                *headp = (void *)l->l_robust_head;
                   2107:                mutex_exit(p->p_lock);
                   2108:        } else {
                   2109:                *headp = (void *)l->l_robust_head;
                   2110:        }
                   2111:        return 0;
                   2112: }
                   2113:
                   2114: /*
                   2115:  * futex_fetch_robust_head(uaddr)
                   2116:  *
                   2117:  *     Helper routine to fetch the futex robust list head that
                   2118:  *     handles 32-bit binaries running on 64-bit kernels.
                   2119:  */
                   2120: static int
                   2121: futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
                   2122: {
                   2123: #ifdef _LP64
                   2124:        if (curproc->p_flag & PK_32) {
                   2125:                uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
                   2126:                int error;
                   2127:
                   2128:                error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
                   2129:                if (__predict_true(error == 0)) {
                   2130:                        for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
                   2131:                                if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
                   2132:                                        /*
                   2133:                                         * Make sure the offset is sign-
                   2134:                                         * extended.
                   2135:                                         */
                   2136:                                        rhead[i] = (int32_t)rhead32[i];
                   2137:                                } else {
                   2138:                                        rhead[i] = rhead32[i];
                   2139:                                }
                   2140:                        }
                   2141:                }
                   2142:                return error;
                   2143:        }
                   2144: #endif /* _LP64 */
                   2145:
                   2146:        return copyin((void *)uaddr, rhead,
                   2147:            sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
                   2148: }
                   2149:
                   2150: /*
                   2151:  * futex_decode_robust_word(word)
                   2152:  *
                   2153:  *     Decode a robust futex list word into the entry and entry
                   2154:  *     properties.
                   2155:  */
                   2156: static inline void
                   2157: futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
                   2158:     bool * const is_pi)
                   2159: {
                   2160:        *is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
                   2161:        *entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
                   2162: }
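
As a concrete counterpart to the decode above, a userland encoder would pack
the PI flag into the entry pointer before linking it onto the robust list.
A hypothetical sketch (the bit value is assumed to be the low address bit,
and robust_encode_entry() is not a real interface):

	#include <stdbool.h>
	#include <stdint.h>

	/* Assumption: the PI flag occupies the low bit of the entry address. */
	#define _FUTEX_ROBUST_ENTRY_PI	((uintptr_t)1)

	/* Inverse of futex_decode_robust_word(): tag an entry address. */
	static inline uintptr_t
	robust_encode_entry(uintptr_t entry, bool is_pi)
	{
		return entry | (is_pi ? _FUTEX_ROBUST_ENTRY_PI : 0);
	}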
                   2163:
                   2164: /*
                   2165:  * futex_fetch_robust_entry(uaddr)
                   2166:  *
                   2167:  *     Helper routine to fetch and decode a robust futex entry
                   2168:  *     that handles 32-bit binaries running on 64-bit kernels.
                   2169:  */
                   2170: static int
                   2171: futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
                   2172:     bool * const is_pi)
                   2173: {
                   2174:        uintptr_t val = 0;
                   2175:        int error = 0;
                   2176:
                   2177: #ifdef _LP64
                   2178:        if (curproc->p_flag & PK_32) {
                   2179:                uint32_t val32;
                   2180:
                   2181:                error = ufetch_32((uint32_t *)uaddr, &val32);
                   2182:                if (__predict_true(error == 0))
                   2183:                        val = val32;
                   2184:        } else
                   2185: #endif /* _LP64 */
                   2186:                error = ufetch_long((u_long *)uaddr, (u_long *)&val);
                   2187:        if (__predict_false(error))
                   2188:                return error;
                   2189:
                   2190:        futex_decode_robust_word(val, valp, is_pi);
                   2191:        return 0;
                   2192: }
                   2193:
                   2194: /*
                   2195:  * futex_release_all_lwp(l, tid)
                   2196:  *
                   2197:  *     Release all l's robust futexes.  If anything looks funny in
                   2198:  *     the process, give up -- it's userland's responsibility to dot
                   2199:  *     the i's and cross the t's.
                   2200:  */
                   2201: void
                   2202: futex_release_all_lwp(struct lwp * const l, lwpid_t const tid)
                   2203: {
                   2204:        u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
                   2205:        int limit = 1000000;
                   2206:        int error;
                   2207:
                   2208:        /* If there's no robust list there's nothing to do. */
                   2209:        if (l->l_robust_head == 0)
                   2210:                return;
                   2211:
                   2212:        /* Read the final snapshot of the robust list head. */
                   2213:        error = futex_fetch_robust_head(l->l_robust_head, rhead);
                   2214:        if (error) {
                   2215:                printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
                   2216:                    " unmapped robust futex list head\n",
                   2217:                    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
                   2218:                    (uintmax_t)l->l_lid, (uintmax_t)tid);
                   2219:                return;
                   2220:        }
                   2221:
                   2222:        const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];
                   2223:
                   2224:        uintptr_t next, pending;
                   2225:        bool is_pi, pending_is_pi;
                   2226:
                   2227:        futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
                   2228:            &next, &is_pi);
                   2229:        futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
                   2230:            &pending, &pending_is_pi);
                   2231:
                   2232:        /*
                   2233:         * Walk down the list of locked futexes and release them, up
                   2234:         * to one million of them before we give up.
                   2235:         */
                   2236:
                   2237:        while (next != l->l_robust_head && limit-- > 0) {
                   2238:                /* pending handled below. */
                   2239:                if (next != pending)
                   2240:                        release_futex(next + offset, tid, is_pi, false);
                   2241:                error = futex_fetch_robust_entry(next, &next, &is_pi);
                   2242:                if (error)
                   2243:                        break;
                   2244:                preempt_point();
                   2245:        }
                   2246:        if (limit <= 0) {
                   2247:                printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
                   2248:                    " exhausted robust futex limit\n",
                   2249:                    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
                   2250:                    (uintmax_t)l->l_lid, (uintmax_t)tid);
                   2251:        }
                   2252:
                   2253:        /* If there's a pending futex, it may need to be released too. */
                   2254:        if (pending != 0) {
                   2255:                release_futex(pending + offset, tid, pending_is_pi, true);
                   2256:        }
                   2257: }
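
To make the walk above easier to follow, here is a rough sketch of the
per-thread structures userland is assumed to maintain (field names are
illustrative, not the libpthread implementation): a three-word head matching
what futex_fetch_robust_head() copies in, a circular list of nodes embedded
in each robust mutex, and a "pending" slot naming the entry being locked or
unlocked at the time of death.

	#include <stdint.h>

	/*
	 * Assumed userland view of the robust list head; the three words
	 * correspond to _FUTEX_ROBUST_HEAD_LIST, _FUTEX_ROBUST_HEAD_OFFSET
	 * and _FUTEX_ROBUST_HEAD_PENDING fetched by the kernel above.
	 */
	struct robust_node {
		uintptr_t	next;	/* circular list; ends back at the head */
	};

	struct robust_head {
		struct robust_node	list;		/* per-thread list head */
		long			futex_offset;	/* node address -> futex word */
		uintptr_t		pending;	/* entry being (un)locked */
	};

	/*
	 * A robust mutex embeds a node; its futex word sits futex_offset
	 * bytes away from the node.  The offset may be negative, which is
	 * why futex_fetch_robust_head() sign-extends it for 32-bit binaries.
	 */
	struct robust_mutex {
		uint32_t		futex;	/* 32-bit futex word */
		struct robust_node	node;	/* linked into robust_head.list */
	};

Because the list is circular, the loop above stops when "next" wraps back to
l->l_robust_head.  The pending entry is handled separately because the dying
thread may not have finished linking or unlinking it, which is exactly the
case the final release of the pending futex covers.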
