[BACK]Return to sys_futex.c CVS log [TXT][DIR] Up to [cvs.NetBSD.org] / src / sys / kern

Annotation of src/sys/kern/sys_futex.c, Revision 1.14

1.14    ! andvar      1: /*     $NetBSD: sys_futex.c,v 1.13 2021/09/28 15:05:42 thorpej Exp $   */
1.1       thorpej     2:
                      3: /*-
                      4:  * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
                      5:  * All rights reserved.
                      6:  *
                      7:  * This code is derived from software contributed to The NetBSD Foundation
                      8:  * by Taylor R. Campbell and Jason R. Thorpe.
                      9:  *
                     10:  * Redistribution and use in source and binary forms, with or without
                     11:  * modification, are permitted provided that the following conditions
                     12:  * are met:
                     13:  * 1. Redistributions of source code must retain the above copyright
                     14:  *    notice, this list of conditions and the following disclaimer.
                     15:  * 2. Redistributions in binary form must reproduce the above copyright
                     16:  *    notice, this list of conditions and the following disclaimer in the
                     17:  *    documentation and/or other materials provided with the distribution.
                     18:  *
                     19:  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
                     20:  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
                     21:  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
                     22:  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
                     23:  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     24:  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     25:  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     26:  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     27:  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     28:  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     29:  * POSSIBILITY OF SUCH DAMAGE.
                     30:  */
                     31:
                     32: #include <sys/cdefs.h>
1.14    ! andvar     33: __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.13 2021/09/28 15:05:42 thorpej Exp $");
1.1       thorpej    34:
                     35: /*
                     36:  * Futexes
                     37:  *
                     38:  *     The futex system call coordinates notifying threads waiting for
                     39:  *     changes on a 32-bit word of memory.  The word can be managed by
                     40:  *     CPU atomic operations in userland, without system calls, as long
                     41:  *     as there is no contention.
                     42:  *
                     43:  *     The simplest use case demonstrating the utility is:
                     44:  *
                     45:  *             // 32-bit word of memory shared among threads or
                     46:  *             // processes in userland.  lock & 1 means owned;
                     47:  *             // lock & 2 means there are waiters waiting.
                     48:  *             volatile int lock = 0;
                     49:  *
                     50:  *             int v;
                     51:  *
                     52:  *             // Acquire a lock.
                     53:  *             do {
                     54:  *                     v = lock;
                     55:  *                     if (v & 1) {
                     56:  *                             // Lock is held.  Set a bit to say that
                     57:  *                             // there are waiters, and wait for lock
                     58:  *                             // to change to anything other than v;
                     59:  *                             // then retry.
                     60:  *                             if (atomic_cas_uint(&lock, v, v | 2) != v)
                     61:  *                                     continue;
                     62:  *                             futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
                     63:  *                             continue;
                     64:  *                     }
                     65:  *             } while (atomic_cas_uint(&lock, v, v & ~1) != v);
                     66:  *             membar_enter();
                     67:  *
                     68:  *             ...
                     69:  *
                     70:  *             // Release the lock.  Optimistically assume there are
                     71:  *             // no waiters first until demonstrated otherwise.
                     72:  *             membar_exit();
                     73:  *             if (atomic_cas_uint(&lock, 1, 0) != 1) {
                     74:  *                     // There may be waiters.
                     75:  *                     v = atomic_swap_uint(&lock, 0);
                     76:  *                     // If there are still waiters, wake one.
                     77:  *                     if (v & 2)
                     78:  *                             futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
                     79:  *             }
                     80:  *
                     81:  *     The goal is to avoid the futex system call unless there is
                     82:  *     contention; then if there is contention, to guarantee no missed
                     83:  *     wakeups.
                     84:  *
                     85:  *     For a simple implementation, futex(FUTEX_WAIT) could queue
                     86:  *     itself to be woken, double-check the lock word, and then sleep;
                     87:  *     spurious wakeups are generally a fact of life, so any
                     88:  *     FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
                     89:  *
                     90:  *     If this were all there is to it, we could then increase
                     91:  *     parallelism by refining the approximation: partition the
                     92:  *     waiters into buckets by hashing the lock addresses to reduce
                     93:  *     the incidence of spurious wakeups.  But this is not all.
                     94:  *
                     95:  *     The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
                     96:  *     operation not only wakes n waiters on lock if lock == val, but
                     97:  *     also _transfers_ m additional waiters to lock2.  Unless wakeups
                     98:  *     on lock2 also trigger wakeups on lock, we cannot move waiters
                     99:  *     to lock2 if they merely share the same hash as waiters on lock.
                    100:  *     Thus, we can't approximately distribute waiters into queues by
                    101:  *     a hash function; we must distinguish futex queues exactly by
                    102:  *     lock address.
                    103:  *
                    104:  *     For now, we use a global red/black tree to index futexes.  This
                    105:  *     should be replaced by a lockless radix tree with a thread to
                    106:  *     free entries no longer in use once all lookups on all CPUs have
                    107:  *     completed.
                    108:  *
                    109:  *     Specifically, we maintain two maps:
                    110:  *
                    111:  *     futex_tab.va[vmspace, va] for private futexes
                    112:  *     futex_tab.oa[uvm_voaddr] for shared futexes
                    113:  *
                    114:  *     This implementation does not support priority inheritance.
                    115:  */
                    116:
1.12      skrll     117: #include <sys/param.h>
1.1       thorpej   118: #include <sys/types.h>
                    119: #include <sys/atomic.h>
                    120: #include <sys/condvar.h>
                    121: #include <sys/futex.h>
                    122: #include <sys/mutex.h>
                    123: #include <sys/rbtree.h>
                    124: #include <sys/queue.h>
                    125:
                    126: #include <sys/syscall.h>
                    127: #include <sys/syscallargs.h>
                    128: #include <sys/syscallvar.h>
                    129:
                    130: #include <uvm/uvm_extern.h>
                    131:
                    132: /*
                    133:  * Lock order:
                    134:  *
                    135:  *     futex_tab.lock
                    136:  *     futex::fx_qlock                 ordered by kva of struct futex
                    137:  *      -> futex_wait::fw_lock         only one at a time
                    138:  *     futex_wait::fw_lock             only one at a time
                    139:  *      -> futex::fx_abortlock         only one at a time
                    140:  */
                    141:
                    142: /*
                    143:  * union futex_key
                    144:  *
                    145:  *     A futex is addressed either by a vmspace+va (private) or by
                    146:  *     a uvm_voaddr (shared).
                    147:  */
                    148: union futex_key {
                    149:        struct {
                    150:                struct vmspace                  *vmspace;
                    151:                vaddr_t                         va;
                    152:        }                       fk_private;
                    153:        struct uvm_voaddr       fk_shared;
                    154: };
                    155:
                    156: /*
                    157:  * struct futex
                    158:  *
                    159:  *     Kernel state for a futex located at a particular address in a
                    160:  *     particular virtual address space.
                    161:  *
                    162:  *     N.B. fx_refcnt is an unsigned long because we need to be able
                    163:  *     to operate on it atomically on all systems while at the same
                    164:  *     time rendering practically impossible the chance of it reaching
                    165:  *     its max value.  In practice, we're limited by the number of LWPs
                    166:  *     that can be present on the system at any given time, and the
                    167:  *     assumption is that limit will be good enough on a 32-bit platform.
                    168:  *     See futex_wake() for why overflow needs to be avoided.
                    169:  */
                    170: struct futex {
                    171:        union futex_key         fx_key;
                    172:        unsigned long           fx_refcnt;
                    173:        bool                    fx_shared;
                    174:        bool                    fx_on_tree;
                    175:        struct rb_node          fx_node;
                    176:
                    177:        kmutex_t                        fx_qlock;
                    178:        TAILQ_HEAD(, futex_wait)        fx_queue;
                    179:
                    180:        kmutex_t                        fx_abortlock;
                    181:        LIST_HEAD(, futex_wait)         fx_abortlist;
                    182:        kcondvar_t                      fx_abortcv;
                    183: };
                    184:
                    185: /*
                    186:  * struct futex_wait
                    187:  *
                    188:  *     State for a thread to wait on a futex.  Threads wait on fw_cv
                    189:  *     for fw_bitset to be set to zero.  The thread may transition to
                    190:  *     a different futex queue at any time under the futex's lock.
                    191:  */
                    192: struct futex_wait {
                    193:        kmutex_t                fw_lock;
                    194:        kcondvar_t              fw_cv;
                    195:        struct futex            *fw_futex;
                    196:        TAILQ_ENTRY(futex_wait) fw_entry;       /* queue lock */
                    197:        LIST_ENTRY(futex_wait)  fw_abort;       /* queue abortlock */
                    198:        int                     fw_bitset;
1.4       riastrad  199:        bool                    fw_aborting;    /* fw_lock */
1.1       thorpej   200: };
                    201:
                    202: /*
                    203:  * futex_tab
                    204:  *
                    205:  *     Global trees of futexes by vmspace/va and VM object address.
                    206:  *
                    207:  *     XXX This obviously doesn't scale in parallel.  We could use a
                    208:  *     pserialize-safe data structure, but there may be a high cost to
                    209:  *     frequent deletion since we don't cache futexes after we're done
                    210:  *     with them.  We could use hashed locks.  But for now, just make
                    211:  *     sure userland can't DoS the serial performance, by using a
                    212:  *     balanced binary tree for lookup.
                    213:  *
                    214:  *     XXX We could use a per-process tree for the table indexed by
                    215:  *     virtual address to reduce contention between processes.
                    216:  */
                    217: static struct {
                    218:        kmutex_t        lock;
                    219:        struct rb_tree  va;
                    220:        struct rb_tree  oa;
                    221: } futex_tab __cacheline_aligned;
                    222:
                    223: static int
                    224: compare_futex_key(void *cookie, const void *n, const void *k)
                    225: {
                    226:        const struct futex *fa = n;
                    227:        const union futex_key *fka = &fa->fx_key;
                    228:        const union futex_key *fkb = k;
                    229:
                    230:        if ((uintptr_t)fka->fk_private.vmspace <
                    231:            (uintptr_t)fkb->fk_private.vmspace)
                    232:                return -1;
                    233:        if ((uintptr_t)fka->fk_private.vmspace >
                    234:            (uintptr_t)fkb->fk_private.vmspace)
                    235:                return +1;
                    236:        if (fka->fk_private.va < fkb->fk_private.va)
                    237:                return -1;
                    238:        if (fka->fk_private.va > fkb->fk_private.va)
                    239:                return -1;
                    240:        return 0;
                    241: }
                    242:
                    243: static int
                    244: compare_futex(void *cookie, const void *na, const void *nb)
                    245: {
                    246:        const struct futex *fa = na;
                    247:        const struct futex *fb = nb;
                    248:
                    249:        return compare_futex_key(cookie, fa, &fb->fx_key);
                    250: }
                    251:
                    252: static const rb_tree_ops_t futex_rb_ops = {
                    253:        .rbto_compare_nodes = compare_futex,
                    254:        .rbto_compare_key = compare_futex_key,
                    255:        .rbto_node_offset = offsetof(struct futex, fx_node),
                    256: };
                    257:
                    258: static int
                    259: compare_futex_shared_key(void *cookie, const void *n, const void *k)
                    260: {
                    261:        const struct futex *fa = n;
                    262:        const union futex_key *fka = &fa->fx_key;
                    263:        const union futex_key *fkb = k;
                    264:
                    265:        return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
                    266: }
                    267:
                    268: static int
                    269: compare_futex_shared(void *cookie, const void *na, const void *nb)
                    270: {
                    271:        const struct futex *fa = na;
                    272:        const struct futex *fb = nb;
                    273:
                    274:        return compare_futex_shared_key(cookie, fa, &fb->fx_key);
                    275: }
                    276:
                    277: static const rb_tree_ops_t futex_shared_rb_ops = {
                    278:        .rbto_compare_nodes = compare_futex_shared,
                    279:        .rbto_compare_key = compare_futex_shared_key,
                    280:        .rbto_node_offset = offsetof(struct futex, fx_node),
                    281: };
                    282:
                    283: static void    futex_wait_dequeue(struct futex_wait *, struct futex *);
                    284:
                    285: /*
                    286:  * futex_load(uaddr, kaddr)
                    287:  *
                    288:  *     Perform a single atomic load to read *uaddr, and return the
                    289:  *     result in *kaddr.  Return 0 on success, EFAULT if uaddr is not
                    290:  *     mapped.
                    291:  */
                    292: static inline int
                    293: futex_load(int *uaddr, int *kaddr)
                    294: {
                    295:        return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
                    296: }
                    297:
                    298: /*
                    299:  * futex_test(uaddr, expected)
                    300:  *
                    301:  *     True if *uaddr == expected.  False if *uaddr != expected, or if
                    302:  *     uaddr is not mapped.
                    303:  */
                    304: static bool
                    305: futex_test(int *uaddr, int expected)
                    306: {
                    307:        int val;
                    308:        int error;
                    309:
                    310:        error = futex_load(uaddr, &val);
                    311:        if (error)
                    312:                return false;
                    313:        return val == expected;
                    314: }
                    315:
                    316: /*
                    317:  * futex_sys_init()
                    318:  *
                    319:  *     Initialize the futex subsystem.
                    320:  */
                    321: void
                    322: futex_sys_init(void)
                    323: {
                    324:
                    325:        mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
                    326:        rb_tree_init(&futex_tab.va, &futex_rb_ops);
                    327:        rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
                    328: }
                    329:
                    330: /*
                    331:  * futex_sys_fini()
                    332:  *
                    333:  *     Finalize the futex subsystem.
                    334:  */
                    335: void
                    336: futex_sys_fini(void)
                    337: {
                    338:
                    339:        KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
                    340:        KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
                    341:        mutex_destroy(&futex_tab.lock);
                    342: }
                    343:
                    344: /*
                    345:  * futex_queue_init(f)
                    346:  *
                    347:  *     Initialize the futex queue.  Caller must call futex_queue_fini
                    348:  *     when done.
                    349:  *
                    350:  *     Never sleeps.
                    351:  */
                    352: static void
                    353: futex_queue_init(struct futex *f)
                    354: {
                    355:
                    356:        mutex_init(&f->fx_qlock, MUTEX_DEFAULT, IPL_NONE);
                    357:        mutex_init(&f->fx_abortlock, MUTEX_DEFAULT, IPL_NONE);
                    358:        cv_init(&f->fx_abortcv, "fqabort");
                    359:        LIST_INIT(&f->fx_abortlist);
                    360:        TAILQ_INIT(&f->fx_queue);
                    361: }
                    362:
                    363: /*
                    364:  * futex_queue_drain(f)
                    365:  *
                    366:  *     Wait for any aborting waiters in f; then empty the queue of
                    367:  *     any stragglers and wake them.  Caller must guarantee no new
                    368:  *     references to f.
                    369:  *
                    370:  *     May sleep.
                    371:  */
                    372: static void
                    373: futex_queue_drain(struct futex *f)
                    374: {
                    375:        struct futex_wait *fw, *fw_next;
                    376:
                    377:        mutex_enter(&f->fx_abortlock);
                    378:        while (!LIST_EMPTY(&f->fx_abortlist))
                    379:                cv_wait(&f->fx_abortcv, &f->fx_abortlock);
                    380:        mutex_exit(&f->fx_abortlock);
                    381:
                    382:        mutex_enter(&f->fx_qlock);
                    383:        TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
                    384:                mutex_enter(&fw->fw_lock);
                    385:                futex_wait_dequeue(fw, f);
                    386:                cv_broadcast(&fw->fw_cv);
                    387:                mutex_exit(&fw->fw_lock);
                    388:        }
                    389:        mutex_exit(&f->fx_qlock);
                    390: }
                    391:
                    392: /*
                    393:  * futex_queue_fini(fq)
                    394:  *
                    395:  *     Finalize the futex queue initialized by futex_queue_init.  Queue
                    396:  *     must be empty.  Caller must not use f again until a subsequent
                    397:  *     futex_queue_init.
                    398:  */
                    399: static void
                    400: futex_queue_fini(struct futex *f)
                    401: {
                    402:
                    403:        KASSERT(TAILQ_EMPTY(&f->fx_queue));
                    404:        KASSERT(LIST_EMPTY(&f->fx_abortlist));
                    405:        mutex_destroy(&f->fx_qlock);
                    406:        mutex_destroy(&f->fx_abortlock);
                    407:        cv_destroy(&f->fx_abortcv);
                    408: }
                    409:
                    410: /*
                    411:  * futex_key_init(key, vm, va, shared)
                    412:  *
                    413:  *     Initialize a futex key for lookup, etc.
                    414:  */
                    415: static int
                    416: futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared)
                    417: {
                    418:        int error = 0;
                    419:
                    420:        if (__predict_false(shared)) {
                    421:                if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
                    422:                        error = EFAULT;
                    423:        } else {
                    424:                fk->fk_private.vmspace = vm;
                    425:                fk->fk_private.va = va;
                    426:        }
                    427:
                    428:        return error;
                    429: }
                    430:
                    431: /*
                    432:  * futex_key_fini(key, shared)
                    433:  *
                    434:  *     Release a futex key.
                    435:  */
                    436: static void
                    437: futex_key_fini(union futex_key *fk, bool shared)
                    438: {
                    439:        if (__predict_false(shared))
                    440:                uvm_voaddr_release(&fk->fk_shared);
                    441:        memset(fk, 0, sizeof(*fk));
                    442: }
                    443:
                    444: /*
                    445:  * futex_create(fk, shared)
                    446:  *
                    447:  *     Create a futex.  Initial reference count is 1, representing the
                    448:  *     caller.  Returns NULL on failure.  Always takes ownership of the
                    449:  *     key, either transferring it to the newly-created futex, or releasing
                    450:  *     the key if creation fails.
                    451:  *
                    452:  *     Never sleeps for memory, but may sleep to acquire a lock.
                    453:  */
                    454: static struct futex *
                    455: futex_create(union futex_key *fk, bool shared)
                    456: {
                    457:        struct futex *f;
                    458:
                    459:        f = kmem_alloc(sizeof(*f), KM_NOSLEEP);
                    460:        if (f == NULL) {
                    461:                futex_key_fini(fk, shared);
                    462:                return NULL;
                    463:        }
                    464:        f->fx_key = *fk;
                    465:        f->fx_refcnt = 1;
                    466:        f->fx_shared = shared;
                    467:        f->fx_on_tree = false;
                    468:        futex_queue_init(f);
                    469:
                    470:        return f;
                    471: }
                    472:
                    473: /*
                    474:  * futex_destroy(f)
                    475:  *
                    476:  *     Destroy a futex created with futex_create.  Reference count
                    477:  *     must be zero.
                    478:  *
                    479:  *     May sleep.
                    480:  */
                    481: static void
                    482: futex_destroy(struct futex *f)
                    483: {
                    484:
                    485:        ASSERT_SLEEPABLE();
                    486:
                    487:        KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
                    488:        KASSERT(!f->fx_on_tree);
                    489:
                    490:        /* Drain and destroy the private queue.  */
                    491:        futex_queue_drain(f);
                    492:        futex_queue_fini(f);
                    493:
                    494:        futex_key_fini(&f->fx_key, f->fx_shared);
                    495:
                    496:        kmem_free(f, sizeof(*f));
                    497: }
                    498:
                    499: /*
                    500:  * futex_hold(f)
                    501:  *
                    502:  *     Attempt to acquire a reference to f.  Return 0 on success,
                    503:  *     ENFILE on too many references.
                    504:  *
                    505:  *     Never sleeps.
                    506:  */
                    507: static int
                    508: futex_hold(struct futex *f)
                    509: {
                    510:        unsigned long refcnt;
                    511:
                    512:        do {
                    513:                refcnt = atomic_load_relaxed(&f->fx_refcnt);
                    514:                if (refcnt == ULONG_MAX)
                    515:                        return ENFILE;
                    516:        } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt + 1) != refcnt);
                    517:
                    518:        return 0;
                    519: }
                    520:
                    521: /*
                    522:  * futex_rele(f)
                    523:  *
                    524:  *     Release a reference to f acquired with futex_create or
                    525:  *     futex_hold.
                    526:  *
                    527:  *     May sleep to free f.
                    528:  */
                    529: static void
                    530: futex_rele(struct futex *f)
                    531: {
                    532:        unsigned long refcnt;
                    533:
                    534:        ASSERT_SLEEPABLE();
                    535:
                    536:        do {
                    537:                refcnt = atomic_load_relaxed(&f->fx_refcnt);
                    538:                if (refcnt == 1)
                    539:                        goto trylast;
                    540:        } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt);
                    541:        return;
                    542:
                    543: trylast:
                    544:        mutex_enter(&futex_tab.lock);
                    545:        if (atomic_dec_ulong_nv(&f->fx_refcnt) == 0) {
                    546:                if (f->fx_on_tree) {
                    547:                        if (__predict_false(f->fx_shared))
                    548:                                rb_tree_remove_node(&futex_tab.oa, f);
                    549:                        else
                    550:                                rb_tree_remove_node(&futex_tab.va, f);
                    551:                        f->fx_on_tree = false;
                    552:                }
                    553:        } else {
                    554:                /* References remain -- don't destroy it.  */
                    555:                f = NULL;
                    556:        }
                    557:        mutex_exit(&futex_tab.lock);
                    558:        if (f != NULL)
                    559:                futex_destroy(f);
                    560: }
                    561:
                    562: /*
                    563:  * futex_rele_not_last(f)
                    564:  *
                    565:  *     Release a reference to f acquired with futex_create or
                    566:  *     futex_hold.
                    567:  *
                    568:  *     This version asserts that we are not dropping the last
                    569:  *     reference to f.
                    570:  */
                    571: static void
                    572: futex_rele_not_last(struct futex *f)
                    573: {
                    574:        unsigned long refcnt;
                    575:
                    576:        do {
                    577:                refcnt = atomic_load_relaxed(&f->fx_refcnt);
                    578:                KASSERT(refcnt > 1);
                    579:        } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt);
                    580: }
                    581:
                    582: /*
                    583:  * futex_lookup_by_key(key, shared, &f)
                    584:  *
                    585:  *     Try to find an existing futex va reference in the specified key
                    586:  *     On success, return 0, set f to found futex or to NULL if not found,
                    587:  *     and increment f's reference count if found.
                    588:  *
                    589:  *     Return ENFILE if reference count too high.
                    590:  *
                    591:  *     Internal lookup routine shared by futex_lookup() and
1.5       riastrad  592:  *     futex_lookup_create().
1.1       thorpej   593:  */
                    594: static int
                    595: futex_lookup_by_key(union futex_key *fk, bool shared, struct futex **fp)
                    596: {
                    597:        struct futex *f;
                    598:        int error = 0;
                    599:
                    600:        mutex_enter(&futex_tab.lock);
                    601:        if (__predict_false(shared)) {
                    602:                f = rb_tree_find_node(&futex_tab.oa, fk);
                    603:        } else {
                    604:                f = rb_tree_find_node(&futex_tab.va, fk);
                    605:        }
                    606:        if (f) {
                    607:                error = futex_hold(f);
                    608:                if (error)
                    609:                        f = NULL;
                    610:        }
                    611:        *fp = f;
                    612:        mutex_exit(&futex_tab.lock);
                    613:
                    614:        return error;
                    615: }
                    616:
                    617: /*
                    618:  * futex_insert(f, fp)
                    619:  *
                    620:  *     Try to insert the futex f into the tree by va.  If there
                    621:  *     already is a futex for its va, acquire a reference to it, and
                    622:  *     store it in *fp; otherwise store f in *fp.
                    623:  *
                    624:  *     Return 0 on success, ENFILE if there already is a futex but its
                    625:  *     reference count is too high.
                    626:  */
                    627: static int
                    628: futex_insert(struct futex *f, struct futex **fp)
                    629: {
                    630:        struct futex *f0;
                    631:        int error;
                    632:
                    633:        KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
                    634:        KASSERT(!f->fx_on_tree);
                    635:
                    636:        mutex_enter(&futex_tab.lock);
                    637:        if (__predict_false(f->fx_shared))
                    638:                f0 = rb_tree_insert_node(&futex_tab.oa, f);
                    639:        else
                    640:                f0 = rb_tree_insert_node(&futex_tab.va, f);
                    641:        if (f0 == f) {
                    642:                f->fx_on_tree = true;
                    643:                error = 0;
                    644:        } else {
                    645:                KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
                    646:                KASSERT(f0->fx_on_tree);
                    647:                error = futex_hold(f0);
                    648:                if (error)
                    649:                        goto out;
                    650:        }
                    651:        *fp = f0;
                    652: out:   mutex_exit(&futex_tab.lock);
                    653:
                    654:        return error;
                    655: }
                    656:
                    657: /*
                    658:  * futex_lookup(uaddr, shared, &f)
                    659:  *
                    660:  *     Find a futex at the userland pointer uaddr in the current
                    661:  *     process's VM space.  On success, return the futex in f and
                    662:  *     increment its reference count.
                    663:  *
1.5       riastrad  664:  *     Caller must call futex_rele when done.
1.1       thorpej   665:  */
                    666: static int
                    667: futex_lookup(int *uaddr, bool shared, struct futex **fp)
                    668: {
                    669:        union futex_key fk;
                    670:        struct vmspace *vm = curproc->p_vmspace;
                    671:        vaddr_t va = (vaddr_t)uaddr;
                    672:        int error;
                    673:
                    674:        /*
                    675:         * Reject unaligned user pointers so we don't cross page
                    676:         * boundaries and so atomics will work.
                    677:         */
                    678:        if ((va & 3) != 0)
                    679:                return EINVAL;
                    680:
                    681:        /* Look it up. */
                    682:        error = futex_key_init(&fk, vm, va, shared);
                    683:        if (error)
                    684:                return error;
                    685:
                    686:        error = futex_lookup_by_key(&fk, shared, fp);
                    687:        futex_key_fini(&fk, shared);
                    688:        if (error)
                    689:                return error;
                    690:
                    691:        KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
                    692:        KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
                    693:
                    694:        /*
                    695:         * Success!  (Caller must still check whether we found
                    696:         * anything, but nothing went _wrong_ like trying to use
                    697:         * unmapped memory.)
                    698:         */
                    699:        KASSERT(error == 0);
                    700:
                    701:        return error;
                    702: }
                    703:
                    704: /*
1.5       riastrad  705:  * futex_lookup_create(uaddr, shared, &f)
1.1       thorpej   706:  *
                    707:  *     Find or create a futex at the userland pointer uaddr in the
                    708:  *     current process's VM space.  On success, return the futex in f
                    709:  *     and increment its reference count.
                    710:  *
1.5       riastrad  711:  *     Caller must call futex_rele when done.
1.1       thorpej   712:  */
                    713: static int
1.5       riastrad  714: futex_lookup_create(int *uaddr, bool shared, struct futex **fp)
1.1       thorpej   715: {
                    716:        union futex_key fk;
                    717:        struct vmspace *vm = curproc->p_vmspace;
                    718:        struct futex *f = NULL;
                    719:        vaddr_t va = (vaddr_t)uaddr;
                    720:        int error;
                    721:
                    722:        /*
                    723:         * Reject unaligned user pointers so we don't cross page
                    724:         * boundaries and so atomics will work.
                    725:         */
                    726:        if ((va & 3) != 0)
                    727:                return EINVAL;
                    728:
                    729:        error = futex_key_init(&fk, vm, va, shared);
                    730:        if (error)
                    731:                return error;
                    732:
                    733:        /*
                    734:         * Optimistically assume there already is one, and try to find
                    735:         * it.
                    736:         */
                    737:        error = futex_lookup_by_key(&fk, shared, fp);
                    738:        if (error || *fp != NULL) {
                    739:                /*
                    740:                 * We either found one, or there was an error.
                    741:                 * In either case, we are done with the key.
                    742:                 */
                    743:                futex_key_fini(&fk, shared);
                    744:                goto out;
                    745:        }
                    746:
                    747:        /*
1.14    ! andvar    748:         * Create a futex record.  This transfers ownership of the key
1.1       thorpej   749:         * in all cases.
                    750:         */
                    751:        f = futex_create(&fk, shared);
                    752:        if (f == NULL) {
                    753:                error = ENOMEM;
                    754:                goto out;
                    755:        }
                    756:
                    757:        /*
                    758:         * Insert our new futex, or use existing if someone else beat
                    759:         * us to it.
                    760:         */
                    761:        error = futex_insert(f, fp);
                    762:        if (error)
                    763:                goto out;
                    764:        if (*fp == f)
                    765:                f = NULL;       /* don't release on exit */
                    766:
                    767:        /* Success!  */
                    768:        KASSERT(error == 0);
                    769:
                    770: out:   if (f != NULL)
                    771:                futex_rele(f);
                    772:        KASSERT(error || *fp != NULL);
                    773:        KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
                    774:        return error;
                    775: }
                    776:
                    777: /*
                    778:  * futex_wait_init(fw, bitset)
                    779:  *
                    780:  *     Initialize a record for a thread to wait on a futex matching
                    781:  *     the specified bit set.  Should be passed to futex_wait_enqueue
                    782:  *     before futex_wait, and should be passed to futex_wait_fini when
                    783:  *     done.
                    784:  */
                    785: static void
                    786: futex_wait_init(struct futex_wait *fw, int bitset)
                    787: {
                    788:
1.6       riastrad  789:        KASSERT(bitset);
                    790:
1.1       thorpej   791:        mutex_init(&fw->fw_lock, MUTEX_DEFAULT, IPL_NONE);
                    792:        cv_init(&fw->fw_cv, "futex");
                    793:        fw->fw_futex = NULL;
                    794:        fw->fw_bitset = bitset;
1.4       riastrad  795:        fw->fw_aborting = false;
1.1       thorpej   796: }
                    797:
                    798: /*
                    799:  * futex_wait_fini(fw)
                    800:  *
                    801:  *     Finalize a record for a futex waiter.  Must not be on any
                    802:  *     futex's queue.
                    803:  */
                    804: static void
                    805: futex_wait_fini(struct futex_wait *fw)
                    806: {
                    807:
1.6       riastrad  808:        KASSERT(fw->fw_futex == NULL);
                    809:
1.1       thorpej   810:        cv_destroy(&fw->fw_cv);
                    811:        mutex_destroy(&fw->fw_lock);
                    812: }
                    813:
                    814: /*
                    815:  * futex_wait_enqueue(fw, f)
                    816:  *
                    817:  *     Put fw on the futex queue.  Must be done before futex_wait.
                    818:  *     Caller must hold fw's lock and f's lock, and fw must not be on
                    819:  *     any existing futex's waiter list.
                    820:  */
                    821: static void
                    822: futex_wait_enqueue(struct futex_wait *fw, struct futex *f)
                    823: {
                    824:
                    825:        KASSERT(mutex_owned(&f->fx_qlock));
                    826:        KASSERT(mutex_owned(&fw->fw_lock));
                    827:        KASSERT(fw->fw_futex == NULL);
1.4       riastrad  828:        KASSERT(!fw->fw_aborting);
1.1       thorpej   829:
                    830:        fw->fw_futex = f;
                    831:        TAILQ_INSERT_TAIL(&f->fx_queue, fw, fw_entry);
                    832: }
                    833:
                    834: /*
                    835:  * futex_wait_dequeue(fw, f)
                    836:  *
                    837:  *     Remove fw from the futex queue.  Precludes subsequent
                    838:  *     futex_wait until a futex_wait_enqueue.  Caller must hold fw's
                    839:  *     lock and f's lock, and fw must be on f.
                    840:  */
                    841: static void
                    842: futex_wait_dequeue(struct futex_wait *fw, struct futex *f)
                    843: {
                    844:
                    845:        KASSERT(mutex_owned(&f->fx_qlock));
                    846:        KASSERT(mutex_owned(&fw->fw_lock));
                    847:        KASSERT(fw->fw_futex == f);
                    848:
                    849:        TAILQ_REMOVE(&f->fx_queue, fw, fw_entry);
                    850:        fw->fw_futex = NULL;
                    851: }
                    852:
                    853: /*
                    854:  * futex_wait_abort(fw)
                    855:  *
                    856:  *     Caller is no longer waiting for fw.  Remove it from any queue
1.4       riastrad  857:  *     if it was on one.  Caller must hold fw->fw_lock.
1.1       thorpej   858:  */
                    859: static void
                    860: futex_wait_abort(struct futex_wait *fw)
                    861: {
                    862:        struct futex *f;
                    863:
1.4       riastrad  864:        KASSERT(mutex_owned(&fw->fw_lock));
1.1       thorpej   865:
                    866:        /*
                    867:         * Grab the futex queue.  It can't go away as long as we hold
                    868:         * fw_lock.  However, we can't take the queue lock because
                    869:         * that's a lock order reversal.
                    870:         */
                    871:        f = fw->fw_futex;
                    872:
                    873:        /* Put us on the abort list so that fq won't go away.  */
                    874:        mutex_enter(&f->fx_abortlock);
                    875:        LIST_INSERT_HEAD(&f->fx_abortlist, fw, fw_abort);
                    876:        mutex_exit(&f->fx_abortlock);
                    877:
1.4       riastrad  878:        /*
                    879:         * Mark fw as aborting so it won't lose wakeups and won't be
                    880:         * transferred to any other queue.
                    881:         */
                    882:        fw->fw_aborting = true;
                    883:
1.1       thorpej   884:        /* f is now stable, so we can release fw_lock.  */
                    885:        mutex_exit(&fw->fw_lock);
                    886:
                    887:        /* Now we can remove fw under the queue lock.  */
                    888:        mutex_enter(&f->fx_qlock);
1.4       riastrad  889:        mutex_enter(&fw->fw_lock);
                    890:        futex_wait_dequeue(fw, f);
                    891:        mutex_exit(&fw->fw_lock);
1.1       thorpej   892:        mutex_exit(&f->fx_qlock);
                    893:
                    894:        /*
                    895:         * Finally, remove us from the abort list and notify anyone
                    896:         * waiting for the abort to complete if we were the last to go.
                    897:         */
                    898:        mutex_enter(&f->fx_abortlock);
                    899:        LIST_REMOVE(fw, fw_abort);
                    900:        if (LIST_EMPTY(&f->fx_abortlist))
                    901:                cv_broadcast(&f->fx_abortcv);
                    902:        mutex_exit(&f->fx_abortlock);
1.4       riastrad  903:
                    904:        /*
                    905:         * Release our reference to the futex now that we are not
                    906:         * waiting for it.
                    907:         */
                    908:        futex_rele(f);
                    909:
                    910:        /*
                    911:         * Reacquire the fw lock as caller expects.  Verify that we're
                    912:         * aborting and no longer associated with a futex.
                    913:         */
                    914:        mutex_enter(&fw->fw_lock);
                    915:        KASSERT(fw->fw_aborting);
                    916:        KASSERT(fw->fw_futex == NULL);
1.1       thorpej   917: }
                    918:
                    919: /*
1.11      riastrad  920:  * futex_wait(fw, deadline, clkid)
1.1       thorpej   921:  *
1.11      riastrad  922:  *     fw must be a waiter on a futex's queue.  Wait until deadline on
                    923:  *     the clock clkid, or forever if deadline is NULL, for a futex
                    924:  *     wakeup.  Return 0 on explicit wakeup or destruction of futex,
                    925:  *     ETIMEDOUT on timeout, EINTR/ERESTART on signal.  Either way, fw
                    926:  *     will no longer be on a futex queue on return.
1.1       thorpej   927:  */
                    928: static int
1.11      riastrad  929: futex_wait(struct futex_wait *fw, const struct timespec *deadline,
                    930:     clockid_t clkid)
1.1       thorpej   931: {
                    932:        int error = 0;
                    933:
                    934:        /* Test and wait under the wait lock.  */
                    935:        mutex_enter(&fw->fw_lock);
1.4       riastrad  936:
                    937:        for (;;) {
1.11      riastrad  938:                /* If we're done yet, stop and report success.  */
1.4       riastrad  939:                if (fw->fw_bitset == 0 || fw->fw_futex == NULL) {
                    940:                        error = 0;
                    941:                        break;
                    942:                }
                    943:
                    944:                /* If anything went wrong in the last iteration, stop.  */
                    945:                if (error)
                    946:                        break;
                    947:
1.1       thorpej   948:                /* Not done yet.  Wait.  */
1.11      riastrad  949:                if (deadline) {
                    950:                        struct timespec ts;
                    951:
                    952:                        /* Check our watch.  */
                    953:                        error = clock_gettime1(clkid, &ts);
                    954:                        if (error)
                    955:                                break;
                    956:
                    957:                        /* If we're past the deadline, ETIMEDOUT.  */
                    958:                        if (timespeccmp(deadline, &ts, <=)) {
                    959:                                error = ETIMEDOUT;
                    960:                                break;
                    961:                        }
                    962:
                    963:                        /* Count how much time is left.  */
                    964:                        timespecsub(deadline, &ts, &ts);
                    965:
                    966:                        /* Wait for that much time, allowing signals.  */
                    967:                        error = cv_timedwait_sig(&fw->fw_cv, &fw->fw_lock,
                    968:                            tstohz(&ts));
                    969:                } else {
                    970:                        /* Wait indefinitely, allowing signals. */
                    971:                        error = cv_wait_sig(&fw->fw_cv, &fw->fw_lock);
                    972:                }
1.1       thorpej   973:        }
1.4       riastrad  974:
                    975:        /*
                    976:         * If we were woken up, the waker will have removed fw from the
                    977:         * queue.  But if anything went wrong, we must remove fw from
                    978:         * the queue ourselves.  While here, convert EWOULDBLOCK to
1.10      riastrad  979:         * ETIMEDOUT.
1.4       riastrad  980:         */
                    981:        if (error) {
                    982:                futex_wait_abort(fw);
                    983:                if (error == EWOULDBLOCK)
                    984:                        error = ETIMEDOUT;
                    985:        }
                    986:
1.1       thorpej   987:        mutex_exit(&fw->fw_lock);
                    988:
                    989:        return error;
                    990: }
                    991:
                    992: /*
                    993:  * futex_wake(f, nwake, f2, nrequeue, bitset)
                    994:  *
                    995:  *     Wake up to nwake waiters on f matching bitset; then, if f2 is
                    996:  *     provided, move up to nrequeue remaining waiters on f matching
                    997:  *     bitset to f2.  Return the number of waiters actually woken.
                    998:  *     Caller must hold the locks of f and f2, if provided.
                    999:  */
                   1000: static unsigned
                   1001: futex_wake(struct futex *f, unsigned nwake, struct futex *f2,
                   1002:     unsigned nrequeue, int bitset)
                   1003: {
                   1004:        struct futex_wait *fw, *fw_next;
                   1005:        unsigned nwoken = 0;
1.2       mlelstv  1006:        int hold_error __diagused;
1.1       thorpej  1007:
                   1008:        KASSERT(mutex_owned(&f->fx_qlock));
                   1009:        KASSERT(f2 == NULL || mutex_owned(&f2->fx_qlock));
                   1010:
                   1011:        /* Wake up to nwake waiters, and count the number woken.  */
                   1012:        TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
                   1013:                if ((fw->fw_bitset & bitset) == 0)
                   1014:                        continue;
1.4       riastrad 1015:                if (nwake > 0) {
1.1       thorpej  1016:                        mutex_enter(&fw->fw_lock);
1.4       riastrad 1017:                        if (__predict_false(fw->fw_aborting)) {
                   1018:                                mutex_exit(&fw->fw_lock);
                   1019:                                continue;
                   1020:                        }
1.1       thorpej  1021:                        futex_wait_dequeue(fw, f);
                   1022:                        fw->fw_bitset = 0;
                   1023:                        cv_broadcast(&fw->fw_cv);
                   1024:                        mutex_exit(&fw->fw_lock);
1.4       riastrad 1025:                        nwake--;
1.1       thorpej  1026:                        nwoken++;
                   1027:                        /*
                   1028:                         * Drop the futex reference on behalf of the
                   1029:                         * waiter.  We assert this is not the last
                   1030:                         * reference on the futex (our caller should
                   1031:                         * also have one).
                   1032:                         */
                   1033:                        futex_rele_not_last(f);
                   1034:                } else {
                   1035:                        break;
                   1036:                }
                   1037:        }
                   1038:
                   1039:        if (f2) {
                   1040:                /* Move up to nrequeue waiters from f's queue to f2's queue. */
                   1041:                TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
                   1042:                        if ((fw->fw_bitset & bitset) == 0)
                   1043:                                continue;
1.4       riastrad 1044:                        if (nrequeue > 0) {
1.1       thorpej  1045:                                mutex_enter(&fw->fw_lock);
1.4       riastrad 1046:                                if (__predict_false(fw->fw_aborting)) {
                   1047:                                        mutex_exit(&fw->fw_lock);
                   1048:                                        continue;
                   1049:                                }
1.1       thorpej  1050:                                futex_wait_dequeue(fw, f);
                   1051:                                futex_wait_enqueue(fw, f2);
                   1052:                                mutex_exit(&fw->fw_lock);
1.4       riastrad 1053:                                nrequeue--;
1.1       thorpej  1054:                                /*
                   1055:                                 * Transfer the reference from f to f2.
                   1056:                                 * As above, we assert that we are not
                   1057:                                 * dropping the last reference to f here.
                   1058:                                 *
                   1059:                                 * XXX futex_hold() could theoretically
                   1060:                                 * XXX fail here.
                   1061:                                 */
                   1062:                                futex_rele_not_last(f);
                   1063:                                hold_error = futex_hold(f2);
                   1064:                                KASSERT(hold_error == 0);
                   1065:                        } else {
                   1066:                                break;
                   1067:                        }
                   1068:                }
                   1069:        } else {
                   1070:                KASSERT(nrequeue == 0);
                   1071:        }
                   1072:
                   1073:        /* Return the number of waiters woken.  */
                   1074:        return nwoken;
                   1075: }
                   1076:
                   1077: /*
                   1078:  * futex_queue_lock(f)
                   1079:  *
                   1080:  *     Acquire the queue lock of f.  Pair with futex_queue_unlock.  Do
                   1081:  *     not use if caller needs to acquire two locks; use
                   1082:  *     futex_queue_lock2 instead.
                   1083:  */
                   1084: static void
                   1085: futex_queue_lock(struct futex *f)
                   1086: {
                   1087:        mutex_enter(&f->fx_qlock);
                   1088: }
                   1089:
                   1090: /*
                   1091:  * futex_queue_unlock(f)
                   1092:  *
                   1093:  *     Release the queue lock of f.
                   1094:  */
                   1095: static void
                   1096: futex_queue_unlock(struct futex *f)
                   1097: {
                   1098:        mutex_exit(&f->fx_qlock);
                   1099: }
                   1100:
                   1101: /*
                   1102:  * futex_queue_lock2(f, f2)
                   1103:  *
                   1104:  *     Acquire the queue locks of both f and f2, which may be null, or
                   1105:  *     which may have the same underlying queue.  If they are
                   1106:  *     distinct, an arbitrary total order is chosen on the locks.
                   1107:  *
                   1108:  *     Callers should only ever acquire multiple queue locks
                   1109:  *     simultaneously using futex_queue_lock2.
                   1110:  */
                   1111: static void
                   1112: futex_queue_lock2(struct futex *f, struct futex *f2)
                   1113: {
                   1114:
                   1115:        /*
                   1116:         * If both are null, do nothing; if one is null and the other
                   1117:         * is not, lock the other and be done with it.
                   1118:         */
                   1119:        if (f == NULL && f2 == NULL) {
                   1120:                return;
                   1121:        } else if (f == NULL) {
                   1122:                mutex_enter(&f2->fx_qlock);
                   1123:                return;
                   1124:        } else if (f2 == NULL) {
                   1125:                mutex_enter(&f->fx_qlock);
                   1126:                return;
                   1127:        }
                   1128:
                   1129:        /* If both futexes are the same, acquire only one. */
                   1130:        if (f == f2) {
                   1131:                mutex_enter(&f->fx_qlock);
                   1132:                return;
                   1133:        }
                   1134:
                   1135:        /* Otherwise, use the ordering on the kva of the futex pointer.  */
                   1136:        if ((uintptr_t)f < (uintptr_t)f2) {
                   1137:                mutex_enter(&f->fx_qlock);
                   1138:                mutex_enter(&f2->fx_qlock);
                   1139:        } else {
                   1140:                mutex_enter(&f2->fx_qlock);
                   1141:                mutex_enter(&f->fx_qlock);
                   1142:        }
                   1143: }
                   1144:
                   1145: /*
                   1146:  * futex_queue_unlock2(f, f2)
                   1147:  *
                   1148:  *     Release the queue locks of both f and f2, which may be null, or
                   1149:  *     which may have the same underlying queue.
                   1150:  */
                   1151: static void
                   1152: futex_queue_unlock2(struct futex *f, struct futex *f2)
                   1153: {
                   1154:
                   1155:        /*
                   1156:         * If both are null, do nothing; if one is null and the other
                   1157:         * is not, unlock the other and be done with it.
                   1158:         */
                   1159:        if (f == NULL && f2 == NULL) {
                   1160:                return;
                   1161:        } else if (f == NULL) {
                   1162:                mutex_exit(&f2->fx_qlock);
                   1163:                return;
                   1164:        } else if (f2 == NULL) {
                   1165:                mutex_exit(&f->fx_qlock);
                   1166:                return;
                   1167:        }
                   1168:
                   1169:        /* If both futexes are the same, release only one. */
                   1170:        if (f == f2) {
                   1171:                mutex_exit(&f->fx_qlock);
                   1172:                return;
                   1173:        }
                   1174:
                   1175:        /* Otherwise, use the ordering on the kva of the futex pointer.  */
                   1176:        if ((uintptr_t)f < (uintptr_t)f2) {
                   1177:                mutex_exit(&f2->fx_qlock);
                   1178:                mutex_exit(&f->fx_qlock);
                   1179:        } else {
                   1180:                mutex_exit(&f->fx_qlock);
                   1181:                mutex_exit(&f2->fx_qlock);
                   1182:        }
                   1183: }
                   1184:
                   1185: /*
                   1186:  * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
                   1187:  *
                   1188:  *     Implement futex(FUTEX_WAIT).
                   1189:  */
                   1190: static int
                   1191: futex_func_wait(bool shared, int *uaddr, int val, int val3,
1.11      riastrad 1192:     const struct timespec *timeout, clockid_t clkid, int clkflags,
1.1       thorpej  1193:     register_t *retval)
                   1194: {
                   1195:        struct futex *f;
                   1196:        struct futex_wait wait, *fw = &wait;
1.11      riastrad 1197:        struct timespec ts;
                   1198:        const struct timespec *deadline;
1.1       thorpej  1199:        int error;
                   1200:
1.6       riastrad 1201:        /*
                   1202:         * If there's nothing to wait for, and nobody will ever wake
                   1203:         * us, then don't set anything up to wait -- just stop here.
                   1204:         */
                   1205:        if (val3 == 0)
1.7       riastrad 1206:                return EINVAL;
1.6       riastrad 1207:
1.1       thorpej  1208:        /* Optimistically test before anything else.  */
                   1209:        if (!futex_test(uaddr, val))
                   1210:                return EAGAIN;
                   1211:
1.11      riastrad 1212:        /* Determine a deadline on the specified clock.  */
                   1213:        if (timeout == NULL || (clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
                   1214:                deadline = timeout;
                   1215:        } else {
                   1216:                error = clock_gettime1(clkid, &ts);
                   1217:                if (error)
                   1218:                        return error;
                   1219:                timespecadd(&ts, timeout, &ts);
                   1220:                deadline = &ts;
                   1221:        }
                   1222:
1.1       thorpej  1223:        /* Get the futex, creating it if necessary.  */
1.5       riastrad 1224:        error = futex_lookup_create(uaddr, shared, &f);
1.1       thorpej  1225:        if (error)
                   1226:                return error;
                   1227:        KASSERT(f);
                   1228:
                   1229:        /* Get ready to wait.  */
                   1230:        futex_wait_init(fw, val3);
                   1231:
                   1232:        /*
                   1233:         * Under the queue lock, check the value again: if it has
                   1234:         * already changed, EAGAIN; otherwise enqueue the waiter.
                   1235:         * Since FUTEX_WAKE will use the same lock and be done after
                   1236:         * modifying the value, the order in which we check and enqueue
                   1237:         * is immaterial.
                   1238:         */
                   1239:        futex_queue_lock(f);
                   1240:        if (!futex_test(uaddr, val)) {
                   1241:                futex_queue_unlock(f);
                   1242:                error = EAGAIN;
                   1243:                goto out;
                   1244:        }
                   1245:        mutex_enter(&fw->fw_lock);
                   1246:        futex_wait_enqueue(fw, f);
                   1247:        mutex_exit(&fw->fw_lock);
                   1248:        futex_queue_unlock(f);
                   1249:
                   1250:        /*
                   1251:         * We cannot drop our reference to the futex here, because
                   1252:         * we might be enqueued on a different one when we are awakened.
                   1253:         * The references will be managed on our behalf in the requeue
                   1254:         * and wake cases.
                   1255:         */
                   1256:        f = NULL;
                   1257:
                   1258:        /* Wait. */
1.11      riastrad 1259:        error = futex_wait(fw, deadline, clkid);
1.4       riastrad 1260:        if (error)
1.1       thorpej  1261:                goto out;
                   1262:
                   1263:        /* Return 0 on success, error on failure. */
                   1264:        *retval = 0;
                   1265:
                   1266: out:   if (f != NULL)
1.5       riastrad 1267:                futex_rele(f);
1.1       thorpej  1268:        futex_wait_fini(fw);
                   1269:        return error;
                   1270: }
                   1271:
                   1272: /*
                   1273:  * futex_func_wake(uaddr, val, val3, retval)
                   1274:  *
                   1275:  *     Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
                   1276:  */
                   1277: static int
                   1278: futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval)
                   1279: {
                   1280:        struct futex *f;
                   1281:        unsigned int nwoken = 0;
                   1282:        int error = 0;
                   1283:
                   1284:        /* Reject negative number of wakeups.  */
                   1285:        if (val < 0) {
                   1286:                error = EINVAL;
                   1287:                goto out;
                   1288:        }
                   1289:
                   1290:        /* Look up the futex, if any.  */
                   1291:        error = futex_lookup(uaddr, shared, &f);
                   1292:        if (error)
                   1293:                goto out;
                   1294:
                   1295:        /* If there's no futex, there are no waiters to wake.  */
                   1296:        if (f == NULL)
                   1297:                goto out;
                   1298:
                   1299:        /*
                   1300:         * Under f's queue lock, wake the waiters and remember the
                   1301:         * number woken.
                   1302:         */
                   1303:        futex_queue_lock(f);
                   1304:        nwoken = futex_wake(f, val, NULL, 0, val3);
                   1305:        futex_queue_unlock(f);
                   1306:
                   1307:        /* Release the futex.  */
1.5       riastrad 1308:        futex_rele(f);
1.1       thorpej  1309:
                   1310: out:
                   1311:        /* Return the number of waiters woken.  */
                   1312:        *retval = nwoken;
                   1313:
                   1314:        /* Success!  */
                   1315:        return error;
                   1316: }
                   1317:
                   1318: /*
                   1319:  * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval)
                   1320:  *
                   1321:  *     Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE).
                   1322:  */
                   1323: static int
                   1324: futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2,
                   1325:     int val2, int val3, register_t *retval)
                   1326: {
                   1327:        struct futex *f = NULL, *f2 = NULL;
                   1328:        unsigned nwoken = 0;    /* default to zero woken on early return */
                   1329:        int error;
                   1330:
                   1331:        /* Reject negative number of wakeups or requeues. */
                   1332:        if (val < 0 || val2 < 0) {
                   1333:                error = EINVAL;
                   1334:                goto out;
                   1335:        }
                   1336:
                   1337:        /* Look up the source futex, if any. */
                   1338:        error = futex_lookup(uaddr, shared, &f);
                   1339:        if (error)
                   1340:                goto out;
                   1341:
                   1342:        /* If there is none, nothing to do. */
                   1343:        if (f == NULL)
                   1344:                goto out;
                   1345:
                   1346:        /*
                   1347:         * We may need to create the destination futex because it's
                   1348:         * entirely possible it does not currently have any waiters.
                   1349:         */
1.5       riastrad 1350:        error = futex_lookup_create(uaddr2, shared, &f2);
1.1       thorpej  1351:        if (error)
                   1352:                goto out;
                   1353:
                   1354:        /*
                   1355:         * Under the futexes' queue locks, check the value; if
                   1356:         * unchanged from val3, wake the waiters.
                   1357:         */
                   1358:        futex_queue_lock2(f, f2);
                   1359:        if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
                   1360:                error = EAGAIN;
                   1361:        } else {
                   1362:                error = 0;
                   1363:                nwoken = futex_wake(f, val, f2, val2, FUTEX_BITSET_MATCH_ANY);
                   1364:        }
                   1365:        futex_queue_unlock2(f, f2);
                   1366:
                   1367: out:
                   1368:        /* Return the number of waiters woken.  */
                   1369:        *retval = nwoken;
                   1370:
                   1371:        /* Release the futexes if we got them.  */
                   1372:        if (f2)
1.5       riastrad 1373:                futex_rele(f2);
1.1       thorpej  1374:        if (f)
1.5       riastrad 1375:                futex_rele(f);
1.1       thorpej  1376:        return error;
                   1377: }
                   1378:
                   1379: /*
                   1380:  * futex_validate_op_cmp(val3)
                   1381:  *
                   1382:  *     Validate an op/cmp argument for FUTEX_WAKE_OP.
                   1383:  */
                   1384: static int
                   1385: futex_validate_op_cmp(int val3)
                   1386: {
                   1387:        int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
                   1388:        int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
                   1389:
                   1390:        if (op & FUTEX_OP_OPARG_SHIFT) {
                   1391:                int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
                   1392:                if (oparg < 0)
                   1393:                        return EINVAL;
                   1394:                if (oparg >= 32)
                   1395:                        return EINVAL;
                   1396:                op &= ~FUTEX_OP_OPARG_SHIFT;
                   1397:        }
                   1398:
                   1399:        switch (op) {
                   1400:        case FUTEX_OP_SET:
                   1401:        case FUTEX_OP_ADD:
                   1402:        case FUTEX_OP_OR:
                   1403:        case FUTEX_OP_ANDN:
                   1404:        case FUTEX_OP_XOR:
                   1405:                break;
                   1406:        default:
                   1407:                return EINVAL;
                   1408:        }
                   1409:
                   1410:        switch (cmp) {
                   1411:        case FUTEX_OP_CMP_EQ:
                   1412:        case FUTEX_OP_CMP_NE:
                   1413:        case FUTEX_OP_CMP_LT:
                   1414:        case FUTEX_OP_CMP_LE:
                   1415:        case FUTEX_OP_CMP_GT:
                   1416:        case FUTEX_OP_CMP_GE:
                   1417:                break;
                   1418:        default:
                   1419:                return EINVAL;
                   1420:        }
                   1421:
                   1422:        return 0;
                   1423: }
                   1424:
                   1425: /*
                   1426:  * futex_compute_op(oldval, val3)
                   1427:  *
                   1428:  *     Apply a FUTEX_WAIT_OP operation to oldval.
                   1429:  */
                   1430: static int
                   1431: futex_compute_op(int oldval, int val3)
                   1432: {
                   1433:        int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
                   1434:        int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
                   1435:
                   1436:        if (op & FUTEX_OP_OPARG_SHIFT) {
                   1437:                KASSERT(oparg >= 0);
                   1438:                KASSERT(oparg < 32);
                   1439:                oparg = 1u << oparg;
                   1440:                op &= ~FUTEX_OP_OPARG_SHIFT;
                   1441:        }
                   1442:
                   1443:        switch (op) {
                   1444:        case FUTEX_OP_SET:
                   1445:                return oparg;
                   1446:
                   1447:        case FUTEX_OP_ADD:
                   1448:                /*
                   1449:                 * Avoid signed arithmetic overflow by doing
                   1450:                 * arithmetic unsigned and converting back to signed
                   1451:                 * at the end.
                   1452:                 */
                   1453:                return (int)((unsigned)oldval + (unsigned)oparg);
                   1454:
                   1455:        case FUTEX_OP_OR:
                   1456:                return oldval | oparg;
                   1457:
                   1458:        case FUTEX_OP_ANDN:
                   1459:                return oldval & ~oparg;
                   1460:
                   1461:        case FUTEX_OP_XOR:
                   1462:                return oldval ^ oparg;
                   1463:
                   1464:        default:
                   1465:                panic("invalid futex op");
                   1466:        }
                   1467: }
                   1468:
                   1469: /*
                   1470:  * futex_compute_cmp(oldval, val3)
                   1471:  *
                   1472:  *     Apply a FUTEX_WAIT_OP comparison to oldval.
                   1473:  */
                   1474: static bool
                   1475: futex_compute_cmp(int oldval, int val3)
                   1476: {
                   1477:        int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
                   1478:        int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);
                   1479:
                   1480:        switch (cmp) {
                   1481:        case FUTEX_OP_CMP_EQ:
                   1482:                return (oldval == cmparg);
                   1483:
                   1484:        case FUTEX_OP_CMP_NE:
                   1485:                return (oldval != cmparg);
                   1486:
                   1487:        case FUTEX_OP_CMP_LT:
                   1488:                return (oldval < cmparg);
                   1489:
                   1490:        case FUTEX_OP_CMP_LE:
                   1491:                return (oldval <= cmparg);
                   1492:
                   1493:        case FUTEX_OP_CMP_GT:
                   1494:                return (oldval > cmparg);
                   1495:
                   1496:        case FUTEX_OP_CMP_GE:
                   1497:                return (oldval >= cmparg);
                   1498:
                   1499:        default:
                   1500:                panic("invalid futex cmp operation");
                   1501:        }
                   1502: }
                   1503:
                   1504: /*
                   1505:  * futex_func_wake_op(uaddr, val, uaddr2, val2, val3, retval)
                   1506:  *
                   1507:  *     Implement futex(FUTEX_WAKE_OP).
                   1508:  */
                   1509: static int
                   1510: futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
                   1511:     int val3, register_t *retval)
                   1512: {
                   1513:        struct futex *f = NULL, *f2 = NULL;
                   1514:        int oldval, newval, actual;
                   1515:        unsigned nwoken = 0;
                   1516:        int error;
                   1517:
                   1518:        /* Reject negative number of wakeups.  */
                   1519:        if (val < 0 || val2 < 0) {
                   1520:                error = EINVAL;
                   1521:                goto out;
                   1522:        }
                   1523:
                   1524:        /* Reject invalid operations before we start doing things.  */
                   1525:        if ((error = futex_validate_op_cmp(val3)) != 0)
                   1526:                goto out;
                   1527:
                   1528:        /* Look up the first futex, if any.  */
                   1529:        error = futex_lookup(uaddr, shared, &f);
                   1530:        if (error)
                   1531:                goto out;
                   1532:
                   1533:        /* Look up the second futex, if any.  */
                   1534:        error = futex_lookup(uaddr2, shared, &f2);
                   1535:        if (error)
                   1536:                goto out;
                   1537:
                   1538:        /*
                   1539:         * Under the queue locks:
                   1540:         *
                   1541:         * 1. Read/modify/write: *uaddr2 op= oparg.
                   1542:         * 2. Unconditionally wake uaddr.
                   1543:         * 3. Conditionally wake uaddr2, if it previously matched val2.
                   1544:         */
                   1545:        futex_queue_lock2(f, f2);
                   1546:        do {
                   1547:                error = futex_load(uaddr2, &oldval);
                   1548:                if (error)
                   1549:                        goto out_unlock;
                   1550:                newval = futex_compute_op(oldval, val3);
                   1551:                error = ucas_int(uaddr2, oldval, newval, &actual);
                   1552:                if (error)
                   1553:                        goto out_unlock;
                   1554:        } while (actual != oldval);
                   1555:        nwoken = (f ? futex_wake(f, val, NULL, 0, FUTEX_BITSET_MATCH_ANY) : 0);
                   1556:        if (f2 && futex_compute_cmp(oldval, val3))
                   1557:                nwoken += futex_wake(f2, val2, NULL, 0,
                   1558:                    FUTEX_BITSET_MATCH_ANY);
                   1559:
                   1560:        /* Success! */
                   1561:        error = 0;
                   1562: out_unlock:
                   1563:        futex_queue_unlock2(f, f2);
                   1564:
                   1565: out:
                   1566:        /* Return the number of waiters woken. */
                   1567:        *retval = nwoken;
                   1568:
                   1569:        /* Release the futexes, if we got them. */
                   1570:        if (f2)
1.5       riastrad 1571:                futex_rele(f2);
1.1       thorpej  1572:        if (f)
1.5       riastrad 1573:                futex_rele(f);
1.1       thorpej  1574:        return error;
                   1575: }
                   1576:
                   1577: /*
                   1578:  * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
                   1579:  *
                   1580:  *     Implement the futex system call with all the parameters
                   1581:  *     parsed out.
                   1582:  */
                   1583: int
1.11      riastrad 1584: do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
                   1585:     int *uaddr2, int val2, int val3, register_t *retval)
1.1       thorpej  1586: {
                   1587:        const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
                   1588:        const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
                   1589:                                                            : CLOCK_MONOTONIC;
                   1590:
                   1591:        op &= FUTEX_CMD_MASK;
                   1592:
                   1593:        switch (op) {
                   1594:        case FUTEX_WAIT:
                   1595:                return futex_func_wait(shared, uaddr, val,
                   1596:                    FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
                   1597:                    retval);
                   1598:
                   1599:        case FUTEX_WAKE:
                   1600:                val3 = FUTEX_BITSET_MATCH_ANY;
                   1601:                /* FALLTHROUGH */
                   1602:        case FUTEX_WAKE_BITSET:
                   1603:                return futex_func_wake(shared, uaddr, val, val3, retval);
                   1604:
                   1605:        case FUTEX_REQUEUE:
                   1606:        case FUTEX_CMP_REQUEUE:
                   1607:                return futex_func_requeue(shared, op, uaddr, val, uaddr2,
                   1608:                    val2, val3, retval);
                   1609:
                   1610:        case FUTEX_WAIT_BITSET:
                   1611:                return futex_func_wait(shared, uaddr, val, val3, timeout,
                   1612:                    clkid, TIMER_ABSTIME, retval);
                   1613:
                   1614:        case FUTEX_WAKE_OP:
                   1615:                return futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
                   1616:                    val3, retval);
                   1617:
                   1618:        case FUTEX_FD:
                   1619:        default:
                   1620:                return ENOSYS;
                   1621:        }
                   1622: }
                   1623:
                   1624: /*
                   1625:  * sys___futex(l, uap, retval)
                   1626:  *
                   1627:  *     __futex(2) system call: generic futex operations.
                   1628:  */
                   1629: int
                   1630: sys___futex(struct lwp *l, const struct sys___futex_args *uap,
                   1631:     register_t *retval)
                   1632: {
                   1633:        /* {
                   1634:                syscallarg(int *) uaddr;
                   1635:                syscallarg(int) op;
                   1636:                syscallarg(int) val;
                   1637:                syscallarg(const struct timespec *) timeout;
                   1638:                syscallarg(int *) uaddr2;
                   1639:                syscallarg(int) val2;
                   1640:                syscallarg(int) val3;
                   1641:        } */
                   1642:        struct timespec ts, *tsp;
                   1643:        int error;
                   1644:
                   1645:        /*
                   1646:         * Copy in the timeout argument, if specified.
                   1647:         */
                   1648:        if (SCARG(uap, timeout)) {
                   1649:                error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
                   1650:                if (error)
                   1651:                        return error;
                   1652:                tsp = &ts;
                   1653:        } else {
                   1654:                tsp = NULL;
                   1655:        }
                   1656:
                   1657:        return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val),
                   1658:            tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3),
                   1659:            retval);
                   1660: }
                   1661:
                   1662: /*
                   1663:  * sys___futex_set_robust_list(l, uap, retval)
                   1664:  *
                   1665:  *     __futex_set_robust_list(2) system call for robust futexes.
                   1666:  */
                   1667: int
                   1668: sys___futex_set_robust_list(struct lwp *l,
                   1669:     const struct sys___futex_set_robust_list_args *uap, register_t *retval)
                   1670: {
                   1671:        /* {
                   1672:                syscallarg(void *) head;
                   1673:                syscallarg(size_t) len;
                   1674:        } */
                   1675:        void *head = SCARG(uap, head);
                   1676:
                   1677:        if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
                   1678:                return EINVAL;
                   1679:        if ((uintptr_t)head % sizeof(u_long))
                   1680:                return EINVAL;
                   1681:
                   1682:        l->l_robust_head = (uintptr_t)head;
                   1683:
                   1684:        return 0;
                   1685: }
                   1686:
                   1687: /*
                   1688:  * sys___futex_get_robust_list(l, uap, retval)
                   1689:  *
                   1690:  *     __futex_get_robust_list(2) system call for robust futexes.
                   1691:  */
                   1692: int
                   1693: sys___futex_get_robust_list(struct lwp *l,
                   1694:     const struct sys___futex_get_robust_list_args *uap, register_t *retval)
                   1695: {
                   1696:        /* {
                   1697:                syscallarg(lwpid_t) lwpid;
                   1698:                syscallarg(void **) headp;
                   1699:                syscallarg(size_t *) lenp;
                   1700:        } */
                   1701:        void *head;
                   1702:        const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
                   1703:        int error;
                   1704:
                   1705:        error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
                   1706:        if (error)
                   1707:                return error;
                   1708:
                   1709:        /* Copy out the head pointer and the head structure length. */
                   1710:        error = copyout(&head, SCARG(uap, headp), sizeof(head));
                   1711:        if (__predict_true(error == 0)) {
                   1712:                error = copyout(&len, SCARG(uap, lenp), sizeof(len));
                   1713:        }
                   1714:
                   1715:        return error;
                   1716: }
                   1717:
                   1718: /*
                   1719:  * release_futex(uva, tid)
                   1720:  *
                   1721:  *     Try to release the robust futex at uva in the current process
                   1722:  *     on lwp exit.  If anything goes wrong, silently fail.  It is the
                   1723:  *     userland program's obligation to arrange correct behaviour.
                   1724:  */
                   1725: static void
                   1726: release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
                   1727:     bool const is_pending)
                   1728: {
                   1729:        int *uaddr;
                   1730:        struct futex *f;
                   1731:        int oldval, newval, actual;
                   1732:        int error;
                   1733:
                   1734:        /* If it's misaligned, tough.  */
                   1735:        if (__predict_false(uptr & 3))
                   1736:                return;
                   1737:        uaddr = (int *)uptr;
                   1738:
                   1739:        error = futex_load(uaddr, &oldval);
                   1740:        if (__predict_false(error))
                   1741:                return;
                   1742:
                   1743:        /*
                   1744:         * There are two race conditions we need to handle here:
                   1745:         *
                   1746:         * 1. User space cleared the futex word but died before
                   1747:         *    being able to issue the wakeup.  No wakeups will
                   1748:         *    ever be issued, oops!
                   1749:         *
                   1750:         * 2. Awakened waiter died before being able to acquire
                   1751:         *    the futex in user space.  Any other waiters are
                   1752:         *    now stuck, oops!
                   1753:         *
                   1754:         * In both of these cases, the futex word will be 0 (because
                   1755:         * it's updated before the wake is issued).  The best we can
                   1756:         * do is detect this situation if it's the pending futex and
                   1757:         * issue a wake without modifying the futex word.
                   1758:         *
                   1759:         * XXX eventual PI handling?
                   1760:         */
                   1761:        if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
                   1762:                register_t retval;
                   1763:                (void) futex_func_wake(/*shared*/true, uaddr, 1,
                   1764:                    FUTEX_BITSET_MATCH_ANY, &retval);
                   1765:                return;
                   1766:        }
                   1767:
                   1768:        /* Optimistically test whether we need to do anything at all.  */
                   1769:        if ((oldval & FUTEX_TID_MASK) != tid)
                   1770:                return;
                   1771:
                   1772:        /*
                   1773:         * We need to handle the case where this thread owned the futex,
                   1774:         * but it was uncontended.  In this case, there won't be any
                   1775:         * kernel state to look up.  All we can do is mark the futex
                   1776:         * as a zombie to be mopped up the next time another thread
                   1777:         * attempts to acquire it.
                   1778:         *
                   1779:         * N.B. It's important to ensure to set FUTEX_OWNER_DIED in
                   1780:         * this loop, even if waiters appear while we're are doing
                   1781:         * so.  This is beause FUTEX_WAITERS is set by user space
                   1782:         * before calling __futex() to wait, and the futex needs
                   1783:         * to be marked as a zombie when the new waiter gets into
                   1784:         * the kernel.
                   1785:         */
                   1786:        if ((oldval & FUTEX_WAITERS) == 0) {
                   1787:                do {
                   1788:                        error = futex_load(uaddr, &oldval);
                   1789:                        if (error)
                   1790:                                return;
                   1791:                        if ((oldval & FUTEX_TID_MASK) != tid)
                   1792:                                return;
                   1793:                        newval = oldval | FUTEX_OWNER_DIED;
                   1794:                        error = ucas_int(uaddr, oldval, newval, &actual);
                   1795:                        if (error)
                   1796:                                return;
                   1797:                } while (actual != oldval);
                   1798:
                   1799:                /*
                   1800:                 * If where is still no indication of waiters, then there is
                   1801:                 * no more work for us to do.
                   1802:                 */
                   1803:                if ((oldval & FUTEX_WAITERS) == 0)
                   1804:                        return;
                   1805:        }
                   1806:
                   1807:        /*
                   1808:         * Look for a shared futex since we have no positive indication
                   1809:         * it is private.  If we can't, tough.
                   1810:         */
                   1811:        error = futex_lookup(uaddr, /*shared*/true, &f);
                   1812:        if (error)
                   1813:                return;
                   1814:
                   1815:        /*
                   1816:         * If there's no kernel state for this futex, there's nothing to
                   1817:         * release.
                   1818:         */
                   1819:        if (f == NULL)
                   1820:                return;
                   1821:
                   1822:        /* Work under the futex queue lock.  */
                   1823:        futex_queue_lock(f);
                   1824:
                   1825:        /*
                   1826:         * Fetch the word: if the tid doesn't match ours, skip;
                   1827:         * otherwise, set the owner-died bit, atomically.
                   1828:         */
                   1829:        do {
                   1830:                error = futex_load(uaddr, &oldval);
                   1831:                if (error)
                   1832:                        goto out;
                   1833:                if ((oldval & FUTEX_TID_MASK) != tid)
                   1834:                        goto out;
                   1835:                newval = oldval | FUTEX_OWNER_DIED;
                   1836:                error = ucas_int(uaddr, oldval, newval, &actual);
                   1837:                if (error)
                   1838:                        goto out;
                   1839:        } while (actual != oldval);
                   1840:
                   1841:        /*
                   1842:         * If there may be waiters, try to wake one.  If anything goes
                   1843:         * wrong, tough.
                   1844:         *
                   1845:         * XXX eventual PI handling?
                   1846:         */
                   1847:        if (oldval & FUTEX_WAITERS)
                   1848:                (void)futex_wake(f, 1, NULL, 0, FUTEX_BITSET_MATCH_ANY);
                   1849:
                   1850:        /* Unlock the queue and release the futex.  */
                   1851: out:   futex_queue_unlock(f);
1.5       riastrad 1852:        futex_rele(f);
1.1       thorpej  1853: }
                   1854:
                   1855: /*
                   1856:  * futex_robust_head_lookup(l, lwpid)
                   1857:  *
                   1858:  *     Helper function to look up a robust head by LWP ID.
                   1859:  */
                   1860: int
                   1861: futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
                   1862: {
                   1863:        struct proc *p = l->l_proc;
                   1864:
                   1865:        /* Find the other lwp, if requested; otherwise use our robust head.  */
                   1866:        if (lwpid) {
                   1867:                mutex_enter(p->p_lock);
                   1868:                l = lwp_find(p, lwpid);
                   1869:                if (l == NULL) {
                   1870:                        mutex_exit(p->p_lock);
                   1871:                        return ESRCH;
                   1872:                }
                   1873:                *headp = (void *)l->l_robust_head;
                   1874:                mutex_exit(p->p_lock);
                   1875:        } else {
                   1876:                *headp = (void *)l->l_robust_head;
                   1877:        }
                   1878:        return 0;
                   1879: }
                   1880:
                   1881: /*
                   1882:  * futex_fetch_robust_head(uaddr)
                   1883:  *
                   1884:  *     Helper routine to fetch the futex robust list head that
                   1885:  *     handles 32-bit binaries running on 64-bit kernels.
                   1886:  */
                   1887: static int
                   1888: futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
                   1889: {
                   1890: #ifdef _LP64
                   1891:        if (curproc->p_flag & PK_32) {
                   1892:                uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
                   1893:                int error;
                   1894:
                   1895:                error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
                   1896:                if (__predict_true(error == 0)) {
                   1897:                        for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
                   1898:                                if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
                   1899:                                        /*
                   1900:                                         * Make sure the offset is sign-
                   1901:                                         * extended.
                   1902:                                         */
                   1903:                                        rhead[i] = (int32_t)rhead32[i];
                   1904:                                } else {
                   1905:                                        rhead[i] = rhead32[i];
                   1906:                                }
                   1907:                        }
                   1908:                }
                   1909:                return error;
                   1910:        }
                   1911: #endif /* _L64 */
                   1912:
                   1913:        return copyin((void *)uaddr, rhead,
                   1914:            sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
                   1915: }
                   1916:
                   1917: /*
                   1918:  * futex_decode_robust_word(word)
                   1919:  *
                   1920:  *     Decode a robust futex list word into the entry and entry
                   1921:  *     properties.
                   1922:  */
                   1923: static inline void
                   1924: futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
                   1925:     bool * const is_pi)
                   1926: {
                   1927:        *is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
                   1928:        *entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
                   1929: }
                   1930:
                   1931: /*
                   1932:  * futex_fetch_robust_entry(uaddr)
                   1933:  *
                   1934:  *     Helper routine to fetch and decode a robust futex entry
                   1935:  *     that handles 32-bit binaries running on 64-bit kernels.
                   1936:  */
                   1937: static int
                   1938: futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
                   1939:     bool * const is_pi)
                   1940: {
                   1941:        uintptr_t val = 0;
                   1942:        int error = 0;
                   1943:
                   1944: #ifdef _LP64
                   1945:        if (curproc->p_flag & PK_32) {
                   1946:                uint32_t val32;
                   1947:
                   1948:                error = ufetch_32((uint32_t *)uaddr, &val32);
                   1949:                if (__predict_true(error == 0))
                   1950:                        val = val32;
                   1951:        } else
                   1952: #endif /* _LP64 */
                   1953:                error = ufetch_long((u_long *)uaddr, (u_long *)&val);
                   1954:        if (__predict_false(error))
                   1955:                return error;
                   1956:
                   1957:        futex_decode_robust_word(val, valp, is_pi);
                   1958:        return 0;
                   1959: }
                   1960:
                   1961: /*
                   1962:  * futex_release_all_lwp(l, tid)
                   1963:  *
                   1964:  *     Release all l's robust futexes.  If anything looks funny in
                   1965:  *     the process, give up -- it's userland's responsibility to dot
                   1966:  *     the i's and cross the t's.
                   1967:  */
                   1968: void
1.13      thorpej  1969: futex_release_all_lwp(struct lwp * const l)
1.1       thorpej  1970: {
                   1971:        u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
                   1972:        int limit = 1000000;
                   1973:        int error;
                   1974:
                   1975:        /* If there's no robust list there's nothing to do. */
                   1976:        if (l->l_robust_head == 0)
                   1977:                return;
                   1978:
1.13      thorpej  1979:        KASSERT((l->l_lid & FUTEX_TID_MASK) == l->l_lid);
                   1980:
1.1       thorpej  1981:        /* Read the final snapshot of the robust list head. */
                   1982:        error = futex_fetch_robust_head(l->l_robust_head, rhead);
                   1983:        if (error) {
1.13      thorpej  1984:                printf("WARNING: pid %jd (%s) lwp %jd:"
1.1       thorpej  1985:                    " unmapped robust futex list head\n",
                   1986:                    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
1.13      thorpej  1987:                    (uintmax_t)l->l_lid);
1.1       thorpej  1988:                return;
                   1989:        }
                   1990:
                   1991:        const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];
                   1992:
                   1993:        uintptr_t next, pending;
                   1994:        bool is_pi, pending_is_pi;
                   1995:
                   1996:        futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
                   1997:            &next, &is_pi);
                   1998:        futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
                   1999:            &pending, &pending_is_pi);
                   2000:
                   2001:        /*
                   2002:         * Walk down the list of locked futexes and release them, up
                   2003:         * to one million of them before we give up.
                   2004:         */
                   2005:
                   2006:        while (next != l->l_robust_head && limit-- > 0) {
                   2007:                /* pending handled below. */
                   2008:                if (next != pending)
1.13      thorpej  2009:                        release_futex(next + offset, l->l_lid, is_pi, false);
1.1       thorpej  2010:                error = futex_fetch_robust_entry(next, &next, &is_pi);
                   2011:                if (error)
                   2012:                        break;
                   2013:                preempt_point();
                   2014:        }
                   2015:        if (limit <= 0) {
1.13      thorpej  2016:                printf("WARNING: pid %jd (%s) lwp %jd:"
1.1       thorpej  2017:                    " exhausted robust futex limit\n",
                   2018:                    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
1.13      thorpej  2019:                    (uintmax_t)l->l_lid);
1.1       thorpej  2020:        }
                   2021:
                   2022:        /* If there's a pending futex, it may need to be released too. */
                   2023:        if (pending != 0) {
1.13      thorpej  2024:                release_futex(pending + offset, l->l_lid, pending_is_pi, true);
1.1       thorpej  2025:        }
                   2026: }

CVSweb <webmaster@jp.NetBSD.org>