Annotation of src/sys/kern/sys_futex.c, Revision 1.12.4.5
1.12.4.5! thorpej 1: /* $NetBSD: sys_futex.c,v 1.12.4.4 2021/08/06 23:53:53 thorpej Exp $ */
1.1 thorpej 2:
3: /*-
4: * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
5: * All rights reserved.
6: *
7: * This code is derived from software contributed to The NetBSD Foundation
8: * by Taylor R. Campbell and Jason R. Thorpe.
9: *
10: * Redistribution and use in source and binary forms, with or without
11: * modification, are permitted provided that the following conditions
12: * are met:
13: * 1. Redistributions of source code must retain the above copyright
14: * notice, this list of conditions and the following disclaimer.
15: * 2. Redistributions in binary form must reproduce the above copyright
16: * notice, this list of conditions and the following disclaimer in the
17: * documentation and/or other materials provided with the distribution.
18: *
19: * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20: * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21: * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23: * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24: * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25: * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26: * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27: * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28: * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29: * POSSIBILITY OF SUCH DAMAGE.
30: */
31:
32: #include <sys/cdefs.h>
1.12.4.5! thorpej 33: __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.12.4.4 2021/08/06 23:53:53 thorpej Exp $");
1.1 thorpej 34:
35: /*
36: * Futexes
37: *
38: * The futex system call coordinates notifying threads waiting for
39: * changes on a 32-bit word of memory. The word can be managed by
40: * CPU atomic operations in userland, without system calls, as long
41: * as there is no contention.
42: *
43: * The simplest use case demonstrating the utility is:
44: *
45: * // 32-bit word of memory shared among threads or
46: * // processes in userland. lock & 1 means owned;
47: * // lock & 2 means there are waiters waiting.
48: * volatile int lock = 0;
49: *
50: * int v;
51: *
52: * // Acquire a lock.
53: * do {
54: * v = lock;
55: * if (v & 1) {
56: * // Lock is held. Set a bit to say that
57: * // there are waiters, and wait for lock
58: * // to change to anything other than v;
59: * // then retry.
60: * if (atomic_cas_uint(&lock, v, v | 2) != v)
61: * continue;
62: * futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
63: * continue;
64: * }
 65:  *	} while (atomic_cas_uint(&lock, v, v | 1) != v);
66: * membar_enter();
67: *
68: * ...
69: *
70: * // Release the lock. Optimistically assume there are
71: * // no waiters first until demonstrated otherwise.
72: * membar_exit();
73: * if (atomic_cas_uint(&lock, 1, 0) != 1) {
74: * // There may be waiters.
75: * v = atomic_swap_uint(&lock, 0);
76: * // If there are still waiters, wake one.
77: * if (v & 2)
78: * futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
79: * }
80: *
81: * The goal is to avoid the futex system call unless there is
82: * contention; then if there is contention, to guarantee no missed
83: * wakeups.
84: *
85: * For a simple implementation, futex(FUTEX_WAIT) could queue
86: * itself to be woken, double-check the lock word, and then sleep;
87: * spurious wakeups are generally a fact of life, so any
88: * FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
89: *
90: * If this were all there is to it, we could then increase
91: * parallelism by refining the approximation: partition the
92: * waiters into buckets by hashing the lock addresses to reduce
93: * the incidence of spurious wakeups. But this is not all.
94: *
95: * The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
96: * operation not only wakes n waiters on lock if lock == val, but
97: * also _transfers_ m additional waiters to lock2. Unless wakeups
98: * on lock2 also trigger wakeups on lock, we cannot move waiters
99: * to lock2 if they merely share the same hash as waiters on lock.
100: * Thus, we can't approximately distribute waiters into queues by
101: * a hash function; we must distinguish futex queues exactly by
102: * lock address.
103: *
104: * For now, we use a global red/black tree to index futexes. This
105: * should be replaced by a lockless radix tree with a thread to
106: * free entries no longer in use once all lookups on all CPUs have
107: * completed.
108: *
109: * Specifically, we maintain two maps:
110: *
111: * futex_tab.va[vmspace, va] for private futexes
112: * futex_tab.oa[uvm_voaddr] for shared futexes
113: *
114: * This implementation does not support priority inheritance.
115: */
116:
1.12 skrll 117: #include <sys/param.h>
1.1 thorpej 118: #include <sys/types.h>
119: #include <sys/atomic.h>
120: #include <sys/condvar.h>
121: #include <sys/futex.h>
122: #include <sys/mutex.h>
123: #include <sys/rbtree.h>
1.12.4.1 thorpej 124: #include <sys/sleepq.h>
1.1 thorpej 125: #include <sys/queue.h>
1.12.4.1 thorpej 126: #include <sys/sdt.h>
1.1 thorpej 127:
128: #include <sys/syscall.h>
129: #include <sys/syscallargs.h>
130: #include <sys/syscallvar.h>
131:
132: #include <uvm/uvm_extern.h>
133:
134: /*
1.12.4.1 thorpej 135: * DTrace probes.
136: */
SDT_PROVIDER_DEFINE(futex);

/* entry: uaddr, val, bitset, timeout, clkflags, fflags */
/* exit: error */
SDT_PROBE_DEFINE6(futex, func, wait, entry, "int *", "int", "int",
    "struct timespec *", "int", "int");
SDT_PROBE_DEFINE1(futex, func, wait, exit, "int");

/* entry: uaddr, nwake, bitset, fflags */
/* exit: error, nwoken */
SDT_PROBE_DEFINE4(futex, func, wake, entry, "int *", "int", "int", "int");
SDT_PROBE_DEFINE2(futex, func, wake, exit, "int", "int");

/* entry: uaddr, nwake, uaddr2, nrequeue, fflags */
/* exit: error, nwoken */
SDT_PROBE_DEFINE5(futex, func, requeue, entry, "int *", "int", "int *", "int",
    "int");
SDT_PROBE_DEFINE2(futex, func, requeue, exit, "int", "int");

/* entry: uaddr, nwake, uaddr2, nrequeue, val3, fflags */
/* exit: error, nwoken */
SDT_PROBE_DEFINE6(futex, func, cmp_requeue, entry, "int *", "int", "int *",
    "int", "int", "int");
SDT_PROBE_DEFINE2(futex, func, cmp_requeue, exit, "int", "int");

/* entry: uaddr, nwake, uaddr2, nwake2, wakeop, fflags */
/* exit: error, nwoken */
SDT_PROBE_DEFINE6(futex, func, wake_op, entry, "int *", "int", "int *", "int",
    "int", "int");
SDT_PROBE_DEFINE2(futex, func, wake_op, exit, "int", "int");

/* how a futex wait concluded: normal wake, lost a wake race, or aborted */
SDT_PROBE_DEFINE0(futex, wait, finish, normally);
SDT_PROBE_DEFINE0(futex, wait, finish, wakerace);
SDT_PROBE_DEFINE0(futex, wait, finish, aborted);

/* entry: timo */
/* exit: error */
SDT_PROBE_DEFINE1(futex, wait, sleepq_block, entry, "int");
SDT_PROBE_DEFINE1(futex, wait, sleepq_block, exit, "int");
176:
177: /*
1.1 thorpej 178: * Lock order:
179: *
180: * futex_tab.lock
1.12.4.1 thorpej 181: * futex::fx_op_lock ordered by kva of struct futex
182: * -> futex::fx_sq_lock ordered by kva of sleepq lock
183: *
184: * N.B. multiple futexes can share a single sleepq lock.
1.1 thorpej 185: */
186:
187: /*
188: * union futex_key
189: *
190: * A futex is addressed either by a vmspace+va (private) or by
191: * a uvm_voaddr (shared).
192: */
typedef union futex_key {
	struct {
		struct vmspace *vmspace;	/* address space owning va */
		vaddr_t va;			/* user virtual address */
	} fk_private;
	struct uvm_voaddr fk_shared;		/* VM object address */
} futex_key_t;

/*
 * The layout must overlap fk_private.va with fk_shared.offset so that
 * the same storage can be viewed through either arm of the union.
 */
CTASSERT(offsetof(futex_key_t, fk_private.va) ==
    offsetof(futex_key_t, fk_shared.offset));
1.1 thorpej 203:
204: /*
205: * struct futex
206: *
207: * Kernel state for a futex located at a particular address in a
208: * particular virtual address space.
209: *
210: * N.B. fx_refcnt is an unsigned long because we need to be able
211: * to operate on it atomically on all systems while at the same
212: * time rendering practically impossible the chance of it reaching
213: * its max value. In practice, we're limited by the number of LWPs
214: * that can be present on the system at any given time, and the
215: * assumption is that limit will be good enough on a 32-bit platform.
216: * See futex_wake() for why overflow needs to be avoided.
1.12.4.1 thorpej 217: *
218: * XXX Since futex addresses must be 4-byte aligned, we could
219: * XXX squirrel away fx_shared and fx_on_tree bits in the "va"
220: * XXX field of the key. Worth it?
1.1 thorpej 221: */
struct futex {
	futex_key_t		fx_key;		/* lookup key (see above) */
	struct rb_node		fx_node;	/* linkage in futex_tab.va/.oa */
	unsigned long		fx_refcnt;	/* atomic; see comment above */
	bool			fx_shared;	/* true: keyed by uvm_voaddr */
	bool			fx_on_tree;	/* protected by futex_tab.lock */
	uint8_t			fx_class;	/* FUTEX_CLASS_*, fixed at create */

	kmutex_t		fx_op_lock;	/* adaptive */
	kmutex_t		*fx_sq_lock;	/* &sleepq_locks[...] */
	sleepq_t		fx_sqs[2];	/* 0=reader, 1=writer */
	unsigned int		fx_nwaiters[2];	/* per-queue waiter counts */
};
235:
236: /*
1.12.4.1 thorpej 237: * futex classes: Some futex operations can only be carried out on
238: * futexes that are known to be abiding by a certain protocol. These
239: * classes are assigned to a futex when created due to a wait event,
240: * and when subsequent wake or requeue operations are issued, the
241: * class is checked at futex lookup time. If the class does not match,
242: * EINVAL is the result.
243: */
#define	FUTEX_CLASS_ANY		0	/* match any class in lookup */
#define	FUTEX_CLASS_NORMAL	1	/* normal futex */
#define	FUTEX_CLASS_RWLOCK	2	/* rwlock futex */
#define	FUTEX_CLASS_PI		3	/* for FUTEX_*_PI ops */

/* sleepq definitions: indices into fx_sqs[] / fx_nwaiters[] */
#define	FUTEX_READERQ		0	/* readers' sleep queue */
#define	FUTEX_WRITERQ		1	/* writers' sleep queue */
252:
/* wchan message displayed for LWPs sleeping on a futex */
static const char futex_wmesg[] = "futex";

static void futex_unsleep(lwp_t *, bool);

/*
 * Sync-object ops used when LWPs block on a futex sleepq; only the
 * unsleep hook is futex-specific (it maintains fx_nwaiters[]).
 */
static syncobj_t futex_syncobj = {
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_unsleep	= futex_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};
264:
265: /*
266: * futex_tab
267: *
268: * Global trees of futexes by vmspace/va and VM object address.
269: *
270: * XXX This obviously doesn't scale in parallel. We could use a
271: * pserialize-safe data structure, but there may be a high cost to
272: * frequent deletion since we don't cache futexes after we're done
273: * with them. We could use hashed locks. But for now, just make
274: * sure userland can't DoS the serial performance, by using a
275: * balanced binary tree for lookup.
276: *
277: * XXX We could use a per-process tree for the table indexed by
278: * virtual address to reduce contention between processes.
279: */
static struct {
	kmutex_t	lock;	/* guards both trees and fx_on_tree */
	struct rb_tree	va;	/* private futexes, keyed vmspace+va */
	struct rb_tree	oa;	/* shared futexes, keyed by uvm_voaddr */
} futex_tab __cacheline_aligned;
285:
286: static int
287: compare_futex_key(void *cookie, const void *n, const void *k)
288: {
289: const struct futex *fa = n;
1.12.4.4 thorpej 290: const futex_key_t *fka = &fa->fx_key;
291: const futex_key_t *fkb = k;
1.1 thorpej 292:
293: if ((uintptr_t)fka->fk_private.vmspace <
294: (uintptr_t)fkb->fk_private.vmspace)
295: return -1;
296: if ((uintptr_t)fka->fk_private.vmspace >
297: (uintptr_t)fkb->fk_private.vmspace)
298: return +1;
299: if (fka->fk_private.va < fkb->fk_private.va)
300: return -1;
301: if (fka->fk_private.va > fkb->fk_private.va)
302: return -1;
303: return 0;
304: }
305:
306: static int
307: compare_futex(void *cookie, const void *na, const void *nb)
308: {
309: const struct futex *fa = na;
310: const struct futex *fb = nb;
311:
312: return compare_futex_key(cookie, fa, &fb->fx_key);
313: }
314:
/* rb_tree glue for the private (vmspace+va) futex tree, futex_tab.va. */
static const rb_tree_ops_t futex_rb_ops = {
	.rbto_compare_nodes = compare_futex,
	.rbto_compare_key = compare_futex_key,
	.rbto_node_offset = offsetof(struct futex, fx_node),
};
320:
321: static int
322: compare_futex_shared_key(void *cookie, const void *n, const void *k)
323: {
324: const struct futex *fa = n;
1.12.4.4 thorpej 325: const futex_key_t *fka = &fa->fx_key;
326: const futex_key_t *fkb = k;
1.1 thorpej 327:
328: return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
329: }
330:
331: static int
332: compare_futex_shared(void *cookie, const void *na, const void *nb)
333: {
334: const struct futex *fa = na;
335: const struct futex *fb = nb;
336:
337: return compare_futex_shared_key(cookie, fa, &fb->fx_key);
338: }
339:
/* rb_tree glue for the shared (uvm_voaddr) futex tree, futex_tab.oa. */
static const rb_tree_ops_t futex_shared_rb_ops = {
	.rbto_compare_nodes = compare_futex_shared,
	.rbto_compare_key = compare_futex_shared_key,
	.rbto_node_offset = offsetof(struct futex, fx_node),
};
345:
1.12.4.1 thorpej 346: /*
347: * futex_sq_lock2(f, f2)
348: *
349: * Acquire the sleepq locks for f and f2, which may be null, or
350: * which may be the same. If they are distinct, an arbitrary total
351: * order is chosen on the locks.
352: *
353: * Callers should only ever acquire multiple sleepq locks
354: * simultaneously using futex_sq_lock2.
355: */
static void
futex_sq_lock2(struct futex * const f, struct futex * const f2)
{

	/*
	 * If both are null, do nothing; if one is null and the other
	 * is not, lock the other and be done with it.
	 */
	if (f == NULL && f2 == NULL) {
		return;
	} else if (f == NULL) {
		mutex_spin_enter(f2->fx_sq_lock);
		return;
	} else if (f2 == NULL) {
		mutex_spin_enter(f->fx_sq_lock);
		return;
	}

	kmutex_t * const m = f->fx_sq_lock;
	kmutex_t * const m2 = f2->fx_sq_lock;

	/*
	 * If both locks are the same, acquire only one.  (Distinct
	 * futexes can share a sleepq lock; see the lock-order comment.)
	 */
	if (m == m2) {
		mutex_spin_enter(m);
		return;
	}

	/*
	 * Otherwise, use the ordering on the kva of the lock pointer;
	 * a consistent global acquisition order prevents deadlock.
	 */
	if ((uintptr_t)m < (uintptr_t)m2) {
		mutex_spin_enter(m);
		mutex_spin_enter(m2);
	} else {
		mutex_spin_enter(m2);
		mutex_spin_enter(m);
	}
}
392:
393: /*
394: * futex_sq_unlock2(f, f2)
395: *
396: * Release the sleep queue locks for f and f2, which may be null, or
397: * which may have the same underlying lock.
398: */
static void
futex_sq_unlock2(struct futex * const f, struct futex * const f2)
{

	/*
	 * If both are null, do nothing; if one is null and the other
	 * is not, unlock the other and be done with it.
	 */
	if (f == NULL && f2 == NULL) {
		return;
	} else if (f == NULL) {
		mutex_spin_exit(f2->fx_sq_lock);
		return;
	} else if (f2 == NULL) {
		mutex_spin_exit(f->fx_sq_lock);
		return;
	}

	kmutex_t * const m = f->fx_sq_lock;
	kmutex_t * const m2 = f2->fx_sq_lock;

	/* If both locks are the same, release only one. */
	if (m == m2) {
		mutex_spin_exit(m);
		return;
	}

	/*
	 * Otherwise, use the ordering on the kva of the lock pointer,
	 * releasing in the reverse of futex_sq_lock2()'s order.
	 */
	if ((uintptr_t)m < (uintptr_t)m2) {
		mutex_spin_exit(m2);
		mutex_spin_exit(m);
	} else {
		mutex_spin_exit(m);
		mutex_spin_exit(m2);
	}
}
1.1 thorpej 435:
436: /*
437: * futex_load(uaddr, kaddr)
438: *
439: * Perform a single atomic load to read *uaddr, and return the
440: * result in *kaddr. Return 0 on success, EFAULT if uaddr is not
441: * mapped.
442: */
static inline int
futex_load(int *uaddr, int *kaddr)
{
	/* Single atomic 32-bit user fetch; fails with EFAULT if unmapped. */
	return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
}
448:
449: /*
450: * futex_test(uaddr, expected)
451: *
452: * True if *uaddr == expected. False if *uaddr != expected, or if
453: * uaddr is not mapped.
454: */
455: static bool
456: futex_test(int *uaddr, int expected)
457: {
458: int val;
459: int error;
460:
461: error = futex_load(uaddr, &val);
462: if (error)
463: return false;
464: return val == expected;
465: }
466:
/* Pool cache of struct futex; ctor/dtor set up the per-futex locks. */
static pool_cache_t futex_cache __read_mostly;

static int futex_ctor(void *, void *, int);
static void futex_dtor(void *, void *);
471:
1.1 thorpej 472: /*
473: * futex_sys_init()
474: *
475: * Initialize the futex subsystem.
476: */
void
futex_sys_init(void)
{

	mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
	rb_tree_init(&futex_tab.va, &futex_rb_ops);
	rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);

	/* Cache-line-aligned futex objects; ctor/dtor do lock setup. */
	futex_cache = pool_cache_init(sizeof(struct futex),
	    coherency_unit, 0, 0, "futex", NULL, IPL_NONE, futex_ctor,
	    futex_dtor, NULL);
}
489:
490: /*
491: * futex_sys_fini()
492: *
493: * Finalize the futex subsystem.
494: */
495: void
496: futex_sys_fini(void)
497: {
498:
499: KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
500: KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
501: mutex_destroy(&futex_tab.lock);
502: }
503:
504: /*
1.12.4.1 thorpej 505: * futex_ctor()
1.1 thorpej 506: *
1.12.4.1 thorpej 507: * Futex object constructor.
1.1 thorpej 508: */
static int
futex_ctor(void *arg __unused, void *obj, int flags __unused)
{
	extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE];
	struct futex * const f = obj;

	mutex_init(&f->fx_op_lock, MUTEX_DEFAULT, IPL_NONE);
	/*
	 * The sleepq lock is shared with the global sleep table,
	 * hashed on the futex's kva; several futexes may map to the
	 * same lock.
	 */
	f->fx_sq_lock = &sleepq_locks[SLEEPTAB_HASH(f)].lock;

	sleepq_init(&f->fx_sqs[FUTEX_READERQ]);
	sleepq_init(&f->fx_sqs[FUTEX_WRITERQ]);
	f->fx_nwaiters[FUTEX_READERQ] = f->fx_nwaiters[FUTEX_WRITERQ] = 0;

	return 0;
}
524:
525: /*
1.12.4.1 thorpej 526: * futex_dtor()
1.1 thorpej 527: *
1.12.4.1 thorpej 528: * Futex object destructor.
1.1 thorpej 529: */
static void
futex_dtor(void *arg __unused, void *obj)
{
	struct futex * const f = obj;

	mutex_destroy(&f->fx_op_lock);
	/* fx_sq_lock points into the global sleep table; just clear it. */
	f->fx_sq_lock = NULL;
}
538:
539: /*
540: * futex_key_init(key, vm, va, shared)
541: *
542: * Initialize a futex key for lookup, etc.
543: */
static int
futex_key_init(futex_key_t *fk, struct vmspace *vm, vaddr_t va, bool shared)
{
	int error = 0;

	if (__predict_false(shared)) {
		/*
		 * Shared futex: resolve va to a VM object address,
		 * taking a reference; EFAULT if va is not mapped.
		 */
		if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
			error = EFAULT;
	} else {
		/* Private futex: the key is simply vmspace+va. */
		fk->fk_private.vmspace = vm;
		fk->fk_private.va = va;
	}

	return error;
}
559:
560: /*
561: * futex_key_fini(key, shared)
562: *
563: * Release a futex key.
564: */
static void
futex_key_fini(futex_key_t *fk, bool shared)
{
	/* Shared keys hold a uvm_voaddr reference; drop it. */
	if (__predict_false(shared))
		uvm_voaddr_release(&fk->fk_shared);
	/* Wipe the key so stale pointers are not left behind. */
	memset(fk, 0, sizeof(*fk));
}
572:
573: /*
574: * futex_create(fk, shared)
575: *
576: * Create a futex. Initial reference count is 1, representing the
577: * caller. Returns NULL on failure. Always takes ownership of the
578: * key, either transferring it to the newly-created futex, or releasing
579: * the key if creation fails.
580: *
581: * Never sleeps for memory, but may sleep to acquire a lock.
582: */
static struct futex *
futex_create(futex_key_t *fk, bool shared, uint8_t class)
{
	struct futex *f;

	/* PR_NOWAIT: never sleep for memory, per the contract above. */
	f = pool_cache_get(futex_cache, PR_NOWAIT);
	if (f == NULL) {
		/* Creation failed; we still own the key, so release it. */
		futex_key_fini(fk, shared);
		return NULL;
	}
	f->fx_key = *fk;	/* takes ownership of the key */
	f->fx_refcnt = 1;	/* caller's reference */
	f->fx_shared = shared;
	f->fx_on_tree = false;
	f->fx_class = class;

	return f;
}
601:
602: /*
603: * futex_destroy(f)
604: *
605: * Destroy a futex created with futex_create. Reference count
606: * must be zero.
607: *
608: * May sleep.
609: */
static void
futex_destroy(struct futex *f)
{

	ASSERT_SLEEPABLE();

	/* Must be unreferenced, off the tree, and have no waiters. */
	KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
	KASSERT(!f->fx_on_tree);
	KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
	KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_WRITERQ]));
	KASSERT(f->fx_nwaiters[FUTEX_READERQ] == 0);
	KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] == 0);

	/* Releases the shared uvm_voaddr reference, if any. */
	futex_key_fini(&f->fx_key, f->fx_shared);

	pool_cache_put(futex_cache, f);
}
627:
628: /*
1.12.4.1 thorpej 629: * futex_hold_count(f, n)
1.1 thorpej 630: *
631: * Attempt to acquire a reference to f. Return 0 on success,
632: * ENFILE on too many references.
633: *
634: * Never sleeps.
635: */
static int
futex_hold_count(struct futex *f, unsigned long const count)
{
	unsigned long refcnt;

	/* Lock-free CAS loop; refuse to let the refcount overflow. */
	do {
		refcnt = atomic_load_relaxed(&f->fx_refcnt);
		if (ULONG_MAX - refcnt < count)
			return ENFILE;
	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
	    refcnt + count) != refcnt);

	return 0;
}
#define	futex_hold(f)	futex_hold_count(f, 1)
1.1 thorpej 651:
652: /*
1.12.4.1 thorpej 653: * futex_rele_count(f, n)
1.1 thorpej 654: *
655: * Release a reference to f acquired with futex_create or
656: * futex_hold.
657: *
658: * May sleep to free f.
659: */
static void
futex_rele_count(struct futex *f, unsigned long const count)
{
	unsigned long refcnt;

	ASSERT_SLEEPABLE();

	/* Fast path: CAS the count down while references remain. */
	do {
		refcnt = atomic_load_relaxed(&f->fx_refcnt);
		KASSERT(refcnt >= count);
		if (refcnt - count == 0)
			goto trylast;
	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
	    refcnt - count) != refcnt);
	return;

trylast:
	/*
	 * We may be dropping the last reference.  Re-drop under
	 * futex_tab.lock so that no concurrent lookup can resurrect
	 * the futex after we commit to destroying it; a lookup that
	 * raced in and took a reference makes the count nonzero again.
	 */
	KASSERT(count <= LONG_MAX);
	mutex_enter(&futex_tab.lock);
	if (atomic_add_long_nv(&f->fx_refcnt, -(long)count) == 0) {
		if (f->fx_on_tree) {
			if (__predict_false(f->fx_shared))
				rb_tree_remove_node(&futex_tab.oa, f);
			else
				rb_tree_remove_node(&futex_tab.va, f);
			f->fx_on_tree = false;
		}
	} else {
		/* References remain -- don't destroy it. */
		f = NULL;
	}
	mutex_exit(&futex_tab.lock);
	if (f != NULL)
		futex_destroy(f);
}
#define	futex_rele(f)	futex_rele_count(f, 1)
1.1 thorpej 696:
697: /*
1.12.4.1 thorpej 698: * futex_rele_count_not_last(f, n)
1.1 thorpej 699: *
700: * Release a reference to f acquired with futex_create or
701: * futex_hold.
702: *
703: * This version asserts that we are not dropping the last
704: * reference to f.
705: */
static void
futex_rele_count_not_last(struct futex *f, unsigned long const count)
{
	unsigned long refcnt;

	/*
	 * No trylast path here: the caller guarantees references
	 * remain, so no tree removal or destroy can be needed and the
	 * CAS loop alone suffices (hence no ASSERT_SLEEPABLE either).
	 */
	do {
		refcnt = atomic_load_relaxed(&f->fx_refcnt);
		KASSERT(refcnt >= count);
	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
	    refcnt - count) != refcnt);
}
717:
718: /*
1.12.4.1 thorpej 719: * futex_lookup_by_key(key, shared, class, &f)
1.1 thorpej 720: *
721: * Try to find an existing futex va reference in the specified key
722: * On success, return 0, set f to found futex or to NULL if not found,
723: * and increment f's reference count if found.
724: *
725: * Return ENFILE if reference count too high.
726: *
727: * Internal lookup routine shared by futex_lookup() and
1.5 riastrad 728: * futex_lookup_create().
1.1 thorpej 729: */
static int
futex_lookup_by_key(futex_key_t *fk, bool shared, uint8_t class,
    struct futex **fp)
{
	struct futex *f;
	int error = 0;

	mutex_enter(&futex_tab.lock);
	/* Shared and private futexes live in separate trees. */
	if (__predict_false(shared)) {
		f = rb_tree_find_node(&futex_tab.oa, fk);
	} else {
		f = rb_tree_find_node(&futex_tab.va, fk);
	}
	if (f) {
		/*
		 * Enforce the class protocol: only an exact class
		 * match (or a FUTEX_CLASS_ANY wildcard lookup) may
		 * take a reference; otherwise EINVAL.
		 */
		if (__predict_true(f->fx_class == class ||
				   class == FUTEX_CLASS_ANY))
			error = futex_hold(f);
		else
			error = EINVAL;
		if (error)
			f = NULL;
	}
	*fp = f;
	mutex_exit(&futex_tab.lock);

	return error;
}
757:
758: /*
759: * futex_insert(f, fp)
760: *
761: * Try to insert the futex f into the tree by va. If there
762: * already is a futex for its va, acquire a reference to it, and
763: * store it in *fp; otherwise store f in *fp.
764: *
765: * Return 0 on success, ENFILE if there already is a futex but its
766: * reference count is too high.
767: */
static int
futex_insert(struct futex *f, struct futex **fp)
{
	struct futex *f0;
	int error;

	KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
	KASSERT(!f->fx_on_tree);

	mutex_enter(&futex_tab.lock);
	if (__predict_false(f->fx_shared))
		f0 = rb_tree_insert_node(&futex_tab.oa, f);
	else
		f0 = rb_tree_insert_node(&futex_tab.va, f);
	if (f0 == f) {
		/* We won: f is now on the tree. */
		f->fx_on_tree = true;
		error = 0;
	} else {
		/* Someone beat us to it; reference the existing futex. */
		KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
		KASSERT(f0->fx_on_tree);
		error = futex_hold(f0);
		if (error)
			goto out;
	}
	*fp = f0;
out:	mutex_exit(&futex_tab.lock);

	return error;
}
797:
798: /*
1.12.4.1 thorpej 799: * futex_lookup(uaddr, shared, class, &f)
1.1 thorpej 800: *
801: * Find a futex at the userland pointer uaddr in the current
802: * process's VM space. On success, return the futex in f and
803: * increment its reference count.
804: *
1.5 riastrad 805: * Caller must call futex_rele when done.
1.1 thorpej 806: */
static int
futex_lookup(int *uaddr, bool shared, uint8_t class, struct futex **fp)
{
	futex_key_t fk;
	struct vmspace *vm = curproc->p_vmspace;
	vaddr_t va = (vaddr_t)uaddr;
	int error;

	/*
	 * Reject unaligned user pointers so we don't cross page
	 * boundaries and so atomics will work.
	 */
	if (__predict_false((va & 3) != 0))
		return EINVAL;

	/* Look it up. */
	error = futex_key_init(&fk, vm, va, shared);
	if (error)
		return error;

	error = futex_lookup_by_key(&fk, shared, class, fp);
	/* The key was only needed for the lookup; release it. */
	futex_key_fini(&fk, shared);
	if (error)
		return error;

	KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
	KASSERT(*fp == NULL || (*fp)->fx_class == class);
	KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);

	/*
	 * Success!  (Caller must still check whether we found
	 * anything, but nothing went _wrong_ like trying to use
	 * unmapped memory.)
	 */
	KASSERT(error == 0);

	return error;
}
845:
846: /*
1.12.4.1 thorpej 847: * futex_lookup_create(uaddr, shared, class, &f)
1.1 thorpej 848: *
849: * Find or create a futex at the userland pointer uaddr in the
850: * current process's VM space. On success, return the futex in f
851: * and increment its reference count.
852: *
1.5 riastrad 853: * Caller must call futex_rele when done.
1.1 thorpej 854: */
static int
futex_lookup_create(int *uaddr, bool shared, uint8_t class, struct futex **fp)
{
	futex_key_t fk;
	struct vmspace *vm = curproc->p_vmspace;
	struct futex *f = NULL;
	vaddr_t va = (vaddr_t)uaddr;
	int error;

	/*
	 * Reject unaligned user pointers so we don't cross page
	 * boundaries and so atomics will work.
	 */
	if (__predict_false((va & 3) != 0))
		return EINVAL;

	/* Creation must assign a concrete class, never the wildcard. */
	if (__predict_false(class == FUTEX_CLASS_ANY))
		return EINVAL;

	error = futex_key_init(&fk, vm, va, shared);
	if (error)
		return error;

	/*
	 * Optimistically assume there already is one, and try to find
	 * it.
	 */
	error = futex_lookup_by_key(&fk, shared, class, fp);
	if (error || *fp != NULL) {
		/*
		 * We either found one, or there was an error.
		 * In either case, we are done with the key.
		 */
		futex_key_fini(&fk, shared);
		goto out;
	}

	/*
	 * Create a futex record.  This transfers ownership of the key
	 * in all cases.
	 */
	f = futex_create(&fk, shared, class);
	if (f == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Insert our new futex, or use existing if someone else beat
	 * us to it.
	 */
	error = futex_insert(f, fp);
	if (error)
		goto out;
	if (*fp == f)
		f = NULL;	/* don't release on exit */

	/* Success! */
	KASSERT(error == 0);

out:	if (f != NULL)
		futex_rele(f);
	KASSERT(error || *fp != NULL);
	KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
	return error;
}
921:
922: /*
1.12.4.1 thorpej 923: * futex_unsleep:
1.1 thorpej 924: *
1.12.4.1 thorpej 925: * Remove an LWP from the futex and sleep queue. This is called when
926: * the LWP has not been awoken normally but instead interrupted: for
927: * example, when a signal is received. Must be called with the LWP
928: * locked. Will unlock if "unlock" is true.
1.1 thorpej 929: */
static void
futex_unsleep(lwp_t *l, bool unlock)
{
	struct futex *f __diagused;

	/* The LWP's wait channel is the futex it is sleeping on. */
	f = (struct futex *)(uintptr_t)l->l_wchan;

	KASSERT(mutex_owned(f->fx_sq_lock));

	/* Keep fx_nwaiters[] in sync with the queue the LWP is on. */
	/* WRITERQ is more likely */
	if (__predict_true(l->l_sleepq == &f->fx_sqs[FUTEX_WRITERQ])) {
		KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] > 0);
		f->fx_nwaiters[FUTEX_WRITERQ]--;
	} else {
		KASSERT(l->l_sleepq == &f->fx_sqs[FUTEX_READERQ]);
		KASSERT(f->fx_nwaiters[FUTEX_READERQ] > 0);
		f->fx_nwaiters[FUTEX_READERQ]--;
	}

	sleepq_unsleep(l, unlock);
}
951:
/*
 * futex_wait(f, q, deadline, clkid, bitset)
 *
 *	Wait on futex f, on the sleep queue selected by q
 *	(FUTEX_READERQ or FUTEX_WRITERQ), until the deadline on the
 *	clock clkid, or forever if deadline is NULL, for a futex
 *	wakeup whose wake selector intersects bitset.  Return 0 on
 *	explicit wakeup, ETIMEDOUT on timeout, EINTR on signal,
 *	ENFILE if the reader queue is full.
 *
 *	Caller must hold f's op lock on entry; it is released before
 *	this function returns (either on an early-exit error path, or
 *	once the LWP is safely enqueued on the sleepq).
 */
static int
futex_wait(struct futex *f, int q, const struct timespec *deadline,
    clockid_t clkid, int bitset)
{

	/*
	 * Some things to note about how this function works:
	 *
	 * ==> When we enter futex_wait(), the futex's op lock is
	 * held, preventing us from missing wakes.  Once we are on
	 * the futex's sleepq, it is safe to release the op lock.
	 *
	 * ==> We have a single early escape to avoid calling
	 * sleepq_block(): our deadline already having passed.
	 * If this is a no-timeout wait, or if the deadline has
	 * not already passed, we are guaranteed to call sleepq_block().
	 *
	 * ==> sleepq_block() contains all of the necessary logic
	 * for handling signals; we do not need to check for them
	 * ourselves.
	 *
	 * ==> Once we have blocked, other futex operations will
	 * be able to wake us or requeue us.  Until we have blocked,
	 * those other futex operations will not be able to acquire
	 * the sleepq locks in order to perform those operations,
	 * even if we have dropped the op lock.  Thus, we will not
	 * miss wakes.  This fundamental invariant is relied upon by
	 * every other synchronization object in the kernel.
	 *
	 * ==> sleepq_block() returns for one of three reasons:
	 *
	 *	-- We were awakened.
	 *	-- We were signaled.
	 *	-- We timed out.
	 *
	 * ==> Once sleepq_block() returns, we will not call
	 * sleepq_block() again.
	 *
	 * ==> If sleepq_block() returns due to being signaled,
	 * we must check to see if we were also awakened.  This
	 * is to handle the following sequence:
	 *
	 *	-- Futex wake takes us off sleepq, places us on runq.
	 *	-- We are signaled while sitting on the runq.
	 *	-- We run, and sleepq_block() notices the signal and
	 *	   returns EINTR/ERESTART.
	 *
	 * In this situation, we want to indicate a successful wake
	 * to the caller, because that wake has been reported to the
	 * thread that issued it.
	 *
	 * Note that it is NOT possible for a wake to be issued after
	 * a signal; the LWP will already have been taken off the sleepq
	 * by the signal, so the wake will never happen.  Note that for
	 * this reason, we must *never* return ERESTART, because it could
	 * result in us waiting again with no one to awaken us.
	 */

	struct lwp * const l = curlwp;
	int timo;
	int error;

	/*
	 * Compute our timeout before taking any locks.
	 */
	if (deadline) {
		struct timespec ts;

		/* Check our watch. */
		error = clock_gettime1(clkid, &ts);
		if (error) {
			mutex_exit(&f->fx_op_lock);
			return error;
		}

		/*
		 * If we're past the deadline, ETIMEDOUT.
		 */
		if (timespeccmp(deadline, &ts, <=)) {
			mutex_exit(&f->fx_op_lock);
			return ETIMEDOUT;
		} else {
			/* Count how much time is left. */
			timespecsub(deadline, &ts, &ts);
			timo = tstohz(&ts);
			if (timo == 0) {
				/*
				 * tstohz() already rounds up, and
				 * a return value of 0 ticks means
				 * "expired now or in the past".
				 */
				mutex_exit(&f->fx_op_lock);
				return ETIMEDOUT;
			}
		}
	} else {
		/* timo == 0 means "sleep with no timeout" below. */
		timo = 0;
	}

	/*
	 * Acquire in sleepq -> lwp order.  While we're at it, ensure
	 * that there's room for us to block: the reader-queue count
	 * must fit in the TID field of the futex word.
	 */
	mutex_spin_enter(f->fx_sq_lock);
	if (__predict_false(q == FUTEX_READERQ &&
			    f->fx_nwaiters[q] == FUTEX_TID_MASK)) {
		mutex_spin_exit(f->fx_sq_lock);
		mutex_exit(&f->fx_op_lock);
		return ENFILE;
	}
	lwp_lock(l);

	/*
	 * No need for locks here; until we're on a futex's sleepq, these
	 * values are private to the LWP, and the locks needed to get onto
	 * the sleepq provide the memory barriers needed to ensure visibility.
	 */
	l->l_futex = f;
	l->l_futex_wakesel = bitset;

	/*
	 * This is an inlined bit of sleepq_enter();
	 * we already hold the lwp lock.
	 */
	lwp_unlock_to(l, f->fx_sq_lock);
	KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);
	KASSERT(lwp_locked(l, f->fx_sq_lock));

	sleepq_enqueue(&f->fx_sqs[q], f, futex_wmesg, &futex_syncobj, true);
	f->fx_nwaiters[q]++;

	/* We can now safely release the op lock. */
	mutex_exit(&f->fx_op_lock);

	SDT_PROBE1(futex, wait, sleepq_block, entry, timo);
	error = sleepq_block(timo, true);
	SDT_PROBE1(futex, wait, sleepq_block, exit, error);

	/*
	 * f is no longer valid; we may have been moved to another
	 * futex sleepq while we slept.
	 */

	/*
	 * If something went wrong, we may still have a futex reference.
	 * Check for that here and drop it if so.  We can do this without
	 * taking any locks because l->l_futex is private to the LWP when
	 * not on any futex's sleepq, and we are not on any sleepq because
	 * we are running.
	 *
	 * Convert EWOULDBLOCK to ETIMEDOUT in case sleepq_block() returned
	 * EWOULDBLOCK, and ensure the only other error we return is EINTR.
	 */
	if (error) {
		f = l->l_futex;
		if (f != NULL) {
			/* Not woken: drop the reference we still hold. */
			SDT_PROBE0(futex, wait, finish, aborted);
			l->l_futex = NULL;
			futex_rele(f);
		} else {
			/* Raced with wakeup; report success. */
			SDT_PROBE0(futex, wait, finish, wakerace);
			error = 0;
		}
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
		else if (error != ETIMEDOUT)
			error = EINTR;
	} else {
		f = l->l_futex;
		if (__predict_false(f != NULL)) {
			/*
			 * There are certain situations that can cause
			 * sleepq_block() to return 0 even if we were
			 * signalled (by e.g. SIGKILL).  In this case,
			 * we will need to release our futex hold and
			 * return EINTR (we're probably about to die
			 * anyway).
			 */
			SDT_PROBE0(futex, wait, finish, aborted);
			l->l_futex = NULL;
			futex_rele(f);
			error = EINTR;
		} else {
			SDT_PROBE0(futex, wait, finish, normally);
		}
	}

	return error;
}
1149:
/*
 * futex_wake(f, q, nwake, f2, q2, nrequeue, bitset, nrequeuedp)
 *
 *	Wake up to nwake waiters on f's sleep queue q whose wake
 *	selector intersects bitset; then, if f2 is provided, move up
 *	to nrequeue remaining matching waiters on q to f2's queue q2.
 *	Return the number of waiters actually woken; if nrequeuedp is
 *	not NULL, also store the number requeued through it.
 *	Caller must hold the op locks of f and f2, if provided.
 */
static unsigned
futex_wake(struct futex *f, int q, unsigned int const nwake,
    struct futex *f2, int q2, unsigned int const nrequeue,
    int bitset, unsigned int *nrequeuedp)
{
	struct lwp *l, *l_next;
	unsigned nwoken = 0;
	unsigned nrequeued = 0;

	KASSERT(mutex_owned(&f->fx_op_lock));
	KASSERT(f2 == NULL || mutex_owned(&f2->fx_op_lock));

	futex_sq_lock2(f, f2);

	/* Wake up to nwake waiters, and count the number woken. */
	LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
		if (nwoken == nwake)
			break;

		KASSERT(l->l_syncobj == &futex_syncobj);
		KASSERT(l->l_wchan == (wchan_t)f);
		KASSERT(l->l_futex == f);
		KASSERT(l->l_sleepq == &f->fx_sqs[q]);
		KASSERT(l->l_mutex == f->fx_sq_lock);

		/* Skip waiters whose wake selector doesn't match. */
		if ((l->l_futex_wakesel & bitset) == 0)
			continue;

		/* Clearing l_futex tells the waiter it was woken. */
		l->l_futex_wakesel = 0;
		l->l_futex = NULL;
		sleepq_remove(&f->fx_sqs[q], l);
		f->fx_nwaiters[q]--;
		nwoken++;
		/* hold counts adjusted in bulk below */
	}

	if (nrequeue) {
		KASSERT(f2 != NULL);
		/* Move up to nrequeue waiters from f's queue to f2's queue. */
		LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
			if (nrequeued == nrequeue)
				break;

			KASSERT(l->l_syncobj == &futex_syncobj);
			KASSERT(l->l_wchan == (wchan_t)f);
			KASSERT(l->l_futex == f);
			KASSERT(l->l_sleepq == &f->fx_sqs[q]);
			KASSERT(l->l_mutex == f->fx_sq_lock);

			/* Skip waiters whose wake selector doesn't match. */
			if ((l->l_futex_wakesel & bitset) == 0)
				continue;

			/* Requeued waiters now belong to f2. */
			l->l_futex = f2;
			sleepq_transfer(l, &f->fx_sqs[q],
			    &f2->fx_sqs[q2], f2, futex_wmesg,
			    &futex_syncobj, f2->fx_sq_lock, true);
			f->fx_nwaiters[q]--;
			f2->fx_nwaiters[q2]++;
			nrequeued++;
			/* hold counts adjusted in bulk below */
		}
		if (nrequeued) {
			/* XXX futex_hold() could theoretically fail here. */
			int hold_error __diagused =
			    futex_hold_count(f2, nrequeued);
			KASSERT(hold_error == 0);
		}
	}

	/*
	 * Adjust the futex reference counts for the wakes and
	 * requeues.
	 */
	KASSERT(nwoken + nrequeued <= LONG_MAX);
	futex_rele_count_not_last(f, nwoken + nrequeued);

	futex_sq_unlock2(f, f2);

	/* Return the number of waiters woken and requeued. */
	if (nrequeuedp != NULL) {
		*nrequeuedp = nrequeued;
	}
	return nwoken;
}
1242:
/*
 * futex_op_lock(f)
 *
 *	Acquire the op lock of f.  Pair with futex_op_unlock.  Do
 *	not use if caller needs to acquire two locks; use
 *	futex_op_lock2 instead.
 *
 *	The op lock serializes futex operations (wait/wake/requeue)
 *	on f; futex_wait() drops it on the waiter's behalf once the
 *	waiter is safely on the sleepq.
 */
static void
futex_op_lock(struct futex *f)
{
	mutex_enter(&f->fx_op_lock);
}
1255:
/*
 * futex_op_unlock(f)
 *
 *	Release the op lock of f.  Pair with futex_op_lock (or with
 *	the lock acquired by futex_op_lock2 on a single futex).
 */
static void
futex_op_unlock(struct futex *f)
{
	mutex_exit(&f->fx_op_lock);
}
1266:
1267: /*
1.12.4.1 thorpej 1268: * futex_op_lock2(f, f2)
1.1 thorpej 1269: *
1.12.4.1 thorpej 1270: * Acquire the op locks of both f and f2, which may be null, or
1271: * which may be the same. If they are distinct, an arbitrary total
1272: * order is chosen on the locks.
1.1 thorpej 1273: *
1.12.4.1 thorpej 1274: * Callers should only ever acquire multiple op locks
1275: * simultaneously using futex_op_lock2.
1.1 thorpej 1276: */
1277: static void
1.12.4.1 thorpej 1278: futex_op_lock2(struct futex *f, struct futex *f2)
1.1 thorpej 1279: {
1280:
1281: /*
1282: * If both are null, do nothing; if one is null and the other
1283: * is not, lock the other and be done with it.
1284: */
1285: if (f == NULL && f2 == NULL) {
1286: return;
1287: } else if (f == NULL) {
1.12.4.1 thorpej 1288: mutex_enter(&f2->fx_op_lock);
1.1 thorpej 1289: return;
1290: } else if (f2 == NULL) {
1.12.4.1 thorpej 1291: mutex_enter(&f->fx_op_lock);
1.1 thorpej 1292: return;
1293: }
1294:
1295: /* If both futexes are the same, acquire only one. */
1296: if (f == f2) {
1.12.4.1 thorpej 1297: mutex_enter(&f->fx_op_lock);
1.1 thorpej 1298: return;
1299: }
1300:
1301: /* Otherwise, use the ordering on the kva of the futex pointer. */
1302: if ((uintptr_t)f < (uintptr_t)f2) {
1.12.4.1 thorpej 1303: mutex_enter(&f->fx_op_lock);
1304: mutex_enter(&f2->fx_op_lock);
1.1 thorpej 1305: } else {
1.12.4.1 thorpej 1306: mutex_enter(&f2->fx_op_lock);
1307: mutex_enter(&f->fx_op_lock);
1.1 thorpej 1308: }
1309: }
1310:
1311: /*
1.12.4.1 thorpej 1312: * futex_op_unlock2(f, f2)
1.1 thorpej 1313: *
1314: * Release the queue locks of both f and f2, which may be null, or
1315: * which may have the same underlying queue.
1316: */
1317: static void
1.12.4.1 thorpej 1318: futex_op_unlock2(struct futex *f, struct futex *f2)
1.1 thorpej 1319: {
1320:
1321: /*
1322: * If both are null, do nothing; if one is null and the other
1323: * is not, unlock the other and be done with it.
1324: */
1325: if (f == NULL && f2 == NULL) {
1326: return;
1327: } else if (f == NULL) {
1.12.4.1 thorpej 1328: mutex_exit(&f2->fx_op_lock);
1.1 thorpej 1329: return;
1330: } else if (f2 == NULL) {
1.12.4.1 thorpej 1331: mutex_exit(&f->fx_op_lock);
1.1 thorpej 1332: return;
1333: }
1334:
1335: /* If both futexes are the same, release only one. */
1336: if (f == f2) {
1.12.4.1 thorpej 1337: mutex_exit(&f->fx_op_lock);
1.1 thorpej 1338: return;
1339: }
1340:
1341: /* Otherwise, use the ordering on the kva of the futex pointer. */
1342: if ((uintptr_t)f < (uintptr_t)f2) {
1.12.4.1 thorpej 1343: mutex_exit(&f2->fx_op_lock);
1344: mutex_exit(&f->fx_op_lock);
1.1 thorpej 1345: } else {
1.12.4.1 thorpej 1346: mutex_exit(&f->fx_op_lock);
1347: mutex_exit(&f2->fx_op_lock);
1.1 thorpej 1348: }
1349: }
1350:
/*
 * futex_func_wait(shared, uaddr, val, val3, timeout, clkid, clkflags, retval)
 *
 *	Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET):
 *	block until woken, as long as *uaddr still contains val.
 *	val3 is the wake-selection bitset; clkflags says whether
 *	timeout is relative (TIMER_RELTIME) or an absolute deadline
 *	(TIMER_ABSTIME) on clock clkid.
 */
static int
futex_func_wait(bool shared, int *uaddr, int val, int val3,
    const struct timespec *timeout, clockid_t clkid, int clkflags,
    register_t *retval)
{
	struct futex *f;
	struct timespec ts;
	const struct timespec *deadline;
	int error;

	/*
	 * If there's nothing to wait for, and nobody will ever wake
	 * us, then don't set anything up to wait -- just stop here.
	 */
	if (val3 == 0)
		return EINVAL;

	/* Optimistically test before anything else. */
	if (!futex_test(uaddr, val))
		return EAGAIN;

	/* Determine a deadline on the specified clock. */
	if (timeout == NULL) {
		/* NULL deadline == wait forever. */
		deadline = timeout;
	} else if ((clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
		/* Sanity-check the deadline. */
		if (timeout->tv_sec < 0 ||
		    timeout->tv_nsec < 0 ||
		    timeout->tv_nsec >= 1000000000L) {
			return EINVAL;
		}
		deadline = timeout;
	} else {
		/* Relative timeout: convert to an absolute deadline. */
		struct timespec interval = *timeout;

		error = itimespecfix(&interval);
		if (error)
			return error;
		error = clock_gettime1(clkid, &ts);
		if (error)
			return error;
		timespecadd(&ts, &interval, &ts);
		deadline = &ts;
	}

	/* Get the futex, creating it if necessary. */
	error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
	if (error)
		return error;
	KASSERT(f);

	/*
	 * Under the op lock, check the value again: if it has
	 * already changed, EAGAIN; otherwise enqueue the waiter.
	 * Since FUTEX_WAKE will use the same lock and be done after
	 * modifying the value, the order in which we check and enqueue
	 * is immaterial.
	 */
	futex_op_lock(f);
	if (!futex_test(uaddr, val)) {
		futex_op_unlock(f);
		error = EAGAIN;
		goto out;
	}

	/*
	 * Now wait.  futex_wait() will drop our op lock once we
	 * are entered into the sleep queue, thus ensuring atomicity
	 * of wakes with respect to waits.
	 */
	error = futex_wait(f, FUTEX_WRITERQ, deadline, clkid, val3);

	/*
	 * We cannot drop our reference to the futex here, because
	 * we might be enqueued on a different one when we are awakened.
	 * The references will be managed on our behalf in the requeue,
	 * wake, and error cases.
	 */
	f = NULL;

	if (__predict_true(error == 0)) {
		/* Return 0 on success, error on failure. */
		*retval = 0;
	}

out:	if (f != NULL)
		futex_rele(f);
	return error;
}
1445:
1446: /*
1447: * futex_func_wake(uaddr, val, val3, retval)
1448: *
1449: * Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
1450: */
1451: static int
1452: futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval)
1453: {
1454: struct futex *f;
1455: unsigned int nwoken = 0;
1456: int error = 0;
1457:
1458: /* Reject negative number of wakeups. */
1459: if (val < 0) {
1460: error = EINVAL;
1461: goto out;
1462: }
1463:
1464: /* Look up the futex, if any. */
1.12.4.1 thorpej 1465: error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1.1 thorpej 1466: if (error)
1467: goto out;
1468:
1469: /* If there's no futex, there are no waiters to wake. */
1470: if (f == NULL)
1471: goto out;
1472:
1473: /*
1.12.4.1 thorpej 1474: * Under f's op lock, wake the waiters and remember the
1.1 thorpej 1475: * number woken.
1476: */
1.12.4.1 thorpej 1477: futex_op_lock(f);
1478: nwoken = futex_wake(f, FUTEX_WRITERQ, val,
1.12.4.5! thorpej 1479: NULL, FUTEX_WRITERQ, 0, val3, NULL);
1.12.4.1 thorpej 1480: futex_op_unlock(f);
1.1 thorpej 1481:
1482: /* Release the futex. */
1.5 riastrad 1483: futex_rele(f);
1.1 thorpej 1484:
1485: out:
1486: /* Return the number of waiters woken. */
1487: *retval = nwoken;
1488:
1489: /* Success! */
1490: return error;
1491: }
1492:
1493: /*
1494: * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval)
1495: *
1496: * Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE).
1497: */
1498: static int
1499: futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2,
1500: int val2, int val3, register_t *retval)
1501: {
1502: struct futex *f = NULL, *f2 = NULL;
1503: unsigned nwoken = 0; /* default to zero woken on early return */
1.12.4.5! thorpej 1504: unsigned nrequeued = 0;
1.1 thorpej 1505: int error;
1506:
1507: /* Reject negative number of wakeups or requeues. */
1508: if (val < 0 || val2 < 0) {
1509: error = EINVAL;
1510: goto out;
1511: }
1512:
1513: /* Look up the source futex, if any. */
1.12.4.1 thorpej 1514: error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1.1 thorpej 1515: if (error)
1516: goto out;
1517:
1518: /* If there is none, nothing to do. */
1519: if (f == NULL)
1520: goto out;
1521:
1522: /*
1523: * We may need to create the destination futex because it's
1524: * entirely possible it does not currently have any waiters.
1525: */
1.12.4.1 thorpej 1526: error = futex_lookup_create(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
1.1 thorpej 1527: if (error)
1528: goto out;
1529:
1530: /*
1.12.4.1 thorpej 1531: * Under the futexes' op locks, check the value; if
1.1 thorpej 1532: * unchanged from val3, wake the waiters.
1533: */
1.12.4.1 thorpej 1534: futex_op_lock2(f, f2);
1.1 thorpej 1535: if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
1536: error = EAGAIN;
1537: } else {
1538: error = 0;
1.12.4.1 thorpej 1539: nwoken = futex_wake(f, FUTEX_WRITERQ, val,
1540: f2, FUTEX_WRITERQ, val2,
1.12.4.5! thorpej 1541: FUTEX_BITSET_MATCH_ANY,
! 1542: &nrequeued);
1.1 thorpej 1543: }
1.12.4.1 thorpej 1544: futex_op_unlock2(f, f2);
1.1 thorpej 1545:
1546: out:
1.12.4.5! thorpej 1547: /*
! 1548: * For FUTUEX_REQUEUE, return the numner of waiters woken.
! 1549: *
! 1550: * For FUTEX_CMP_REQUEUE, return the number of waiters woken
! 1551: * **and** requeued.
! 1552: */
! 1553: *retval = nwoken + (op == FUTEX_CMP_REQUEUE) ? nrequeued : 0;
1.1 thorpej 1554:
1555: /* Release the futexes if we got them. */
1556: if (f2)
1.5 riastrad 1557: futex_rele(f2);
1.1 thorpej 1558: if (f)
1.5 riastrad 1559: futex_rele(f);
1.1 thorpej 1560: return error;
1561: }
1562:
1563: /*
1564: * futex_validate_op_cmp(val3)
1565: *
1566: * Validate an op/cmp argument for FUTEX_WAKE_OP.
1567: */
1568: static int
1569: futex_validate_op_cmp(int val3)
1570: {
1571: int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
1572: int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
1573:
1574: if (op & FUTEX_OP_OPARG_SHIFT) {
1575: int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
1576: if (oparg < 0)
1577: return EINVAL;
1578: if (oparg >= 32)
1579: return EINVAL;
1580: op &= ~FUTEX_OP_OPARG_SHIFT;
1581: }
1582:
1583: switch (op) {
1584: case FUTEX_OP_SET:
1585: case FUTEX_OP_ADD:
1586: case FUTEX_OP_OR:
1587: case FUTEX_OP_ANDN:
1588: case FUTEX_OP_XOR:
1589: break;
1590: default:
1591: return EINVAL;
1592: }
1593:
1594: switch (cmp) {
1595: case FUTEX_OP_CMP_EQ:
1596: case FUTEX_OP_CMP_NE:
1597: case FUTEX_OP_CMP_LT:
1598: case FUTEX_OP_CMP_LE:
1599: case FUTEX_OP_CMP_GT:
1600: case FUTEX_OP_CMP_GE:
1601: break;
1602: default:
1603: return EINVAL;
1604: }
1605:
1606: return 0;
1607: }
1608:
1609: /*
1610: * futex_compute_op(oldval, val3)
1611: *
1612: * Apply a FUTEX_WAIT_OP operation to oldval.
1613: */
1614: static int
1615: futex_compute_op(int oldval, int val3)
1616: {
1617: int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
1618: int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
1619:
1620: if (op & FUTEX_OP_OPARG_SHIFT) {
1621: KASSERT(oparg >= 0);
1622: KASSERT(oparg < 32);
1623: oparg = 1u << oparg;
1624: op &= ~FUTEX_OP_OPARG_SHIFT;
1625: }
1626:
1627: switch (op) {
1628: case FUTEX_OP_SET:
1629: return oparg;
1630:
1631: case FUTEX_OP_ADD:
1632: /*
1633: * Avoid signed arithmetic overflow by doing
1634: * arithmetic unsigned and converting back to signed
1635: * at the end.
1636: */
1637: return (int)((unsigned)oldval + (unsigned)oparg);
1638:
1639: case FUTEX_OP_OR:
1640: return oldval | oparg;
1641:
1642: case FUTEX_OP_ANDN:
1643: return oldval & ~oparg;
1644:
1645: case FUTEX_OP_XOR:
1646: return oldval ^ oparg;
1647:
1648: default:
1649: panic("invalid futex op");
1650: }
1651: }
1652:
1653: /*
1654: * futex_compute_cmp(oldval, val3)
1655: *
1656: * Apply a FUTEX_WAIT_OP comparison to oldval.
1657: */
1658: static bool
1659: futex_compute_cmp(int oldval, int val3)
1660: {
1661: int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
1662: int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);
1663:
1664: switch (cmp) {
1665: case FUTEX_OP_CMP_EQ:
1666: return (oldval == cmparg);
1667:
1668: case FUTEX_OP_CMP_NE:
1669: return (oldval != cmparg);
1670:
1671: case FUTEX_OP_CMP_LT:
1672: return (oldval < cmparg);
1673:
1674: case FUTEX_OP_CMP_LE:
1675: return (oldval <= cmparg);
1676:
1677: case FUTEX_OP_CMP_GT:
1678: return (oldval > cmparg);
1679:
1680: case FUTEX_OP_CMP_GE:
1681: return (oldval >= cmparg);
1682:
1683: default:
1684: panic("invalid futex cmp operation");
1685: }
1686: }
1687:
/*
 * futex_func_wake_op(shared, uaddr, val, uaddr2, val2, val3, retval)
 *
 *	Implement futex(FUTEX_WAKE_OP): atomically update *uaddr2
 *	per the operation encoded in val3, wake up to val waiters on
 *	uaddr, and, if the old value of *uaddr2 satisfies the
 *	comparison encoded in val3, also wake up to val2 waiters on
 *	uaddr2.  Store the total number woken in *retval.
 */
static int
futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
    int val3, register_t *retval)
{
	struct futex *f = NULL, *f2 = NULL;
	int oldval, newval, actual;
	unsigned nwoken = 0;
	int error;

	/* Reject negative number of wakeups. */
	if (val < 0 || val2 < 0) {
		error = EINVAL;
		goto out;
	}

	/* Reject invalid operations before we start doing things. */
	if ((error = futex_validate_op_cmp(val3)) != 0)
		goto out;

	/* Look up the first futex, if any. */
	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
	if (error)
		goto out;

	/* Look up the second futex, if any. */
	error = futex_lookup(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
	if (error)
		goto out;

	/*
	 * Under the op locks:
	 *
	 * 1. Read/modify/write: *uaddr2 op= oparg.
	 * 2. Unconditionally wake uaddr.
	 * 3. Conditionally wake uaddr2, if it previously matched val3.
	 */
	futex_op_lock2(f, f2);
	/*
	 * Compare-and-swap loop: retry until the word is updated
	 * without interference from a concurrent user-space store.
	 */
	do {
		error = futex_load(uaddr2, &oldval);
		if (error)
			goto out_unlock;
		newval = futex_compute_op(oldval, val3);
		error = ucas_int(uaddr2, oldval, newval, &actual);
		if (error)
			goto out_unlock;
	} while (actual != oldval);
	/* f/f2 may be NULL (no kernel state): no waiters to wake there. */
	nwoken = (f ? futex_wake(f, FUTEX_WRITERQ, val,
		      NULL, FUTEX_WRITERQ, 0,
		      FUTEX_BITSET_MATCH_ANY, NULL) : 0);
	if (f2 && futex_compute_cmp(oldval, val3))
		nwoken += futex_wake(f2, FUTEX_WRITERQ, val2,
		    NULL, FUTEX_WRITERQ, 0,
		    FUTEX_BITSET_MATCH_ANY, NULL);

	/* Success! */
	error = 0;
out_unlock:
	futex_op_unlock2(f, f2);

out:
	/* Return the number of waiters woken. */
	*retval = nwoken;

	/* Release the futexes, if we got them. */
	if (f2)
		futex_rele(f2);
	if (f)
		futex_rele(f);
	return error;
}
1763:
1764: /*
1765: * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
1766: *
1767: * Implement the futex system call with all the parameters
1768: * parsed out.
1769: */
1770: int
1.11 riastrad 1771: do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
1772: int *uaddr2, int val2, int val3, register_t *retval)
1.1 thorpej 1773: {
1774: const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
1775: const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
1776: : CLOCK_MONOTONIC;
1.12.4.1 thorpej 1777: int error;
1.1 thorpej 1778:
1779: op &= FUTEX_CMD_MASK;
1780:
1781: switch (op) {
1782: case FUTEX_WAIT:
1.12.4.1 thorpej 1783: SDT_PROBE6(futex, func, wait, entry,
1784: uaddr, val, FUTEX_BITSET_MATCH_ANY, timeout,
1785: TIMER_RELTIME, op & ~FUTEX_CMD_MASK);
1786: error = futex_func_wait(shared, uaddr, val,
1.1 thorpej 1787: FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
1788: retval);
1.12.4.1 thorpej 1789: SDT_PROBE1(futex, func, wait, exit, error);
1790: break;
1791:
1792: case FUTEX_WAIT_BITSET:
1793: SDT_PROBE6(futex, func, wait, entry,
1794: uaddr, val, val3, timeout,
1795: TIMER_ABSTIME, op & ~FUTEX_CMD_MASK);
1796: error = futex_func_wait(shared, uaddr, val, val3, timeout,
1797: clkid, TIMER_ABSTIME, retval);
1798: SDT_PROBE1(futex, func, wait, exit, error);
1799: break;
1.1 thorpej 1800:
1801: case FUTEX_WAKE:
1.12.4.1 thorpej 1802: SDT_PROBE4(futex, func, wake, entry,
1803: uaddr, val, FUTEX_BITSET_MATCH_ANY, op & ~FUTEX_CMD_MASK);
1804: error = futex_func_wake(shared, uaddr, val,
1805: FUTEX_BITSET_MATCH_ANY, retval);
1806: SDT_PROBE2(futex, func, wake, exit, error, *retval);
1807: break;
1808:
1.1 thorpej 1809: case FUTEX_WAKE_BITSET:
1.12.4.1 thorpej 1810: SDT_PROBE4(futex, func, wake, entry,
1811: uaddr, val, val3, op & ~FUTEX_CMD_MASK);
1812: error = futex_func_wake(shared, uaddr, val, val3, retval);
1813: SDT_PROBE2(futex, func, wake, exit, error, *retval);
1814: break;
1.1 thorpej 1815:
1816: case FUTEX_REQUEUE:
1.12.4.1 thorpej 1817: SDT_PROBE5(futex, func, requeue, entry,
1818: uaddr, val, uaddr2, val2, op & ~FUTEX_CMD_MASK);
1819: error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
1820: val2, 0, retval);
1821: SDT_PROBE2(futex, func, requeue, exit, error, *retval);
1822: break;
1823:
1.1 thorpej 1824: case FUTEX_CMP_REQUEUE:
1.12.4.1 thorpej 1825: SDT_PROBE6(futex, func, cmp_requeue, entry,
1826: uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
1827: error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
1.1 thorpej 1828: val2, val3, retval);
1.12.4.1 thorpej 1829: SDT_PROBE2(futex, func, cmp_requeue, exit, error, *retval);
1830: break;
1.1 thorpej 1831:
1832: case FUTEX_WAKE_OP:
1.12.4.1 thorpej 1833: SDT_PROBE6(futex, func, wake_op, entry,
1834: uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
1835: error = futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
1.1 thorpej 1836: val3, retval);
1.12.4.1 thorpej 1837: SDT_PROBE2(futex, func, wake_op, exit, error, *retval);
1838: break;
1.1 thorpej 1839:
1840: case FUTEX_FD:
1841: default:
1.12.4.1 thorpej 1842: error = ENOSYS;
1843: break;
1.1 thorpej 1844: }
1.12.4.1 thorpej 1845:
1846: return error;
1.1 thorpej 1847: }
1848:
1849: /*
1850: * sys___futex(l, uap, retval)
1851: *
1852: * __futex(2) system call: generic futex operations.
1853: */
1854: int
1855: sys___futex(struct lwp *l, const struct sys___futex_args *uap,
1856: register_t *retval)
1857: {
1858: /* {
1859: syscallarg(int *) uaddr;
1860: syscallarg(int) op;
1861: syscallarg(int) val;
1862: syscallarg(const struct timespec *) timeout;
1863: syscallarg(int *) uaddr2;
1864: syscallarg(int) val2;
1865: syscallarg(int) val3;
1866: } */
1867: struct timespec ts, *tsp;
1868: int error;
1869:
1870: /*
1871: * Copy in the timeout argument, if specified.
1872: */
1873: if (SCARG(uap, timeout)) {
1874: error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
1875: if (error)
1876: return error;
1877: tsp = &ts;
1878: } else {
1879: tsp = NULL;
1880: }
1881:
1882: return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val),
1883: tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3),
1884: retval);
1885: }
1886:
1887: /*
1888: * sys___futex_set_robust_list(l, uap, retval)
1889: *
1890: * __futex_set_robust_list(2) system call for robust futexes.
1891: */
1892: int
1893: sys___futex_set_robust_list(struct lwp *l,
1894: const struct sys___futex_set_robust_list_args *uap, register_t *retval)
1895: {
1896: /* {
1897: syscallarg(void *) head;
1898: syscallarg(size_t) len;
1899: } */
1900: void *head = SCARG(uap, head);
1901:
1902: if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
1903: return EINVAL;
1904: if ((uintptr_t)head % sizeof(u_long))
1905: return EINVAL;
1906:
1907: l->l_robust_head = (uintptr_t)head;
1908:
1909: return 0;
1910: }
1911:
1912: /*
1913: * sys___futex_get_robust_list(l, uap, retval)
1914: *
1915: * __futex_get_robust_list(2) system call for robust futexes.
1916: */
1917: int
1918: sys___futex_get_robust_list(struct lwp *l,
1919: const struct sys___futex_get_robust_list_args *uap, register_t *retval)
1920: {
1921: /* {
1922: syscallarg(lwpid_t) lwpid;
1923: syscallarg(void **) headp;
1924: syscallarg(size_t *) lenp;
1925: } */
1926: void *head;
1927: const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
1928: int error;
1929:
1930: error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
1931: if (error)
1932: return error;
1933:
1934: /* Copy out the head pointer and the head structure length. */
1935: error = copyout(&head, SCARG(uap, headp), sizeof(head));
1936: if (__predict_true(error == 0)) {
1937: error = copyout(&len, SCARG(uap, lenp), sizeof(len));
1938: }
1939:
1940: return error;
1941: }
1942:
/*
 * release_futex(uptr, tid, is_pi, is_pending)
 *
 *	Try to release the robust futex at userland address uptr in the
 *	current process on lwp exit.  tid is the departing thread's ID as
 *	stored in the futex word; is_pi marks a priority-inheritance list
 *	entry, and is_pending marks the robust list head's "pending" slot.
 *	If anything goes wrong, silently fail.  It is the userland
 *	program's obligation to arrange correct behaviour.
 */
static void
release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
    bool const is_pending)
{
	int *uaddr;
	struct futex *f;
	int oldval, newval, actual;
	int error;
	bool shared;

	/* If it's misaligned, tough. */
	if (__predict_false(uptr & 3))
		return;
	uaddr = (int *)uptr;

	/* Fetch the current futex word; if it's unmapped, give up. */
	error = futex_load(uaddr, &oldval);
	if (__predict_false(error))
		return;

	/*
	 * There are two race conditions we need to handle here:
	 *
	 * 1. User space cleared the futex word but died before
	 *    being able to issue the wakeup.  No wakeups will
	 *    ever be issued, oops!
	 *
	 * 2. Awakened waiter died before being able to acquire
	 *    the futex in user space.  Any other waiters are
	 *    now stuck, oops!
	 *
	 * In both of these cases, the futex word will be 0 (because
	 * it's updated before the wake is issued).  The best we can
	 * do is detect this situation if it's the pending futex and
	 * issue a wake without modifying the futex word.
	 *
	 * XXX eventual PI handling?
	 */
	if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
		register_t retval;
		/*
		 * The futex class is unknown here, so try the shared
		 * bucket first and fall back to the private one if that
		 * woke nobody (or failed).
		 */
		error = futex_func_wake(/*shared*/true, uaddr, 1,
		    FUTEX_BITSET_MATCH_ANY, &retval);
		if (error != 0 || retval == 0)
			(void) futex_func_wake(/*shared*/false, uaddr, 1,
			    FUTEX_BITSET_MATCH_ANY, &retval);
		return;
	}

	/* Optimistically test whether we need to do anything at all. */
	if ((oldval & FUTEX_TID_MASK) != tid)
		return;

	/*
	 * We need to handle the case where this thread owned the futex,
	 * but it was uncontended.  In this case, there won't be any
	 * kernel state to look up.  All we can do is mark the futex
	 * as a zombie to be mopped up the next time another thread
	 * attempts to acquire it.
	 *
	 * N.B. It's important to ensure to set FUTEX_OWNER_DIED in
	 * this loop, even if waiters appear while we are doing
	 * so.  This is because FUTEX_WAITERS is set by user space
	 * before calling __futex() to wait, and the futex needs
	 * to be marked as a zombie when the new waiter gets into
	 * the kernel.
	 */
	if ((oldval & FUTEX_WAITERS) == 0) {
		/* CAS loop: retry until the owner-died bit sticks. */
		do {
			error = futex_load(uaddr, &oldval);
			if (error)
				return;
			if ((oldval & FUTEX_TID_MASK) != tid)
				return;
			newval = oldval | FUTEX_OWNER_DIED;
			error = ucas_int(uaddr, oldval, newval, &actual);
			if (error)
				return;
		} while (actual != oldval);

		/*
		 * If there is still no indication of waiters, then there is
		 * no more work for us to do.
		 */
		if ((oldval & FUTEX_WAITERS) == 0)
			return;
	}

	/*
	 * Look for a futex.  Try shared first, then private.  If we
	 * can't find one, tough.
	 *
	 * Note: the assumption here is that anyone placing a futex on
	 * the robust list is adhering to the rules, regardless of the
	 * futex class.
	 */
	for (f = NULL, shared = true; f == NULL; shared = false) {
		error = futex_lookup(uaddr, shared, FUTEX_CLASS_ANY, &f);
		if (error)
			return;
		if (f == NULL && shared == false)
			return;
	}

	/* Work under the futex op lock. */
	futex_op_lock(f);

	/*
	 * Fetch the word: if the tid doesn't match ours, skip;
	 * otherwise, set the owner-died bit, atomically.
	 */
	do {
		error = futex_load(uaddr, &oldval);
		if (error)
			goto out;
		if ((oldval & FUTEX_TID_MASK) != tid)
			goto out;
		newval = oldval | FUTEX_OWNER_DIED;
		error = ucas_int(uaddr, oldval, newval, &actual);
		if (error)
			goto out;
	} while (actual != oldval);

	/*
	 * If there may be waiters, try to wake one.  If anything goes
	 * wrong, tough.
	 *
	 * XXX eventual PI handling?
	 */
	if (oldval & FUTEX_WAITERS) {
		(void)futex_wake(f, FUTEX_WRITERQ, 1,
		    NULL, FUTEX_WRITERQ, 0,
		    FUTEX_BITSET_MATCH_ANY, NULL);
	}

	/* Unlock the queue and release the futex. */
out:	futex_op_unlock(f);
	futex_rele(f);
}
2087:
2088: /*
2089: * futex_robust_head_lookup(l, lwpid)
2090: *
2091: * Helper function to look up a robust head by LWP ID.
2092: */
2093: int
2094: futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
2095: {
2096: struct proc *p = l->l_proc;
2097:
2098: /* Find the other lwp, if requested; otherwise use our robust head. */
2099: if (lwpid) {
2100: mutex_enter(p->p_lock);
2101: l = lwp_find(p, lwpid);
2102: if (l == NULL) {
2103: mutex_exit(p->p_lock);
2104: return ESRCH;
2105: }
2106: *headp = (void *)l->l_robust_head;
2107: mutex_exit(p->p_lock);
2108: } else {
2109: *headp = (void *)l->l_robust_head;
2110: }
2111: return 0;
2112: }
2113:
/*
 * futex_fetch_robust_head(uaddr)
 *
 *	Helper routine to fetch the futex robust list head that
 *	handles 32-bit binaries running on 64-bit kernels.  Copies
 *	the head from user address uaddr into rhead, widening each
 *	word as needed; returns 0 or a copyin(9) error.
 */
static int
futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
{
#ifdef _LP64
	if (curproc->p_flag & PK_32) {
		/* 32-bit process on a 64-bit kernel: widen each word. */
		uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
		int error;

		error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
		if (__predict_true(error == 0)) {
			for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
				if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
					/*
					 * Make sure the offset is sign-
					 * extended (it may be negative).
					 */
					rhead[i] = (int32_t)rhead32[i];
				} else {
					rhead[i] = rhead32[i];
				}
			}
		}
		return error;
	}
#endif /* _LP64 */

	/* Native word size: copy the head in directly. */
	return copyin((void *)uaddr, rhead,
	    sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
}
2149:
2150: /*
2151: * futex_decode_robust_word(word)
2152: *
2153: * Decode a robust futex list word into the entry and entry
2154: * properties.
2155: */
2156: static inline void
2157: futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
2158: bool * const is_pi)
2159: {
2160: *is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
2161: *entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
2162: }
2163:
2164: /*
2165: * futex_fetch_robust_entry(uaddr)
2166: *
2167: * Helper routine to fetch and decode a robust futex entry
2168: * that handles 32-bit binaries running on 64-bit kernels.
2169: */
2170: static int
2171: futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
2172: bool * const is_pi)
2173: {
2174: uintptr_t val = 0;
2175: int error = 0;
2176:
2177: #ifdef _LP64
2178: if (curproc->p_flag & PK_32) {
2179: uint32_t val32;
2180:
2181: error = ufetch_32((uint32_t *)uaddr, &val32);
2182: if (__predict_true(error == 0))
2183: val = val32;
2184: } else
2185: #endif /* _LP64 */
2186: error = ufetch_long((u_long *)uaddr, (u_long *)&val);
2187: if (__predict_false(error))
2188: return error;
2189:
2190: futex_decode_robust_word(val, valp, is_pi);
2191: return 0;
2192: }
2193:
/*
 * futex_release_all_lwp(l, tid)
 *
 *	Release all l's robust futexes.  If anything looks funny in
 *	the process, give up -- it's userland's responsibility to dot
 *	the i's and cross the t's.  tid is the thread ID stored in
 *	the futex words by user space.
 */
void
futex_release_all_lwp(struct lwp * const l, lwpid_t const tid)
{
	u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
	int limit = 1000000;	/* cap on list entries we'll walk */
	int error;

	/* If there's no robust list there's nothing to do. */
	if (l->l_robust_head == 0)
		return;

	/* Read the final snapshot of the robust list head. */
	error = futex_fetch_robust_head(l->l_robust_head, rhead);
	if (error) {
		printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
		    " unmapped robust futex list head\n",
		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
		    (uintmax_t)l->l_lid, (uintmax_t)tid);
		return;
	}

	/* Offset from each list entry to its futex word. */
	const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];

	uintptr_t next, pending;
	bool is_pi, pending_is_pi;

	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
	    &next, &is_pi);
	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
	    &pending, &pending_is_pi);

	/*
	 * Walk down the list of locked futexes and release them, up
	 * to one million of them before we give up.  The walk ends
	 * when an entry points back at the head.
	 */

	while (next != l->l_robust_head && limit-- > 0) {
		/* pending handled below. */
		if (next != pending)
			release_futex(next + offset, tid, is_pi, false);
		/* Advance to the next entry; stop if it's unmapped. */
		error = futex_fetch_robust_entry(next, &next, &is_pi);
		if (error)
			break;
		preempt_point();
	}
	if (limit <= 0) {
		printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
		    " exhausted robust futex limit\n",
		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
		    (uintmax_t)l->l_lid, (uintmax_t)tid);
	}

	/* If there's a pending futex, it may need to be released too. */
	if (pending != 0) {
		release_futex(pending + offset, tid, pending_is_pi, true);
	}
}
CVSweb <webmaster@jp.NetBSD.org>