Annotation of src/sys/kern/sys_futex.c, Revision 1.14
1.14 ! andvar 1: /* $NetBSD: sys_futex.c,v 1.13 2021/09/28 15:05:42 thorpej Exp $ */
1.1 thorpej 2:
3: /*-
4: * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
5: * All rights reserved.
6: *
7: * This code is derived from software contributed to The NetBSD Foundation
8: * by Taylor R. Campbell and Jason R. Thorpe.
9: *
10: * Redistribution and use in source and binary forms, with or without
11: * modification, are permitted provided that the following conditions
12: * are met:
13: * 1. Redistributions of source code must retain the above copyright
14: * notice, this list of conditions and the following disclaimer.
15: * 2. Redistributions in binary form must reproduce the above copyright
16: * notice, this list of conditions and the following disclaimer in the
17: * documentation and/or other materials provided with the distribution.
18: *
19: * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20: * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21: * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23: * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24: * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25: * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26: * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27: * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28: * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29: * POSSIBILITY OF SUCH DAMAGE.
30: */
31:
32: #include <sys/cdefs.h>
1.14 ! andvar 33: __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.13 2021/09/28 15:05:42 thorpej Exp $");
1.1 thorpej 34:
35: /*
36: * Futexes
37: *
38: * The futex system call coordinates notifying threads waiting for
39: * changes on a 32-bit word of memory. The word can be managed by
40: * CPU atomic operations in userland, without system calls, as long
41: * as there is no contention.
42: *
43: * The simplest use case demonstrating the utility is:
44: *
45: * // 32-bit word of memory shared among threads or
46: * // processes in userland. lock & 1 means owned;
47: * // lock & 2 means there are waiters waiting.
48: * volatile int lock = 0;
49: *
50: * int v;
51: *
52: * // Acquire a lock.
53: * do {
54: * v = lock;
55: * if (v & 1) {
56: * // Lock is held. Set a bit to say that
57: * // there are waiters, and wait for lock
58: * // to change to anything other than v;
59: * // then retry.
60: * if (atomic_cas_uint(&lock, v, v | 2) != v)
61: * continue;
62: * futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
63: * continue;
64: * }
   65:  *	} while (atomic_cas_uint(&lock, v, v | 1) != v);
66: * membar_enter();
67: *
68: * ...
69: *
70: * // Release the lock. Optimistically assume there are
71: * // no waiters first until demonstrated otherwise.
72: * membar_exit();
73: * if (atomic_cas_uint(&lock, 1, 0) != 1) {
74: * // There may be waiters.
75: * v = atomic_swap_uint(&lock, 0);
76: * // If there are still waiters, wake one.
77: * if (v & 2)
78: * futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
79: * }
80: *
81: * The goal is to avoid the futex system call unless there is
82: * contention; then if there is contention, to guarantee no missed
83: * wakeups.
84: *
85: * For a simple implementation, futex(FUTEX_WAIT) could queue
86: * itself to be woken, double-check the lock word, and then sleep;
87: * spurious wakeups are generally a fact of life, so any
88: * FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
89: *
90: * If this were all there is to it, we could then increase
91: * parallelism by refining the approximation: partition the
92: * waiters into buckets by hashing the lock addresses to reduce
93: * the incidence of spurious wakeups. But this is not all.
94: *
95: * The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
96: * operation not only wakes n waiters on lock if lock == val, but
97: * also _transfers_ m additional waiters to lock2. Unless wakeups
98: * on lock2 also trigger wakeups on lock, we cannot move waiters
99: * to lock2 if they merely share the same hash as waiters on lock.
100: * Thus, we can't approximately distribute waiters into queues by
101: * a hash function; we must distinguish futex queues exactly by
102: * lock address.
103: *
104: * For now, we use a global red/black tree to index futexes. This
105: * should be replaced by a lockless radix tree with a thread to
106: * free entries no longer in use once all lookups on all CPUs have
107: * completed.
108: *
109: * Specifically, we maintain two maps:
110: *
111: * futex_tab.va[vmspace, va] for private futexes
112: * futex_tab.oa[uvm_voaddr] for shared futexes
113: *
114: * This implementation does not support priority inheritance.
115: */
116:
1.12 skrll 117: #include <sys/param.h>
1.1 thorpej 118: #include <sys/types.h>
119: #include <sys/atomic.h>
120: #include <sys/condvar.h>
121: #include <sys/futex.h>
122: #include <sys/mutex.h>
123: #include <sys/rbtree.h>
124: #include <sys/queue.h>
125:
126: #include <sys/syscall.h>
127: #include <sys/syscallargs.h>
128: #include <sys/syscallvar.h>
129:
130: #include <uvm/uvm_extern.h>
131:
132: /*
133: * Lock order:
134: *
135: * futex_tab.lock
136: * futex::fx_qlock ordered by kva of struct futex
137: * -> futex_wait::fw_lock only one at a time
138: * futex_wait::fw_lock only one at a time
139: * -> futex::fx_abortlock only one at a time
140: */
141:
142: /*
143: * union futex_key
144: *
145: * A futex is addressed either by a vmspace+va (private) or by
146: * a uvm_voaddr (shared).
147: */
union futex_key {
	struct {
		struct vmspace *vmspace;	/* address space containing va */
		vaddr_t va;			/* virtual address of futex word */
	} fk_private;
	struct uvm_voaddr fk_shared;	/* VM object address, for shared futexes */
};
155:
156: /*
157: * struct futex
158: *
159: * Kernel state for a futex located at a particular address in a
160: * particular virtual address space.
161: *
162: * N.B. fx_refcnt is an unsigned long because we need to be able
163: * to operate on it atomically on all systems while at the same
164: * time rendering practically impossible the chance of it reaching
165: * its max value. In practice, we're limited by the number of LWPs
166: * that can be present on the system at any given time, and the
167: * assumption is that limit will be good enough on a 32-bit platform.
168: * See futex_wake() for why overflow needs to be avoided.
169: */
struct futex {
	union futex_key	fx_key;		/* lookup key (private or shared form) */
	unsigned long	fx_refcnt;	/* atomic reference count; see N.B. above */
	bool		fx_shared;	/* true if keyed by uvm_voaddr (fk_shared) */
	bool		fx_on_tree;	/* linked into futex_tab? (futex_tab.lock) */
	struct rb_node	fx_node;	/* linkage in futex_tab.va or .oa */

	kmutex_t			fx_qlock;	/* protects fx_queue */
	TAILQ_HEAD(, futex_wait)	fx_queue;	/* waiters on this futex */

	kmutex_t			fx_abortlock;	/* protects fx_abortlist */
	LIST_HEAD(, futex_wait)		fx_abortlist;	/* waiters mid-abort */
	kcondvar_t			fx_abortcv;	/* signalled when abortlist drains */
};
184:
185: /*
186: * struct futex_wait
187: *
188: * State for a thread to wait on a futex. Threads wait on fw_cv
189: * for fw_bitset to be set to zero. The thread may transition to
190: * a different futex queue at any time under the futex's lock.
191: */
struct futex_wait {
	kmutex_t		fw_lock;	/* protects the fields below */
	kcondvar_t		fw_cv;		/* waited on in futex_wait() */
	struct futex		*fw_futex;	/* futex we are queued on, or NULL */
	TAILQ_ENTRY(futex_wait)	fw_entry;	/* queue lock */
	LIST_ENTRY(futex_wait)	fw_abort;	/* queue abortlock */
	int			fw_bitset;	/* wake bits; cleared to 0 on wakeup */
	bool			fw_aborting;	/* fw_lock */
};
201:
202: /*
203: * futex_tab
204: *
205: * Global trees of futexes by vmspace/va and VM object address.
206: *
207: * XXX This obviously doesn't scale in parallel. We could use a
208: * pserialize-safe data structure, but there may be a high cost to
209: * frequent deletion since we don't cache futexes after we're done
210: * with them. We could use hashed locks. But for now, just make
211: * sure userland can't DoS the serial performance, by using a
212: * balanced binary tree for lookup.
213: *
214: * XXX We could use a per-process tree for the table indexed by
215: * virtual address to reduce contention between processes.
216: */
static struct {
	kmutex_t	lock;	/* serializes all lookups and tree updates */
	struct rb_tree	va;	/* private futexes, keyed by vmspace+va */
	struct rb_tree	oa;	/* shared futexes, keyed by uvm_voaddr */
} futex_tab __cacheline_aligned;
222:
223: static int
224: compare_futex_key(void *cookie, const void *n, const void *k)
225: {
226: const struct futex *fa = n;
227: const union futex_key *fka = &fa->fx_key;
228: const union futex_key *fkb = k;
229:
230: if ((uintptr_t)fka->fk_private.vmspace <
231: (uintptr_t)fkb->fk_private.vmspace)
232: return -1;
233: if ((uintptr_t)fka->fk_private.vmspace >
234: (uintptr_t)fkb->fk_private.vmspace)
235: return +1;
236: if (fka->fk_private.va < fkb->fk_private.va)
237: return -1;
238: if (fka->fk_private.va > fkb->fk_private.va)
239: return -1;
240: return 0;
241: }
242:
243: static int
244: compare_futex(void *cookie, const void *na, const void *nb)
245: {
246: const struct futex *fa = na;
247: const struct futex *fb = nb;
248:
249: return compare_futex_key(cookie, fa, &fb->fx_key);
250: }
251:
/* rb_tree(9) ops for futex_tab.va, the private (vmspace+va) index. */
static const rb_tree_ops_t futex_rb_ops = {
	.rbto_compare_nodes = compare_futex,
	.rbto_compare_key = compare_futex_key,
	.rbto_node_offset = offsetof(struct futex, fx_node),
};
257:
258: static int
259: compare_futex_shared_key(void *cookie, const void *n, const void *k)
260: {
261: const struct futex *fa = n;
262: const union futex_key *fka = &fa->fx_key;
263: const union futex_key *fkb = k;
264:
265: return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
266: }
267:
268: static int
269: compare_futex_shared(void *cookie, const void *na, const void *nb)
270: {
271: const struct futex *fa = na;
272: const struct futex *fb = nb;
273:
274: return compare_futex_shared_key(cookie, fa, &fb->fx_key);
275: }
276:
/* rb_tree(9) ops for futex_tab.oa, the shared (uvm_voaddr) index. */
static const rb_tree_ops_t futex_shared_rb_ops = {
	.rbto_compare_nodes = compare_futex_shared,
	.rbto_compare_key = compare_futex_shared_key,
	.rbto_node_offset = offsetof(struct futex, fx_node),
};
282:
283: static void futex_wait_dequeue(struct futex_wait *, struct futex *);
284:
285: /*
286: * futex_load(uaddr, kaddr)
287: *
288: * Perform a single atomic load to read *uaddr, and return the
289: * result in *kaddr. Return 0 on success, EFAULT if uaddr is not
290: * mapped.
291: */
static inline int
futex_load(int *uaddr, int *kaddr)
{
	/* ufetch_int does one word-sized fetch from user memory. */
	return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
}
297:
298: /*
299: * futex_test(uaddr, expected)
300: *
301: * True if *uaddr == expected. False if *uaddr != expected, or if
302: * uaddr is not mapped.
303: */
304: static bool
305: futex_test(int *uaddr, int expected)
306: {
307: int val;
308: int error;
309:
310: error = futex_load(uaddr, &val);
311: if (error)
312: return false;
313: return val == expected;
314: }
315:
316: /*
317: * futex_sys_init()
318: *
319: * Initialize the futex subsystem.
320: */
void
futex_sys_init(void)
{

	/* One global lock guards both index trees; see futex_tab. */
	mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
	rb_tree_init(&futex_tab.va, &futex_rb_ops);
	rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
}
329:
330: /*
331: * futex_sys_fini()
332: *
333: * Finalize the futex subsystem.
334: */
void
futex_sys_fini(void)
{

	/* Every futex must have been destroyed before teardown. */
	KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
	KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
	mutex_destroy(&futex_tab.lock);
}
343:
344: /*
345: * futex_queue_init(f)
346: *
347: * Initialize the futex queue. Caller must call futex_queue_fini
348: * when done.
349: *
350: * Never sleeps.
351: */
static void
futex_queue_init(struct futex *f)
{

	/* IPL_NONE: these locks are only ever taken in thread context. */
	mutex_init(&f->fx_qlock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&f->fx_abortlock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&f->fx_abortcv, "fqabort");
	LIST_INIT(&f->fx_abortlist);
	TAILQ_INIT(&f->fx_queue);
}
362:
363: /*
364: * futex_queue_drain(f)
365: *
366: * Wait for any aborting waiters in f; then empty the queue of
367: * any stragglers and wake them. Caller must guarantee no new
368: * references to f.
369: *
370: * May sleep.
371: */
static void
futex_queue_drain(struct futex *f)
{
	struct futex_wait *fw, *fw_next;

	/* First wait for any in-flight futex_wait_abort to finish with f. */
	mutex_enter(&f->fx_abortlock);
	while (!LIST_EMPTY(&f->fx_abortlist))
		cv_wait(&f->fx_abortcv, &f->fx_abortlock);
	mutex_exit(&f->fx_abortlock);

	/* Then dequeue and wake every remaining waiter. */
	mutex_enter(&f->fx_qlock);
	TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
		mutex_enter(&fw->fw_lock);
		futex_wait_dequeue(fw, f);
		cv_broadcast(&fw->fw_cv);
		mutex_exit(&fw->fw_lock);
	}
	mutex_exit(&f->fx_qlock);
}
391:
392: /*
393: * futex_queue_fini(fq)
394: *
395: * Finalize the futex queue initialized by futex_queue_init. Queue
396: * must be empty. Caller must not use f again until a subsequent
397: * futex_queue_init.
398: */
static void
futex_queue_fini(struct futex *f)
{

	/* Queue must already be empty; see futex_queue_drain. */
	KASSERT(TAILQ_EMPTY(&f->fx_queue));
	KASSERT(LIST_EMPTY(&f->fx_abortlist));
	mutex_destroy(&f->fx_qlock);
	mutex_destroy(&f->fx_abortlock);
	cv_destroy(&f->fx_abortcv);
}
409:
410: /*
411: * futex_key_init(key, vm, va, shared)
412: *
413: * Initialize a futex key for lookup, etc.
414: */
415: static int
416: futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared)
417: {
418: int error = 0;
419:
420: if (__predict_false(shared)) {
421: if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
422: error = EFAULT;
423: } else {
424: fk->fk_private.vmspace = vm;
425: fk->fk_private.va = va;
426: }
427:
428: return error;
429: }
430:
431: /*
432: * futex_key_fini(key, shared)
433: *
434: * Release a futex key.
435: */
static void
futex_key_fini(union futex_key *fk, bool shared)
{
	/* Shared keys hold a uvm_voaddr reference that must be dropped. */
	if (__predict_false(shared))
		uvm_voaddr_release(&fk->fk_shared);
	/* Zero the key so stale contents cannot be misused. */
	memset(fk, 0, sizeof(*fk));
}
443:
444: /*
445: * futex_create(fk, shared)
446: *
447: * Create a futex. Initial reference count is 1, representing the
448: * caller. Returns NULL on failure. Always takes ownership of the
449: * key, either transferring it to the newly-created futex, or releasing
450: * the key if creation fails.
451: *
452: * Never sleeps for memory, but may sleep to acquire a lock.
453: */
static struct futex *
futex_create(union futex_key *fk, bool shared)
{
	struct futex *f;

	/* KM_NOSLEEP: this path promises never to sleep for memory. */
	f = kmem_alloc(sizeof(*f), KM_NOSLEEP);
	if (f == NULL) {
		/* Creation failed -- we still own the key, release it. */
		futex_key_fini(fk, shared);
		return NULL;
	}
	/* Take ownership of the key; the single reference is the caller's. */
	f->fx_key = *fk;
	f->fx_refcnt = 1;
	f->fx_shared = shared;
	f->fx_on_tree = false;
	futex_queue_init(f);

	return f;
}
472:
473: /*
474: * futex_destroy(f)
475: *
476: * Destroy a futex created with futex_create. Reference count
477: * must be zero.
478: *
479: * May sleep.
480: */
static void
futex_destroy(struct futex *f)
{

	ASSERT_SLEEPABLE();

	/* Caller guarantees no references and no tree linkage remain. */
	KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
	KASSERT(!f->fx_on_tree);

	/* Drain and destroy the private queue. */
	futex_queue_drain(f);
	futex_queue_fini(f);

	/* Release the lookup key (drops the uvm_voaddr ref if shared). */
	futex_key_fini(&f->fx_key, f->fx_shared);

	kmem_free(f, sizeof(*f));
}
498:
499: /*
500: * futex_hold(f)
501: *
502: * Attempt to acquire a reference to f. Return 0 on success,
503: * ENFILE on too many references.
504: *
505: * Never sleeps.
506: */
507: static int
508: futex_hold(struct futex *f)
509: {
510: unsigned long refcnt;
511:
512: do {
513: refcnt = atomic_load_relaxed(&f->fx_refcnt);
514: if (refcnt == ULONG_MAX)
515: return ENFILE;
516: } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt + 1) != refcnt);
517:
518: return 0;
519: }
520:
521: /*
522: * futex_rele(f)
523: *
524: * Release a reference to f acquired with futex_create or
525: * futex_hold.
526: *
527: * May sleep to free f.
528: */
static void
futex_rele(struct futex *f)
{
	unsigned long refcnt;

	ASSERT_SLEEPABLE();

	/*
	 * Fast path: drop a non-final reference with a bare CAS loop,
	 * without touching the global table lock.
	 */
	do {
		refcnt = atomic_load_relaxed(&f->fx_refcnt);
		if (refcnt == 1)
			goto trylast;
	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt);
	return;

trylast:
	/*
	 * Possibly the last reference.  Take the table lock so nobody
	 * can gain a new reference via lookup while we decide whether
	 * to unhook f from the tree and destroy it.
	 */
	mutex_enter(&futex_tab.lock);
	if (atomic_dec_ulong_nv(&f->fx_refcnt) == 0) {
		if (f->fx_on_tree) {
			if (__predict_false(f->fx_shared))
				rb_tree_remove_node(&futex_tab.oa, f);
			else
				rb_tree_remove_node(&futex_tab.va, f);
			f->fx_on_tree = false;
		}
	} else {
		/* References remain -- don't destroy it. */
		f = NULL;
	}
	mutex_exit(&futex_tab.lock);
	/* Destroy outside the table lock; futex_destroy may sleep. */
	if (f != NULL)
		futex_destroy(f);
}
561:
562: /*
563: * futex_rele_not_last(f)
564: *
565: * Release a reference to f acquired with futex_create or
566: * futex_hold.
567: *
568: * This version asserts that we are not dropping the last
569: * reference to f.
570: */
571: static void
572: futex_rele_not_last(struct futex *f)
573: {
574: unsigned long refcnt;
575:
576: do {
577: refcnt = atomic_load_relaxed(&f->fx_refcnt);
578: KASSERT(refcnt > 1);
579: } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt);
580: }
581:
582: /*
583: * futex_lookup_by_key(key, shared, &f)
584: *
585: * Try to find an existing futex va reference in the specified key
586: * On success, return 0, set f to found futex or to NULL if not found,
587: * and increment f's reference count if found.
588: *
589: * Return ENFILE if reference count too high.
590: *
591: * Internal lookup routine shared by futex_lookup() and
1.5 riastrad 592: * futex_lookup_create().
1.1 thorpej 593: */
594: static int
595: futex_lookup_by_key(union futex_key *fk, bool shared, struct futex **fp)
596: {
597: struct futex *f;
598: int error = 0;
599:
600: mutex_enter(&futex_tab.lock);
601: if (__predict_false(shared)) {
602: f = rb_tree_find_node(&futex_tab.oa, fk);
603: } else {
604: f = rb_tree_find_node(&futex_tab.va, fk);
605: }
606: if (f) {
607: error = futex_hold(f);
608: if (error)
609: f = NULL;
610: }
611: *fp = f;
612: mutex_exit(&futex_tab.lock);
613:
614: return error;
615: }
616:
617: /*
618: * futex_insert(f, fp)
619: *
620: * Try to insert the futex f into the tree by va. If there
621: * already is a futex for its va, acquire a reference to it, and
622: * store it in *fp; otherwise store f in *fp.
623: *
624: * Return 0 on success, ENFILE if there already is a futex but its
625: * reference count is too high.
626: */
static int
futex_insert(struct futex *f, struct futex **fp)
{
	struct futex *f0;
	int error;

	KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
	KASSERT(!f->fx_on_tree);

	mutex_enter(&futex_tab.lock);
	/* rb_tree_insert_node returns the already-present node, if any. */
	if (__predict_false(f->fx_shared))
		f0 = rb_tree_insert_node(&futex_tab.oa, f);
	else
		f0 = rb_tree_insert_node(&futex_tab.va, f);
	if (f0 == f) {
		/* We won: f is now on the tree. */
		f->fx_on_tree = true;
		error = 0;
	} else {
		/* Someone beat us to it; hand back their futex instead. */
		KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
		KASSERT(f0->fx_on_tree);
		error = futex_hold(f0);
		if (error)
			goto out;
	}
	*fp = f0;
out:	mutex_exit(&futex_tab.lock);

	return error;
}
656:
657: /*
658: * futex_lookup(uaddr, shared, &f)
659: *
660: * Find a futex at the userland pointer uaddr in the current
661: * process's VM space. On success, return the futex in f and
662: * increment its reference count.
663: *
1.5 riastrad 664: * Caller must call futex_rele when done.
1.1 thorpej 665: */
666: static int
667: futex_lookup(int *uaddr, bool shared, struct futex **fp)
668: {
669: union futex_key fk;
670: struct vmspace *vm = curproc->p_vmspace;
671: vaddr_t va = (vaddr_t)uaddr;
672: int error;
673:
674: /*
675: * Reject unaligned user pointers so we don't cross page
676: * boundaries and so atomics will work.
677: */
678: if ((va & 3) != 0)
679: return EINVAL;
680:
681: /* Look it up. */
682: error = futex_key_init(&fk, vm, va, shared);
683: if (error)
684: return error;
685:
686: error = futex_lookup_by_key(&fk, shared, fp);
687: futex_key_fini(&fk, shared);
688: if (error)
689: return error;
690:
691: KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
692: KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
693:
694: /*
695: * Success! (Caller must still check whether we found
696: * anything, but nothing went _wrong_ like trying to use
697: * unmapped memory.)
698: */
699: KASSERT(error == 0);
700:
701: return error;
702: }
703:
704: /*
1.5 riastrad 705: * futex_lookup_create(uaddr, shared, &f)
1.1 thorpej 706: *
707: * Find or create a futex at the userland pointer uaddr in the
708: * current process's VM space. On success, return the futex in f
709: * and increment its reference count.
710: *
1.5 riastrad 711: * Caller must call futex_rele when done.
1.1 thorpej 712: */
static int
futex_lookup_create(int *uaddr, bool shared, struct futex **fp)
{
	union futex_key fk;
	struct vmspace *vm = curproc->p_vmspace;
	struct futex *f = NULL;
	vaddr_t va = (vaddr_t)uaddr;
	int error;

	/*
	 * Reject unaligned user pointers so we don't cross page
	 * boundaries and so atomics will work.
	 */
	if ((va & 3) != 0)
		return EINVAL;

	error = futex_key_init(&fk, vm, va, shared);
	if (error)
		return error;

	/*
	 * Optimistically assume there already is one, and try to find
	 * it.
	 */
	error = futex_lookup_by_key(&fk, shared, fp);
	if (error || *fp != NULL) {
		/*
		 * We either found one, or there was an error.
		 * In either case, we are done with the key.
		 */
		futex_key_fini(&fk, shared);
		goto out;
	}

	/*
	 * Create a futex record.  This transfers ownership of the key
	 * in all cases (on failure, futex_create releases it).
	 */
	f = futex_create(&fk, shared);
	if (f == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Insert our new futex, or use existing if someone else beat
	 * us to it.
	 */
	error = futex_insert(f, fp);
	if (error)
		goto out;
	if (*fp == f)
		f = NULL;	/* don't release on exit */

	/* Success! */
	KASSERT(error == 0);

	/* If f is still set here, it lost the insert race: drop its ref. */
out:	if (f != NULL)
		futex_rele(f);
	KASSERT(error || *fp != NULL);
	KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
	return error;
}
776:
777: /*
778: * futex_wait_init(fw, bitset)
779: *
780: * Initialize a record for a thread to wait on a futex matching
781: * the specified bit set. Should be passed to futex_wait_enqueue
782: * before futex_wait, and should be passed to futex_wait_fini when
783: * done.
784: */
static void
futex_wait_init(struct futex_wait *fw, int bitset)
{

	/* An empty bitset could never match a wake and would sleep forever. */
	KASSERT(bitset);

	mutex_init(&fw->fw_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&fw->fw_cv, "futex");
	fw->fw_futex = NULL;
	fw->fw_bitset = bitset;
	fw->fw_aborting = false;
}
797:
798: /*
799: * futex_wait_fini(fw)
800: *
801: * Finalize a record for a futex waiter. Must not be on any
802: * futex's queue.
803: */
static void
futex_wait_fini(struct futex_wait *fw)
{

	/* Must already be off every futex queue. */
	KASSERT(fw->fw_futex == NULL);

	cv_destroy(&fw->fw_cv);
	mutex_destroy(&fw->fw_lock);
}
813:
814: /*
815: * futex_wait_enqueue(fw, f)
816: *
817: * Put fw on the futex queue. Must be done before futex_wait.
818: * Caller must hold fw's lock and f's lock, and fw must not be on
819: * any existing futex's waiter list.
820: */
static void
futex_wait_enqueue(struct futex_wait *fw, struct futex *f)
{

	KASSERT(mutex_owned(&f->fx_qlock));
	KASSERT(mutex_owned(&fw->fw_lock));
	KASSERT(fw->fw_futex == NULL);
	/* Aborting waiters must never be placed on a queue. */
	KASSERT(!fw->fw_aborting);

	fw->fw_futex = f;
	TAILQ_INSERT_TAIL(&f->fx_queue, fw, fw_entry);
}
833:
834: /*
835: * futex_wait_dequeue(fw, f)
836: *
837: * Remove fw from the futex queue. Precludes subsequent
838: * futex_wait until a futex_wait_enqueue. Caller must hold fw's
839: * lock and f's lock, and fw must be on f.
840: */
static void
futex_wait_dequeue(struct futex_wait *fw, struct futex *f)
{

	KASSERT(mutex_owned(&f->fx_qlock));
	KASSERT(mutex_owned(&fw->fw_lock));
	KASSERT(fw->fw_futex == f);

	TAILQ_REMOVE(&f->fx_queue, fw, fw_entry);
	/* Clearing fw_futex tells futex_wait() the wait is over. */
	fw->fw_futex = NULL;
}
852:
853: /*
854: * futex_wait_abort(fw)
855: *
856: * Caller is no longer waiting for fw. Remove it from any queue
1.4 riastrad 857: * if it was on one. Caller must hold fw->fw_lock.
1.1 thorpej 858: */
static void
futex_wait_abort(struct futex_wait *fw)
{
	struct futex *f;

	KASSERT(mutex_owned(&fw->fw_lock));

	/*
	 * Grab the futex queue.  It can't go away as long as we hold
	 * fw_lock.  However, we can't take the queue lock because
	 * that's a lock order reversal.
	 */
	f = fw->fw_futex;

	/* Put us on the abort list so that f won't go away. */
	mutex_enter(&f->fx_abortlock);
	LIST_INSERT_HEAD(&f->fx_abortlist, fw, fw_abort);
	mutex_exit(&f->fx_abortlock);

	/*
	 * Mark fw as aborting so it won't lose wakeups and won't be
	 * transferred to any other queue.
	 */
	fw->fw_aborting = true;

	/* f is now stable, so we can release fw_lock. */
	mutex_exit(&fw->fw_lock);

	/* Now we can remove fw under the queue lock. */
	mutex_enter(&f->fx_qlock);
	mutex_enter(&fw->fw_lock);
	futex_wait_dequeue(fw, f);
	mutex_exit(&fw->fw_lock);
	mutex_exit(&f->fx_qlock);

	/*
	 * Finally, remove us from the abort list and notify anyone
	 * waiting for the abort to complete if we were the last to go.
	 */
	mutex_enter(&f->fx_abortlock);
	LIST_REMOVE(fw, fw_abort);
	if (LIST_EMPTY(&f->fx_abortlist))
		cv_broadcast(&f->fx_abortcv);
	mutex_exit(&f->fx_abortlock);

	/*
	 * Release our reference to the futex now that we are not
	 * waiting for it.
	 */
	futex_rele(f);

	/*
	 * Reacquire the fw lock as caller expects.  Verify that we're
	 * aborting and no longer associated with a futex.
	 */
	mutex_enter(&fw->fw_lock);
	KASSERT(fw->fw_aborting);
	KASSERT(fw->fw_futex == NULL);
}
918:
919: /*
1.11 riastrad 920: * futex_wait(fw, deadline, clkid)
1.1 thorpej 921: *
1.11 riastrad 922: * fw must be a waiter on a futex's queue. Wait until deadline on
923: * the clock clkid, or forever if deadline is NULL, for a futex
924: * wakeup. Return 0 on explicit wakeup or destruction of futex,
925: * ETIMEDOUT on timeout, EINTR/ERESTART on signal. Either way, fw
926: * will no longer be on a futex queue on return.
1.1 thorpej 927: */
static int
futex_wait(struct futex_wait *fw, const struct timespec *deadline,
    clockid_t clkid)
{
	int error = 0;

	/* Test and wait under the wait lock. */
	mutex_enter(&fw->fw_lock);

	for (;;) {
		/*
		 * If we're done -- woken (fw_bitset cleared) or the
		 * futex gone (fw_futex cleared) -- report success.
		 */
		if (fw->fw_bitset == 0 || fw->fw_futex == NULL) {
			error = 0;
			break;
		}

		/* If anything went wrong in the last iteration, stop. */
		if (error)
			break;

		/* Not done yet.  Wait. */
		if (deadline) {
			struct timespec ts;

			/* Check our watch. */
			error = clock_gettime1(clkid, &ts);
			if (error)
				break;

			/* If we're past the deadline, ETIMEDOUT. */
			if (timespeccmp(deadline, &ts, <=)) {
				error = ETIMEDOUT;
				break;
			}

			/* Count how much time is left. */
			timespecsub(deadline, &ts, &ts);

			/* Wait for that much time, allowing signals. */
			error = cv_timedwait_sig(&fw->fw_cv, &fw->fw_lock,
			    tstohz(&ts));
		} else {
			/* Wait indefinitely, allowing signals. */
			error = cv_wait_sig(&fw->fw_cv, &fw->fw_lock);
		}
	}

	/*
	 * If we were woken up, the waker will have removed fw from the
	 * queue.  But if anything went wrong, we must remove fw from
	 * the queue ourselves.  While here, convert EWOULDBLOCK to
	 * ETIMEDOUT.
	 */
	if (error) {
		futex_wait_abort(fw);
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
	}

	mutex_exit(&fw->fw_lock);

	return error;
}
991:
992: /*
993: * futex_wake(f, nwake, f2, nrequeue, bitset)
994: *
995: * Wake up to nwake waiters on f matching bitset; then, if f2 is
996: * provided, move up to nrequeue remaining waiters on f matching
997: * bitset to f2. Return the number of waiters actually woken.
998: * Caller must hold the locks of f and f2, if provided.
999: */
static unsigned
futex_wake(struct futex *f, unsigned nwake, struct futex *f2,
    unsigned nrequeue, int bitset)
{
	struct futex_wait *fw, *fw_next;
	unsigned nwoken = 0;
	int hold_error __diagused;

	KASSERT(mutex_owned(&f->fx_qlock));
	KASSERT(f2 == NULL || mutex_owned(&f2->fx_qlock));

	/* Wake up to nwake waiters, and count the number woken. */
	TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
		if ((fw->fw_bitset & bitset) == 0)
			continue;
		if (nwake > 0) {
			mutex_enter(&fw->fw_lock);
			/* Skip waiters already tearing themselves down. */
			if (__predict_false(fw->fw_aborting)) {
				mutex_exit(&fw->fw_lock);
				continue;
			}
			futex_wait_dequeue(fw, f);
			fw->fw_bitset = 0;	/* tells futex_wait it's done */
			cv_broadcast(&fw->fw_cv);
			mutex_exit(&fw->fw_lock);
			nwake--;
			nwoken++;
			/*
			 * Drop the futex reference on behalf of the
			 * waiter.  We assert this is not the last
			 * reference on the futex (our caller should
			 * also have one).
			 */
			futex_rele_not_last(f);
		} else {
			break;
		}
	}

	if (f2) {
		/* Move up to nrequeue waiters from f's queue to f2's queue. */
		TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
			if ((fw->fw_bitset & bitset) == 0)
				continue;
			if (nrequeue > 0) {
				mutex_enter(&fw->fw_lock);
				/* Skip waiters already tearing themselves down. */
				if (__predict_false(fw->fw_aborting)) {
					mutex_exit(&fw->fw_lock);
					continue;
				}
				futex_wait_dequeue(fw, f);
				futex_wait_enqueue(fw, f2);
				mutex_exit(&fw->fw_lock);
				nrequeue--;
				/*
				 * Transfer the reference from f to f2.
				 * As above, we assert that we are not
				 * dropping the last reference to f here.
				 *
				 * XXX futex_hold() could theoretically
				 * XXX fail here.
				 */
				futex_rele_not_last(f);
				hold_error = futex_hold(f2);
				KASSERT(hold_error == 0);
			} else {
				break;
			}
		}
	} else {
		KASSERT(nrequeue == 0);
	}

	/* Return the number of waiters woken. */
	return nwoken;
}
1076:
1077: /*
1078: * futex_queue_lock(f)
1079: *
1080: * Acquire the queue lock of f. Pair with futex_queue_unlock. Do
1081: * not use if caller needs to acquire two locks; use
1082: * futex_queue_lock2 instead.
1083: */
static void
futex_queue_lock(struct futex *f)
{
	/* Single-lock case only; pairs use futex_queue_lock2. */
	mutex_enter(&f->fx_qlock);
}
1089:
1090: /*
1091: * futex_queue_unlock(f)
1092: *
1093: * Release the queue lock of f.
1094: */
static void
futex_queue_unlock(struct futex *f)
{
	/* Pairs with futex_queue_lock. */
	mutex_exit(&f->fx_qlock);
}
1100:
1101: /*
1102: * futex_queue_lock2(f, f2)
1103: *
1104: * Acquire the queue locks of both f and f2, which may be null, or
1105: * which may have the same underlying queue. If they are
1106: * distinct, an arbitrary total order is chosen on the locks.
1107: *
1108: * Callers should only ever acquire multiple queue locks
1109: * simultaneously using futex_queue_lock2.
1110: */
1111: static void
1112: futex_queue_lock2(struct futex *f, struct futex *f2)
1113: {
1114:
1115: /*
1116: * If both are null, do nothing; if one is null and the other
1117: * is not, lock the other and be done with it.
1118: */
1119: if (f == NULL && f2 == NULL) {
1120: return;
1121: } else if (f == NULL) {
1122: mutex_enter(&f2->fx_qlock);
1123: return;
1124: } else if (f2 == NULL) {
1125: mutex_enter(&f->fx_qlock);
1126: return;
1127: }
1128:
1129: /* If both futexes are the same, acquire only one. */
1130: if (f == f2) {
1131: mutex_enter(&f->fx_qlock);
1132: return;
1133: }
1134:
1135: /* Otherwise, use the ordering on the kva of the futex pointer. */
1136: if ((uintptr_t)f < (uintptr_t)f2) {
1137: mutex_enter(&f->fx_qlock);
1138: mutex_enter(&f2->fx_qlock);
1139: } else {
1140: mutex_enter(&f2->fx_qlock);
1141: mutex_enter(&f->fx_qlock);
1142: }
1143: }
1144:
1145: /*
1146: * futex_queue_unlock2(f, f2)
1147: *
1148: * Release the queue locks of both f and f2, which may be null, or
1149: * which may have the same underlying queue.
1150: */
1151: static void
1152: futex_queue_unlock2(struct futex *f, struct futex *f2)
1153: {
1154:
1155: /*
1156: * If both are null, do nothing; if one is null and the other
1157: * is not, unlock the other and be done with it.
1158: */
1159: if (f == NULL && f2 == NULL) {
1160: return;
1161: } else if (f == NULL) {
1162: mutex_exit(&f2->fx_qlock);
1163: return;
1164: } else if (f2 == NULL) {
1165: mutex_exit(&f->fx_qlock);
1166: return;
1167: }
1168:
1169: /* If both futexes are the same, release only one. */
1170: if (f == f2) {
1171: mutex_exit(&f->fx_qlock);
1172: return;
1173: }
1174:
1175: /* Otherwise, use the ordering on the kva of the futex pointer. */
1176: if ((uintptr_t)f < (uintptr_t)f2) {
1177: mutex_exit(&f2->fx_qlock);
1178: mutex_exit(&f->fx_qlock);
1179: } else {
1180: mutex_exit(&f->fx_qlock);
1181: mutex_exit(&f2->fx_qlock);
1182: }
1183: }
1184:
/*
 * futex_func_wait(shared, uaddr, val, val3, timeout, clkid, clkflags, retval)
 *
 *	Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET):
 *	if *uaddr still contains val, sleep until awakened by a
 *	matching wake (bitset val3), until the timeout expires, or
 *	until interrupted.  clkflags selects whether timeout is
 *	absolute (TIMER_ABSTIME) or relative on clock clkid.  On a
 *	successful wakeup, *retval is set to 0; otherwise an error
 *	(EINVAL, EAGAIN, ETIMEDOUT, ...) is returned.
 */
static int
futex_func_wait(bool shared, int *uaddr, int val, int val3,
    const struct timespec *timeout, clockid_t clkid, int clkflags,
    register_t *retval)
{
	struct futex *f;
	struct futex_wait wait, *fw = &wait;
	struct timespec ts;
	const struct timespec *deadline;
	int error;

	/*
	 * If there's nothing to wait for, and nobody will ever wake
	 * us, then don't set anything up to wait -- just stop here.
	 */
	if (val3 == 0)
		return EINVAL;

	/* Optimistically test before anything else. */
	if (!futex_test(uaddr, val))
		return EAGAIN;

	/* Determine a deadline on the specified clock. */
	if (timeout == NULL || (clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
		/* No timeout, or already absolute: use it as-is. */
		deadline = timeout;
	} else {
		/* Relative timeout: convert to an absolute deadline now. */
		error = clock_gettime1(clkid, &ts);
		if (error)
			return error;
		timespecadd(&ts, timeout, &ts);
		deadline = &ts;
	}

	/* Get the futex, creating it if necessary. */
	error = futex_lookup_create(uaddr, shared, &f);
	if (error)
		return error;
	KASSERT(f);

	/* Get ready to wait. */
	futex_wait_init(fw, val3);

	/*
	 * Under the queue lock, check the value again: if it has
	 * already changed, EAGAIN; otherwise enqueue the waiter.
	 * Since FUTEX_WAKE will use the same lock and be done after
	 * modifying the value, the order in which we check and enqueue
	 * is immaterial.
	 */
	futex_queue_lock(f);
	if (!futex_test(uaddr, val)) {
		futex_queue_unlock(f);
		error = EAGAIN;
		goto out;
	}
	mutex_enter(&fw->fw_lock);
	futex_wait_enqueue(fw, f);
	mutex_exit(&fw->fw_lock);
	futex_queue_unlock(f);

	/*
	 * We cannot drop our reference to the futex here, because
	 * we might be enqueued on a different one when we are awakened.
	 * The references will be managed on our behalf in the requeue
	 * and wake cases.
	 */
	f = NULL;

	/* Wait. */
	error = futex_wait(fw, deadline, clkid);
	if (error)
		goto out;

	/* Return 0 on success, error on failure. */
	*retval = 0;

out:	if (f != NULL)
		futex_rele(f);
	futex_wait_fini(fw);
	return error;
}
1271:
1272: /*
1273: * futex_func_wake(uaddr, val, val3, retval)
1274: *
1275: * Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
1276: */
1277: static int
1278: futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval)
1279: {
1280: struct futex *f;
1281: unsigned int nwoken = 0;
1282: int error = 0;
1283:
1284: /* Reject negative number of wakeups. */
1285: if (val < 0) {
1286: error = EINVAL;
1287: goto out;
1288: }
1289:
1290: /* Look up the futex, if any. */
1291: error = futex_lookup(uaddr, shared, &f);
1292: if (error)
1293: goto out;
1294:
1295: /* If there's no futex, there are no waiters to wake. */
1296: if (f == NULL)
1297: goto out;
1298:
1299: /*
1300: * Under f's queue lock, wake the waiters and remember the
1301: * number woken.
1302: */
1303: futex_queue_lock(f);
1304: nwoken = futex_wake(f, val, NULL, 0, val3);
1305: futex_queue_unlock(f);
1306:
1307: /* Release the futex. */
1.5 riastrad 1308: futex_rele(f);
1.1 thorpej 1309:
1310: out:
1311: /* Return the number of waiters woken. */
1312: *retval = nwoken;
1313:
1314: /* Success! */
1315: return error;
1316: }
1317:
/*
 * futex_func_requeue(shared, op, uaddr, val, uaddr2, val2, val3, retval)
 *
 *	Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE):
 *	wake up to val waiters on uaddr and requeue up to val2 of the
 *	remaining waiters onto uaddr2.  For FUTEX_CMP_REQUEUE, first
 *	verify that *uaddr still contains val3, failing with EAGAIN if
 *	not.  *retval is set to the number of waiters woken.
 */
static int
futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2,
    int val2, int val3, register_t *retval)
{
	struct futex *f = NULL, *f2 = NULL;
	unsigned nwoken = 0;	/* default to zero woken on early return */
	int error;

	/* Reject negative number of wakeups or requeues. */
	if (val < 0 || val2 < 0) {
		error = EINVAL;
		goto out;
	}

	/* Look up the source futex, if any. */
	error = futex_lookup(uaddr, shared, &f);
	if (error)
		goto out;

	/* If there is none, there are no waiters; nothing to do. */
	if (f == NULL)
		goto out;

	/*
	 * We may need to create the destination futex because it's
	 * entirely possible it does not currently have any waiters.
	 */
	error = futex_lookup_create(uaddr2, shared, &f2);
	if (error)
		goto out;

	/*
	 * Under the futexes' queue locks, check the value; if
	 * unchanged from val3, wake the waiters.
	 */
	futex_queue_lock2(f, f2);
	if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
		error = EAGAIN;
	} else {
		error = 0;
		nwoken = futex_wake(f, val, f2, val2, FUTEX_BITSET_MATCH_ANY);
	}
	futex_queue_unlock2(f, f2);

out:
	/* Return the number of waiters woken. */
	*retval = nwoken;

	/* Release the futexes if we got them. */
	if (f2)
		futex_rele(f2);
	if (f)
		futex_rele(f);
	return error;
}
1378:
1379: /*
1380: * futex_validate_op_cmp(val3)
1381: *
1382: * Validate an op/cmp argument for FUTEX_WAKE_OP.
1383: */
1384: static int
1385: futex_validate_op_cmp(int val3)
1386: {
1387: int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
1388: int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
1389:
1390: if (op & FUTEX_OP_OPARG_SHIFT) {
1391: int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
1392: if (oparg < 0)
1393: return EINVAL;
1394: if (oparg >= 32)
1395: return EINVAL;
1396: op &= ~FUTEX_OP_OPARG_SHIFT;
1397: }
1398:
1399: switch (op) {
1400: case FUTEX_OP_SET:
1401: case FUTEX_OP_ADD:
1402: case FUTEX_OP_OR:
1403: case FUTEX_OP_ANDN:
1404: case FUTEX_OP_XOR:
1405: break;
1406: default:
1407: return EINVAL;
1408: }
1409:
1410: switch (cmp) {
1411: case FUTEX_OP_CMP_EQ:
1412: case FUTEX_OP_CMP_NE:
1413: case FUTEX_OP_CMP_LT:
1414: case FUTEX_OP_CMP_LE:
1415: case FUTEX_OP_CMP_GT:
1416: case FUTEX_OP_CMP_GE:
1417: break;
1418: default:
1419: return EINVAL;
1420: }
1421:
1422: return 0;
1423: }
1424:
1425: /*
1426: * futex_compute_op(oldval, val3)
1427: *
1428: * Apply a FUTEX_WAIT_OP operation to oldval.
1429: */
1430: static int
1431: futex_compute_op(int oldval, int val3)
1432: {
1433: int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
1434: int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
1435:
1436: if (op & FUTEX_OP_OPARG_SHIFT) {
1437: KASSERT(oparg >= 0);
1438: KASSERT(oparg < 32);
1439: oparg = 1u << oparg;
1440: op &= ~FUTEX_OP_OPARG_SHIFT;
1441: }
1442:
1443: switch (op) {
1444: case FUTEX_OP_SET:
1445: return oparg;
1446:
1447: case FUTEX_OP_ADD:
1448: /*
1449: * Avoid signed arithmetic overflow by doing
1450: * arithmetic unsigned and converting back to signed
1451: * at the end.
1452: */
1453: return (int)((unsigned)oldval + (unsigned)oparg);
1454:
1455: case FUTEX_OP_OR:
1456: return oldval | oparg;
1457:
1458: case FUTEX_OP_ANDN:
1459: return oldval & ~oparg;
1460:
1461: case FUTEX_OP_XOR:
1462: return oldval ^ oparg;
1463:
1464: default:
1465: panic("invalid futex op");
1466: }
1467: }
1468:
1469: /*
1470: * futex_compute_cmp(oldval, val3)
1471: *
1472: * Apply a FUTEX_WAIT_OP comparison to oldval.
1473: */
1474: static bool
1475: futex_compute_cmp(int oldval, int val3)
1476: {
1477: int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
1478: int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);
1479:
1480: switch (cmp) {
1481: case FUTEX_OP_CMP_EQ:
1482: return (oldval == cmparg);
1483:
1484: case FUTEX_OP_CMP_NE:
1485: return (oldval != cmparg);
1486:
1487: case FUTEX_OP_CMP_LT:
1488: return (oldval < cmparg);
1489:
1490: case FUTEX_OP_CMP_LE:
1491: return (oldval <= cmparg);
1492:
1493: case FUTEX_OP_CMP_GT:
1494: return (oldval > cmparg);
1495:
1496: case FUTEX_OP_CMP_GE:
1497: return (oldval >= cmparg);
1498:
1499: default:
1500: panic("invalid futex cmp operation");
1501: }
1502: }
1503:
/*
 * futex_func_wake_op(shared, uaddr, val, uaddr2, val2, val3, retval)
 *
 *	Implement futex(FUTEX_WAKE_OP): atomically apply the operation
 *	encoded in val3 to *uaddr2, wake up to val waiters on uaddr,
 *	and, if the *old* value of *uaddr2 satisfies the comparison
 *	encoded in val3, also wake up to val2 waiters on uaddr2.
 *	*retval is set to the total number of waiters woken.
 */
static int
futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
    int val3, register_t *retval)
{
	struct futex *f = NULL, *f2 = NULL;
	int oldval, newval, actual;
	unsigned nwoken = 0;
	int error;

	/* Reject negative number of wakeups. */
	if (val < 0 || val2 < 0) {
		error = EINVAL;
		goto out;
	}

	/* Reject invalid operations before we start doing things. */
	if ((error = futex_validate_op_cmp(val3)) != 0)
		goto out;

	/* Look up the first futex, if any. */
	error = futex_lookup(uaddr, shared, &f);
	if (error)
		goto out;

	/* Look up the second futex, if any. */
	error = futex_lookup(uaddr2, shared, &f2);
	if (error)
		goto out;

	/*
	 * Under the queue locks:
	 *
	 * 1. Read/modify/write: *uaddr2 op= oparg.
	 * 2. Unconditionally wake uaddr.
	 * 3. Conditionally wake uaddr2, if it previously matched val2.
	 *
	 * The compare-and-swap loop retries until the update of
	 * *uaddr2 lands without interference from user space.
	 */
	futex_queue_lock2(f, f2);
	do {
		error = futex_load(uaddr2, &oldval);
		if (error)
			goto out_unlock;
		newval = futex_compute_op(oldval, val3);
		error = ucas_int(uaddr2, oldval, newval, &actual);
		if (error)
			goto out_unlock;
	} while (actual != oldval);
	/* Either futex may have no kernel state (no waiters): wake 0. */
	nwoken = (f ? futex_wake(f, val, NULL, 0, FUTEX_BITSET_MATCH_ANY) : 0);
	if (f2 && futex_compute_cmp(oldval, val3))
		nwoken += futex_wake(f2, val2, NULL, 0,
		    FUTEX_BITSET_MATCH_ANY);

	/* Success! */
	error = 0;
out_unlock:
	futex_queue_unlock2(f, f2);

out:
	/* Return the number of waiters woken. */
	*retval = nwoken;

	/* Release the futexes, if we got them. */
	if (f2)
		futex_rele(f2);
	if (f)
		futex_rele(f);
	return error;
}
1576:
1577: /*
1578: * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
1579: *
1580: * Implement the futex system call with all the parameters
1581: * parsed out.
1582: */
1583: int
1.11 riastrad 1584: do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
1585: int *uaddr2, int val2, int val3, register_t *retval)
1.1 thorpej 1586: {
1587: const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
1588: const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
1589: : CLOCK_MONOTONIC;
1590:
1591: op &= FUTEX_CMD_MASK;
1592:
1593: switch (op) {
1594: case FUTEX_WAIT:
1595: return futex_func_wait(shared, uaddr, val,
1596: FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
1597: retval);
1598:
1599: case FUTEX_WAKE:
1600: val3 = FUTEX_BITSET_MATCH_ANY;
1601: /* FALLTHROUGH */
1602: case FUTEX_WAKE_BITSET:
1603: return futex_func_wake(shared, uaddr, val, val3, retval);
1604:
1605: case FUTEX_REQUEUE:
1606: case FUTEX_CMP_REQUEUE:
1607: return futex_func_requeue(shared, op, uaddr, val, uaddr2,
1608: val2, val3, retval);
1609:
1610: case FUTEX_WAIT_BITSET:
1611: return futex_func_wait(shared, uaddr, val, val3, timeout,
1612: clkid, TIMER_ABSTIME, retval);
1613:
1614: case FUTEX_WAKE_OP:
1615: return futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
1616: val3, retval);
1617:
1618: case FUTEX_FD:
1619: default:
1620: return ENOSYS;
1621: }
1622: }
1623:
1624: /*
1625: * sys___futex(l, uap, retval)
1626: *
1627: * __futex(2) system call: generic futex operations.
1628: */
1629: int
1630: sys___futex(struct lwp *l, const struct sys___futex_args *uap,
1631: register_t *retval)
1632: {
1633: /* {
1634: syscallarg(int *) uaddr;
1635: syscallarg(int) op;
1636: syscallarg(int) val;
1637: syscallarg(const struct timespec *) timeout;
1638: syscallarg(int *) uaddr2;
1639: syscallarg(int) val2;
1640: syscallarg(int) val3;
1641: } */
1642: struct timespec ts, *tsp;
1643: int error;
1644:
1645: /*
1646: * Copy in the timeout argument, if specified.
1647: */
1648: if (SCARG(uap, timeout)) {
1649: error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
1650: if (error)
1651: return error;
1652: tsp = &ts;
1653: } else {
1654: tsp = NULL;
1655: }
1656:
1657: return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val),
1658: tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3),
1659: retval);
1660: }
1661:
1662: /*
1663: * sys___futex_set_robust_list(l, uap, retval)
1664: *
1665: * __futex_set_robust_list(2) system call for robust futexes.
1666: */
1667: int
1668: sys___futex_set_robust_list(struct lwp *l,
1669: const struct sys___futex_set_robust_list_args *uap, register_t *retval)
1670: {
1671: /* {
1672: syscallarg(void *) head;
1673: syscallarg(size_t) len;
1674: } */
1675: void *head = SCARG(uap, head);
1676:
1677: if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
1678: return EINVAL;
1679: if ((uintptr_t)head % sizeof(u_long))
1680: return EINVAL;
1681:
1682: l->l_robust_head = (uintptr_t)head;
1683:
1684: return 0;
1685: }
1686:
1687: /*
1688: * sys___futex_get_robust_list(l, uap, retval)
1689: *
1690: * __futex_get_robust_list(2) system call for robust futexes.
1691: */
1692: int
1693: sys___futex_get_robust_list(struct lwp *l,
1694: const struct sys___futex_get_robust_list_args *uap, register_t *retval)
1695: {
1696: /* {
1697: syscallarg(lwpid_t) lwpid;
1698: syscallarg(void **) headp;
1699: syscallarg(size_t *) lenp;
1700: } */
1701: void *head;
1702: const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
1703: int error;
1704:
1705: error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
1706: if (error)
1707: return error;
1708:
1709: /* Copy out the head pointer and the head structure length. */
1710: error = copyout(&head, SCARG(uap, headp), sizeof(head));
1711: if (__predict_true(error == 0)) {
1712: error = copyout(&len, SCARG(uap, lenp), sizeof(len));
1713: }
1714:
1715: return error;
1716: }
1717:
/*
 * release_futex(uptr, tid, is_pi, is_pending)
 *
 *	Try to release the robust futex at uptr in the current process
 *	on lwp exit.  If anything goes wrong, silently fail.  It is the
 *	userland program's obligation to arrange correct behaviour.
 *
 *	is_pi is currently accepted but not acted upon (no PI support
 *	yet; see the XXX notes below).  is_pending marks the entry
 *	from the robust head's "pending" slot.
 */
static void
release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
    bool const is_pending)
{
	int *uaddr;
	struct futex *f;
	int oldval, newval, actual;
	int error;

	/* If it's misaligned, tough. */
	if (__predict_false(uptr & 3))
		return;
	uaddr = (int *)uptr;

	error = futex_load(uaddr, &oldval);
	if (__predict_false(error))
		return;

	/*
	 * There are two race conditions we need to handle here:
	 *
	 * 1. User space cleared the futex word but died before
	 *    being able to issue the wakeup.  No wakeups will
	 *    ever be issued, oops!
	 *
	 * 2. Awakened waiter died before being able to acquire
	 *    the futex in user space.  Any other waiters are
	 *    now stuck, oops!
	 *
	 * In both of these cases, the futex word will be 0 (because
	 * it's updated before the wake is issued).  The best we can
	 * do is detect this situation if it's the pending futex and
	 * issue a wake without modifying the futex word.
	 *
	 * XXX eventual PI handling?
	 */
	if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
		register_t retval;
		(void) futex_func_wake(/*shared*/true, uaddr, 1,
		    FUTEX_BITSET_MATCH_ANY, &retval);
		return;
	}

	/* Optimistically test whether we need to do anything at all. */
	if ((oldval & FUTEX_TID_MASK) != tid)
		return;

	/*
	 * We need to handle the case where this thread owned the futex,
	 * but it was uncontended.  In this case, there won't be any
	 * kernel state to look up.  All we can do is mark the futex
	 * as a zombie to be mopped up the next time another thread
	 * attempts to acquire it.
	 *
	 * N.B. It's important to ensure to set FUTEX_OWNER_DIED in
	 * this loop, even if waiters appear while we are doing
	 * so.  This is because FUTEX_WAITERS is set by user space
	 * before calling __futex() to wait, and the futex needs
	 * to be marked as a zombie when the new waiter gets into
	 * the kernel.
	 */
	if ((oldval & FUTEX_WAITERS) == 0) {
		do {
			error = futex_load(uaddr, &oldval);
			if (error)
				return;
			if ((oldval & FUTEX_TID_MASK) != tid)
				return;
			newval = oldval | FUTEX_OWNER_DIED;
			error = ucas_int(uaddr, oldval, newval, &actual);
			if (error)
				return;
		} while (actual != oldval);

		/*
		 * If there is still no indication of waiters, then there
		 * is no more work for us to do.
		 */
		if ((oldval & FUTEX_WAITERS) == 0)
			return;
	}

	/*
	 * Look for a shared futex since we have no positive indication
	 * it is private.  If we can't, tough.
	 */
	error = futex_lookup(uaddr, /*shared*/true, &f);
	if (error)
		return;

	/*
	 * If there's no kernel state for this futex, there's nothing to
	 * release.
	 */
	if (f == NULL)
		return;

	/* Work under the futex queue lock. */
	futex_queue_lock(f);

	/*
	 * Fetch the word: if the tid doesn't match ours, skip;
	 * otherwise, set the owner-died bit, atomically.
	 */
	do {
		error = futex_load(uaddr, &oldval);
		if (error)
			goto out;
		if ((oldval & FUTEX_TID_MASK) != tid)
			goto out;
		newval = oldval | FUTEX_OWNER_DIED;
		error = ucas_int(uaddr, oldval, newval, &actual);
		if (error)
			goto out;
	} while (actual != oldval);

	/*
	 * If there may be waiters, try to wake one.  If anything goes
	 * wrong, tough.
	 *
	 * XXX eventual PI handling?
	 */
	if (oldval & FUTEX_WAITERS)
		(void)futex_wake(f, 1, NULL, 0, FUTEX_BITSET_MATCH_ANY);

	/* Unlock the queue and release the futex. */
out:	futex_queue_unlock(f);
	futex_rele(f);
}
1854:
1855: /*
1856: * futex_robust_head_lookup(l, lwpid)
1857: *
1858: * Helper function to look up a robust head by LWP ID.
1859: */
1860: int
1861: futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
1862: {
1863: struct proc *p = l->l_proc;
1864:
1865: /* Find the other lwp, if requested; otherwise use our robust head. */
1866: if (lwpid) {
1867: mutex_enter(p->p_lock);
1868: l = lwp_find(p, lwpid);
1869: if (l == NULL) {
1870: mutex_exit(p->p_lock);
1871: return ESRCH;
1872: }
1873: *headp = (void *)l->l_robust_head;
1874: mutex_exit(p->p_lock);
1875: } else {
1876: *headp = (void *)l->l_robust_head;
1877: }
1878: return 0;
1879: }
1880:
/*
 * futex_fetch_robust_head(uaddr, rhead)
 *
 *	Helper routine to fetch the futex robust list head that
 *	handles 32-bit binaries running on 64-bit kernels: each
 *	32-bit word is widened into the corresponding u_long slot
 *	of rhead.
 */
static int
futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
{
#ifdef _LP64
	if (curproc->p_flag & PK_32) {
		/* 32-bit process on a 64-bit kernel: widen each word. */
		uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
		int error;

		error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
		if (__predict_true(error == 0)) {
			for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
				if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
					/*
					 * Make sure the offset is sign-
					 * extended.
					 */
					rhead[i] = (int32_t)rhead32[i];
				} else {
					rhead[i] = rhead32[i];
				}
			}
		}
		return error;
	}
#endif /* _LP64 */

	return copyin((void *)uaddr, rhead,
	    sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
}
1916:
1917: /*
1918: * futex_decode_robust_word(word)
1919: *
1920: * Decode a robust futex list word into the entry and entry
1921: * properties.
1922: */
1923: static inline void
1924: futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
1925: bool * const is_pi)
1926: {
1927: *is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
1928: *entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
1929: }
1930:
/*
 * futex_fetch_robust_entry(uaddr, valp, is_pi)
 *
 *	Helper routine to fetch and decode a robust futex entry
 *	that handles 32-bit binaries running on 64-bit kernels.
 *	On success, *valp is the next-entry address and *is_pi its
 *	PI flag.
 */
static int
futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
    bool * const is_pi)
{
	uintptr_t val = 0;
	int error = 0;

#ifdef _LP64
	if (curproc->p_flag & PK_32) {
		/* 32-bit process: fetch a 32-bit word and widen it. */
		uint32_t val32;

		error = ufetch_32((uint32_t *)uaddr, &val32);
		if (__predict_true(error == 0))
			val = val32;
	} else
#endif /* _LP64 */
	error = ufetch_long((u_long *)uaddr, (u_long *)&val);
	if (__predict_false(error))
		return error;

	/* Split the raw word into the entry pointer and PI flag. */
	futex_decode_robust_word(val, valp, is_pi);
	return 0;
}
1960:
/*
 * futex_release_all_lwp(l)
 *
 *	Release all l's robust futexes on LWP exit.  If anything looks
 *	funny in the process, give up -- it's userland's responsibility
 *	to dot the i's and cross the t's.
 */
void
futex_release_all_lwp(struct lwp * const l)
{
	u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
	int limit = 1000000;	/* bound the walk against malformed/cyclic lists */
	int error;

	/* If there's no robust list there's nothing to do. */
	if (l->l_robust_head == 0)
		return;

	KASSERT((l->l_lid & FUTEX_TID_MASK) == l->l_lid);

	/* Read the final snapshot of the robust list head. */
	error = futex_fetch_robust_head(l->l_robust_head, rhead);
	if (error) {
		printf("WARNING: pid %jd (%s) lwp %jd:"
		    " unmapped robust futex list head\n",
		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
		    (uintmax_t)l->l_lid);
		return;
	}

	/* Signed offset from a list entry to its futex word. */
	const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];

	uintptr_t next, pending;
	bool is_pi, pending_is_pi;

	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
	    &next, &is_pi);
	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
	    &pending, &pending_is_pi);

	/*
	 * Walk down the list of locked futexes and release them, up
	 * to one million of them before we give up.  The list is
	 * circular: it terminates back at the head.
	 */

	while (next != l->l_robust_head && limit-- > 0) {
		/* pending handled below. */
		if (next != pending)
			release_futex(next + offset, l->l_lid, is_pi, false);
		error = futex_fetch_robust_entry(next, &next, &is_pi);
		if (error)
			break;
		preempt_point();
	}
	if (limit <= 0) {
		printf("WARNING: pid %jd (%s) lwp %jd:"
		    " exhausted robust futex limit\n",
		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
		    (uintmax_t)l->l_lid);
	}

	/* If there's a pending futex, it may need to be released too. */
	if (pending != 0) {
		release_futex(pending + offset, l->l_lid, pending_is_pi, true);
	}
}
CVSweb <webmaster@jp.NetBSD.org>