[BACK]Return to kern_exec.c CVS log [TXT][DIR] Up to [cvs.NetBSD.org] / src / sys / kern

Annotation of src/sys/kern/kern_exec.c, Revision 1.482

1.482   ! kamil       1: /*     $NetBSD: kern_exec.c,v 1.481 2019/09/17 15:19:27 christos Exp $ */
1.277     ad          2:
                      3: /*-
                      4:  * Copyright (c) 2008 The NetBSD Foundation, Inc.
                      5:  * All rights reserved.
                      6:  *
                      7:  * Redistribution and use in source and binary forms, with or without
                      8:  * modification, are permitted provided that the following conditions
                      9:  * are met:
                     10:  * 1. Redistributions of source code must retain the above copyright
                     11:  *    notice, this list of conditions and the following disclaimer.
                     12:  * 2. Redistributions in binary form must reproduce the above copyright
                     13:  *    notice, this list of conditions and the following disclaimer in the
                     14:  *    documentation and/or other materials provided with the distribution.
                     15:  *
                     16:  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
                     17:  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
                     18:  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
                     19:  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
                     20:  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     21:  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     22:  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     23:  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     24:  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     25:  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     26:  * POSSIBILITY OF SUCH DAMAGE.
                     27:  */
1.55      cgd        28:
                     29: /*-
1.77      cgd        30:  * Copyright (C) 1993, 1994, 1996 Christopher G. Demetriou
1.55      cgd        31:  * Copyright (C) 1992 Wolfgang Solfrank.
                     32:  * Copyright (C) 1992 TooLs GmbH.
                     33:  * All rights reserved.
                     34:  *
                     35:  * Redistribution and use in source and binary forms, with or without
                     36:  * modification, are permitted provided that the following conditions
                     37:  * are met:
                     38:  * 1. Redistributions of source code must retain the above copyright
                     39:  *    notice, this list of conditions and the following disclaimer.
                     40:  * 2. Redistributions in binary form must reproduce the above copyright
                     41:  *    notice, this list of conditions and the following disclaimer in the
                     42:  *    documentation and/or other materials provided with the distribution.
                     43:  * 3. All advertising materials mentioning features or use of this software
                     44:  *    must display the following acknowledgement:
                     45:  *     This product includes software developed by TooLs GmbH.
                     46:  * 4. The name of TooLs GmbH may not be used to endorse or promote products
                     47:  *    derived from this software without specific prior written permission.
                     48:  *
                     49:  * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
                     50:  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
                     51:  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
                     52:  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
                     53:  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
                     54:  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
                     55:  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
                     56:  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
                     57:  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
                     58:  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
                     59:  */
1.146     lukem      60:
                     61: #include <sys/cdefs.h>
1.482   ! kamil      62: __KERNEL_RCSID(0, "$NetBSD: kern_exec.c,v 1.481 2019/09/17 15:19:27 christos Exp $");
1.89      mrg        63:
1.325     jmcneill   64: #include "opt_exec.h"
1.360     christos   65: #include "opt_execfmt.h"
1.92      thorpej    66: #include "opt_ktrace.h"
1.285     apb        67: #include "opt_modular.h"
1.124     jdolecek   68: #include "opt_syscall_debug.h"
1.226     dogcow     69: #include "veriexec.h"
1.232     elad       70: #include "opt_pax.h"
1.55      cgd        71:
                     72: #include <sys/param.h>
                     73: #include <sys/systm.h>
                     74: #include <sys/filedesc.h>
                     75: #include <sys/kernel.h>
                     76: #include <sys/proc.h>
1.466     kamil      77: #include <sys/ptrace.h>
1.55      cgd        78: #include <sys/mount.h>
1.265     yamt       79: #include <sys/kmem.h>
1.55      cgd        80: #include <sys/namei.h>
                     81: #include <sys/vnode.h>
                     82: #include <sys/file.h>
1.414     christos   83: #include <sys/filedesc.h>
1.55      cgd        84: #include <sys/acct.h>
1.337     martin     85: #include <sys/atomic.h>
1.55      cgd        86: #include <sys/exec.h>
                     87: #include <sys/ktrace.h>
1.278     pooka      88: #include <sys/uidinfo.h>
1.55      cgd        89: #include <sys/wait.h>
                     90: #include <sys/mman.h>
1.155     gmcgarry   91: #include <sys/ras.h>
1.55      cgd        92: #include <sys/signalvar.h>
                     93: #include <sys/stat.h>
1.124     jdolecek   94: #include <sys/syscall.h>
1.218     elad       95: #include <sys/kauth.h>
1.253     ad         96: #include <sys/lwpctl.h>
1.260     christos   97: #include <sys/pax.h>
1.263     ad         98: #include <sys/cpu.h>
1.282     ad         99: #include <sys/module.h>
1.289     pooka     100: #include <sys/syscallvar.h>
1.56      cgd       101: #include <sys/syscallargs.h>
1.222     elad      102: #if NVERIEXEC > 0
1.197     blymn     103: #include <sys/verified_exec.h>
1.222     elad      104: #endif /* NVERIEXEC > 0 */
1.294     darran    105: #include <sys/sdt.h>
1.337     martin    106: #include <sys/spawn.h>
                    107: #include <sys/prot.h>
1.330     tls       108: #include <sys/cprng.h>
1.55      cgd       109:
1.88      mrg       110: #include <uvm/uvm_extern.h>
                    111:
1.55      cgd       112: #include <machine/reg.h>
                    113:
1.244     dsl       114: #include <compat/common/compat_util.h>
                    115:
                         /*
                          * Fallback definition of MD_TOPDOWN_INIT(), used only when nothing
                          * earlier has defined it.  With __USE_TOPDOWN_VM it sets
                          * EXEC_TOPDOWN_VM in the exec package's ep_flags; otherwise it expands
                          * to nothing.
                          *
                          * NOTE(review): the expansion is a bare assignment expression (no
                          * surrounding parentheses or do/while), so it is only safe when used
                          * as a full statement.
                          */
1.364     martin    116: #ifndef MD_TOPDOWN_INIT
1.370     christos  117: #ifdef __USE_TOPDOWN_VM
1.364     martin    118: #define        MD_TOPDOWN_INIT(epp)    (epp)->ep_flags |= EXEC_TOPDOWN_VM
                    119: #else
                    120: #define        MD_TOPDOWN_INIT(epp)
                    121: #endif
                    122: #endif
                    123:
1.391     uebayasi  124: struct execve_data;
                    125:
1.436     maxv      126: extern int user_va0_disable;
                    127:
1.396     uebayasi  128: static size_t calcargs(struct execve_data * restrict, const size_t);
                    129: static size_t calcstack(struct execve_data * restrict, const size_t);
1.399     uebayasi  130: static int copyoutargs(struct execve_data * restrict, struct lwp *,
                    131:     char * const);
1.398     uebayasi  132: static int copyoutpsstrs(struct execve_data * restrict, struct proc *);
1.391     uebayasi  133: static int copyinargs(struct execve_data * restrict, char * const *,
                    134:     char * const *, execve_fetch_element_t, char **);
1.392     uebayasi  135: static int copyinargstrs(struct execve_data * restrict, char * const *,
                    136:     execve_fetch_element_t, char **, size_t *, void (*)(const void *, size_t));
1.171     chs       137: static int exec_sigcode_map(struct proc *, const struct emul *);
                    138:
                         /*
                          * Debug tracing helpers.  Building with DEBUG turns DEBUG_EXEC on
                          * implicitly.
                          *
                          * With DEBUG_EXEC:
                          *   DPRINTF(a)          expands to "printf a", so the argument must
                          *                       itself be a parenthesised printf argument
                          *                       list: DPRINTF(("fmt", ...)).
                          *   COPYPRINTF(s, a, b) prints the calling function, the line number,
                          *                       the string s, the pointer a and the size_t b.
                          *   DUMPVMCMDS(p, x, e) calls dump_vmcmds(p, x, e).
                          *
                          * Without DEBUG_EXEC all three expand to nothing (DUMPVMCMDS to an
                          * empty do/while so it still needs a trailing semicolon).
                          */
1.429     ozaki-r   139: #if defined(DEBUG) && !defined(DEBUG_EXEC)
1.428     christos  140: #define DEBUG_EXEC
                    141: #endif
1.143     christos  142: #ifdef DEBUG_EXEC
1.305     matt      143: #define DPRINTF(a) printf a
1.312     christos  144: #define COPYPRINTF(s, a, b) printf("%s, %d: copyout%s @%p %zu\n", __func__, \
                    145:     __LINE__, (s), (a), (b))
1.388     uebayasi  146: static void dump_vmcmds(const struct exec_package * const, size_t, int);
                    147: #define DUMPVMCMDS(p, x, e) do { dump_vmcmds((p), (x), (e)); } while (0)
1.143     christos  148: #else
                    149: #define DPRINTF(a)
1.312     christos  150: #define COPYPRINTF(s, a, b)
1.388     uebayasi  151: #define DUMPVMCMDS(p, x, e) do {} while (0)
1.143     christos  152: #endif /* DEBUG_EXEC */
1.165     thorpej   153:
1.130     jdolecek  154: /*
1.294     darran    155:  * DTrace SDT provider definitions
                    156:  */
1.418     christos  157: SDT_PROVIDER_DECLARE(proc);
                    158: SDT_PROBE_DEFINE1(proc, kernel, , exec, "char *");
                    159: SDT_PROBE_DEFINE1(proc, kernel, , exec__success, "char *");
                    160: SDT_PROBE_DEFINE1(proc, kernel, , exec__failure, "int");
1.294     darran    161:
                    162: /*
1.130     jdolecek  163:  * Exec function switch:
                    164:  *
                    165:  * Note that each makecmds function is responsible for loading the
                    166:  * exec package with the necessary functions for any exec-type-specific
                    167:  * handling.
                    168:  *
                    169:  * Functions for specific exec types should be defined in their own
                    170:  * header file.
                    171:  */
1.138     lukem     172: static const struct execsw     **execsw = NULL;
                    173: static int                     nexecs;
                    174:
1.282     ad        175: u_int  exec_maxhdrsz;   /* must not be static - used by netbsd32 */
1.130     jdolecek  176:
                    177: /* list of dynamically loaded execsw entries */
1.282     ad        178: static LIST_HEAD(execlist_head, exec_entry) ex_head =
                    179:     LIST_HEAD_INITIALIZER(ex_head);
                         /*
                          * One node per registered exec switch entry.
                          */
1.130     jdolecek  180: struct exec_entry {
                                /* list linkage; presumably the link used on ex_head - confirm */
1.138     lukem     181:        LIST_ENTRY(exec_entry)  ex_list;
                                /*
                                 * NOTE(review): the singly-linked list this member belongs to
                                 * is not visible in this part of the file.
                                 */
1.282     ad        182:        SLIST_ENTRY(exec_entry) ex_slist;
                                /* the (read-only) exec switch entry this node refers to */
                    183:        const struct execsw     *ex_sw;
1.130     jdolecek  184: };
                    185:
1.203     christos  186: #ifndef __HAVE_SYSCALL_INTERN
                    187: void   syscall(void);
                    188: #endif
                    189:
1.423     pgoyette  190: /* NetBSD autoloadable syscalls */
                    191: #ifdef MODULAR
                    192: #include <kern/syscalls_autoload.c>
                    193: #endif
                    194:
1.173     christos  195: /* NetBSD emul struct */
                         /*
                          * Emulation descriptor named "netbsd".  Several members depend on
                          * the kernel configuration:
                          *   - e_path is EMUL_NATIVEROOT when that is defined, NULL otherwise;
                          *   - e_flags, e_errno, e_nosys and e_nsysent are only initialised
                          *     when __HAVE_MINIMAL_EMUL is not defined;
                          *   - e_sc_autoload is only set on MODULAR kernels;
                          *   - e_syscallnames is the name table only with SYSCALL_DEBUG;
                          *   - the syscall hook is e_syscall_intern with
                          *     __HAVE_SYSCALL_INTERN, e_syscall otherwise.
                          * Every hook initialised to NULL below is simply left unset here.
                          */
1.282     ad        196: struct emul emul_netbsd = {
1.291     rmind     197:        .e_name =               "netbsd",
1.371     manu      198: #ifdef EMUL_NATIVEROOT
                    199:        .e_path =               EMUL_NATIVEROOT,
                    200: #else
                    201:        .e_path =               NULL,
                    202: #endif
1.133     mycroft   203: #ifndef __HAVE_MINIMAL_EMUL
1.291     rmind     204:        .e_flags =              EMUL_HAS_SYS___syscall,
                    205:        .e_errno =              NULL,
                    206:        .e_nosys =              SYS_syscall,
                    207:        .e_nsysent =            SYS_NSYSENT,
1.133     mycroft   208: #endif
1.423     pgoyette  209: #ifdef MODULAR
                    210:        .e_sc_autoload =        netbsd_syscalls_autoload,
                    211: #endif
1.291     rmind     212:        .e_sysent =             sysent,
1.460     pgoyette  213:        .e_nomodbits =          sysent_nomodbits,
1.124     jdolecek  214: #ifdef SYSCALL_DEBUG
1.291     rmind     215:        .e_syscallnames =       syscallnames,
1.124     jdolecek  216: #else
1.291     rmind     217:        .e_syscallnames =       NULL,
1.124     jdolecek  218: #endif
1.291     rmind     219:        .e_sendsig =            sendsig,
                    220:        .e_trapsignal =         trapsignal,
                    221:        .e_sigcode =            NULL,
                    222:        .e_esigcode =           NULL,
                    223:        .e_sigobject =          NULL,
                    224:        .e_setregs =            setregs,
                    225:        .e_proc_exec =          NULL,
                    226:        .e_proc_fork =          NULL,
                    227:        .e_proc_exit =          NULL,
                    228:        .e_lwp_fork =           NULL,
                    229:        .e_lwp_exit =           NULL,
1.133     mycroft   230: #ifdef __HAVE_SYSCALL_INTERN
1.291     rmind     231:        .e_syscall_intern =     syscall_intern,
1.133     mycroft   232: #else
1.291     rmind     233:        .e_syscall =            syscall,
1.133     mycroft   234: #endif
1.291     rmind     235:        .e_sysctlovly =         NULL,
                    236:        .e_vm_default_addr =    uvm_default_mapaddr,
                    237:        .e_usertrap =           NULL,
                    238:        .e_ucsize =             sizeof(ucontext_t),
                    239:        .e_startlwp =           startlwp
1.124     jdolecek  240: };
                    241:
1.55      cgd       242: /*
1.130     jdolecek  243:  * Exec lock. Used to control access to execsw[] structures.
                    244:  * This must not be static so that netbsd32 can access it, too.
                    245:  */
1.352     rmind     246: krwlock_t exec_lock;
                    247:
                    248: static kmutex_t sigobject_lock;
1.259     ad        249:
1.337     martin    250: /*
                    251:  * Data used between a loadvm and execve part of an "exec" operation
                          *
                          * ed_pathbuf, ed_pathstring and (when non-NULL) ed_resolvedname are
                          * released together by exec_path_free().
                          *
                          * NOTE(review): the producers and consumers of the remaining members
                          * (argument buffer, counts and sizes) are outside this part of the
                          * file.
                    252:  */
                    253: struct execve_data {
                    254:        struct exec_package     ed_pack;
                    255:        struct pathbuf          *ed_pathbuf;
                    256:        struct vattr            ed_attr;
                    257:        struct ps_strings       ed_arginfo;
                    258:        char                    *ed_argp;
                    259:        const char              *ed_pathstring;
                                /* PNBUF or NULL; returned with PNBUF_PUT() in exec_path_free() */
1.480     christos  260:        char                    *ed_resolvedname;
1.337     martin    261:        size_t                  ed_ps_strings_sz;
                    262:        int                     ed_szsigcode;
1.396     uebayasi  263:        size_t                  ed_argslen;
1.337     martin    264:        long                    ed_argc;
                    265:        long                    ed_envc;
                    266: };
                    267:
                    268: /*
                    269:  * data passed from parent lwp to child during a posix_spawn()
                          *
                          * Embeds a complete struct execve_data (sed_exec) next to the spawn
                          * file actions and attributes.
                          *
                          * NOTE(review): the code that uses sed_cv_child_ready, sed_mtx_child,
                          * sed_error and sed_refcnt is outside this part of the file;
                          * presumably they implement the parent/child hand-off and the shared
                          * lifetime of this structure - confirm against the posix_spawn code.
                    270:  */
                    271: struct spawn_exec_data {
                    272:        struct execve_data      sed_exec;
1.348     martin    273:        struct posix_spawn_file_actions
1.337     martin    274:                                *sed_actions;
                    275:        struct posix_spawnattr  *sed_attrs;
                    276:        struct proc             *sed_parent;
                    277:        kcondvar_t              sed_cv_child_ready;
                    278:        kmutex_t                sed_mtx_child;
                    279:        int                     sed_error;
1.348     martin    280:        volatile uint32_t       sed_refcnt;
1.337     martin    281: };
                    282:
1.448     riastrad  283: static struct vm_map *exec_map;
                    284: static struct pool exec_pool;
                    285:
                         /*
                          * Page allocation hook for exec_pool (see exec_palloc): obtains
                          * NCARGS bytes from exec_map with uvm_km_alloc(), requesting pageable
                          * memory (UVM_KMF_PAGEABLE) and passing UVM_KMF_WAITVA.
                          *
                          * Neither `pp' nor `flags' is consulted.  The uvm_km_alloc() result is
                          * returned unchanged, cast to a pointer.
                          */
1.277     ad        286: static void *
                    287: exec_pool_alloc(struct pool *pp, int flags)
                    288: {
                    289:
1.448     riastrad  290:        return (void *)uvm_km_alloc(exec_map, NCARGS, 0,
1.277     ad        291:            UVM_KMF_PAGEABLE | UVM_KMF_WAITVA);
                    292: }
                    293:
                         /*
                          * Page release hook for exec_pool (see exec_palloc): gives the
                          * NCARGS-byte region starting at `addr' back to exec_map.  The size
                          * and the UVM_KMF_PAGEABLE flag mirror exec_pool_alloc().  `pp' is not
                          * consulted.
                          */
1.277     ad        294: static void
                    295: exec_pool_free(struct pool *pp, void *addr)
                    296: {
                    297:
1.448     riastrad  298:        uvm_km_free(exec_map, (vaddr_t)addr, NCARGS, UVM_KMF_PAGEABLE);
1.277     ad        299: }
                    300:
                         /*
                          * Back-end allocator description built from the two hooks above; its
                          * "page" size is NCARGS, i.e. each page is one full argument buffer.
                          */
                    301: static struct pool_allocator exec_palloc = {
                    302:        .pa_alloc = exec_pool_alloc,
                    303:        .pa_free = exec_pool_free,
                    304:        .pa_pagesz = NCARGS
                    305: };
                    306:
                         /*
                          * Release the path-related members of an execve_data: hand the string
                          * copy ed_pathstring back to ed_pathbuf, destroy the pathbuf, and
                          * return ed_resolvedname to the pathname buffer pool if one was
                          * allocated.
                          *
                          * The members are not cleared afterwards, so this must be called
                          * exactly once per execve_data.
                          */
1.479     christos  307: static void
                    308: exec_path_free(struct execve_data *data)
                    309: {
                    310:        pathbuf_stringcopy_put(data->ed_pathbuf, data->ed_pathstring);
                    311:        pathbuf_destroy(data->ed_pathbuf);
                                /* ed_resolvedname is optional; NULL means nothing to return */
1.480     christos  312:        if (data->ed_resolvedname)
                    313:                PNBUF_PUT(data->ed_resolvedname);
1.479     christos  314: }
                    315:
                         /*
                          * Try to turn the vnode `vp' back into a path name.
                          *
                          * A pathname buffer is taken with PNBUF_GET() and stored in *rpath,
                          * then filled by vnode_to_path() (at most MAXPATHLEN bytes).
                          *
                          * On success epp->ep_resolvedname points at that buffer and, when the
                          * path contains a '/', epp->ep_kname points just past the last one
                          * (inside the same buffer).  The buffer stays allocated; *rpath is
                          * how the caller gets hold of it to free it later.
                          *
                          * On failure the buffer is returned, *rpath is set to NULL and `epp'
                          * is left untouched.  The error code is discarded: the only way for
                          * the caller to notice is *rpath == NULL, and ep_resolvedname keeps
                          * whatever value it had on entry.
                          *
                          * `rpath' must not be NULL (asserted).
                          */
1.480     christos  316: static void
                    317: exec_resolvename(struct lwp *l, struct exec_package *epp, struct vnode *vp,
                    318:     char **rpath)
                    319: {
                    320:        int error;
                    321:        char *p;
                    322:
                    323:        KASSERT(rpath != NULL);
                    324:
                    325:        *rpath = PNBUF_GET();
                    326:        error = vnode_to_path(*rpath, MAXPATHLEN, vp, l, l->l_proc);
                    327:        if (error) {
                                        /* no name available: undo the allocation, report nothing */
                    328:                PNBUF_PUT(*rpath);
                    329:                *rpath = NULL;
                    330:                return;
                    331:        }
                    332:        epp->ep_resolvedname = *rpath;
                                /* ep_kname is only replaced when there is a directory part */
                    333:        if ((p = strrchr(*rpath, '/')) != NULL)
                    334:                epp->ep_kname = p + 1;
                    335: }
                    336:
                    337:
1.130     jdolecek  338: /*
1.55      cgd       339:  * check exec:
                    340:  * given an "executable" described in the exec package's namei info,
                    341:  * see what we can do with it.
                          *
                          * The executable is located in one of two ways.  When
                          * epp->ep_resolvedname is set, `pb' is looked up with namei() (its
                          * path string is first saved into ep_resolvedname).  Otherwise the
                          * vnode behind the descriptor epp->ep_xfd is used and
                          * exec_resolvename() tries to find a name for it, handing the buffer
                          * it allocated back through `rpath'.
                          *
                          * All vnode operations use the credentials of `l'.
                    342:  *
                    343:  * ON ENTRY:
                    344:  *     exec package with appropriate namei info
1.212     christos  345:  *     lwp pointer of exec'ing lwp
1.55      cgd       346:  *     NO SELF-LOCKED VNODES
                    347:  *
                    348:  * ON EXIT:
                    349:  *     error:  nothing held, etc.  exec header still allocated.
1.77      cgd       350:  *     ok:     filled exec package, executable's vnode (unlocked).
1.55      cgd       351:  *
                    352:  * EXEC SWITCH ENTRY:
                    353:  *     Locked vnode to check, exec package, proc.
                    354:  *
                    355:  * EXEC SWITCH EXIT:
1.77      cgd       356:  *     ok:     return 0, filled exec package, executable's vnode (unlocked).
1.55      cgd       357:  *     error:  destructive:
                    358:  *                     everything deallocated except exec header.
1.76      cgd       359:  *             non-destructive:
1.77      cgd       360:  *                     error code, executable's vnode (unlocked),
1.76      cgd       361:  *                     exec header unmodified.
1.55      cgd       362:  */
                    363: int
1.352     rmind     364: /*ARGSUSED*/
1.480     christos  365: check_exec(struct lwp *l, struct exec_package *epp, struct pathbuf *pb,
                    366:     char **rpath)
1.55      cgd       367: {
1.138     lukem     368:        int             error, i;
                    369:        struct vnode    *vp;
                    370:        size_t          resid;
1.55      cgd       371:
                                /* Path-based exec: the caller supplied a buffer for the name. */
1.480     christos  372:        if (epp->ep_resolvedname) {
                    373:                struct nameidata nd;
                    374:
                    375:                // grab the absolute pathbuf here before namei() trashes it.
                    376:                pathbuf_copystring(pb, epp->ep_resolvedname, PATH_MAX);
                    377:                NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
1.295     dholland  378:
1.480     christos  379:                /* first get the vnode */
                    380:                if ((error = namei(&nd)) != 0)
                    381:                        return error;
1.295     dholland  382:
1.480     christos  383:                epp->ep_vp = vp = nd.ni_vp;
1.296     dholland  384: #ifdef DIAGNOSTIC
1.480     christos  385:                /* paranoia (take this out once namei stuff stabilizes) */
                    386:                memset(nd.ni_pnbuf, '~', PATH_MAX);
1.295     dholland  387: #endif
1.480     christos  388:        } else {
                                        /*
                                         * Descriptor-based exec.  Take our own reference on
                                         * the file's vnode before letting go of the file, try
                                         * to resolve a name for it (failure is tolerated here:
                                         * *rpath and ep_resolvedname then stay NULL), and
                                         * finally lock the vnode so both branches leave `vp'
                                         * referenced and locked, as bad1 below expects.
                                         */
                    389:                struct file *fp;
                    390:
                    391:                if ((error = fd_getvnode(epp->ep_xfd, &fp)) != 0)
                    392:                        return error;
                    393:                epp->ep_vp = vp = fp->f_vnode;
                    394:                vref(vp);
                    395:                fd_putfile(epp->ep_xfd);
                    396:                exec_resolvename(l, epp, vp, rpath);
                    397:                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
                    398:        }
1.55      cgd       399:
1.84      mycroft   400:        /* check access and type */
1.55      cgd       401:        if (vp->v_type != VREG) {
1.81      kleink    402:                error = EACCES;
1.55      cgd       403:                goto bad1;
                    404:        }
1.254     pooka     405:        if ((error = VOP_ACCESS(vp, VEXEC, l->l_cred)) != 0)
1.84      mycroft   406:                goto bad1;
1.55      cgd       407:
                    408:        /* get attributes */
1.254     pooka     409:        if ((error = VOP_GETATTR(vp, epp->ep_vap, l->l_cred)) != 0)
1.55      cgd       410:                goto bad1;
                    411:
                    412:        /* Check mount point */
                    413:        if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
                    414:                error = EACCES;
                    415:                goto bad1;
                    416:        }
                                /* on a nosuid mount, pretend the file has no set-id bits */
1.141     thorpej   417:        if (vp->v_mount->mnt_flag & MNT_NOSUID)
1.83      mycroft   418:                epp->ep_vap->va_mode &= ~(S_ISUID | S_ISGID);
1.55      cgd       419:
                    420:        /* try to open it */
1.254     pooka     421:        if ((error = VOP_OPEN(vp, FREAD, l->l_cred)) != 0)
1.55      cgd       422:                goto bad1;
                    423:
                                /*
                                 * From here on the vnode is open, so failures must go through
                                 * bad2 (which closes it) rather than bad1.
                                 */
1.99      wrstuden  424:        /* unlock vp, since we need it unlocked from here on out. */
1.298     hannken   425:        VOP_UNLOCK(vp);
1.77      cgd       426:
1.222     elad      427: #if NVERIEXEC > 0
                                /* falls back to ep_kname when no resolved name is available */
1.480     christos  428:        error = veriexec_verify(l, vp,
                    429:            epp->ep_resolvedname ? epp->ep_resolvedname : epp->ep_kname,
1.233     elad      430:            epp->ep_flags & EXEC_INDIR ? VERIEXEC_INDIRECT : VERIEXEC_DIRECT,
1.236     elad      431:            NULL);
                    432:        if (error)
1.234     elad      433:                goto bad2;
1.222     elad      434: #endif /* NVERIEXEC > 0 */
1.160     blymn     435:
1.232     elad      436: #ifdef PAX_SEGVGUARD
                                /*
                                 * NOTE(review): unlike the veriexec call above, this passes
                                 * ep_resolvedname without a fallback.  On the descriptor path
                                 * it is still NULL if exec_resolvename() failed - confirm that
                                 * pax_segvguard() accepts a NULL name.
                                 */
1.295     dholland  437:        error = pax_segvguard(l, vp, epp->ep_resolvedname, false);
1.234     elad      438:        if (error)
                    439:                goto bad2;
1.232     elad      440: #endif /* PAX_SEGVGUARD */
                    441:
1.55      cgd       442:        /* now we have the file, get the exec header */
1.74      christos  443:        error = vn_rdwr(UIO_READ, vp, epp->ep_hdr, epp->ep_hdrlen, 0,
1.223     ad        444:                        UIO_SYSSPACE, 0, l->l_cred, &resid, NULL);
1.74      christos  445:        if (error)
1.55      cgd       446:                goto bad2;
                                /* a short file yields fewer valid header bytes than requested */
                    447:        epp->ep_hdrvalid = epp->ep_hdrlen - resid;
                    448:
                    449:        /*
1.136     eeh       450:         * Set up default address space limits.  Can be overridden
                    451:         * by individual exec packages.
                    452:         */
1.436     maxv      453:        epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
1.136     eeh       454:        epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS;
1.436     maxv      455:
1.136     eeh       456:        /*
1.55      cgd       457:         * set up the vmcmds for creation of the process
                    458:         * address space
                    459:         */
                                /*
                                 * Offer the image to each registered exec format in turn.
                                 * `error' starts as ENOEXEC and is replaced by the first
                                 * loader error that differs from it.
                                 */
                    460:        error = ENOEXEC;
1.244     dsl       461:        for (i = 0; i < nexecs; i++) {
1.68      cgd       462:                int newerror;
                    463:
1.130     jdolecek  464:                epp->ep_esch = execsw[i];
1.212     christos  465:                newerror = (*execsw[i]->es_makecmds)(l, epp);
1.244     dsl       466:
                    467:                if (!newerror) {
                                                /*
                                                 * This loader accepted the image.  Any sanity
                                                 * check failing below is final: we break out
                                                 * to the common cleanup instead of trying the
                                                 * remaining formats.
                                                 */
1.318     reinoud   468:                        /* Seems ok: check that entry point is not too high */
1.456     maxv      469:                        if (epp->ep_entry >= epp->ep_vm_maxaddr) {
1.322     reinoud   470: #ifdef DIAGNOSTIC
1.329     reinoud   471:                                printf("%s: rejecting %p due to "
1.456     maxv      472:                                    "too high entry address (>= %p)\n",
1.331     christos  473:                                         __func__, (void *)epp->ep_entry,
                    474:                                         (void *)epp->ep_vm_maxaddr);
1.322     reinoud   475: #endif
1.318     reinoud   476:                                error = ENOEXEC;
                    477:                                break;
                    478:                        }
                    479:                        /* Seems ok: check that entry point is not too low */
1.323     reinoud   480:                        if (epp->ep_entry < epp->ep_vm_minaddr) {
1.322     reinoud   481: #ifdef DIAGNOSTIC
1.329     reinoud   482:                                printf("%s: rejecting %p due to "
1.331     christos  483:                                    "too low entry address (< %p)\n",
                    484:                                     __func__, (void *)epp->ep_entry,
                    485:                                     (void *)epp->ep_vm_minaddr);
1.322     reinoud   486: #endif
1.244     dsl       487:                                error = ENOEXEC;
                    488:                                break;
                    489:                        }
                    490:
                    491:                        /* check limits */
                    492:                        if ((epp->ep_tsize > MAXTSIZ) ||
                    493:                            (epp->ep_dsize > (u_quad_t)l->l_proc->p_rlimit
                    494:                                                    [RLIMIT_DATA].rlim_cur)) {
1.322     reinoud   495: #ifdef DIAGNOSTIC
1.323     reinoud   496:                                printf("%s: rejecting due to "
1.331     christos  497:                                    "limits (t=%llu > %llu || d=%llu > %llu)\n",
                    498:                                    __func__,
                    499:                                    (unsigned long long)epp->ep_tsize,
                    500:                                    (unsigned long long)MAXTSIZ,
                    501:                                    (unsigned long long)epp->ep_dsize,
1.332     christos  502:                                    (unsigned long long)
                    503:                                    l->l_proc->p_rlimit[RLIMIT_DATA].rlim_cur);
1.322     reinoud   504: #endif
1.244     dsl       505:                                error = ENOMEM;
                    506:                                break;
                    507:                        }
                                                /* success: vp stays open, referenced and unlocked */
                    508:                        return 0;
                    509:                }
                    510:
1.421     maxv      511:                /*
                    512:                 * Reset all the fields that may have been modified by the
                    513:                 * loader.
                    514:                 */
                    515:                KASSERT(epp->ep_emul_arg == NULL);
1.244     dsl       516:                if (epp->ep_emul_root != NULL) {
                    517:                        vrele(epp->ep_emul_root);
                    518:                        epp->ep_emul_root = NULL;
                    519:                }
                    520:                if (epp->ep_interp != NULL) {
                    521:                        vrele(epp->ep_interp);
                    522:                        epp->ep_interp = NULL;
                    523:                }
1.421     maxv      524:                epp->ep_pax_flags = 0;
1.244     dsl       525:
1.68      cgd       526:                /* make sure the first "interesting" error code is saved. */
1.244     dsl       527:                if (error == ENOEXEC)
1.68      cgd       528:                        error = newerror;
1.124     jdolecek  529:
1.244     dsl       530:                if (epp->ep_flags & EXEC_DESTR)
                    531:                        /* Error from "#!" code, tidied up by recursive call */
1.55      cgd       532:                        return error;
                    533:        }
                    534:
1.249     pooka     535:        /* not found, error */
                    536:
1.55      cgd       537:        /*
                    538:         * free any vmspace-creation commands,
                    539:         * and release their references
                    540:         */
                    541:        kill_vmcmds(&epp->ep_vmcmds);
                    542:
                    543: bad2:
                    544:        /*
1.99      wrstuden  545:         * the vnode is open but unlocked here: re-lock it, close it,
1.55      cgd       546:         * then drop it with vput(), and punt.
                    547:         */
1.99      wrstuden  548:        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1.254     pooka     549:        VOP_CLOSE(vp, FREAD, l->l_cred);
1.99      wrstuden  550:        vput(vp);
1.55      cgd       551:        return error;
                    552:
                    553: bad1:
                    554:        /*
                    555:         * put the vnode, which is still locked and which we
                    556:         * don't yet have open.
                    557:         */
1.77      cgd       558:        vput(vp);                               /* was still locked */
1.55      cgd       559:        return error;
                    560: }
                    561:
                         /*
                          * STACK_PTHREADSPACE is NBPG on machines whose stack grows upward and
                          * 0 everywhere else.
                          * NOTE(review): the name suggests stack space set aside for pthreads;
                          * the code that consumes it is outside this section -- confirm there.
                          */
1.188     chs       562: #ifdef __MACHINE_STACK_GROWS_UP
                    563: #define STACK_PTHREADSPACE NBPG
                    564: #else
                    565: #define STACK_PTHREADSPACE 0
                    566: #endif
                    567:
/*
 * Default element fetcher handed to execve1() by the exec system calls.
 *
 * Copies the pointer stored in slot `index' of `array' into *value with
 * copyin() and returns copyin()'s result (0 on success, an errno otherwise).
 */
static int
execve_fetch_element(char * const *array, size_t index, char **value)
{
	char * const *slot = &array[index];

	return copyin(slot, value, sizeof(char *));
}
                    573:
1.55      cgd       574: /*
                    575:  * exec system call
                    576:  */
1.75      christos  577: int
1.258     dsl       578: sys_execve(struct lwp *l, const struct sys_execve_args *uap, register_t *retval)
1.71      thorpej   579: {
1.258     dsl       580:        /* {
1.138     lukem     581:                syscallarg(const char *)        path;
                    582:                syscallarg(char * const *)      argp;
                    583:                syscallarg(char * const *)      envp;
1.258     dsl       584:        } */
1.204     cube      585:
1.481     christos  586:        return execve1(l, true, SCARG(uap, path), -1, SCARG(uap, argp),
1.204     cube      587:            SCARG(uap, envp), execve_fetch_element);
                    588: }
                    589:
1.376     maxv      590: int
1.317     manu      591: sys_fexecve(struct lwp *l, const struct sys_fexecve_args *uap,
                    592:     register_t *retval)
                    593: {
                    594:        /* {
                    595:                syscallarg(int)                 fd;
                    596:                syscallarg(char * const *)      argp;
                    597:                syscallarg(char * const *)      envp;
                    598:        } */
                    599:
1.481     christos  600:        return execve1(l, false, NULL, SCARG(uap, fd), SCARG(uap, argp),
1.480     christos  601:            SCARG(uap, envp), execve_fetch_element);
1.317     manu      602: }
                    603:
/*
 * Autoload exec modules in the hope that one of them recognises an image
 * we could not run.
 *
 * With no execsw entries registered (nexecs == 0) only the modules likely
 * to be needed for native images are tried; otherwise every module that
 * could conceivably run the binary is tried.  XXX lame
 *
 * Load failures are ignored.  After each successful load the CPU is
 * yielded before moving on to the next name.  Without MODULAR this is a
 * no-op.
 */
static void
exec_autoload(void)
{
#ifdef MODULAR
	static const char * const native[] = {
		"exec_elf32",
		"exec_elf64",
		"exec_script",
		NULL
	};
	static const char * const compat[] = {
		"exec_elf32",
		"exec_elf64",
		"exec_script",
		"exec_aout",
		"exec_coff",
		"exec_ecoff",
		"compat_aoutm68k",
		"compat_netbsd32",
		"compat_sunos",
		"compat_sunos32",
		"compat_ultrix",
		NULL
	};
	char const * const *modp;

	if (nexecs == 0)
		modp = native;
	else
		modp = compat;

	/* Both tables are NULL-terminated. */
	for (; *modp != NULL; modp++) {
		if (module_autoload(*modp, MODULE_CLASS_EXEC) == 0)
			yield();
	}
#endif
}
                    646:
1.470     christos  647: /*
                    648:  * Copy the user or kernel supplied upath to the allocated pathbuffer pbp
                    649:  * making it absolute in the process, by prepending the current working
1.471     wiz       650:  * directory if it is not. If offs is supplied it will contain the offset
1.470     christos  651:  * where the original supplied copy of upath starts.
                          *
                          * seg selects how upath is read: copystr() for UIO_SYSSPACE, copyinstr()
                          * for everything else; either way at most MAXPATHLEN bytes are copied.
                          *
                          * Returns 0 on success with *pbp set to the result of
                          * pathbuf_assimilate() on the buffer allocated here (the buffer is not
                          * released on that path, so the pathbuf presumably owns it from then on).
                          * On failure the buffer is released with PNBUF_PUT(), *pbp and *offs are
                          * left untouched, and the error is returned: the copy error,
                          * ENAMETOOLONG for a relative path that leaves no room for a prefix, or
                          * whatever getcwd_common() reported.
                    652:  */
1.457     christos  653: int
                    654: exec_makepathbuf(struct lwp *l, const char *upath, enum uio_seg seg,
                    655:     struct pathbuf **pbp, size_t *offs)
1.414     christos  656: {
                    657:        char *path, *bp;
1.415     christos  658:        size_t len, tlen;
1.414     christos  659:        int error;
                    660:        struct cwdinfo *cwdi;
                    661:
                    662:        path = PNBUF_GET();
1.457     christos  663:        if (seg == UIO_SYSSPACE) {
                    664:                error = copystr(upath, path, MAXPATHLEN, &len);
                    665:        } else {
                    666:                error = copyinstr(upath, path, MAXPATHLEN, &len);
                    667:        }
1.474     maxv      668:        if (error)
1.472     christos  669:                goto err;
1.414     christos  670:
                                /* Already absolute: use it as is, the caller's string starts at 0. */
1.415     christos  671:        if (path[0] == '/') {
1.457     christos  672:                if (offs)
                    673:                        *offs = 0;
1.414     christos  674:                goto out;
1.415     christos  675:        }
1.414     christos  676:
                                /*
                                 * Relative path: move it to the very end of the buffer and put a
                                 * '/' in front of it, leaving the start of the buffer free for the
                                 * directory.  len is bumped by one first, so one byte more than the
                                 * copy routine reported is moved; that extra byte lands in the last
                                 * slot of the buffer and is overwritten with the terminator below
                                 * (path[tlen - 1]).
                                 */
                    677:        len++;
1.477     maxv      678:        if (len + 1 >= MAXPATHLEN) {
                    679:                error = ENAMETOOLONG;
1.475     christos  680:                goto err;
1.477     maxv      681:        }
1.414     christos  682:        bp = path + MAXPATHLEN - len;
                    683:        memmove(bp, path, len);
                    684:        *(--bp) = '/';
                    685:
                                /*
                                 * Look up the current directory with cwdi_lock held as reader.  bp
                                 * is passed by reference and is treated afterwards as the start of
                                 * the absolute path, i.e. getcwd_common() is expected to have moved
                                 * it back over the directory name it wrote in front of the '/'.
                                 * NOTE(review): the exact meaning of the MAXPATHLEN / 2 argument is
                                 * not visible here -- check getcwd_common().
                                 */
1.435     msaitoh   686:        cwdi = l->l_proc->p_cwdi;
1.414     christos  687:        rw_enter(&cwdi->cwdi_lock, RW_READER);
                    688:        error = getcwd_common(cwdi->cwdi_cdir, NULL, &bp, path, MAXPATHLEN / 2,
                    689:            GETCWD_CHECK_ACCESS, l);
                    690:        rw_exit(&cwdi->cwdi_lock);
                    691:
1.474     maxv      692:        if (error)
1.472     christos  693:                goto err;
                                /* tlen: bytes from the new start (bp) to the end of the buffer. */
1.415     christos  694:        tlen = path + MAXPATHLEN - bp;
1.414     christos  695:
                                /* Slide the assembled path down to the front and terminate it. */
1.415     christos  696:        memmove(path, bp, tlen);
1.473     christos  697:        path[tlen - 1] = '\0';
                                /* Everything before the moved len bytes is the prepended prefix. */
1.457     christos  698:        if (offs)
                    699:                *offs = tlen - len;
1.414     christos  700: out:
1.415     christos  701:        *pbp = pathbuf_assimilate(path);
                    702:        return 0;
1.472     christos  703: err:
                    704:        PNBUF_PUT(path);
                    705:        return error;
1.414     christos  706: }
                    707:
1.436     maxv      708: vaddr_t
                    709: exec_vm_minaddr(vaddr_t va_min)
                    710: {
                    711:        /*
                    712:         * Increase va_min if we don't want NULL to be mappable by the
                    713:         * process.
                    714:         */
1.437     christos  715: #define VM_MIN_GUARD   PAGE_SIZE
1.436     maxv      716:        if (user_va0_disable && (va_min < VM_MIN_GUARD))
                    717:                return VM_MIN_GUARD;
                    718:        return va_min;
                    719: }
                    720:
                         /*
                          * First stage of an exec for lwp l: find and check the image and size
                          * the new stack, recording everything in *data.
                          *
                          * has_path/path select exec by (user-space) path name; otherwise the
                          * image is taken from descriptor fd.  args and envs are the argument and
                          * environment vectors, read through fetch_element by copyinargs().
                          *
                          * Steps, in order: NPROC limit check for PK_SUGID processes, take
                          * p->p_reflock as writer, build the pathbuf, initialise the exec package,
                          * take exec_lock as reader, check_exec(), allocate the argument buffer
                          * from exec_pool, copy in the strings, compute the stack size.
                          *
                          * Returns 0 on success.  In that case nothing is released here:
                          * p->p_reflock (writer) and exec_lock (reader) are still held and the
                          * header, argument buffer and path buffers stay allocated.
                          * NOTE(review): presumably a later stage or execve_free_data() is
                          * responsible for them -- confirm against the callers.
                          *
                          * On failure everything acquired so far is undone and the error is
                          * returned.  An ENOEXEC failure is retried after exec_autoload() as long
                          * as module_gen differs from the value remembered in modgen.
                          */
1.337     martin    721: static int
1.481     christos  722: execve_loadvm(struct lwp *l, bool has_path, const char *path, int fd,
                    723:        char * const *args, char * const *envs,
                    724:        execve_fetch_element_t fetch_element,
1.337     martin    725:        struct execve_data * restrict data)
1.204     cube      726: {
1.378     uebayasi  727:        struct exec_package     * const epp = &data->ed_pack;
1.153     thorpej   728:        int                     error;
1.164     thorpej   729:        struct proc             *p;
1.391     uebayasi  730:        char                    *dp;
1.282     ad        731:        u_int                   modgen;
1.337     martin    732:
                    733:        KASSERT(data != NULL);
1.55      cgd       734:
1.237     ad        735:        p = l->l_proc;
                                /* Set once, before the retry label, so it survives retries. */
1.376     maxv      736:        modgen = 0;
1.164     thorpej   737:
1.418     christos  738:        SDT_PROBE(proc, kernel, , exec, path, 0, 0, 0, 0);
1.294     darran    739:
1.149     christos  740:        /*
1.269     christos  741:         * Check if we have exceeded our number of processes limit.
                    742:         * This is so that we handle the case where a root daemon
                    743:         * forked, ran setuid to become the desired user and is trying
                    744:         * to exec. The obvious place to do the reference counting check
                    745:         * is setuid(), but we don't do the reference counting check there
                    746:         * like other OS's do because then all the programs that use setuid()
                    747:         * must be modified to check the return code of setuid() and exit().
                    748:         * It is dangerous to make setuid() fail, because it fails open and
                    749:         * the program will continue to run as root. If we make it succeed
                    750:         * and return an error code, again we are not enforcing the limit.
                    751:         * The best place to enforce the limit is here, when the process tries
                    752:         * to execute a new image, because eventually the process will need
                    753:         * to call exec in order to do something useful.
                    754:         */
1.282     ad        755:  retry:
1.347     elad      756:        if (p->p_flag & PK_SUGID) {
                    757:                if (kauth_authorize_process(l->l_cred, KAUTH_PROCESS_RLIMIT,
                    758:                     p, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS),
                    759:                     &p->p_rlimit[RLIMIT_NPROC],
                    760:                     KAUTH_ARG(RLIMIT_NPROC)) != 0 &&
                    761:                    chgproccnt(kauth_cred_getuid(l->l_cred), 0) >
                    762:                     p->p_rlimit[RLIMIT_NPROC].rlim_cur)
                                /* No lock is held yet, so a plain return is enough. */
1.269     christos  763:                return EAGAIN;
1.347     elad      764:        }
1.269     christos  765:
                    766:        /*
1.352     rmind     767:         * Drain existing references and forbid new ones.  The process
                    768:         * should be left alone until we're done here.  This is necessary
                    769:         * to avoid race conditions - e.g. in ptrace() - that might allow
                    770:         * a local user to illicitly obtain elevated privileges.
                    771:         */
                    772:        rw_enter(&p->p_reflock, RW_WRITER);
                    773:
1.481     christos  774:        if (has_path) {
1.480     christos  775:                size_t  offs;
                    776:                /*
                    777:                 * Init the namei data to point the file user's program name.
                    778:                 * This is done here rather than in check_exec(), so that it's
                    779:                 * possible to override these settings if any of makecmd/probe
                    780:                 * functions call check_exec() recursively - for example,
                    781:                 * see exec_script_makecmds().
                    782:                 */
                    783:                if ((error = exec_makepathbuf(l, path, UIO_USERSPACE,
                    784:                    &data->ed_pathbuf, &offs)) != 0)
                    785:                        goto clrflg;
                    786:                data->ed_pathstring = pathbuf_stringcopy_get(data->ed_pathbuf);
                                        /* ep_kname: the name as the caller supplied it. */
                    787:                epp->ep_kname = data->ed_pathstring + offs;
                    788:                data->ed_resolvedname = PNBUF_GET();
                    789:                epp->ep_resolvedname = data->ed_resolvedname;
                    790:                epp->ep_xfd = -1;
1.481     christos  791:        } else {
                                        /*
                                         * No path name (descriptor based exec): use "/" as a
                                         * placeholder pathbuf, a fixed kernel name, no resolved
                                         * name buffer, and record the descriptor in ep_xfd.
                                         */
                    792:                data->ed_pathbuf = pathbuf_assimilate(strcpy(PNBUF_GET(), "/"));
                    793:                data->ed_pathstring = pathbuf_stringcopy_get(data->ed_pathbuf);
                    794:                epp->ep_kname = "*fexecve*";
                    795:                data->ed_resolvedname = NULL;
                    796:                epp->ep_resolvedname = NULL;
                    797:                epp->ep_xfd = fd;
1.480     christos  798:        }
                    799:
1.55      cgd       800:
                    801:        /*
                    802:         * initialize the fields of the exec package.
                    803:         */
1.378     uebayasi  804:        epp->ep_hdr = kmem_alloc(exec_maxhdrsz, KM_SLEEP);
                    805:        epp->ep_hdrlen = exec_maxhdrsz;
                    806:        epp->ep_hdrvalid = 0;
                    807:        epp->ep_emul_arg = NULL;
                    808:        epp->ep_emul_arg_free = NULL;
                    809:        memset(&epp->ep_vmcmds, 0, sizeof(epp->ep_vmcmds));
                    810:        epp->ep_vap = &data->ed_attr;
1.411     christos  811:        epp->ep_flags = (p->p_flag & PK_32) ? EXEC_FROM32 : 0;
1.378     uebayasi  812:        MD_TOPDOWN_INIT(epp);
                    813:        epp->ep_emul_root = NULL;
                    814:        epp->ep_interp = NULL;
                    815:        epp->ep_esch = NULL;
                    816:        epp->ep_pax_flags = 0;
                    817:        memset(epp->ep_machine_arch, 0, sizeof(epp->ep_machine_arch));
1.55      cgd       818:
1.237     ad        819:        rw_enter(&exec_lock, RW_READER);
1.130     jdolecek  820:
1.55      cgd       821:        /* see if we can run it. */
1.480     christos  822:        if ((error = check_exec(l, epp, data->ed_pathbuf,
                    823:            &data->ed_resolvedname)) != 0) {
                                        /* ENOENT, EACCES and ENOEXEC are not worth a debug message. */
1.454     christos  824:                if (error != ENOENT && error != EACCES && error != ENOEXEC) {
1.447     martin    825:                        DPRINTF(("%s: check exec failed for %s, error %d\n",
                    826:                            __func__, epp->ep_kname, error));
1.261     xtraeme   827:                }
1.352     rmind     828:                goto freehdr;
1.248     christos  829:        }
1.55      cgd       830:
                    831:        /* allocate an argument buffer */
1.337     martin    832:        data->ed_argp = pool_get(&exec_pool, PR_WAITOK);
                    833:        KASSERT(data->ed_argp != NULL);
                    834:        dp = data->ed_argp;
1.55      cgd       835:
                                /* dp is advanced by copyinargs(); it is used below to size the strings. */
1.391     uebayasi  836:        if ((error = copyinargs(data, args, envs, fetch_element, &dp)) != 0) {
1.55      cgd       837:                goto bad;
                    838:        }
1.61      mycroft   839:
1.379     uebayasi  840:        /*
                    841:         * Calculate the new stack size.
                    842:         */
                    843:
1.267     dsl       844: #ifdef __MACHINE_STACK_GROWS_UP
1.386     uebayasi  845: /*
                    846:  * copyargs() fills argc/argv/envp from the lower address even on
                    847:  * __MACHINE_STACK_GROWS_UP machines.  Reserve a few words just below the SP
                    848:  * so that _rtld() can use it.
                    849:  */
1.267     dsl       850: #define        RTLD_GAP        32
                    851: #else
                    852: #define        RTLD_GAP        0
                    853: #endif
                    854:
                                /* Bytes of argument/environment strings, rounded up with ALIGN(). */
1.396     uebayasi  855:        const size_t argenvstrlen = (char *)ALIGN(dp) - data->ed_argp;
1.386     uebayasi  856:
1.396     uebayasi  857:        data->ed_argslen = calcargs(data, argenvstrlen);
1.386     uebayasi  858:
1.430     christos  859:        const size_t len = calcstack(data, pax_aslr_stack_gap(epp) + RTLD_GAP);
1.55      cgd       860:
1.396     uebayasi  861:        if (len > epp->ep_ssize) {
1.337     martin    862:                /* in effect, compare to initial limit */
1.396     uebayasi  863:                DPRINTF(("%s: stack limit exceeded %zu\n", __func__, len));
1.403     maxv      864:                error = ENOMEM;
1.55      cgd       865:                goto bad;
                    866:        }
1.337     martin    867:        /* adjust "active stack depth" for process VSZ */
1.396     uebayasi  868:        epp->ep_ssize = len;
1.337     martin    869:
                                /* Success: p->p_reflock and exec_lock are intentionally still held. */
                    870:        return 0;
                    871:
1.352     rmind     872:  bad:
                    873:        /* free the vmspace-creation commands, and release their references */
1.378     uebayasi  874:        kill_vmcmds(&epp->ep_vmcmds);
1.352     rmind     875:        /* kill any opened file descriptor, if necessary */
1.378     uebayasi  876:        if (epp->ep_flags & EXEC_HASFD) {
                    877:                epp->ep_flags &= ~EXEC_HASFD;
                    878:                fd_close(epp->ep_fd);
1.352     rmind     879:        }
                    880:        /* close and put the exec'd file */
1.378     uebayasi  881:        vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY);
                    882:        VOP_CLOSE(epp->ep_vp, FREAD, l->l_cred);
                    883:        vput(epp->ep_vp);
1.352     rmind     884:        pool_put(&exec_pool, data->ed_argp);
                    885:
                    886:  freehdr:
                                /*
                                 * check_exec() failures jump straight here, skipping the vnode and
                                 * argument-buffer cleanup above.
                                 */
1.378     uebayasi  887:        kmem_free(epp->ep_hdr, epp->ep_hdrlen);
                    888:        if (epp->ep_emul_root != NULL)
                    889:                vrele(epp->ep_emul_root);
                    890:        if (epp->ep_interp != NULL)
                    891:                vrele(epp->ep_interp);
1.352     rmind     892:
1.337     martin    893:        rw_exit(&exec_lock);
1.352     rmind     894:
1.479     christos  895:        exec_path_free(data);
1.352     rmind     896:
                    897:  clrflg:
                                /* exec_makepathbuf() failures enter here: only p_reflock is held. */
1.351     rmind     898:        rw_exit(&p->p_reflock);
1.337     martin    899:
                                /*
                                 * Unrecognised image and module_gen differs from what we remembered:
                                 * remember the new generation, autoload exec modules and start over.
                                 * Once modgen matches module_gen the ENOEXEC is final.
                                 */
                    900:        if (modgen != module_gen && error == ENOEXEC) {
                    901:                modgen = module_gen;
                    902:                exec_autoload();
                    903:                goto retry;
                    904:        }
                    905:
1.418     christos  906:        SDT_PROBE(proc, kernel, , exec__failure, error, 0, 0, 0, 0);
1.337     martin    907:        return error;
                    908: }
                    909:
                         /*
                          * Run the vmspace-creation commands collected in data->ed_pack.
                          *
                          * The image vnode is first recorded as p->p_textvp (with its own
                          * reference, after dropping any previous one).  The commands are then
                          * executed in order until one fails.  Whether or not that happens, the
                          * command list is freed and the image vnode is closed and vput().
                          *
                          * Returns 0 if every command succeeded, otherwise the error returned by
                          * the first failing command.
                          */
1.401     uebayasi  910: static int
                    911: execve_dovmcmds(struct lwp *l, struct execve_data * restrict data)
                    912: {
                    913:        struct exec_package     * const epp = &data->ed_pack;
                    914:        struct proc             *p = l->l_proc;
                    915:        struct exec_vmcmd       *base_vcp;
                    916:        int                     error = 0;
1.407     riastrad  917:        size_t                  i;
1.401     uebayasi  918:
                    919:        /* record proc's vnode, for use by procfs and others */
                    920:        if (p->p_textvp)
                    921:                vrele(p->p_textvp);
                    922:        vref(epp->ep_vp);
                    923:        p->p_textvp = epp->ep_vp;
                    924:
                    925:        /* create the new process's VM space by running the vmcmds */
                    926:        KASSERTMSG(epp->ep_vmcmds.evs_used != 0, "%s: no vmcmds", __func__);
                    927:
1.428     christos  928: #ifdef TRACE_EXEC
1.401     uebayasi  929:        DUMPVMCMDS(epp, 0, 0);
1.428     christos  930: #endif
1.401     uebayasi  931:
                                /* Most recent command flagged VMCMD_BASE; none seen yet. */
                    932:        base_vcp = NULL;
                    933:
                    934:        for (i = 0; i < epp->ep_vmcmds.evs_used && !error; i++) {
                    935:                struct exec_vmcmd *vcp;
                    936:
                    937:                vcp = &epp->ep_vmcmds.evs_cmds[i];
                                        /*
                                         * A relative command's address is an offset from the address
                                         * of the last base command; make it absolute before running.
                                         */
                    938:                if (vcp->ev_flags & VMCMD_RELATIVE) {
                    939:                        KASSERTMSG(base_vcp != NULL,
                    940:                            "%s: relative vmcmd with no base", __func__);
                    941:                        KASSERTMSG((vcp->ev_flags & VMCMD_BASE) == 0,
                    942:                            "%s: illegal base & relative vmcmd", __func__);
                    943:                        vcp->ev_addr += base_vcp->ev_addr;
                    944:                }
                    945:                error = (*vcp->ev_proc)(l, vcp);
                    946:                if (error)
                    947:                        DUMPVMCMDS(epp, i, error);
                                        /* Updated after the command ran, even if it failed. */
                    948:                if (vcp->ev_flags & VMCMD_BASE)
                    949:                        base_vcp = vcp;
                    950:        }
                    951:
                    952:        /* free the vmspace-creation commands, and release their references */
                    953:        kill_vmcmds(&epp->ep_vmcmds);
                    954:
                                /* Close and release the image vnode; p_textvp keeps its own reference. */
                    955:        vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY);
                    956:        VOP_CLOSE(epp->ep_vp, FREAD, l->l_cred);
                    957:        vput(epp->ep_vp);
                    958:
                    959:        /* if an error happened, deallocate and punt */
                    960:        if (error != 0) {
                                        /*
                                         * The loop increment ran once more after the failing command,
                                         * so its index is i - 1.
                                         */
                    961:                DPRINTF(("%s: vmcmd %zu failed: %d\n", __func__, i - 1, error));
                    962:        }
                    963:        return error;
                    964: }
                    965:
1.352     rmind     966: static void
                    967: execve_free_data(struct execve_data *data)
                    968: {
1.378     uebayasi  969:        struct exec_package     * const epp = &data->ed_pack;
1.352     rmind     970:
                    971:        /* free the vmspace-creation commands, and release their references */
1.378     uebayasi  972:        kill_vmcmds(&epp->ep_vmcmds);
1.352     rmind     973:        /* kill any opened file descriptor, if necessary */
1.378     uebayasi  974:        if (epp->ep_flags & EXEC_HASFD) {
                    975:                epp->ep_flags &= ~EXEC_HASFD;
                    976:                fd_close(epp->ep_fd);
1.352     rmind     977:        }
                    978:
                    979:        /* close and put the exec'd file */
1.378     uebayasi  980:        vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY);
                    981:        VOP_CLOSE(epp->ep_vp, FREAD, curlwp->l_cred);
                    982:        vput(epp->ep_vp);
1.352     rmind     983:        pool_put(&exec_pool, data->ed_argp);
                    984:
1.378     uebayasi  985:        kmem_free(epp->ep_hdr, epp->ep_hdrlen);
                    986:        if (epp->ep_emul_root != NULL)
                    987:                vrele(epp->ep_emul_root);
                    988:        if (epp->ep_interp != NULL)
                    989:                vrele(epp->ep_interp);
1.352     rmind     990:
1.479     christos  991:        exec_path_free(data);
1.352     rmind     992: }
                    993:
1.400     uebayasi  994: static void
1.450     christos  995: pathexec(struct proc *p, const char *resolvedname)
1.400     uebayasi  996: {
1.480     christos  997:        /* set command name & other accounting info */
                    998:        const char *cmdname;
1.400     uebayasi  999:
1.480     christos 1000:        if (resolvedname == NULL) {
                   1001:                cmdname = "*fexecve*";
                   1002:                resolvedname = "/";
                   1003:        } else {
                   1004:                cmdname = strrchr(resolvedname, '/') + 1;
                   1005:        }
                   1006:        KASSERTMSG(resolvedname[0] == '/', "bad resolvedname `%s'",
                   1007:            resolvedname);
                   1008:
                   1009:        strlcpy(p->p_comm, cmdname, sizeof(p->p_comm));
1.400     uebayasi 1010:
1.450     christos 1011:        kmem_strfree(p->p_path);
                   1012:        p->p_path = kmem_strdupsize(resolvedname, NULL, KM_SLEEP);
1.400     uebayasi 1013: }
                   1014:
1.387     uebayasi 1015: /* XXX elsewhere */
                         /*
                          * Adjust the credentials of lwp l (and of its process) for the image
                          * described by attr.
                          *
                          * If the process is not traced and the image's set-user-ID or
                          * set-group-ID bit would change the effective uid or gid, the process is
                          * marked SUGID, descriptors 0..2 are checked with fd_checkstd(), the
                          * credential is copied and the effective ids are set from attr.
                          * Otherwise PK_SUGID is cleared, provided the effective and real ids
                          * already match.  In both cases the saved ids are then brought in line
                          * with the effective ids and p->p_cred is updated if l->l_cred changed.
                          *
                          * Returns 0, or the error from fd_checkstd(); in the latter case the
                          * process has already been marked SUGID but no credential was modified.
                          */
                   1016: static int
                   1017: credexec(struct lwp *l, struct vattr *attr)
                   1018: {
                   1019:        struct proc *p = l->l_proc;
                   1020:        int error;
                   1021:
                   1022:        /*
                   1023:         * Deal with set[ug]id.  MNT_NOSUID has already been used to disable
                   1024:         * s[ug]id.  It's OK to check for PSL_TRACED here as we have blocked
                   1025:         * out additional references on the process for the moment.
                   1026:         */
                   1027:        if ((p->p_slflag & PSL_TRACED) == 0 &&
                   1028:
                   1029:            (((attr->va_mode & S_ISUID) != 0 &&
                   1030:              kauth_cred_geteuid(l->l_cred) != attr->va_uid) ||
                   1031:
                   1032:             ((attr->va_mode & S_ISGID) != 0 &&
                   1033:              kauth_cred_getegid(l->l_cred) != attr->va_gid))) {
                   1034:                /*
                   1035:                 * Mark the process as SUGID before we do
                   1036:                 * anything that might block.
                   1037:                 */
                   1038:                proc_crmod_enter();
                   1039:                proc_crmod_leave(NULL, NULL, true);
                   1040:
                   1041:                /* Make sure file descriptors 0..2 are in use. */
                   1042:                if ((error = fd_checkstd()) != 0) {
                   1043:                        DPRINTF(("%s: fdcheckstd failed %d\n",
                   1044:                            __func__, error));
                   1045:                        return error;
                   1046:                }
                   1047:
                   1048:                /*
                   1049:                 * Copy the credential so other references don't see our
                   1050:                 * changes.
                   1051:                 */
                   1052:                l->l_cred = kauth_cred_copy(l->l_cred);
                   1053: #ifdef KTRACE
                   1054:                /*
                   1055:                 * If the persistent trace flag isn't set, turn off.
                   1056:                 */
                   1057:                if (p->p_tracep) {
                   1058:                        mutex_enter(&ktrace_lock);
                   1059:                        if (!(p->p_traceflag & KTRFAC_PERSISTENT))
                   1060:                                ktrderef(p);
                   1061:                        mutex_exit(&ktrace_lock);
                   1062:                }
                   1063: #endif
                                        /* Take on the file owner's uid and/or gid as effective ids. */
                   1064:                if (attr->va_mode & S_ISUID)
                   1065:                        kauth_cred_seteuid(l->l_cred, attr->va_uid);
                   1066:                if (attr->va_mode & S_ISGID)
                   1067:                        kauth_cred_setegid(l->l_cred, attr->va_gid);
                   1068:        } else {
                                        /*
                                         * No id change: the SUGID mark is dropped only when the
                                         * effective ids equal the real ids.
                                         */
                   1069:                if (kauth_cred_geteuid(l->l_cred) ==
                   1070:                    kauth_cred_getuid(l->l_cred) &&
                   1071:                    kauth_cred_getegid(l->l_cred) ==
                   1072:                    kauth_cred_getgid(l->l_cred))
                   1073:                        p->p_flag &= ~PK_SUGID;
                   1074:        }
                   1075:
                   1076:        /*
                   1077:         * Copy the credential so other references don't see our changes.
                   1078:         * Test to see if this is necessary first, since in the common case
                   1079:         * we won't need a private reference.
                   1080:         */
                   1081:        if (kauth_cred_geteuid(l->l_cred) != kauth_cred_getsvuid(l->l_cred) ||
                   1082:            kauth_cred_getegid(l->l_cred) != kauth_cred_getsvgid(l->l_cred)) {
                   1083:                l->l_cred = kauth_cred_copy(l->l_cred);
                                        /* Saved ids follow the (possibly just changed) effective ids. */
                   1084:                kauth_cred_setsvuid(l->l_cred, kauth_cred_geteuid(l->l_cred));
                   1085:                kauth_cred_setsvgid(l->l_cred, kauth_cred_getegid(l->l_cred));
                   1086:        }
                   1087:
                   1088:        /* Update the master credentials. */
                   1089:        if (l->l_cred != p->p_cred) {
                   1090:                kauth_cred_t ocred;
                   1091:
                                        /*
                                         * Take a reference for the process, swap the pointer under
                                         * p_lock, and free the old credential only after the lock
                                         * has been dropped.
                                         */
                   1092:                kauth_cred_hold(l->l_cred);
                   1093:                mutex_enter(p->p_lock);
                   1094:                ocred = p->p_cred;
                   1095:                p->p_cred = l->l_cred;
                   1096:                mutex_exit(p->p_lock);
                   1097:                kauth_cred_free(ocred);
                   1098:        }
                   1099:
                   1100:        return 0;
                   1101: }
                   1102:
1.406     uebayasi 1103: static void
                   1104: emulexec(struct lwp *l, struct exec_package *epp)
                   1105: {
                                /*
                                 * Switch l's process over to the emulation chosen for the new
                                 * image (epp->ep_esch->es_emul):
                                 *  - install epp->ep_emul_root as the emulation root directory,
                                 *  - release the interpreter vnode, if any,
                                 *  - run the old emulation's exit hook when the emulation changes,
                                 *  - renumber the calling LWP as LWP 1,
                                 *  - run the new emulation's exec hook,
                                 *  - record the new emulation and exec switch entry in the proc.
                                 */
                   1106:        struct proc             *p = l->l_proc;
                   1107:
                   1108:        /* The emulation root will usually have been found when we looked
                   1109:         * for the elf interpreter (or similar), if not look now. */
                   1110:        if (epp->ep_esch->es_emul->e_path != NULL &&
                   1111:            epp->ep_emul_root == NULL)
                   1112:                emul_find_root(l, epp);
                   1113:
                   1114:        /* Any old emulation root got removed by fdcloseexec */
                   1115:        rw_enter(&p->p_cwdi->cwdi_lock, RW_WRITER);
                   1116:        p->p_cwdi->cwdi_edir = epp->ep_emul_root;
                   1117:        rw_exit(&p->p_cwdi->cwdi_lock);
                                /*
                                 * NOTE(review): the pointer is cleared without a vrele(), so the
                                 * vnode reference presumably now belongs to cwdi_edir -- confirm.
                                 */
                   1118:        epp->ep_emul_root = NULL;
                   1119:        if (epp->ep_interp != NULL)
                   1120:                vrele(epp->ep_interp);
                   1121:
                   1122:        /*
                   1123:         * Call emulation specific exec hook. This can setup per-process
                   1124:         * p->p_emuldata or do any other per-process stuff an emulation needs.
                   1125:         *
                   1126:         * If we are executing a process of a different emulation than the
                   1127:         * original forked process, call e_proc_exit() of the old emulation
                   1128:         * first, then e_proc_exec() of new emulation. If the emulation is
                   1129:         * same, the exec hook code should deallocate any old emulation
                   1130:         * resources held previously by this process.
                   1131:         */
                   1132:        if (p->p_emul && p->p_emul->e_proc_exit
                   1133:            && p->p_emul != epp->ep_esch->es_emul)
                   1134:                (*p->p_emul->e_proc_exit)(p);
                   1135:
                   1136:        /*
                   1137:         * This is now LWP 1.
                   1138:         */
                   1139:        /* XXX elsewhere */
                   1140:        mutex_enter(p->p_lock);
                   1141:        p->p_nlwpid = 1;
                   1142:        l->l_lid = 1;
                   1143:        mutex_exit(p->p_lock);
                   1144:
                   1145:        /*
                   1146:         * Call exec hook. Emulation code may NOT keep a reference to
                   1147:         * anything from *epp.
                   1148:         */
                   1149:        if (epp->ep_esch->es_emul->e_proc_exec)
                   1150:                (*epp->ep_esch->es_emul->e_proc_exec)(p, epp);
                   1151:
                   1152:        /* update p_emul, the old value is no longer needed */
                   1153:        p->p_emul = epp->ep_esch->es_emul;
                   1154:
                   1155:        /* ...and the same for p_execsw */
                   1156:        p->p_execsw = epp->ep_esch;
                   1157:
                                /* The syscall hook is taken from the emulation just installed. */
                   1158: #ifdef __HAVE_SYSCALL_INTERN
                   1159:        (*p->p_emul->e_syscall_intern)(p);
                   1160: #endif
                   1161:        ktremul();
                   1162: }
                   1163:
1.337     martin   1164: static int
1.348     martin   1165: execve_runproc(struct lwp *l, struct execve_data * restrict data,
                   1166:        bool no_local_exec_lock, bool is_spawn)
1.337     martin   1167: {
                                /*
                                 * Commit the exec described by data->ed_pack for LWP l: discard
                                 * the old process state (other LWPs, lwpctl, POSIX timers, address
                                 * space), map and populate the new image, then switch emulation
                                 * via emulexec().
                                 *
                                 * no_local_exec_lock: when true this LWP does not hold exec_lock
                                 *      itself, so it is not released here.  Asserted below to be
                                 *      used only together with is_spawn.
                                 * is_spawn: selects uvmspace_spawn() instead of uvmspace_exec(),
                                 *      suppresses the TRAP_EXEC debugger event, and on failure
                                 *      makes this function return the error instead of calling
                                 *      exit1().
                                 *
                                 * p->p_reflock is released on both the success and failure paths.
                                 * NOTE(review): it is presumably acquired by execve_loadvm(); that
                                 * is not visible in this function -- confirm.
                                 *
                                 * Returns EJUSTRETURN on success.  On failure the new address
                                 * space and the exec resources are torn down at exec_abort.
                                 */
1.378     uebayasi 1168:        struct exec_package     * const epp = &data->ed_pack;
1.352     rmind    1169:        int error = 0;
                   1170:        struct proc             *p;
1.337     martin   1171:
1.348     martin   1172:        /*
                   1173:         * In case of a posix_spawn operation, the child doing the exec
                   1174:         * might not hold the reader lock on exec_lock, but the parent
                   1175:         * will do this instead.
                   1176:         */
                   1177:        KASSERT(no_local_exec_lock || rw_lock_held(&exec_lock));
1.381     uebayasi 1178:        KASSERT(!no_local_exec_lock || is_spawn);
1.337     martin   1179:        KASSERT(data != NULL);
1.352     rmind    1180:
                   1181:        p = l->l_proc;
1.337     martin   1182:
1.237     ad       1183:        /* Get rid of other LWPs. */
1.340     rmind    1184:        if (p->p_nlwps > 1) {
1.272     ad       1185:                mutex_enter(p->p_lock);
1.237     ad       1186:                exit_lwps(l);
1.272     ad       1187:                mutex_exit(p->p_lock);
1.237     ad       1188:        }
1.164     thorpej  1189:        KDASSERT(p->p_nlwps == 1);
                   1190:
1.253     ad       1191:        /* Destroy any lwpctl info. */
                   1192:        if (p->p_lwpctl != NULL)
                   1193:                lwp_ctl_exit();
                   1194:
1.164     thorpej  1195:        /* Remove POSIX timers */
                   1196:        timers_free(p, TIMERS_POSIX);
                   1197:
1.417     maxv     1198:        /* Set the PaX flags. */
1.431     christos 1199:        pax_set_flags(epp, p);
1.417     maxv     1200:
1.86      thorpej  1201:        /*
                   1202:         * Do whatever is necessary to prepare the address space
                   1203:         * for remapping.  Note that this might replace the current
                   1204:         * vmspace with another!
                   1205:         */
1.348     martin   1206:        if (is_spawn)
1.378     uebayasi 1207:                uvmspace_spawn(l, epp->ep_vm_minaddr,
                   1208:                    epp->ep_vm_maxaddr,
                   1209:                    epp->ep_flags & EXEC_TOPDOWN_VM);
1.348     martin   1210:        else
1.378     uebayasi 1211:                uvmspace_exec(l, epp->ep_vm_minaddr,
                   1212:                    epp->ep_vm_maxaddr,
                   1213:                    epp->ep_flags & EXEC_TOPDOWN_VM);
1.55      cgd      1214:
                                /*
                                 * Read p_vmspace only now, after the call above that may have
                                 * replaced it, and record the new image's layout in it.
                                 */
1.385     uebayasi 1215:        struct vmspace          *vm;
1.86      thorpej  1216:        vm = p->p_vmspace;
1.378     uebayasi 1217:        vm->vm_taddr = (void *)epp->ep_taddr;
                   1218:        vm->vm_tsize = btoc(epp->ep_tsize);
                   1219:        vm->vm_daddr = (void*)epp->ep_daddr;
                   1220:        vm->vm_dsize = btoc(epp->ep_dsize);
                   1221:        vm->vm_ssize = btoc(epp->ep_ssize);
1.288     mrg      1222:        vm->vm_issize = 0;
1.378     uebayasi 1223:        vm->vm_maxsaddr = (void *)epp->ep_maxsaddr;
                   1224:        vm->vm_minsaddr = (void *)epp->ep_minsaddr;
1.55      cgd      1225:
1.424     khorben  1226:        pax_aslr_init_vm(l, vm, epp);
1.260     christos 1227:
1.401     uebayasi 1228:        /* Now map address space. */
                   1229:        error = execve_dovmcmds(l, data);
                   1230:        if (error != 0)
1.55      cgd      1231:                goto exec_abort;
                   1232:
1.452     christos 1233:        pathexec(p, epp->ep_resolvedname);
1.255     christos 1234:
1.397     uebayasi 1235:        char * const newstack = STACK_GROW(vm->vm_minsaddr, epp->ep_ssize);
1.386     uebayasi 1236:
1.399     uebayasi 1237:        error = copyoutargs(data, l, newstack);
1.398     uebayasi 1238:        if (error != 0)
1.55      cgd      1239:                goto exec_abort;
1.109     simonb   1240:
1.307     pooka    1241:        cwdexec(p);
1.270     ad       1242:        fd_closeexec();         /* handle close on exec */
1.315     alnsn    1243:
                   1244:        if (__predict_false(ktrace_on))
                   1245:                fd_ktrexecfd();
                   1246:
1.438     kamil    1247:        execsigs(p);            /* reset caught signals */
1.183     junyoung 1248:
1.380     uebayasi 1249:        mutex_enter(p->p_lock);
1.164     thorpej  1250:        l->l_ctxlink = NULL;    /* reset ucontext link */
1.55      cgd      1251:        p->p_acflag &= ~AFORK;
1.238     pavel    1252:        p->p_flag |= PK_EXEC;
1.272     ad       1253:        mutex_exit(p->p_lock);
1.237     ad       1254:
                   1255:        /*
                   1256:         * Stop profiling.
                   1257:         */
                   1258:        if ((p->p_stflag & PST_PROFIL) != 0) {
                   1259:                mutex_spin_enter(&p->p_stmutex);
                   1260:                stopprofclock(p);
                   1261:                mutex_spin_exit(&p->p_stmutex);
                   1262:        }
                   1263:
                   1264:        /*
1.275     ad       1265:         * It's OK to test PL_PPWAIT unlocked here, as other LWPs have
1.237     ad       1266:         * exited and exec()/exit() are the only places it will be cleared.
                   1267:         */
1.275     ad       1268:        if ((p->p_lflag & PL_PPWAIT) != 0) {
1.467     kamil    1269:                lwp_t *lp;
                   1270:
                                        /*
                                         * Under proc_lock: detach from the LWP recorded in
                                         * p_vforklwp, clear its l_vforkwaiting flag and wake
                                         * whoever sleeps on its l_waitcv.
                                         */
1.354     christos 1271:                mutex_enter(proc_lock);
1.467     kamil    1272:                lp = p->p_vforklwp;
                   1273:                p->p_vforklwp = NULL;
                   1274:
1.354     christos 1275:                l->l_lwpctl = NULL; /* was on loan from blocked parent */
                   1276:                p->p_lflag &= ~PL_PPWAIT;
1.467     kamil    1277:                lp->l_vforkwaiting = false;
                   1278:
                   1279:                cv_broadcast(&lp->l_waitcv);
1.354     christos 1280:                mutex_exit(proc_lock);
1.55      cgd      1281:        }
                   1282:
1.387     uebayasi 1283:        error = credexec(l, &data->ed_attr);
                   1284:        if (error)
                   1285:                goto exec_abort;
1.221     ad       1286:
1.155     gmcgarry 1287: #if defined(__HAVE_RAS)
                   1288:        /*
                   1289:         * Remove all RASs from the address space.
                   1290:         */
1.251     ad       1291:        ras_purgeall();
1.155     gmcgarry 1292: #endif
1.107     fvdl     1293:
                   1294:        doexechooks(p);
1.55      cgd      1295:
1.390     uebayasi 1296:        /*
                   1297:         * Set initial SP at the top of the stack.
                   1298:         *
                   1299:         * Note that on machines where stack grows up (e.g. hppa), SP points to
                   1300:         * the end of arg/env strings.  Userland guesses the address of argc
                   1301:         * via ps_strings::ps_argvstr.
                   1302:         */
                   1303:
                   1304:        /* Setup new registers and do misc. setup. */
1.397     uebayasi 1305:        (*epp->ep_esch->es_emul->e_setregs)(l, epp, (vaddr_t)newstack);
1.378     uebayasi 1306:        if (epp->ep_esch->es_setregs)
1.397     uebayasi 1307:                (*epp->ep_esch->es_setregs)(l, epp, (vaddr_t)newstack);
1.55      cgd      1308:
1.309     joerg    1309:        /* Provide a consistent LWP private setting */
                   1310:        (void)lwp_setprivate(l, NULL);
                   1311:
1.316     matt     1312:        /* Discard all PCU state; need to start fresh */
                   1313:        pcu_discard_all(l);
                   1314:
1.171     chs      1315:        /* map the process's signal trampoline code */
1.378     uebayasi 1316:        if ((error = exec_sigcode_map(p, epp->ep_esch->es_emul)) != 0) {
1.312     christos 1317:                DPRINTF(("%s: map sigcode failed %d\n", __func__, error));
1.171     chs      1318:                goto exec_abort;
1.209     christos 1319:        }
1.171     chs      1320:
                                /*
                                 * Nothing below jumps to exec_abort, so ed_argp and ep_hdr are
                                 * released exactly once: here on success, or at exec_abort.
                                 */
1.337     martin   1321:        pool_put(&exec_pool, data->ed_argp);
1.276     ad       1322:
                   1323:        /* notify others that we exec'd */
                   1324:        KNOTE(&p->p_klist, NOTE_EXEC);
                   1325:
1.378     uebayasi 1326:        kmem_free(epp->ep_hdr, epp->ep_hdrlen);
1.122     jdolecek 1327:
1.418     christos 1328:        SDT_PROBE(proc, kernel, , exec__success, epp->ep_kname, 0, 0, 0, 0);
1.294     darran   1329:
1.406     uebayasi 1330:        emulexec(l, epp);
1.85      mycroft  1331:
1.252     ad       1332:        /* Allow new references from the debugger/procfs. */
1.341     martin   1333:        rw_exit(&p->p_reflock);
1.348     martin   1334:        if (!no_local_exec_lock)
                   1335:                rw_exit(&exec_lock);
1.162     manu     1336:
1.271     ad       1337:        mutex_enter(proc_lock);
1.237     ad       1338:
1.466     kamil    1339:        /* posix_spawn(3) reports a single event with implied exec(3) */
                   1340:        if ((p->p_slflag & PSL_TRACED) && !is_spawn) {
                                        /*
                                         * NOTE(review): eventswitch() is entered with proc_lock
                                         * and p_lock held, and proc_lock is taken again right
                                         * after it, so it presumably returns with both locks
                                         * dropped -- confirm against eventswitch().
                                         */
1.459     kamil    1341:                mutex_enter(p->p_lock);
1.482   ! kamil    1342:                eventswitch(TRAP_EXEC, 0, 0);
1.459     kamil    1343:                mutex_enter(proc_lock);
1.237     ad       1344:        }
1.162     manu     1345:
                                /*
                                 * PS_STOPEXEC: mark the process and this LWP stopped, then
                                 * switch away via mi_switch().  proc_lock is released on both
                                 * branches.
                                 */
1.237     ad       1346:        if (p->p_sflag & PS_STOPEXEC) {
1.383     uebayasi 1347:                ksiginfoq_t kq;
                   1348:
1.237     ad       1349:                KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
1.175     dsl      1350:                p->p_pptr->p_nstopchild++;
1.419     pgoyette 1351:                p->p_waited = 0;
1.272     ad       1352:                mutex_enter(p->p_lock);
1.237     ad       1353:                ksiginfo_queue_init(&kq);
                   1354:                sigclearall(p, &contsigmask, &kq);
                   1355:                lwp_lock(l);
                   1356:                l->l_stat = LSSTOP;
1.162     manu     1357:                p->p_stat = SSTOP;
1.164     thorpej  1358:                p->p_nrlwps--;
1.304     rmind    1359:                lwp_unlock(l);
1.272     ad       1360:                mutex_exit(p->p_lock);
1.271     ad       1361:                mutex_exit(proc_lock);
1.304     rmind    1362:                lwp_lock(l);
1.245     yamt     1363:                mi_switch(l);
1.237     ad       1364:                ksiginfo_queue_drain(&kq);
                   1365:                KERNEL_LOCK(l->l_biglocks, l);
                   1366:        } else {
1.271     ad       1367:                mutex_exit(proc_lock);
1.162     manu     1368:        }
                   1369:
1.479     christos 1370:        exec_path_free(data);
1.428     christos 1371: #ifdef TRACE_EXEC
1.327     reinoud  1372:        DPRINTF(("%s finished\n", __func__));
1.428     christos 1373: #endif
1.374     martin   1374:        return EJUSTRETURN;
1.55      cgd      1375:
1.138     lukem    1376:  exec_abort:
                                /*
                                 * Every "goto exec_abort" above comes after vm was set from
                                 * p->p_vmspace, so vm is valid for the uvm_deallocate() below.
                                 */
1.418     christos 1377:        SDT_PROBE(proc, kernel, , exec__failure, error, 0, 0, 0, 0);
1.297     rmind    1378:        rw_exit(&p->p_reflock);
1.348     martin   1379:        if (!no_local_exec_lock)
                   1380:                rw_exit(&exec_lock);
1.297     rmind    1381:
1.479     christos 1382:        exec_path_free(data);
1.352     rmind    1383:
1.55      cgd      1384:        /*
                   1385:         * the old process doesn't exist anymore.  exit gracefully.
                   1386:         * get rid of the (new) address space we have created, if any, get rid
                   1387:         * of our namei data and vnode, and exit noting failure
                   1388:         */
1.88      mrg      1389:        uvm_deallocate(&vm->vm_map, VM_MIN_ADDRESS,
1.352     rmind    1390:                VM_MAXUSER_ADDRESS - VM_MIN_ADDRESS);
1.348     martin   1391:
1.378     uebayasi 1392:        exec_free_emul_arg(epp);
1.337     martin   1393:        pool_put(&exec_pool, data->ed_argp);
1.378     uebayasi 1394:        kmem_free(epp->ep_hdr, epp->ep_hdrlen);
                   1395:        if (epp->ep_emul_root != NULL)
                   1396:                vrele(epp->ep_emul_root);
                   1397:        if (epp->ep_interp != NULL)
                   1398:                vrele(epp->ep_interp);
1.237     ad       1399:
                                /*
                                 * For is_spawn the error is only returned.  NOTE(review): the
                                 * posix_spawn caller presumably disposes of the child -- confirm.
                                 */
1.252     ad       1400:        /* Acquire the sched-state mutex (exit1() will release it). */
1.348     martin   1401:        if (!is_spawn) {
1.337     martin   1402:                mutex_enter(p->p_lock);
1.426     christos 1403:                exit1(l, error, SIGABRT);
1.337     martin   1404:        }
1.55      cgd      1405:
1.348     martin   1406:        return error;
1.67      christos 1407: }
                   1408:
1.144     christos 1409: int
1.481     christos 1410: execve1(struct lwp *l, bool has_path, const char *path, int fd,
                   1411:     char * const *args, char * const *envs,
                   1412:     execve_fetch_element_t fetch_element)
1.337     martin   1413: {
                                /*
                                 * Run a complete exec for LWP l in two steps: execve_loadvm()
                                 * fills in the on-stack execve_data, then execve_runproc()
                                 * commits it (not a spawn, exec_lock handled locally).
                                 *
                                 * All arguments are forwarded unchanged to execve_loadvm().
                                 * Returns its error if loading fails, otherwise whatever
                                 * execve_runproc() returns.
                                 */
                   1414:        struct execve_data data;
                   1415:        int error;
                   1416:
1.481     christos 1417:        error = execve_loadvm(l, has_path, path, fd, args, envs, fetch_element,
                   1418:            &data);
1.337     martin   1419:        if (error)
                   1420:                return error;
1.348     martin   1421:        error = execve_runproc(l, &data, false, false);
1.337     martin   1422:        return error;
                   1423: }
                   1424:
1.396     uebayasi 1425: static size_t
1.411     christos 1426: fromptrsz(const struct exec_package *epp)
                   1427: {
                                /*
                                 * Size in bytes of one pointer in the caller-supplied argument
                                 * vector: sizeof(int) when EXEC_FROM32 is set in ep_flags,
                                 * sizeof(char *) otherwise.  copyinargs() uses it to step over
                                 * one entry of args.
                                 * NOTE(review): EXEC_FROM32 presumably marks a 32-bit calling
                                 * process -- confirm.
                                 */
                   1428:        return (epp->ep_flags & EXEC_FROM32) ? sizeof(int) : sizeof(char *);
                   1429: }
                   1430:
                   1431: static size_t
1.409     christos 1432: ptrsz(const struct exec_package *epp)
                   1433: {
                                /*
                                 * Size in bytes of one pointer slot as counted by calcargs():
                                 * sizeof(int) when EXEC_32 is set in ep_flags, sizeof(char *)
                                 * otherwise.
                                 * NOTE(review): EXEC_32 presumably marks a 32-bit target image
                                 * -- confirm.
                                 */
1.411     christos 1434:        return (epp->ep_flags & EXEC_32) ? sizeof(int) : sizeof(char *);
1.409     christos 1435: }
                   1436:
                   1437: static size_t
1.396     uebayasi 1438: calcargs(struct execve_data * restrict data, const size_t argenvstrlen)
                   1439: {
                                /*
                                 * Return the number of bytes needed for the argument area:
                                 * the pointer slots (argc, argv[] and env[] with one terminator
                                 * slot each), the argument/environment strings themselves
                                 * (argenvstrlen bytes), and the exec switch entry's es_arglen.
                                 *
                                 * Reads data->ed_argc and data->ed_envc; modifies nothing.
                                 */
                   1440:        struct exec_package     * const epp = &data->ed_pack;
                   1441:
                   1442:        const size_t nargenvptrs =
1.402     uebayasi 1443:            1 +                         /* long argc */
1.396     uebayasi 1444:            data->ed_argc +             /* char *argv[] */
                   1445:            1 +                         /* \0 */
                   1446:            data->ed_envc +             /* char *env[] */
1.441     christos 1447:            1;                          /* \0 */
1.396     uebayasi 1448:
1.441     christos 1449:        return (nargenvptrs * ptrsz(epp))       /* pointers */
                   1450:            + argenvstrlen                      /* strings */
                   1451:            + epp->ep_esch->es_arglen;          /* auxinfo */
1.396     uebayasi 1452: }
                   1453:
                   1454: static size_t
                   1455: calcstack(struct execve_data * restrict data, const size_t gaplen)
                   1456: {
                                /*
                                 * Return the aligned stack length for the new image: the sum of
                                 * data->ed_argslen, gaplen, the signal trampoline size, the
                                 * ps_strings size and STACK_PTHREADSPACE, rounded with
                                 * STACK_LEN_ALIGN().
                                 *
                                 * Side effects: stores the trampoline size in data->ed_szsigcode
                                 * and the ps_strings size (32-bit or native variant, chosen by
                                 * EXEC_32) in data->ed_ps_strings_sz.  data->ed_argslen must
                                 * already be set by the caller.
                                 */
                   1457:        struct exec_package     * const epp = &data->ed_pack;
                   1458:
                   1459:        data->ed_szsigcode = epp->ep_esch->es_emul->e_esigcode -
                   1460:            epp->ep_esch->es_emul->e_sigcode;
                   1461:
                   1462:        data->ed_ps_strings_sz = (epp->ep_flags & EXEC_32) ?
                   1463:            sizeof(struct ps_strings32) : sizeof(struct ps_strings);
                   1464:
                   1465:        const size_t sigcode_psstr_sz =
                   1466:            data->ed_szsigcode +        /* sigcode */
                   1467:            data->ed_ps_strings_sz +    /* ps_strings */
                   1468:            STACK_PTHREADSPACE;         /* pthread space */
                   1469:
                   1470:        const size_t stacklen =
                   1471:            data->ed_argslen +
                   1472:            gaplen +
                   1473:            sigcode_psstr_sz;
                   1474:
                   1475:        /* make the stack "safely" aligned */
                   1476:        return STACK_LEN_ALIGN(stacklen, STACK_ALIGNBYTES);
                   1477: }
                   1478:
1.391     uebayasi 1479: static int
1.399     uebayasi 1480: copyoutargs(struct execve_data * restrict data, struct lwp *l,
                   1481:     char * const newstack)
                   1482: {
                                /*
                                 * Write the argument area and the ps_strings structure for the
                                 * new image.
                                 *
                                 * data->ed_arginfo is zeroed and given the argument and
                                 * environment counts; the exec switch entry's es_copyargs hook
                                 * then copies out the contents of data->ed_argp at an address
                                 * derived from newstack and data->ed_argslen; finally
                                 * copyoutpsstrs() writes ps_strings.
                                 *
                                 * Returns 0 on success, otherwise the error from es_copyargs or
                                 * copyoutpsstrs().
                                 */
                   1483:        struct exec_package     * const epp = &data->ed_pack;
                   1484:        struct proc             *p = l->l_proc;
                   1485:        int                     error;
                   1486:
1.462     maxv     1487:        memset(&data->ed_arginfo, 0, sizeof(data->ed_arginfo));
                   1488:
1.399     uebayasi 1489:        /* remember information about the process */
                   1490:        data->ed_arginfo.ps_nargvstr = data->ed_argc;
                   1491:        data->ed_arginfo.ps_nenvstr = data->ed_envc;
                   1492:
                   1493:        /*
                   1494:         * Allocate the stack address passed to the newly execve()'ed process.
                   1495:         *
                   1496:         * The new stack address will be set to the SP (stack pointer) register
                   1497:         * in setregs().
                   1498:         */
                   1499:
                   1500:        char *newargs = STACK_ALLOC(
                   1501:            STACK_SHRINK(newstack, data->ed_argslen), data->ed_argslen);
                   1502:
                                /* newargs is passed by address, so the hook may update it. */
                   1503:        error = (*epp->ep_esch->es_copyargs)(l, epp,
                   1504:            &data->ed_arginfo, &newargs, data->ed_argp);
                   1505:
                   1506:        if (error) {
                   1507:                DPRINTF(("%s: copyargs failed %d\n", __func__, error));
                   1508:                return error;
                   1509:        }
                   1510:
                   1511:        error = copyoutpsstrs(data, p);
                   1512:        if (error != 0)
                   1513:                return error;
                   1514:
                   1515:        return 0;
                   1516: }
                   1517:
                   1518: static int
1.398     uebayasi 1519: copyoutpsstrs(struct execve_data * restrict data, struct proc *p)
                   1520: {
                                /*
                                 * Compute where ps_strings lives in the new image, record that
                                 * address in p->p_psstrp, and copy the structure out to it.
                                 *
                                 * The address is derived from epp->ep_minsaddr, offset by
                                 * STACK_PTHREADSPACE and data->ed_ps_strings_sz through the
                                 * STACK_GROW/STACK_ALLOC macros.  ed_ps_strings_sz is the value
                                 * stored by calcstack().
                                 *
                                 * With EXEC_32 set, a struct ps_strings32 is built on the stack
                                 * from data->ed_arginfo and copied out instead.
                                 *
                                 * Returns 0 on success or the copyout() error.
                                 */
                   1521:        struct exec_package     * const epp = &data->ed_pack;
                   1522:        struct ps_strings32     arginfo32;
                   1523:        void                    *aip;
                   1524:        int                     error;
                   1525:
                   1526:        /* fill process ps_strings info */
                   1527:        p->p_psstrp = (vaddr_t)STACK_ALLOC(STACK_GROW(epp->ep_minsaddr,
                   1528:            STACK_PTHREADSPACE), data->ed_ps_strings_sz);
                   1529:
                   1530:        if (epp->ep_flags & EXEC_32) {
                   1531:                aip = &arginfo32;
                   1532:                arginfo32.ps_argvstr = (vaddr_t)data->ed_arginfo.ps_argvstr;
                   1533:                arginfo32.ps_nargvstr = data->ed_arginfo.ps_nargvstr;
                   1534:                arginfo32.ps_envstr = (vaddr_t)data->ed_arginfo.ps_envstr;
                   1535:                arginfo32.ps_nenvstr = data->ed_arginfo.ps_nenvstr;
                   1536:        } else
                   1537:                aip = &data->ed_arginfo;
                   1538:
                   1539:        /* copy out the process's ps_strings structure */
                   1540:        if ((error = copyout(aip, (void *)p->p_psstrp, data->ed_ps_strings_sz))
                   1541:            != 0) {
                   1542:                DPRINTF(("%s: ps_strings copyout %p->%p size %zu failed\n",
                   1543:                    __func__, aip, (void *)p->p_psstrp, data->ed_ps_strings_sz));
                   1544:                return error;
                   1545:        }
                   1546:
                   1547:        return 0;
                   1548: }
                   1549:
                   1550: static int
1.391     uebayasi 1551: copyinargs(struct execve_data * restrict data, char * const *args,
                   1552:     char * const *envs, execve_fetch_element_t fetch_element, char **dpp)
                   1553: {
                                /*
                                 * Gather the argument and environment strings for the new image
                                 * into the kernel buffer, starting at *dpp.
                                 *
                                 * Order: any fake arguments queued in epp->ep_fa (EXEC_HASARGL),
                                 * then the strings reached through args, then those reached
                                 * through envs (which may be NULL).  Strings are stored back to
                                 * back, each with its terminating NUL; the space left for each
                                 * one is ARG_MAX minus the bytes already used past data->ed_argp.
                                 *
                                 * The fake argument list is always consumed: every fa_arg and
                                 * the ep_fa array are freed and EXEC_HASARGL is cleared, on the
                                 * E2BIG path as well.
                                 *
                                 * Sets data->ed_argc and, once the arguments have been read,
                                 * data->ed_envc.  On success *dpp is advanced past the last
                                 * string; on failure *dpp is left untouched.
                                 *
                                 * Returns 0, E2BIG if a fake argument does not fit, EINVAL if
                                 * args is NULL, or the error from copyinargstrs().
                                 */
                   1554:        struct exec_package     * const epp = &data->ed_pack;
1.392     uebayasi 1555:        char                    *dp;
1.391     uebayasi 1556:        size_t                  i;
                   1557:        int                     error;
                   1558:
                   1559:        dp = *dpp;
                   1560:
                   1561:        data->ed_argc = 0;
                   1562:
                   1563:        /* copy the fake args list, if there's one, freeing it as we go */
                   1564:        if (epp->ep_flags & EXEC_HASARGL) {
1.405     uebayasi 1565:                struct exec_fakearg     *fa = epp->ep_fa;
1.391     uebayasi 1566:
1.405     uebayasi 1567:                while (fa->fa_arg != NULL) {
1.394     uebayasi 1568:                        const size_t maxlen = ARG_MAX - (dp - data->ed_argp);
                   1569:                        size_t len;
1.391     uebayasi 1570:
                                                /*
                                                 * strlcpy() returns the length of the source,
                                                 * so len >= maxlen means it was truncated.
                                                 */
1.405     uebayasi 1571:                        len = strlcpy(dp, fa->fa_arg, maxlen);
1.394     uebayasi 1572:                        /* Count NUL into len. */
                   1573:                        if (len < maxlen)
                   1574:                                len++;
1.404     uebayasi 1575:                        else {
                                                        /* Free this and all remaining fake args. */
1.405     uebayasi 1576:                                while (fa->fa_arg != NULL) {
                   1577:                                        kmem_free(fa->fa_arg, fa->fa_len);
                   1578:                                        fa++;
1.404     uebayasi 1579:                                }
                   1580:                                kmem_free(epp->ep_fa, epp->ep_fa_len);
                   1581:                                epp->ep_flags &= ~EXEC_HASARGL;
1.395     uebayasi 1582:                                return E2BIG;
1.404     uebayasi 1583:                        }
1.405     uebayasi 1584:                        ktrexecarg(fa->fa_arg, len - 1);
1.394     uebayasi 1585:                        dp += len;
1.391     uebayasi 1586:
1.405     uebayasi 1587:                        kmem_free(fa->fa_arg, fa->fa_len);
                   1588:                        fa++;
1.391     uebayasi 1589:                        data->ed_argc++;
                   1590:                }
                   1591:                kmem_free(epp->ep_fa, epp->ep_fa_len);
                   1592:                epp->ep_flags &= ~EXEC_HASARGL;
                   1593:        }
                   1594:
1.392     uebayasi 1595:        /*
                   1596:         * Read and count argument strings from user.
                   1597:         */
                   1598:
1.391     uebayasi 1599:        if (args == NULL) {
                   1600:                DPRINTF(("%s: null args\n", __func__));
                   1601:                return EINVAL;
                   1602:        }
                                /*
                                 * EXEC_SKIPARG: step over the first entry of args, using the
                                 * pointer size given by fromptrsz().
                                 * NOTE(review): presumably the fake args replace that entry --
                                 * confirm.
                                 */
1.392     uebayasi 1603:        if (epp->ep_flags & EXEC_SKIPARG)
1.411     christos 1604:                args = (const void *)((const char *)args + fromptrsz(epp));
1.391     uebayasi 1605:        i = 0;
1.392     uebayasi 1606:        error = copyinargstrs(data, args, fetch_element, &dp, &i, ktr_execarg);
                   1607:        if (error != 0) {
                   1608:                DPRINTF(("%s: copyin arg %d\n", __func__, error));
                   1609:                return error;
                   1610:        }
                   1611:        data->ed_argc += i;
                   1612:
                   1613:        /*
                   1614:         * Read and count environment strings from user.
                   1615:         */
                   1616:
                   1617:        data->ed_envc = 0;
                   1618:        /* environment need not be there */
                   1619:        if (envs == NULL)
                   1620:                goto done;
                   1621:        i = 0;
                   1622:        error = copyinargstrs(data, envs, fetch_element, &dp, &i, ktr_execenv);
                   1623:        if (error != 0) {
                   1624:                DPRINTF(("%s: copyin env %d\n", __func__, error));
                   1625:                return error;
                   1626:        }
                   1627:        data->ed_envc += i;
                   1628:
                   1629: done:
                   1630:        *dpp = dp;
                   1631:
                   1632:        return 0;
                   1633: }
                   1634:
                   1635: static int
                   1636: copyinargstrs(struct execve_data * restrict data, char * const *strs,
                   1637:     execve_fetch_element_t fetch_element, char **dpp, size_t *ip,
                   1638:     void (*ktr)(const void *, size_t))
                   1639: {
                                /*
                                 * Copy a NULL-terminated vector of strings into the kernel
                                 * buffer at *dpp.
                                 *
                                 * Element i of strs is obtained through fetch_element(); a NULL
                                 * element ends the vector.  Each string is copied with
                                 * copyinstr(), limited to the space left of ARG_MAX counted
                                 * from data->ed_argp.  When ktrace_on is set, ktr() is called
                                 * for every string copied.
                                 *
                                 * On success *dpp is advanced past the copied strings and *ip
                                 * receives the number of strings.  On failure neither is
                                 * written.
                                 *
                                 * Returns 0, the fetch_element() error, or the copyinstr()
                                 * error with ENAMETOOLONG reported as E2BIG.
                                 */
                   1640:        char                    *dp, *sp;
                   1641:        size_t                  i;
                   1642:        int                     error;
                   1643:
                   1644:        dp = *dpp;
1.391     uebayasi 1645:
1.392     uebayasi 1646:        i = 0;
1.391     uebayasi 1647:        while (1) {
1.394     uebayasi 1648:                const size_t maxlen = ARG_MAX - (dp - data->ed_argp);
1.391     uebayasi 1649:                size_t len;
                   1650:
1.392     uebayasi 1651:                if ((error = (*fetch_element)(strs, i, &sp)) != 0) {
1.391     uebayasi 1652:                        return error;
                   1653:                }
                   1654:                if (!sp)
                   1655:                        break;
                   1656:                if ((error = copyinstr(sp, dp, maxlen, &len)) != 0) {
                   1657:                        if (error == ENAMETOOLONG)
                   1658:                                error = E2BIG;
                   1659:                        return error;
                   1660:                }
                                        /*
                                         * NOTE(review): len presumably includes the NUL, hence
                                         * len - 1 for tracing and dp += len -- confirm.
                                         */
1.392     uebayasi 1661:                if (__predict_false(ktrace_on))
                   1662:                        (*ktr)(dp, len - 1);
1.391     uebayasi 1663:                dp += len;
                   1664:                i++;
                   1665:        }
                   1666:
                   1667:        *dpp = dp;
1.392     uebayasi 1668:        *ip = i;
1.391     uebayasi 1669:
                   1670:        return 0;
                   1671: }
                   1672:
1.382     uebayasi 1673: /*
                   1674:  * Copy argv and env strings from kernel buffer (argp) to the new stack.
                   1675:  * Those strings are located just after auxinfo.
                          *
                          * Image written starting at *stackp, lowest address first:
                          *   long argc, argc string pointers, NULL, envc string pointers, NULL,
                          *   es_arglen bytes that this function skips (auxinfo), the strings.
                          *
                          * argc and envc are read from arginfo->ps_nargvstr/ps_nenvstr, and the
                          * user addresses of the two pointer vectors are stored back into
                          * arginfo->ps_argvstr and arginfo->ps_envstr.
                          *
                          * Returns 0 and advances *stackp past the env NULL terminator on
                          * success.  On a copyout()/copyoutstr() failure the error is returned
                          * as is and *stackp is left unchanged.
                   1676:  */
1.337     martin   1677: int
1.231     yamt     1678: copyargs(struct lwp *l, struct exec_package *pack, struct ps_strings *arginfo,
                   1679:     char **stackp, void *argp)
1.67      christos 1680: {
1.138     lukem    1681:        char    **cpp, *dp, *sp;
                   1682:        size_t  len;
                   1683:        void    *nullp;
                   1684:        long    argc, envc;
1.144     christos 1685:        int     error;
1.138     lukem    1686:
1.144     christos 1687:        cpp = (char **)*stackp;
1.138     lukem    1688:        nullp = NULL;
                   1689:        argc = arginfo->ps_nargvstr;
                   1690:        envc = arginfo->ps_nenvstr;
1.382     uebayasi 1691:
                   1692:        /* argc on stack is long */
                   1693:        CTASSERT(sizeof(*cpp) == sizeof(argc));
                   1694:
                                /*
                                 * dp: destination of the first string, i.e. past all the
                                 * pointer-sized slots plus es_arglen bytes.
                                 */
                   1695:        dp = (char *)(cpp +
1.402     uebayasi 1696:            1 +                         /* long argc */
                   1697:            argc +                      /* char *argv[] */
1.382     uebayasi 1698:            1 +                         /* argv NULL terminator */
1.402     uebayasi 1699:            envc +                      /* char *env[] */
1.441     christos 1700:            1) +                        /* env NULL terminator */
                   1701:            pack->ep_esch->es_arglen;   /* auxinfo */
1.382     uebayasi 1702:        sp = argp;
                   1703:
1.305     matt     1704:        if ((error = copyout(&argc, cpp++, sizeof(argc))) != 0) {
1.312     christos 1705:                COPYPRINTF("", cpp - 1, sizeof(argc));
1.144     christos 1706:                return error;
1.305     matt     1707:        }
1.67      christos 1708:
                   1709:        /* XXX don't copy them out, remap them! */
1.69      mycroft  1710:        arginfo->ps_argvstr = cpp; /* remember location of argv for later */
1.67      christos 1711:
                                /*
                                 * Each pass stores one pointer slot and the string it points
                                 * to; len is filled in by copyoutstr() and advances both the
                                 * kernel source (sp) and the user destination (dp).
                                 */
1.305     matt     1712:        for (; --argc >= 0; sp += len, dp += len) {
                   1713:                if ((error = copyout(&dp, cpp++, sizeof(dp))) != 0) {
1.312     christos 1714:                        COPYPRINTF("", cpp - 1, sizeof(dp));
1.305     matt     1715:                        return error;
                   1716:                }
                   1717:                if ((error = copyoutstr(sp, dp, ARG_MAX, &len)) != 0) {
1.313     jakllsch 1718:                        COPYPRINTF("str", dp, (size_t)ARG_MAX);
1.144     christos 1719:                        return error;
1.305     matt     1720:                }
                   1721:        }
1.67      christos 1722:
                                /* terminate argv[] */
1.305     matt     1723:        if ((error = copyout(&nullp, cpp++, sizeof(nullp))) != 0) {
1.312     christos 1724:                COPYPRINTF("", cpp - 1, sizeof(nullp));
1.144     christos 1725:                return error;
1.305     matt     1726:        }
1.67      christos 1727:
1.69      mycroft  1728:        arginfo->ps_envstr = cpp; /* remember location of envp for later */
1.67      christos 1729:
                                /* same as the argv loop; sp and dp simply continue on */
1.305     matt     1730:        for (; --envc >= 0; sp += len, dp += len) {
                   1731:                if ((error = copyout(&dp, cpp++, sizeof(dp))) != 0) {
1.312     christos 1732:                        COPYPRINTF("", cpp - 1, sizeof(dp));
1.144     christos 1733:                        return error;
1.305     matt     1734:                }
                   1735:                if ((error = copyoutstr(sp, dp, ARG_MAX, &len)) != 0) {
1.313     jakllsch 1736:                        COPYPRINTF("str", dp, (size_t)ARG_MAX);
1.305     matt     1737:                        return error;
                   1738:                }
1.337     martin   1739:
1.305     matt     1740:        }
1.67      christos 1741:
                                /* terminate envp[] */
1.305     matt     1742:        if ((error = copyout(&nullp, cpp++, sizeof(nullp))) != 0) {
1.312     christos 1743:                COPYPRINTF("", cpp - 1, sizeof(nullp));
1.144     christos 1744:                return error;
1.305     matt     1745:        }
1.67      christos 1746:
                                /* hand back the address just past the env NULL terminator */
1.144     christos 1747:        *stackp = (char *)cpp;
                   1748:        return 0;
1.55      cgd      1749: }
1.130     jdolecek 1750:
                   1751:
                   1752: /*
1.282     ad       1753:  * Add execsw[] entries.
                          *
                          * esp points at an array of count entries.  Each entry is recorded by
                          * pointer (&esp[i]), not copied, so esp[] has to stay valid while it is
                          * registered.
                          *
                          * Returns 0 on success (also when count is 0), or EEXIST if any esp[i]
                          * matches an already registered entry; in that case nothing is added.
                          * Entries of esp[] are not compared against each other.
                          * Takes exec_lock as writer for the duration of the update.
1.130     jdolecek 1754:  */
                   1755: int
1.282     ad       1756: exec_add(struct execsw *esp, int count)
1.130     jdolecek 1757: {
1.282     ad       1758:        struct exec_entry       *it;
                   1759:        int                     i;
1.130     jdolecek 1760:
1.283     ad       1761:        if (count == 0) {
                   1762:                return 0;
                   1763:        }
1.130     jdolecek 1764:
1.282     ad       1765:        /* Check for duplicates. */
1.237     ad       1766:        rw_enter(&exec_lock, RW_WRITER);
1.282     ad       1767:        for (i = 0; i < count; i++) {
                   1768:                LIST_FOREACH(it, &ex_head, ex_list) {
                   1769:                        /* assume unique (makecmds, probe_func, emulation) */
                   1770:                        if (it->ex_sw->es_makecmds == esp[i].es_makecmds &&
                   1771:                            it->ex_sw->u.elf_probe_func ==
                   1772:                            esp[i].u.elf_probe_func &&
                   1773:                            it->ex_sw->es_emul == esp[i].es_emul) {
                   1774:                                rw_exit(&exec_lock);
                   1775:                                return EEXIST;
1.130     jdolecek 1776:                        }
                   1777:                }
                   1778:        }
                   1779:
1.282     ad       1780:        /* Allocate new entries. */
                   1781:        for (i = 0; i < count; i++) {
                   1782:                it = kmem_alloc(sizeof(*it), KM_SLEEP);
                   1783:                it->ex_sw = &esp[i];
                   1784:                LIST_INSERT_HEAD(&ex_head, it, ex_list);
1.130     jdolecek 1785:        }
                   1786:
                   1787:        /* update execsw[] */
                                /* exec_init(0) asserts that exec_lock is write held, as it is here */
                   1788:        exec_init(0);
1.237     ad       1789:        rw_exit(&exec_lock);
1.282     ad       1790:        return 0;
1.130     jdolecek 1791: }
                   1792:
                   1793: /*
                   1794:  * Remove execsw[] entry.
                          *
                          * esp points at an array of count entries, identified by address
                          * (&esp[i]) exactly as they were stored by exec_add().
                          *
                          * Returns 0 on success (also when count is 0), or EBUSY if any process
                          * on any of the proclists still has p_execsw pointing at one of the
                          * entries; in that case nothing is removed.  An entry that is not on
                          * ex_head is skipped silently.
                          * Takes exec_lock as writer for the duration of the update.
                   1795:  */
                   1796: int
1.282     ad       1797: exec_remove(struct execsw *esp, int count)
1.130     jdolecek 1798: {
1.282     ad       1799:        struct exec_entry       *it, *next;
                   1800:        int                     i;
                   1801:        const struct proclist_desc *pd;
                   1802:        proc_t                  *p;
                   1803:
1.283     ad       1804:        if (count == 0) {
                   1805:                return 0;
                   1806:        }
1.130     jdolecek 1807:
1.282     ad       1808:        /* Abort if any are busy. */
1.237     ad       1809:        rw_enter(&exec_lock, RW_WRITER);
1.282     ad       1810:        for (i = 0; i < count; i++) {
                                        /* proc_lock is held only while the process lists are walked */
                   1811:                mutex_enter(proc_lock);
                   1812:                for (pd = proclists; pd->pd_list != NULL; pd++) {
                   1813:                        PROCLIST_FOREACH(p, pd->pd_list) {
                   1814:                                if (p->p_execsw == &esp[i]) {
                   1815:                                        mutex_exit(proc_lock);
                   1816:                                        rw_exit(&exec_lock);
                   1817:                                        return EBUSY;
                   1818:                                }
                   1819:                        }
                   1820:                }
                   1821:                mutex_exit(proc_lock);
                   1822:        }
1.130     jdolecek 1823:
1.282     ad       1824:        /* None are busy, so remove them all. */
                   1825:        for (i = 0; i < count; i++) {
                                        /* next is fetched before it may be unlinked and freed */
                   1826:                for (it = LIST_FIRST(&ex_head); it != NULL; it = next) {
                   1827:                        next = LIST_NEXT(it, ex_list);
                   1828:                        if (it->ex_sw == &esp[i]) {
                   1829:                                LIST_REMOVE(it, ex_list);
                   1830:                                kmem_free(it, sizeof(*it));
                   1831:                                break;
                   1832:                        }
                   1833:                }
1.130     jdolecek 1834:        }
                   1835:
                   1836:        /* update execsw[] */
                   1837:        exec_init(0);
1.237     ad       1838:        rw_exit(&exec_lock);
1.282     ad       1839:        return 0;
1.130     jdolecek 1840: }
                   1841:
                   1842: /*
                   1843:  * Initialize exec structures. If init_boot is true, also does necessary
                   1844:  * one-time initialization (it's called from main() that way).
1.147     jdolecek 1845:  * Once system is multiuser, this should be called with exec_lock held,
1.130     jdolecek 1846:  * i.e. via exec_{add|remove}().
                          *
                          * Rebuilds the execsw[] array from the entries on ex_head, ordered
                          * EXECSW_PRIO_FIRST, then EXECSW_PRIO_ANY, then EXECSW_PRIO_LAST, and
                          * updates nexecs and exec_maxhdrsz to match.  Panics on an entry with
                          * any other es_prio.  Always returns 0.
                   1847:  */
                   1848: int
1.138     lukem    1849: exec_init(int init_boot)
1.130     jdolecek 1850: {
1.282     ad       1851:        const struct execsw     **sw;
                   1852:        struct exec_entry       *ex;
                   1853:        SLIST_HEAD(,exec_entry) first;
                   1854:        SLIST_HEAD(,exec_entry) any;
                   1855:        SLIST_HEAD(,exec_entry) last;
                   1856:        int                     i, sz;
1.130     jdolecek 1857:
                   1858:        if (init_boot) {
                   1859:                /* do one-time initializations */
1.449     riastrad 1860:                vaddr_t vmin = 0, vmax;
1.448     riastrad 1861:
1.237     ad       1862:                rw_init(&exec_lock);
1.259     ad       1863:                mutex_init(&sigobject_lock, MUTEX_DEFAULT, IPL_NONE);
                                        /* submap and pool are both sized for maxexec items of NCARGS bytes */
1.448     riastrad 1864:                exec_map = uvm_km_suballoc(kernel_map, &vmin, &vmax,
                   1865:                    maxexec*NCARGS, VM_MAP_PAGEABLE, false, NULL);
1.277     ad       1866:                pool_init(&exec_pool, NCARGS, 0, 0, PR_NOALIGN|PR_NOTOUCH,
                   1867:                    "execargs", &exec_palloc, IPL_NONE);
                   1868:                pool_sethardlimit(&exec_pool, maxexec, "should not happen", 0);
1.282     ad       1869:        } else {
                   1870:                KASSERT(rw_write_held(&exec_lock));
                   1871:        }
1.130     jdolecek 1872:
1.282     ad       1873:        /* Sort each entry onto the appropriate queue. */
                                /*
                                 * Entries are pushed on the head of their queue, so within one
                                 * priority class they come out in the reverse of ex_head order.
                                 */
                   1874:        SLIST_INIT(&first);
                   1875:        SLIST_INIT(&any);
                   1876:        SLIST_INIT(&last);
                   1877:        sz = 0;
                   1878:        LIST_FOREACH(ex, &ex_head, ex_list) {
                   1879:                switch(ex->ex_sw->es_prio) {
                   1880:                case EXECSW_PRIO_FIRST:
                   1881:                        SLIST_INSERT_HEAD(&first, ex, ex_slist);
                   1882:                        break;
                   1883:                case EXECSW_PRIO_ANY:
                   1884:                        SLIST_INSERT_HEAD(&any, ex, ex_slist);
                   1885:                        break;
                   1886:                case EXECSW_PRIO_LAST:
                   1887:                        SLIST_INSERT_HEAD(&last, ex, ex_slist);
                   1888:                        break;
                   1889:                default:
1.312     christos 1890:                        panic("%s", __func__);
1.282     ad       1891:                        break;
1.130     jdolecek 1892:                }
1.282     ad       1893:                sz++;
1.130     jdolecek 1894:        }
                   1895:
                   1896:        /*
1.282     ad       1897:         * Create new execsw[].  Ensure we do not try a zero-sized
                   1898:         * allocation.
1.130     jdolecek 1899:         */
1.282     ad       1900:        sw = kmem_alloc(sz * sizeof(struct execsw *) + 1, KM_SLEEP);
                   1901:        i = 0;
                   1902:        SLIST_FOREACH(ex, &first, ex_slist) {
                   1903:                sw[i++] = ex->ex_sw;
                   1904:        }
                   1905:        SLIST_FOREACH(ex, &any, ex_slist) {
                   1906:                sw[i++] = ex->ex_sw;
                   1907:        }
                   1908:        SLIST_FOREACH(ex, &last, ex_slist) {
                   1909:                sw[i++] = ex->ex_sw;
1.130     jdolecek 1910:        }
1.183     junyoung 1911:
1.282     ad       1912:        /* Replace old execsw[] and free used memory. */
                                /* the size passed here mirrors the "+ 1" used at allocation time */
                   1913:        if (execsw != NULL) {
                   1914:                kmem_free(__UNCONST(execsw),
                   1915:                    nexecs * sizeof(struct execsw *) + 1);
1.130     jdolecek 1916:        }
1.282     ad       1917:        execsw = sw;
                   1918:        nexecs = sz;
1.130     jdolecek 1919:
1.282     ad       1920:        /* Figure out the maximum size of an exec header. */
                                /* never smaller than sizeof(int), even with no entries */
                   1921:        exec_maxhdrsz = sizeof(int);
1.130     jdolecek 1922:        for (i = 0; i < nexecs; i++) {
                   1923:                if (execsw[i]->es_hdrsz > exec_maxhdrsz)
                   1924:                        exec_maxhdrsz = execsw[i]->es_hdrsz;
                   1925:        }
                   1926:
                   1927:        return 0;
                   1928: }
1.171     chs      1929:
                         /*
                          * Map the sigcode of emulation e into process p.
                          *
                          * The sigcode lies between e->e_sigcode and e->e_esigcode.  Nothing is
                          * done, and 0 is returned, when e->e_sigobject is NULL or that range is
                          * empty.  The backing object *e->e_sigobject is created and filled on
                          * first use, then mapped into p with UVM_PROT_RX; on success the chosen
                          * address is stored in p->p_sigctx.ps_sigcode.
                          *
                          * Returns 0 on success, or the uvm_map() error if either the temporary
                          * kernel mapping or the process mapping fails.
                          */
                   1930: static int
                   1931: exec_sigcode_map(struct proc *p, const struct emul *e)
                   1932: {
                   1933:        vaddr_t va;
                   1934:        vsize_t sz;
                   1935:        int error;
                   1936:        struct uvm_object *uobj;
                   1937:
1.184     drochner 1938:        sz = (vaddr_t)e->e_esigcode - (vaddr_t)e->e_sigcode;
                   1939:
                   1940:        if (e->e_sigobject == NULL || sz == 0) {
1.171     chs      1941:                return 0;
                   1942:        }
                   1943:
                   1944:        /*
                   1945:         * If we don't have a sigobject for this emulation, create one.
                   1946:         *
                   1947:         * sigobject is an anonymous memory object (just like SYSV shared
                   1948:         * memory) that we keep a permanent reference to and that we map
                   1949:         * in all processes that need this sigcode. The creation is simple,
                   1950:         * we create an object, add a permanent reference to it, map it in
                   1951:         * kernel space, copy out the sigcode to it and unmap it.
1.189     jdolecek 1952:         * We map it with PROT_READ|PROT_EXEC into the process just
                   1953:         * the way sys_mmap() would map it.
1.171     chs      1954:         */
                   1955:
                                /* unlocked peek first; re-checked below under sigobject_lock */
                   1956:        uobj = *e->e_sigobject;
                   1957:        if (uobj == NULL) {
1.259     ad       1958:                mutex_enter(&sigobject_lock);
                   1959:                if ((uobj = *e->e_sigobject) == NULL) {
                   1960:                        uobj = uao_create(sz, 0);
                   1961:                        (*uobj->pgops->pgo_reference)(uobj);
                   1962:                        va = vm_map_min(kernel_map);
                   1963:                        if ((error = uvm_map(kernel_map, &va, round_page(sz),
                   1964:                            uobj, 0, 0,
                   1965:                            UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW,
                   1966:                            UVM_INH_SHARE, UVM_ADV_RANDOM, 0)))) {
                                                        /*
                                                         * NOTE(review): only one detach follows
                                                         * uao_create() + pgo_reference() on this
                                                         * path - confirm that the object is fully
                                                         * released here.
                                                         */
                   1967:                                printf("kernel mapping failed %d\n", error);
                   1968:                                (*uobj->pgops->pgo_detach)(uobj);
                   1969:                                mutex_exit(&sigobject_lock);
1.374     martin   1970:                                return error;
1.259     ad       1971:                        }
                   1972:                        memcpy((void *)va, e->e_sigcode, sz);
1.171     chs      1973: #ifdef PMAP_NEED_PROCWR
1.259     ad       1974:                        pmap_procwr(&proc0, va, sz);
1.171     chs      1975: #endif
1.259     ad       1976:                        uvm_unmap(kernel_map, va, va + round_page(sz));
                                                /* publish only after the object has been filled */
                   1977:                        *e->e_sigobject = uobj;
                   1978:                }
                   1979:                mutex_exit(&sigobject_lock);
1.171     chs      1980:        }
                   1981:
1.172     enami    1982:        /* Just a hint to uvm_map where to put it. */
1.195     fvdl     1983:        va = e->e_vm_default_addr(p, (vaddr_t)p->p_vmspace->vm_daddr,
1.422     martin   1984:            round_page(sz), p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
1.187     chs      1985:
                   1986: #ifdef __alpha__
                   1987:        /*
                   1988:         * Tru64 puts /sbin/loader at the end of user virtual memory,
                   1989:         * which causes the above calculation to put the sigcode at
                   1990:         * an invalid address.  Put it just below the text instead.
                   1991:         */
1.193     jmc      1992:        if (va == (vaddr_t)vm_map_max(&p->p_vmspace->vm_map)) {
1.187     chs      1993:                va = (vaddr_t)p->p_vmspace->vm_taddr - round_page(sz);
                   1994:        }
                   1995: #endif
                   1996:
                                /*
                                 * Take a reference for the process mapping; it is dropped
                                 * again below if uvm_map() fails.
                                 */
1.171     chs      1997:        (*uobj->pgops->pgo_reference)(uobj);
                   1998:        error = uvm_map(&p->p_vmspace->vm_map, &va, round_page(sz),
                   1999:                        uobj, 0, 0,
                   2000:                        UVM_MAPFLAG(UVM_PROT_RX, UVM_PROT_RX, UVM_INH_SHARE,
                   2001:                                    UVM_ADV_RANDOM, 0));
                   2002:        if (error) {
1.312     christos 2003:                DPRINTF(("%s, %d: map %p "
1.305     matt     2004:                    "uvm_map %#"PRIxVSIZE"@%#"PRIxVADDR" failed %d\n",
1.312     christos 2005:                    __func__, __LINE__, &p->p_vmspace->vm_map, round_page(sz),
                   2006:                    va, error));
1.171     chs      2007:                (*uobj->pgops->pgo_detach)(uobj);
1.374     martin   2008:                return error;
1.171     chs      2009:        }
                                /* va now holds the address uvm_map() actually picked */
                   2010:        p->p_sigctx.ps_sigcode = (void *)va;
1.374     martin   2011:        return 0;
1.171     chs      2012: }
1.336     matt     2013:
1.337     martin   2014: /*
1.348     martin   2015:  * Release a refcount on spawn_exec_data and destroy memory, if this
                   2016:  * was the last one.
                          *
                          * When the decremented sed_refcnt reaches zero, the condvar and mutex
                          * embedded in *data are destroyed, sed_actions and sed_attrs are freed
                          * if they are set, and finally *data itself is freed.  Otherwise only
                          * the counter changes.
                   2017:  */
                   2018: static void
                   2019: spawn_exec_data_release(struct spawn_exec_data *data)
                   2020: {
                                /* other references remain: nothing to tear down yet */
                   2021:        if (atomic_dec_32_nv(&data->sed_refcnt) != 0)
                   2022:                return;
                   2023:
                   2024:        cv_destroy(&data->sed_cv_child_ready);
                   2025:        mutex_destroy(&data->sed_mtx_child);
                   2026:
                                /* both members are optional and may be NULL */
                   2027:        if (data->sed_actions)
                   2028:                posix_spawn_fa_free(data->sed_actions,
                   2029:                    data->sed_actions->len);
                   2030:        if (data->sed_attrs)
                   2031:                kmem_free(data->sed_attrs,
                   2032:                    sizeof(*data->sed_attrs));
                   2033:        kmem_free(data, sizeof(*data));
                   2034: }
                   2035:
                   2036: /*
1.337     martin   2037:  * A child lwp of a posix_spawn operation starts here and ends up in
                   2038:  * cpu_spawn_return, dealing with all filedescriptor and scheduler
                   2039:  * manipulations in between.
1.369     christos 2040:  * The parent waits for the child, as it is not clear whether the child
                   2041:  * will be able to acquire its own exec_lock. If it can, the parent can
1.348     martin   2042:  * be released early and continue running in parallel. If not (or if the
                   2043:  * magic debug flag is passed in the scheduler attribute struct), the
1.369     christos 2044:  * child rides on the parent's exec lock until it is ready to return to
1.348     martin   2045:  * userland - and only then releases the parent. This method loses
                   2046:  * concurrency, but improves error reporting.
1.337     martin   2047:  */
                   2048: static void
                   2049: spawn_return(void *arg)
                   2050: {
                   2051:        struct spawn_exec_data *spawn_data = arg;
                   2052:        struct lwp *l = curlwp;
1.466     kamil    2053:        struct proc *p = l->l_proc;
1.337     martin   2054:        int error, newfd;
1.420     pgoyette 2055:        int ostat;
1.337     martin   2056:        size_t i;
                   2057:        const struct posix_spawn_file_actions_entry *fae;
1.348     martin   2058:        pid_t ppid;
1.337     martin   2059:        register_t retval;
1.341     martin   2060:        bool have_reflock;
1.348     martin   2061:        bool parent_is_waiting = true;
1.345     martin   2062:
1.341     martin   2063:        /*
1.348     martin   2064:         * Check if we can release parent early.
                   2065:         * We either need to have no sed_attrs, or sed_attrs does not
                   2066:         * have POSIX_SPAWN_RETURNERROR or one of the flags, that require
                   2067:         * safe access to the parent proc (passed in sed_parent).
                   2068:         * We then try to get the exec_lock, and only if that works, we can
                   2069:         * release the parent here already.
                   2070:         */
                   2071:        ppid = spawn_data->sed_parent->p_pid;
                   2072:        if ((!spawn_data->sed_attrs
                   2073:            || (spawn_data->sed_attrs->sa_flags
                   2074:                & (POSIX_SPAWN_RETURNERROR|POSIX_SPAWN_SETPGROUP)) == 0)
                   2075:            && rw_tryenter(&exec_lock, RW_READER)) {
                   2076:                parent_is_waiting = false;
                   2077:                mutex_enter(&spawn_data->sed_mtx_child);
                   2078:                cv_signal(&spawn_data->sed_cv_child_ready);
                   2079:                mutex_exit(&spawn_data->sed_mtx_child);
                   2080:        }
1.341     martin   2081:
1.352     rmind    2082:        /* don't allow debugger access yet */
1.466     kamil    2083:        rw_enter(&p->p_reflock, RW_WRITER);
1.352     rmind    2084:        have_reflock = true;
                   2085:
                   2086:        error = 0;
1.337     martin   2087:        /* handle posix_spawn_file_actions */
                   2088:        if (spawn_data->sed_actions != NULL) {
1.348     martin   2089:                for (i = 0; i < spawn_data->sed_actions->len; i++) {
                   2090:                        fae = &spawn_data->sed_actions->fae[i];
1.337     martin   2091:                        switch (fae->fae_action) {
                   2092:                        case FAE_OPEN:
1.338     martin   2093:                                if (fd_getfile(fae->fae_fildes) != NULL) {
                   2094:                                        error = fd_close(fae->fae_fildes);
                   2095:                                        if (error)
                   2096:                                                break;
                   2097:                                }
1.337     martin   2098:                                error = fd_open(fae->fae_path, fae->fae_oflag,
                   2099:                                    fae->fae_mode, &newfd);
1.376     maxv     2100:                                if (error)
                   2101:                                        break;
1.337     martin   2102:                                if (newfd != fae->fae_fildes) {
                   2103:                                        error = dodup(l, newfd,
                   2104:                                            fae->fae_fildes, 0, &retval);
                   2105:                                        if (fd_getfile(newfd) != NULL)
                   2106:                                                fd_close(newfd);
                   2107:                                }
                   2108:                                break;
                   2109:                        case FAE_DUP2:
                   2110:                                error = dodup(l, fae->fae_fildes,
                   2111:                                    fae->fae_newfildes, 0, &retval);
                   2112:                                break;
                   2113:                        case FAE_CLOSE:
                   2114:                                if (fd_getfile(fae->fae_fildes) == NULL) {
                   2115:                                        error = EBADF;
                   2116:                                        break;
                   2117:                                }
                   2118:                                error = fd_close(fae->fae_fildes);
                   2119:                                break;
                   2120:                        }
                   2121:                        if (error)
                   2122:                                goto report_error;
                   2123:                }
                   2124:        }
                   2125:
                   2126:        /* handle posix_spawnattr */
                   2127:        if (spawn_data->sed_attrs != NULL) {
                   2128:                struct sigaction sigact;
1.478     maxv     2129:                memset(&sigact, 0, sizeof(sigact));
1.337     martin   2130:                sigact._sa_u._sa_handler = SIG_DFL;
                   2131:                sigact.sa_flags = 0;
                   2132:
                   2133:                /*
                   2134:                 * set state to SSTOP so that this proc can be found by pid.
                   2135:                 * see proc_enterpgrp, do_sched_setparam below
                   2136:                 */
1.420     pgoyette 2137:                mutex_enter(proc_lock);
                   2138:                /*
                   2139:                 * p_stat should be SACTIVE, so we need to adjust the
                   2140:                 * parent's p_nstopchild here.  For safety, just make
                   2141:                 * sure we're on the good side of SDEAD before we adjust.
                   2142:                 */
1.466     kamil    2143:                ostat = p->p_stat;
1.420     pgoyette 2144:                KASSERT(ostat < SSTOP);
1.466     kamil    2145:                p->p_stat = SSTOP;
                   2146:                p->p_waited = 0;
                   2147:                p->p_pptr->p_nstopchild++;
1.420     pgoyette 2148:                mutex_exit(proc_lock);
1.337     martin   2149:
                   2150:                /* Set process group */
                   2151:                if (spawn_data->sed_attrs->sa_flags & POSIX_SPAWN_SETPGROUP) {
1.466     kamil    2152:                        pid_t mypid = p->p_pid,
1.337     martin   2153:                             pgrp = spawn_data->sed_attrs->sa_pgroup;
                   2154:
                   2155:                        if (pgrp == 0)
                   2156:                                pgrp = mypid;
                   2157:
                   2158:                        error = proc_enterpgrp(spawn_data->sed_parent,
                   2159:                            mypid, pgrp, false);
                   2160:                        if (error)
1.420     pgoyette 2161:                                goto report_error_stopped;
1.337     martin   2162:                }
                   2163:
                   2164:                /* Set scheduler policy */
                   2165:                if (spawn_data->sed_attrs->sa_flags & POSIX_SPAWN_SETSCHEDULER)
1.466     kamil    2166:                        error = do_sched_setparam(p->p_pid, 0,
1.337     martin   2167:                            spawn_data->sed_attrs->sa_schedpolicy,
                   2168:                            &spawn_data->sed_attrs->sa_schedparam);
                   2169:                else if (spawn_data->sed_attrs->sa_flags
                   2170:                    & POSIX_SPAWN_SETSCHEDPARAM) {
1.348     martin   2171:                        error = do_sched_setparam(ppid, 0,
1.337     martin   2172:                            SCHED_NONE, &spawn_data->sed_attrs->sa_schedparam);
                   2173:                }
                   2174:                if (error)
1.420     pgoyette 2175:                        goto report_error_stopped;
1.337     martin   2176:
                   2177:                /* Reset user ID's */
                   2178:                if (spawn_data->sed_attrs->sa_flags & POSIX_SPAWN_RESETIDS) {
                   2179:                        error = do_setresuid(l, -1,
                   2180:                             kauth_cred_getgid(l->l_cred), -1,
                   2181:                             ID_E_EQ_R | ID_E_EQ_S);
                   2182:                        if (error)
1.420     pgoyette 2183:                                goto report_error_stopped;
1.337     martin   2184:                        error = do_setresuid(l, -1,
                   2185:                            kauth_cred_getuid(l->l_cred), -1,
                   2186:                            ID_E_EQ_R | ID_E_EQ_S);
                   2187:                        if (error)
1.420     pgoyette 2188:                                goto report_error_stopped;
1.337     martin   2189:                }
                   2190:
                   2191:                /* Set signal masks/defaults */
                   2192:                if (spawn_data->sed_attrs->sa_flags & POSIX_SPAWN_SETSIGMASK) {
1.466     kamil    2193:                        mutex_enter(p->p_lock);
1.337     martin   2194:                        error = sigprocmask1(l, SIG_SETMASK,
                   2195:                            &spawn_data->sed_attrs->sa_sigmask, NULL);
1.466     kamil    2196:                        mutex_exit(p->p_lock);
1.337     martin   2197:                        if (error)
1.420     pgoyette 2198:                                goto report_error_stopped;
1.337     martin   2199:                }
                   2200:
                   2201:                if (spawn_data->sed_attrs->sa_flags & POSIX_SPAWN_SETSIGDEF) {
1.375     christos 2202:                        /*
                   2203:                         * The following sigaction call is using a sigaction
                   2204:                         * version 0 trampoline which is in the compatibility
                   2205:                         * code only. This is not a problem because for SIG_DFL
                   2206:                         * and SIG_IGN, the trampolines are now ignored. If they
                   2207:                         * were not, this would be a problem because we are
                   2208:                         * holding the exec_lock, and the compat code needs
                   2209:                         * to do the same in order to replace the trampoline
                   2210:                         * code of the process.
                   2211:                         */
1.337     martin   2212:                        for (i = 1; i <= NSIG; i++) {
                   2213:                                if (sigismember(
                   2214:                                    &spawn_data->sed_attrs->sa_sigdefault, i))
                   2215:                                        sigaction1(l, i, &sigact, NULL, NULL,
                   2216:                                            0);
                   2217:                        }
                   2218:                }
1.420     pgoyette 2219:                mutex_enter(proc_lock);
1.466     kamil    2220:                p->p_stat = ostat;
                   2221:                p->p_pptr->p_nstopchild--;
1.420     pgoyette 2222:                mutex_exit(proc_lock);
1.337     martin   2223:        }
                   2224:
1.352     rmind    2225:        /* now do the real exec */
1.348     martin   2226:        error = execve_runproc(l, &spawn_data->sed_exec, parent_is_waiting,
                   2227:            true);
1.341     martin   2228:        have_reflock = false;
1.352     rmind    2229:        if (error == EJUSTRETURN)
                   2230:                error = 0;
                   2231:        else if (error)
1.337     martin   2232:                goto report_error;
                   2233:
1.348     martin   2234:        if (parent_is_waiting) {
                   2235:                mutex_enter(&spawn_data->sed_mtx_child);
                   2236:                cv_signal(&spawn_data->sed_cv_child_ready);
                   2237:                mutex_exit(&spawn_data->sed_mtx_child);
                   2238:        }
1.345     martin   2239:
1.348     martin   2240:        /* release our refcount on the data */
                   2241:        spawn_exec_data_release(spawn_data);
1.337     martin   2242:
1.466     kamil    2243:        if (p->p_slflag & PSL_TRACED) {
                   2244:                /* Paranoid check */
                   2245:                mutex_enter(proc_lock);
                   2246:                if (!(p->p_slflag & PSL_TRACED)) {
                   2247:                        mutex_exit(proc_lock);
                   2248:                        goto cpu_return;
                   2249:                }
                   2250:
                   2251:                mutex_enter(p->p_lock);
1.482   ! kamil    2252:                eventswitch(TRAP_CHLD, PTRACE_POSIX_SPAWN, p->p_oppid);
1.466     kamil    2253:        }
                   2254:
                   2255:  cpu_return:
1.369     christos 2256:        /* and finally: leave to userland for the first time */
1.337     martin   2257:        cpu_spawn_return(l);
                   2258:
                   2259:        /* NOTREACHED */
                   2260:        return;
                   2261:
1.420     pgoyette 2262:  report_error_stopped:
                   2263:        mutex_enter(proc_lock);
1.466     kamil    2264:        p->p_stat = ostat;
                   2265:        p->p_pptr->p_nstopchild--;
1.420     pgoyette 2266:        mutex_exit(proc_lock);
1.337     martin   2267:  report_error:
1.376     maxv     2268:        if (have_reflock) {
                   2269:                /*
1.350     martin   2270:                 * We have not passed through execve_runproc(),
                   2271:                 * which would have released the p_reflock and also
                   2272:                 * taken ownership of the sed_exec part of spawn_data,
                   2273:                 * so release/free both here.
                   2274:                 */
1.466     kamil    2275:                rw_exit(&p->p_reflock);
1.350     martin   2276:                execve_free_data(&spawn_data->sed_exec);
                   2277:        }
1.341     martin   2278:
1.348     martin   2279:        if (parent_is_waiting) {
                   2280:                /* pass error to parent */
                   2281:                mutex_enter(&spawn_data->sed_mtx_child);
                   2282:                spawn_data->sed_error = error;
                   2283:                cv_signal(&spawn_data->sed_cv_child_ready);
                   2284:                mutex_exit(&spawn_data->sed_mtx_child);
                   2285:        } else {
                   2286:                rw_exit(&exec_lock);
1.337     martin   2287:        }
                   2288:
1.348     martin   2289:        /* release our refcount on the data */
                   2290:        spawn_exec_data_release(spawn_data);
                   2291:
1.352     rmind    2292:        /* done, exit */
1.466     kamil    2293:        mutex_enter(p->p_lock);
1.348     martin   2294:        /*
1.352     rmind    2295:         * Posix explicitly asks for an exit code of 127 if we report
1.348     martin   2296:         * errors from the child process - so, unfortunately, there
                   2297:         * is no way to report a more exact error code.
                   2298:         * A NetBSD specific workaround is POSIX_SPAWN_RETURNERROR as
                   2299:         * flag bit in the attrp argument to posix_spawn(2), see above.
                   2300:         */
1.426     christos 2301:        exit1(l, 127, 0);
1.337     martin   2302: }
                   2303:
1.348     martin   2304: void
1.344     christos 2305: posix_spawn_fa_free(struct posix_spawn_file_actions *fa, size_t len)
1.342     christos 2306: {
                   2307:
1.344     christos 2308:        for (size_t i = 0; i < len; i++) {
1.342     christos 2309:                struct posix_spawn_file_actions_entry *fae = &fa->fae[i];
                   2310:                if (fae->fae_action != FAE_OPEN)
                   2311:                        continue;
1.450     christos 2312:                kmem_strfree(fae->fae_path);
1.342     christos 2313:        }
1.348     martin   2314:        if (fa->len > 0)
1.343     christos 2315:                kmem_free(fa->fae, sizeof(*fa->fae) * fa->len);
1.342     christos 2316:        kmem_free(fa, sizeof(*fa));
                   2317: }
                   2318:
                   2319: static int
                   2320: posix_spawn_fa_alloc(struct posix_spawn_file_actions **fap,
1.373     martin   2321:     const struct posix_spawn_file_actions *ufa, rlim_t lim)
1.342     christos 2322: {
                   2323:        struct posix_spawn_file_actions *fa;
                   2324:        struct posix_spawn_file_actions_entry *fae;
                   2325:        char *pbuf = NULL;
                   2326:        int error;
1.352     rmind    2327:        size_t i = 0;
1.342     christos 2328:
                   2329:        fa = kmem_alloc(sizeof(*fa), KM_SLEEP);
                   2330:        error = copyin(ufa, fa, sizeof(*fa));
1.369     christos 2331:        if (error || fa->len == 0) {
1.348     martin   2332:                kmem_free(fa, sizeof(*fa));
1.369     christos 2333:                return error;   /* 0 if not an error, and len == 0 */
1.348     martin   2334:        }
1.342     christos 2335:
1.373     martin   2336:        if (fa->len > lim) {
                   2337:                kmem_free(fa, sizeof(*fa));
                   2338:                return EINVAL;
                   2339:        }
                   2340:
1.348     martin   2341:        fa->size = fa->len;
1.352     rmind    2342:        size_t fal = fa->len * sizeof(*fae);
                   2343:        fae = fa->fae;
                   2344:        fa->fae = kmem_alloc(fal, KM_SLEEP);
                   2345:        error = copyin(fae, fa->fae, fal);
1.344     christos 2346:        if (error)
1.342     christos 2347:                goto out;
                   2348:
                   2349:        pbuf = PNBUF_GET();
1.344     christos 2350:        for (; i < fa->len; i++) {
1.342     christos 2351:                fae = &fa->fae[i];
                   2352:                if (fae->fae_action != FAE_OPEN)
                   2353:                        continue;
1.352     rmind    2354:                error = copyinstr(fae->fae_path, pbuf, MAXPATHLEN, &fal);
1.344     christos 2355:                if (error)
1.342     christos 2356:                        goto out;
1.352     rmind    2357:                fae->fae_path = kmem_alloc(fal, KM_SLEEP);
                   2358:                memcpy(fae->fae_path, pbuf, fal);
1.342     christos 2359:        }
                   2360:        PNBUF_PUT(pbuf);
1.348     martin   2361:
1.342     christos 2362:        *fap = fa;
                   2363:        return 0;
                   2364: out:
                   2365:        if (pbuf)
                   2366:                PNBUF_PUT(pbuf);
1.344     christos 2367:        posix_spawn_fa_free(fa, i);
1.342     christos 2368:        return error;
                   2369: }
                   2370:
1.337     martin   2371: int
1.348     martin   2372: check_posix_spawn(struct lwp *l1)
1.337     martin   2373: {
1.348     martin   2374:        int error, tnprocs, count;
1.337     martin   2375:        uid_t uid;
1.348     martin   2376:        struct proc *p1;
1.337     martin   2377:
                   2378:        p1 = l1->l_proc;
                   2379:        uid = kauth_cred_getuid(l1->l_cred);
                   2380:        tnprocs = atomic_inc_uint_nv(&nprocs);
                   2381:
                   2382:        /*
                   2383:         * Although process entries are dynamically created, we still keep
                   2384:         * a global limit on the maximum number we will create.
                   2385:         */
                   2386:        if (__predict_false(tnprocs >= maxproc))
                   2387:                error = -1;
                   2388:        else
                   2389:                error = kauth_authorize_process(l1->l_cred,
                   2390:                    KAUTH_PROCESS_FORK, p1, KAUTH_ARG(tnprocs), NULL, NULL);
                   2391:
                   2392:        if (error) {
                   2393:                atomic_dec_uint(&nprocs);
1.348     martin   2394:                return EAGAIN;
1.337     martin   2395:        }
                   2396:
                   2397:        /*
                   2398:         * Enforce limits.
                   2399:         */
                   2400:        count = chgproccnt(uid, 1);
1.347     elad     2401:        if (kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_RLIMIT,
                   2402:             p1, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS),
                   2403:             &p1->p_rlimit[RLIMIT_NPROC], KAUTH_ARG(RLIMIT_NPROC)) != 0 &&
                   2404:            __predict_false(count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur)) {
1.348     martin   2405:                (void)chgproccnt(uid, -1);
                   2406:                atomic_dec_uint(&nprocs);
                   2407:                return EAGAIN;
1.337     martin   2408:        }
                   2409:
1.348     martin   2410:        return 0;
                   2411: }
                   2412:
                   2413: int
1.352     rmind    2414: do_posix_spawn(struct lwp *l1, pid_t *pid_res, bool *child_ok, const char *path,
                   2415:        struct posix_spawn_file_actions *fa,
                   2416:        struct posix_spawnattr *sa,
                   2417:        char *const *argv, char *const *envp,
                   2418:        execve_fetch_element_t fetch)
1.348     martin   2419: {
                                /*
                                 * Create a child of l1's process and have it exec `path'.
                                 *
                                 * The first part of the exec is done here, in the parent,
                                 * via execve_loadvm().  A new proc and LWP are then built;
                                 * the LWP starts in spawn_return() with spawn_data as its
                                 * argument.  The parent sleeps on sed_cv_child_ready and
                                 * returns whatever the child left in sed_error.
                                 *
                                 * *child_ok is set once the child LWP exists; the caller
                                 * sys_posix_spawn() then skips its own cleanup of fa/sa
                                 * and of the process accounting.  *pid_res is written only
                                 * after the child has been created and has reported back.
                                 */
1.352     rmind    2420:
1.348     martin   2421:        struct proc *p1, *p2;
                   2422:        struct lwp *l2;
                   2423:        int error;
                   2424:        struct spawn_exec_data *spawn_data;
                   2425:        vaddr_t uaddr;
                   2426:        pid_t pid;
1.352     rmind    2427:        bool have_exec_lock = false;
1.348     martin   2428:
                   2429:        p1 = l1->l_proc;
1.342     christos 2430:
1.348     martin   2431:        /* Allocate and init spawn_data */
                   2432:        spawn_data = kmem_zalloc(sizeof(*spawn_data), KM_SLEEP);
                   2433:        spawn_data->sed_refcnt = 1; /* only parent so far */
                   2434:        cv_init(&spawn_data->sed_cv_child_ready, "pspawn");
                   2435:        mutex_init(&spawn_data->sed_mtx_child, MUTEX_DEFAULT, IPL_NONE);
                                /*
                                 * Taken before the child exists and given up only in the
                                 * cv_wait() below (or at error_exit); spawn_return()
                                 * signals the condvar while holding this same mutex.
                                 */
1.352     rmind    2436:        mutex_enter(&spawn_data->sed_mtx_child);
                   2437:
                   2438:        /*
                   2439:         * Do the first part of the exec now, collect state
                   2440:         * in spawn_data.
                   2441:         */
1.481     christos 2442:        error = execve_loadvm(l1, true, path, -1, argv,
1.352     rmind    2443:            envp, fetch, &spawn_data->sed_exec);
                   2444:        if (error == EJUSTRETURN)
                   2445:                error = 0;
                   2446:        else if (error)
                   2447:                goto error_exit;
                   2448:
                                /*
                                 * From here on error_exit must also free sed_exec and
                                 * drop p1->p_reflock and exec_lock.
                                 */
                   2449:        have_exec_lock = true;
1.337     martin   2450:
                   2451:        /*
                   2452:         * Allocate virtual address space for the U-area now, while it
                   2453:         * is still easy to abort the fork operation if we're out of
                   2454:         * kernel virtual address space.
                   2455:         */
                   2456:        uaddr = uvm_uarea_alloc();
                   2457:        if (__predict_false(uaddr == 0)) {
1.352     rmind    2458:                error = ENOMEM;
                   2459:                goto error_exit;
1.351     rmind    2460:        }
1.352     rmind    2461:
1.337     martin   2462:        /*
1.348     martin   2463:         * Allocate new proc. Borrow proc0 vmspace for it, we will
                   2464:         * replace it with its own before returning to userland
                   2465:         * in the child.
1.337     martin   2466:         * This is a point of no return, we will have to go through
                   2467:         * the child proc to properly clean it up past this point.
                   2468:         */
                   2469:        p2 = proc_alloc();
                   2470:        pid = p2->p_pid;
                   2471:
                   2472:        /*
                   2473:         * Make a proc table entry for the new process.
                   2474:         * Start by zeroing the section of proc that is zero-initialized,
                   2475:         * then copy the section that is copied directly from the parent.
                   2476:         */
                   2477:        memset(&p2->p_startzero, 0,
                   2478:            (unsigned) ((char *)&p2->p_endzero - (char *)&p2->p_startzero));
                   2479:        memcpy(&p2->p_startcopy, &p1->p_startcopy,
                   2480:            (unsigned) ((char *)&p2->p_endcopy - (char *)&p2->p_startcopy));
1.348     martin   2481:        p2->p_vmspace = proc0.p_vmspace;
1.337     martin   2482:
1.366     christos 2483:        TAILQ_INIT(&p2->p_sigpend.sp_info);
1.337     martin   2484:
                   2485:        LIST_INIT(&p2->p_lwps);
                   2486:        LIST_INIT(&p2->p_sigwaiters);
                   2487:
                   2488:        /*
                   2489:         * Duplicate sub-structures as needed.
                   2490:         * Increase reference counts on shared objects.
                   2491:         * Inherit flags we want to keep.  The flags related to SIGCHLD
                   2492:         * handling are important in order to keep a consistent behaviour
                   2493:         * for the child after the fork.  If we are a 32-bit process, the
                   2494:         * child will be too.
                   2495:         */
                   2496:        p2->p_flag =
                   2497:            p1->p_flag & (PK_SUGID | PK_NOCLDWAIT | PK_CLDSIGIGN | PK_32);
                   2498:        p2->p_emul = p1->p_emul;
                   2499:        p2->p_execsw = p1->p_execsw;
                   2500:
                   2501:        mutex_init(&p2->p_stmutex, MUTEX_DEFAULT, IPL_HIGH);
                   2502:        mutex_init(&p2->p_auxlock, MUTEX_DEFAULT, IPL_NONE);
                   2503:        rw_init(&p2->p_reflock);
                   2504:        cv_init(&p2->p_waitcv, "wait");
                   2505:        cv_init(&p2->p_lwpcv, "lwpwait");
                   2506:
                   2507:        p2->p_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
                   2508:
                   2509:        kauth_proc_fork(p1, p2);
                   2510:
                   2511:        p2->p_raslist = NULL;
                   2512:        p2->p_fd = fd_copy();
                   2513:
                   2514:        /* XXX racy */
                   2515:        p2->p_mqueue_cnt = p1->p_mqueue_cnt;
                   2516:
                   2517:        p2->p_cwdi = cwdinit();
                   2518:
                   2519:        /*
                   2520:         * Note: p_limit (rlimit stuff) is copy-on-write, so normally
                   2521:         * we just need increase pl_refcnt.
                   2522:         */
1.348     martin   2523:        if (!p1->p_limit->pl_writeable) {
                   2524:                lim_addref(p1->p_limit);
                   2525:                p2->p_limit = p1->p_limit;
1.337     martin   2526:        } else {
                   2527:                p2->p_limit = lim_copy(p1->p_limit);
                   2528:        }
                   2529:
                   2530:        p2->p_lflag = 0;
1.468     kamil    2531:        l1->l_vforkwaiting = false;
1.337     martin   2532:        p2->p_sflag = 0;
                   2533:        p2->p_slflag = 0;
                   2534:        p2->p_pptr = p1;
                   2535:        p2->p_ppid = p1->p_pid;
                   2536:        LIST_INIT(&p2->p_children);
                   2537:
                   2538:        p2->p_aio = NULL;
                   2539:
                   2540: #ifdef KTRACE
                   2541:        /*
                   2542:         * Copy traceflag and tracefile if enabled.
                   2543:         * If not inherited, these were zeroed above.
                   2544:         */
                   2545:        if (p1->p_traceflag & KTRFAC_INHERIT) {
                   2546:                mutex_enter(&ktrace_lock);
                   2547:                p2->p_traceflag = p1->p_traceflag;
                   2548:                if ((p2->p_tracep = p1->p_tracep) != NULL)
                   2549:                        ktradref(p2);
                   2550:                mutex_exit(&ktrace_lock);
                   2551:        }
                   2552: #endif
                   2553:
                   2554:        /*
                   2555:         * Create signal actions for the child process.
                   2556:         */
                   2557:        p2->p_sigacts = sigactsinit(p1, 0);
                   2558:        mutex_enter(p1->p_lock);
                   2559:        p2->p_sflag |=
                   2560:            (p1->p_sflag & (PS_STOPFORK | PS_STOPEXEC | PS_NOCLDSTOP));
                   2561:        sched_proc_fork(p1, p2);
                   2562:        mutex_exit(p1->p_lock);
                   2563:
                   2564:        p2->p_stflag = p1->p_stflag;
                   2565:
                   2566:        /*
                   2567:         * p_stats.
                   2568:         * Copy parts of p_stats, and zero out the rest.
                   2569:         */
                   2570:        p2->p_stats = pstatscopy(p1->p_stats);
                   2571:
                   2572:        /* copy over machdep flags to the new proc */
                   2573:        cpu_proc_fork(p1, p2);
                   2574:
                   2575:        /*
1.352     rmind    2576:         * Prepare remaining parts of spawn data
1.337     martin   2577:         */
1.348     martin   2578:        spawn_data->sed_actions = fa;
                   2579:        spawn_data->sed_attrs = sa;
1.352     rmind    2580:
1.337     martin   2581:        spawn_data->sed_parent = p1;
                   2582:
1.352     rmind    2583:        /* create LWP */
1.337     martin   2584:        lwp_create(l1, p2, uaddr, 0, NULL, 0, spawn_return, spawn_data,
1.442     christos 2585:            &l2, l1->l_class, &l1->l_sigmask, &l1->l_sigstk);
1.337     martin   2586:        l2->l_ctxlink = NULL;   /* reset ucontext link */
                   2587:
                   2588:        /*
                   2589:         * Copy the credential so other references don't see our changes.
                   2590:         * Test to see if this is necessary first, since in the common case
                   2591:         * we won't need a private reference.
                   2592:         */
                   2593:        if (kauth_cred_geteuid(l2->l_cred) != kauth_cred_getsvuid(l2->l_cred) ||
                   2594:            kauth_cred_getegid(l2->l_cred) != kauth_cred_getsvgid(l2->l_cred)) {
                   2595:                l2->l_cred = kauth_cred_copy(l2->l_cred);
                   2596:                kauth_cred_setsvuid(l2->l_cred, kauth_cred_geteuid(l2->l_cred));
                   2597:                kauth_cred_setsvgid(l2->l_cred, kauth_cred_getegid(l2->l_cred));
                   2598:        }
                   2599:
                   2600:        /* Update the master credentials. */
                   2601:        if (l2->l_cred != p2->p_cred) {
                   2602:                kauth_cred_t ocred;
                   2603:
                   2604:                kauth_cred_hold(l2->l_cred);
                   2605:                mutex_enter(p2->p_lock);
                   2606:                ocred = p2->p_cred;
                   2607:                p2->p_cred = l2->l_cred;
                   2608:                mutex_exit(p2->p_lock);
                   2609:                kauth_cred_free(ocred);
                   2610:        }
                   2611:
                                /*
                                 * The child LWP exists: tell the caller not to undo the
                                 * process accounting or free fa/sa itself.
                                 */
1.352     rmind    2612:        *child_ok = true;
                   2613:        spawn_data->sed_refcnt = 2;     /* child gets it as well */
1.348     martin   2614: #if 0
1.345     martin   2615:        l2->l_nopreempt = 1; /* start it non-preemptable */
1.348     martin   2616: #endif
1.345     martin   2617:
1.337     martin   2618:        /*
                   2619:         * It's now safe for the scheduler and other processes to see the
                   2620:         * child process.
                   2621:         */
                   2622:        mutex_enter(proc_lock);
                   2623:
                   2624:        if (p1->p_session->s_ttyvp != NULL && p1->p_lflag & PL_CONTROLT)
                   2625:                p2->p_lflag |= PL_CONTROLT;
                   2626:
                   2627:        LIST_INSERT_HEAD(&p1->p_children, p2, p_sibling);
                   2628:        p2->p_exitsig = SIGCHLD;        /* signal for parent on exit */
                   2629:
                                /*
                                 * Parent is traced with PSL_TRACEPOSIX_SPAWN set: reparent
                                 * the child to p1->p_pptr and record p1's pid in p_oppid.
                                 */
1.466     kamil    2630:        if ((p1->p_slflag & (PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) ==
                   2631:            (PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) {
                   2632:                proc_changeparent(p2, p1->p_pptr);
1.482   ! kamil    2633:                p2->p_oppid = p1->p_pid;
1.466     kamil    2634:        }
                   2635:
1.337     martin   2636:        LIST_INSERT_AFTER(p1, p2, p_pglist);
                   2637:        LIST_INSERT_HEAD(&allproc, p2, p_list);
                   2638:
                   2639:        p2->p_trace_enabled = trace_is_enabled(p2);
                   2640: #ifdef __HAVE_SYSCALL_INTERN
                   2641:        (*p2->p_emul->e_syscall_intern)(p2);
                   2642: #endif
                   2643:
                   2644:        /*
                   2645:         * Make child runnable, set start time, and add to run queue except
                   2646:         * if the parent requested the child to start in SSTOP state.
                   2647:         */
                   2648:        mutex_enter(p2->p_lock);
                   2649:
                   2650:        getmicrotime(&p2->p_stats->p_start);
                   2651:
                   2652:        lwp_lock(l2);
                   2653:        KASSERT(p2->p_nrlwps == 1);
                                /* NOTE(review): re-stores the value the KASSERT just checked. */
                   2654:        p2->p_nrlwps = 1;
                   2655:        p2->p_stat = SACTIVE;
                   2656:        l2->l_stat = LSRUN;
                   2657:        sched_enqueue(l2, false);
                   2658:        lwp_unlock(l2);
                   2659:
                   2660:        mutex_exit(p2->p_lock);
                   2661:        mutex_exit(proc_lock);
                   2662:
                                /*
                                 * Sleep until spawn_return() signals.  On failure the
                                 * child stores its error in sed_error before signalling;
                                 * otherwise sed_error is still 0 from kmem_zalloc().
                                 */
                   2663:        cv_wait(&spawn_data->sed_cv_child_ready, &spawn_data->sed_mtx_child);
1.348     martin   2664:        error = spawn_data->sed_error;
1.337     martin   2665:        mutex_exit(&spawn_data->sed_mtx_child);
1.352     rmind    2666:        spawn_exec_data_release(spawn_data);
1.337     martin   2667:
                                /* Presumably both were taken by execve_loadvm() - confirm there. */
1.341     martin   2668:        rw_exit(&p1->p_reflock);
1.337     martin   2669:        rw_exit(&exec_lock);
1.352     rmind    2670:        have_exec_lock = false;
1.351     rmind    2671:
1.352     rmind    2672:        *pid_res = pid;
1.466     kamil    2673:
                   2674:        if (error)
                   2675:                return error;
                   2676:
                   2677:        if (p1->p_slflag & PSL_TRACED) {
                   2678:                /* Paranoid check */
                   2679:                mutex_enter(proc_lock);
                   2680:                if ((p1->p_slflag & (PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) !=
                   2681:                    (PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) {
                   2682:                        mutex_exit(proc_lock);
                   2683:                        return 0;
                   2684:                }
                   2685:
                                        /*
                                         * proc_lock and p1->p_lock are not released in
                                         * this function; presumably eventswitch() drops
                                         * both - confirm against its definition.
                                         */
                   2686:                mutex_enter(p1->p_lock);
1.482   ! kamil    2687:                eventswitch(TRAP_CHLD, PTRACE_POSIX_SPAWN, pid);
1.466     kamil    2688:        }
                   2689:        return 0;
1.352     rmind    2690:
                                /* Reached only from failures before the child proc is allocated. */
                   2691:  error_exit:
1.376     maxv     2692:        if (have_exec_lock) {
1.352     rmind    2693:                execve_free_data(&spawn_data->sed_exec);
                   2694:                rw_exit(&p1->p_reflock);
1.376     maxv     2695:                rw_exit(&exec_lock);
1.352     rmind    2696:        }
                   2697:        mutex_exit(&spawn_data->sed_mtx_child);
1.351     rmind    2698:        spawn_exec_data_release(spawn_data);
1.376     maxv     2699:
1.348     martin   2700:        return error;
                   2701: }
1.337     martin   2702:
1.348     martin   2703: int
                   2704: sys_posix_spawn(struct lwp *l1, const struct sys_posix_spawn_args *uap,
                   2705:     register_t *retval)
                   2706: {
                   2707:        /* {
                   2708:                syscallarg(pid_t *) pid;
                   2709:                syscallarg(const char *) path;
                   2710:                syscallarg(const struct posix_spawn_file_actions *) file_actions;
                   2711:                syscallarg(const struct posix_spawnattr *) attrp;
                   2712:                syscallarg(char *const *) argv;
                   2713:                syscallarg(char *const *) envp;
                   2714:        } */
                   2715:
                   2716:        int error;
                   2717:        struct posix_spawn_file_actions *fa = NULL;
                   2718:        struct posix_spawnattr *sa = NULL;
                   2719:        pid_t pid;
1.352     rmind    2720:        bool child_ok = false;
1.373     martin   2721:        rlim_t max_fileactions;
                   2722:        proc_t *p = l1->l_proc;
1.348     martin   2723:
                   2724:        error = check_posix_spawn(l1);
                   2725:        if (error) {
                   2726:                *retval = error;
                   2727:                return 0;
                   2728:        }
                   2729:
                   2730:        /* copy in file_actions struct */
                   2731:        if (SCARG(uap, file_actions) != NULL) {
1.461     riastrad 2732:                max_fileactions = 2 * uimin(p->p_rlimit[RLIMIT_NOFILE].rlim_cur,
1.373     martin   2733:                    maxfiles);
                   2734:                error = posix_spawn_fa_alloc(&fa, SCARG(uap, file_actions),
                   2735:                    max_fileactions);
1.348     martin   2736:                if (error)
1.352     rmind    2737:                        goto error_exit;
1.348     martin   2738:        }
                   2739:
                   2740:        /* copyin posix_spawnattr struct */
                   2741:        if (SCARG(uap, attrp) != NULL) {
                   2742:                sa = kmem_alloc(sizeof(*sa), KM_SLEEP);
                   2743:                error = copyin(SCARG(uap, attrp), sa, sizeof(*sa));
                   2744:                if (error)
1.352     rmind    2745:                        goto error_exit;
1.348     martin   2746:        }
1.337     martin   2747:
1.348     martin   2748:        /*
                   2749:         * Do the spawn
                   2750:         */
1.352     rmind    2751:        error = do_posix_spawn(l1, &pid, &child_ok, SCARG(uap, path), fa, sa,
1.348     martin   2752:            SCARG(uap, argv), SCARG(uap, envp), execve_fetch_element);
                   2753:        if (error)
1.352     rmind    2754:                goto error_exit;
1.337     martin   2755:
                   2756:        if (error == 0 && SCARG(uap, pid) != NULL)
                   2757:                error = copyout(&pid, SCARG(uap, pid), sizeof(pid));
                   2758:
                   2759:        *retval = error;
                   2760:        return 0;
                   2761:
1.352     rmind    2762:  error_exit:
                   2763:        if (!child_ok) {
                   2764:                (void)chgproccnt(kauth_cred_getuid(l1->l_cred), -1);
                   2765:                atomic_dec_uint(&nprocs);
                   2766:
                   2767:                if (sa)
                   2768:                        kmem_free(sa, sizeof(*sa));
                   2769:                if (fa)
                   2770:                        posix_spawn_fa_free(fa, fa->len);
                   2771:        }
                   2772:
1.337     martin   2773:        *retval = error;
                   2774:        return 0;
                   2775: }
                   2776:
1.336     matt     2777: void
                   2778: exec_free_emul_arg(struct exec_package *epp)
                   2779: {
                                /*
                                 * exec_free_emul_arg: release the emulation-specific argument
                                 * attached to an exec package.
                                 *
                                 * epp->ep_emul_arg_free and epp->ep_emul_arg are treated as a
                                 * pair: either both are set or both are NULL, and each branch
                                 * below asserts its half of that invariant.
                                 */
                   2780:        if (epp->ep_emul_arg_free != NULL) {
                   2781:                KASSERT(epp->ep_emul_arg != NULL);
                                        /* Hand the argument to the registered free callback. */
                   2782:                (*epp->ep_emul_arg_free)(epp->ep_emul_arg);
                                        /*
                                         * Clear both fields so a repeated call takes the else
                                         * branch and does not invoke the callback again.
                                         */
                   2783:                epp->ep_emul_arg_free = NULL;
                   2784:                epp->ep_emul_arg = NULL;
                   2785:        } else {
                                        /* No free callback registered: there must be no argument. */
                   2786:                KASSERT(epp->ep_emul_arg == NULL);
                   2787:        }
                   2788: }
1.388     uebayasi 2789:
                   2790: #ifdef DEBUG_EXEC
                         /*
                          * dump_vmcmds: print the vmcmd set of an exec package via DPRINTF.
                          * Compiled only when DEBUG_EXEC is defined.
                          *
                          *	epp	exec package whose ep_vmcmds entries are printed
                          *	x	index of the entry to flag as failed; also printed in
                          *		the summary line when error is non-zero
                          *	error	0 selects the short summary; a non-zero value is
                          *		printed and enables the "failed" marker
                          *
                          * NOTE(review): DPRINTF is defined outside this view; presumably it
                          * forwards the parenthesised argument list to a printf-style routine.
                          */
                   2791: static void
                   2792: dump_vmcmds(const struct exec_package * const epp, size_t x, int error)
                   2793: {
                   2794:        struct exec_vmcmd *vp = &epp->ep_vmcmds.evs_cmds[0];
                   2795:        size_t j;
                   2796:
                                /* Summary line: entry count alone, or x/count plus the error. */
                   2797:        if (error == 0)
                   2798:                DPRINTF(("vmcmds %u\n", epp->ep_vmcmds.evs_used));
                   2799:        else
                   2800:                DPRINTF(("vmcmds %zu/%u, error %d\n", x,
                   2801:                    epp->ep_vmcmds.evs_used, error));
                   2802:
                                /* One line per used entry, whether or not an error occurred. */
                   2803:        for (j = 0; j < epp->ep_vmcmds.evs_used; j++) {
                                        /*
                                         * The chained conditional turns the ev_proc handler
                                         * pointer into a short name; any handler other than the
                                         * three compared here is printed as "*unknown*".
                                         */
                   2804:                DPRINTF(("vmcmd[%zu] = vmcmd_map_%s %#"
                   2805:                    PRIxVADDR"/%#"PRIxVSIZE" fd@%#"
                   2806:                    PRIxVSIZE" prot=0%o flags=%d\n", j,
                   2807:                    vp[j].ev_proc == vmcmd_map_pagedvn ?
                   2808:                    "pagedvn" :
                   2809:                    vp[j].ev_proc == vmcmd_map_readvn ?
                   2810:                    "readvn" :
                   2811:                    vp[j].ev_proc == vmcmd_map_zero ?
                   2812:                    "zero" : "*unknown*",
                   2813:                    vp[j].ev_addr, vp[j].ev_len,
                   2814:                    vp[j].ev_offset, vp[j].ev_prot,
                   2815:                    vp[j].ev_flags));
                                        /* Mark the entry at index x, but only on the error path. */
                   2816:                if (error != 0 && j == x)
                   2817:                        DPRINTF(("     ^--- failed\n"));
                   2818:        }
                   2819: }
                   2820: #endif

CVSweb <webmaster@jp.NetBSD.org>