
Annotation of src/sys/kern/sched_4bsd.c, Revision 1.7.4.1

1.7.4.1 ! mjf         1: /*     $NetBSD: sched_4bsd.c,v 1.8 2007/11/06 00:42:43 ad Exp $        */
1.2       yamt        2:
                      3: /*-
                      4:  * Copyright (c) 1999, 2000, 2004, 2006, 2007 The NetBSD Foundation, Inc.
                      5:  * All rights reserved.
                      6:  *
                      7:  * This code is derived from software contributed to The NetBSD Foundation
                      8:  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
                      9:  * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran, and
                     10:  * Daniel Sieger.
                     11:  *
                     12:  * Redistribution and use in source and binary forms, with or without
                     13:  * modification, are permitted provided that the following conditions
                     14:  * are met:
                     15:  * 1. Redistributions of source code must retain the above copyright
                     16:  *    notice, this list of conditions and the following disclaimer.
                     17:  * 2. Redistributions in binary form must reproduce the above copyright
                     18:  *    notice, this list of conditions and the following disclaimer in the
                     19:  *    documentation and/or other materials provided with the distribution.
                     20:  * 3. All advertising materials mentioning features or use of this software
                     21:  *    must display the following acknowledgement:
                     22:  *     This product includes software developed by the NetBSD
                     23:  *     Foundation, Inc. and its contributors.
                     24:  * 4. Neither the name of The NetBSD Foundation nor the names of its
                     25:  *    contributors may be used to endorse or promote products derived
                     26:  *    from this software without specific prior written permission.
                     27:  *
                     28:  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
                     29:  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
                     30:  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
                     31:  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
                     32:  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     33:  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     34:  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     35:  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     36:  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     37:  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     38:  * POSSIBILITY OF SUCH DAMAGE.
                     39:  */
                     40:
                     41: /*-
                     42:  * Copyright (c) 1982, 1986, 1990, 1991, 1993
                     43:  *     The Regents of the University of California.  All rights reserved.
                     44:  * (c) UNIX System Laboratories, Inc.
                     45:  * All or some portions of this file are derived from material licensed
                     46:  * to the University of California by American Telephone and Telegraph
                     47:  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
                     48:  * the permission of UNIX System Laboratories, Inc.
                     49:  *
                     50:  * Redistribution and use in source and binary forms, with or without
                     51:  * modification, are permitted provided that the following conditions
                     52:  * are met:
                     53:  * 1. Redistributions of source code must retain the above copyright
                     54:  *    notice, this list of conditions and the following disclaimer.
                     55:  * 2. Redistributions in binary form must reproduce the above copyright
                     56:  *    notice, this list of conditions and the following disclaimer in the
                     57:  *    documentation and/or other materials provided with the distribution.
                     58:  * 3. Neither the name of the University nor the names of its contributors
                     59:  *    may be used to endorse or promote products derived from this software
                     60:  *    without specific prior written permission.
                     61:  *
                     62:  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
                     63:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     64:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     65:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
                     66:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
                     67:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
                     68:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
                     69:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
                     70:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
                     71:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
                     72:  * SUCH DAMAGE.
                     73:  *
                     74:  *     @(#)kern_synch.c        8.9 (Berkeley) 5/19/95
                     75:  */
                     76:
                     77: #include <sys/cdefs.h>
1.7.4.1 ! mjf        78: __KERNEL_RCSID(0, "$NetBSD: sched_4bsd.c,v 1.8 2007/11/06 00:42:43 ad Exp $");
1.2       yamt       79:
                     80: #include "opt_ddb.h"
                     81: #include "opt_lockdebug.h"
                     82: #include "opt_perfctrs.h"
                     83:
                     84: #define        __MUTEX_PRIVATE
                     85:
                     86: #include <sys/param.h>
                     87: #include <sys/systm.h>
                     88: #include <sys/callout.h>
                     89: #include <sys/cpu.h>
                     90: #include <sys/proc.h>
                     91: #include <sys/kernel.h>
                     92: #include <sys/signalvar.h>
                     93: #include <sys/resourcevar.h>
                     94: #include <sys/sched.h>
                     95: #include <sys/sysctl.h>
                     96: #include <sys/kauth.h>
                     97: #include <sys/lockdebug.h>
                     98: #include <sys/kmem.h>
1.5       ad         99: #include <sys/intr.h>
1.2       yamt      100:
                    101: #include <uvm/uvm_extern.h>
                    102:
                    103: /*
                    104:  * Run queues.
                    105:  *
1.7.4.1 ! mjf       106:  * We maintain bitmasks of non-empty queues in order to speed up finding
        !           107:  * the first runnable process.  Since there can be (by definition) few
        !           108:  * real time LWPs in the system, we maintain them on a linked list,
        !           109:  * sorted by priority.
1.2       yamt      110:  */
                    111:
1.7.4.1 ! mjf       112: #define        PPB_SHIFT       5
        !           113: #define        PPB_MASK        31
        !           114:
        !           115: #define        NUM_Q           (NPRI_KERNEL + NPRI_USER)
        !           116: #define        NUM_PPB         (1 << PPB_SHIFT)
        !           117: #define        NUM_B           (NUM_Q / NUM_PPB)
        !           118:
1.2       yamt      119: typedef struct runqueue {
1.7.4.1 ! mjf       120:        TAILQ_HEAD(, lwp) rq_fixedpri;          /* realtime, kthread */
        !           121:        u_int           rq_count;               /* total # jobs */
        !           122:        uint32_t        rq_bitmap[NUM_B];       /* bitmap of queues */
        !           123:        TAILQ_HEAD(, lwp) rq_queue[NUM_Q];      /* user+kernel */
1.2       yamt      124: } runqueue_t;
1.7.4.1 ! mjf       125:
1.2       yamt      126: static runqueue_t global_queue;
                    127:
                    128: static void updatepri(struct lwp *);
                    129: static void resetpriority(struct lwp *);
                    130:
1.6       rmind     131: fixpt_t decay_cpu(fixpt_t, fixpt_t);
                    132:
1.2       yamt      133: extern unsigned int sched_pstats_ticks; /* defined in kern_synch.c */
                    134:
                    135: /* The global scheduler state */
                    136: kmutex_t sched_mutex;
                    137:
                    138: /* Number of hardclock ticks per sched_tick() */
                    139: int rrticks;
                    140:
1.7.4.1 ! mjf       141: const int schedppq = 1;
        !           142:
1.2       yamt      143: /*
                    144:  * Force switch among equal priority processes every 100ms.
                    145:  * Called from hardclock every hz/10 == rrticks hardclock ticks.
1.5       ad        146:  *
                    147:  * There's no need to lock anywhere in this routine, as it's
                    148:  * CPU-local and runs at IPL_SCHED (called from clock interrupt).
1.2       yamt      149:  */
                    150: /* ARGSUSED */
                    151: void
                    152: sched_tick(struct cpu_info *ci)
                    153: {
                    154:        struct schedstate_percpu *spc = &ci->ci_schedstate;
                    155:
                    156:        spc->spc_ticks = rrticks;
                    157:
1.7       rmind     158:        if (CURCPU_IDLE_P())
                    159:                return;
                    160:
                    161:        if (spc->spc_flags & SPCF_SEENRR) {
                    162:                /*
                    163:                 * The process has already been through a roundrobin
                    164:                 * without switching and may be hogging the CPU.
                    165:                 * Indicate that the process should yield.
                    166:                 */
                    167:                spc->spc_flags |= SPCF_SHOULDYIELD;
                    168:        } else
                    169:                spc->spc_flags |= SPCF_SEENRR;
                    170:
                    171:        cpu_need_resched(ci, 0);
1.2       yamt      172: }
                    173:
1.7.4.1 ! mjf       174: /*
        !           175:  * Why PRIO_MAX - 2? From setpriority(2):
        !           176:  *
        !           177:  *     prio is a value in the range -20 to 20.  The default priority is
        !           178:  *     0; lower priorities cause more favorable scheduling.  A value of
        !           179:  *     19 or 20 will schedule a process only when nothing at priority <=
        !           180:  *     0 is runnable.
        !           181:  *
        !           182:  * This gives estcpu influence over 18 priority levels, and leaves nice
        !           183:  * with 40 levels.  One way to think about it is that nice has 20 levels
        !           184:  * either side of estcpu's 18.
        !           185:  */
1.2       yamt      186: #define        ESTCPU_SHIFT    11
1.7.4.1 ! mjf       187: #define        ESTCPU_MAX      ((PRIO_MAX - 2) << ESTCPU_SHIFT)
        !           188: #define        ESTCPU_ACCUM    (1 << (ESTCPU_SHIFT - 1))
1.2       yamt      189: #define        ESTCPULIM(e)    min((e), ESTCPU_MAX)
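
/*
 * Worked example (illustrative): with ESTCPU_SHIFT == 11 and assuming the
 * customary PRIO_MAX == 20 from <sys/resource.h>, ESTCPU_MAX == 18 << 11
 * == 36864 and ESTCPU_ACCUM == 1 << 10 == 1024.  sched_schedclock() below
 * adds ESTCPU_ACCUM per tick, so l_estcpu crosses an ESTCPU_SHIFT boundary
 * (worsening the computed priority by one step) every two ticks, until it
 * is clamped at ESTCPU_MAX, i.e. at most 18 priority levels of influence,
 * matching the comment above.
 */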
                    190:
                    191: /*
                    192:  * Constants for digital decay and forget:
1.7.4.1 ! mjf       193:  *     90% of (l_estcpu) usage in 5 * loadav time
        !           194:  *     95% of (l_pctcpu) usage in 60 seconds (load insensitive)
1.2       yamt      195:  *          Note that, as ps(1) mentions, this can let percentages
                    196:  *          total over 100% (I've seen 137.9% for 3 processes).
                    197:  *
1.7.4.1 ! mjf       198:  * Note that hardclock updates l_estcpu and l_cpticks independently.
1.2       yamt      199:  *
1.7.4.1 ! mjf       200:  * We wish to decay away 90% of l_estcpu in (5 * loadavg) seconds.
1.2       yamt      201:  * That is, the system wants to compute a value of decay such
                    202:  * that the following for loop:
                    203:  *     for (i = 0; i < (5 * loadavg); i++)
1.7.4.1 ! mjf       204:  *             l_estcpu *= decay;
1.2       yamt      205:  * will compute
1.7.4.1 ! mjf       206:  *     l_estcpu *= 0.1;
1.2       yamt      207:  * for all values of loadavg:
                    208:  *
                    209:  * Mathematically this loop can be expressed by saying:
                    210:  *     decay ** (5 * loadavg) ~= .1
                    211:  *
                    212:  * The system computes decay as:
                    213:  *     decay = (2 * loadavg) / (2 * loadavg + 1)
                    214:  *
                    215:  * We wish to prove that the system's computation of decay
                    216:  * will always fulfill the equation:
                    217:  *     decay ** (5 * loadavg) ~= .1
                    218:  *
                    219:  * If we compute b as:
                    220:  *     b = 2 * loadavg
                    221:  * then
                    222:  *     decay = b / (b + 1)
                    223:  *
                    224:  * We now need to prove two things:
                    225:  *     1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
                    226:  *     2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
                    227:  *
                    228:  * Facts:
                    229:  *         For x close to zero, exp(x) =~ 1 + x, since
                    230:  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
                    231:  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
                    232:  *         For x close to zero, ln(1+x) =~ x, since
                    233:  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
                    234:  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
                    235:  *         ln(.1) =~ -2.30
                    236:  *
                    237:  * Proof of (1):
                    238:  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
                    239:  *     solving for factor,
                    240:  *      ln(factor) =~ (-2.30/5*loadav), or
                    241:  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
                    242:  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
                    243:  *
                    244:  * Proof of (2):
                    245:  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
                    246:  *     solving for power,
                    247:  *      power*ln(b/(b+1)) =~ -2.30, or
                    248:  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
                    249:  *
                    250:  * Actual power values for the implemented algorithm are as follows:
                    251:  *      loadav: 1       2       3       4
                    252:  *      power:  5.68    10.32   14.94   19.55
                    253:  */
                    254:
                    255: /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
                    256: #define        loadfactor(loadav)      (2 * (loadav))
                    257:
1.6       rmind     258: fixpt_t
1.2       yamt      259: decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
                    260: {
                    261:
                    262:        if (estcpu == 0) {
                    263:                return 0;
                    264:        }
                    265:
                    266: #if !defined(_LP64)
                    267:        /* avoid 64bit arithmetics. */
                    268: #define        FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1))
                    269:        if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) {
                    270:                return estcpu * loadfac / (loadfac + FSCALE);
                    271:        }
                    272: #endif /* !defined(_LP64) */
                    273:
                    274:        return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
                    275: }
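
/*
 * Worked example (illustrative): with a load average of 1.0 (loadav ==
 * FSCALE), loadfactor() gives loadfac == 2 * FSCALE and the expression
 * above reduces to
 *
 *	estcpu * (2 * FSCALE) / (3 * FSCALE) == estcpu * 2 / 3
 *
 * so each call scales l_estcpu by roughly 0.667, and (2/3)**5.68 =~ 0.1,
 * which is the "power" entry for loadav 1 in the table above.
 */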
                    276:
                    277: /*
1.7.4.1 ! mjf       278:  * For all load averages >= 1 and max l_estcpu of (255 << ESTCPU_SHIFT),
        !           279:  * sleeping for at least seven times the loadfactor will decay l_estcpu to
1.2       yamt      280:  * less than (1 << ESTCPU_SHIFT).
                    281:  *
                    282:  * note that our ESTCPU_MAX is actually much smaller than (255 << ESTCPU_SHIFT).
                    283:  */
                    284: static fixpt_t
                    285: decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n)
                    286: {
                    287:
                    288:        if ((n << FSHIFT) >= 7 * loadfac) {
                    289:                return 0;
                    290:        }
                    291:
                    292:        while (estcpu != 0 && n > 1) {
                    293:                estcpu = decay_cpu(loadfac, estcpu);
                    294:                n--;
                    295:        }
                    296:
                    297:        return estcpu;
                    298: }
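
/*
 * Worked example (illustrative): the early exit above compares n (seconds
 * slept) with 7 * loadfac in fixed point, which works out to
 * n >= 14 * loadavg.  With a load average of 2.0, an LWP that has slept
 * for 28 seconds or more simply has its l_estcpu zeroed rather than being
 * decayed one step at a time.
 */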
                    299:
                    300: /*
                    301:  * sched_pstats_hook:
                    302:  *
                    303:  * Periodically called from sched_pstats(); used to recalculate priorities.
                    304:  */
                    305: void
1.6       rmind     306: sched_pstats_hook(struct lwp *l)
1.2       yamt      307: {
1.7.4.1 ! mjf       308:        fixpt_t loadfac;
        !           309:        int sleeptm;
1.2       yamt      310:
1.7.4.1 ! mjf       311:        /*
        !           312:         * If the LWP has slept an entire second, stop recalculating
        !           313:         * its priority until it wakes up.
        !           314:         */
        !           315:        if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
        !           316:            l->l_stat == LSSUSPENDED) {
        !           317:                l->l_slptime++;
        !           318:                sleeptm = 1;
        !           319:        } else {
        !           320:                sleeptm = 0x7fffffff;
        !           321:        }
        !           322:
        !           323:        if (l->l_slptime <= sleeptm) {
        !           324:                loadfac = 2 * (averunnable.ldavg[0]);
        !           325:                l->l_estcpu = decay_cpu(loadfac, l->l_estcpu);
1.6       rmind     326:                resetpriority(l);
1.7.4.1 ! mjf       327:        }
1.2       yamt      328: }
                    329:
                    330: /*
                    331:  * Recalculate the priority of a process after it has slept for a while.
                    332:  */
                    333: static void
                    334: updatepri(struct lwp *l)
                    335: {
                    336:        fixpt_t loadfac;
                    337:
1.3       ad        338:        KASSERT(lwp_locked(l, NULL));
1.2       yamt      339:        KASSERT(l->l_slptime > 1);
                    340:
                    341:        loadfac = loadfactor(averunnable.ldavg[0]);
                    342:
                    343:        l->l_slptime--; /* the first time was done in sched_pstats */
1.7.4.1 ! mjf       344:        l->l_estcpu = decay_cpu_batch(loadfac, l->l_estcpu, l->l_slptime);
1.2       yamt      345:        resetpriority(l);
                    346: }
                    347:
                    348: static void
                    349: runqueue_init(runqueue_t *rq)
                    350: {
                    351:        int i;
                    352:
1.7.4.1 ! mjf       353:        for (i = 0; i < NUM_Q; i++)
        !           354:                TAILQ_INIT(&rq->rq_queue[i]);
        !           355:        for (i = 0; i < NUM_B; i++)
        !           356:                rq->rq_bitmap[i] = 0;
        !           357:        TAILQ_INIT(&rq->rq_fixedpri);
        !           358:        rq->rq_count = 0;
1.2       yamt      359: }
                    360:
                    361: static void
                    362: runqueue_enqueue(runqueue_t *rq, struct lwp *l)
                    363: {
1.7.4.1 ! mjf       364:        pri_t pri;
        !           365:        lwp_t *l2;
1.2       yamt      366:
                    367:        KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
                    368:
1.7.4.1 ! mjf       369:        pri = lwp_eprio(l);
        !           370:        rq->rq_count++;
        !           371:
        !           372:        if (pri >= PRI_KTHREAD) {
        !           373:                TAILQ_FOREACH(l2, &rq->rq_fixedpri, l_runq) {
        !           374:                        if (lwp_eprio(l2) < pri) {
        !           375:                                TAILQ_INSERT_BEFORE(l2, l, l_runq);
        !           376:                                return;
        !           377:                        }
        !           378:                }
        !           379:                TAILQ_INSERT_TAIL(&rq->rq_fixedpri, l, l_runq);
        !           380:                return;
        !           381:        }
        !           382:
        !           383:        rq->rq_bitmap[pri >> PPB_SHIFT] |=
        !           384:            (0x80000000U >> (pri & PPB_MASK));
        !           385:        TAILQ_INSERT_TAIL(&rq->rq_queue[pri], l, l_runq);
1.2       yamt      386: }
                    387:
                    388: static void
                    389: runqueue_dequeue(runqueue_t *rq, struct lwp *l)
                    390: {
1.7.4.1 ! mjf       391:        pri_t pri;
1.2       yamt      392:
                    393:        KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
                    394:
1.7.4.1 ! mjf       395:        pri = lwp_eprio(l);
        !           396:        rq->rq_count--;
        !           397:
        !           398:        if (pri >= PRI_KTHREAD) {
        !           399:                TAILQ_REMOVE(&rq->rq_fixedpri, l, l_runq);
        !           400:                return;
        !           401:        }
        !           402:
        !           403:        TAILQ_REMOVE(&rq->rq_queue[pri], l, l_runq);
        !           404:        if (TAILQ_EMPTY(&rq->rq_queue[pri]))
        !           405:                rq->rq_bitmap[pri >> PPB_SHIFT] ^=
        !           406:                    (0x80000000U >> (pri & PPB_MASK));
1.2       yamt      407: }
                    408:
1.7.4.1 ! mjf       409: #if (NUM_B != 3) || (NUM_Q != 96)
        !           410: #error adjust runqueue_nextlwp
        !           411: #endif
        !           412:
1.2       yamt      413: static struct lwp *
                    414: runqueue_nextlwp(runqueue_t *rq)
                    415: {
1.7.4.1 ! mjf       416:        pri_t pri;
1.2       yamt      417:
1.7.4.1 ! mjf       418:        KASSERT(rq->rq_count != 0);
        !           419:
        !           420:        if (!TAILQ_EMPTY(&rq->rq_fixedpri))
        !           421:                return TAILQ_FIRST(&rq->rq_fixedpri);
        !           422:
        !           423:        if (rq->rq_bitmap[2] != 0)
        !           424:                pri = 96 - ffs(rq->rq_bitmap[2]);
        !           425:        else if (rq->rq_bitmap[1] != 0)
        !           426:                pri = 64 - ffs(rq->rq_bitmap[1]);
        !           427:        else
        !           428:                pri = 32 - ffs(rq->rq_bitmap[0]);
        !           429:        return TAILQ_FIRST(&rq->rq_queue[pri]);
1.2       yamt      430: }
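
/*
 * Illustrative sketch: hypothetical helpers (not used elsewhere in this
 * file) showing how the bitmap encoding shared by runqueue_enqueue() and
 * runqueue_nextlwp() round-trips.  They rely only on the constants above
 * and on ffs().
 */
static inline uint32_t
pri_to_bit(pri_t pri)
{
	/* Same encoding as runqueue_enqueue(): priority 0 of a word is the MSB. */
	return 0x80000000U >> (pri & PPB_MASK);
}

static inline pri_t
word_best_pri(int word, uint32_t bits)
{
	/*
	 * Same decoding as runqueue_nextlwp(): the least significant set
	 * bit of a word corresponds to the highest priority queued there.
	 */
	return ((word + 1) << PPB_SHIFT) - ffs(bits);
}
/*
 * For example, an LWP at priority 42 sets bit 0x80000000 >> 10 ==
 * 0x00200000 in rq_bitmap[42 >> 5] == rq_bitmap[1], and
 * word_best_pri(1, 0x00200000) == 64 - 22 == 42.
 */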
                    431:
                    432: #if defined(DDB)
                    433: static void
                    434: runqueue_print(const runqueue_t *rq, void (*pr)(const char *, ...))
                    435: {
1.7.4.1 ! mjf       436:        CPU_INFO_ITERATOR cii;
        !           437:        struct cpu_info *ci;
        !           438:        lwp_t *l;
        !           439:        int i;
1.2       yamt      440:
1.7.4.1 ! mjf       441:        (*pr)("PID\tLID\tPRI\tIPRI\tEPRI\tLWP\t\t NAME\n");
        !           442:
        !           443:        TAILQ_FOREACH(l, &rq->rq_fixedpri, l_runq) {
        !           444:                (*pr)("%d\t%d\t%d\t%d\t%d\t%016lx %s\n",
        !           445:                    l->l_proc->p_pid, l->l_lid, (int)l->l_priority,
        !           446:                    (int)l->l_inheritedprio, lwp_eprio(l),
        !           447:                    (long)l, l->l_proc->p_comm);
        !           448:        }
        !           449:
        !           450:        for (i = NUM_Q - 1; i >= 0; i--) {
        !           451:                TAILQ_FOREACH(l, &rq->rq_queue[i], l_runq) {
        !           452:                        (*pr)("%d\t%d\t%d\t%d\t%d\t%016lx %s\n",
        !           453:                            l->l_proc->p_pid, l->l_lid, (int)l->l_priority,
        !           454:                            (int)l->l_inheritedprio, lwp_eprio(l),
        !           455:                            (long)l, l->l_proc->p_comm);
1.2       yamt      456:                }
                    457:        }
1.7.4.1 ! mjf       458:
        !           459:        (*pr)("CPUIDX\tRESCHED\tCURPRI\tFLAGS\n");
        !           460:        for (CPU_INFO_FOREACH(cii, ci)) {
        !           461:                (*pr)("%d\t%d\t%d\t%04x\n", (int)ci->ci_index,
        !           462:                    (int)ci->ci_want_resched,
        !           463:                    (int)ci->ci_schedstate.spc_curpriority,
        !           464:                    (int)ci->ci_schedstate.spc_flags);
        !           465:        }
        !           466:
        !           467:        (*pr)("NEXTLWP\n%016lx\n", (long)sched_nextlwp());
1.2       yamt      468: }
                    469: #endif /* defined(DDB) */
                    470:
                    471: /*
                    472:  * Initialize the (doubly-linked) run queues
                    473:  * to be empty.
                    474:  */
                    475: void
                    476: sched_rqinit()
                    477: {
                    478:
                    479:        runqueue_init(&global_queue);
                    480:        mutex_init(&sched_mutex, MUTEX_SPIN, IPL_SCHED);
                    481:        /* Initialize the lock pointer for lwp0 */
                    482:        lwp0.l_mutex = &curcpu()->ci_schedstate.spc_lwplock;
                    483: }
                    484:
                    485: void
                    486: sched_cpuattach(struct cpu_info *ci)
                    487: {
                    488:        runqueue_t *rq;
                    489:
                    490:        ci->ci_schedstate.spc_mutex = &sched_mutex;
                    491:        rq = kmem_zalloc(sizeof(*rq), KM_NOSLEEP);
                    492:        runqueue_init(rq);
                    493:        ci->ci_schedstate.spc_sched_info = rq;
                    494: }
                    495:
                    496: void
                    497: sched_setup()
                    498: {
                    499:
                    500:        rrticks = hz / 10;
                    501: }
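
/*
 * Worked example (illustrative): with the traditional hz == 100 this sets
 * rrticks to 10, so sched_tick() runs every 10 hardclock ticks, giving the
 * 100 ms round-robin interval described in the comment above sched_tick().
 */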
                    502:
                    503: void
                    504: sched_setrunnable(struct lwp *l)
                    505: {
                    506:
                    507:        if (l->l_slptime > 1)
                    508:                updatepri(l);
                    509: }
                    510:
                    511: bool
                    512: sched_curcpu_runnable_p(void)
                    513: {
1.4       ad        514:        struct schedstate_percpu *spc;
1.7.4.1 ! mjf       515:        struct cpu_info *ci;
        !           516:        int bits;
1.2       yamt      517:
1.7.4.1 ! mjf       518:        ci = curcpu();
        !           519:        spc = &ci->ci_schedstate;
        !           520: #ifndef __HAVE_FAST_SOFTINTS
        !           521:        bits = ci->ci_data.cpu_softints;
        !           522:        bits |= ((runqueue_t *)spc->spc_sched_info)->rq_count;
        !           523: #else
        !           524:        bits = ((runqueue_t *)spc->spc_sched_info)->rq_count;
        !           525: #endif
1.4       ad        526:        if (__predict_true((spc->spc_flags & SPCF_OFFLINE) == 0))
1.7.4.1 ! mjf       527:                bits |= global_queue.rq_count;
        !           528:        return bits != 0;
1.2       yamt      529: }
                    530:
                    531: void
1.7.4.1 ! mjf       532: sched_nice(struct proc *p, int n)
1.2       yamt      533: {
1.7.4.1 ! mjf       534:        struct lwp *l;
        !           535:
        !           536:        KASSERT(mutex_owned(&p->p_smutex));
1.2       yamt      537:
1.7.4.1 ! mjf       538:        p->p_nice = n;
        !           539:        LIST_FOREACH(l, &p->p_lwps, l_sibling) {
        !           540:                lwp_lock(l);
        !           541:                resetpriority(l);
        !           542:                lwp_unlock(l);
        !           543:        }
1.2       yamt      544: }
                    545:
                    546: /*
1.7.4.1 ! mjf       547:  * Recompute the priority of an LWP.  Arrange to reschedule if
        !           548:  * the resulting priority is better than that of the current LWP.
1.2       yamt      549:  */
                    550: static void
                    551: resetpriority(struct lwp *l)
                    552: {
1.7.4.1 ! mjf       553:        pri_t pri;
1.2       yamt      554:        struct proc *p = l->l_proc;
                    555:
1.7.4.1 ! mjf       556:        KASSERT(lwp_locked(l, NULL));
1.2       yamt      557:
1.7.4.1 ! mjf       558:        if (l->l_class != SCHED_OTHER)
1.2       yamt      559:                return;
                    560:
1.7.4.1 ! mjf       561:        /* See comments above ESTCPU_SHIFT definition. */
        !           562:        pri = (PRI_KERNEL - 1) - (l->l_estcpu >> ESTCPU_SHIFT) - p->p_nice;
        !           563:        pri = imax(pri, 0);
        !           564:        if (pri != l->l_priority)
        !           565:                lwp_changepri(l, pri);
1.2       yamt      566: }
                    567:
                    568: /*
                    569:  * We adjust the priority of the current process.  The priority of a process
1.7.4.1 ! mjf       570:  * gets worse as it accumulates CPU time.  The CPU usage estimator (l_estcpu)
1.2       yamt      571:  * is increased here.  The formula for computing priorities (in resetpriority() above)
1.7.4.1 ! mjf       572:  * will compute a different value each time l_estcpu increases. This can
1.2       yamt      573:  * cause a switch, but unless the priority crosses a PPQ boundary the actual
                    574:  * queue will not change.  The CPU usage estimator ramps up quite quickly
                    575:  * when the process is running (linearly), and decays away exponentially, at
                    576:  * a rate which is proportionally slower when the system is busy.  The basic
                    577:  * principle is that the system will 90% forget that the process used a lot
                    578:  * of CPU time in 5 * loadav seconds.  This causes the system to favor
                    579:  * processes which haven't run much recently, and to round-robin among other
                    580:  * processes.
                    581:  */
                    582:
                    583: void
                    584: sched_schedclock(struct lwp *l)
                    585: {
1.7.4.1 ! mjf       586:
        !           587:        if (l->l_class != SCHED_OTHER)
        !           588:                return;
1.2       yamt      589:
                    590:        KASSERT(!CURCPU_IDLE_P());
1.7.4.1 ! mjf       591:        l->l_estcpu = ESTCPULIM(l->l_estcpu + ESTCPU_ACCUM);
1.2       yamt      592:        lwp_lock(l);
                    593:        resetpriority(l);
                    594:        lwp_unlock(l);
                    595: }
                    596:
                    597: /*
                    598:  * sched_proc_fork:
                    599:  *
                    600:  *     Inherit the parent's scheduler history.
                    601:  */
                    602: void
                    603: sched_proc_fork(struct proc *parent, struct proc *child)
                    604: {
1.7.4.1 ! mjf       605:        lwp_t *pl;
1.2       yamt      606:
1.3       ad        607:        KASSERT(mutex_owned(&parent->p_smutex));
1.2       yamt      608:
1.7.4.1 ! mjf       609:        pl = LIST_FIRST(&parent->p_lwps);
        !           610:        child->p_estcpu_inherited = pl->l_estcpu;
1.2       yamt      611:        child->p_forktime = sched_pstats_ticks;
                    612: }
                    613:
                    614: /*
                    615:  * sched_proc_exit:
                    616:  *
                    617:  *     Chargeback parents for the sins of their children.
                    618:  */
                    619: void
                    620: sched_proc_exit(struct proc *parent, struct proc *child)
                    621: {
                    622:        fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
                    623:        fixpt_t estcpu;
1.7.4.1 ! mjf       624:        lwp_t *pl, *cl;
1.2       yamt      625:
                    626:        /* XXX Only if parent != init?? */
                    627:
1.7.4.1 ! mjf       628:        mutex_enter(&parent->p_smutex);
        !           629:        pl = LIST_FIRST(&parent->p_lwps);
        !           630:        cl = LIST_FIRST(&child->p_lwps);
1.2       yamt      631:        estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
                    632:            sched_pstats_ticks - child->p_forktime);
1.7.4.1 ! mjf       633:        if (cl->l_estcpu > estcpu) {
        !           634:                lwp_lock(pl);
        !           635:                pl->l_estcpu = ESTCPULIM(pl->l_estcpu + cl->l_estcpu - estcpu);
        !           636:                lwp_unlock(pl);
        !           637:        }
        !           638:        mutex_exit(&parent->p_smutex);
1.2       yamt      639: }
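
/*
 * Worked example (illustrative): suppose the child inherited an estcpu of
 * 8 << ESTCPU_SHIFT at fork time, and decaying that inheritance over the
 * child's lifetime with decay_cpu_batch() leaves 3 << ESTCPU_SHIFT.  If the
 * child exits with l_estcpu == 10 << ESTCPU_SHIFT, the surplus of
 * 7 << ESTCPU_SHIFT is what the child accumulated on its own, and that is
 * what gets charged (subject to ESTCPULIM) to the parent's first LWP.
 */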
                    640:
                    641: void
                    642: sched_enqueue(struct lwp *l, bool ctxswitch)
                    643: {
                    644:
                    645:        if ((l->l_flag & LW_BOUND) != 0)
                    646:                runqueue_enqueue(l->l_cpu->ci_schedstate.spc_sched_info, l);
                    647:        else
                    648:                runqueue_enqueue(&global_queue, l);
                    649: }
                    650:
                    651: /*
                    652:  * XXXSMP When LWP dispatch (cpu_switch()) is changed to use sched_dequeue(),
                    653:  * drop of the effective priority level from kernel to user needs to be
                    654:  * moved here from userret().  The assignment in userret() is currently
                    655:  * done unlocked.
                    656:  */
                    657: void
                    658: sched_dequeue(struct lwp *l)
                    659: {
                    660:
                    661:        if ((l->l_flag & LW_BOUND) != 0)
                    662:                runqueue_dequeue(l->l_cpu->ci_schedstate.spc_sched_info, l);
                    663:        else
                    664:                runqueue_dequeue(&global_queue, l);
                    665: }
                    666:
                    667: struct lwp *
                    668: sched_nextlwp(void)
                    669: {
1.4       ad        670:        struct schedstate_percpu *spc;
1.7.4.1 ! mjf       671:        runqueue_t *rq;
1.2       yamt      672:        lwp_t *l1, *l2;
                    673:
1.4       ad        674:        spc = &curcpu()->ci_schedstate;
                    675:
1.2       yamt      676:        /* For now, just pick the highest priority LWP. */
1.7.4.1 ! mjf       677:        rq = spc->spc_sched_info;
        !           678:        l1 = NULL;
        !           679:        if (rq->rq_count != 0)
        !           680:                l1 = runqueue_nextlwp(rq);
        !           681:
        !           682:        rq = &global_queue;
        !           683:        if (__predict_false((spc->spc_flags & SPCF_OFFLINE) != 0) ||
        !           684:            rq->rq_count == 0)
1.4       ad        685:                return l1;
1.7.4.1 ! mjf       686:        l2 = runqueue_nextlwp(rq);
1.2       yamt      687:
                    688:        if (l1 == NULL)
                    689:                return l2;
                    690:        if (l2 == NULL)
                    691:                return l1;
1.7.4.1 ! mjf       692:        if (lwp_eprio(l2) > lwp_eprio(l1))
1.2       yamt      693:                return l2;
                    694:        else
                    695:                return l1;
                    696: }
                    697:
1.6       rmind     698: struct cpu_info *
                    699: sched_takecpu(struct lwp *l)
                    700: {
                    701:
                    702:        return l->l_cpu;
                    703: }
                    704:
                    705: void
                    706: sched_wakeup(struct lwp *l)
                    707: {
                    708:
                    709: }
                    710:
                    711: void
                    712: sched_slept(struct lwp *l)
                    713: {
                    714:
                    715: }
                    716:
1.2       yamt      717: void
1.7.4.1 ! mjf       718: sched_lwp_fork(struct lwp *l1, struct lwp *l2)
1.2       yamt      719: {
                    720:
1.7.4.1 ! mjf       721:        l2->l_estcpu = l1->l_estcpu;
1.2       yamt      722: }
                    723:
                    724: void
                    725: sched_lwp_exit(struct lwp *l)
                    726: {
                    727:
                    728: }
                    729:
1.7.4.1 ! mjf       730: void
        !           731: sched_lwp_collect(struct lwp *t)
        !           732: {
        !           733:        lwp_t *l;
        !           734:
        !           735:        /* Absorb estcpu value of collected LWP. */
        !           736:        l = curlwp;
        !           737:        lwp_lock(l);
        !           738:        l->l_estcpu += t->l_estcpu;
        !           739:        lwp_unlock(l);
        !           740: }
        !           741:
1.5       ad        742: /*
                    743:  * sysctl setup.  XXX This should be split with kern_synch.c.
                    744:  */
1.2       yamt      745: SYSCTL_SETUP(sysctl_sched_setup, "sysctl kern.sched subtree setup")
                    746: {
                    747:        const struct sysctlnode *node = NULL;
                    748:
                    749:        sysctl_createv(clog, 0, NULL, NULL,
                    750:                CTLFLAG_PERMANENT,
                    751:                CTLTYPE_NODE, "kern", NULL,
                    752:                NULL, 0, NULL, 0,
                    753:                CTL_KERN, CTL_EOL);
                    754:        sysctl_createv(clog, 0, NULL, &node,
                    755:                CTLFLAG_PERMANENT,
                    756:                CTLTYPE_NODE, "sched",
                    757:                SYSCTL_DESCR("Scheduler options"),
                    758:                NULL, 0, NULL, 0,
                    759:                CTL_KERN, CTL_CREATE, CTL_EOL);
                    760:
1.5       ad        761:        KASSERT(node != NULL);
                    762:
                    763:        sysctl_createv(clog, 0, &node, NULL,
                    764:                CTLFLAG_PERMANENT,
                    765:                CTLTYPE_STRING, "name", NULL,
                    766:                NULL, 0, __UNCONST("4.4BSD"), 0,
                    767:                CTL_CREATE, CTL_EOL);
                    768:        sysctl_createv(clog, 0, &node, NULL,
                    769:                CTLFLAG_READWRITE,
                    770:                CTLTYPE_INT, "timesoftints",
                    771:                SYSCTL_DESCR("Track CPU time for soft interrupts"),
                    772:                NULL, 0, &softint_timing, 0,
                    773:                CTL_CREATE, CTL_EOL);
1.2       yamt      774: }
                    775:
                    776: #if defined(DDB)
                    777: void
                    778: sched_print_runqueue(void (*pr)(const char *, ...))
                    779: {
                    780:
                    781:        runqueue_print(&global_queue, pr);
                    782: }
                    783: #endif /* defined(DDB) */
