Diffstat (limited to 'libgo/runtime/proc.c')
-rw-r--r--  libgo/runtime/proc.c | 707
1 file changed, 387 insertions(+), 320 deletions(-)
diff --git a/libgo/runtime/proc.c b/libgo/runtime/proc.c
index 30516ad7d7..da0f2ed3a7 100644
--- a/libgo/runtime/proc.c
+++ b/libgo/runtime/proc.c
@@ -18,7 +18,6 @@
 #include "arch.h"
 #include "defs.h"
 #include "malloc.h"
-#include "race.h"
 #include "go-type.h"
 #include "go-defer.h"
 
@@ -51,7 +50,7 @@ extern void __splitstack_block_signals_context (void *context[10], int *,
 #if defined(USING_SPLIT_STACK) && defined(LINKER_SUPPORTS_SPLIT_STACK)
 # define StackMin PTHREAD_STACK_MIN
 #else
-# define StackMin 2 * 1024 * 1024
+# define StackMin ((sizeof(char *) < 8) ? 2 * 1024 * 1024 : 4 * 1024 * 1024)
 #endif
 
 uintptr runtime_stacks_sys;
@@ -127,6 +126,30 @@ fixcontext(ucontext_t* c)
 	c->uc_mcontext._mc_tlsbase = tlsbase;
 }
 
+# elif defined(__sparc__)
+
+static inline void
+initcontext(void)
+{
+}
+
+static inline void
+fixcontext(ucontext_t *c)
+{
+	/* ??? Using
+	     register unsigned long thread __asm__("%g7");
+	     c->uc_mcontext.gregs[REG_G7] = thread;
+	   results in
+	     error: variable ‘thread’ might be clobbered by \
+	       ‘longjmp’ or ‘vfork’ [-Werror=clobbered]
+	   which ought to be false, as %g7 is a fixed register.  */
+
+	if (sizeof (c->uc_mcontext.gregs[REG_G7]) == 8)
+		asm ("stx %%g7, %0" : "=m"(c->uc_mcontext.gregs[REG_G7]));
+	else
+		asm ("st %%g7, %0" : "=m"(c->uc_mcontext.gregs[REG_G7]));
+}
+
 # else
 
 #  error unknown case for SETCONTEXT_CLOBBERS_TLS
 
@@ -167,15 +190,11 @@ runtime_setmg(M* mp, G* gp)
 	g = gp;
 }
 
-// The static TLS size.  See runtime_newm.
-static int tlssize;
-
 // Start a new thread.
 static void
 runtime_newosproc(M *mp)
 {
 	pthread_attr_t attr;
-	size_t stacksize;
 	sigset_t clear, old;
 	pthread_t tid;
 	int ret;
@@ -185,19 +204,6 @@ runtime_newosproc(M *mp)
 	if(pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED) != 0)
 		runtime_throw("pthread_attr_setdetachstate");
 
-	stacksize = PTHREAD_STACK_MIN;
-
-	// With glibc before version 2.16 the static TLS size is taken
-	// out of the stack size, and we get an error or a crash if
-	// there is not enough stack space left.  Add it back in if we
-	// can, in case the program uses a lot of TLS space.  FIXME:
-	// This can be disabled in glibc 2.16 and later, if the bug is
-	// indeed fixed then.
-	stacksize += tlssize;
-
-	if(pthread_attr_setstacksize(&attr, stacksize) != 0)
-		runtime_throw("pthread_attr_setstacksize");
-
 	// Block signals during pthread_create so that the new thread
 	// starts with signals disabled.  It will enable them in minit.
 	sigfillset(&clear);
@@ -255,9 +261,6 @@ runtime_mcall(void (*pfn)(G*))
 {
 	M *mp;
 	G *gp;
-#ifndef USING_SPLIT_STACK
-	int i;
-#endif
 
 	// Ensure that all registers are on the stack for the garbage
 	// collector.
@@ -273,7 +276,7 @@ runtime_mcall(void (*pfn)(G*))
 #ifdef USING_SPLIT_STACK
 	__splitstack_getcontext(&g->stack_context[0]);
 #else
-	gp->gcnext_sp = &i;
+	gp->gcnext_sp = &pfn;
 #endif
 	gp->fromgogo = false;
 	getcontext(&gp->context);
@@ -309,43 +312,6 @@ runtime_mcall(void (*pfn)(G*))
 	}
 }
 
-#ifdef HAVE_DL_ITERATE_PHDR
-
-// Called via dl_iterate_phdr.
-
-static int
-addtls(struct dl_phdr_info* info, size_t size __attribute__ ((unused)), void *data)
-{
-	size_t *total = (size_t *)data;
-	unsigned int i;
-
-	for(i = 0; i < info->dlpi_phnum; ++i) {
-		if(info->dlpi_phdr[i].p_type == PT_TLS)
-			*total += info->dlpi_phdr[i].p_memsz;
-	}
-	return 0;
-}
-
-// Set the total TLS size.
-
-static void
-inittlssize()
-{
-	size_t total = 0;
-
-	dl_iterate_phdr(addtls, (void *)&total);
-	tlssize = total;
-}
-
-#else
-
-static void
-inittlssize()
-{
-}
-
-#endif
-
 // Goroutine scheduler
 // The scheduler's job is to distribute ready-to-run goroutines over worker threads.
 //
@@ -392,17 +358,23 @@ struct Sched {
 	int32	profilehz;	// cpu profiling rate
 };
 
-// The max value of GOMAXPROCS.
-// There are no fundamental restrictions on the value.
-enum { MaxGomaxprocs = 1<<8 };
+enum
+{
+	// The max value of GOMAXPROCS.
+	// There are no fundamental restrictions on the value.
+	MaxGomaxprocs = 1<<8,
+
+	// Number of goroutine ids to grab from runtime_sched.goidgen to local per-P cache at once.
+	// 16 seems to provide enough amortization, but other than that it's a mostly arbitrary number.
+	GoidCacheBatch = 16,
+};
 
 Sched	runtime_sched;
 int32	runtime_gomaxprocs;
 uint32	runtime_needextram = 1;
 bool	runtime_iscgo = true;
 M	runtime_m0;
-G	runtime_g0;	// idle goroutine for m0
-G*	runtime_allg;
+G	runtime_g0;	// idle goroutine for m0
 G*	runtime_lastg;
 M*	runtime_allm;
 P**	runtime_allp;
@@ -412,10 +384,15 @@ int32	runtime_ncpu;
 bool	runtime_precisestack;
 static int32	newprocs;
 
+static	Lock allglock;	// the following vars are protected by this lock or by stoptheworld
+G**	runtime_allg;
+uintptr runtime_allglen;
+static	uintptr allgcap;
+
 void* runtime_mstart(void*);
 static void runqput(P*, G*);
 static G* runqget(P*);
-static void runqgrow(P*);
+static bool runqputslow(P*, G*, uint32, uint32);
 static G* runqsteal(P*, P*);
 static void mput(M*);
 static M* mget(void);
@@ -442,12 +419,14 @@ static void gfput(P*, G*);
 static G* gfget(P*);
 static void gfpurge(P*);
 static void globrunqput(G*);
+static void globrunqputbatch(G*, G*, int32);
 static G* globrunqget(P*, int32);
 static P* pidleget(void);
 static void pidleput(P*);
 static void injectglist(G*);
 static bool preemptall(void);
 static bool exitsyscallfast(void);
+static void allgadd(G*);
 
 // The bootstrap sequence is:
 //
@@ -471,12 +450,11 @@ runtime_schedinit(void)
 	g->m = m;
 
 	initcontext();
-	inittlssize();
 
 	runtime_sched.maxmcount = 10000;
 	runtime_precisestack = 0;
 
-	runtime_mprofinit();
+	// runtime_symtabinit();
 	runtime_mallocinit();
 	mcommoninit(m);
 
@@ -485,6 +463,10 @@ runtime_schedinit(void)
 	// in a fault during a garbage collection, it will not
 	// need to allocate memory.
 	runtime_newErrorCString(0, &i);
+
+	// Initialize the cached gotraceback value, since
+	// gotraceback calls getenv, which mallocs on Plan 9.
+	runtime_gotraceback(nil);
 
 	runtime_goargs();
 	runtime_goenvs();
@@ -503,9 +485,6 @@ runtime_schedinit(void)
 
 	// Can not enable GC until all roots are registered.
 	// mstats.enablegc = 1;
-
-	// if(raceenabled)
-	//	g->racectx = runtime_raceinit();
 }
 
 extern void main_init(void) __asm__ (GOSYM_PREFIX "__go_init_main");
@@ -517,6 +496,15 @@ initDone(void *arg __attribute__ ((unused))) {
 };
 
 // The main goroutine.
+// Note: C frames in general are not copyable during stack growth, for two reasons:
+//   1) We don't know where in a frame to find pointers to other stack locations.
+//   2) There's no guarantee that globals or heap values do not point into the frame.
+//
+// The C frame for runtime.main is copyable, because:
+//   1) There are no pointers to other stack locations in the frame
+//      (d.fn points at a global, d.link is nil, d.argp is -1).
+//   2) The only pointer into this frame is from the defer chain,
+//      which is explicitly handled during stack copying.
 void
 runtime_main(void* dummy __attribute__((unused)))
 {
@@ -541,7 +529,7 @@ runtime_main(void* dummy __attribute__((unused)))
 	d.__retaddr = nil;
 	d.__makefunc_can_recover = 0;
 	d.__frame = &frame;
-	d.__free = 0;
+	d.__special = true;
 	g->defer = &d;
 
 	if(m != &runtime_m0)
@@ -560,8 +548,6 @@ runtime_main(void* dummy __attribute__((unused)))
 	mstats.enablegc = 1;
 
 	main_main();
-	if(raceenabled)
-		runtime_racefini();
 
 	// Make racy client program work: if panicking on
 	// another goroutine at the same time as main returns,
@@ -579,6 +565,7 @@ void
 runtime_goroutineheader(G *gp)
 {
 	const char *status;
+	int64 waitfor;
 
 	switch(gp->status) {
 	case Gidle:
@@ -603,7 +590,16 @@ runtime_goroutineheader(G *gp)
 		status = "???";
 		break;
 	}
-	runtime_printf("goroutine %D [%s]:\n", gp->goid, status);
+
+	// approx time the G is blocked, in minutes
+	waitfor = 0;
+	if((gp->status == Gwaiting || gp->status == Gsyscall) && gp->waitsince != 0)
+		waitfor = (runtime_nanotime() - gp->waitsince) / (60LL*1000*1000*1000);
+
+	if(waitfor < 1)
+		runtime_printf("goroutine %D [%s]:\n", gp->goid, status);
+	else
+		runtime_printf("goroutine %D [%s, %D minutes]:\n", gp->goid, status, waitfor);
 }
 
 void
@@ -624,7 +620,7 @@ runtime_printcreatedby(G *g)
 struct Traceback
 {
 	G* gp;
-	Location locbuf[100];
+	Location locbuf[TracebackMaxFrames];
 	int32 c;
 };
 
@@ -634,6 +630,7 @@ runtime_tracebackothers(G * volatile me)
 	G * volatile gp;
 	Traceback tb;
 	int32 traceback;
+	volatile uintptr i;
 
 	tb.gp = me;
 	traceback = runtime_gotraceback(nil);
@@ -657,7 +654,9 @@ runtime_tracebackothers(G * volatile me)
 		runtime_printcreatedby(gp);
 	}
 
-	for(gp = runtime_allg; gp != nil; gp = gp->alllink) {
+	runtime_lock(&allglock);
+	for(i = 0; i < runtime_allglen; i++) {
+		gp = runtime_allg[i];
 		if(gp == me || gp == m->curg || gp->status == Gdead)
 			continue;
 		if(gp->issystem && traceback < 2)
@@ -696,6 +695,7 @@ runtime_tracebackothers(G * volatile me)
 			runtime_printcreatedby(gp);
 		}
 	}
+	runtime_unlock(&allglock);
 }
 
 static void
@@ -719,7 +719,7 @@ gtraceback(G* gp)
 	traceback = gp->traceback;
 	gp->traceback = nil;
 	traceback->c = runtime_callers(1, traceback->locbuf,
-		sizeof traceback->locbuf / sizeof traceback->locbuf[0]);
+		sizeof traceback->locbuf / sizeof traceback->locbuf[0], false);
 	runtime_gogo(traceback->gp);
 }
 
@@ -729,7 +729,7 @@ mcommoninit(M *mp)
 	// If there is no mcache runtime_callers() will crash,
 	// and we are most likely in sysmon thread so the stack is senseless anyway.
 	if(m->mcache)
-		runtime_callers(1, mp->createstack, nelem(mp->createstack));
+		runtime_callers(1, mp->createstack, nelem(mp->createstack), false);
 
 	mp->fastrand = 0x49f6428aUL + mp->id + runtime_cputicks();
 
@@ -1038,6 +1038,7 @@ struct CgoThreadStart
 {
 	M *m;
 	G *g;
+	uintptr *tls;
 	void (*fn)(void);
 };
 
@@ -1070,6 +1071,22 @@ runtime_allocm(P *p, int32 stacksize, byte** ret_g0_stack, size_t* ret_g0_stacks
 	return mp;
 }
 
+static G*
+allocg(void)
+{
+	G *gp;
+	// static Type *gtype;
+
+	// if(gtype == nil) {
+	//	Eface e;
+	//	runtime_gc_g_ptr(&e);
+	//	gtype = ((PtrType*)e.__type_descriptor)->__element_type;
+	// }
+	// gp = runtime_cnew(gtype);
+	gp = runtime_malloc(sizeof(G));
+	return gp;
+}
+
 static M* lockextra(bool nilokay);
 static void unlockextra(M*);
 
@@ -1151,6 +1168,7 @@ runtime_needm(void)
 	__splitstack_getcontext(&g->stack_context[0]);
 #else
 	g->gcinitial_sp = &mp;
+	g->gcstack = nil;
 	g->gcstack_size = 0;
 	g->gcnext_sp = &mp;
 #endif
@@ -1200,22 +1218,12 @@ runtime_newextram(void)
 	gp->lockedm = mp;
 	gp->goid = runtime_xadd64(&runtime_sched.goidgen, 1);
 	// put on allg for garbage collector
-	runtime_lock(&runtime_sched);
-	if(runtime_lastg == nil)
-		runtime_allg = gp;
-	else
-		runtime_lastg->alllink = gp;
-	runtime_lastg = gp;
-	runtime_unlock(&runtime_sched);
-	gp->goid = runtime_xadd64(&runtime_sched.goidgen, 1);
+	allgadd(gp);
 
 	// The context for gp will be set up in runtime_needm.  But
 	// here we need to set up the context for g0.
 	getcontext(&mp->g0->context);
 	mp->g0->context.uc_stack.ss_sp = g0_sp;
-#ifdef MAKECONTEXT_STACK_TOP
-	mp->g0->context.uc_stack.ss_sp += g0_spsize;
-#endif
 	mp->g0->context.uc_stack.ss_size = g0_spsize;
 	makecontext(&mp->g0->context, kickoff, 0);
 
@@ -1262,6 +1270,8 @@ runtime_dropm(void)
 	runtime_setmg(nil, nil);
 
 	mp->curg->status = Gdead;
+	mp->curg->gcstack = nil;
+	mp->curg->gcnext_sp = nil;
 
 	mnext = lockextra(true);
 	mp->schedlink = mnext;
@@ -1382,7 +1392,7 @@ mspinning(void)
 }
 
 // Schedules some M to run the p (creates an M if necessary).
-// If p==nil, tries to get an idle P, if no idle P's returns false.
+// If p==nil, tries to get an idle P, if no idle P's does nothing.
 static void
 startm(P *p, bool spinning)
 {
@@ -1546,6 +1556,7 @@ execute(G *gp)
 		runtime_throw("execute: bad g status");
 	}
 	gp->status = Grunning;
+	gp->waitsince = 0;
 	m->p->schedtick++;
 	m->curg = gp;
 	gp->m = m;
@@ -1572,6 +1583,8 @@ top:
 		gcstopm();
 		goto top;
 	}
+	if(runtime_fingwait && runtime_fingwake && (gp = runtime_wakefing()) != nil)
+		runtime_ready(gp);
 	// local runq
 	gp = runqget(m->p);
 	if(gp)
@@ -1763,28 +1776,52 @@ top:
 	execute(gp);
 }
 
-// Puts the current goroutine into a waiting state and unlocks the lock.
-// The goroutine can be made runnable again by calling runtime_ready(gp).
+// Puts the current goroutine into a waiting state and calls unlockf.
+// If unlockf returns false, the goroutine is resumed.
 void
-runtime_park(void(*unlockf)(Lock*), Lock *lock, const char *reason)
+runtime_park(bool(*unlockf)(G*, void*), void *lock, const char *reason)
 {
+	if(g->status != Grunning)
+		runtime_throw("bad g status");
 	m->waitlock = lock;
 	m->waitunlockf = unlockf;
 	g->waitreason = reason;
 	runtime_mcall(park0);
 }
 
+static bool
+parkunlock(G *gp, void *lock)
+{
+	USED(gp);
+	runtime_unlock(lock);
+	return true;
+}
+
+// Puts the current goroutine into a waiting state and unlocks the lock.
+// The goroutine can be made runnable again by calling runtime_ready(gp).
+void
+runtime_parkunlock(Lock *lock, const char *reason)
+{
+	runtime_park(parkunlock, lock, reason);
+}
+
 // runtime_park continuation on g0.
 static void
 park0(G *gp)
 {
+	bool ok;
+
 	gp->status = Gwaiting;
 	gp->m = nil;
 	m->curg = nil;
 	if(m->waitunlockf) {
-		m->waitunlockf(m->waitlock);
+		ok = m->waitunlockf(gp, m->waitlock);
 		m->waitunlockf = nil;
 		m->waitlock = nil;
+		if(!ok) {
+			gp->status = Grunnable;
+			execute(gp);  // Schedule it back, never returns.
+		}
 	}
 	if(m->lockedg) {
 		stoplockedm();
@@ -1797,6 +1834,8 @@ park0(G *gp)
 void
 runtime_gosched(void)
 {
+	if(g->status != Grunning)
+		runtime_throw("bad g status");
 	runtime_mcall(runtime_gosched0);
 }
 
@@ -1821,11 +1860,12 @@ runtime_gosched0(G *gp)
 // Need to mark it as nosplit, because it runs with sp > stackbase (as runtime_lessstack).
 // Since it does not return it does not matter.  But if it is preempted
 // at the split stack check, GC will complain about inconsistent sp.
+void runtime_goexit(void) __attribute__ ((noinline));
 void
 runtime_goexit(void)
 {
-	if(raceenabled)
-		runtime_racegoend();
+	if(g->status != Grunning)
+		runtime_throw("bad g status");
 	runtime_mcall(goexit0);
 }
 
@@ -1837,6 +1877,13 @@ goexit0(G *gp)
 	gp->entry = nil;
 	gp->m = nil;
 	gp->lockedm = nil;
+	gp->paniconfault = 0;
+	gp->defer = nil;  // should be true already but just in case.
+	gp->panic = nil;  // non-nil for Goexit during panic.  points at stack-allocated data.
+	gp->writenbuf = 0;
+	gp->writebuf = nil;
+	gp->waitreason = nil;
+	gp->param = nil;
 	m->curg = nil;
 	m->lockedg = nil;
 	if(m->locked & ~LockExternal) {
@@ -1893,7 +1940,7 @@ doentersyscall()
 				       &g->gcinitial_sp);
 #else
 	{
-		uint32 v;
+		void *v;
 
 		g->gcnext_sp = (byte *) &v;
 	}
@@ -1971,6 +2018,7 @@ runtime_exitsyscall(void)
 	if(gp->isbackground)  // do not consider blocked scavenger for deadlock detection
 		incidlelocked(-1);
 
+	g->waitsince = 0;
 	if(exitsyscallfast()) {
 		// There's a cpu for us, so we can run.
 		m->p->syscalltick++;
@@ -2084,8 +2132,8 @@ syscall_runtime_BeforeFork(void)
 {
 	// Fork can hang if preempted with signals frequently enough (see issue 5517).
 	// Ensure that we stay on the same M where we disable profiling.
-	m->locks++;
-	if(m->profilehz != 0)
+	runtime_m()->locks++;
+	if(runtime_m()->profilehz != 0)
 		runtime_resetcpuprofiler(0);
 }
 
@@ -2100,7 +2148,7 @@ syscall_runtime_AfterFork(void)
 	hz = runtime_sched.profilehz;
 	if(hz != 0)
 		runtime_resetcpuprofiler(hz);
-	m->locks--;
+	runtime_m()->locks--;
 }
 
 // Allocate a new g, with a stack big enough for stacksize bytes.
@@ -2109,7 +2157,7 @@ runtime_malg(int32 stacksize, byte** ret_stack, size_t* ret_stacksize)
 {
 	G *newg;
 
-	newg = runtime_malloc(sizeof(G));
+	newg = allocg();
 	if(stacksize >= 0) {
 #if USING_SPLIT_STACK
 		int dont_block_signals = 0;
@@ -2163,11 +2211,17 @@ __go_go(void (*fn)(void*), void* arg)
 	byte *sp;
 	size_t spsize;
 	G *newg;
+	P *p;
 
 //runtime_printf("newproc1 %p %p narg=%d nret=%d\n", fn->fn, argp, narg, nret);
+	if(fn == nil) {
+		m->throwing = -1;  // do not dump full stacks
+		runtime_throw("go of nil func value");
+	}
 	m->locks++;  // disable preemption because it can be holding p in a local var
 
-	if((newg = gfget(m->p)) != nil) {
+	p = m->p;
+	if((newg = gfget(p)) != nil) {
 #ifdef USING_SPLIT_STACK
 		int dont_block_signals = 0;
 
@@ -2184,20 +2238,18 @@ __go_go(void (*fn)(void*), void* arg)
 #endif
 	} else {
 		newg = runtime_malg(StackMin, &sp, &spsize);
-		runtime_lock(&runtime_sched);
-		if(runtime_lastg == nil)
-			runtime_allg = newg;
-		else
-			runtime_lastg->alllink = newg;
-		runtime_lastg = newg;
-		runtime_unlock(&runtime_sched);
+		allgadd(newg);
 	}
 
 	newg->entry = (byte*)fn;
 	newg->param = arg;
 	newg->gopc = (uintptr)__builtin_return_address(0);
 	newg->status = Grunnable;
-	newg->goid = runtime_xadd64(&runtime_sched.goidgen, 1);
+	if(p->goidcache == p->goidcacheend) {
+		p->goidcache = runtime_xadd64(&runtime_sched.goidgen, GoidCacheBatch);
+		p->goidcacheend = p->goidcache + GoidCacheBatch;
+	}
+	newg->goid = p->goidcache++;
 
 	{
 		// Avoid warnings about variables clobbered by
@@ -2214,7 +2266,7 @@ __go_go(void (*fn)(void*), void* arg)
 		vnewg->context.uc_stack.ss_size = vspsize;
 		makecontext(&vnewg->context, kickoff, 0);
 
-		runqput(m->p, vnewg);
+		runqput(p, vnewg);
 
 		if(runtime_atomicload(&runtime_sched.npidle) != 0 && runtime_atomicload(&runtime_sched.nmspinning) == 0 && fn != runtime_main)  // TODO: fast atomic
 			wakep();
@@ -2223,6 +2275,31 @@ __go_go(void (*fn)(void*), void* arg)
 	}
 }
 
+static void
+allgadd(G *gp)
+{
+	G **new;
+	uintptr cap;
+
+	runtime_lock(&allglock);
+	if(runtime_allglen >= allgcap) {
+		cap = 4096/sizeof(new[0]);
+		if(cap < 2*allgcap)
+			cap = 2*allgcap;
+		new = runtime_malloc(cap*sizeof(new[0]));
+		if(new == nil)
+			runtime_throw("runtime: cannot allocate memory");
+		if(runtime_allg != nil) {
+			runtime_memmove(new, runtime_allg, runtime_allglen*sizeof(new[0]));
+			runtime_free(runtime_allg);
+		}
+		runtime_allg = new;
+		allgcap = cap;
+	}
+	runtime_allg[runtime_allglen++] = gp;
+	runtime_unlock(&allglock);
+}
+
 // Put on gfree list.
 // If local list is too long, transfer a batch to the global list.
 static void
@@ -2393,44 +2470,26 @@ runtime_lockedOSThread(void)
 	return g->lockedm != nil && m->lockedg != nil;
 }
 
-// for testing of callbacks
-
-_Bool runtime_golockedOSThread(void)
-	__asm__ (GOSYM_PREFIX "runtime.golockedOSThread");
-
-_Bool
-runtime_golockedOSThread(void)
-{
-	return runtime_lockedOSThread();
-}
-
-intgo runtime_NumGoroutine (void)
-	__asm__ (GOSYM_PREFIX "runtime.NumGoroutine");
-
-intgo
-runtime_NumGoroutine()
-{
-	return runtime_gcount();
-}
-
 int32
 runtime_gcount(void)
 {
 	G *gp;
 	int32 n, s;
+	uintptr i;
 
 	n = 0;
-	runtime_lock(&runtime_sched);
+	runtime_lock(&allglock);
 	// TODO(dvyukov): runtime.NumGoroutine() is O(N).
 	// We do not want to increment/decrement centralized counter in newproc/goexit,
 	// just to make runtime.NumGoroutine() faster.
 	// Compromise solution is to introduce per-P counters of active goroutines.
-	for(gp = runtime_allg; gp; gp = gp->alllink) {
+	for(i = 0; i < runtime_allglen; i++) {
+		gp = runtime_allg[i];
 		s = gp->status;
 		if(s == Grunnable || s == Grunning || s == Gsyscall || s == Gwaiting)
 			n++;
 	}
-	runtime_unlock(&runtime_sched);
+	runtime_unlock(&allglock);
 	return n;
 }
 
@@ -2444,32 +2503,39 @@ static struct {
 	Lock;
 	void (*fn)(uintptr*, int32);
 	int32 hz;
-	uintptr pcbuf[100];
-	Location locbuf[100];
+	uintptr pcbuf[TracebackMaxFrames];
+	Location locbuf[TracebackMaxFrames];
 } prof;
 
-static void
-System(void)
-{
-}
+static void System(void) {}
+static void GC(void) {}
 
 // Called if we receive a SIGPROF signal.
 void
 runtime_sigprof()
 {
+	M *mp = m;
 	int32 n, i;
 	bool traceback;
 
 	if(prof.fn == nil || prof.hz == 0)
 		return;
+
+	if(mp == nil)
+		return;
+
+	// Profiling runs concurrently with GC, so it must not allocate.
+	mp->mallocing++;
+
 	traceback = true;
-	// Windows does profiling in a dedicated thread w/o m.
-	if(!Windows && (m == nil || m->mcache == nil))
+
+	if(mp->mcache == nil)
 		traceback = false;
-	
+
 	runtime_lock(&prof);
 	if(prof.fn == nil) {
 		runtime_unlock(&prof);
+		mp->mallocing--;
 		return;
 	}
 	n = 0;
@@ -2483,17 +2549,21 @@ runtime_sigprof()
 	}
 
 	if(traceback) {
-		n = runtime_callers(0, prof.locbuf, nelem(prof.locbuf));
+		n = runtime_callers(0, prof.locbuf, nelem(prof.locbuf), false);
 		for(i = 0; i < n; i++)
 			prof.pcbuf[i] = prof.locbuf[i].pc;
 	}
-	if (!traceback || n <= 0) {
+	if(!traceback || n <= 0) {
 		n = 2;
 		prof.pcbuf[0] = (uintptr)runtime_getcallerpc(&n);
-		prof.pcbuf[1] = (uintptr)System + 1;
+		if(mp->gcing || mp->helpgc)
+			prof.pcbuf[1] = (uintptr)GC;
+		else
+			prof.pcbuf[1] = (uintptr)System;
 	}
 	prof.fn(prof.pcbuf, n);
 	runtime_unlock(&prof);
+	mp->mallocing--;
}
 
 // Arrange to call fn with a traceback hz times a second.
@@ -2536,6 +2606,7 @@ static void
 procresize(int32 new)
 {
 	int32 i, old;
+	bool empty;
 	G *gp;
 	P *p;
 
@@ -2557,27 +2628,42 @@ procresize(int32 new)
 			else
 				p->mcache = runtime_allocmcache();
 		}
-		if(p->runq == nil) {
-			p->runqsize = 128;
-			p->runq = (G**)runtime_mallocgc(p->runqsize*sizeof(G*), 0, FlagNoInvokeGC);
-		}
 	}
 
 	// redistribute runnable G's evenly
-	for(i = 0; i < old; i++) {
-		p = runtime_allp[i];
-		while((gp = runqget(p)) != nil)
-			globrunqput(gp);
+	// collect all runnable goroutines in global queue preserving FIFO order
+	// FIFO order is required to ensure fairness even during frequent GCs
+	// see http://golang.org/issue/7126
+	empty = false;
+	while(!empty) {
+		empty = true;
+		for(i = 0; i < old; i++) {
+			p = runtime_allp[i];
+			if(p->runqhead == p->runqtail)
+				continue;
+			empty = false;
+			// pop from tail of local queue
+			p->runqtail--;
+			gp = p->runq[p->runqtail%nelem(p->runq)];
+			// push onto head of global queue
+			gp->schedlink = runtime_sched.runqhead;
+			runtime_sched.runqhead = gp;
+			if(runtime_sched.runqtail == nil)
+				runtime_sched.runqtail = gp;
+			runtime_sched.runqsize++;
+		}
 	}
+
+	// fill local queues with at most nelem(p->runq)/2 goroutines
 	// start at 1 because current M already executes some G and will acquire allp[0] below,
 	// so if we have a spare G we want to put it into allp[1].
-	for(i = 1; runtime_sched.runqhead; i++) {
+	for(i = 1; (uint32)i < (uint32)new * nelem(p->runq)/2 && runtime_sched.runqsize > 0; i++) {
 		gp = runtime_sched.runqhead;
 		runtime_sched.runqhead = gp->schedlink;
+		if(runtime_sched.runqhead == nil)
+			runtime_sched.runqtail = nil;
+		runtime_sched.runqsize--;
 		runqput(runtime_allp[i%new], gp);
 	}
-	runtime_sched.runqtail = nil;
-	runtime_sched.runqsize = 0;
 
 	// free unused P's
 	for(i = new; i < old; i++) {
@@ -2659,30 +2745,41 @@ checkdead(void)
 {
 	G *gp;
 	int32 run, grunning, s;
+	uintptr i;
 
 	// -1 for sysmon
 	run = runtime_sched.mcount - runtime_sched.nmidle - runtime_sched.nmidlelocked - 1 - countextra();
 	if(run > 0)
 		return;
+	// If we are dying because of a signal caught on an already idle thread,
+	// freezetheworld will cause all running threads to block.
+	// And runtime will essentially enter into deadlock state,
+	// except that there is a thread that will call runtime_exit soon.
+	if(runtime_panicking > 0)
+		return;
 	if(run < 0) {
-		runtime_printf("checkdead: nmidle=%d nmidlelocked=%d mcount=%d\n",
+		runtime_printf("runtime: checkdead: nmidle=%d nmidlelocked=%d mcount=%d\n",
 			runtime_sched.nmidle, runtime_sched.nmidlelocked, runtime_sched.mcount);
 		runtime_throw("checkdead: inconsistent counts");
 	}
 	grunning = 0;
-	for(gp = runtime_allg; gp; gp = gp->alllink) {
+	runtime_lock(&allglock);
+	for(i = 0; i < runtime_allglen; i++) {
+		gp = runtime_allg[i];
 		if(gp->isbackground)
 			continue;
 		s = gp->status;
 		if(s == Gwaiting)
 			grunning++;
 		else if(s == Grunnable || s == Grunning || s == Gsyscall) {
-			runtime_printf("checkdead: find g %D in status %d\n", gp->goid, s);
+			runtime_unlock(&allglock);
+			runtime_printf("runtime: checkdead: find g %D in status %d\n", gp->goid, s);
 			runtime_throw("checkdead: runnable g");
 		}
 	}
+	runtime_unlock(&allglock);
 	if(grunning == 0)  // possible if main goroutine calls runtime_Goexit()
-		runtime_exit(0);
+		runtime_throw("no goroutines (main called runtime.Goexit) - deadlock!");
 	m->throwing = -1;  // do not dump full stacks
 	runtime_throw("all goroutines are asleep - deadlock!");
 }
@@ -2777,16 +2874,19 @@ retake(int64 now)
 		pd = &pdesc[i];
 		s = p->status;
 		if(s == Psyscall) {
-			// Retake P from syscall if it's there for more than 1 sysmon tick (20us).
-			// But only if there is other work to do.
+			// Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us).
 			t = p->syscalltick;
 			if(pd->syscalltick != t) {
 				pd->syscalltick = t;
 				pd->syscallwhen = now;
 				continue;
 			}
+			// On the one hand we don't want to retake Ps if there is no other work to do,
+			// but on the other hand we want to retake them eventually
+			// because they can prevent the sysmon thread from deep sleep.
 			if(p->runqhead == p->runqtail &&
-				runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) > 0)
+				runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) > 0 &&
+				pd->syscallwhen + 10*1000*1000 > now)
 				continue;
 			// Need to decrement number of idle locked M's
 			// (pretending that one more is running) before the CAS.
@@ -2831,7 +2931,8 @@ runtime_schedtrace(bool detailed)
 	static int64 starttime;
 	int64 now;
 	int64 id1, id2, id3;
-	int32 i, q, t, h, s;
+	int32 i, t, h;
+	uintptr gi;
 	const char *fmt;
 	M *mp, *lockedm;
 	G *gp, *lockedg;
@@ -2858,15 +2959,11 @@ runtime_schedtrace(bool detailed)
 		if(p == nil)
 			continue;
 		mp = p->m;
-		t = p->runqtail;
-		h = p->runqhead;
-		s = p->runqsize;
-		q = t - h;
-		if(q < 0)
-			q += s;
+		h = runtime_atomicload(&p->runqhead);
+		t = runtime_atomicload(&p->runqtail);
 		if(detailed)
-			runtime_printf("  P%d: status=%d schedtick=%d syscalltick=%d m=%d runqsize=%d/%d gfreecnt=%d\n",
-				i, p->status, p->schedtick, p->syscalltick, mp ? mp->id : -1, q, s, p->gfreecnt);
+			runtime_printf("  P%d: status=%d schedtick=%d syscalltick=%d m=%d runqsize=%d gfreecnt=%d\n",
+				i, p->status, p->schedtick, p->syscalltick, mp ? mp->id : -1, t-h, p->gfreecnt);
 		else {
 			// In non-detailed mode format lengths of per-P run queues as:
 			// [len1 len2 len3 len4]
@@ -2877,7 +2974,7 @@ runtime_schedtrace(bool detailed)
 				fmt = " [%d";
 			else if(i == runtime_gomaxprocs-1)
 				fmt = " %d]\n";
-			runtime_printf(fmt, q);
+			runtime_printf(fmt, t-h);
 		}
 	}
 	if(!detailed) {
@@ -2898,18 +2995,21 @@ runtime_schedtrace(bool detailed)
 		if(lockedg)
 			id3 = lockedg->goid;
 		runtime_printf("  M%d: p=%D curg=%D mallocing=%d throwing=%d gcing=%d"
-			" locks=%d dying=%d helpgc=%d spinning=%d lockedg=%D\n",
+			" locks=%d dying=%d helpgc=%d spinning=%d blocked=%d lockedg=%D\n",
 			mp->id, id1, id2,
 			mp->mallocing, mp->throwing, mp->gcing, mp->locks, mp->dying, mp->helpgc,
-			mp->spinning, id3);
+			mp->spinning, m->blocked, id3);
 	}
-	for(gp = runtime_allg; gp; gp = gp->alllink) {
+	runtime_lock(&allglock);
+	for(gi = 0; gi < runtime_allglen; gi++) {
+		gp = runtime_allg[gi];
 		mp = gp->m;
 		lockedm = gp->lockedm;
 		runtime_printf("  G%D: status=%d(%s) m=%d lockedm=%d\n",
 			gp->goid, gp->status, gp->waitreason, mp ? mp->id : -1,
 			lockedm ? lockedm->id : -1);
 	}
+	runtime_unlock(&allglock);
 	runtime_unlock(&runtime_sched);
 }
 
@@ -2952,6 +3052,20 @@ globrunqput(G *gp)
 	runtime_sched.runqsize++;
 }
 
+// Put a batch of runnable goroutines on the global runnable queue.
+// Sched must be locked.
+static void
+globrunqputbatch(G *ghead, G *gtail, int32 n)
+{
+	gtail->schedlink = nil;
+	if(runtime_sched.runqtail)
+		runtime_sched.runqtail->schedlink = ghead;
+	else
+		runtime_sched.runqhead = ghead;
+	runtime_sched.runqtail = gtail;
+	runtime_sched.runqsize += n;
+}
+
 // Try get a batch of G's from the global runnable queue.
 // Sched must be locked.
 static G*
@@ -2967,6 +3081,8 @@ globrunqget(P *p, int32 max)
 	n = runtime_sched.runqsize;
 	if(max > 0 && n > max)
 		n = max;
+	if((uint32)n > nelem(p->runq)/2)
+		n = nelem(p->runq)/2;
 	runtime_sched.runqsize -= n;
 	if(runtime_sched.runqsize == 0)
 		runtime_sched.runqtail = nil;
@@ -3006,78 +3122,98 @@ pidleget(void)
 	return p;
 }
 
-// Put g on local runnable queue.
-// TODO(dvyukov): consider using lock-free queue.
+// Try to put g on local runnable queue.
+// If it's full, put onto global queue.
+// Executed only by the owner P.
 static void
 runqput(P *p, G *gp)
 {
-	int32 h, t, s;
+	uint32 h, t;
 
-	runtime_lock(p);
 retry:
-	h = p->runqhead;
+	h = runtime_atomicload(&p->runqhead);  // load-acquire, synchronize with consumers
 	t = p->runqtail;
-	s = p->runqsize;
-	if(t == h-1 || (h == 0 && t == s-1)) {
-		runqgrow(p);
-		goto retry;
+	if(t - h < nelem(p->runq)) {
+		p->runq[t%nelem(p->runq)] = gp;
+		runtime_atomicstore(&p->runqtail, t+1);  // store-release, makes the item available for consumption
+		return;
 	}
-	p->runq[t++] = gp;
-	if(t == s)
-		t = 0;
-	p->runqtail = t;
-	runtime_unlock(p);
+	if(runqputslow(p, gp, h, t))
+		return;
+	// the queue is not full, now the put above must succeed
+	goto retry;
+}
+
+// Put g and a batch of work from local runnable queue on global queue.
+// Executed only by the owner P.
+static bool
+runqputslow(P *p, G *gp, uint32 h, uint32 t)
+{
+	G *batch[nelem(p->runq)/2+1];
+	uint32 n, i;
+
+	// First, grab a batch from local queue.
+	n = t-h;
+	n = n/2;
+	if(n != nelem(p->runq)/2)
+		runtime_throw("runqputslow: queue is not full");
+	for(i=0; i<n; i++)
+		batch[i] = p->runq[(h+i)%nelem(p->runq)];
+	if(!runtime_cas(&p->runqhead, h, h+n))  // cas-release, commits consume
+		return false;
+	batch[n] = gp;
+	// Link the goroutines.
+	for(i=0; i<n; i++)
+		batch[i]->schedlink = batch[i+1];
+	// Now put the batch on global queue.
+	runtime_lock(&runtime_sched);
+	globrunqputbatch(batch[0], batch[n], n+1);
+	runtime_unlock(&runtime_sched);
+	return true;
 }
 
 // Get g from local runnable queue.
+// Executed only by the owner P.
 static G*
 runqget(P *p)
 {
 	G *gp;
-	int32 t, h, s;
+	uint32 t, h;
 
-	if(p->runqhead == p->runqtail)
-		return nil;
-	runtime_lock(p);
-	h = p->runqhead;
-	t = p->runqtail;
-	s = p->runqsize;
-	if(t == h) {
-		runtime_unlock(p);
-		return nil;
+	for(;;) {
+		h = runtime_atomicload(&p->runqhead);  // load-acquire, synchronize with other consumers
+		t = p->runqtail;
+		if(t == h)
+			return nil;
+		gp = p->runq[h%nelem(p->runq)];
+		if(runtime_cas(&p->runqhead, h, h+1))  // cas-release, commits consume
+			return gp;
 	}
-	gp = p->runq[h++];
-	if(h == s)
-		h = 0;
-	p->runqhead = h;
-	runtime_unlock(p);
-	return gp;
 }
 
-// Grow local runnable queue.
-// TODO(dvyukov): consider using fixed-size array
-// and transfer excess to the global list (local queue can grow way too big).
-static void
-runqgrow(P *p)
+// Grabs a batch of goroutines from local runnable queue.
+// batch array must be of size nelem(p->runq)/2.  Returns number of grabbed goroutines.
+// Can be executed by any P.
+static uint32
+runqgrab(P *p, G **batch)
 {
-	G **q;
-	int32 s, t, h, t2;
+	uint32 t, h, n, i;
 
-	h = p->runqhead;
-	t = p->runqtail;
-	s = p->runqsize;
-	t2 = 0;
-	q = runtime_malloc(2*s*sizeof(*q));
-	while(t != h) {
-		q[t2++] = p->runq[h++];
-		if(h == s)
-			h = 0;
+	for(;;) {
+		h = runtime_atomicload(&p->runqhead);  // load-acquire, synchronize with other consumers
+		t = runtime_atomicload(&p->runqtail);  // load-acquire, synchronize with the producer
+		n = t-h;
+		n = n - n/2;
+		if(n == 0)
+			break;
+		if(n > nelem(p->runq)/2)  // read inconsistent h and t
+			continue;
+		for(i=0; i<n; i++)
+			batch[i] = p->runq[(h+i)%nelem(p->runq)];
+		if(runtime_cas(&p->runqhead, h, h+n))  // cas-release, commits consume
+			break;
 	}
-	runtime_free(p->runq);
-	p->runq = q;
-	p->runqhead = 0;
-	p->runqtail = t2;
-	p->runqsize = 2*s;
+	return n;
 }
 
 // Steal half of elements from local runnable queue of p2
@@ -3086,57 +3222,24 @@ runqgrow(P *p)
 static G*
 runqsteal(P *p, P *p2)
 {
-	G *gp, *gp1;
-	int32 t, h, s, t2, h2, s2, c, i;
+	G *gp;
+	G *batch[nelem(p->runq)/2];
+	uint32 t, h, n, i;
 
-	if(p2->runqhead == p2->runqtail)
-		return nil;
-	// sort locks to prevent deadlocks
-	if(p < p2)
-		runtime_lock(p);
-	runtime_lock(p2);
-	if(p2->runqhead == p2->runqtail) {
-		runtime_unlock(p2);
-		if(p < p2)
-			runtime_unlock(p);
+	n = runqgrab(p2, batch);
+	if(n == 0)
 		return nil;
-	}
-	if(p >= p2)
-		runtime_lock(p);
-	// now we've locked both queues and know the victim is not empty
-	h = p->runqhead;
+	n--;
+	gp = batch[n];
+	if(n == 0)
+		return gp;
+	h = runtime_atomicload(&p->runqhead);  // load-acquire, synchronize with consumers
 	t = p->runqtail;
-	s = p->runqsize;
-	h2 = p2->runqhead;
-	t2 = p2->runqtail;
-	s2 = p2->runqsize;
-	gp = p2->runq[h2++];  // return value
-	if(h2 == s2)
-		h2 = 0;
-	// steal roughly half
-	if(t2 > h2)
-		c = (t2 - h2) / 2;
-	else
-		c = (s2 - h2 + t2) / 2;
-	// copy
-	for(i = 0; i != c; i++) {
-		// the target queue is full?
-		if(t == h-1 || (h == 0 && t == s-1))
-			break;
-		// the victim queue is empty?
-		if(t2 == h2)
-			break;
-		gp1 = p2->runq[h2++];
-		if(h2 == s2)
-			h2 = 0;
-		p->runq[t++] = gp1;
-		if(t == s)
-			t = 0;
-	}
-	p->runqtail = t;
-	p2->runqhead = h2;
-	runtime_unlock(p2);
-	runtime_unlock(p);
+	if(t - h + n >= nelem(p->runq))
+		runtime_throw("runqsteal: runq overflow");
+	for(i=0; i<n; i++, t++)
+		p->runq[t%nelem(p->runq)] = batch[i];
+	runtime_atomicstore(&p->runqtail, t);  // store-release, makes the item available for consumption
 	return gp;
 }
 
@@ -3147,14 +3250,10 @@ void
 runtime_testSchedLocalQueue(void)
 {
 	P p;
-	G gs[1000];
+	G gs[nelem(p.runq)];
 	int32 i, j;
 
 	runtime_memclr((byte*)&p, sizeof(p));
-	p.runqsize = 1;
-	p.runqhead = 0;
-	p.runqtail = 0;
-	p.runq = runtime_malloc(p.runqsize*sizeof(*p.runq));
 
 	for(i = 0; i < (int32)nelem(gs); i++) {
 		if(runqget(&p) != nil)
@@ -3179,20 +3278,11 @@ void
 runtime_testSchedLocalQueueSteal(void)
 {
 	P p1, p2;
-	G gs[1000], *gp;
+	G gs[nelem(p1.runq)], *gp;
 	int32 i, j, s;
 
 	runtime_memclr((byte*)&p1, sizeof(p1));
-	p1.runqsize = 1;
-	p1.runqhead = 0;
-	p1.runqtail = 0;
-	p1.runq = runtime_malloc(p1.runqsize*sizeof(*p1.runq));
-
 	runtime_memclr((byte*)&p2, sizeof(p2));
-	p2.runqsize = nelem(gs);
-	p2.runqhead = 0;
-	p2.runqtail = 0;
-	p2.runq = runtime_malloc(p2.runqsize*sizeof(*p2.runq));
 
 	for(i = 0; i < (int32)nelem(gs); i++) {
 		for(j = 0; j < i; j++) {
@@ -3225,13 +3315,10 @@ runtime_testSchedLocalQueueSteal(void)
 	}
 }
 
-intgo runtime_debug_setMaxThreads(intgo)
-	__asm__(GOSYM_PREFIX "runtime_debug.setMaxThreads");
-
-intgo
-runtime_debug_setMaxThreads(intgo in)
+int32
+runtime_setmaxthreads(int32 in)
 {
-	intgo out;
+	int32 out;
 
 	runtime_lock(&runtime_sched);
 	out = runtime_sched.maxmcount;
@@ -3242,29 +3329,9 @@ runtime_setmaxthreads(int32 in)
 }
 
 void
-runtime_proc_scan(void (*addroot)(Obj))
-{
-	addroot((Obj){(byte*)&runtime_sched, sizeof runtime_sched, 0});
-}
-
-// When a function calls a closure, it passes the closure value to
-// __go_set_closure immediately before the function call.  When a
-// function uses a closure, it calls __go_get_closure immediately on
-// function entry.  This is a hack, but it will work on any system.
-// It would be better to use the static chain register when there is
-// one.  It is also worth considering expanding these functions
-// directly in the compiler.
-
-void
-__go_set_closure(void* v)
-{
-	g->closure = v;
-}
-
-void *
-__go_get_closure(void)
+runtime_proc_scan(struct Workbuf** wbufp, void (*enqueue1)(struct Workbuf**, Obj))
 {
-	return g->closure;
+	enqueue1(wbufp, (Obj){(byte*)&runtime_sched, sizeof runtime_sched, 0});
 }
 
 // Return whether we are waiting for a GC.  This gc toolchain uses
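
The heart of this change is the switch from a growable, lock-protected per-P run queue to a fixed-size ring indexed by free-running head/tail counters: the owner P is the only writer of runqtail, any P may consume by CAS-advancing runqhead, and overflow spills half the queue onto the locked global queue (runqputslow). The following is a minimal, self-contained sketch of that scheme, for reference only; it uses GCC's __atomic builtins rather than the runtime's runtime_atomicload/runtime_atomicstore/runtime_cas helpers, and RunQ, runq_put, runq_get, and QLEN are hypothetical names, not identifiers from this patch.

#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>

/* QLEN must be a power of two so that the % indexing stays consistent
 * when the free-running 32-bit counters wrap around.  */
#define QLEN 256u

typedef struct G G;	/* stand-in for the runtime's G */

typedef struct {
	uint32_t head;	/* next slot to consume; advanced by CAS from any thread */
	uint32_t tail;	/* next slot to fill; written only by the owner */
	G *q[QLEN];
} RunQ;

/* Owner-only put.  Returns false when the ring is full; the caller is
 * then expected to move half the queue to a global, locked queue and
 * retry, which is the role runqputslow plays in the patch.  */
static bool
runq_put(RunQ *rq, G *gp)
{
	uint32_t h, t;

	h = __atomic_load_n(&rq->head, __ATOMIC_ACQUIRE);	/* synchronize with consumers */
	t = rq->tail;						/* owner is the only writer */
	if(t - h < QLEN) {					/* length is valid across wraparound */
		rq->q[t % QLEN] = gp;
		__atomic_store_n(&rq->tail, t + 1, __ATOMIC_RELEASE);	/* publish the slot */
		return true;
	}
	return false;
}

/* Get from any thread: the CAS on head commits the consume; a loser
 * of the race simply reloads and retries.  */
static G*
runq_get(RunQ *rq)
{
	uint32_t h, t;
	G *gp;

	for(;;) {
		h = __atomic_load_n(&rq->head, __ATOMIC_ACQUIRE);
		t = __atomic_load_n(&rq->tail, __ATOMIC_ACQUIRE);
		if(t == h)
			return NULL;	/* empty */
		gp = rq->q[h % QLEN];
		if(__atomic_compare_exchange_n(&rq->head, &h, h + 1, false,
					       __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE))
			return gp;
	}
}

runqgrab extends the same commit-by-CAS idea from one element to a batch: it snapshots head and tail, copies up to half the ring into a private array, and only then tries to CAS head forward by the batch size, retrying from scratch if a concurrent consumer won the race. That is what lets runqsteal move half of a victim's queue without taking any lock.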