Diffstat (limited to 'libgo/runtime/proc.c')
-rw-r--r--   libgo/runtime/proc.c   |   707
1 file changed, 387 insertions(+), 320 deletions(-)
diff --git a/libgo/runtime/proc.c b/libgo/runtime/proc.c
index 30516ad7d7..da0f2ed3a7 100644
--- a/libgo/runtime/proc.c
+++ b/libgo/runtime/proc.c
@@ -18,7 +18,6 @@
#include "arch.h"
#include "defs.h"
#include "malloc.h"
-#include "race.h"
#include "go-type.h"
#include "go-defer.h"
@@ -51,7 +50,7 @@ extern void __splitstack_block_signals_context (void *context[10], int *,
#if defined(USING_SPLIT_STACK) && defined(LINKER_SUPPORTS_SPLIT_STACK)
# define StackMin PTHREAD_STACK_MIN
#else
-# define StackMin 2 * 1024 * 1024
+# define StackMin ((sizeof(char *) < 8) ? 2 * 1024 * 1024 : 4 * 1024 * 1024)
#endif
uintptr runtime_stacks_sys;
@@ -127,6 +126,30 @@ fixcontext(ucontext_t* c)
c->uc_mcontext._mc_tlsbase = tlsbase;
}
+# elif defined(__sparc__)
+
+static inline void
+initcontext(void)
+{
+}
+
+static inline void
+fixcontext(ucontext_t *c)
+{
+ /* ??? Using
+ register unsigned long thread __asm__("%g7");
+ c->uc_mcontext.gregs[REG_G7] = thread;
+ results in
+ error: variable ‘thread’ might be clobbered by \
+ ‘longjmp’ or ‘vfork’ [-Werror=clobbered]
+ which ought to be false, as %g7 is a fixed register. */
+
+ if (sizeof (c->uc_mcontext.gregs[REG_G7]) == 8)
+ asm ("stx %%g7, %0" : "=m"(c->uc_mcontext.gregs[REG_G7]));
+ else
+ asm ("st %%g7, %0" : "=m"(c->uc_mcontext.gregs[REG_G7]));
+}
+
# else
# error unknown case for SETCONTEXT_CLOBBERS_TLS
@@ -167,15 +190,11 @@ runtime_setmg(M* mp, G* gp)
g = gp;
}
-// The static TLS size. See runtime_newm.
-static int tlssize;
-
// Start a new thread.
static void
runtime_newosproc(M *mp)
{
pthread_attr_t attr;
- size_t stacksize;
sigset_t clear, old;
pthread_t tid;
int ret;
@@ -185,19 +204,6 @@ runtime_newosproc(M *mp)
if(pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED) != 0)
runtime_throw("pthread_attr_setdetachstate");
- stacksize = PTHREAD_STACK_MIN;
-
- // With glibc before version 2.16 the static TLS size is taken
- // out of the stack size, and we get an error or a crash if
- // there is not enough stack space left. Add it back in if we
- // can, in case the program uses a lot of TLS space. FIXME:
- // This can be disabled in glibc 2.16 and later, if the bug is
- // indeed fixed then.
- stacksize += tlssize;
-
- if(pthread_attr_setstacksize(&attr, stacksize) != 0)
- runtime_throw("pthread_attr_setstacksize");
-
// Block signals during pthread_create so that the new thread
// starts with signals disabled. It will enable them in minit.
sigfillset(&clear);
@@ -255,9 +261,6 @@ runtime_mcall(void (*pfn)(G*))
{
M *mp;
G *gp;
-#ifndef USING_SPLIT_STACK
- int i;
-#endif
// Ensure that all registers are on the stack for the garbage
// collector.
@@ -273,7 +276,7 @@ runtime_mcall(void (*pfn)(G*))
#ifdef USING_SPLIT_STACK
__splitstack_getcontext(&g->stack_context[0]);
#else
- gp->gcnext_sp = &i;
+ gp->gcnext_sp = &pfn;
#endif
gp->fromgogo = false;
getcontext(&gp->context);
@@ -309,43 +312,6 @@ runtime_mcall(void (*pfn)(G*))
}
}
-#ifdef HAVE_DL_ITERATE_PHDR
-
-// Called via dl_iterate_phdr.
-
-static int
-addtls(struct dl_phdr_info* info, size_t size __attribute__ ((unused)), void *data)
-{
- size_t *total = (size_t *)data;
- unsigned int i;
-
- for(i = 0; i < info->dlpi_phnum; ++i) {
- if(info->dlpi_phdr[i].p_type == PT_TLS)
- *total += info->dlpi_phdr[i].p_memsz;
- }
- return 0;
-}
-
-// Set the total TLS size.
-
-static void
-inittlssize()
-{
- size_t total = 0;
-
- dl_iterate_phdr(addtls, (void *)&total);
- tlssize = total;
-}
-
-#else
-
-static void
-inittlssize()
-{
-}
-
-#endif
-
// Goroutine scheduler
// The scheduler's job is to distribute ready-to-run goroutines over worker threads.
//
@@ -392,17 +358,23 @@ struct Sched {
int32 profilehz; // cpu profiling rate
};
-// The max value of GOMAXPROCS.
-// There are no fundamental restrictions on the value.
-enum { MaxGomaxprocs = 1<<8 };
+enum
+{
+ // The max value of GOMAXPROCS.
+ // There are no fundamental restrictions on the value.
+ MaxGomaxprocs = 1<<8,
+
+ // Number of goroutine ids to grab from runtime_sched.goidgen to local per-P cache at once.
+ // 16 seems to provide enough amortization, but other than that it's a mostly arbitrary number.
+ GoidCacheBatch = 16,
+};
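(Editorial aside, not part of the patch.) The point of GoidCacheBatch is amortization: each P refills a private range of ids with a single atomic add on runtime_sched.goidgen and then hands ids out locally, so creating 1000 goroutines on one P costs roughly 63 atomic operations instead of 1000. A minimal standalone sketch of the allocation step used later in __go_go, assuming a GCC-style atomic add-and-fetch builtin:

#include <stdint.h>

enum { GoidCacheBatch = 16 };

static uint64_t goidgen;        /* shared generator, one per process */

struct PCache { uint64_t goidcache, goidcacheend; };

static uint64_t
nextgoid(struct PCache *p)
{
	/* Refill the local range: one atomic add covers GoidCacheBatch ids. */
	if(p->goidcache == p->goidcacheend) {
		p->goidcache = __atomic_add_fetch(&goidgen, GoidCacheBatch, __ATOMIC_RELAXED);
		p->goidcacheend = p->goidcache + GoidCacheBatch;
	}
	/* Ids inside the cached range need no shared-counter traffic. */
	return p->goidcache++;
}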
Sched runtime_sched;
int32 runtime_gomaxprocs;
uint32 runtime_needextram = 1;
bool runtime_iscgo = true;
M runtime_m0;
-G runtime_g0; // idle goroutine for m0
-G* runtime_allg;
+G runtime_g0; // idle goroutine for m0
G* runtime_lastg;
M* runtime_allm;
P** runtime_allp;
@@ -412,10 +384,15 @@ int32 runtime_ncpu;
bool runtime_precisestack;
static int32 newprocs;
+static Lock allglock; // the following vars are protected by this lock or by stoptheworld
+G** runtime_allg;
+uintptr runtime_allglen;
+static uintptr allgcap;
+
void* runtime_mstart(void*);
static void runqput(P*, G*);
static G* runqget(P*);
-static void runqgrow(P*);
+static bool runqputslow(P*, G*, uint32, uint32);
static G* runqsteal(P*, P*);
static void mput(M*);
static M* mget(void);
@@ -442,12 +419,14 @@ static void gfput(P*, G*);
static G* gfget(P*);
static void gfpurge(P*);
static void globrunqput(G*);
+static void globrunqputbatch(G*, G*, int32);
static G* globrunqget(P*, int32);
static P* pidleget(void);
static void pidleput(P*);
static void injectglist(G*);
static bool preemptall(void);
static bool exitsyscallfast(void);
+static void allgadd(G*);
// The bootstrap sequence is:
//
@@ -471,12 +450,11 @@ runtime_schedinit(void)
g->m = m;
initcontext();
- inittlssize();
runtime_sched.maxmcount = 10000;
runtime_precisestack = 0;
- runtime_mprofinit();
+ // runtime_symtabinit();
runtime_mallocinit();
mcommoninit(m);
@@ -485,6 +463,10 @@ runtime_schedinit(void)
// in a fault during a garbage collection, it will not
// need to allocated memory.
runtime_newErrorCString(0, &i);
+
+ // Initialize the cached gotraceback value, since
+ // gotraceback calls getenv, which mallocs on Plan 9.
+ runtime_gotraceback(nil);
runtime_goargs();
runtime_goenvs();
@@ -503,9 +485,6 @@ runtime_schedinit(void)
// Can not enable GC until all roots are registered.
// mstats.enablegc = 1;
-
- // if(raceenabled)
- // g->racectx = runtime_raceinit();
}
extern void main_init(void) __asm__ (GOSYM_PREFIX "__go_init_main");
@@ -517,6 +496,15 @@ initDone(void *arg __attribute__ ((unused))) {
};
// The main goroutine.
+// Note: C frames in general are not copyable during stack growth, for two reasons:
+// 1) We don't know where in a frame to find pointers to other stack locations.
+// 2) There's no guarantee that globals or heap values do not point into the frame.
+//
+// The C frame for runtime.main is copyable, because:
+// 1) There are no pointers to other stack locations in the frame
+// (d.fn points at a global, d.link is nil, d.argp is -1).
+// 2) The only pointer into this frame is from the defer chain,
+// which is explicitly handled during stack copying.
void
runtime_main(void* dummy __attribute__((unused)))
{
@@ -541,7 +529,7 @@ runtime_main(void* dummy __attribute__((unused)))
d.__retaddr = nil;
d.__makefunc_can_recover = 0;
d.__frame = &frame;
- d.__free = 0;
+ d.__special = true;
g->defer = &d;
if(m != &runtime_m0)
@@ -560,8 +548,6 @@ runtime_main(void* dummy __attribute__((unused)))
mstats.enablegc = 1;
main_main();
- if(raceenabled)
- runtime_racefini();
// Make racy client program work: if panicking on
// another goroutine at the same time as main returns,
@@ -579,6 +565,7 @@ void
runtime_goroutineheader(G *gp)
{
const char *status;
+ int64 waitfor;
switch(gp->status) {
case Gidle:
@@ -603,7 +590,16 @@ runtime_goroutineheader(G *gp)
status = "???";
break;
}
- runtime_printf("goroutine %D [%s]:\n", gp->goid, status);
+
+ // approx time the G is blocked, in minutes
+ waitfor = 0;
+ if((gp->status == Gwaiting || gp->status == Gsyscall) && gp->waitsince != 0)
+ waitfor = (runtime_nanotime() - gp->waitsince) / (60LL*1000*1000*1000);
+
+ if(waitfor < 1)
+ runtime_printf("goroutine %D [%s]:\n", gp->goid, status);
+ else
+ runtime_printf("goroutine %D [%s, %D minutes]:\n", gp->goid, status, waitfor);
}
void
@@ -624,7 +620,7 @@ runtime_printcreatedby(G *g)
struct Traceback
{
G* gp;
- Location locbuf[100];
+ Location locbuf[TracebackMaxFrames];
int32 c;
};
@@ -634,6 +630,7 @@ runtime_tracebackothers(G * volatile me)
G * volatile gp;
Traceback tb;
int32 traceback;
+ volatile uintptr i;
tb.gp = me;
traceback = runtime_gotraceback(nil);
@@ -657,7 +654,9 @@ runtime_tracebackothers(G * volatile me)
runtime_printcreatedby(gp);
}
- for(gp = runtime_allg; gp != nil; gp = gp->alllink) {
+ runtime_lock(&allglock);
+ for(i = 0; i < runtime_allglen; i++) {
+ gp = runtime_allg[i];
if(gp == me || gp == m->curg || gp->status == Gdead)
continue;
if(gp->issystem && traceback < 2)
@@ -696,6 +695,7 @@ runtime_tracebackothers(G * volatile me)
runtime_printcreatedby(gp);
}
}
+ runtime_unlock(&allglock);
}
static void
@@ -719,7 +719,7 @@ gtraceback(G* gp)
traceback = gp->traceback;
gp->traceback = nil;
traceback->c = runtime_callers(1, traceback->locbuf,
- sizeof traceback->locbuf / sizeof traceback->locbuf[0]);
+ sizeof traceback->locbuf / sizeof traceback->locbuf[0], false);
runtime_gogo(traceback->gp);
}
@@ -729,7 +729,7 @@ mcommoninit(M *mp)
// If there is no mcache runtime_callers() will crash,
// and we are most likely in sysmon thread so the stack is senseless anyway.
if(m->mcache)
- runtime_callers(1, mp->createstack, nelem(mp->createstack));
+ runtime_callers(1, mp->createstack, nelem(mp->createstack), false);
mp->fastrand = 0x49f6428aUL + mp->id + runtime_cputicks();
@@ -1038,6 +1038,7 @@ struct CgoThreadStart
{
M *m;
G *g;
+ uintptr *tls;
void (*fn)(void);
};
@@ -1070,6 +1071,22 @@ runtime_allocm(P *p, int32 stacksize, byte** ret_g0_stack, size_t* ret_g0_stacks
return mp;
}
+static G*
+allocg(void)
+{
+ G *gp;
+ // static Type *gtype;
+
+ // if(gtype == nil) {
+ // Eface e;
+ // runtime_gc_g_ptr(&e);
+ // gtype = ((PtrType*)e.__type_descriptor)->__element_type;
+ // }
+ // gp = runtime_cnew(gtype);
+ gp = runtime_malloc(sizeof(G));
+ return gp;
+}
+
static M* lockextra(bool nilokay);
static void unlockextra(M*);
@@ -1151,6 +1168,7 @@ runtime_needm(void)
__splitstack_getcontext(&g->stack_context[0]);
#else
g->gcinitial_sp = &mp;
+ g->gcstack = nil;
g->gcstack_size = 0;
g->gcnext_sp = &mp;
#endif
@@ -1200,22 +1218,12 @@ runtime_newextram(void)
gp->lockedm = mp;
gp->goid = runtime_xadd64(&runtime_sched.goidgen, 1);
// put on allg for garbage collector
- runtime_lock(&runtime_sched);
- if(runtime_lastg == nil)
- runtime_allg = gp;
- else
- runtime_lastg->alllink = gp;
- runtime_lastg = gp;
- runtime_unlock(&runtime_sched);
- gp->goid = runtime_xadd64(&runtime_sched.goidgen, 1);
+ allgadd(gp);
// The context for gp will be set up in runtime_needm. But
// here we need to set up the context for g0.
getcontext(&mp->g0->context);
mp->g0->context.uc_stack.ss_sp = g0_sp;
-#ifdef MAKECONTEXT_STACK_TOP
- mp->g0->context.uc_stack.ss_sp += g0_spsize;
-#endif
mp->g0->context.uc_stack.ss_size = g0_spsize;
makecontext(&mp->g0->context, kickoff, 0);
@@ -1262,6 +1270,8 @@ runtime_dropm(void)
runtime_setmg(nil, nil);
mp->curg->status = Gdead;
+ mp->curg->gcstack = nil;
+ mp->curg->gcnext_sp = nil;
mnext = lockextra(true);
mp->schedlink = mnext;
@@ -1382,7 +1392,7 @@ mspinning(void)
}
// Schedules some M to run the p (creates an M if necessary).
-// If p==nil, tries to get an idle P, if no idle P's returns false.
+// If p==nil, tries to get an idle P; if there are no idle P's, it does nothing.
static void
startm(P *p, bool spinning)
{
@@ -1546,6 +1556,7 @@ execute(G *gp)
runtime_throw("execute: bad g status");
}
gp->status = Grunning;
+ gp->waitsince = 0;
m->p->schedtick++;
m->curg = gp;
gp->m = m;
@@ -1572,6 +1583,8 @@ top:
gcstopm();
goto top;
}
+ if(runtime_fingwait && runtime_fingwake && (gp = runtime_wakefing()) != nil)
+ runtime_ready(gp);
// local runq
gp = runqget(m->p);
if(gp)
@@ -1763,28 +1776,52 @@ top:
execute(gp);
}
-// Puts the current goroutine into a waiting state and unlocks the lock.
-// The goroutine can be made runnable again by calling runtime_ready(gp).
+// Puts the current goroutine into a waiting state and calls unlockf.
+// If unlockf returns false, the goroutine is resumed.
void
-runtime_park(void(*unlockf)(Lock*), Lock *lock, const char *reason)
+runtime_park(bool(*unlockf)(G*, void*), void *lock, const char *reason)
{
+ if(g->status != Grunning)
+ runtime_throw("bad g status");
m->waitlock = lock;
m->waitunlockf = unlockf;
g->waitreason = reason;
runtime_mcall(park0);
}
+static bool
+parkunlock(G *gp, void *lock)
+{
+ USED(gp);
+ runtime_unlock(lock);
+ return true;
+}
+
+// Puts the current goroutine into a waiting state and unlocks the lock.
+// The goroutine can be made runnable again by calling runtime_ready(gp).
+void
+runtime_parkunlock(Lock *lock, const char *reason)
+{
+ runtime_park(parkunlock, lock, reason);
+}
+
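(Editorial sketch, not part of the patch; Waitq and its fields are hypothetical.) A typical caller of the new API publishes itself under a lock, parks with runtime_parkunlock so the lock is released atomically with blocking, and is later made runnable again by whoever observes the published waiter:

struct Waitq { Lock lock; G *waiter; };

static void
waitexample(struct Waitq *q)
{
	runtime_lock(&q->lock);
	q->waiter = g;	/* g is the current goroutine, as elsewhere in proc.c */
	/* Drops q->lock via the parkunlock callback and blocks this G;
	   it runs again only after someone calls runtime_ready on it. */
	runtime_parkunlock(&q->lock, "example wait");
}

static void
wakeexample(struct Waitq *q)
{
	G *gp;

	runtime_lock(&q->lock);
	gp = q->waiter;
	q->waiter = nil;
	runtime_unlock(&q->lock);
	if(gp != nil)
		runtime_ready(gp);
}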
// runtime_park continuation on g0.
static void
park0(G *gp)
{
+ bool ok;
+
gp->status = Gwaiting;
gp->m = nil;
m->curg = nil;
if(m->waitunlockf) {
- m->waitunlockf(m->waitlock);
+ ok = m->waitunlockf(gp, m->waitlock);
m->waitunlockf = nil;
m->waitlock = nil;
+ if(!ok) {
+ gp->status = Grunnable;
+ execute(gp); // Schedule it back, never returns.
+ }
}
if(m->lockedg) {
stoplockedm();
@@ -1797,6 +1834,8 @@ park0(G *gp)
void
runtime_gosched(void)
{
+ if(g->status != Grunning)
+ runtime_throw("bad g status");
runtime_mcall(runtime_gosched0);
}
@@ -1821,11 +1860,12 @@ runtime_gosched0(G *gp)
// Need to mark it as nosplit, because it runs with sp > stackbase (as runtime_lessstack).
// Since it does not return it does not matter. But if it is preempted
// at the split stack check, GC will complain about inconsistent sp.
+void runtime_goexit(void) __attribute__ ((noinline));
void
runtime_goexit(void)
{
- if(raceenabled)
- runtime_racegoend();
+ if(g->status != Grunning)
+ runtime_throw("bad g status");
runtime_mcall(goexit0);
}
@@ -1837,6 +1877,13 @@ goexit0(G *gp)
gp->entry = nil;
gp->m = nil;
gp->lockedm = nil;
+ gp->paniconfault = 0;
+ gp->defer = nil; // should be nil already but just in case.
+ gp->panic = nil; // non-nil for Goexit during panic. points at stack-allocated data.
+ gp->writenbuf = 0;
+ gp->writebuf = nil;
+ gp->waitreason = nil;
+ gp->param = nil;
m->curg = nil;
m->lockedg = nil;
if(m->locked & ~LockExternal) {
@@ -1893,7 +1940,7 @@ doentersyscall()
&g->gcinitial_sp);
#else
{
- uint32 v;
+ void *v;
g->gcnext_sp = (byte *) &v;
}
@@ -1971,6 +2018,7 @@ runtime_exitsyscall(void)
if(gp->isbackground) // do not consider blocked scavenger for deadlock detection
incidlelocked(-1);
+ g->waitsince = 0;
if(exitsyscallfast()) {
// There's a cpu for us, so we can run.
m->p->syscalltick++;
@@ -2084,8 +2132,8 @@ syscall_runtime_BeforeFork(void)
{
// Fork can hang if preempted with signals frequently enough (see issue 5517).
// Ensure that we stay on the same M where we disable profiling.
- m->locks++;
- if(m->profilehz != 0)
+ runtime_m()->locks++;
+ if(runtime_m()->profilehz != 0)
runtime_resetcpuprofiler(0);
}
@@ -2100,7 +2148,7 @@ syscall_runtime_AfterFork(void)
hz = runtime_sched.profilehz;
if(hz != 0)
runtime_resetcpuprofiler(hz);
- m->locks--;
+ runtime_m()->locks--;
}
// Allocate a new g, with a stack big enough for stacksize bytes.
@@ -2109,7 +2157,7 @@ runtime_malg(int32 stacksize, byte** ret_stack, size_t* ret_stacksize)
{
G *newg;
- newg = runtime_malloc(sizeof(G));
+ newg = allocg();
if(stacksize >= 0) {
#if USING_SPLIT_STACK
int dont_block_signals = 0;
@@ -2163,11 +2211,17 @@ __go_go(void (*fn)(void*), void* arg)
byte *sp;
size_t spsize;
G *newg;
+ P *p;
//runtime_printf("newproc1 %p %p narg=%d nret=%d\n", fn->fn, argp, narg, nret);
+ if(fn == nil) {
+ m->throwing = -1; // do not dump full stacks
+ runtime_throw("go of nil func value");
+ }
m->locks++; // disable preemption because it can be holding p in a local var
- if((newg = gfget(m->p)) != nil) {
+ p = m->p;
+ if((newg = gfget(p)) != nil) {
#ifdef USING_SPLIT_STACK
int dont_block_signals = 0;
@@ -2184,20 +2238,18 @@ __go_go(void (*fn)(void*), void* arg)
#endif
} else {
newg = runtime_malg(StackMin, &sp, &spsize);
- runtime_lock(&runtime_sched);
- if(runtime_lastg == nil)
- runtime_allg = newg;
- else
- runtime_lastg->alllink = newg;
- runtime_lastg = newg;
- runtime_unlock(&runtime_sched);
+ allgadd(newg);
}
newg->entry = (byte*)fn;
newg->param = arg;
newg->gopc = (uintptr)__builtin_return_address(0);
newg->status = Grunnable;
- newg->goid = runtime_xadd64(&runtime_sched.goidgen, 1);
+ if(p->goidcache == p->goidcacheend) {
+ p->goidcache = runtime_xadd64(&runtime_sched.goidgen, GoidCacheBatch);
+ p->goidcacheend = p->goidcache + GoidCacheBatch;
+ }
+ newg->goid = p->goidcache++;
{
// Avoid warnings about variables clobbered by
@@ -2214,7 +2266,7 @@ __go_go(void (*fn)(void*), void* arg)
vnewg->context.uc_stack.ss_size = vspsize;
makecontext(&vnewg->context, kickoff, 0);
- runqput(m->p, vnewg);
+ runqput(p, vnewg);
if(runtime_atomicload(&runtime_sched.npidle) != 0 && runtime_atomicload(&runtime_sched.nmspinning) == 0 && fn != runtime_main) // TODO: fast atomic
wakep();
@@ -2223,6 +2275,31 @@ __go_go(void (*fn)(void*), void* arg)
}
}
+static void
+allgadd(G *gp)
+{
+ G **new;
+ uintptr cap;
+
+ runtime_lock(&allglock);
+ if(runtime_allglen >= allgcap) {
+ cap = 4096/sizeof(new[0]);
+ if(cap < 2*allgcap)
+ cap = 2*allgcap;
+ new = runtime_malloc(cap*sizeof(new[0]));
+ if(new == nil)
+ runtime_throw("runtime: cannot allocate memory");
+ if(runtime_allg != nil) {
+ runtime_memmove(new, runtime_allg, runtime_allglen*sizeof(new[0]));
+ runtime_free(runtime_allg);
+ }
+ runtime_allg = new;
+ allgcap = cap;
+ }
+ runtime_allg[runtime_allglen++] = gp;
+ runtime_unlock(&allglock);
+}
+
// Put on gfree list.
// If local list is too long, transfer a batch to the global list.
static void
@@ -2393,44 +2470,26 @@ runtime_lockedOSThread(void)
return g->lockedm != nil && m->lockedg != nil;
}
-// for testing of callbacks
-
-_Bool runtime_golockedOSThread(void)
- __asm__ (GOSYM_PREFIX "runtime.golockedOSThread");
-
-_Bool
-runtime_golockedOSThread(void)
-{
- return runtime_lockedOSThread();
-}
-
-intgo runtime_NumGoroutine (void)
- __asm__ (GOSYM_PREFIX "runtime.NumGoroutine");
-
-intgo
-runtime_NumGoroutine()
-{
- return runtime_gcount();
-}
-
int32
runtime_gcount(void)
{
G *gp;
int32 n, s;
+ uintptr i;
n = 0;
- runtime_lock(&runtime_sched);
+ runtime_lock(&allglock);
// TODO(dvyukov): runtime.NumGoroutine() is O(N).
// We do not want to increment/decrement centralized counter in newproc/goexit,
// just to make runtime.NumGoroutine() faster.
// Compromise solution is to introduce per-P counters of active goroutines.
- for(gp = runtime_allg; gp; gp = gp->alllink) {
+ for(i = 0; i < runtime_allglen; i++) {
+ gp = runtime_allg[i];
s = gp->status;
if(s == Grunnable || s == Grunning || s == Gsyscall || s == Gwaiting)
n++;
}
- runtime_unlock(&runtime_sched);
+ runtime_unlock(&allglock);
return n;
}
@@ -2444,32 +2503,39 @@ static struct {
Lock;
void (*fn)(uintptr*, int32);
int32 hz;
- uintptr pcbuf[100];
- Location locbuf[100];
+ uintptr pcbuf[TracebackMaxFrames];
+ Location locbuf[TracebackMaxFrames];
} prof;
-static void
-System(void)
-{
-}
+static void System(void) {}
+static void GC(void) {}
// Called if we receive a SIGPROF signal.
void
runtime_sigprof()
{
+ M *mp = m;
int32 n, i;
bool traceback;
if(prof.fn == nil || prof.hz == 0)
return;
+
+ if(mp == nil)
+ return;
+
+ // Profiling runs concurrently with GC, so it must not allocate.
+ mp->mallocing++;
+
traceback = true;
- // Windows does profiling in a dedicated thread w/o m.
- if(!Windows && (m == nil || m->mcache == nil))
+
+ if(mp->mcache == nil)
traceback = false;
-
+
runtime_lock(&prof);
if(prof.fn == nil) {
runtime_unlock(&prof);
+ mp->mallocing--;
return;
}
n = 0;
@@ -2483,17 +2549,21 @@ runtime_sigprof()
}
if(traceback) {
- n = runtime_callers(0, prof.locbuf, nelem(prof.locbuf));
+ n = runtime_callers(0, prof.locbuf, nelem(prof.locbuf), false);
for(i = 0; i < n; i++)
prof.pcbuf[i] = prof.locbuf[i].pc;
}
- if (!traceback || n <= 0) {
+ if(!traceback || n <= 0) {
n = 2;
prof.pcbuf[0] = (uintptr)runtime_getcallerpc(&n);
- prof.pcbuf[1] = (uintptr)System + 1;
+ if(mp->gcing || mp->helpgc)
+ prof.pcbuf[1] = (uintptr)GC;
+ else
+ prof.pcbuf[1] = (uintptr)System;
}
prof.fn(prof.pcbuf, n);
runtime_unlock(&prof);
+ mp->mallocing--;
}
// Arrange to call fn with a traceback hz times a second.
@@ -2536,6 +2606,7 @@ static void
procresize(int32 new)
{
int32 i, old;
+ bool empty;
G *gp;
P *p;
@@ -2557,27 +2628,42 @@ procresize(int32 new)
else
p->mcache = runtime_allocmcache();
}
- if(p->runq == nil) {
- p->runqsize = 128;
- p->runq = (G**)runtime_mallocgc(p->runqsize*sizeof(G*), 0, FlagNoInvokeGC);
- }
}
// redistribute runnable G's evenly
- for(i = 0; i < old; i++) {
- p = runtime_allp[i];
- while((gp = runqget(p)) != nil)
- globrunqput(gp);
+ // collect all runnable goroutines in global queue preserving FIFO order
+ // FIFO order is required to ensure fairness even during frequent GCs
+ // see http://golang.org/issue/7126
+ empty = false;
+ while(!empty) {
+ empty = true;
+ for(i = 0; i < old; i++) {
+ p = runtime_allp[i];
+ if(p->runqhead == p->runqtail)
+ continue;
+ empty = false;
+ // pop from tail of local queue
+ p->runqtail--;
+ gp = p->runq[p->runqtail%nelem(p->runq)];
+ // push onto head of global queue
+ gp->schedlink = runtime_sched.runqhead;
+ runtime_sched.runqhead = gp;
+ if(runtime_sched.runqtail == nil)
+ runtime_sched.runqtail = gp;
+ runtime_sched.runqsize++;
+ }
}
+ // fill local queues with at most nelem(p->runq)/2 goroutines
// start at 1 because current M already executes some G and will acquire allp[0] below,
// so if we have a spare G we want to put it into allp[1].
- for(i = 1; runtime_sched.runqhead; i++) {
+ for(i = 1; (uint32)i < (uint32)new * nelem(p->runq)/2 && runtime_sched.runqsize > 0; i++) {
gp = runtime_sched.runqhead;
runtime_sched.runqhead = gp->schedlink;
+ if(runtime_sched.runqhead == nil)
+ runtime_sched.runqtail = nil;
+ runtime_sched.runqsize--;
runqput(runtime_allp[i%new], gp);
}
- runtime_sched.runqtail = nil;
- runtime_sched.runqsize = 0;
// free unused P's
for(i = new; i < old; i++) {
@@ -2659,30 +2745,41 @@ checkdead(void)
{
G *gp;
int32 run, grunning, s;
+ uintptr i;
// -1 for sysmon
run = runtime_sched.mcount - runtime_sched.nmidle - runtime_sched.nmidlelocked - 1 - countextra();
if(run > 0)
return;
+ // If we are dying because of a signal caught on an already idle thread,
+ // freezetheworld will cause all running threads to block.
+ // And runtime will essentially enter into deadlock state,
+ // except that there is a thread that will call runtime_exit soon.
+ if(runtime_panicking > 0)
+ return;
if(run < 0) {
- runtime_printf("checkdead: nmidle=%d nmidlelocked=%d mcount=%d\n",
+ runtime_printf("runtime: checkdead: nmidle=%d nmidlelocked=%d mcount=%d\n",
runtime_sched.nmidle, runtime_sched.nmidlelocked, runtime_sched.mcount);
runtime_throw("checkdead: inconsistent counts");
}
grunning = 0;
- for(gp = runtime_allg; gp; gp = gp->alllink) {
+ runtime_lock(&allglock);
+ for(i = 0; i < runtime_allglen; i++) {
+ gp = runtime_allg[i];
if(gp->isbackground)
continue;
s = gp->status;
if(s == Gwaiting)
grunning++;
else if(s == Grunnable || s == Grunning || s == Gsyscall) {
- runtime_printf("checkdead: find g %D in status %d\n", gp->goid, s);
+ runtime_unlock(&allglock);
+ runtime_printf("runtime: checkdead: find g %D in status %d\n", gp->goid, s);
runtime_throw("checkdead: runnable g");
}
}
+ runtime_unlock(&allglock);
if(grunning == 0) // possible if main goroutine calls runtime_Goexit()
- runtime_exit(0);
+ runtime_throw("no goroutines (main called runtime.Goexit) - deadlock!");
m->throwing = -1; // do not dump full stacks
runtime_throw("all goroutines are asleep - deadlock!");
}
@@ -2777,16 +2874,19 @@ retake(int64 now)
pd = &pdesc[i];
s = p->status;
if(s == Psyscall) {
- // Retake P from syscall if it's there for more than 1 sysmon tick (20us).
- // But only if there is other work to do.
+ // Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us).
t = p->syscalltick;
if(pd->syscalltick != t) {
pd->syscalltick = t;
pd->syscallwhen = now;
continue;
}
+ // On the one hand we don't want to retake Ps if there is no other work to do,
+ // but on the other hand we want to retake them eventually
+ // because they can prevent the sysmon thread from deep sleep.
if(p->runqhead == p->runqtail &&
- runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) > 0)
+ runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) > 0 &&
+ pd->syscallwhen + 10*1000*1000 > now)
continue;
// Need to decrement number of idle locked M's
// (pretending that one more is running) before the CAS.
@@ -2831,7 +2931,8 @@ runtime_schedtrace(bool detailed)
static int64 starttime;
int64 now;
int64 id1, id2, id3;
- int32 i, q, t, h, s;
+ int32 i, t, h;
+ uintptr gi;
const char *fmt;
M *mp, *lockedm;
G *gp, *lockedg;
@@ -2858,15 +2959,11 @@ runtime_schedtrace(bool detailed)
if(p == nil)
continue;
mp = p->m;
- t = p->runqtail;
- h = p->runqhead;
- s = p->runqsize;
- q = t - h;
- if(q < 0)
- q += s;
+ h = runtime_atomicload(&p->runqhead);
+ t = runtime_atomicload(&p->runqtail);
if(detailed)
- runtime_printf(" P%d: status=%d schedtick=%d syscalltick=%d m=%d runqsize=%d/%d gfreecnt=%d\n",
- i, p->status, p->schedtick, p->syscalltick, mp ? mp->id : -1, q, s, p->gfreecnt);
+ runtime_printf(" P%d: status=%d schedtick=%d syscalltick=%d m=%d runqsize=%d gfreecnt=%d\n",
+ i, p->status, p->schedtick, p->syscalltick, mp ? mp->id : -1, t-h, p->gfreecnt);
else {
// In non-detailed mode format lengths of per-P run queues as:
// [len1 len2 len3 len4]
@@ -2877,7 +2974,7 @@ runtime_schedtrace(bool detailed)
fmt = " [%d";
else if(i == runtime_gomaxprocs-1)
fmt = " %d]\n";
- runtime_printf(fmt, q);
+ runtime_printf(fmt, t-h);
}
}
if(!detailed) {
@@ -2898,18 +2995,21 @@ runtime_schedtrace(bool detailed)
if(lockedg)
id3 = lockedg->goid;
runtime_printf(" M%d: p=%D curg=%D mallocing=%d throwing=%d gcing=%d"
- " locks=%d dying=%d helpgc=%d spinning=%d lockedg=%D\n",
+ " locks=%d dying=%d helpgc=%d spinning=%d blocked=%d lockedg=%D\n",
mp->id, id1, id2,
mp->mallocing, mp->throwing, mp->gcing, mp->locks, mp->dying, mp->helpgc,
- mp->spinning, id3);
+ mp->spinning, m->blocked, id3);
}
- for(gp = runtime_allg; gp; gp = gp->alllink) {
+ runtime_lock(&allglock);
+ for(gi = 0; gi < runtime_allglen; gi++) {
+ gp = runtime_allg[gi];
mp = gp->m;
lockedm = gp->lockedm;
runtime_printf(" G%D: status=%d(%s) m=%d lockedm=%d\n",
gp->goid, gp->status, gp->waitreason, mp ? mp->id : -1,
lockedm ? lockedm->id : -1);
}
+ runtime_unlock(&allglock);
runtime_unlock(&runtime_sched);
}
@@ -2952,6 +3052,20 @@ globrunqput(G *gp)
runtime_sched.runqsize++;
}
+// Put a batch of runnable goroutines on the global runnable queue.
+// Sched must be locked.
+static void
+globrunqputbatch(G *ghead, G *gtail, int32 n)
+{
+ gtail->schedlink = nil;
+ if(runtime_sched.runqtail)
+ runtime_sched.runqtail->schedlink = ghead;
+ else
+ runtime_sched.runqhead = ghead;
+ runtime_sched.runqtail = gtail;
+ runtime_sched.runqsize += n;
+}
+
// Try get a batch of G's from the global runnable queue.
// Sched must be locked.
static G*
@@ -2967,6 +3081,8 @@ globrunqget(P *p, int32 max)
n = runtime_sched.runqsize;
if(max > 0 && n > max)
n = max;
+ if((uint32)n > nelem(p->runq)/2)
+ n = nelem(p->runq)/2;
runtime_sched.runqsize -= n;
if(runtime_sched.runqsize == 0)
runtime_sched.runqtail = nil;
@@ -3006,78 +3122,98 @@ pidleget(void)
return p;
}
-// Put g on local runnable queue.
-// TODO(dvyukov): consider using lock-free queue.
+// Try to put g on local runnable queue.
+// If it's full, put onto global queue.
+// Executed only by the owner P.
static void
runqput(P *p, G *gp)
{
- int32 h, t, s;
+ uint32 h, t;
- runtime_lock(p);
retry:
- h = p->runqhead;
+ h = runtime_atomicload(&p->runqhead); // load-acquire, synchronize with consumers
t = p->runqtail;
- s = p->runqsize;
- if(t == h-1 || (h == 0 && t == s-1)) {
- runqgrow(p);
- goto retry;
+ if(t - h < nelem(p->runq)) {
+ p->runq[t%nelem(p->runq)] = gp;
+ runtime_atomicstore(&p->runqtail, t+1); // store-release, makes the item available for consumption
+ return;
}
- p->runq[t++] = gp;
- if(t == s)
- t = 0;
- p->runqtail = t;
- runtime_unlock(p);
+ if(runqputslow(p, gp, h, t))
+ return;
+ // the queue is not full, now the put above must succeed
+ goto retry;
+}
+
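(Editorial aside, not part of the patch.) runqhead and runqtail are monotonically increasing uint32 counters: t - h is the queue length even after the counters wrap, and the slot for any index is index % nelem(p->runq), so no separate size field is needed. A minimal single-threaded sketch of the same indexing, assuming a power-of-two capacity such as 256; the real code adds the load-acquire, store-release, and cas annotations shown above for the multi-consumer case:

#include <stdint.h>

enum { RUNQ = 256 };	/* assumed capacity, standing in for nelem(p->runq) */

struct Ring { uint32_t head, tail; void *buf[RUNQ]; };

/* Producer: returns 0 when full (the patch then falls back to runqputslow). */
static int
ringput(struct Ring *r, void *v)
{
	if(r->tail - r->head >= RUNQ)	/* unsigned subtraction survives wraparound */
		return 0;
	r->buf[r->tail % RUNQ] = v;
	r->tail++;	/* real code: runtime_atomicstore for release ordering */
	return 1;
}

/* Consumer: returns 0 when empty; the real code CASes head instead. */
static void*
ringget(struct Ring *r)
{
	void *v;

	if(r->tail == r->head)
		return 0;
	v = r->buf[r->head % RUNQ];
	r->head++;	/* real code: runtime_cas so concurrent stealers stay consistent */
	return v;
}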
+// Put g and a batch of work from local runnable queue on global queue.
+// Executed only by the owner P.
+static bool
+runqputslow(P *p, G *gp, uint32 h, uint32 t)
+{
+ G *batch[nelem(p->runq)/2+1];
+ uint32 n, i;
+
+ // First, grab a batch from local queue.
+ n = t-h;
+ n = n/2;
+ if(n != nelem(p->runq)/2)
+ runtime_throw("runqputslow: queue is not full");
+ for(i=0; i<n; i++)
+ batch[i] = p->runq[(h+i)%nelem(p->runq)];
+ if(!runtime_cas(&p->runqhead, h, h+n)) // cas-release, commits consume
+ return false;
+ batch[n] = gp;
+ // Link the goroutines.
+ for(i=0; i<n; i++)
+ batch[i]->schedlink = batch[i+1];
+ // Now put the batch on global queue.
+ runtime_lock(&runtime_sched);
+ globrunqputbatch(batch[0], batch[n], n+1);
+ runtime_unlock(&runtime_sched);
+ return true;
}
// Get g from local runnable queue.
+// Executed only by the owner P.
static G*
runqget(P *p)
{
G *gp;
- int32 t, h, s;
+ uint32 t, h;
- if(p->runqhead == p->runqtail)
- return nil;
- runtime_lock(p);
- h = p->runqhead;
- t = p->runqtail;
- s = p->runqsize;
- if(t == h) {
- runtime_unlock(p);
- return nil;
+ for(;;) {
+ h = runtime_atomicload(&p->runqhead); // load-acquire, synchronize with other consumers
+ t = p->runqtail;
+ if(t == h)
+ return nil;
+ gp = p->runq[h%nelem(p->runq)];
+ if(runtime_cas(&p->runqhead, h, h+1)) // cas-release, commits consume
+ return gp;
}
- gp = p->runq[h++];
- if(h == s)
- h = 0;
- p->runqhead = h;
- runtime_unlock(p);
- return gp;
}
-// Grow local runnable queue.
-// TODO(dvyukov): consider using fixed-size array
-// and transfer excess to the global list (local queue can grow way too big).
-static void
-runqgrow(P *p)
+// Grabs a batch of goroutines from local runnable queue.
+// batch array must be of size nelem(p->runq)/2. Returns number of grabbed goroutines.
+// Can be executed by any P.
+static uint32
+runqgrab(P *p, G **batch)
{
- G **q;
- int32 s, t, h, t2;
+ uint32 t, h, n, i;
- h = p->runqhead;
- t = p->runqtail;
- s = p->runqsize;
- t2 = 0;
- q = runtime_malloc(2*s*sizeof(*q));
- while(t != h) {
- q[t2++] = p->runq[h++];
- if(h == s)
- h = 0;
+ for(;;) {
+ h = runtime_atomicload(&p->runqhead); // load-acquire, synchronize with other consumers
+ t = runtime_atomicload(&p->runqtail); // load-acquire, synchronize with the producer
+ n = t-h;
+ n = n - n/2;
+ if(n == 0)
+ break;
+ if(n > nelem(p->runq)/2) // read inconsistent h and t
+ continue;
+ for(i=0; i<n; i++)
+ batch[i] = p->runq[(h+i)%nelem(p->runq)];
+ if(runtime_cas(&p->runqhead, h, h+n)) // cas-release, commits consume
+ break;
}
- runtime_free(p->runq);
- p->runq = q;
- p->runqhead = 0;
- p->runqtail = t2;
- p->runqsize = 2*s;
+ return n;
}
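(Editorial worked example, not part of the patch.) The n = t-h; n = n - n/2 step above takes the larger half when the length is odd, so a stealer always grabs at least as many goroutines as it leaves behind:

/* length (t-h)   grabbed n = (t-h) - (t-h)/2   left behind
       1                    1                        0
       5                    3                        2
       6                    3                        3          */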
// Steal half of elements from local runnable queue of p2
@@ -3086,57 +3222,24 @@ runqgrow(P *p)
static G*
runqsteal(P *p, P *p2)
{
- G *gp, *gp1;
- int32 t, h, s, t2, h2, s2, c, i;
+ G *gp;
+ G *batch[nelem(p->runq)/2];
+ uint32 t, h, n, i;
- if(p2->runqhead == p2->runqtail)
- return nil;
- // sort locks to prevent deadlocks
- if(p < p2)
- runtime_lock(p);
- runtime_lock(p2);
- if(p2->runqhead == p2->runqtail) {
- runtime_unlock(p2);
- if(p < p2)
- runtime_unlock(p);
+ n = runqgrab(p2, batch);
+ if(n == 0)
return nil;
- }
- if(p >= p2)
- runtime_lock(p);
- // now we've locked both queues and know the victim is not empty
- h = p->runqhead;
+ n--;
+ gp = batch[n];
+ if(n == 0)
+ return gp;
+ h = runtime_atomicload(&p->runqhead); // load-acquire, synchronize with consumers
t = p->runqtail;
- s = p->runqsize;
- h2 = p2->runqhead;
- t2 = p2->runqtail;
- s2 = p2->runqsize;
- gp = p2->runq[h2++]; // return value
- if(h2 == s2)
- h2 = 0;
- // steal roughly half
- if(t2 > h2)
- c = (t2 - h2) / 2;
- else
- c = (s2 - h2 + t2) / 2;
- // copy
- for(i = 0; i != c; i++) {
- // the target queue is full?
- if(t == h-1 || (h == 0 && t == s-1))
- break;
- // the victim queue is empty?
- if(t2 == h2)
- break;
- gp1 = p2->runq[h2++];
- if(h2 == s2)
- h2 = 0;
- p->runq[t++] = gp1;
- if(t == s)
- t = 0;
- }
- p->runqtail = t;
- p2->runqhead = h2;
- runtime_unlock(p2);
- runtime_unlock(p);
+ if(t - h + n >= nelem(p->runq))
+ runtime_throw("runqsteal: runq overflow");
+ for(i=0; i<n; i++, t++)
+ p->runq[t%nelem(p->runq)] = batch[i];
+ runtime_atomicstore(&p->runqtail, t); // store-release, makes the item available for consumption
return gp;
}
@@ -3147,14 +3250,10 @@ void
runtime_testSchedLocalQueue(void)
{
P p;
- G gs[1000];
+ G gs[nelem(p.runq)];
int32 i, j;
runtime_memclr((byte*)&p, sizeof(p));
- p.runqsize = 1;
- p.runqhead = 0;
- p.runqtail = 0;
- p.runq = runtime_malloc(p.runqsize*sizeof(*p.runq));
for(i = 0; i < (int32)nelem(gs); i++) {
if(runqget(&p) != nil)
@@ -3179,20 +3278,11 @@ void
runtime_testSchedLocalQueueSteal(void)
{
P p1, p2;
- G gs[1000], *gp;
+ G gs[nelem(p1.runq)], *gp;
int32 i, j, s;
runtime_memclr((byte*)&p1, sizeof(p1));
- p1.runqsize = 1;
- p1.runqhead = 0;
- p1.runqtail = 0;
- p1.runq = runtime_malloc(p1.runqsize*sizeof(*p1.runq));
-
runtime_memclr((byte*)&p2, sizeof(p2));
- p2.runqsize = nelem(gs);
- p2.runqhead = 0;
- p2.runqtail = 0;
- p2.runq = runtime_malloc(p2.runqsize*sizeof(*p2.runq));
for(i = 0; i < (int32)nelem(gs); i++) {
for(j = 0; j < i; j++) {
@@ -3225,13 +3315,10 @@ runtime_testSchedLocalQueueSteal(void)
}
}
-intgo runtime_debug_setMaxThreads(intgo)
- __asm__(GOSYM_PREFIX "runtime_debug.setMaxThreads");
-
-intgo
-runtime_debug_setMaxThreads(intgo in)
+int32
+runtime_setmaxthreads(int32 in)
{
- intgo out;
+ int32 out;
runtime_lock(&runtime_sched);
out = runtime_sched.maxmcount;
@@ -3242,29 +3329,9 @@ runtime_debug_setMaxThreads(intgo in)
}
void
-runtime_proc_scan(void (*addroot)(Obj))
-{
- addroot((Obj){(byte*)&runtime_sched, sizeof runtime_sched, 0});
-}
-
-// When a function calls a closure, it passes the closure value to
-// __go_set_closure immediately before the function call. When a
-// function uses a closure, it calls __go_get_closure immediately on
-// function entry. This is a hack, but it will work on any system.
-// It would be better to use the static chain register when there is
-// one. It is also worth considering expanding these functions
-// directly in the compiler.
-
-void
-__go_set_closure(void* v)
-{
- g->closure = v;
-}
-
-void *
-__go_get_closure(void)
+runtime_proc_scan(struct Workbuf** wbufp, void (*enqueue1)(struct Workbuf**, Obj))
{
- return g->closure;
+ enqueue1(wbufp, (Obj){(byte*)&runtime_sched, sizeof runtime_sched, 0});
}
// Return whether we are waiting for a GC. This gc toolchain uses