From 59a42ca068d7cbe0b92f2f68768129599a6c42f8 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Thu, 2 Oct 2014 14:26:04 -0400 Subject: [dev.garbage] runtime: remove another BitsMultiWord Not found because it was not used by name. Add name in comments for what's left behind. LGTM=rlh R=rlh CC=golang-codereviews https://codereview.appspot.com/148430043 --- src/runtime/gcinfo_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/gcinfo_test.go b/src/runtime/gcinfo_test.go index 88f6703f9..e74d8c2c0 100644 --- a/src/runtime/gcinfo_test.go +++ b/src/runtime/gcinfo_test.go @@ -188,6 +188,6 @@ var ( infoString = []byte{BitsPointer, BitsDead} infoSlice = []byte{BitsPointer, BitsDead, BitsDead} - infoEface = []byte{BitsMultiWord, BitsEface} - infoIface = []byte{BitsMultiWord, BitsIface} + infoEface = []byte{BitsPointer, BitsPointer} + infoIface = []byte{BitsPointer, BitsPointer} ) -- cgit v1.2.1 From 5fc8be8db6a4e7a0e43718e6b21d2b78e5aacf5f Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Thu, 2 Oct 2014 16:49:11 -0400 Subject: [dev.garbage] runtime: make sure G.param and SudoG.elem do not hold stale pointers In old conservative Go, this could cause memory leaks. A new pickier collector might reasonably crash when it saw one of these. LGTM=rlh R=rlh CC=golang-codereviews https://codereview.appspot.com/147480043 --- src/runtime/chan.go | 11 +++++++++-- src/runtime/proc.go | 13 +++++++++++++ src/runtime/select.go | 7 +++++++ src/runtime/sema.go | 1 + 4 files changed, 30 insertions(+), 2 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/chan.go b/src/runtime/chan.go index 48925b2e3..10503f4e1 100644 --- a/src/runtime/chan.go +++ b/src/runtime/chan.go @@ -140,10 +140,11 @@ func chansend(t *chantype, c *hchan, ep unsafe.Pointer, block bool, callerpc uin unlock(&c.lock) recvg := sg.g - recvg.param = unsafe.Pointer(sg) if sg.elem != nil { memmove(unsafe.Pointer(sg.elem), ep, uintptr(c.elemsize)) + sg.elem = nil } + recvg.param = unsafe.Pointer(sg) if sg.releasetime != 0 { sg.releasetime = cputicks() } @@ -179,6 +180,7 @@ func chansend(t *chantype, c *hchan, ep unsafe.Pointer, block bool, callerpc uin } panic("send on closed channel") } + gp.param = nil if mysg.releasetime > 0 { blockevent(int64(mysg.releasetime)-t0, 2) } @@ -278,6 +280,7 @@ func closechan(c *hchan) { break } gp := sg.g + sg.elem = nil gp.param = nil if sg.releasetime != 0 { sg.releasetime = cputicks() @@ -292,6 +295,7 @@ func closechan(c *hchan) { break } gp := sg.g + sg.elem = nil gp.param = nil if sg.releasetime != 0 { sg.releasetime = cputicks() @@ -372,6 +376,7 @@ func chanrecv(t *chantype, c *hchan, ep unsafe.Pointer, block bool) (selected, r if ep != nil { memmove(ep, sg.elem, uintptr(c.elemsize)) } + sg.elem = nil gp := sg.g gp.param = unsafe.Pointer(sg) if sg.releasetime != 0 { @@ -409,9 +414,11 @@ func chanrecv(t *chantype, c *hchan, ep unsafe.Pointer, block bool) (selected, r if mysg.releasetime > 0 { blockevent(mysg.releasetime-t0, 2) } + haveData := gp.param != nil + gp.param = nil releaseSudog(mysg) - if gp.param != nil { + if haveData { // a sender sent us some data. It already wrote to ep. 
selected = true received = true diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 9b9586859..eefe8239f 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -148,6 +148,9 @@ func acquireSudog() *sudog { c := gomcache() s := c.sudogcache if s != nil { + if s.elem != nil { + gothrow("acquireSudog: found s.elem != nil in cache") + } c.sudogcache = s.next return s } @@ -162,12 +165,22 @@ func acquireSudog() *sudog { // which keeps the garbage collector from being invoked. mp := acquirem() p := new(sudog) + if p.elem != nil { + gothrow("acquireSudog: found p.elem != nil after new") + } releasem(mp) return p } //go:nosplit func releaseSudog(s *sudog) { + if s.elem != nil { + gothrow("runtime: sudog with non-nil elem") + } + gp := getg() + if gp.param != nil { + gothrow("runtime: releaseSudog with non-nil gp.param") + } c := gomcache() s.next = c.sudogcache c.sudogcache = s diff --git a/src/runtime/select.go b/src/runtime/select.go index 7716d2d4b..1bcea8c4b 100644 --- a/src/runtime/select.go +++ b/src/runtime/select.go @@ -368,6 +368,7 @@ loop: // someone woke us up sellock(sel) sg = (*sudog)(gp.param) + gp.param = nil // pass 3 - dequeue from unsuccessful chans // otherwise they stack up on quiet channels @@ -376,6 +377,10 @@ loop: // iterating through the linked list they are in reverse order. cas = nil sglist = gp.waiting + // Clear all elem before unlinking from gp.waiting. + for sg1 := gp.waiting; sg1 != nil; sg1 = sg1.waitlink { + sg1.elem = nil + } gp.waiting = nil for i := int(sel.ncase) - 1; i >= 0; i-- { k = &scases[pollorder[i]] @@ -506,6 +511,7 @@ syncrecv: if cas.elem != nil { memmove(cas.elem, sg.elem, uintptr(c.elemsize)) } + sg.elem = nil gp = sg.g gp.param = unsafe.Pointer(sg) if sg.releasetime != 0 { @@ -541,6 +547,7 @@ syncsend: if sg.elem != nil { memmove(sg.elem, cas.elem, uintptr(c.elemsize)) } + sg.elem = nil gp = sg.g gp.param = unsafe.Pointer(sg) if sg.releasetime != 0 { diff --git a/src/runtime/sema.go b/src/runtime/sema.go index beacd6716..142d3082c 100644 --- a/src/runtime/sema.go +++ b/src/runtime/sema.go @@ -168,6 +168,7 @@ func (root *semaRoot) dequeue(s *sudog) { } else { root.head = s.next } + s.elem = nil s.next = nil s.prev = nil } -- cgit v1.2.1 From 4eb6792aa572c7e6d3448d4cf22223b61b65724f Mon Sep 17 00:00:00 2001 From: Rick Hudson Date: Fri, 3 Oct 2014 11:33:57 -0400 Subject: [dev.garbage] runtime: scan and mark phase refactoring Refactoring of the scan and mark phase so that concurrent GC, in particular the write barrier, can share a common infrastructure. Now that the scan and mark phases have been separated we will be able to scan stacks without blackening any objects. This in turn will allow us to delay installing expensive write barrier code. 
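For illustration only (not code from this CL — every name here is invented for the sketch): in tricolor terms, the scan phase may only grey objects, while the mark phase drains the grey work list, blackening as it goes. A minimal Go model of that split:

	package main

	import "fmt"

	type colour int

	const (
		white colour = iota // not yet reached
		grey                // reached, children not yet scanned
		black               // reached and fully scanned
	)

	type object struct {
		colour   colour
		children []*object
	}

	// scanRoots models the scan phase: grey the roots and queue them,
	// but blacken nothing.
	func scanRoots(roots, work []*object) []*object {
		for _, r := range roots {
			if r.colour == white {
				r.colour = grey
				work = append(work, r)
			}
		}
		return work
	}

	// mark models the mark phase: drain the grey queue, greying white
	// children and blackening each object once its children are queued.
	func mark(work []*object) {
		for len(work) > 0 {
			obj := work[len(work)-1]
			work = work[:len(work)-1]
			for _, c := range obj.children {
				if c.colour == white {
					c.colour = grey
					work = append(work, c)
				}
			}
			obj.colour = black
		}
	}

	func main() {
		leaf := &object{}
		root := &object{children: []*object{leaf}}
		mark(scanRoots([]*object{root}, nil))
		fmt.Println(root.colour == black, leaf.colour == black) // true true
	}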
LGTM=rsc R=rsc, khr, dvyukov CC=golang-codereviews https://codereview.appspot.com/145640044 Committer: Russ Cox --- src/runtime/malloc.h | 10 +- src/runtime/mgc0.c | 627 ++++++++++++++++++++++++++++++-------------------- src/runtime/proc.c | 3 +- src/runtime/runtime.h | 27 ++- 4 files changed, 404 insertions(+), 263 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/malloc.h b/src/runtime/malloc.h index 3f1981f70..413870c9f 100644 --- a/src/runtime/malloc.h +++ b/src/runtime/malloc.h @@ -86,6 +86,7 @@ typedef struct MSpan MSpan; typedef struct MStats MStats; typedef struct MLink MLink; typedef struct GCStats GCStats; +typedef struct Workbuf Workbuf; enum { @@ -337,8 +338,11 @@ struct MCache StackFreeList stackcache[NumStackOrders]; SudoG* sudogcache; - - void* gcworkbuf; + // Cached P local buffer holding grey objects (marked by not yet scanned) + // Used by mutator for write barrier work. + // GC uses the mcache of the P it is running on for stack and global scanning + // work as well marking. + Workbuf* gcworkbuf; // Local allocator stats, flushed during GC. uintptr local_nlookup; // number of pointer lookups @@ -350,7 +354,7 @@ struct MCache MSpan* runtime·MCache_Refill(MCache *c, int32 sizeclass); void runtime·MCache_ReleaseAll(MCache *c); void runtime·stackcache_clear(MCache *c); -void runtime·gcworkbuffree(void *b); +void runtime·gcworkbuffree(Workbuf *b); enum { diff --git a/src/runtime/mgc0.c b/src/runtime/mgc0.c index 7a3498ae1..b4cd3474d 100644 --- a/src/runtime/mgc0.c +++ b/src/runtime/mgc0.c @@ -66,7 +66,6 @@ enum { Debug = 0, ConcurrentSweep = 1, - WorkbufSize = 4*1024, FinBlockSize = 4*1024, RootData = 0, RootBss = 1, @@ -97,12 +96,12 @@ extern int32 runtime·gcpercent; // uint32 runtime·worldsema = 1; -typedef struct Workbuf Workbuf; -struct Workbuf -{ - LFNode node; // must be first - uintptr nobj; - byte* obj[(WorkbufSize-sizeof(LFNode)-sizeof(uintptr))/PtrSize]; +typedef struct Markbits Markbits; +struct Markbits { + byte *bitp; // pointer to the byte holding xbits + byte shift; // bits xbits needs to be shifted to get bits + byte xbits; // byte holding all the bits from *bitp + byte bits; // bits relevant to corresponding slot. }; extern byte runtime·data[]; @@ -127,15 +126,22 @@ BitVector runtime·gcbssmask; Mutex runtime·gclock; +static Workbuf* getpartial(void); +static void putpartial(Workbuf*); static Workbuf* getempty(Workbuf*); static Workbuf* getfull(Workbuf*); static void putempty(Workbuf*); static Workbuf* handoff(Workbuf*); static void gchelperstart(void); static void flushallmcaches(void); -static bool scanframe(Stkframe *frame, void *unused); -static void scanstack(G *gp); -static BitVector unrollglobgcprog(byte *prog, uintptr size); +static bool scanframe(Stkframe*, void*); +static void scanstack(G*); +static BitVector unrollglobgcprog(byte*, uintptr); +static void scanblock(byte*, uintptr, byte*); +static byte* objectstart(byte*, Markbits*); +static Workbuf* greyobject(byte*, Markbits*, Workbuf*); +static bool inheap(byte*); +static void slottombits(byte*, Markbits*); void runtime·bgsweep(void); static FuncVal bgsweepv = {runtime·bgsweep}; @@ -156,258 +162,279 @@ static struct { uint32 nspan; } work; -// scanblock scans a block of n bytes starting at pointer b for references -// to other objects, scanning any it finds recursively until there are no -// unscanned objects left. Instead of using an explicit recursion, it keeps -// a work list in the Workbuf* structures and loops in the main function -// body. 
Keeping an explicit work list is easier on the stack allocator and -// more efficient. +// Is address b in the known heap. If it doesn't have a valid gcmap +// returns false. For example pointers into stacks will return false. +static bool +inheap(byte *b) +{ + MSpan *s; + pageID k; + uintptr x; + + if(b == nil || b < runtime·mheap.arena_start || b >= runtime·mheap.arena_used) + return false; + // Not a beginning of a block, consult span table to find the block beginning. + k = (uintptr)b>>PageShift; + x = k; + x -= (uintptr)runtime·mheap.arena_start>>PageShift; + s = runtime·mheap.spans[x]; + if(s == nil || k < s->start || b >= s->limit || s->state != MSpanInUse) + return false; + return true; +} + +// Given an address in the heap return the relevant byte from the gcmap. This routine +// can be used on addresses to the start of an object or to the interior of the an object. static void -scanblock(byte *b, uintptr n, byte *ptrmask) +slottombits(byte *obj, Markbits *mbits) { - byte *obj, *p, *arena_start, *arena_used, **wp, *scanbuf[8], *ptrbitp, *bitp, bits, xbits, shift, cached; - uintptr i, nobj, size, idx, x, off, scanbufpos; - intptr ncached; - Workbuf *wbuf; - Iface *iface; - Eface *eface; - Type *typ; + uintptr off; + + off = (uintptr*)((uintptr)obj&~(PtrSize-1)) - (uintptr*)runtime·mheap.arena_start; + mbits->bitp = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1; + mbits->shift = (off % wordsPerBitmapByte) * gcBits; + mbits->xbits = *mbits->bitp; + mbits->bits = (mbits->xbits >> mbits->shift) & bitMask; +} + +// b is a pointer into the heap. +// Find the start of the object refered to by b. +// Set mbits to the associated bits from the bit map. +static byte* +objectstart(byte *b, Markbits *mbits) +{ + byte *obj, *p; MSpan *s; pageID k; - bool keepworking; + uintptr x, size, idx; - // Cache memory arena parameters in local vars. - arena_start = runtime·mheap.arena_start; - arena_used = runtime·mheap.arena_used; + obj = (byte*)((uintptr)b&~(PtrSize-1)); + for(;;) { + slottombits(obj, mbits); + if(mbits->bits&bitBoundary == bitBoundary) + break; + + // Not a beginning of a block, consult span table to find the block beginning. + k = (uintptr)obj>>PageShift; + x = k; + x -= (uintptr)runtime·mheap.arena_start>>PageShift; + s = runtime·mheap.spans[x]; + if(s == nil || k < s->start || obj >= s->limit || s->state != MSpanInUse){ + if(s->state == MSpanStack) + break; // This is legit. + + // The following is catching some bugs left over from + // us not being rigerous about what data structures are + // hold valid pointers and different parts of the system + // considering different structures as roots. For example + // if there is a pointer into a stack that is left in + // a global data structure but that part of the runtime knows that + // those structures will be reinitialized before they are + // reused. Unfortunately the GC believes these roots are valid. + // Typically a stack gets moved and only the structures that part of + // the system knows are alive are updated. The span is freed + // after the stack copy and the pointer is still alive. This + // check is catching that bug but for now we will not throw, + // instead we will simply break out of this routine and depend + // on the caller to recognize that this pointer is not a valid + // heap pointer. I leave the code that catches the bug so that once + // resolved we can turn this check back on and throw. 
+ + //runtime·printf("Runtime: Span weird: obj=%p, k=%p", obj, k); + //if (s == nil) + // runtime·printf(" s=nil\n"); + //else + // runtime·printf(" s->start=%p s->limit=%p, s->state=%d\n", s->start*PageSize, s->limit, s->state); + //runtime·throw("Blowup on weird span"); + break; // We are not in a real block throw?? + } + p = (byte*)((uintptr)s->start<sizeclass != 0) { + size = s->elemsize; + idx = ((byte*)obj - p)/size; + p = p+idx*size; + } + if(p == obj) { + runtime·printf("runtime: failed to find block beginning for %p s=%p s->limit=%p\n", + p, s->start*PageSize, s->limit); + runtime·throw("failed to find block beginning"); + } + obj = p; + } + // if size(obj.firstfield) < PtrSize, the &obj.secondfield could map to the boundary bit + // Clear any low bits to get to the start of the object. + // greyobject depends on this. + return obj; +} - wbuf = getempty(nil); - nobj = wbuf->nobj; - wp = &wbuf->obj[nobj]; - keepworking = b == nil; - scanbufpos = 0; - for(i = 0; i < nelem(scanbuf); i++) - scanbuf[i] = nil; +// obj is the start of an object with mark mbits. +// If it isn't already marked, mark it and enqueue into workbuf. +// Return possibly new workbuf to use. +static Workbuf* +greyobject(byte *obj, Markbits *mbits, Workbuf *wbuf) +{ + // obj should be start of allocation, and so must be at least pointer-aligned. + if(((uintptr)obj & (PtrSize-1)) != 0) + runtime·throw("greyobject: obj not pointer-aligned"); + + // If marked we have nothing to do. + if((mbits->bits&bitMarked) != 0) + return wbuf; + + // Each byte of GC bitmap holds info for two words. + // If the current object is larger than two words, or if the object is one word + // but the object it shares the byte with is already marked, + // then all the possible concurrent updates are trying to set the same bit, + // so we can use a non-atomic update. + if((mbits->xbits&(bitMask|(bitMask<bitp = mbits->xbits | (bitMarked<shift); + else + runtime·atomicor8(mbits->bitp, bitMarked<shift); + + if(((mbits->xbits>>(mbits->shift+2))&BitsMask) == BitsDead) + return wbuf; // noscan object + + // Queue the obj for scanning. The PREFETCH(obj) logic has been removed but + // seems like a nice optimization that can be added back in. + // There needs to be time between the PREFETCH and the use. + // Previously we put the obj in an 8 element buffer that is drained at a rate + // to give the PREFETCH time to do its work. + // Use of PREFETCHNTA might be more appropriate than PREFETCH + + // If workbuf is full, obtain an empty one. + if(wbuf->nobj >= nelem(wbuf->obj)) { + wbuf = getempty(wbuf); + } + + wbuf->obj[wbuf->nobj] = obj; + wbuf->nobj++; + return wbuf; +} +// Scan the object b of size n, adding pointers to wbuf. +// Return possibly new wbuf to use. +// If ptrmask != nil, it specifies where pointers are in b. +// If ptrmask == nil, the GC bitmap should be consulted. +// In this case, n may be an overestimate of the size; the GC bitmap +// must also be used to make sure the scan stops at the end of b. +static Workbuf* +scanobject(byte *b, uintptr n, byte *ptrmask, Workbuf *wbuf) +{ + byte *obj, *arena_start, *arena_used, *ptrbitp, bits, cshift, cached; + uintptr i; + intptr ncached; + Markbits mbits; + + arena_start = (byte*)runtime·mheap.arena_start; + arena_used = runtime·mheap.arena_used; ptrbitp = nil; cached = 0; ncached = 0; + // Find bits of the beginning of the object. 
+ if(ptrmask == nil) { + b = objectstart(b, &mbits); + ptrbitp = mbits.bitp; //arena_start - off/wordsPerBitmapByte - 1; + cshift = mbits.shift; //(off % wordsPerBitmapByte) * gcBits; + cached = *ptrbitp >> cshift; + cached &= ~bitBoundary; + ncached = (8 - cshift)/gcBits; + } + for(i = 0; i < n; i += PtrSize) { + // Find bits for this word. + if(ptrmask != nil) { + // dense mask (stack or data) + bits = (ptrmask[(i/PtrSize)/4]>>(((i/PtrSize)%4)*BitsPerPointer))&BitsMask; + } else { + // Check if we have reached end of span. + if((((uintptr)b+i)%PageSize) == 0 && + runtime·mheap.spans[(b-arena_start)>>PageShift] != runtime·mheap.spans[(b+i-arena_start)>>PageShift]) + break; + // Consult GC bitmap. + if(ncached <= 0) { + // Refill cache. + cached = *--ptrbitp; + ncached = 2; + } + bits = cached; + cached >>= gcBits; + ncached--; + + if((bits&bitBoundary) != 0) + break; // reached beginning of the next object + bits = (bits>>2)&BitsMask; + if(bits == BitsDead) + break; // reached no-scan part of the object + } + + if(bits == BitsScalar || bits == BitsDead) + continue; + if(bits != BitsPointer) + runtime·throw("unexpected garbage collection bits"); + + obj = *(byte**)(b+i); + // At this point we have extracted the next potential pointer. + // Check if it points into heap. + if(obj == nil || obj < arena_start || obj >= arena_used) + continue; + // Mark the object. return some important bits. + // We we combine the following two rotines we don't have to pass mbits or obj around. + obj = objectstart(obj, &mbits); + wbuf = greyobject(obj, &mbits, wbuf); + } + return wbuf; +} + +// scanblock starts by scanning b as scanobject would. +// If the gcphase is GCscan, that's all scanblock does. +// Otherwise it traverses some fraction of the pointers it found in b, recursively. +// As a special case, scanblock(nil, 0, nil) means to scan previously queued work, +// stopping only when no work is left in the system. +static void +scanblock(byte *b, uintptr n, byte *ptrmask) +{ + Workbuf *wbuf; + bool keepworking; + + wbuf = getpartial(); + if(b != nil) { + wbuf = scanobject(b, n, ptrmask, wbuf); + if(runtime·gcphase == GCscan) { + putpartial(wbuf); + return; + } + } + + keepworking = b == nil; + // ptrmask can have 2 possible values: // 1. nil - obtain pointer mask from GC bitmap. // 2. pointer to a compact mask (for stacks and data). - if(b != nil) - goto scanobj; for(;;) { - if(nobj == 0) { - // Out of work in workbuf. - // First, see is there is any work in scanbuf. - for(i = 0; i < nelem(scanbuf); i++) { - b = scanbuf[scanbufpos]; - scanbuf[scanbufpos++] = nil; - if(scanbufpos == nelem(scanbuf)) - scanbufpos = 0; - if(b != nil) { - n = arena_used - b; // scan until bitBoundary or BitsDead - ptrmask = nil; // use GC bitmap for pointer info - goto scanobj; - } - } + if(wbuf->nobj == 0) { if(!keepworking) { putempty(wbuf); return; } // Refill workbuf from global queue. wbuf = getfull(wbuf); - if(wbuf == nil) + if(wbuf == nil) // nil means out of work barrier reached return; - nobj = wbuf->nobj; - wp = &wbuf->obj[nobj]; } // If another proc wants a pointer, give it some. - if(work.nwait > 0 && nobj > 4 && work.full == 0) { - wbuf->nobj = nobj; + if(work.nwait > 0 && wbuf->nobj > 4 && work.full == 0) { wbuf = handoff(wbuf); - nobj = wbuf->nobj; - wp = &wbuf->obj[nobj]; - } - - wp--; - nobj--; - b = *wp; - n = arena_used - b; // scan until next bitBoundary or BitsDead - ptrmask = nil; // use GC bitmap for pointer info - - scanobj: - // Find bits of the beginning of the object. 
- if(ptrmask == nil) { - off = (uintptr*)b - (uintptr*)arena_start; - ptrbitp = arena_start - off/wordsPerBitmapByte - 1; - shift = (off % wordsPerBitmapByte) * gcBits; - cached = *ptrbitp >> shift; - cached &= ~bitBoundary; - ncached = (8 - shift)/gcBits; - } - for(i = 0; i < n; i += PtrSize) { - obj = nil; - // Find bits for this word. - if(ptrmask == nil) { - // Check is we have reached end of span. - if((((uintptr)b+i)%PageSize) == 0 && - runtime·mheap.spans[(b-arena_start)>>PageShift] != runtime·mheap.spans[(b+i-arena_start)>>PageShift]) - break; - // Consult GC bitmap. - if(ncached <= 0) { - // Refill cache. - cached = *--ptrbitp; - ncached = 2; - } - bits = cached; - cached >>= gcBits; - ncached--; - if((bits&bitBoundary) != 0) - break; // reached beginning of the next object - bits = (bits>>2)&BitsMask; - if(bits == BitsDead) - break; // reached no-scan part of the object - } else // dense mask (stack or data) - bits = (ptrmask[(i/PtrSize)/4]>>(((i/PtrSize)%4)*BitsPerPointer))&BitsMask; - - if(bits == BitsScalar || bits == BitsDead) - continue; - if(bits == BitsPointer) { - obj = *(byte**)(b+i); - goto markobj; - } - - // With those three out of the way, must be multi-word. - if(bits != BitsMultiWord) - runtime·throw("unexpected garbage collection bits"); - // Find the next pair of bits. - if(ptrmask == nil) { - if(ncached <= 0) { - // Refill cache. - cached = *--ptrbitp; - ncached = 2; - } - bits = (cached>>2)&BitsMask; - } else - bits = (ptrmask[((i+PtrSize)/PtrSize)/4]>>((((i+PtrSize)/PtrSize)%4)*BitsPerPointer))&BitsMask; - - switch(bits) { - default: - runtime·throw("unexpected garbage collection bits"); - case BitsIface: - iface = (Iface*)(b+i); - if(iface->tab != nil) { - typ = iface->tab->type; - if(!(typ->kind&KindDirectIface) || !(typ->kind&KindNoPointers)) - obj = iface->data; - } - break; - case BitsEface: - eface = (Eface*)(b+i); - typ = eface->type; - if(typ != nil) { - if(!(typ->kind&KindDirectIface) || !(typ->kind&KindNoPointers)) - obj = eface->data; - } - break; - } - - i += PtrSize; - cached >>= gcBits; - ncached--; - - markobj: - // At this point we have extracted the next potential pointer. - // Check if it points into heap. - if(obj == nil || obj < arena_start || obj >= arena_used) - continue; - // Mark the object. - off = (uintptr*)obj - (uintptr*)arena_start; - bitp = arena_start - off/wordsPerBitmapByte - 1; - shift = (off % wordsPerBitmapByte) * gcBits; - xbits = *bitp; - bits = (xbits >> shift) & bitMask; - if((bits&bitBoundary) == 0) { - // Not a beginning of a block, consult span table to find the block beginning. - k = (uintptr)obj>>PageShift; - x = k; - x -= (uintptr)arena_start>>PageShift; - s = runtime·mheap.spans[x]; - if(s == nil || k < s->start || obj >= s->limit || s->state != MSpanInUse) - continue; - p = (byte*)((uintptr)s->start<sizeclass != 0) { - size = s->elemsize; - idx = ((byte*)obj - p)/size; - p = p+idx*size; - } - if(p == obj) { - runtime·printf("runtime: failed to find block beginning for %p s=%p s->limit=%p\n", - p, s->start*PageSize, s->limit); - runtime·throw("failed to find block beginning"); - } - obj = p; - goto markobj; - } - - // Now we have bits, bitp, and shift correct for - // obj pointing at the base of the object. - // Only care about not marked objects. - if((bits&bitMarked) != 0) - continue; - // If obj size is greater than 8, then each byte of GC bitmap - // contains info for at most one object. In such case we use - // non-atomic byte store to mark the object. 
This can lead - // to double enqueue of the object for scanning, but scanning - // is an idempotent operation, so it is OK. This cannot lead - // to bitmap corruption because the single marked bit is the - // only thing that can change in the byte. - // For 8-byte objects we use non-atomic store, if the other - // quadruple is already marked. Otherwise we resort to CAS - // loop for marking. - if((xbits&(bitMask|(bitMask<>(shift+2))&BitsMask) == BitsDead) - continue; // noscan object - - // Queue the obj for scanning. - PREFETCH(obj); - obj = (byte*)((uintptr)obj & ~(PtrSize-1)); - p = scanbuf[scanbufpos]; - scanbuf[scanbufpos++] = obj; - if(scanbufpos == nelem(scanbuf)) - scanbufpos = 0; - if(p == nil) - continue; - - // If workbuf is full, obtain an empty one. - if(nobj >= nelem(wbuf->obj)) { - wbuf->nobj = nobj; - wbuf = getempty(wbuf); - nobj = wbuf->nobj; - wp = &wbuf->obj[nobj]; - } - *wp = p; - wp++; - nobj++; } - if(Debug && ptrmask == nil) { - // For heap objects ensure that we did not overscan. - n = 0; - p = nil; - if(!runtime·mlookup(b, &p, &n, nil) || b != p || i > n) { - runtime·printf("runtime: scanned (%p,%p), heap object (%p,%p)\n", b, i, p, n); - runtime·throw("scanblock: scanned invalid object"); - } - } + // This might be a good place to add prefetch code... + // if(wbuf->nobj > 4) { + // PREFETCH(wbuf->obj[wbuf->nobj - 3]; + // } + --wbuf->nobj; + b = wbuf->obj[wbuf->nobj]; + wbuf = scanobject(b, runtime·mheap.arena_used - b, nil, wbuf); } } @@ -460,7 +487,8 @@ markroot(ParFor *desc, uint32 i) spf = (SpecialFinalizer*)sp; // A finalizer can be set for an inner byte of an object, find object beginning. p = (void*)((s->start << PageShift) + spf->special.offset/s->elemsize*s->elemsize); - scanblock(p, s->elemsize, nil); + if(runtime·gcphase != GCscan) + scanblock(p, s->elemsize, nil); // Scanned during mark phase scanblock((void*)&spf->fn, PtrSize, oneptr); } } @@ -477,7 +505,7 @@ markroot(ParFor *desc, uint32 i) gp = runtime·allg[i - RootCount]; // remember when we've first observed the G blocked // needed only to output in traceback - status = runtime·readgstatus(gp); + status = runtime·readgstatus(gp); // We are not in a scan state if((status == Gwaiting || status == Gsyscall) && gp->waitsince == 0) gp->waitsince = work.tstart; // Shrink a stack if not much of it is being used. @@ -487,7 +515,31 @@ markroot(ParFor *desc, uint32 i) else gp->gcworkdone = false; restart = runtime·stopg(gp); - scanstack(gp); + + // goroutine will scan its own stack when it stops running. + // Wait until it has. + while((status = runtime·readgstatus(gp)) == Grunning && !gp->gcworkdone) { + if(status == Gdead) { + // TBD you need to explain why Gdead without gp->gcworkdone + // being true. If there is a race then it needs to be + // explained here. + gp->gcworkdone = true; // scan is a noop + break; + //do nothing, scan not needed. + } + // try again + } + + // scanstack(gp); now done as part of gcphasework + // But to make sure we finished we need to make sure that + // the stack traps have all responded so drop into + // this while loop until they respond. + if(!gp->gcworkdone) + // For some reason a G has not completed its work. This is a bug that + // needs to be investigated. For now I'll just print this message in + // case the bug is benign. 
+ runtime·printf("runtime:markroot: post stack scan work not done gp=%p has status %x\n", gp, status); + if(restart) runtime·restartg(gp); break; @@ -511,8 +563,12 @@ getempty(Workbuf *b) } if(b == nil) b = (Workbuf*)runtime·lfstackpop(&work.empty); - if(b == nil) + if(b == nil) { b = runtime·persistentalloc(sizeof(*b), CacheLineSize, &mstats.gc_sys); + b->nobj = 0; + } + if(b->nobj != 0) + runtime·throw("getempty: b->nobj not 0/n"); b->nobj = 0; return b; } @@ -522,6 +578,8 @@ putempty(Workbuf *b) { MCache *c; + if(b->nobj != 0) + runtime·throw("putempty: b->nobj=%D not 0\n"); c = g->m->mcache; if(c->gcworkbuf == nil) { c->gcworkbuf = b; @@ -530,21 +588,70 @@ putempty(Workbuf *b) runtime·lfstackpush(&work.empty, &b->node); } +// Get an partially empty work buffer from the mcache structure +// and if non is available get an empty one. +static Workbuf* +getpartial(void) +{ + MCache *c; + Workbuf *b; + + c = g->m->mcache; + if(c->gcworkbuf != nil) { + b = c->gcworkbuf; + c->gcworkbuf = nil; + } else { + b = getempty(nil); + } + return b; +} + +static void +putpartial(Workbuf *b) +{ + MCache *c; + + c = g->m->mcache; + if(c->gcworkbuf == nil) { + c->gcworkbuf = b; + return; + } + + runtime·throw("putpartial: c->gcworkbuf is not nil\n"); + + runtime·lfstackpush(&work.full, &b->node); +} + void -runtime·gcworkbuffree(void *b) +runtime·gcworkbuffree(Workbuf *b) { - if(b != nil) + if(b != nil) { + if(b->nobj != 0) + runtime·throw("gcworkbufferfree: b->nobj not 0\n"); putempty(b); + } } + // Get a full work buffer off the work.full list, or return nil. +// getfull acts as a barrier for work.nproc helpers. As long as one +// gchelper is actively marking objects it +// may create a workbuffer that the other helpers can work on. +// The for loop either exits when a work buffer is found +// or when _all_ of the work.nproc gc helpers are in the loop +// looking for work and thus not capable of creating new work. +// This is in fact the termination condition for the STW mark +// phase. static Workbuf* getfull(Workbuf *b) { int32 i; - if(b != nil) + if(b != nil) { + if(b->nobj != 0) + runtime·printf("runtime:getfull: b->nobj=%D not 0.", b->nobj); runtime·lfstackpush(&work.empty, &b->node); + } b = (Workbuf*)runtime·lfstackpop(&work.full); if(b != nil || work.nproc == 1) return b; @@ -674,7 +781,7 @@ scanframe(Stkframe *frame, void *unused) } bv = runtime·stackmapdata(stackmap, pcdata); } - scanblock((byte*)frame->argp, bv.n/BitsPerPointer*PtrSize, bv.bytedata); + scanblock((byte*)frame->argp, bv.n/BitsPerPointer*PtrSize, bv.bytedata); } return true; } @@ -727,12 +834,23 @@ runtime·gcphasework(G *gp) case GCquiesce: case GCstw: case GCsweep: - // No work for now. + // No work. + break; + case GCscan: + // scan the stack, mark the objects, put pointers in work buffers + // hanging off the P where this is being run. + scanstack(gp); break; case GCmark: + case GCmarktermination: + // // Disabled until concurrent GC is implemented // but indicate the scan has been done. - // scanstack(gp); + scanstack(gp); + // scanstack will call shade which will populate + // the Workbuf. 
+ // emptywbuf(gp) will empty it before returning + // break; } gp->gcworkdone = true; @@ -1108,6 +1226,7 @@ runtime·gosweepdone(void) return runtime·mheap.sweepdone; } + void runtime·gchelper(void) { @@ -1118,10 +1237,8 @@ runtime·gchelper(void) // parallel mark for over gc roots runtime·parfordo(work.markfor); - - // help other threads scan secondary blocks - scanblock(nil, 0, nil); - + if(runtime·gcphase != GCscan) + scanblock(nil, 0, nil); // blocks in getfull nproc = work.nproc; // work.nproc can change right after we increment work.ndone if(runtime·xadd(&work.ndone, +1) == nproc-1) runtime·notewakeup(&work.alldone); @@ -1288,6 +1405,7 @@ runtime·gcinit(void) runtime·gcbssmask = unrollglobgcprog(runtime·gcbss, runtime·ebss - runtime·bss); } +// Called from malloc.go using onM, stopping and starting the world handled in caller. void runtime·gc_m(void) { @@ -1311,7 +1429,8 @@ gc(struct gc_args *args) int64 t0, t1, t2, t3, t4; uint64 heap0, heap1, obj; GCStats stats; - + uint32 oldphase; + if(runtime·debug.allocfreetrace) runtime·tracegc(); @@ -1327,7 +1446,7 @@ gc(struct gc_args *args) while(runtime·sweepone() != -1) runtime·sweep.npausesweep++; - // Cache runtime.mheap.allspans in work.spans to avoid conflicts with + // Cache runtime·mheap.allspans in work.spans to avoid conflicts with // resizing/freeing allspans. // New spans can be created while GC progresses, but they are not garbage for // this round: @@ -1344,10 +1463,13 @@ gc(struct gc_args *args) work.spans = runtime·mheap.allspans; work.nspan = runtime·mheap.nspan; runtime·unlock(&runtime·mheap.lock); + oldphase = runtime·gcphase; work.nwait = 0; work.ndone = 0; - work.nproc = runtime·gcprocs(); + work.nproc = runtime·gcprocs(); + runtime·gcphase = GCmark; //^^ vv + runtime·parforsetup(work.markfor, work.nproc, RootCount + runtime·allglen, nil, false, markroot); if(work.nproc > 1) { runtime·noteclear(&work.alldone); @@ -1360,8 +1482,9 @@ gc(struct gc_args *args) gchelperstart(); runtime·parfordo(work.markfor); - scanblock(nil, 0, nil); + scanblock(nil, 0, nil); + runtime·gcphase = oldphase; //^^ vv t3 = 0; if(runtime·debug.gctrace) t3 = runtime·nanotime(); diff --git a/src/runtime/proc.c b/src/runtime/proc.c index 25f916640..1f1044d1d 100644 --- a/src/runtime/proc.c +++ b/src/runtime/proc.c @@ -623,9 +623,10 @@ mquiesce(G *gpmaster) uint32 status; uint32 activeglen; - activeglen = runtime·allglen; // enqueue the calling goroutine. runtime·restartg(gpmaster); + + activeglen = runtime·allglen; for(i = 0; i < activeglen; i++) { gp = runtime·allg[i]; if(runtime·readgstatus(gp) == Gdead) diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h index adc74cf41..74d7ba4f5 100644 --- a/src/runtime/runtime.h +++ b/src/runtime/runtime.h @@ -93,6 +93,7 @@ typedef struct PollDesc PollDesc; typedef struct DebugVars DebugVars; typedef struct ForceGCState ForceGCState; typedef struct Stack Stack; +typedef struct Workbuf Workbuf; /* * Per-CPU declaration. 
@@ -303,7 +304,7 @@ struct G
 	bool	paniconfault;	// panic (instead of crash) on unexpected fault address
 	bool	preemptscan;	// preempted g does scan for GC
 	bool	gcworkdone;	// debug: cleared at beginning of gc work phase cycle, set by gcphasework, tested at end of cycle
-	bool	throwsplit;	// must not split stack
+	bool	throwsplit;	// must not split stack
 	int8	raceignore;	// ignore race detection events
 	M*	m;	// for debuggers, but offset not hard-coded
 	M*	lockedm;
@@ -561,6 +562,16 @@ struct ParFor
 	uint64 nsleep;
 };

+enum {
+	WorkbufSize = 4*1024,
+};
+struct Workbuf
+{
+	LFNode	node; // must be first
+	uintptr	nobj;
+	byte*	obj[(WorkbufSize-sizeof(LFNode)-sizeof(uintptr))/PtrSize];
+};
+
 // Track memory allocated by code not written in Go during a cgo call,
 // so that the garbage collector can see them.
 struct CgoMal
@@ -583,12 +594,14 @@ struct DebugVars

 // Indicates the action for the write barrier and synchronization task to perform.
 enum
-{	// Synchronization	Write barrier
-	GCoff,		// stop and start	nop
-	GCquiesce,	// stop and start	nop
-	GCstw,		// stop the ps	nop
-	GCmark,		// scan the stacks and start	no white to black
-	GCsweep,	// stop and start	nop
+{	// Action	WB installation
+	GCoff = 0,	// stop and start	no wb
+	GCquiesce,	// stop and start	no wb
+	GCstw,		// stop the ps	nop
+	GCscan,		// scan the stacks prior to marking
+	GCmark,		// mark using wbufs from GCscan and globals, scan the stacks, then go to GCmarktermination
+	GCmarktermination,	// mark termination detection. Allocate black, Ps help out GC
+	GCsweep,	// stop and start	nop
 };

 struct ForceGCState
-- 
cgit v1.2.1 

From af64ab79c0ad72decf083d78cf54257d009741b5 Mon Sep 17 00:00:00 2001
From: Rick Hudson 
Date: Tue, 14 Oct 2014 09:51:46 -0400
Subject: [dev.garbage] runtime: Write barrier code.

Comments lay out the concurrent GC algorithms.

This CL implements parts of the algorithm.
The acknowledgement code has been removed from this CL.

LGTM=rsc, dvyukov
R=dvyukov, rsc
CC=golang-codereviews
https://codereview.appspot.com/151540043
---
 src/runtime/mgc0.c | 145 +++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 129 insertions(+), 16 deletions(-)

(limited to 'src/runtime')

diff --git a/src/runtime/mgc0.c b/src/runtime/mgc0.c
index 39fae9bbe..dabd38a60 100644
--- a/src/runtime/mgc0.c
+++ b/src/runtime/mgc0.c
@@ -4,22 +4,73 @@
 // Garbage collector (GC).
 //
-// GC is:
-// - mark&sweep
-// - mostly precise (with the exception of some C-allocated objects, assembly frames/arguments, etc)
-// - parallel (up to MaxGcproc threads)
-// - partially concurrent (mark is stop-the-world, while sweep is concurrent)
-// - non-moving/non-compacting
-// - full (non-partial)
+// The GC runs concurrently with mutator threads, is type accurate (aka precise), and allows multiple
+// GC threads to run in parallel. It is a concurrent mark and sweep that uses a write barrier. It is
+// non-generational and non-compacting. Allocation is done using size-segregated per-P allocation
+// areas to minimize fragmentation while eliminating locks in the common case.
 //
-// GC rate.
-// Next GC is after we've allocated an extra amount of memory proportional to
-// the amount already in use. The proportion is controlled by the GOGC environment variable
-// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
-// (this mark is tracked in the next_gc variable). This keeps the GC cost in linear
-// proportion to the allocation cost. Adjusting GOGC just changes the linear constant
-// (and also the amount of extra memory used).
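The GC-rate rule above is simple arithmetic. As a sketch (the helper name is ours, not the runtime's):

	// nextGC returns the heap size that triggers the next collection:
	// grow by GOGC percent over the heap currently in use.
	// nextGC(4<<20, 100) == 8<<20: at GOGC=100 with 4M live, GC at 8M.
	func nextGC(heapInUse, gogc uint64) uint64 {
		return heapInUse + heapInUse*gogc/100
	}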
+// The algorithm decomposes into several steps.
+// This is a high-level description of the algorithm being used. For an overview of GC a good
+// place to start is Richard Jones' gchandbook.org.
+//
+// The algorithm's intellectual heritage includes Dijkstra's on-the-fly algorithm, see
+// Edsger W. Dijkstra, Leslie Lamport, A. J. Martin, C. S. Scholten, and E. F. M. Steffens. 1978.
+// On-the-fly garbage collection: an exercise in cooperation. Commun. ACM 21, 11 (November 1978), 966-975.
+// For journal-quality proofs that these steps are complete, correct, and terminate see
+// Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world.
+// Concurrency and Computation: Practice and Experience 15(3-5), 2003.
+//
+// 0. Set phase = GCscan from GCoff.
+// 1. Wait for all P's to acknowledge the phase change.
+//    At this point all goroutines have passed through a GC safepoint and
+//    know we are in the GCscan phase.
+// 2. GC scans all goroutine stacks, marks and enqueues all encountered pointers
+//    (marking avoids most duplicate enqueuing, but races may produce duplication, which is benign).
+//    Preempted goroutines are scanned before P schedules the next goroutine.
+// 3. Set phase = GCmark.
+// 4. Wait for all P's to acknowledge the phase change.
+// 5. Now the write barrier marks and enqueues black or grey to white pointers. If a pointer is
+//    stored into a white slot, such a pointer is not marked.
+//    Malloc still allocates white (non-marked) objects.
+// 6. Meanwhile GC transitively walks the heap marking reachable objects.
+// 7. When GC finishes marking the heap, it preempts P's one-by-one and
+//    retakes partial wbufs (filled by the write barrier or during a stack scan of the goroutine
+//    currently scheduled on the P).
+// 8. Once the GC has exhausted all available marking work it sets phase = GCmarktermination.
+// 9. Wait for all P's to acknowledge the phase change.
+// 10. Malloc now allocates black objects, so the number of unmarked reachable objects
+//     monotonically decreases.
+// 11. GC preempts P's one-by-one taking partial wbufs and marks all unmarked yet reachable objects.
+// 12. When GC completes a full cycle over P's and discovers no new grey
+//     objects (which means all reachable objects are marked), set phase = GCsweep.
+// 13. Wait for all P's to acknowledge the phase change.
+// 14. Now malloc allocates white (but sweeps spans before use).
+//     The write barrier becomes a nop.
+// 15. GC does background sweeping, see description below.
+// 16. When sweeping is complete set phase to GCoff.
+// 17. When sufficient allocation has taken place replay the sequence starting at 0 above,
+//     see discussion of GC rate below.

+// Changing phases.
+// Phases are changed by setting the gcphase to the next phase and calling ackgcphase.
+// All phase actions must be benign in the presence of a phase change.
+// Starting with GCoff
+// GCoff to GCscan
+//     GCscan scans stacks and globals, greying them, and never marks an object black.
+//     Once all the P's are aware of the new phase they will scan gs on preemption.
+//     This means that the scanning of preempted gs can't start until all the Ps
+//     have acknowledged.
+// GCscan to GCmark
+//     GCmark turns on the write barrier, which also only greys objects. No scanning
+//     of objects (making them black) can happen until all the Ps have acknowledged
+//     the phase change.
+// GCmark to GCmarktermination
+//     The only change here is that we start allocating black, so the Ps must acknowledge
+//     the change before we begin the termination algorithm.
+// GCmarktermination to GCsweep
+//     Objects currently on the free lists must be marked black for this to work.
+//     Are things on the free lists black or white? How does the sweep phase work?

 // Concurrent sweep.
 // The sweep phase proceeds concurrently with normal program execution.
 // The heap is swept span-by-span both lazily (when a goroutine needs another span)
@@ -50,6 +101,14 @@
 // The finalizer goroutine is kicked off only when all spans are swept.
 // When the next GC starts, it sweeps all not-yet-swept spans (if any).

+// GC rate.
+// Next GC is after we've allocated an extra amount of memory proportional to
+// the amount already in use. The proportion is controlled by the GOGC environment variable
+// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
+// (this mark is tracked in the next_gc variable). This keeps the GC cost in linear
+// proportion to the allocation cost. Adjusting GOGC just changes the linear constant
+// (and also the amount of extra memory used).
+
 #include "runtime.h"
 #include "arch_GOARCH.h"
 #include "malloc.h"
@@ -141,6 +200,8 @@
 static void scanblock(byte*, uintptr, byte*);
 static byte* objectstart(byte*, Markbits*);
 static Workbuf* greyobject(byte*, Markbits*, Workbuf*);
 static bool inheap(byte*);
+static bool shaded(byte*);
+static void shade(byte*);
 static void slottombits(byte*, Markbits*);

 void runtime·bgsweep(void);
@@ -633,13 +694,12 @@ runtime·gcworkbuffree(Workbuf *b)
 	}
 }

-
 // Get a full work buffer off the work.full list, or return nil.
 // getfull acts as a barrier for work.nproc helpers. As long as one
 // gchelper is actively marking objects it
 // may create a workbuffer that the other helpers can work on.
 // The for loop either exits when a work buffer is found
-// or when _all_ of the work.nproc gc helpers are in the loop
+// or when _all_ of the work.nproc GC helpers are in the loop
 // looking for work and thus not capable of creating new work.
 // This is in fact the termination condition for the STW mark
 // phase.
@@ -823,6 +883,59 @@ scanstack(G *gp)
 	runtime·tracebackdefers(gp, &fn, nil);
 }

+// If the slot is grey or black return true, if white return false.
+// If the slot is not in the known heap and thus does not have a valid GC bitmap then
+// it is considered grey. Globals and stacks can hold such slots.
+// The slot is grey if its mark bit is set and it is enqueued to be scanned.
+// The slot is black if it has already been scanned.
+// It is white if it has a valid mark bit and the bit is not set.
+static bool
+shaded(byte *slot)
+{
+	Markbits mbits;
+
+	if(!inheap(slot)) // non-heap slots considered grey
+		return true;
+
+	objectstart(slot, &mbits);
+	return (mbits.bits&bitMarked) != 0;
+}
+
+// Shade the object if it isn't already.
+// The object is not nil and known to be in the heap.
+static void
+shade(byte *b)
+{
+	byte *obj;
+	Workbuf *wbuf;
+	Markbits mbits;
+
+	if(!inheap(b))
+		runtime·throw("shade: passed an address not in the heap");
+
+	wbuf = getpartial();
+	// Mark the object, return some important bits.
+	// If we combine the following two routines we don't have to pass mbits or obj around.
+ obj = objectstart(b, &mbits); + wbuf = greyobject(obj, &mbits, wbuf); // augments the wbuf + putpartial(wbuf); + return; +} + +// This is the Dijkstra barrier coarsened to shade grey to white whereas +// the original Dijkstra barrier only shaded black to white. +// +// Shade indicates that it has seen a white pointer by adding the referent +// to wbuf. +void +runtime·markwb(void **slot, void *ptr) +{ + // initial nil check avoids some needlesss loads + if(ptr != nil && inheap(ptr) && shaded((void*)slot)) + shade(ptr); + *slot = ptr; +} + // The gp has been moved to a gc safepoint. If there is gcphase specific // work it is done here. void -- cgit v1.2.1 From b8d4cd490ddd8ba298b5347d3e72316df768565c Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Wed, 22 Oct 2014 14:02:04 -0400 Subject: [dev.power64] runtime: Fix broken merge of noasm.go The earlier dev.power64 merge missed the port of runtime/noasm.goc to runtime/noasm_arm.go. This CL fixes this by moving noasm_arm.go to noasm.go and adding a +build to share the file between arm and power64. LGTM=rsc R=rsc CC=golang-codereviews https://codereview.appspot.com/158350043 --- src/runtime/noasm.go | 56 ++++++++++++++++++++++++++++++++++++++++++++++++ src/runtime/noasm_arm.go | 54 ---------------------------------------------- 2 files changed, 56 insertions(+), 54 deletions(-) create mode 100644 src/runtime/noasm.go delete mode 100644 src/runtime/noasm_arm.go (limited to 'src/runtime') diff --git a/src/runtime/noasm.go b/src/runtime/noasm.go new file mode 100644 index 000000000..43c16860b --- /dev/null +++ b/src/runtime/noasm.go @@ -0,0 +1,56 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Routines that are implemented in assembly in asm_{amd64,386}.s +// but are implemented in Go for arm. + +// +build arm power64 power64le + +package runtime + +func cmpstring(s1, s2 string) int { + l := len(s1) + if len(s2) < l { + l = len(s2) + } + for i := 0; i < l; i++ { + c1, c2 := s1[i], s2[i] + if c1 < c2 { + return -1 + } + if c1 > c2 { + return +1 + } + } + if len(s1) < len(s2) { + return -1 + } + if len(s1) > len(s2) { + return +1 + } + return 0 +} + +func cmpbytes(s1, s2 []byte) int { + l := len(s1) + if len(s2) < l { + l = len(s2) + } + for i := 0; i < l; i++ { + c1, c2 := s1[i], s2[i] + if c1 < c2 { + return -1 + } + if c1 > c2 { + return +1 + } + } + if len(s1) < len(s2) { + return -1 + } + if len(s1) > len(s2) { + return +1 + } + return 0 +} diff --git a/src/runtime/noasm_arm.go b/src/runtime/noasm_arm.go deleted file mode 100644 index dd3ef8267..000000000 --- a/src/runtime/noasm_arm.go +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2013 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Routines that are implemented in assembly in asm_{amd64,386}.s -// but are implemented in Go for arm. 
- -package runtime - -func cmpstring(s1, s2 string) int { - l := len(s1) - if len(s2) < l { - l = len(s2) - } - for i := 0; i < l; i++ { - c1, c2 := s1[i], s2[i] - if c1 < c2 { - return -1 - } - if c1 > c2 { - return +1 - } - } - if len(s1) < len(s2) { - return -1 - } - if len(s1) > len(s2) { - return +1 - } - return 0 -} - -func cmpbytes(s1, s2 []byte) int { - l := len(s1) - if len(s2) < l { - l = len(s2) - } - for i := 0; i < l; i++ { - c1, c2 := s1[i], s2[i] - if c1 < c2 { - return -1 - } - if c1 > c2 { - return +1 - } - } - if len(s1) < len(s2) { - return -1 - } - if len(s1) > len(s2) { - return +1 - } - return 0 -} -- cgit v1.2.1 From 097998292f184b893daa9775d4997b0eb3e7f567 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Wed, 22 Oct 2014 16:39:31 -0400 Subject: [dev.power64] runtime: fix early GC of Defer objects go_bootstrap was panicking during runtime initialization (under runtime.main) because Defer objects were being prematurely GC'd. This happened because of an incorrect change to runtime?unrollgcprog_m to make it endian-agnostic during the conversion of runtime bitmaps to byte arrays. LGTM=rsc R=rsc CC=golang-codereviews https://codereview.appspot.com/161960044 --- src/runtime/mgc0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/runtime') diff --git a/src/runtime/mgc0.c b/src/runtime/mgc0.c index d376c1cf6..02f7eba12 100644 --- a/src/runtime/mgc0.c +++ b/src/runtime/mgc0.c @@ -1799,7 +1799,7 @@ runtime·unrollgcprog_m(void) } // atomic way to say mask[0] = 1 - x = typ->gc[0]; + x = *(uintptr*)mask; ((byte*)&x)[0] = 1; runtime·atomicstorep((void**)mask, (void*)x); } -- cgit v1.2.1 From cb7d300f93c28798b4a721d92655794d4dd2950e Mon Sep 17 00:00:00 2001 From: Dave Cheney Date: Thu, 23 Oct 2014 08:58:10 +1100 Subject: [dev.power64] runtime: fix SigaltstackT definition for power64le Also updated defs3_linux.go but had to manually edit defs_linux_power64le.h. Will regenerate the file when cgo is working natively on ppc64. LGTM=austin R=rsc, austin CC=golang-codereviews https://codereview.appspot.com/158360043 --- src/runtime/defs3_linux.go | 2 +- src/runtime/defs_linux_power64le.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/defs3_linux.go b/src/runtime/defs3_linux.go index eb65f9d1e..3551a4fa9 100644 --- a/src/runtime/defs3_linux.go +++ b/src/runtime/defs3_linux.go @@ -35,7 +35,7 @@ type Gregset C.elf_gregset_t type FPregset C.elf_fpregset_t type Vreg C.elf_vrreg_t -type Sigaltstack C.struct_sigaltstack +type SigaltstackT C.struct_sigaltstack // PPC64 uses sigcontext in place of mcontext in ucontext. 
// see http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/arch/powerpc/include/uapi/asm/ucontext.h

diff --git a/src/runtime/defs_linux_power64le.h b/src/runtime/defs_linux_power64le.h
index 41db45ca1..64f145672 100644
--- a/src/runtime/defs_linux_power64le.h
+++ b/src/runtime/defs_linux_power64le.h
@@ -147,7 +147,7 @@ enum {
 //typedef struct Usigset Usigset;
 typedef struct Ptregs Ptregs;
 typedef struct Vreg Vreg;
-typedef struct Sigaltstack Sigaltstack;
+typedef struct SigaltstackT SigaltstackT;
 typedef struct Sigcontext Sigcontext;
 typedef struct Ucontext Ucontext;

@@ -179,7 +179,7 @@ struct Vreg {
 	uint32	u[4];
 };

-struct Sigaltstack {
+struct SigaltstackT {
 	byte	*ss_sp;
 	int32	ss_flags;
 	byte	Pad_cgo_0[4];
@@ -201,7 +201,7 @@ struct Sigcontext {
 struct Ucontext {
 	uint64	uc_flags;
 	Ucontext	*uc_link;
-	Sigaltstack	uc_stack;
+	SigaltstackT	uc_stack;
 	Usigset	uc_sigmask;
 	Usigset	__unused[15];
 	Sigcontext	uc_mcontext;
-- 
cgit v1.2.1 

From 978c971bacf92ee97b8e24ce45b485afa1c31fad Mon Sep 17 00:00:00 2001
From: Rick Hudson 
Date: Thu, 23 Oct 2014 15:51:17 -0400
Subject: [dev.garbage] runtime: simplify lfstack.c due to undiagnosed
 buffer corruption.

The changes got rid of the problems we were seeing.
We suspect the pushcnt field has a race.

LGTM=rsc
R=dvyukov, rsc
CC=golang-codereviews
https://codereview.appspot.com/159330043

Committer: Russ Cox 
---
 src/runtime/lfstack.c | 14 ++++++--------
 src/runtime/runtime.h |  2 +-
 2 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'src/runtime')

diff --git a/src/runtime/lfstack.c b/src/runtime/lfstack.c
index 57e0af282..0ced839c2 100644
--- a/src/runtime/lfstack.c
+++ b/src/runtime/lfstack.c
@@ -46,7 +46,7 @@ runtime·lfstackpush(uint64 *head, LFNode *node)
 	new = (uint64)(uintptr)node|(((uint64)node->pushcnt&CNT_MASK)<<PTR_BITS);
 	for(;;) {
 		old = runtime·atomicload64(head);
-		node->next = (LFNode*)(uintptr)(old&PTR_MASK);
+		node->next = old;
 		if(runtime·cas64(head, old, new))
 			break;
 	}
@@ -55,19 +55,17 @@ LFNode*
 runtime·lfstackpop(uint64 *head)
 {
-	LFNode *node, *node2;
-	uint64 old, new;
+	LFNode *node;
+	uint64 old, next;

 	for(;;) {
 		old = runtime·atomicload64(head);
 		if(old == 0)
 			return nil;
 		node = (LFNode*)(uintptr)(old&PTR_MASK);
-		node2 = runtime·atomicloadp(&node->next);
-		new = 0;
-		if(node2 != nil)
-			new = (uint64)(uintptr)node2|(((uint64)node2->pushcnt&CNT_MASK)<<PTR_BITS);
-		if(runtime·cas64(head, old, new))
+		next = runtime·atomicload64(&node->next);
+
+		if(runtime·cas64(head, old, next))
 			return node;
 	}
 }
diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h
index bea773799..37929c59c 100644
--- a/src/runtime/runtime.h
+++ b/src/runtime/runtime.h
@@ -573,7 +573,7 @@ enum {
 // Lock-free stack node.
 struct LFNode
 {
-	LFNode	*next;
+	uint64	next;
 	uintptr	pushcnt;
 };
-- 
cgit v1.2.1 

From 2c987a9ddefd9d256bf9e7e21396e2485a7d6514 Mon Sep 17 00:00:00 2001
From: Rick Hudson 
Date: Fri, 24 Oct 2014 11:07:16 -0400
Subject: [dev.garbage] runtime: Concurrent scan code

Routines and logic to perform a concurrent stack scan of goroutines.
This CL exercises most of the functionality needed. The major
exception is that it does not scan running goroutines. After doing
the scans it relies on a STW to finish the GC, including rescanning
the stacks. It is intended to achieve correctness; performance will
follow.
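For a feel of the acknowledgement protocol this CL depends on, here is a toy Go model; the gcworkdone flag mirrors the runtime field of the same name, everything else is invented for the sketch:

	package main

	import "sync/atomic"

	type g struct{ gcworkdone atomic.Bool }

	// gcphasework models what a goroutine does at a GC safepoint:
	// scan its own stack, then acknowledge.
	func gcphasework(gp *g) {
		// ... scan gp's stack here ...
		gp.gcworkdone.Store(true)
	}

	// waitForAcks models the collector side: wait until every goroutine
	// has acknowledged. The real code uses stopg/restartg rather than
	// spinning.
	func waitForAcks(gs []*g) {
		for _, gp := range gs {
			for !gp.gcworkdone.Load() {
				// busy-wait; a real implementation would yield
			}
		}
	}

	func main() {
		gs := []*g{new(g), new(g)}
		for _, gp := range gs {
			go gcphasework(gp)
		}
		waitForAcks(gs)
	}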
LGTM=rsc R=golang-codereviews, rsc CC=dvyukov, golang-codereviews https://codereview.appspot.com/156580043 --- src/runtime/malloc.go | 8 ++ src/runtime/malloc.h | 5 - src/runtime/mcache.c | 2 +- src/runtime/mgc0.c | 308 ++++++++++++++++++++++++++++++++++---------------- src/runtime/proc.c | 19 ++-- src/runtime/runtime.h | 2 + src/runtime/stack.c | 35 +++--- src/runtime/stubs.go | 2 + 8 files changed, 254 insertions(+), 127 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 9b4264f2b..c56e03886 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -438,7 +438,15 @@ func gogc(force int32) { mp = acquirem() mp.gcing = 1 releasem(mp) + onM(stoptheworld) + onM(finishsweep_m) // finish sweep before we start concurrent scan. + onM(starttheworld) + + // Do a concurrent heap scan before we stop the world. + onM(gcscan_m) + onM(stoptheworld) + if mp != acquirem() { gothrow("gogc: rescheduled") } diff --git a/src/runtime/malloc.h b/src/runtime/malloc.h index edcd0be77..e606b0c7a 100644 --- a/src/runtime/malloc.h +++ b/src/runtime/malloc.h @@ -343,11 +343,6 @@ struct MCache StackFreeList stackcache[NumStackOrders]; SudoG* sudogcache; - // Cached P local buffer holding grey objects (marked by not yet scanned) - // Used by mutator for write barrier work. - // GC uses the mcache of the P it is running on for stack and global scanning - // work as well marking. - Workbuf* gcworkbuf; // Local allocator stats, flushed during GC. uintptr local_nlookup; // number of pointer lookups diff --git a/src/runtime/mcache.c b/src/runtime/mcache.c index 5fdbe3266..95ddced3e 100644 --- a/src/runtime/mcache.c +++ b/src/runtime/mcache.c @@ -39,12 +39,12 @@ runtime·allocmcache(void) return c; } +// mheap.lock needs to be held to release the gcworkbuf. static void freemcache(MCache *c) { runtime·MCache_ReleaseAll(c); runtime·stackcache_clear(c); - runtime·gcworkbuffree(c->gcworkbuf); runtime·lock(&runtime·mheap.lock); runtime·purgecachedstats(c); runtime·FixAlloc_Free(&runtime·mheap.cachealloc, c); diff --git a/src/runtime/mgc0.c b/src/runtime/mgc0.c index 8620f47af..c385d51cf 100644 --- a/src/runtime/mgc0.c +++ b/src/runtime/mgc0.c @@ -52,7 +52,7 @@ // see discussion of GC rate below. // Changing phases. -// Phases are changed by setting the gcphase to the next phase and call ackgcphase. +// Phases are changed by setting the gcphase to the next phase and possibly calling ackgcphase. // All phase action must be benign in the presence of a change. // Starting with GCoff // GCoff to GCscan @@ -137,7 +137,7 @@ enum { // ptrmask for an allocation containing a single pointer. static byte oneptr[] = {BitsPointer}; -// Initialized from $GOGC. GOGC=off means no gc. +// Initialized from $GOGC. GOGC=off means no GC. extern int32 runtime·gcpercent; // Holding worldsema grants an M the right to try to stop the world. 
@@ -185,11 +185,12 @@ BitVector runtime·gcbssmask; Mutex runtime·gclock; -static Workbuf* getpartial(void); +static Workbuf* getpartialorempty(void); static void putpartial(Workbuf*); static Workbuf* getempty(Workbuf*); static Workbuf* getfull(Workbuf*); static void putempty(Workbuf*); +static void putfull(Workbuf*); static Workbuf* handoff(Workbuf*); static void gchelperstart(void); static void flushallmcaches(void); @@ -205,12 +206,14 @@ static void shade(byte*); static void slottombits(byte*, Markbits*); void runtime·bgsweep(void); +void runtime·finishsweep_m(void); static FuncVal bgsweepv = {runtime·bgsweep}; typedef struct WorkData WorkData; struct WorkData { - uint64 full; // lock-free list of full blocks - uint64 empty; // lock-free list of empty blocks + uint64 full; // lock-free list of full blocks + uint64 empty; // lock-free list of empty blocks + uint64 partial; // lock-free list of partially filled blocks byte pad0[CacheLineSize]; // prevents false-sharing between full/empty and nproc/nwait uint32 nproc; int64 tstart; @@ -455,15 +458,22 @@ scanblock(byte *b, uintptr n, byte *ptrmask) Workbuf *wbuf; bool keepworking; - wbuf = getpartial(); + wbuf = getpartialorempty(); if(b != nil) { wbuf = scanobject(b, n, ptrmask, wbuf); if(runtime·gcphase == GCscan) { + if(inheap(b) && !ptrmask) + // b is in heap, we are in GCscan so there should be a ptrmask. + runtime·throw("scanblock: In GCscan phase and inheap is true."); + // GCscan only goes one level deep since mark wb not turned on. putpartial(wbuf); return; } } - + if(runtime·gcphase == GCscan) { + runtime·throw("scanblock: In GCscan phase but no b passed in."); + } + keepworking = b == nil; // ptrmask can have 2 possible values: @@ -479,6 +489,11 @@ scanblock(byte *b, uintptr n, byte *ptrmask) wbuf = getfull(wbuf); if(wbuf == nil) // nil means out of work barrier reached return; + + if(wbuf->nobj<=0) { + runtime·throw("runtime:scanblock getfull returns empty buffer"); + } + } // If another proc wants a pointer, give it some. @@ -506,7 +521,7 @@ markroot(ParFor *desc, uint32 i) void *p; uint32 status; bool restart; - + USED(&desc); // Note: if you add a case here, please also update heapdump.c:dumproots. switch(i) { @@ -553,7 +568,8 @@ markroot(ParFor *desc, uint32 i) break; case RootFlushCaches: - flushallmcaches(); + if (runtime·gcphase != GCscan) // Do not flush mcaches during GCscan phase. + flushallmcaches(); break; default: @@ -566,9 +582,10 @@ markroot(ParFor *desc, uint32 i) status = runtime·readgstatus(gp); // We are not in a scan state if((status == Gwaiting || status == Gsyscall) && gp->waitsince == 0) gp->waitsince = runtime·work.tstart; - // Shrink a stack if not much of it is being used. - runtime·shrinkstack(gp); - if(runtime·readgstatus(gp) == Gdead) + // Shrink a stack if not much of it is being used but not in the scan phase. + if (runtime·gcphase != GCscan) // Do not shrink during GCscan phase. + runtime·shrinkstack(gp); + if(runtime·readgstatus(gp) == Gdead) gp->gcworkdone = true; else gp->gcworkdone = false; @@ -576,121 +593,120 @@ markroot(ParFor *desc, uint32 i) // goroutine will scan its own stack when it stops running. // Wait until it has. - while((status = runtime·readgstatus(gp)) == Grunning && !gp->gcworkdone) { + while(runtime·readgstatus(gp) == Grunning && !gp->gcworkdone) { + } + + // scanstack(gp) is done as part of gcphasework + // But to make sure we finished we need to make sure that + // the stack traps have all responded so drop into + // this while loop until they respond. 
+ while(!gp->gcworkdone){ + status = runtime·readgstatus(gp); if(status == Gdead) { - // TBD you need to explain why Gdead without gp->gcworkdone - // being true. If there is a race then it needs to be - // explained here. gp->gcworkdone = true; // scan is a noop break; //do nothing, scan not needed. } - // try again + if(status == Gwaiting || status == Grunnable) + restart = runtime·stopg(gp); } - - // scanstack(gp); now done as part of gcphasework - // But to make sure we finished we need to make sure that - // the stack traps have all responded so drop into - // this while loop until they respond. - if(!gp->gcworkdone) - // For some reason a G has not completed its work. This is a bug that - // needs to be investigated. For now I'll just print this message in - // case the bug is benign. - runtime·printf("runtime:markroot: post stack scan work not done gp=%p has status %x\n", gp, status); - if(restart) runtime·restartg(gp); break; } } +// wblock is used for creating new empty work buffer blocks. +static Mutex wblock; + // Get an empty work buffer off the work.empty list, // allocating new buffers as needed. static Workbuf* getempty(Workbuf *b) { - MCache *c; - - if(b != nil) - runtime·lfstackpush(&runtime·work.full, &b->node); - b = nil; - c = g->m->mcache; - if(c->gcworkbuf != nil) { - b = c->gcworkbuf; - c->gcworkbuf = nil; + if(b != nil) { + putfull(b); + b = nil; } - if(b == nil) + if(runtime·work.empty) b = (Workbuf*)runtime·lfstackpop(&runtime·work.empty); + + if(b && b->nobj != 0) { + runtime·printf("m%d: getempty: popped b=%p with non-zero b->nobj=%D\n", g->m->id, b, b->nobj); + runtime·throw("getempty: workbuffer not empty, b->nobj not 0"); + } if(b == nil) { + runtime·lock(&wblock); b = runtime·persistentalloc(sizeof(*b), CacheLineSize, &mstats.gc_sys); b->nobj = 0; + runtime·unlock(&wblock); } - if(b->nobj != 0) - runtime·throw("getempty: b->nobj not 0/n"); - b->nobj = 0; return b; } static void putempty(Workbuf *b) { - MCache *c; - - if(b->nobj != 0) - runtime·throw("putempty: b->nobj=%D not 0\n"); - c = g->m->mcache; - if(c->gcworkbuf == nil) { - c->gcworkbuf = b; - return; + if(b->nobj != 0) { + runtime·throw("putempty: b->nobj not 0\n"); } runtime·lfstackpush(&runtime·work.empty, &b->node); } -// Get an partially empty work buffer from the mcache structure -// and if non is available get an empty one. +// Put a full or partially full workbuf on the full list. +static void +putfull(Workbuf *b) +{ + if(b->nobj <= 0) { + runtime·throw("putfull: b->nobj <= 0\n"); + } + runtime·lfstackpush(&runtime·work.full, &b->node); +} + +// Get an partially empty work buffer +// if none are available get an empty one. 
static Workbuf* -getpartial(void) +getpartialorempty(void) { - MCache *c; Workbuf *b; - c = g->m->mcache; - if(c->gcworkbuf != nil) { - b = c->gcworkbuf; - c->gcworkbuf = nil; - } else { + b = (Workbuf*)runtime·lfstackpop(&runtime·work.partial); + if(b == nil) b = getempty(nil); - } return b; } static void putpartial(Workbuf *b) { - MCache *c; - c = g->m->mcache; - if(c->gcworkbuf == nil) { - c->gcworkbuf = b; - return; + if(b->nobj == 0) + runtime·lfstackpush(&runtime·work.empty, &b->node); + else if (b->nobj < nelem(b->obj)) + runtime·lfstackpush(&runtime·work.partial, &b->node); + else if (b->nobj == nelem(b->obj)) + runtime·lfstackpush(&runtime·work.full, &b->node); + else { + runtime·printf("b=%p, b->nobj=%D, nelem(b->obj)=%d\n", b, b->nobj, nelem(b->obj)); + runtime·throw("putpartial: bad Workbuf b->nobj"); } - - runtime·throw("putpartial: c->gcworkbuf is not nil\n"); - - runtime·lfstackpush(&runtime·work.full, &b->node); } void runtime·gcworkbuffree(Workbuf *b) { - if(b != nil) { - if(b->nobj != 0) - runtime·throw("gcworkbufferfree: b->nobj not 0\n"); + if(b == nil) + return; + if(b->nobj == 0) putempty(b); - } + else + putfull(b); } -// Get a full work buffer off the work.full list, or return nil. +// Get a full work buffer off the work.full list or a partially +// filled one off the work.partial list. If nothing is available, +// wait until all the other GC helpers have finished and then +// return nil. // getfull acts as a barrier for work.nproc helpers. As long as one // gchelper is actively marking objects it // may create a workbuffer that the other helpers can work on. @@ -704,12 +720,12 @@ getfull(Workbuf *b) { int32 i; - if(b != nil) { - if(b->nobj != 0) - runtime·printf("runtime:getfull: b->nobj=%D not 0.", b->nobj); - runtime·lfstackpush(&runtime·work.empty, &b->node); - } + if(b != nil) + putempty(b); + b = (Workbuf*)runtime·lfstackpop(&runtime·work.full); + if(b==nil) + b = (Workbuf*)runtime·lfstackpop(&runtime·work.partial); if(b != nil || runtime·work.nproc == 1) return b; @@ -718,7 +734,9 @@ getfull(Workbuf *b) if(runtime·work.full != 0) { runtime·xadd(&runtime·work.nwait, -1); b = (Workbuf*)runtime·lfstackpop(&runtime·work.full); - if(b != nil) + if(b==nil) + b = (Workbuf*)runtime·lfstackpop(&runtime·work.partial); + if(b != nil) return b; runtime·xadd(&runtime·work.nwait, +1); } @@ -861,8 +879,7 @@ scanstack(G *gp) case Gdead: return; case Grunning: - runtime·printf("runtime: gp=%p, goid=%D, gp->atomicstatus=%d\n", gp, gp->goid, runtime·readgstatus(gp)); - runtime·throw("mark - world not stopped"); + runtime·throw("scanstack: goroutine not stopped"); case Grunnable: case Gsyscall: case Gwaiting: @@ -909,7 +926,7 @@ shade(byte *b) if(!inheap(b)) runtime·throw("shade: passed an address not in the heap"); - wbuf = getpartial(); + wbuf = getpartialorempty(); // Mark the object, return some important bits. // If we combine the following two routines we don't have to pass mbits or obj around. obj = objectstart(b, &mbits); @@ -932,8 +949,8 @@ runtime·markwb(void **slot, void *ptr) *slot = ptr; } -// The gp has been moved to a gc safepoint. If there is gcphase specific -// work it is done here. +// The gp has been moved to a GC safepoint. GC phase specific +// work is done here. void runtime·gcphasework(G *gp) { @@ -953,14 +970,8 @@ runtime·gcphasework(G *gp) break; case GCmark: case GCmarktermination: - // - // Disabled until concurrent GC is implemented - // but indicate the scan has been done. 
scanstack(gp); - // scanstack will call shade which will populate - // the Workbuf. - // emptywbuf(gp) will empty it before returning - // + // All available mark work will be emptied before returning. break; } gp->gcworkdone = true; @@ -1050,6 +1061,7 @@ runtime·iterate_finq(void (*callback)(FuncVal*, byte*, uintptr, Type*, PtrType* } } +// Returns only when span s has been swept. void runtime·MSpan_EnsureSwept(MSpan *s) { @@ -1064,6 +1076,7 @@ runtime·MSpan_EnsureSwept(MSpan *s) sg = runtime·mheap.sweepgen; if(runtime·atomicload(&s->sweepgen) == sg) return; + // The caller must be sure that the span is a MSpanInUse span. if(runtime·cas(&s->sweepgen, sg-2, sg-1)) { runtime·MSpan_Sweep(s, false); return; @@ -1347,7 +1360,7 @@ runtime·gchelper(void) g->m->traceback = 2; gchelperstart(); - // parallel mark for over gc roots + // parallel mark for over GC roots runtime·parfordo(runtime·work.markfor); if(runtime·gcphase != GCscan) scanblock(nil, 0, nil); // blocks in getfull @@ -1531,10 +1544,93 @@ runtime·gc_m(void) a.start_time = (uint64)(g->m->scalararg[0]) | ((uint64)(g->m->scalararg[1]) << 32); a.eagersweep = g->m->scalararg[2]; gc(&a); - runtime·casgstatus(gp, Gwaiting, Grunning); } +void +runtime·finishsweep_m(void) +{ + uint32 i, sg; + MSpan *s; + + // The world is stopped so we should be able to complete the sweeps + // quickly. + while(runtime·sweepone() != -1) + runtime·sweep.npausesweep++; + + // There may be some other spans being swept concurrently that + // we need to wait for. If finishsweep_m is done with the world stopped + // this code is not required. + sg = runtime·mheap.sweepgen; + for(i=0; i<runtime·mheap.nspan; i++) { + s = runtime·mheap.allspans[i]; + if(s->sweepgen == sg) { + continue; + } + if(s->state != MSpanInUse) // Span is not part of the GCed heap so no need to ensure it is swept. + continue; + runtime·MSpan_EnsureSwept(s); + } +} + +// Scan all of the stacks, greying (or graying if in America) the referents +// but not blackening them since the mark write barrier isn't installed. +void +runtime·gcscan_m(void) +{ + uint32 i, allglen, oldphase; + G *gp, *mastergp, **allg; + + // Grab the g that called us and potentially allow rescheduling. + // This allows it to be scanned like other goroutines. + mastergp = g->m->curg; + + runtime·casgstatus(mastergp, Grunning, Gwaiting); + mastergp->waitreason = runtime·gostringnocopy((byte*)"garbage collection scan"); + + // Span sweeping has been done by finishsweep_m. + // Long term we will want to make this goroutine runnable + // by placing it onto a scanenqueue state and then calling + // runtime·restartg(mastergp) to make it Grunnable. + // At the bottom we will want to return this p back to the scheduler. + + oldphase = runtime·gcphase; + + runtime·lock(&runtime·allglock); + allglen = runtime·allglen; + allg = runtime·allg; + // Prepare flag indicating that the scan has not been completed. + for(i = 0; i < allglen; i++) { + gp = allg[i]; + gp->gcworkdone = false; // set to true in gcphasework + } + runtime·unlock(&runtime·allglock); + + runtime·work.nwait = 0; + runtime·work.ndone = 0; + runtime·work.nproc = 1; // For now do not do this in parallel. + runtime·gcphase = GCscan; + // ackgcphase is not needed since we are not scanning running goroutines. + runtime·parforsetup(runtime·work.markfor, runtime·work.nproc, RootCount + allglen, nil, false, markroot); + runtime·parfordo(runtime·work.markfor); + + runtime·lock(&runtime·allglock); + + allg = runtime·allg; + // Check that GC work is done. 
+ for(i = 0; i < allglen; i++) { + gp = allg[i]; + if(!gp->gcworkdone) { + runtime·throw("scan missed a g"); + } + } + runtime·unlock(&runtime·allglock); + + runtime·gcphase = oldphase; + runtime·casgstatus(mastergp, Gwaiting, Grunning); + // Let the g that called us continue to run. +} + static void gc(struct gc_args *args) { @@ -1542,7 +1638,9 @@ gc(struct gc_args *args) uint64 heap0, heap1, obj; GCStats stats; uint32 oldphase; - + uint32 i; + G *gp; + if(runtime·debug.allocfreetrace) runtime·tracegc(); @@ -1554,9 +1652,7 @@ gc(struct gc_args *args) if(runtime·debug.gctrace) t1 = runtime·nanotime(); - // Sweep what is not sweeped by bgsweep. - while(runtime·sweepone() != -1) - runtime·sweep.npausesweep++; + runtime·finishsweep_m(); // Cache runtime·mheap.allspans in work.spans to avoid conflicts with // resizing/freeing allspans. @@ -1580,7 +1676,13 @@ gc(struct gc_args *args) runtime·work.nwait = 0; runtime·work.ndone = 0; runtime·work.nproc = runtime·gcprocs(); - runtime·gcphase = GCmark; //^^ vv + runtime·gcphase = GCmark; + + // World is stopped so allglen will not change. + for(i = 0; i < runtime·allglen; i++) { + gp = runtime·allg[i]; + gp->gcworkdone = false; // set to true in gcphasework + } runtime·parforsetup(runtime·work.markfor, runtime·work.nproc, RootCount + runtime·allglen, nil, false, markroot); if(runtime·work.nproc > 1) { @@ -1596,7 +1698,13 @@ gc(struct gc_args *args) runtime·parfordo(runtime·work.markfor); scanblock(nil, 0, nil); - runtime·gcphase = oldphase; //^^ vv + + if(runtime·work.full) + runtime·throw("runtime·work.full != nil"); + if(runtime·work.partial) + runtime·throw("runtime·work.partial != nil"); + + runtime·gcphase = oldphase; t3 = 0; if(runtime·debug.gctrace) t3 = runtime·nanotime(); @@ -1735,7 +1843,7 @@ readgcstats_m(void) if(pauses->cap < nelem(mstats.pause_ns)+3) runtime·throw("runtime: short slice passed to readGCStats"); - // Pass back: pauses, last gc (absolute time), number of gc, total pause ns. + // Pass back: pauses, last GC (absolute time), number of GC, total pause ns. p = (uint64*)pauses->array; runtime·lock(&runtime·mheap.lock); n = mstats.numgc; diff --git a/src/runtime/proc.c b/src/runtime/proc.c index 9643abcc6..b824f574d 100644 --- a/src/runtime/proc.c +++ b/src/runtime/proc.c @@ -423,13 +423,7 @@ runtime·casgstatus(G *gp, uint32 oldval, uint32 newval) // loop if gp->atomicstatus is in a scan state giving // GC time to finish and change the state to oldval. while(!runtime·cas(&gp->atomicstatus, oldval, newval)) { - // Help GC if needed. - if(gp->preemptscan && !gp->gcworkdone && (oldval == Grunning || oldval == Gsyscall)) { - gp->preemptscan = false; - g->m->ptrarg[0] = gp; - fn = helpcasgstatus; - runtime·onM(&fn); - } + } } @@ -504,6 +498,13 @@ runtime·stopg(G *gp) return false; case Grunning: + if(runtime·gcphase == GCscan) { + // Running goroutines are not scanned during the + // GCscan phase; only non-running goroutines are. + gp->gcworkdone = true; + return false; + } + // Claim goroutine, so we aren't racing with a status // transition away from Grunning. if(!runtime·castogscanstatus(gp, Grunning, Gscanrunning)) @@ -1918,6 +1919,7 @@ exitsyscallfast(void) // Freezetheworld sets stopwait but does not retake P's. if(runtime·sched.stopwait) { + g->m->mcache = nil; g->m->p = nil; return false; } @@ -1930,6 +1932,7 @@ exitsyscallfast(void) return true; } // Try to get any other idle P. 
+ g->m->mcache = nil; g->m->p = nil; if(runtime·sched.pidle) { fn = exitsyscallfast_pidle; @@ -2617,6 +2620,8 @@ runtime·setcpuprofilerate_m(void) P *runtime·newP(void); // Change number of processors. The world is stopped, sched is locked. +// gcworkbufs are not being modified by either the GC or +// the write barrier code. static void procresize(int32 new) { diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h index 37929c59c..cbbf6b3fc 100644 --- a/src/runtime/runtime.h +++ b/src/runtime/runtime.h @@ -649,6 +649,7 @@ struct ForceGCState }; extern uint32 runtime·gcphase; +extern Mutex runtime·allglock; /* * defined macros @@ -677,6 +678,7 @@ enum { uint32 runtime·readgstatus(G*); void runtime·casgstatus(G*, uint32, uint32); +bool runtime·castogscanstatus(G*, uint32, uint32); void runtime·quiesce(G*); bool runtime·stopg(G*); void runtime·restartg(G*); diff --git a/src/runtime/stack.c b/src/runtime/stack.c index e402691f4..e06e48a93 100644 --- a/src/runtime/stack.c +++ b/src/runtime/stack.c @@ -587,13 +587,13 @@ adjustsudogs(G *gp, AdjustInfo *adjinfo) } // Copies gp's stack to a new stack of a different size. +// Caller must have changed gp status to Gcopystack. static void copystack(G *gp, uintptr newsize) { Stack old, new; uintptr used; AdjustInfo adjinfo; - uint32 oldstatus; bool (*cb)(Stkframe*, void*); byte *p, *ep; @@ -637,20 +637,11 @@ copystack(G *gp, uintptr newsize) } runtime·memmove((byte*)new.hi - used, (byte*)old.hi - used, used); - oldstatus = runtime·readgstatus(gp); - oldstatus &= ~Gscan; - if(oldstatus == Gwaiting || oldstatus == Grunnable) - runtime·casgstatus(gp, oldstatus, Gcopystack); // oldstatus is Gwaiting or Grunnable - else - runtime·throw("copystack: bad status, not Gwaiting or Grunnable"); - // Swap out old stack for new one gp->stack = new; gp->stackguard0 = new.lo + StackGuard; // NOTE: might clobber a preempt request gp->sched.sp = new.hi - used; - runtime·casgstatus(gp, Gcopystack, oldstatus); // oldstatus is Gwaiting or Grunnable - // free old stack if(StackPoisonCopy) { p = (byte*)old.lo; @@ -700,6 +691,7 @@ void runtime·newstack(void) { int32 oldsize, newsize; + uint32 oldstatus; uintptr sp; G *gp; Gobuf morebuf; @@ -789,12 +781,15 @@ runtime·newstack(void) runtime·throw("stack overflow"); } - // Note that the concurrent GC might be scanning the stack as we try to replace it. - // copystack takes care of the appropriate coordination with the stack scanner. + oldstatus = runtime·readgstatus(gp); + oldstatus &= ~Gscan; + runtime·casgstatus(gp, oldstatus, Gcopystack); // oldstatus is Gwaiting or Grunnable + // The concurrent GC will not scan the stack while we are doing the copy since + // the gp is in a Gcopystack status. copystack(gp, newsize); if(StackDebug >= 1) runtime·printf("stack grow done\n"); - runtime·casgstatus(gp, Gwaiting, Grunning); + runtime·casgstatus(gp, Gcopystack, Grunning); runtime·gogo(&gp->sched); } @@ -825,6 +820,7 @@ void runtime·shrinkstack(G *gp) { uintptr used, oldsize, newsize; + uint32 oldstatus; if(runtime·readgstatus(gp) == Gdead) { if(gp->stack.lo != 0) { @@ -858,8 +854,19 @@ runtime·shrinkstack(G *gp) #endif if(StackDebug > 0) runtime·printf("shrinking stack %D->%D\n", (uint64)oldsize, (uint64)newsize); + // This is being done in a Gscan state and was initiated by the GC so no need to move to + // the Gcopystate. + // The world is stopped, so the goroutine must be Gwaiting or Grunnable, + // and what it is is not changing underfoot. 
+ + oldstatus = runtime·readgstatus(gp); + oldstatus &= ~Gscan; + if(oldstatus != Gwaiting && oldstatus != Grunnable) + runtime·throw("status is not Gwaiting or Grunnable"); + runtime·casgstatus(gp, oldstatus, Gcopystack); copystack(gp, newsize); -} + runtime·casgstatus(gp, Gcopystack, oldstatus); + } // Do any delayed stack freeing that was queued up during GC. void diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go index 6561094ff..32dfed7d3 100644 --- a/src/runtime/stubs.go +++ b/src/runtime/stubs.go @@ -106,6 +106,8 @@ func recovery_m(*g) func mcacheRefill_m() func largeAlloc_m() func gc_m() +func gcscan_m() +func finishsweep_m() func scavenge_m() func setFinalizer_m() func removeFinalizer_m() -- cgit v1.2.1 From 297ae6e1545a6982878a82114d5d2d40da373ca6 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Mon, 27 Oct 2014 15:57:07 -0400 Subject: [dev.garbage] runtime: fix TestLFStack on 386 LGTM=rlh R=rlh, dvyukov CC=golang-codereviews https://codereview.appspot.com/157430044 --- src/runtime/export_test.go | 2 +- src/runtime/lfstack_test.go | 2 +- src/runtime/runtime.h | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index be352557f..65e918e84 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -26,7 +26,7 @@ var Exitsyscall = exitsyscall var LockedOSThread = lockedOSThread type LFNode struct { - Next *LFNode + Next uint64 Pushcnt uintptr } diff --git a/src/runtime/lfstack_test.go b/src/runtime/lfstack_test.go index e51877704..68f221d6e 100644 --- a/src/runtime/lfstack_test.go +++ b/src/runtime/lfstack_test.go @@ -121,7 +121,7 @@ func TestLFStackStress(t *testing.T) { } cnt++ sum2 += node.data - node.Next = nil + node.Next = 0 } } if cnt != K { diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h index cbbf6b3fc..c1bba423a 100644 --- a/src/runtime/runtime.h +++ b/src/runtime/runtime.h @@ -571,6 +571,7 @@ enum { #endif // Lock-free stack node. +// Also known to export_test.go. struct LFNode { uint64 next; -- cgit v1.2.1 From 9232cbb6a6aaa8a4197b22819406128ff0f99265 Mon Sep 17 00:00:00 2001 From: Rick Hudson Date: Mon, 27 Oct 2014 17:07:53 -0400 Subject: [dev.garbage] runtime: Fix 386 compiler warnings. 
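The warnings appear to stem from handing runtime·printf's 64-bit %D verb arguments whose width varies by GOARCH; the diff below pins each argument with an explicit cast and switches to %d. A loose Go-side analogy of pinning the width at the call site (hypothetical, for illustration only):

package main

import "fmt"

func main() {
	var nobj uintptr = 42 // pointer-sized: 32 bits on 386, 64 bits on amd64
	// Convert explicitly so the printed width is the same on every GOARCH,
	// the same idea as the (uint32)b->nobj casts in the diff below.
	fmt.Printf("nobj=%d\n", uint64(nobj))
}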
LGTM=rsc R=rsc CC=golang-codereviews https://codereview.appspot.com/163390043 --- src/runtime/mgc0.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mgc0.c b/src/runtime/mgc0.c index c385d51cf..cc1f81123 100644 --- a/src/runtime/mgc0.c +++ b/src/runtime/mgc0.c @@ -632,7 +632,7 @@ getempty(Workbuf *b) b = (Workbuf*)runtime·lfstackpop(&runtime·work.empty); if(b && b->nobj != 0) { - runtime·printf("m%d: getempty: popped b=%p with non-zero b->nobj=%D\n", g->m->id, b, b->nobj); + runtime·printf("m%d: getempty: popped b=%p with non-zero b->nobj=%d\n", g->m->id, b, (uint32)b->nobj); runtime·throw("getempty: workbuffer not empty, b->nobj not 0"); } if(b == nil) { @@ -687,7 +687,7 @@ putpartial(Workbuf *b) else if (b->nobj == nelem(b->obj)) runtime·lfstackpush(&runtime·work.full, &b->node); else { - runtime·printf("b=%p, b->nobj=%D, nelem(b->obj)=%d\n", b, b->nobj, nelem(b->obj)); + runtime·printf("b=%p, b->nobj=%d, nelem(b->obj)=%d\n", b, b->nobj, (uint32)nelem(b->obj)); runtime·throw("putpartial: bad Workbuf b->nobj"); } } -- cgit v1.2.1 From d794c3a12eb8d762b119cf244bbbac8ee66b5a54 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Mon, 27 Oct 2014 17:27:03 -0400 Subject: [dev.power64] runtime: power64 fixes and ports of changes Fix include paths that got moved in the great pkg/ rename. Add missing runtime/arch_* files for power64. Port changes that happened on default since branching to runtime/{asm,atomic,sys_linux}_power64x.s (precise stacks, calling convention change, various new and deleted functions. Port struct renaming and fix some bugs in runtime/defs_linux_power64.h. LGTM=rsc R=rsc, dave CC=golang-codereviews https://codereview.appspot.com/161450043 --- src/runtime/arch_power64.go | 8 + src/runtime/arch_power64le.go | 8 + src/runtime/asm_power64x.s | 478 ++++++++++++++++---------------------- src/runtime/atomic_power64x.s | 11 +- src/runtime/debug/stubs.s | 6 + src/runtime/defs_linux_power64.h | 19 +- src/runtime/memclr_power64x.s | 2 +- src/runtime/memmove_power64x.s | 2 +- src/runtime/rt0_linux_power64.s | 6 +- src/runtime/rt0_linux_power64le.s | 6 +- src/runtime/sys_linux_power64x.s | 166 +++++++------ 11 files changed, 332 insertions(+), 380 deletions(-) create mode 100644 src/runtime/arch_power64.go create mode 100644 src/runtime/arch_power64le.go (limited to 'src/runtime') diff --git a/src/runtime/arch_power64.go b/src/runtime/arch_power64.go new file mode 100644 index 000000000..270cd7b95 --- /dev/null +++ b/src/runtime/arch_power64.go @@ -0,0 +1,8 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +type uintreg uint64 +type intptr int64 // TODO(rsc): remove diff --git a/src/runtime/arch_power64le.go b/src/runtime/arch_power64le.go new file mode 100644 index 000000000..270cd7b95 --- /dev/null +++ b/src/runtime/arch_power64le.go @@ -0,0 +1,8 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime + +type uintreg uint64 +type intptr int64 // TODO(rsc): remove diff --git a/src/runtime/asm_power64x.s b/src/runtime/asm_power64x.s index a0511de8e..76bf42094 100644 --- a/src/runtime/asm_power64x.s +++ b/src/runtime/asm_power64x.s @@ -6,9 +6,9 @@ #include "zasm_GOOS_GOARCH.h" #include "funcdata.h" -#include "../../cmd/ld/textflag.h" +#include "textflag.h" -TEXT _rt0_go(SB),NOSPLIT,$0 +TEXT runtime·rt0_go(SB),NOSPLIT,$0 // initialize essential registers BL runtime·reginit(SB) @@ -21,9 +21,10 @@ TEXT _rt0_go(SB),NOSPLIT,$0 MOVD $runtime·g0(SB), g MOVD $(-64*1024), R31 ADD R31, R1, R3 - MOVD R3, g_stackguard(g) MOVD R3, g_stackguard0(g) - MOVD R1, g_stackbase(g) + MOVD R3, g_stackguard1(g) + MOVD R3, (g_stack+stack_lo)(g) + MOVD R1, (g_stack+stack_hi)(g) // TODO: if there is a _cgo_init, call it. // TODO: add TLS @@ -41,7 +42,6 @@ TEXT _rt0_go(SB),NOSPLIT,$0 // args are already prepared BL runtime·args(SB) BL runtime·osinit(SB) - BL runtime·hashinit(SB) BL runtime·schedinit(SB) // create a new goroutine to start program @@ -49,9 +49,7 @@ TEXT _rt0_go(SB),NOSPLIT,$0 MOVDU R3, -8(R1) MOVDU R0, -8(R1) MOVDU R0, -8(R1) - ARGSIZE(24) BL runtime·newproc(SB) - ARGSIZE(-1) ADD $24, R1 // start this M @@ -118,7 +116,7 @@ TEXT runtime·gogo(SB), NOSPLIT, $-8-8 MOVD R31, CTR BR (CTR) -// void mcall(void (*fn)(G*)) +// void mcall(fn func(*g)) // Switch to m->g0's stack, call fn(g). // Fn must never return. It should gogo(&g->sched) // to keep running g. @@ -137,9 +135,10 @@ TEXT runtime·mcall(SB), NOSPLIT, $-8-8 CMP g, R3 BNE 2(PC) BR runtime·badmcall(SB) - MOVD fn+0(FP), R4 + MOVD fn+0(FP), R11 // context + MOVD 0(R11), R4 // code pointer MOVD R4, CTR - MOVD (g_sched+gobuf_sp)(g), R1 + MOVD (g_sched+gobuf_sp)(g), R1 // sp = m->g0->sched.sp MOVDU R3, -8(R1) MOVDU R0, -8(R1) BL (CTR) @@ -150,23 +149,50 @@ TEXT runtime·mcall(SB), NOSPLIT, $-8-8 // lives at the bottom of the G stack from the one that lives // at the top of the M stack because the one at the top of // the M stack terminates the stack walk (see topofstack()). -TEXT runtime·switchtoM(SB), NOSPLIT, $0-8 +TEXT runtime·switchtoM(SB), NOSPLIT, $0-0 UNDEF BL (LR) // make sure this function is not leaf RETURN -// void onM(void (*fn)()) -// calls fn() on the M stack. -// switches to the M stack if not already on it, and -// switches back when fn() returns. +// func onM_signalok(fn func()) +TEXT runtime·onM_signalok(SB), NOSPLIT, $8-8 + MOVD g, R3 // R3 = g + MOVD g_m(R3), R4 // R4 = g->m + MOVD m_gsignal(R4), R4 // R4 = g->m->gsignal + MOVD fn+0(FP), R11 // context for call below + CMP R3, R4 + BEQ onsignal + MOVD R11, 8(R1) + BL runtime·onM(SB) + RETURN + +onsignal: + MOVD 0(R11), R3 // code pointer + MOVD R3, CTR + BL (CTR) + RETURN + +// void onM(fn func()) TEXT runtime·onM(SB), NOSPLIT, $0-8 MOVD fn+0(FP), R3 // R3 = fn - MOVD R3, CTR + MOVD R3, R11 // context MOVD g_m(g), R4 // R4 = m + MOVD m_g0(R4), R5 // R5 = g0 CMP g, R5 BEQ onm + MOVD m_curg(R4), R6 + CMP g, R6 + BEQ oncurg + + // Not g0, not curg. Must be gsignal, but that's not allowed. + // Hide call from linker nosplit analysis. + MOVD $runtime·badonm(SB), R3 + MOVD R3, CTR + BL (CTR) + +oncurg: // save our state in g->sched. Pretend to // be switchtoM if the G stack is scanned. 
MOVD $runtime·switchtoM(SB), R6 @@ -178,10 +204,16 @@ TEXT runtime·onM(SB), NOSPLIT, $0-8 // switch to g0 MOVD R5, g - MOVD (g_sched+gobuf_sp)(g), R1 + MOVD (g_sched+gobuf_sp)(g), R3 + // make it look like mstart called onM on g0, to stop traceback + SUB $8, R3 + MOVD $runtime·mstart(SB), R4 + MOVD R4, 0(R3) + MOVD R3, R1 // call target function - ARGSIZE(0) + MOVD 0(R11), R3 // code pointer + MOVD R3, CTR BL (CTR) // switch back to g @@ -193,6 +225,8 @@ TEXT runtime·onM(SB), NOSPLIT, $0-8 onm: // already on m stack, just call directly + MOVD 0(R11), R3 // code pointer + MOVD R3, CTR BL (CTR) RETURN @@ -216,8 +250,11 @@ TEXT runtime·morestack(SB),NOSPLIT,$-8-0 BNE 2(PC) BL runtime·abort(SB) - MOVW R3, m_moreframesize(R7) - MOVW R4, m_moreargsize(R7) + // Cannot grow signal stack (m->gsignal). + MOVD m_gsignal(R7), R8 + CMP g, R8 + BNE 2(PC) + BL runtime·abort(SB) // Called from f. // Set g->sched to context in f. @@ -231,8 +268,6 @@ TEXT runtime·morestack(SB),NOSPLIT,$-8-0 // Set m->morebuf to f's caller. MOVD R5, (m_morebuf+gobuf_pc)(R7) // f's caller's PC MOVD R1, (m_morebuf+gobuf_sp)(R7) // f's caller's SP - MOVD $8(R1), R8 // f's argument pointer - MOVD R8, m_moreargp(R7) MOVD g, (m_morebuf+gobuf_g)(R7) // Call newstack on m->g0's stack. @@ -248,51 +283,8 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT,$-8-0 MOVD R0, R11 BR runtime·morestack(SB) -// Called from panic. Mimics morestack, -// reuses stack growth code to create a frame -// with the desired args running the desired function. -// -// func call(fn *byte, arg *byte, argsize uint32). -TEXT runtime·newstackcall(SB), NOSPLIT, $-8-20 - // Save our caller's state as the PC and SP to restore when - // returning from f. - MOVD g_m(g), R5 - MOVD LR, R31 - MOVD R31, (m_morebuf+gobuf_pc)(R5) // our caller's PC - MOVD R1, (m_morebuf+gobuf_sp)(R5) // our caller's SP - MOVD g, (m_morebuf+gobuf_g)(R5) - - // Save our own state as the PC and SP to restore if this - // goroutine needs to be restarted. - MOVD $runtime·newstackcall(SB), R7 - MOVD R7, (g_sched+gobuf_pc)(g) - MOVD LR, R31 - MOVD R31, (g_sched+gobuf_lr)(g) - MOVD R1, (g_sched+gobuf_sp)(g) - - // Set up morestack arguments to call f on a new stack. - // We set f's frame size to 1, as a hint to newstack that - // this is a call from runtime.newstackcall. - // If it turns out that f needs a larger frame than the - // default stack, f's usual stack growth prolog will - // allocate a new segment (and recopy the arguments). - MOVD fn+0(FP), R7 - MOVD args+8(FP), R8 - MOVW n+16(FP), R9 - - MOVD R7, m_cret(R5) - MOVD R8, m_moreargp(R5) - MOVW R9, m_moreargsize(R5) - MOVD $1, R10 - MOVW R10, m_moreframesize(R5) - - // call newstack on m->g0's stack - MOVD m_g0(R5), g - MOVD (g_sched+gobuf_sp)(g), R1 - BR runtime·newstack(SB) - -// reflect·call: call a function with the given argument list -// func call(f *FuncVal, arg *byte, argsize uint32). +// reflectcall: call a function with the given argument list +// func call(f *FuncVal, arg *byte, argsize, retoffset uint32). // we don't have variable-sized frames, so we use a small number // of constant-sized-frame functions to encode a few bits of size in the pc. // Caution: ugly multiline assembly macros in your future! @@ -301,60 +293,47 @@ TEXT runtime·newstackcall(SB), NOSPLIT, $-8-20 MOVD $MAXSIZE, R31; \ CMP R3, R31; \ BGT 4(PC); \ - MOVD $runtime·NAME(SB), R31; \ + MOVD $NAME(SB), R31; \ MOVD R31, CTR; \ BR (CTR) +// Note: can't just "BR NAME(SB)" - bad inlining results. -// Note: can't just "BR runtime·NAME(SB)" - bad inlining results. 
-TEXT reflect·call(SB), NOSPLIT, $-8-24 +TEXT ·reflectcall(SB), NOSPLIT, $-8-24 MOVW argsize+16(FP), R3 - DISPATCH(call16, 16) - DISPATCH(call32, 32) - DISPATCH(call64, 64) - DISPATCH(call128, 128) - DISPATCH(call256, 256) - DISPATCH(call512, 512) - DISPATCH(call1024, 1024) - DISPATCH(call2048, 2048) - DISPATCH(call4096, 4096) - DISPATCH(call8192, 8192) - DISPATCH(call16384, 16384) - DISPATCH(call32768, 32768) - DISPATCH(call65536, 65536) - DISPATCH(call131072, 131072) - DISPATCH(call262144, 262144) - DISPATCH(call524288, 524288) - DISPATCH(call1048576, 1048576) - DISPATCH(call2097152, 2097152) - DISPATCH(call4194304, 4194304) - DISPATCH(call8388608, 8388608) - DISPATCH(call16777216, 16777216) - DISPATCH(call33554432, 33554432) - DISPATCH(call67108864, 67108864) - DISPATCH(call134217728, 134217728) - DISPATCH(call268435456, 268435456) - DISPATCH(call536870912, 536870912) - DISPATCH(call1073741824, 1073741824) + DISPATCH(runtime·call16, 16) + DISPATCH(runtime·call32, 32) + DISPATCH(runtime·call64, 64) + DISPATCH(runtime·call128, 128) + DISPATCH(runtime·call256, 256) + DISPATCH(runtime·call512, 512) + DISPATCH(runtime·call1024, 1024) + DISPATCH(runtime·call2048, 2048) + DISPATCH(runtime·call4096, 4096) + DISPATCH(runtime·call8192, 8192) + DISPATCH(runtime·call16384, 16384) + DISPATCH(runtime·call32768, 32768) + DISPATCH(runtime·call65536, 65536) + DISPATCH(runtime·call131072, 131072) + DISPATCH(runtime·call262144, 262144) + DISPATCH(runtime·call524288, 524288) + DISPATCH(runtime·call1048576, 1048576) + DISPATCH(runtime·call2097152, 2097152) + DISPATCH(runtime·call4194304, 4194304) + DISPATCH(runtime·call8388608, 8388608) + DISPATCH(runtime·call16777216, 16777216) + DISPATCH(runtime·call33554432, 33554432) + DISPATCH(runtime·call67108864, 67108864) + DISPATCH(runtime·call134217728, 134217728) + DISPATCH(runtime·call268435456, 268435456) + DISPATCH(runtime·call536870912, 536870912) + DISPATCH(runtime·call1073741824, 1073741824) MOVD $runtime·badreflectcall(SB), R31 MOVD R31, CTR BR (CTR) -// Argument map for the callXX frames. Each has one -// stack map (for the single call) with 3 arguments. 
-DATA gcargs_reflectcall<>+0x00(SB)/4, $1 // 1 stackmap -DATA gcargs_reflectcall<>+0x04(SB)/4, $6 // 3 args -DATA gcargs_reflectcall<>+0x08(SB)/4, $(const_BitsPointer+(const_BitsPointer<<2)+(const_BitsScalar<<4)) -GLOBL gcargs_reflectcall<>(SB),RODATA,$12 - -// callXX frames have no locals -DATA gclocals_reflectcall<>+0x00(SB)/4, $1 // 1 stackmap -DATA gclocals_reflectcall<>+0x04(SB)/4, $0 // 0 locals -GLOBL gclocals_reflectcall<>(SB),RODATA,$8 - #define CALLFN(NAME,MAXSIZE) \ -TEXT runtime·NAME(SB), WRAPPER, $MAXSIZE-24; \ - FUNCDATA $FUNCDATA_ArgsPointerMaps,gcargs_reflectcall<>(SB); \ - FUNCDATA $FUNCDATA_LocalsPointerMaps,gclocals_reflectcall<>(SB);\ +TEXT NAME(SB), WRAPPER, $MAXSIZE-24; \ + NO_LOCAL_POINTERS; \ /* copy arguments to stack */ \ MOVD argptr+8(FP), R3; \ MOVW argsize+16(FP), R4; \ @@ -391,47 +370,33 @@ TEXT runtime·NAME(SB), WRAPPER, $MAXSIZE-24; \ BR -4(PC); \ RETURN -CALLFN(call16, 16) -CALLFN(call32, 32) -CALLFN(call64, 64) -CALLFN(call128, 128) -CALLFN(call256, 256) -CALLFN(call512, 512) -CALLFN(call1024, 1024) -CALLFN(call2048, 2048) -CALLFN(call4096, 4096) -CALLFN(call8192, 8192) -CALLFN(call16384, 16384) -CALLFN(call32768, 32768) -CALLFN(call65536, 65536) -CALLFN(call131072, 131072) -CALLFN(call262144, 262144) -CALLFN(call524288, 524288) -CALLFN(call1048576, 1048576) -CALLFN(call2097152, 2097152) -CALLFN(call4194304, 4194304) -CALLFN(call8388608, 8388608) -CALLFN(call16777216, 16777216) -CALLFN(call33554432, 33554432) -CALLFN(call67108864, 67108864) -CALLFN(call134217728, 134217728) -CALLFN(call268435456, 268435456) -CALLFN(call536870912, 536870912) -CALLFN(call1073741824, 1073741824) - -// Return point when leaving stack. -// -// Lessstack can appear in stack traces for the same reason -// as morestack; in that context, it has 0 arguments. -TEXT runtime·lessstack(SB), NOSPLIT, $-8-0 - // Save return value in m->cret - MOVD g_m(g), R5 - MOVD R3, m_cret(R5) - - // Call oldstack on m->g0's stack. 
- MOVD m_g0(R5), g - MOVD (g_sched+gobuf_sp)(g), R1 - BL runtime·oldstack(SB) +CALLFN(·call16, 16) +CALLFN(·call32, 32) +CALLFN(·call64, 64) +CALLFN(·call128, 128) +CALLFN(·call256, 256) +CALLFN(·call512, 512) +CALLFN(·call1024, 1024) +CALLFN(·call2048, 2048) +CALLFN(·call4096, 4096) +CALLFN(·call8192, 8192) +CALLFN(·call16384, 16384) +CALLFN(·call32768, 32768) +CALLFN(·call65536, 65536) +CALLFN(·call131072, 131072) +CALLFN(·call262144, 262144) +CALLFN(·call524288, 524288) +CALLFN(·call1048576, 1048576) +CALLFN(·call2097152, 2097152) +CALLFN(·call4194304, 4194304) +CALLFN(·call8388608, 8388608) +CALLFN(·call16777216, 16777216) +CALLFN(·call33554432, 33554432) +CALLFN(·call67108864, 67108864) +CALLFN(·call134217728, 134217728) +CALLFN(·call268435456, 268435456) +CALLFN(·call536870912, 536870912) +CALLFN(·call1073741824, 1073741824) // bool cas(int32 *val, int32 old, int32 new) // Atomically: @@ -440,22 +405,23 @@ TEXT runtime·lessstack(SB), NOSPLIT, $-8-0 // return 1; // } else // return 0; -TEXT runtime·cas(SB), NOSPLIT, $0-16 +TEXT runtime·cas(SB), NOSPLIT, $0-17 MOVD p+0(FP), R3 MOVW old+8(FP), R4 MOVW new+12(FP), R5 SYNC LWAR (R3), R6 CMPW R6, R4 - BNE 7(PC) + BNE 8(PC) STWCCC R5, (R3) BNE -5(PC) MOVD $1, R3 SYNC ISYNC + MOVB R3, ret+16(FP) RETURN MOVD $0, R3 - BR -4(PC) + BR -5(PC) // bool runtime·cas64(uint64 *val, uint64 old, uint64 new) // Atomically: @@ -465,7 +431,7 @@ TEXT runtime·cas(SB), NOSPLIT, $0-16 // } else { // return 0; // } -TEXT runtime·cas64(SB), NOSPLIT, $0-24 +TEXT runtime·cas64(SB), NOSPLIT, $0-25 MOVD p+0(FP), R3 MOVD old+8(FP), R4 MOVD new+16(FP), R5 @@ -478,10 +444,23 @@ TEXT runtime·cas64(SB), NOSPLIT, $0-24 MOVD $1, R3 SYNC ISYNC + MOVB R3, ret+24(FP) RETURN MOVD $0, R3 BR -4(PC) +TEXT runtime·casuintptr(SB), NOSPLIT, $0-25 + BR runtime·cas64(SB) + +TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $-8-16 + BR runtime·atomicload64(SB) + +TEXT runtime·atomicloaduint(SB), NOSPLIT, $-8-16 + BR runtime·atomicload64(SB) + +TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16 + BR runtime·atomicstore64(SB) + // bool casp(void **val, void *old, void *new) // Atomically: // if(*val == old){ @@ -489,14 +468,14 @@ TEXT runtime·cas64(SB), NOSPLIT, $0-24 // return 1; // } else // return 0; -TEXT runtime·casp(SB), NOSPLIT, $0-24 +TEXT runtime·casp(SB), NOSPLIT, $0-25 BR runtime·cas64(SB) // uint32 xadd(uint32 volatile *val, int32 delta) // Atomically: // *val += delta; // return *val; -TEXT runtime·xadd(SB), NOSPLIT, $0-12 +TEXT runtime·xadd(SB), NOSPLIT, $0-20 MOVD p+0(FP), R4 MOVW delta+8(FP), R5 SYNC @@ -506,10 +485,10 @@ TEXT runtime·xadd(SB), NOSPLIT, $0-12 BNE -4(PC) SYNC ISYNC - MOVW R3, R3 + MOVW R3, ret+16(FP) RETURN -TEXT runtime·xadd64(SB), NOSPLIT, $0-16 +TEXT runtime·xadd64(SB), NOSPLIT, $0-24 MOVD p+0(FP), R4 MOVD delta+8(FP), R5 SYNC @@ -519,9 +498,10 @@ TEXT runtime·xadd64(SB), NOSPLIT, $0-16 BNE -4(PC) SYNC ISYNC + MOVD R3, ret+16(FP) RETURN -TEXT runtime·xchg(SB), NOSPLIT, $0-12 +TEXT runtime·xchg(SB), NOSPLIT, $0-20 MOVD p+0(FP), R4 MOVW new+8(FP), R5 SYNC @@ -530,9 +510,10 @@ TEXT runtime·xchg(SB), NOSPLIT, $0-12 BNE -3(PC) SYNC ISYNC + MOVW R3, ret+16(FP) RETURN -TEXT runtime·xchg64(SB), NOSPLIT, $0-16 +TEXT runtime·xchg64(SB), NOSPLIT, $0-24 MOVD p+0(FP), R4 MOVD new+8(FP), R5 SYNC @@ -541,9 +522,13 @@ TEXT runtime·xchg64(SB), NOSPLIT, $0-16 BNE -3(PC) SYNC ISYNC + MOVD R3, ret+16(FP) RETURN -TEXT runtime·xchgp(SB), NOSPLIT, $0-16 +TEXT runtime·xchgp(SB), NOSPLIT, $0-24 + BR runtime·xchg64(SB) + +TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24 BR 
runtime·xchg64(SB) TEXT runtime·procyield(SB),NOSPLIT,$0-0 @@ -553,20 +538,33 @@ TEXT runtime·atomicstorep(SB), NOSPLIT, $0-16 BR runtime·atomicstore64(SB) TEXT runtime·atomicstore(SB), NOSPLIT, $0-12 - MOVD 0(FP), R3 - MOVW 8(FP), R4 + MOVD ptr+0(FP), R3 + MOVW val+8(FP), R4 SYNC MOVW R4, 0(R3) RETURN TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16 + MOVD ptr+0(FP), R3 + MOVD val+8(FP), R4 + SYNC + MOVD R4, 0(R3) + RETURN + +// void runtime·atomicor8(byte volatile*, byte); +TEXT runtime·atomicor8(SB), NOSPLIT, $0-9 MOVD 0(FP), R3 MOVD 8(FP), R4 SYNC - MOVD R4, 0(R3) + LWAR (R3), R5 + OR R4, R5 + STWCCC R5, (R3) + BNE -3(PC) + SYNC + ISYNC RETURN -// void jmpdefer(fn, sp); +// void jmpdefer(fv, sp); // called from deferreturn. // 1. grab stored LR for caller // 2. sub 4 bytes to get back to BL deferreturn @@ -576,7 +574,7 @@ TEXT runtime·jmpdefer(SB), NOSPLIT, $-8-16 SUB $4, R31 MOVD R31, LR - MOVD fn+0(FP), R11 + MOVD fv+0(FP), R11 MOVD argp+8(FP), R1 SUB $8, R1 MOVD 0(R11), R3 @@ -597,7 +595,7 @@ TEXT gosave<>(SB),NOSPLIT,$-8 // Call fn(arg) on the scheduler stack, // aligned appropriately for the gcc ABI. // See cgocall.c for more details. -TEXT runtime·asmcgocall(SB),NOSPLIT,$0-16 +TEXT ·asmcgocall(SB),NOSPLIT,$0-16 MOVD R0, 21(R0) // cgocallback(void (*fn)(void*), void *frame, uintptr framesize) @@ -608,19 +606,20 @@ TEXT runtime·cgocallback(SB),NOSPLIT,$24-24 // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize) // See cgocall.c for more details. -TEXT runtime·cgocallback_gofunc(SB),NOSPLIT,$8-24 +TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-24 MOVD R0, 23(R0) // void setg(G*); set g. for use by needm. -TEXT runtime·setg(SB), NOSPLIT, $0-16 +TEXT runtime·setg(SB), NOSPLIT, $0-8 MOVD R0, 24(R0) // void setg_gcc(G*); set g called from gcc. TEXT setg_gcc<>(SB),NOSPLIT,$0 MOVD R0, 25(R0) -TEXT runtime·getcallerpc(SB),NOSPLIT,$-8-8 +TEXT runtime·getcallerpc(SB),NOSPLIT,$-8-16 MOVD 0(R1), R3 + MOVD R3, ret+8(FP) RETURN TEXT runtime·gogetcallerpc(SB),NOSPLIT,$-8-16 @@ -629,16 +628,24 @@ TEXT runtime·gogetcallerpc(SB),NOSPLIT,$-8-16 RETURN TEXT runtime·setcallerpc(SB),NOSPLIT,$-8-16 - MOVD x+8(FP),R3 // addr of first arg + MOVD pc+8(FP), R3 MOVD R3, 0(R1) // set calling pc RETURN -TEXT runtime·getcallersp(SB),NOSPLIT,$0-8 +TEXT runtime·getcallersp(SB),NOSPLIT,$0-16 MOVD sp+0(FP), R3 SUB $8, R3 + MOVD R3, ret+8(FP) RETURN -TEXT runtime·abort(SB),NOSPLIT,$-4-0 +// func gogetcallersp(p unsafe.Pointer) uintptr +TEXT runtime·gogetcallersp(SB),NOSPLIT,$0-16 + MOVD sp+0(FP), R3 + SUB $8, R3 + MOVD R3,ret+8(FP) + RETURN + +TEXT runtime·abort(SB),NOSPLIT,$-8-0 MOVW (R0), R0 UNDEF @@ -646,7 +653,7 @@ TEXT runtime·abort(SB),NOSPLIT,$-4-0 #define TBRU 269 /* Time base Upper/Lower */ // int64 runtime·cputicks(void) -TEXT runtime·cputicks(SB),NOSPLIT,$0-0 +TEXT runtime·cputicks(SB),NOSPLIT,$0-8 MOVW SPR(TBRU), R4 MOVW SPR(TBRL), R3 MOVW SPR(TBRU), R5 @@ -654,17 +661,9 @@ TEXT runtime·cputicks(SB),NOSPLIT,$0-0 BNE -4(PC) SLD $32, R5 OR R5, R3 + MOVD R3, ret+0(FP) RETURN -TEXT runtime·stackguard(SB),NOSPLIT,$0-16 - MOVD R1, R3 - MOVD R3, sp+0(FP) - MOVD g_stackguard(g), R3 - MOVD R3, limit+8(FP) - RETURN - -GLOBL runtime·tls0(SB), $64 - // AES hashing not implemented for Power TEXT runtime·aeshash(SB),NOSPLIT,$-8-0 MOVW (R0), R1 @@ -675,7 +674,7 @@ TEXT runtime·aeshash64(SB),NOSPLIT,$-8-0 TEXT runtime·aeshashstr(SB),NOSPLIT,$-8-0 MOVW (R0), R1 -TEXT runtime·memeq(SB),NOSPLIT,$-8-24 +TEXT runtime·memeq(SB),NOSPLIT,$-8-25 MOVD a+0(FP), R3 MOVD b+8(FP), R4 MOVD count+16(FP), R5 @@ -683,26 +682,6 @@ TEXT 
runtime·memeq(SB),NOSPLIT,$-8-24 SUB $1, R4 ADD R3, R5, R8 _next: - CMP R3, R8 - BNE 3(PC) - MOVD $1, R3 - RETURN - MOVBZU 1(R3), R6 - MOVBZU 1(R4), R7 - CMP R6, R7 - BEQ _next - - MOVD $0, R3 - RETURN - -TEXT runtime·gomemeq(SB),NOSPLIT,$0-25 - MOVD a+0(FP), R3 - MOVD b+8(FP), R4 - MOVD count+16(FP), R5 - SUB $1, R3 - SUB $1, R4 - ADD R3, R5, R8 -_next2: CMP R3, R8 BNE 4(PC) MOVD $1, R3 @@ -711,14 +690,14 @@ _next2: MOVBZU 1(R3), R6 MOVBZU 1(R4), R7 CMP R6, R7 - BEQ _next2 + BEQ _next MOVB R0, ret+24(FP) RETURN // eqstring tests whether two strings are equal. // See runtime_test.go:eqstring_generic for -// equivlaent Go code. +// equivalent Go code. TEXT runtime·eqstring(SB),NOSPLIT,$0-33 MOVD s1len+8(FP), R4 MOVD s2len+24(FP), R5 @@ -824,9 +803,6 @@ _index2_notfound: RETURN -TEXT runtime·timenow(SB), NOSPLIT, $0-0 - BR time·now(SB) - // A Duff's device for zeroing memory. // The compiler jumps to computed addresses within // this routine to zero chunks of memory. Do not @@ -966,80 +942,22 @@ TEXT runtime·duffzero(SB), NOSPLIT, $-8-0 MOVDU R0, 8(R3) RETURN -TEXT runtime·fastrand2(SB), NOSPLIT, $0-4 +TEXT runtime·fastrand1(SB), NOSPLIT, $0-4 MOVD g_m(g), R4 - MOVD m_fastrand(R4), R3 + MOVWZ m_fastrand(R4), R3 ADD R3, R3 CMP R3, $0 BGE 2(PC) XOR $0x88888eef, R3 - MOVD R3, m_fastrand(R4) - MOVD R3, ret+0(FP) + MOVW R3, m_fastrand(R4) + MOVW R3, ret+0(FP) RETURN -// The gohash and goeq trampolines are necessary while we have -// both Go and C calls to alg functions. Once we move all call -// sites to Go, we can redo the hash/eq functions to use the -// Go calling convention and remove these. - -// convert call to: -// func (alg unsafe.Pointer, p unsafe.Pointer, size uintpr, seed uintptr) uintptr -// to: -// func (hash *uintptr, size uintptr, p unsafe.Pointer) -TEXT runtime·gohash(SB), NOSPLIT, $24-40 - FUNCDATA $FUNCDATA_ArgsPointerMaps,gcargs_gohash<>(SB) - FUNCDATA $FUNCDATA_LocalsPointerMaps,gclocals_gohash<>(SB) - MOVD a+0(FP), R3 - MOVD alg_hash(R3), R3 - MOVD R3, CTR - MOVD p+8(FP), R4 - MOVD size+16(FP), R5 - MOVD seed+24(FP), R6 - MOVD R6, ret+32(FP) - MOVD $ret+32(FP), R7 - MOVD R7, 8(R1) - MOVD R5, 16(R1) - MOVD R4, 24(R1) - PCDATA $PCDATA_StackMapIndex, $0 - BL (CTR) +TEXT runtime·return0(SB), NOSPLIT, $0 + MOVW $0, R3 RETURN -DATA gcargs_gohash<>+0x00(SB)/4, $1 // 1 stackmap -DATA gcargs_gohash<>+0x04(SB)/4, $10 // 5 args -DATA gcargs_gohash<>+0x08(SB)/4, $(const_BitsPointer+(const_BitsPointer<<2)) -GLOBL gcargs_gohash<>(SB),RODATA,$12 - -DATA gclocals_gohash<>+0x00(SB)/4, $1 // 1 stackmap -DATA gclocals_gohash<>+0x04(SB)/4, $0 // 0 locals -GLOBL gclocals_gohash<>(SB),RODATA,$8 - -// convert call to: -// func (alg unsafe.Pointer, p, q unsafe.Pointer, size uintptr) bool -// to: -// func (eq *bool, size uintptr, p, q unsafe.Pointer) -TEXT runtime·goeq(SB), NOSPLIT, $32-33 - FUNCDATA $FUNCDATA_ArgsPointerMaps,gcargs_goeq<>(SB) - FUNCDATA $FUNCDATA_LocalsPointerMaps,gclocals_goeq<>(SB) - MOVD alg+0(FP), R3 - MOVD alg_equal(R3), R3 - MOVD R3, CTR - MOVD p+8(FP), R4 - MOVD q+16(FP), R5 - MOVD size+24(FP), R6 - MOVD $ret+32(FP), R7 - MOVD R7, 8(R1) - MOVD R6, 16(R1) - MOVD R5, 24(R1) - MOVD R4, 32(R1) - PCDATA $PCDATA_StackMapIndex, $0 - BL (CTR) - RETURN - -DATA gcargs_goeq<>+0x00(SB)/4, $1 // 1 stackmap -DATA gcargs_goeq<>+0x04(SB)/4, $10 // 5 args -DATA gcargs_goeq<>+0x08(SB)/4, $(const_BitsPointer+(const_BitsPointer<<2)+(const_BitsPointer<<4)) -GLOBL gcargs_goeq<>(SB),RODATA,$12 - -DATA gclocals_goeq<>+0x00(SB)/4, $1 // 1 stackmap -DATA gclocals_goeq<>+0x04(SB)/4, $0 // 0 locals 
-GLOBL gclocals_goeq<>(SB),RODATA,$8 +// Called from cgo wrappers, this function returns g->m->curg.stack.hi. +// Must obey the gcc calling convention. +TEXT _cgo_topofstack(SB),NOSPLIT,$0 + MOVD R0, 26(R0) diff --git a/src/runtime/atomic_power64x.s b/src/runtime/atomic_power64x.s index c08590ac9..e72871761 100644 --- a/src/runtime/atomic_power64x.s +++ b/src/runtime/atomic_power64x.s @@ -4,34 +4,37 @@ // +build power64 power64le -#include "../../cmd/ld/textflag.h" +#include "textflag.h" // uint32 runtime·atomicload(uint32 volatile* addr) -TEXT ·atomicload(SB),NOSPLIT,$-8-8 +TEXT ·atomicload(SB),NOSPLIT,$-8-12 MOVD 0(FP), R3 SYNC MOVWZ 0(R3), R3 CMPW R3, R3, CR7 BC 4, 30, 1(PC) // bne- cr7,0x4 ISYNC + MOVW R3, ret+8(FP) RETURN // uint64 runtime·atomicload64(uint64 volatile* addr) -TEXT ·atomicload64(SB),NOSPLIT,$-8-8 +TEXT ·atomicload64(SB),NOSPLIT,$-8-16 MOVD 0(FP), R3 SYNC MOVD 0(R3), R3 CMP R3, R3, CR7 BC 4, 30, 1(PC) // bne- cr7,0x4 ISYNC + MOVD R3, ret+8(FP) RETURN // void *runtime·atomicloadp(void *volatile *addr) -TEXT ·atomicloadp(SB),NOSPLIT,$-8-8 +TEXT ·atomicloadp(SB),NOSPLIT,$-8-16 MOVD 0(FP), R3 SYNC MOVD 0(R3), R3 CMP R3, R3, CR7 BC 4, 30, 1(PC) // bne- cr7,0x4 ISYNC + MOVD R3, ret+8(FP) RETURN diff --git a/src/runtime/debug/stubs.s b/src/runtime/debug/stubs.s index d56274f2d..1e883b72c 100644 --- a/src/runtime/debug/stubs.s +++ b/src/runtime/debug/stubs.s @@ -7,6 +7,12 @@ #ifdef GOARCH_arm #define JMP B #endif +#ifdef GOARCH_power64 +#define JMP BR +#endif +#ifdef GOARCH_power64le +#define JMP BR +#endif TEXT ·setMaxStack(SB),NOSPLIT,$0-0 JMP runtime·setMaxStack(SB) diff --git a/src/runtime/defs_linux_power64.h b/src/runtime/defs_linux_power64.h index 64f145672..93742fa34 100644 --- a/src/runtime/defs_linux_power64.h +++ b/src/runtime/defs_linux_power64.h @@ -88,11 +88,10 @@ enum { typedef struct Sigset Sigset; typedef struct Timespec Timespec; typedef struct Timeval Timeval; -typedef struct Sigaction Sigaction; +typedef struct SigactionT SigactionT; typedef struct Siginfo Siginfo; typedef struct Itimerval Itimerval; typedef struct EpollEvent EpollEvent; -typedef uint64 Usigset; #pragma pack on @@ -109,11 +108,11 @@ struct Timeval { int64 tv_sec; int64 tv_usec; }; -struct Sigaction { +struct SigactionT { void *sa_handler; uint64 sa_flags; void *sa_restorer; - Usigset sa_mask; + uint64 sa_mask; }; struct Siginfo { int32 si_signo; @@ -129,7 +128,7 @@ struct Itimerval { struct EpollEvent { uint32 events; byte Pad_cgo_0[4]; - uint64 data; + byte data[8]; // unaligned uintptr }; @@ -144,7 +143,6 @@ enum { SA_RESTORER = 0, }; -//typedef struct Usigset Usigset; typedef struct Ptregs Ptregs; typedef struct Vreg Vreg; typedef struct SigaltstackT SigaltstackT; @@ -153,11 +151,6 @@ typedef struct Ucontext Ucontext; #pragma pack on -//struct Usigset { -// uint64 sig[1]; -//}; -//typedef Sigset Usigset; - struct Ptregs { uint64 gpr[32]; uint64 nip; @@ -202,8 +195,8 @@ struct Ucontext { uint64 uc_flags; Ucontext *uc_link; SigaltstackT uc_stack; - Usigset uc_sigmask; - Usigset __unused[15]; + uint64 uc_sigmask; + uint64 __unused[15]; Sigcontext uc_mcontext; }; diff --git a/src/runtime/memclr_power64x.s b/src/runtime/memclr_power64x.s index 4a2437c20..dfad64b6f 100644 --- a/src/runtime/memclr_power64x.s +++ b/src/runtime/memclr_power64x.s @@ -4,7 +4,7 @@ // +build power64 power64le -#include "../../cmd/ld/textflag.h" +#include "textflag.h" // void runtime·memclr(void*, uintptr) TEXT runtime·memclr(SB),NOSPLIT,$0-16 diff --git a/src/runtime/memmove_power64x.s b/src/runtime/memmove_power64x.s 
index b618f0ad7..2b04d8319 100644 --- a/src/runtime/memmove_power64x.s +++ b/src/runtime/memmove_power64x.s @@ -4,7 +4,7 @@ // +build power64 power64le -#include "../../cmd/ld/textflag.h" +#include "textflag.h" // void runtime·memmove(void*, void*, uintptr) TEXT runtime·memmove(SB), NOSPLIT, $-8-24 diff --git a/src/runtime/rt0_linux_power64.s b/src/runtime/rt0_linux_power64.s index e944bcdbf..970b6a673 100644 --- a/src/runtime/rt0_linux_power64.s +++ b/src/runtime/rt0_linux_power64.s @@ -1,7 +1,7 @@ -#include "../../cmd/ld/textflag.h" +#include "textflag.h" // actually a function descriptor for _main<>(SB) -TEXT _rt0_power64_linux(SB),7,$0 +TEXT _rt0_power64_linux(SB),NOSPLIT,$0 DWORD $_main<>(SB) DWORD $0 DWORD $0 @@ -12,6 +12,6 @@ TEXT _main<>(SB),NOSPLIT,$-8 BR main(SB) TEXT main(SB),NOSPLIT,$-8 - MOVD $_rt0_go(SB), R31 + MOVD $runtime·rt0_go(SB), R31 MOVD R31, CTR BR (CTR) diff --git a/src/runtime/rt0_linux_power64le.s b/src/runtime/rt0_linux_power64le.s index 051815dbc..85ce84733 100644 --- a/src/runtime/rt0_linux_power64le.s +++ b/src/runtime/rt0_linux_power64le.s @@ -1,6 +1,6 @@ -#include "../../cmd/ld/textflag.h" +#include "textflag.h" -TEXT _rt0_power64le_linux(SB),7,$0 +TEXT _rt0_power64le_linux(SB),NOSPLIT,$0 BR _main<>(SB) TEXT _main<>(SB),NOSPLIT,$-8 @@ -9,6 +9,6 @@ TEXT _main<>(SB),NOSPLIT,$-8 BR main(SB) TEXT main(SB),NOSPLIT,$-8 - MOVD $_rt0_go(SB), R31 + MOVD $runtime·rt0_go(SB), R31 MOVD R31, CTR BR (CTR) diff --git a/src/runtime/sys_linux_power64x.s b/src/runtime/sys_linux_power64x.s index fbd59a052..fb24d3e79 100644 --- a/src/runtime/sys_linux_power64x.s +++ b/src/runtime/sys_linux_power64x.s @@ -10,7 +10,7 @@ // #include "zasm_GOOS_GOARCH.h" -#include "../../cmd/ld/textflag.h" +#include "textflag.h" #define SYS_exit 1 #define SYS_read 3 @@ -44,49 +44,54 @@ #define SYS_clock_gettime 246 #define SYS_epoll_create1 315 -TEXT runtime·exit(SB),NOSPLIT,$-8-8 - MOVW 8(R1), R3 +TEXT runtime·exit(SB),NOSPLIT,$-8-4 + MOVW code+0(FP), R3 SYSCALL $SYS_exit_group RETURN -TEXT runtime·exit1(SB),NOSPLIT,$-8-8 - MOVW 8(R1), R3 +TEXT runtime·exit1(SB),NOSPLIT,$-8-4 + MOVW code+0(FP), R3 SYSCALL $SYS_exit RETURN -TEXT runtime·open(SB),NOSPLIT,$-8-16 - MOVD 8(R1), R3 - MOVW 16(R1), R4 - MOVW 20(R1), R5 +TEXT runtime·open(SB),NOSPLIT,$-8-20 + MOVD name+0(FP), R3 + MOVW mode+8(FP), R4 + MOVW perm+12(FP), R5 SYSCALL $SYS_open + MOVW R3, ret+16(FP) RETURN -TEXT runtime·close(SB),NOSPLIT,$-8-16 - MOVW 8(R1), R3 +TEXT runtime·close(SB),NOSPLIT,$-8-12 + MOVW fd+0(FP), R3 SYSCALL $SYS_close + MOVW R3, ret+8(FP) RETURN -TEXT runtime·write(SB),NOSPLIT,$-8-24 - MOVD 8(R1), R3 - MOVD 16(R1), R4 - MOVW 24(R1), R5 +TEXT runtime·write(SB),NOSPLIT,$-8-28 + MOVD fd+0(FP), R3 + MOVD p+8(FP), R4 + MOVW n+16(FP), R5 SYSCALL $SYS_write + MOVW R3, ret+24(FP) RETURN -TEXT runtime·read(SB),NOSPLIT,$-8-24 - MOVW 8(R1), R3 - MOVD 16(R1), R4 - MOVW 24(R1), R5 +TEXT runtime·read(SB),NOSPLIT,$-8-28 + MOVW fd+0(FP), R3 + MOVD p+8(FP), R4 + MOVW n+16(FP), R5 SYSCALL $SYS_read + MOVW R3, ret+24(FP) RETURN -TEXT runtime·getrlimit(SB),NOSPLIT,$-8-24 - MOVW 8(R1), R3 - MOVD 16(R1), R4 +TEXT runtime·getrlimit(SB),NOSPLIT,$-8-20 + MOVW kind+0(FP), R3 + MOVD limit+8(FP), R4 SYSCALL $SYS_ugetrlimit + MOVW R3, ret+16(FP) RETURN -TEXT runtime·usleep(SB),NOSPLIT,$-8-16 +TEXT runtime·usleep(SB),NOSPLIT,$16-4 MOVW usec+0(FP), R3 MOVD R3, R5 MOVW $1000000, R4 @@ -113,17 +118,18 @@ TEXT runtime·raise(SB),NOSPLIT,$-8 RETURN TEXT runtime·setitimer(SB),NOSPLIT,$-8-24 - MOVW 8(R1), R3 - MOVD 16(R1), R4 - MOVD 24(R1), R5 + MOVW 
mode+0(FP), R3 + MOVD new+8(FP), R4 + MOVD old+16(FP), R5 SYSCALL $SYS_setitimer RETURN -TEXT runtime·mincore(SB),NOSPLIT,$-8-24 - MOVD 8(R1), R3 - MOVD 16(R1), R4 - MOVD 24(R1), R5 +TEXT runtime·mincore(SB),NOSPLIT,$-8-28 + MOVD addr+0(FP), R3 + MOVD n+8(FP), R4 + MOVD dst+16(FP), R5 SYSCALL $SYS_mincore + MOVW R3, ret+24(FP) RETURN // func now() (sec int64, nsec int32) @@ -150,24 +156,26 @@ TEXT runtime·nanotime(SB),NOSPLIT,$16 MOVD $1000000000, R4 MULLD R4, R3 ADD R5, R3 + MOVD R3, ret+0(FP) RETURN -TEXT runtime·rtsigprocmask(SB),NOSPLIT,$-8-32 - MOVW 8(R1), R3 - MOVD 16(R1), R4 - MOVD 24(R1), R5 - MOVW 32(R1), R6 +TEXT runtime·rtsigprocmask(SB),NOSPLIT,$-8-28 + MOVW sig+0(FP), R3 + MOVD new+8(FP), R4 + MOVD old+16(FP), R5 + MOVW size+24(FP), R6 SYSCALL $SYS_rt_sigprocmask BVC 2(PC) MOVD R0, 0xf1(R0) // crash RETURN -TEXT runtime·rt_sigaction(SB),NOSPLIT,$-8-32 - MOVD 8(R1), R3 - MOVD 16(R1), R4 - MOVD 24(R1), R5 - MOVD 32(R1), R6 +TEXT runtime·rt_sigaction(SB),NOSPLIT,$-8-36 + MOVD sig+0(FP), R3 + MOVD new+8(FP), R4 + MOVD old+16(FP), R5 + MOVD size+24(FP), R6 SYSCALL $SYS_rt_sigaction + MOVW R3, ret+32(FP) RETURN #ifdef GOARCH_power64le @@ -214,28 +222,29 @@ TEXT runtime·_sigtramp(SB),NOSPLIT,$64 RETURN TEXT runtime·mmap(SB),NOSPLIT,$-8 - MOVD 8(R1), R3 - MOVD 16(R1), R4 - MOVW 24(R1), R5 - MOVW 28(R1), R6 - MOVW 32(R1), R7 - MOVW 36(R1), R8 + MOVD addr+0(FP), R3 + MOVD n+8(FP), R4 + MOVW prot+16(FP), R5 + MOVW flags+20(FP), R6 + MOVW fd+24(FP), R7 + MOVW off+28(FP), R8 SYSCALL $SYS_mmap + MOVD R3, ret+32(FP) RETURN TEXT runtime·munmap(SB),NOSPLIT,$-8 - MOVD 8(R1), R3 - MOVD 16(R1), R4 + MOVD addr+0(FP), R3 + MOVD n+8(FP), R4 SYSCALL $SYS_munmap BVC 2(PC) MOVD R0, 0xf3(R0) RETURN TEXT runtime·madvise(SB),NOSPLIT,$-8 - MOVD 8(R1), R3 - MOVD 16(R1), R4 - MOVD 24(R1), R5 + MOVD addr+0(FP), R3 + MOVD n+8(FP), R4 + MOVW flags+16(FP), R5 SYSCALL $SYS_madvise // ignore failure - maybe pages are locked RETURN @@ -243,19 +252,20 @@ TEXT runtime·madvise(SB),NOSPLIT,$-8 // int64 futex(int32 *uaddr, int32 op, int32 val, // struct timespec *timeout, int32 *uaddr2, int32 val2); TEXT runtime·futex(SB),NOSPLIT,$-8 - MOVD 8(R1), R3 - MOVW 16(R1), R4 - MOVW 20(R1), R5 - MOVD 24(R1), R6 - MOVD 32(R1), R7 - MOVW 40(R1), R8 + MOVD addr+0(FP), R3 + MOVW op+8(FP), R4 + MOVW val+12(FP), R5 + MOVD ts+16(FP), R6 + MOVD addr2+24(FP), R7 + MOVW val3+32(FP), R8 SYSCALL $SYS_futex + MOVW R3, ret+40(FP) RETURN -// int64 clone(int32 flags, void *stack, M *mp, G *gp, void (*fn)(void)); +// int64 clone(int32 flags, void *stk, M *mp, G *gp, void (*fn)(void)); TEXT runtime·clone(SB),NOSPLIT,$-8 MOVW flags+0(FP), R3 - MOVD stack+8(FP), R4 + MOVD stk+8(FP), R4 // Copy mp, gp, fn off parent stack for use by child. // Careful: Linux system call clobbers ???. @@ -273,7 +283,8 @@ TEXT runtime·clone(SB),NOSPLIT,$-8 // In parent, return. CMP R3, $0 - BEQ 2(PC) + BEQ 3(PC) + MOVW R3, ret+40(FP) RETURN // In child, on new stack. 
@@ -322,45 +333,50 @@ TEXT runtime·osyield(SB),NOSPLIT,$-8 RETURN TEXT runtime·sched_getaffinity(SB),NOSPLIT,$-8 - MOVD 8(R1), R3 - MOVD 16(R1), R4 - MOVD 24(R1), R5 + MOVD pid+0(FP), R3 + MOVD len+8(FP), R4 + MOVD buf+16(FP), R5 SYSCALL $SYS_sched_getaffinity + MOVW R3, ret+24(FP) RETURN // int32 runtime·epollcreate(int32 size); TEXT runtime·epollcreate(SB),NOSPLIT,$-8 - MOVW 8(R1), R3 + MOVW size+0(FP), R3 SYSCALL $SYS_epoll_create + MOVW R3, ret+8(FP) RETURN // int32 runtime·epollcreate1(int32 flags); TEXT runtime·epollcreate1(SB),NOSPLIT,$-8 - MOVW 8(R1), R3 + MOVW flags+0(FP), R3 SYSCALL $SYS_epoll_create1 + MOVW R3, ret+8(FP) RETURN -// int32 runtime·epollctl(int32 epfd, int32 op, int32 fd, EpollEvent *ev); +// func epollctl(epfd, op, fd int32, ev *epollEvent) int TEXT runtime·epollctl(SB),NOSPLIT,$-8 - MOVW 8(R1), R3 - MOVW 12(R1), R4 - MOVW 16(R1), R5 - MOVD 24(R1), R6 + MOVW epfd+0(FP), R3 + MOVW op+4(FP), R4 + MOVW fd+8(FP), R5 + MOVD ev+16(FP), R6 SYSCALL $SYS_epoll_ctl + MOVW R3, ret+24(FP) RETURN // int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout); TEXT runtime·epollwait(SB),NOSPLIT,$-8 - MOVW 8(R1), R3 - MOVD 16(R1), R4 - MOVW 24(R1), R5 - MOVW 28(R1), R6 + MOVW epfd+0(FP), R3 + MOVD ev+8(FP), R4 + MOVW nev+16(FP), R5 + MOVW timeout+20(FP), R6 SYSCALL $SYS_epoll_wait + MOVW R3, ret+24(FP) RETURN // void runtime·closeonexec(int32 fd); TEXT runtime·closeonexec(SB),NOSPLIT,$-8 - MOVW 8(R1), R3 // fd + MOVW fd+0(FP), R3 // fd MOVD $2, R4 // F_SETFD MOVD $1, R5 // FD_CLOEXEC SYSCALL $SYS_fcntl -- cgit v1.2.1 From 8fd8d4fc3e77101d12b836ca0ff334d20cd2ea04 Mon Sep 17 00:00:00 2001 From: Dave Cheney Date: Tue, 28 Oct 2014 09:56:33 +1100 Subject: [dev.power64] runtime: fix power64le build Brings defs_linux_power64le.h up to date with the big endian version. 
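The sync below includes the EpollEvent change, whose 64-bit payload is now declared as raw bytes ("unaligned uintptr"). A hedged Go illustration of that declare-as-bytes pattern — hypothetical struct and calls, little-endian shown, not the runtime's code:

package main

import (
	"encoding/binary"
	"fmt"
)

// epollEvent mirrors the idea in the header diff: keep the 64-bit payload
// as bytes and copy through encoding/binary instead of dereferencing a
// possibly misaligned *uint64. Sketch only.
type epollEvent struct {
	events uint32
	pad    [4]byte
	data   [8]byte // unaligned uintptr
}

func main() {
	var ev epollEvent
	binary.LittleEndian.PutUint64(ev.data[:], 0xdeadbeef)
	fmt.Printf("%#x\n", binary.LittleEndian.Uint64(ev.data[:])) // 0xdeadbeef
}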
LGTM=rsc R=rsc, austin CC=golang-codereviews https://codereview.appspot.com/161470043 --- src/runtime/defs_linux_power64le.h | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/defs_linux_power64le.h b/src/runtime/defs_linux_power64le.h index 64f145672..93742fa34 100644 --- a/src/runtime/defs_linux_power64le.h +++ b/src/runtime/defs_linux_power64le.h @@ -88,11 +88,10 @@ enum { typedef struct Sigset Sigset; typedef struct Timespec Timespec; typedef struct Timeval Timeval; -typedef struct Sigaction Sigaction; +typedef struct SigactionT SigactionT; typedef struct Siginfo Siginfo; typedef struct Itimerval Itimerval; typedef struct EpollEvent EpollEvent; -typedef uint64 Usigset; #pragma pack on @@ -109,11 +108,11 @@ struct Timeval { int64 tv_sec; int64 tv_usec; }; -struct Sigaction { +struct SigactionT { void *sa_handler; uint64 sa_flags; void *sa_restorer; - Usigset sa_mask; + uint64 sa_mask; }; struct Siginfo { int32 si_signo; @@ -129,7 +128,7 @@ struct Itimerval { struct EpollEvent { uint32 events; byte Pad_cgo_0[4]; - uint64 data; + byte data[8]; // unaligned uintptr }; @@ -144,7 +143,6 @@ enum { SA_RESTORER = 0, }; -//typedef struct Usigset Usigset; typedef struct Ptregs Ptregs; typedef struct Vreg Vreg; typedef struct SigaltstackT SigaltstackT; @@ -153,11 +151,6 @@ typedef struct Ucontext Ucontext; #pragma pack on -//struct Usigset { -// uint64 sig[1]; -//}; -//typedef Sigset Usigset; - struct Ptregs { uint64 gpr[32]; uint64 nip; @@ -202,8 +195,8 @@ struct Ucontext { uint64 uc_flags; Ucontext *uc_link; SigaltstackT uc_stack; - Usigset uc_sigmask; - Usigset __unused[15]; + uint64 uc_sigmask; + uint64 __unused[15]; Sigcontext uc_mcontext; }; -- cgit v1.2.1 From c9d3f794a30d01abe9da84bfc2be7b5303421043 Mon Sep 17 00:00:00 2001 From: Dave Cheney Date: Tue, 28 Oct 2014 11:15:48 +1100 Subject: [dev.power64] runtime: fix cas64 on power64x cas64 was jumping to the wrong offset. LGTM=minux, rsc R=rsc, austin, minux CC=golang-codereviews https://codereview.appspot.com/158710043 --- src/runtime/asm_power64x.s | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/asm_power64x.s b/src/runtime/asm_power64x.s index 76bf42094..b489f6acc 100644 --- a/src/runtime/asm_power64x.s +++ b/src/runtime/asm_power64x.s @@ -409,17 +409,19 @@ TEXT runtime·cas(SB), NOSPLIT, $0-17 MOVD p+0(FP), R3 MOVW old+8(FP), R4 MOVW new+12(FP), R5 +cas_again: SYNC LWAR (R3), R6 CMPW R6, R4 - BNE 8(PC) + BNE cas_fail STWCCC R5, (R3) - BNE -5(PC) + BNE cas_again MOVD $1, R3 SYNC ISYNC MOVB R3, ret+16(FP) RETURN +cas_fail: MOVD $0, R3 BR -5(PC) @@ -435,19 +437,21 @@ TEXT runtime·cas64(SB), NOSPLIT, $0-25 MOVD p+0(FP), R3 MOVD old+8(FP), R4 MOVD new+16(FP), R5 +cas64_again: SYNC LDAR (R3), R6 CMP R6, R4 - BNE 7(PC) + BNE cas64_fail STDCCC R5, (R3) - BNE -5(PC) + BNE cas64_again MOVD $1, R3 SYNC ISYNC MOVB R3, ret+24(FP) RETURN +cas64_fail: MOVD $0, R3 - BR -4(PC) + BR -5(PC) TEXT runtime·casuintptr(SB), NOSPLIT, $0-25 BR runtime·cas64(SB) -- cgit v1.2.1 From c9851bece1a56434cf60ad6b98694a027599d1f6 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Tue, 28 Oct 2014 15:57:33 -0400 Subject: [dev.power64] runtime: fix atomicor8 for power64x Power64 servers do not currently support sub-word size atomic memory access, so atomicor8 uses word size atomic access. However, previously atomicor8 made no attempt to align this access, resulting in errors. 
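The idea behind the fix, sketched in Go for the little-endian case — a hypothetical helper, not the runtime's code; the real fix is the power64 assembly below, which also flips the low address bits for big endian:

package main

import (
	"fmt"
	"sync/atomic"
	"unsafe"
)

// atomicOr8 is a hedged sketch of the word-aligned technique: align the
// address down to its containing 32-bit word, shift the byte into place,
// and retry with a 32-bit compare-and-swap until it sticks.
func atomicOr8(addr *uint8, v uint8) {
	p := uintptr(unsafe.Pointer(addr))
	word := (*uint32)(unsafe.Pointer(p &^ 3)) // align down to 4 bytes
	shift := (p & 3) * 8                      // little-endian byte offset
	for {
		old := atomic.LoadUint32(word)
		if atomic.CompareAndSwapUint32(word, old, old|uint32(v)<<shift) {
			return
		}
	}
}

func main() {
	var word uint32 = 0x01010101 // four bytes, guaranteed word-aligned
	m := (*[4]uint8)(unsafe.Pointer(&word))
	atomicOr8(&m[1], 0xf0)
	fmt.Printf("%#x\n", word) // 0x101f101: byte 1 became 0xf1, as the new runtime·check test asserts
}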
Fix this by aligning the pointer to a word boundary and shifting the value appropriately. Since atomicor8 is used in GC, add a test to runtime?check to make sure this doesn't break in the future. This also fixes an incorrect branch label, an incorrectly sized argument move, and adds argument names to help go vet. LGTM=rsc R=rsc, dave CC=golang-codereviews https://codereview.appspot.com/165820043 --- src/runtime/asm_power64x.s | 26 ++++++++++++++++++++------ src/runtime/runtime.c | 6 ++++++ 2 files changed, 26 insertions(+), 6 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/asm_power64x.s b/src/runtime/asm_power64x.s index b489f6acc..21220e5cb 100644 --- a/src/runtime/asm_power64x.s +++ b/src/runtime/asm_power64x.s @@ -557,13 +557,27 @@ TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16 // void runtime·atomicor8(byte volatile*, byte); TEXT runtime·atomicor8(SB), NOSPLIT, $0-9 - MOVD 0(FP), R3 - MOVD 8(FP), R4 + MOVD ptr+0(FP), R3 + MOVBZ val+8(FP), R4 + // Align ptr down to 4 bytes so we can use 32-bit load/store. + // R5 = (R3 << 0) & ~3 + RLDCR $0, R3, $~3, R5 + // Compute val shift. +#ifdef GOARCH_power64 + // Big endian. ptr = ptr ^ 3 + XOR $3, R3 +#endif + // R6 = ((ptr & 3) * 8) = (ptr << 3) & (3*8) + RLDC $3, R3, $(3*8), R6 + // Shift val for aligned ptr. R4 = val << R6 + SLD R6, R4, R4 + +atomicor8_again: SYNC - LWAR (R3), R5 - OR R4, R5 - STWCCC R5, (R3) - BNE -3(PC) + LWAR (R5), R6 + OR R4, R6 + STWCCC R6, (R5) + BNE atomicor8_again SYNC ISYNC RETURN diff --git a/src/runtime/runtime.c b/src/runtime/runtime.c index b3503fb90..d984983ce 100644 --- a/src/runtime/runtime.c +++ b/src/runtime/runtime.c @@ -185,6 +185,7 @@ runtime·check(void) float64 j, j1; byte *k, *k1; uint16* l; + byte m[4]; struct x1 { byte x; }; @@ -236,6 +237,11 @@ runtime·check(void) if(k != k1) runtime·throw("casp3"); + m[0] = m[1] = m[2] = m[3] = 0x1; + runtime·atomicor8(&m[1], 0xf0); + if (m[0] != 0x1 || m[1] != 0xf1 || m[2] != 0x1 || m[3] != 0x1) + runtime·throw("atomicor8"); + *(uint64*)&j = ~0ULL; if(j == j) runtime·throw("float64nan"); -- cgit v1.2.1 From 05d42f8a61328aa7ea55887f601286b597caf0da Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Tue, 28 Oct 2014 21:50:16 -0400 Subject: [dev.power64] cmd/5a, cmd/6a, cmd/8a, cmd/9a: make labels function-scoped I removed support for jumping between functions years ago, as part of doing the instruction layout for each function separately. Given that, it makes sense to treat labels as function-scoped. This lets each function have its own 'loop' label, for example. Makes the assembly much cleaner and removes the last reason anyone would reach for the 123(PC) form instead. Note that this is on the dev.power64 branch, but it changes all the assemblers. The change will ship in Go 1.5 (perhaps after being ported into the new assembler). Came up as part of CL 167730043. 
LGTM=r R=r CC=austin, dave, golang-codereviews, minux https://codereview.appspot.com/159670043 --- src/runtime/asm_386.s | 58 +++++++++++++++--------------- src/runtime/asm_amd64.s | 78 ++++++++++++++++++++--------------------- src/runtime/asm_amd64p32.s | 58 +++++++++++++++--------------- src/runtime/asm_arm.s | 24 ++++++------- src/runtime/asm_power64x.s | 40 ++++++++++----------- src/runtime/memclr_386.s | 46 ++++++++++++------------ src/runtime/memclr_amd64.s | 44 +++++++++++------------ src/runtime/memclr_plan9_386.s | 24 ++++++------- src/runtime/race_amd64.s | 18 +++++----- src/runtime/sys_darwin_386.s | 4 +-- src/runtime/sys_darwin_amd64.s | 4 +-- src/runtime/sys_dragonfly_386.s | 4 +-- src/runtime/sys_freebsd_386.s | 4 +-- src/runtime/sys_linux_amd64.s | 8 ++--- src/runtime/sys_linux_arm.s | 8 ++--- src/runtime/sys_nacl_386.s | 4 +-- src/runtime/sys_nacl_amd64p32.s | 1 - src/runtime/sys_nacl_arm.s | 1 - src/runtime/sys_openbsd_386.s | 4 +-- src/runtime/sys_solaris_amd64.s | 12 +++---- src/runtime/sys_windows_386.s | 12 +++---- src/runtime/sys_windows_amd64.s | 12 +++---- 22 files changed, 233 insertions(+), 235 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s index 20d3c47c9..d0b3969bd 100644 --- a/src/runtime/asm_386.s +++ b/src/runtime/asm_386.s @@ -486,11 +486,11 @@ TEXT runtime·cas64(SB), NOSPLIT, $0-21 MOVL new_hi+16(FP), CX LOCK CMPXCHG8B 0(BP) - JNZ cas64_fail + JNZ fail MOVL $1, AX MOVB AX, ret+20(FP) RET -cas64_fail: +fail: MOVL $0, AX MOVB AX, ret+20(FP) RET @@ -1342,29 +1342,29 @@ TEXT strings·IndexByte(SB),NOSPLIT,$0 // AX = 1/0/-1 TEXT runtime·cmpbody(SB),NOSPLIT,$0-0 CMPL SI, DI - JEQ cmp_allsame + JEQ allsame CMPL BX, DX MOVL DX, BP CMOVLLT BX, BP // BP = min(alen, blen) CMPL BP, $4 - JB cmp_small + JB small TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2 - JE cmp_mediumloop -cmp_largeloop: + JE mediumloop +largeloop: CMPL BP, $16 - JB cmp_mediumloop + JB mediumloop MOVOU (SI), X0 MOVOU (DI), X1 PCMPEQB X0, X1 PMOVMSKB X1, AX XORL $0xffff, AX // convert EQ to NE - JNE cmp_diff16 // branch if at least one byte is not equal + JNE diff16 // branch if at least one byte is not equal ADDL $16, SI ADDL $16, DI SUBL $16, BP - JMP cmp_largeloop + JMP largeloop -cmp_diff16: +diff16: BSFL AX, BX // index of first byte that differs XORL AX, AX MOVB (SI)(BX*1), CX @@ -1373,25 +1373,25 @@ cmp_diff16: LEAL -1(AX*2), AX // convert 1/0 to +1/-1 RET -cmp_mediumloop: +mediumloop: CMPL BP, $4 - JBE cmp_0through4 + JBE _0through4 MOVL (SI), AX MOVL (DI), CX CMPL AX, CX - JNE cmp_diff4 + JNE diff4 ADDL $4, SI ADDL $4, DI SUBL $4, BP - JMP cmp_mediumloop + JMP mediumloop -cmp_0through4: +_0through4: MOVL -4(SI)(BP*1), AX MOVL -4(DI)(BP*1), CX CMPL AX, CX - JEQ cmp_allsame + JEQ allsame -cmp_diff4: +diff4: BSWAPL AX // reverse order of bytes BSWAPL CX XORL AX, CX // find bit differences @@ -1402,37 +1402,37 @@ cmp_diff4: RET // 0-3 bytes in common -cmp_small: +small: LEAL (BP*8), CX NEGL CX - JEQ cmp_allsame + JEQ allsame // load si CMPB SI, $0xfc - JA cmp_si_high + JA si_high MOVL (SI), SI - JMP cmp_si_finish -cmp_si_high: + JMP si_finish +si_high: MOVL -4(SI)(BP*1), SI SHRL CX, SI -cmp_si_finish: +si_finish: SHLL CX, SI // same for di CMPB DI, $0xfc - JA cmp_di_high + JA di_high MOVL (DI), DI - JMP cmp_di_finish -cmp_di_high: + JMP di_finish +di_high: MOVL -4(DI)(BP*1), DI SHRL CX, DI -cmp_di_finish: +di_finish: SHLL CX, DI BSWAPL SI // reverse order of bytes BSWAPL DI XORL SI, DI // find bit differences - JEQ cmp_allsame + JEQ 
allsame BSRL DI, CX // index of highest bit difference SHRL CX, SI // move a's bit to bottom ANDL $1, SI // mask bit @@ -1441,7 +1441,7 @@ cmp_di_finish: // all the bytes in common are the same, so we just need // to compare the lengths. -cmp_allsame: +allsame: XORL AX, AX XORL CX, CX CMPL BX, DX diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index 709834180..7a0fdfa73 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -461,11 +461,11 @@ TEXT runtime·cas64(SB), NOSPLIT, $0-25 MOVQ new+16(FP), CX LOCK CMPXCHGQ CX, 0(BX) - JNZ cas64_fail + JNZ fail MOVL $1, AX MOVB AX, ret+24(FP) RET -cas64_fail: +fail: MOVL $0, AX MOVB AX, ret+24(FP) RET @@ -876,24 +876,24 @@ TEXT runtime·aeshashbody(SB),NOSPLIT,$0-32 MOVO runtime·aeskeysched+0(SB), X2 MOVO runtime·aeskeysched+16(SB), X3 CMPQ CX, $16 - JB aessmall -aesloop: + JB small +loop: CMPQ CX, $16 - JBE aesloopend + JBE loopend MOVOU (AX), X1 AESENC X2, X0 AESENC X1, X0 SUBQ $16, CX ADDQ $16, AX - JMP aesloop + JMP loop // 1-16 bytes remaining -aesloopend: +loopend: // This load may overlap with the previous load above. // We'll hash some bytes twice, but that's ok. MOVOU -16(AX)(CX*1), X1 JMP partial // 0-15 bytes -aessmall: +small: TESTQ CX, CX JE finalize // 0 bytes @@ -1036,18 +1036,18 @@ TEXT runtime·eqstring(SB),NOSPLIT,$0-33 MOVQ s1len+8(FP), AX MOVQ s2len+24(FP), BX CMPQ AX, BX - JNE different + JNE noteq MOVQ s1str+0(FP), SI MOVQ s2str+16(FP), DI CMPQ SI, DI - JEQ same + JEQ eq CALL runtime·memeqbody(SB) MOVB AX, v+32(FP) RET -same: +eq: MOVB $1, v+32(FP) RET -different: +noteq: MOVB $0, v+32(FP) RET @@ -1170,29 +1170,29 @@ TEXT runtime·cmpbytes(SB),NOSPLIT,$0-56 // AX = 1/0/-1 TEXT runtime·cmpbody(SB),NOSPLIT,$0-0 CMPQ SI, DI - JEQ cmp_allsame + JEQ allsame CMPQ BX, DX MOVQ DX, BP CMOVQLT BX, BP // BP = min(alen, blen) = # of bytes to compare CMPQ BP, $8 - JB cmp_small + JB small -cmp_loop: +loop: CMPQ BP, $16 - JBE cmp_0through16 + JBE _0through16 MOVOU (SI), X0 MOVOU (DI), X1 PCMPEQB X0, X1 PMOVMSKB X1, AX XORQ $0xffff, AX // convert EQ to NE - JNE cmp_diff16 // branch if at least one byte is not equal + JNE diff16 // branch if at least one byte is not equal ADDQ $16, SI ADDQ $16, DI SUBQ $16, BP - JMP cmp_loop + JMP loop // AX = bit mask of differences -cmp_diff16: +diff16: BSFQ AX, BX // index of first byte that differs XORQ AX, AX MOVB (SI)(BX*1), CX @@ -1202,21 +1202,21 @@ cmp_diff16: RET // 0 through 16 bytes left, alen>=8, blen>=8 -cmp_0through16: +_0through16: CMPQ BP, $8 - JBE cmp_0through8 + JBE _0through8 MOVQ (SI), AX MOVQ (DI), CX CMPQ AX, CX - JNE cmp_diff8 -cmp_0through8: + JNE diff8 +_0through8: MOVQ -8(SI)(BP*1), AX MOVQ -8(DI)(BP*1), CX CMPQ AX, CX - JEQ cmp_allsame + JEQ allsame // AX and CX contain parts of a and b that differ. 
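// In diff8 below, BSWAP puts the bytes in big-endian (numeric)
// order so memory order matches arithmetic order, XOR exposes the
// differing bits, BSR finds the highest one, and the bit b of the
// first operand at that position becomes the result 2*b-1, i.e.
// +1 or -1.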
-cmp_diff8: +diff8: BSWAPQ AX // reverse order of bytes BSWAPQ CX XORQ AX, CX @@ -1227,44 +1227,44 @@ cmp_diff8: RET // 0-7 bytes in common -cmp_small: +small: LEAQ (BP*8), CX // bytes left -> bits left NEGQ CX // - bits lift (== 64 - bits left mod 64) - JEQ cmp_allsame + JEQ allsame // load bytes of a into high bytes of AX CMPB SI, $0xf8 - JA cmp_si_high + JA si_high MOVQ (SI), SI - JMP cmp_si_finish -cmp_si_high: + JMP si_finish +si_high: MOVQ -8(SI)(BP*1), SI SHRQ CX, SI -cmp_si_finish: +si_finish: SHLQ CX, SI // load bytes of b in to high bytes of BX CMPB DI, $0xf8 - JA cmp_di_high + JA di_high MOVQ (DI), DI - JMP cmp_di_finish -cmp_di_high: + JMP di_finish +di_high: MOVQ -8(DI)(BP*1), DI SHRQ CX, DI -cmp_di_finish: +di_finish: SHLQ CX, DI BSWAPQ SI // reverse order of bytes BSWAPQ DI XORQ SI, DI // find bit differences - JEQ cmp_allsame + JEQ allsame BSRQ DI, CX // index of highest bit difference SHRQ CX, SI // move a's bit to bottom ANDQ $1, SI // mask bit LEAQ -1(SI*2), AX // 1/0 => +1/-1 RET -cmp_allsame: +allsame: XORQ AX, AX XORQ CX, CX CMPQ BX, DX @@ -1299,7 +1299,7 @@ TEXT runtime·indexbytebody(SB),NOSPLIT,$0 MOVQ SI, DI CMPQ BX, $16 - JLT indexbyte_small + JLT small // round up to first 16-byte boundary TESTQ $15, SI @@ -1357,7 +1357,7 @@ failure: RET // handle for lengths < 16 -indexbyte_small: +small: MOVQ BX, CX REPN; SCASB JZ success diff --git a/src/runtime/asm_amd64p32.s b/src/runtime/asm_amd64p32.s index 28875bc55..de3ef3a23 100644 --- a/src/runtime/asm_amd64p32.s +++ b/src/runtime/asm_amd64p32.s @@ -444,11 +444,11 @@ TEXT runtime·cas64(SB), NOSPLIT, $0-25 MOVQ new+16(FP), CX LOCK CMPXCHGQ CX, 0(BX) - JNZ cas64_fail + JNZ fail MOVL $1, AX MOVB AX, ret+24(FP) RET -cas64_fail: +fail: MOVL $0, AX MOVB AX, ret+24(FP) RET @@ -834,29 +834,29 @@ TEXT runtime·cmpbytes(SB),NOSPLIT,$0-28 // AX = 1/0/-1 TEXT runtime·cmpbody(SB),NOSPLIT,$0-0 CMPQ SI, DI - JEQ cmp_allsame + JEQ allsame CMPQ BX, DX MOVQ DX, R8 CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare CMPQ R8, $8 - JB cmp_small + JB small -cmp_loop: +loop: CMPQ R8, $16 - JBE cmp_0through16 + JBE _0through16 MOVOU (SI), X0 MOVOU (DI), X1 PCMPEQB X0, X1 PMOVMSKB X1, AX XORQ $0xffff, AX // convert EQ to NE - JNE cmp_diff16 // branch if at least one byte is not equal + JNE diff16 // branch if at least one byte is not equal ADDQ $16, SI ADDQ $16, DI SUBQ $16, R8 - JMP cmp_loop + JMP loop // AX = bit mask of differences -cmp_diff16: +diff16: BSFQ AX, BX // index of first byte that differs XORQ AX, AX ADDQ BX, SI @@ -868,23 +868,23 @@ cmp_diff16: RET // 0 through 16 bytes left, alen>=8, blen>=8 -cmp_0through16: +_0through16: CMPQ R8, $8 - JBE cmp_0through8 + JBE _0through8 MOVQ (SI), AX MOVQ (DI), CX CMPQ AX, CX - JNE cmp_diff8 -cmp_0through8: + JNE diff8 +_0through8: ADDQ R8, SI ADDQ R8, DI MOVQ -8(SI), AX MOVQ -8(DI), CX CMPQ AX, CX - JEQ cmp_allsame + JEQ allsame // AX and CX contain parts of a and b that differ. 
-cmp_diff8: +diff8: BSWAPQ AX // reverse order of bytes BSWAPQ CX XORQ AX, CX @@ -895,46 +895,46 @@ cmp_diff8: RET // 0-7 bytes in common -cmp_small: +small: LEAQ (R8*8), CX // bytes left -> bits left NEGQ CX // - bits lift (== 64 - bits left mod 64) - JEQ cmp_allsame + JEQ allsame // load bytes of a into high bytes of AX CMPB SI, $0xf8 - JA cmp_si_high + JA si_high MOVQ (SI), SI - JMP cmp_si_finish -cmp_si_high: + JMP si_finish +si_high: ADDQ R8, SI MOVQ -8(SI), SI SHRQ CX, SI -cmp_si_finish: +si_finish: SHLQ CX, SI // load bytes of b in to high bytes of BX CMPB DI, $0xf8 - JA cmp_di_high + JA di_high MOVQ (DI), DI - JMP cmp_di_finish -cmp_di_high: + JMP di_finish +di_high: ADDQ R8, DI MOVQ -8(DI), DI SHRQ CX, DI -cmp_di_finish: +di_finish: SHLQ CX, DI BSWAPQ SI // reverse order of bytes BSWAPQ DI XORQ SI, DI // find bit differences - JEQ cmp_allsame + JEQ allsame BSRQ DI, CX // index of highest bit difference SHRQ CX, SI // move a's bit to bottom ANDQ $1, SI // mask bit LEAQ -1(SI*2), AX // 1/0 => +1/-1 RET -cmp_allsame: +allsame: XORQ AX, AX XORQ CX, CX CMPQ BX, DX @@ -969,7 +969,7 @@ TEXT runtime·indexbytebody(SB),NOSPLIT,$0 MOVL SI, DI CMPL BX, $16 - JLT indexbyte_small + JLT small // round up to first 16-byte boundary TESTL $15, SI @@ -1027,7 +1027,7 @@ failure: RET // handle for lengths < 16 -indexbyte_small: +small: MOVL BX, CX REPN; SCASB JZ success diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s index 621d13187..8942b11ac 100644 --- a/src/runtime/asm_arm.s +++ b/src/runtime/asm_arm.s @@ -492,7 +492,7 @@ TEXT asmcgocall<>(SB),NOSPLIT,$0-0 MOVW g_m(g), R8 MOVW m_g0(R8), R3 CMP R3, g - BEQ asmcgocall_g0 + BEQ g0 BL gosave<>(SB) MOVW R0, R5 MOVW R3, R0 @@ -501,7 +501,7 @@ TEXT asmcgocall<>(SB),NOSPLIT,$0-0 MOVW (g_sched+gobuf_sp)(g), R13 // Now on a scheduling stack (a pthread-created stack). 
-asmcgocall_g0: +g0: SUB $24, R13 BIC $0x7, R13 // alignment for gcc ABI MOVW R4, 20(R13) // save old g @@ -736,13 +736,13 @@ TEXT runtime·memeq(SB),NOSPLIT,$-4-13 ADD R1, R3, R6 MOVW $1, R0 MOVB R0, ret+12(FP) -_next2: +loop: CMP R1, R6 RET.EQ MOVBU.P 1(R1), R4 MOVBU.P 1(R2), R5 CMP R4, R5 - BEQ _next2 + BEQ loop MOVW $0, R0 MOVB R0, ret+12(FP) @@ -765,13 +765,13 @@ TEXT runtime·eqstring(SB),NOSPLIT,$-4-17 CMP R2, R3 RET.EQ ADD R2, R0, R6 -_eqnext: +loop: CMP R2, R6 RET.EQ MOVBU.P 1(R2), R4 MOVBU.P 1(R3), R5 CMP R4, R5 - BEQ _eqnext + BEQ loop MOVB R7, v+16(FP) RET @@ -786,26 +786,26 @@ TEXT bytes·Equal(SB),NOSPLIT,$0 MOVW b_len+16(FP), R3 CMP R1, R3 // unequal lengths are not equal - B.NE _notequal + B.NE notequal MOVW a+0(FP), R0 MOVW b+12(FP), R2 ADD R0, R1 // end -_byteseq_next: +loop: CMP R0, R1 - B.EQ _equal // reached the end + B.EQ equal // reached the end MOVBU.P 1(R0), R4 MOVBU.P 1(R2), R5 CMP R4, R5 - B.EQ _byteseq_next + B.EQ loop -_notequal: +notequal: MOVW $0, R0 MOVBU R0, ret+24(FP) RET -_equal: +equal: MOVW $1, R0 MOVBU R0, ret+24(FP) RET diff --git a/src/runtime/asm_power64x.s b/src/runtime/asm_power64x.s index 21220e5cb..f77658032 100644 --- a/src/runtime/asm_power64x.s +++ b/src/runtime/asm_power64x.s @@ -699,7 +699,7 @@ TEXT runtime·memeq(SB),NOSPLIT,$-8-25 SUB $1, R3 SUB $1, R4 ADD R3, R5, R8 -_next: +loop: CMP R3, R8 BNE 4(PC) MOVD $1, R3 @@ -708,7 +708,7 @@ _next: MOVBZU 1(R3), R6 MOVBZU 1(R4), R7 CMP R6, R7 - BEQ _next + BEQ loop MOVB R0, ret+24(FP) RETURN @@ -720,14 +720,14 @@ TEXT runtime·eqstring(SB),NOSPLIT,$0-33 MOVD s1len+8(FP), R4 MOVD s2len+24(FP), R5 CMP R4, R5 - BNE str_noteq + BNE noteq MOVD s1str+0(FP), R3 MOVD s2str+16(FP), R4 SUB $1, R3 SUB $1, R4 ADD R3, R5, R8 -eq_next: +loop: CMP R3, R8 BNE 4(PC) MOVD $1, R3 @@ -736,8 +736,8 @@ eq_next: MOVBZU 1(R3), R6 MOVBZU 1(R4), R7 CMP R6, R7 - BEQ eq_next -str_noteq: + BEQ loop +noteq: MOVB R0, ret+32(FP) RETURN @@ -747,7 +747,7 @@ TEXT bytes·Equal(SB),NOSPLIT,$0-49 MOVD b_len+32(FP), R4 CMP R3, R4 // unequal lengths are not equal - BNE _notequal + BNE noteq MOVD a+0(FP), R5 MOVD b+24(FP), R6 @@ -755,19 +755,19 @@ TEXT bytes·Equal(SB),NOSPLIT,$0-49 SUB $1, R6 ADD R5, R3 // end-1 -_byteseq_next: +loop: CMP R5, R3 - BEQ _equal // reached the end + BEQ equal // reached the end MOVBZU 1(R5), R4 MOVBZU 1(R6), R7 CMP R4, R7 - BEQ _byteseq_next + BEQ loop -_notequal: +noteq: MOVBZ R0, ret+48(FP) RETURN -_equal: +equal: MOVD $1, R3 MOVBZ R3, ret+48(FP) RETURN @@ -780,18 +780,18 @@ TEXT bytes·IndexByte(SB),NOSPLIT,$0-40 SUB $1, R3 ADD R3, R4 // end-1 -_index_loop: +loop: CMP R3, R4 - BEQ _index_notfound + BEQ notfound MOVBZU 1(R3), R7 CMP R7, R5 - BNE _index_loop + BNE loop SUB R6, R3 // remove base MOVD R3, ret+32(FP) RETURN -_index_notfound: +notfound: MOVD $-1, R3 MOVD R3, ret+32(FP) RETURN @@ -804,18 +804,18 @@ TEXT strings·IndexByte(SB),NOSPLIT,$0 SUB $1, R3 ADD R3, R4 // end-1 -_index2_loop: +loop: CMP R3, R4 - BEQ _index2_notfound + BEQ notfound MOVBZU 1(R3), R7 CMP R7, R5 - BNE _index2_loop + BNE loop SUB R6, R3 // remove base MOVD R3, ret+24(FP) RETURN -_index2_notfound: +notfound: MOVD $-1, R3 MOVD R3, ret+24(FP) RETURN diff --git a/src/runtime/memclr_386.s b/src/runtime/memclr_386.s index 1520aea2e..3f20b69c8 100644 --- a/src/runtime/memclr_386.s +++ b/src/runtime/memclr_386.s @@ -15,31 +15,31 @@ TEXT runtime·memclr(SB), NOSPLIT, $0-8 XORL AX, AX // MOVOU seems always faster than REP STOSL. 
-clr_tail: +tail: TESTL BX, BX - JEQ clr_0 + JEQ _0 CMPL BX, $2 - JBE clr_1or2 + JBE _1or2 CMPL BX, $4 - JBE clr_3or4 + JBE _3or4 CMPL BX, $8 - JBE clr_5through8 + JBE _5through8 CMPL BX, $16 - JBE clr_9through16 + JBE _9through16 TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2 JEQ nosse2 PXOR X0, X0 CMPL BX, $32 - JBE clr_17through32 + JBE _17through32 CMPL BX, $64 - JBE clr_33through64 + JBE _33through64 CMPL BX, $128 - JBE clr_65through128 + JBE _65through128 CMPL BX, $256 - JBE clr_129through256 + JBE _129through256 // TODO: use branch table and BSR to make this just a single dispatch -clr_loop: +loop: MOVOU X0, 0(DI) MOVOU X0, 16(DI) MOVOU X0, 32(DI) @@ -59,40 +59,40 @@ clr_loop: SUBL $256, BX ADDL $256, DI CMPL BX, $256 - JAE clr_loop - JMP clr_tail + JAE loop + JMP tail -clr_1or2: +_1or2: MOVB AX, (DI) MOVB AX, -1(DI)(BX*1) RET -clr_0: +_0: RET -clr_3or4: +_3or4: MOVW AX, (DI) MOVW AX, -2(DI)(BX*1) RET -clr_5through8: +_5through8: MOVL AX, (DI) MOVL AX, -4(DI)(BX*1) RET -clr_9through16: +_9through16: MOVL AX, (DI) MOVL AX, 4(DI) MOVL AX, -8(DI)(BX*1) MOVL AX, -4(DI)(BX*1) RET -clr_17through32: +_17through32: MOVOU X0, (DI) MOVOU X0, -16(DI)(BX*1) RET -clr_33through64: +_33through64: MOVOU X0, (DI) MOVOU X0, 16(DI) MOVOU X0, -32(DI)(BX*1) MOVOU X0, -16(DI)(BX*1) RET -clr_65through128: +_65through128: MOVOU X0, (DI) MOVOU X0, 16(DI) MOVOU X0, 32(DI) @@ -102,7 +102,7 @@ clr_65through128: MOVOU X0, -32(DI)(BX*1) MOVOU X0, -16(DI)(BX*1) RET -clr_129through256: +_129through256: MOVOU X0, (DI) MOVOU X0, 16(DI) MOVOU X0, 32(DI) @@ -126,5 +126,5 @@ nosse2: REP STOSL ANDL $3, BX - JNE clr_tail + JNE tail RET diff --git a/src/runtime/memclr_amd64.s b/src/runtime/memclr_amd64.s index 94a2c7f23..ec24f1db2 100644 --- a/src/runtime/memclr_amd64.s +++ b/src/runtime/memclr_amd64.s @@ -15,30 +15,30 @@ TEXT runtime·memclr(SB), NOSPLIT, $0-16 XORQ AX, AX // MOVOU seems always faster than REP STOSQ. -clr_tail: +tail: TESTQ BX, BX - JEQ clr_0 + JEQ _0 CMPQ BX, $2 - JBE clr_1or2 + JBE _1or2 CMPQ BX, $4 - JBE clr_3or4 + JBE _3or4 CMPQ BX, $8 - JBE clr_5through8 + JBE _5through8 CMPQ BX, $16 - JBE clr_9through16 + JBE _9through16 PXOR X0, X0 CMPQ BX, $32 - JBE clr_17through32 + JBE _17through32 CMPQ BX, $64 - JBE clr_33through64 + JBE _33through64 CMPQ BX, $128 - JBE clr_65through128 + JBE _65through128 CMPQ BX, $256 - JBE clr_129through256 + JBE _129through256 // TODO: use branch table and BSR to make this just a single dispatch // TODO: for really big clears, use MOVNTDQ. 
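// The loop below clears 256 bytes per iteration with sixteen
// MOVOU stores, then jumps back to the size-class dispatch above
// to finish the remaining tail.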
-clr_loop: +loop: MOVOU X0, 0(DI) MOVOU X0, 16(DI) MOVOU X0, 32(DI) @@ -58,38 +58,38 @@ clr_loop: SUBQ $256, BX ADDQ $256, DI CMPQ BX, $256 - JAE clr_loop - JMP clr_tail + JAE loop + JMP tail -clr_1or2: +_1or2: MOVB AX, (DI) MOVB AX, -1(DI)(BX*1) RET -clr_0: +_0: RET -clr_3or4: +_3or4: MOVW AX, (DI) MOVW AX, -2(DI)(BX*1) RET -clr_5through8: +_5through8: MOVL AX, (DI) MOVL AX, -4(DI)(BX*1) RET -clr_9through16: +_9through16: MOVQ AX, (DI) MOVQ AX, -8(DI)(BX*1) RET -clr_17through32: +_17through32: MOVOU X0, (DI) MOVOU X0, -16(DI)(BX*1) RET -clr_33through64: +_33through64: MOVOU X0, (DI) MOVOU X0, 16(DI) MOVOU X0, -32(DI)(BX*1) MOVOU X0, -16(DI)(BX*1) RET -clr_65through128: +_65through128: MOVOU X0, (DI) MOVOU X0, 16(DI) MOVOU X0, 32(DI) @@ -99,7 +99,7 @@ clr_65through128: MOVOU X0, -32(DI)(BX*1) MOVOU X0, -16(DI)(BX*1) RET -clr_129through256: +_129through256: MOVOU X0, (DI) MOVOU X0, 16(DI) MOVOU X0, 32(DI) diff --git a/src/runtime/memclr_plan9_386.s b/src/runtime/memclr_plan9_386.s index b4b671f77..50f327b4e 100644 --- a/src/runtime/memclr_plan9_386.s +++ b/src/runtime/memclr_plan9_386.s @@ -10,40 +10,40 @@ TEXT runtime·memclr(SB), NOSPLIT, $0-8 MOVL n+4(FP), BX XORL AX, AX -clr_tail: +tail: TESTL BX, BX - JEQ clr_0 + JEQ _0 CMPL BX, $2 - JBE clr_1or2 + JBE _1or2 CMPL BX, $4 - JBE clr_3or4 + JBE _3or4 CMPL BX, $8 - JBE clr_5through8 + JBE _5through8 CMPL BX, $16 - JBE clr_9through16 + JBE _9through16 MOVL BX, CX SHRL $2, CX REP STOSL ANDL $3, BX - JNE clr_tail + JNE tail RET -clr_1or2: +_1or2: MOVB AX, (DI) MOVB AX, -1(DI)(BX*1) RET -clr_0: +_0: RET -clr_3or4: +_3or4: MOVW AX, (DI) MOVW AX, -2(DI)(BX*1) RET -clr_5through8: +_5through8: MOVL AX, (DI) MOVL AX, -4(DI)(BX*1) RET -clr_9through16: +_9through16: MOVL AX, (DI) MOVL AX, 4(DI) MOVL AX, -8(DI)(BX*1) diff --git a/src/runtime/race_amd64.s b/src/runtime/race_amd64.s index bdea28c7c..15b18ff8f 100644 --- a/src/runtime/race_amd64.s +++ b/src/runtime/race_amd64.s @@ -140,20 +140,20 @@ TEXT racecalladdr<>(SB), NOSPLIT, $0-0 MOVQ g_racectx(R14), RARG0 // goroutine context // Check that addr is within [arenastart, arenaend) or within [noptrdata, enoptrbss). 
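// Everything else (addresses the race runtime has no shadow
// memory for) falls through to ret below without entering the
// race detector.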
CMPQ RARG1, runtime·racearenastart(SB) - JB racecalladdr_data + JB data CMPQ RARG1, runtime·racearenaend(SB) - JB racecalladdr_call -racecalladdr_data: + JB call +data: MOVQ $runtime·noptrdata(SB), R13 CMPQ RARG1, R13 - JB racecalladdr_ret + JB ret MOVQ $runtime·enoptrbss(SB), R13 CMPQ RARG1, R13 - JAE racecalladdr_ret -racecalladdr_call: + JAE ret +call: MOVQ AX, AX // w/o this 6a miscompiles this function JMP racecall<>(SB) -racecalladdr_ret: +ret: RET // func runtime·racefuncenter(pc uintptr) @@ -335,9 +335,9 @@ TEXT racecall<>(SB), NOSPLIT, $0-0 MOVQ SP, R12 // callee-saved, preserved across the CALL MOVQ m_g0(R13), R10 CMPQ R10, R14 - JE racecall_cont // already on g0 + JE call // already on g0 MOVQ (g_sched+gobuf_sp)(R10), SP -racecall_cont: +call: ANDQ $~15, SP // alignment for gcc ABI CALL AX MOVQ R12, SP diff --git a/src/runtime/sys_darwin_386.s b/src/runtime/sys_darwin_386.s index a961c71a8..3bf8b1d41 100644 --- a/src/runtime/sys_darwin_386.s +++ b/src/runtime/sys_darwin_386.s @@ -248,7 +248,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$40 MOVL BX, 0(SP) MOVL $runtime·badsignal(SB), AX CALL AX - JMP sigtramp_ret + JMP ret // save g MOVL DI, 20(SP) @@ -275,7 +275,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$40 MOVL 20(SP), DI MOVL DI, g(CX) -sigtramp_ret: +ret: // call sigreturn MOVL context+16(FP), CX MOVL style+4(FP), BX diff --git a/src/runtime/sys_darwin_amd64.s b/src/runtime/sys_darwin_amd64.s index bd397d72a..8a8928e06 100644 --- a/src/runtime/sys_darwin_amd64.s +++ b/src/runtime/sys_darwin_amd64.s @@ -211,7 +211,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$64 MOVL DX, 0(SP) MOVQ $runtime·badsignal(SB), AX CALL AX - JMP sigtramp_ret + JMP ret // save g MOVQ R10, 48(SP) @@ -233,7 +233,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$64 MOVQ 48(SP), R10 MOVQ R10, g(BX) -sigtramp_ret: +ret: // call sigreturn MOVL $(0x2000000+184), AX // sigreturn(ucontext, infostyle) MOVQ 32(SP), DI // saved ucontext diff --git a/src/runtime/sys_dragonfly_386.s b/src/runtime/sys_dragonfly_386.s index 161eaec19..71ece9ecb 100644 --- a/src/runtime/sys_dragonfly_386.s +++ b/src/runtime/sys_dragonfly_386.s @@ -217,7 +217,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$44 MOVL BX, 0(SP) MOVL $runtime·badsignal(SB), AX CALL AX - JMP sigtramp_ret + JMP ret // save g MOVL DI, 20(SP) @@ -243,7 +243,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$44 MOVL 20(SP), BX MOVL BX, g(CX) -sigtramp_ret: +ret: // call sigreturn MOVL context+8(FP), AX MOVL $0, 0(SP) // syscall gap diff --git a/src/runtime/sys_freebsd_386.s b/src/runtime/sys_freebsd_386.s index 2c40fc433..66d03c27d 100644 --- a/src/runtime/sys_freebsd_386.s +++ b/src/runtime/sys_freebsd_386.s @@ -197,7 +197,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$44 MOVL BX, 0(SP) MOVL $runtime·badsignal(SB), AX CALL AX - JMP sigtramp_ret + JMP ret // save g MOVL DI, 20(SP) @@ -223,7 +223,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$44 MOVL 20(SP), BX MOVL BX, g(CX) -sigtramp_ret: +ret: // call sigreturn MOVL context+8(FP), AX MOVL $0, 0(SP) // syscall gap diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s index 33b91e872..d8d86ffad 100644 --- a/src/runtime/sys_linux_amd64.s +++ b/src/runtime/sys_linux_amd64.s @@ -115,7 +115,7 @@ TEXT time·now(SB),NOSPLIT,$16 // That leaves 104 for the gettime code to use. Hope that's enough! 
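// Below: use the clock_gettime vDSO entry when it was located at
// startup (nanosecond precision); otherwise fall back to the
// gettimeofday vDSO call, which only yields microseconds.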
MOVQ runtime·__vdso_clock_gettime_sym(SB), AX CMPQ AX, $0 - JEQ fallback_gtod + JEQ fallback MOVL $0, DI // CLOCK_REALTIME LEAQ 0(SP), SI CALL AX @@ -124,7 +124,7 @@ TEXT time·now(SB),NOSPLIT,$16 MOVQ AX, sec+0(FP) MOVL DX, nsec+8(FP) RET -fallback_gtod: +fallback: LEAQ 0(SP), DI MOVQ $0, SI MOVQ runtime·__vdso_gettimeofday_sym(SB), AX @@ -141,7 +141,7 @@ TEXT runtime·nanotime(SB),NOSPLIT,$16 // See comment above in time.now. MOVQ runtime·__vdso_clock_gettime_sym(SB), AX CMPQ AX, $0 - JEQ fallback_gtod_nt + JEQ fallback MOVL $1, DI // CLOCK_MONOTONIC LEAQ 0(SP), SI CALL AX @@ -153,7 +153,7 @@ TEXT runtime·nanotime(SB),NOSPLIT,$16 ADDQ DX, AX MOVQ AX, ret+0(FP) RET -fallback_gtod_nt: +fallback: LEAQ 0(SP), DI MOVQ $0, SI MOVQ runtime·__vdso_gettimeofday_sym(SB), AX diff --git a/src/runtime/sys_linux_arm.s b/src/runtime/sys_linux_arm.s index bd285f399..033a03642 100644 --- a/src/runtime/sys_linux_arm.s +++ b/src/runtime/sys_linux_arm.s @@ -373,20 +373,20 @@ TEXT cas<>(SB),NOSPLIT,$0 TEXT runtime·cas(SB),NOSPLIT,$0 MOVW ptr+0(FP), R2 MOVW old+4(FP), R0 -casagain: +loop: MOVW new+8(FP), R1 BL cas<>(SB) - BCC cascheck + BCC check MOVW $1, R0 MOVB R0, ret+12(FP) RET -cascheck: +check: // Kernel lies; double-check. MOVW ptr+0(FP), R2 MOVW old+4(FP), R0 MOVW 0(R2), R3 CMP R0, R3 - BEQ casagain + BEQ loop MOVW $0, R0 MOVB R0, ret+12(FP) RET diff --git a/src/runtime/sys_nacl_386.s b/src/runtime/sys_nacl_386.s index 47985f31f..16cd721d9 100644 --- a/src/runtime/sys_nacl_386.s +++ b/src/runtime/sys_nacl_386.s @@ -293,7 +293,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$0 MOVL $0, 0(SP) MOVL $runtime·badsignal(SB), AX CALL AX - JMP sigtramp_ret + JMP ret // save g MOVL DI, 20(SP) @@ -317,7 +317,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$0 MOVL 20(SP), BX MOVL BX, g(CX) -sigtramp_ret: +ret: // Enable exceptions again. NACL_SYSCALL(SYS_exception_clear_flag) diff --git a/src/runtime/sys_nacl_amd64p32.s b/src/runtime/sys_nacl_amd64p32.s index c30c2a893..06a0dc5dd 100644 --- a/src/runtime/sys_nacl_amd64p32.s +++ b/src/runtime/sys_nacl_amd64p32.s @@ -338,7 +338,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$80 MOVL 20(SP), BX MOVL BX, g(CX) -sigtramp_ret: // Enable exceptions again. NACL_SYSCALL(SYS_exception_clear_flag) diff --git a/src/runtime/sys_nacl_arm.s b/src/runtime/sys_nacl_arm.s index d354ab483..432deadf4 100644 --- a/src/runtime/sys_nacl_arm.s +++ b/src/runtime/sys_nacl_arm.s @@ -269,7 +269,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$80 // restore g MOVW 20(R13), g -sigtramp_ret: // Enable exceptions again. NACL_SYSCALL(SYS_exception_clear_flag) diff --git a/src/runtime/sys_openbsd_386.s b/src/runtime/sys_openbsd_386.s index 5cda7768a..b1ae5ecee 100644 --- a/src/runtime/sys_openbsd_386.s +++ b/src/runtime/sys_openbsd_386.s @@ -186,7 +186,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$44 MOVL BX, 0(SP) MOVL $runtime·badsignal(SB), AX CALL AX - JMP sigtramp_ret + JMP ret // save g MOVL DI, 20(SP) @@ -212,7 +212,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$44 MOVL 20(SP), BX MOVL BX, g(CX) -sigtramp_ret: +ret: // call sigreturn MOVL context+8(FP), AX MOVL $0, 0(SP) // syscall gap diff --git a/src/runtime/sys_solaris_amd64.s b/src/runtime/sys_solaris_amd64.s index 0ebdab6ee..3981893b0 100644 --- a/src/runtime/sys_solaris_amd64.s +++ b/src/runtime/sys_solaris_amd64.s @@ -287,24 +287,24 @@ TEXT runtime·usleep1(SB),NOSPLIT,$0 // Execute call on m->g0. 
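// usleep1 must run its callee on the system stack. If TLS, g, or
// m is missing, this is not a Go-managed thread and the call is
// made on the current stack; if already on m->g0 the call is
// direct; otherwise SP is switched to m->g0's saved stack around
// the call.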
get_tls(R15) CMPQ R15, $0 - JE usleep1_noswitch + JE noswitch MOVQ g(R15), R13 CMPQ R13, $0 - JE usleep1_noswitch + JE noswitch MOVQ g_m(R13), R13 CMPQ R13, $0 - JE usleep1_noswitch + JE noswitch // TODO(aram): do something about the cpu profiler here. MOVQ m_g0(R13), R14 CMPQ g(R15), R14 - JNE usleep1_switch + JNE switch // executing on m->g0 already CALL AX RET -usleep1_switch: +switch: // Switch to m->g0 stack and back. MOVQ (g_sched+gobuf_sp)(R14), R14 MOVQ SP, -8(R14) @@ -313,7 +313,7 @@ usleep1_switch: MOVQ 0(SP), SP RET -usleep1_noswitch: +noswitch: // Not a Go-managed thread. Do not switch stack. CALL AX RET diff --git a/src/runtime/sys_windows_386.s b/src/runtime/sys_windows_386.s index 932fe9dd2..13fb5bdc9 100644 --- a/src/runtime/sys_windows_386.s +++ b/src/runtime/sys_windows_386.s @@ -106,7 +106,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$0-0 MOVL g_m(DX), BX MOVL m_g0(BX), BX CMPL DX, BX - JEQ sigtramp_g0 + JEQ g0 // switch to the g0 stack get_tls(BP) @@ -123,7 +123,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$0-0 MOVL SP, 36(DI) MOVL DI, SP -sigtramp_g0: +g0: MOVL 0(CX), BX // ExceptionRecord* MOVL 4(CX), CX // Context* MOVL BX, 0(SP) @@ -383,12 +383,12 @@ TEXT runtime·usleep1(SB),NOSPLIT,$0 MOVL m_g0(BP), SI CMPL g(CX), SI - JNE usleep1_switch + JNE switch // executing on m->g0 already CALL AX - JMP usleep1_ret + JMP ret -usleep1_switch: +switch: // Switch to m->g0 stack and back. MOVL (g_sched+gobuf_sp)(SI), SI MOVL SP, -4(SI) @@ -396,7 +396,7 @@ usleep1_switch: CALL AX MOVL 0(SP), SP -usleep1_ret: +ret: get_tls(CX) MOVL g(CX), BP MOVL g_m(BP), BP diff --git a/src/runtime/sys_windows_amd64.s b/src/runtime/sys_windows_amd64.s index e6190ce68..8b95f6d6c 100644 --- a/src/runtime/sys_windows_amd64.s +++ b/src/runtime/sys_windows_amd64.s @@ -138,7 +138,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$0-0 MOVQ g_m(DX), BX MOVQ m_g0(BX), BX CMPQ DX, BX - JEQ sigtramp_g0 + JEQ g0 // switch to g0 stack get_tls(BP) @@ -157,7 +157,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$0-0 MOVQ SP, 104(DI) MOVQ DI, SP -sigtramp_g0: +g0: MOVQ 0(CX), BX // ExceptionRecord* MOVQ 8(CX), CX // Context* MOVQ BX, 0(SP) @@ -407,12 +407,12 @@ TEXT runtime·usleep1(SB),NOSPLIT,$0 MOVQ m_g0(R13), R14 CMPQ g(R15), R14 - JNE usleep1_switch + JNE switch // executing on m->g0 already CALL AX - JMP usleep1_ret + JMP ret -usleep1_switch: +switch: // Switch to m->g0 stack and back. MOVQ (g_sched+gobuf_sp)(R14), R14 MOVQ SP, -8(R14) @@ -420,7 +420,7 @@ usleep1_switch: CALL AX MOVQ 0(SP), SP -usleep1_ret: +ret: MOVQ $0, m_libcallsp(R13) RET -- cgit v1.2.1 From f65bb028c5ceb4fb213b103f24a85f17cf67ac39 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Thu, 30 Oct 2014 10:16:03 -0400 Subject: [dev.garbage] cmd/gc, runtime: implement write barriers in terms of writebarrierptr This CL implements the many multiword write barriers by calling writebarrierptr, so that only writebarrierptr needs the actual barrier. In lieu of an actual barrier, writebarrierptr checks that the value being copied is not a small non-zero integer. This is enough to shake out bugs where the barrier is being called when it should not (for non-pointer values). It also found a few tests in sync/atomic that were being too clever. This CL adds a write barrier for the memory moved during the builtin copy function, which I forgot when inserting barriers for Go 1.4. This CL re-enables some write barriers that were disabled for Go 1.4. 
Those were disabled because it is possible to change the generated code so that they are unnecessary most of the time, but we have not changed the generated code yet. For safety they must be enabled. None of this is terribly efficient. We are aiming for correct first. LGTM=rlh R=rlh CC=golang-codereviews https://codereview.appspot.com/168770043 --- src/runtime/malloc.go | 33 ++++++++ src/runtime/mgc0.go | 104 ++++++++++++++++++++------ src/runtime/mgc0.h | 2 +- src/runtime/wbfat.go | 190 +++++++++++++++++++++++++++++++++++++++++++++++ src/runtime/wbfat_gen.go | 41 ++++++++++ 5 files changed, 346 insertions(+), 24 deletions(-) create mode 100644 src/runtime/wbfat.go create mode 100644 src/runtime/wbfat_gen.go (limited to 'src/runtime') diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 020f87a7a..56f4f7cd7 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -245,6 +245,8 @@ func mallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer { masksize = masksize * pointersPerByte / 8 // 4 bits per word masksize++ // unroll flag in the beginning if masksize > maxGCMask && typ.gc[1] != 0 { + // write barriers have not been updated to deal with this case yet. + gothrow("maxGCMask too small for now") // If the mask is too large, unroll the program directly // into the GC bitmap. It's 7 times slower than copying // from the pre-unrolled mask, but saves 1/16 of type size @@ -344,6 +346,37 @@ marked: return x } +func loadPtrMask(typ *_type) []uint8 { + var ptrmask *uint8 + nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize + if typ.kind&kindGCProg != 0 { + masksize := nptr + if masksize%2 != 0 { + masksize *= 2 // repeated + } + masksize = masksize * pointersPerByte / 8 // 4 bits per word + masksize++ // unroll flag in the beginning + if masksize > maxGCMask && typ.gc[1] != 0 { + // write barriers have not been updated to deal with this case yet. + gothrow("maxGCMask too small for now") + } + ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0]))) + // Check whether the program is already unrolled + // by checking if the unroll flag byte is set + maskword := uintptr(atomicloadp(unsafe.Pointer(ptrmask))) + if *(*uint8)(unsafe.Pointer(&maskword)) == 0 { + mp := acquirem() + mp.ptrarg[0] = unsafe.Pointer(typ) + onM(unrollgcprog_m) + releasem(mp) + } + ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte + } else { + ptrmask = (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask + } + return (*[1 << 30]byte)(unsafe.Pointer(ptrmask))[:(nptr+1)/2] +} + // implementation of new builtin func newobject(typ *_type) unsafe.Pointer { flags := 0 diff --git a/src/runtime/mgc0.go b/src/runtime/mgc0.go index 3a7204b54..75678c522 100644 --- a/src/runtime/mgc0.go +++ b/src/runtime/mgc0.go @@ -83,54 +83,112 @@ func bgsweep() { } } +const ( + _PoisonGC = 0xf969696969696969 & ^uintptr(0) + _PoisonStack = 0x6868686868686868 & ^uintptr(0) +) + // NOTE: Really dst *unsafe.Pointer, src unsafe.Pointer, // but if we do that, Go inserts a write barrier on *dst = src. 
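// For now the "barrier" below is a checker, not a recorder: a
// non-zero src below _PageSize or equal to a poison pattern can
// never be a valid pointer, so hitting the throw means a barrier
// was generated for, or called on, a non-pointer slot.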
//go:nosplit func writebarrierptr(dst *uintptr, src uintptr) { + if src != 0 && (src < _PageSize || src == _PoisonGC || src == _PoisonStack) { + onM(func() { gothrow("bad pointer in write barrier") }) + } *dst = src } //go:nosplit func writebarrierstring(dst *[2]uintptr, src [2]uintptr) { - dst[0] = src[0] + writebarrierptr(&dst[0], src[0]) dst[1] = src[1] } //go:nosplit func writebarrierslice(dst *[3]uintptr, src [3]uintptr) { - dst[0] = src[0] + writebarrierptr(&dst[0], src[0]) dst[1] = src[1] dst[2] = src[2] } //go:nosplit func writebarrieriface(dst *[2]uintptr, src [2]uintptr) { - dst[0] = src[0] - dst[1] = src[1] + writebarrierptr(&dst[0], src[0]) + writebarrierptr(&dst[1], src[1]) } -//go:nosplit -func writebarrierfat2(dst *[2]uintptr, _ *byte, src [2]uintptr) { - dst[0] = src[0] - dst[1] = src[1] -} +//go:generate go run wbfat_gen.go -- wbfat.go +// +// The above line generates multiword write barriers for +// all the combinations of ptr+scalar up to four words. +// The implementations are written to wbfat.go. //go:nosplit -func writebarrierfat3(dst *[3]uintptr, _ *byte, src [3]uintptr) { - dst[0] = src[0] - dst[1] = src[1] - dst[2] = src[2] -} - -//go:nosplit -func writebarrierfat4(dst *[4]uintptr, _ *byte, src [4]uintptr) { - dst[0] = src[0] - dst[1] = src[1] - dst[2] = src[2] - dst[3] = src[3] +func writebarrierfat(typ *_type, dst, src unsafe.Pointer) { + mask := loadPtrMask(typ) + nptr := typ.size / ptrSize + for i := uintptr(0); i < nptr; i += 2 { + bits := mask[i/2] + if (bits>>2)&_BitsMask == _BitsPointer { + writebarrierptr((*uintptr)(dst), *(*uintptr)(src)) + } else { + *(*uintptr)(dst) = *(*uintptr)(src) + } + dst = add(dst, ptrSize) + src = add(src, ptrSize) + if i+1 == nptr { + break + } + bits >>= 4 + if (bits>>2)&_BitsMask == _BitsPointer { + writebarrierptr((*uintptr)(dst), *(*uintptr)(src)) + } else { + *(*uintptr)(dst) = *(*uintptr)(src) + } + dst = add(dst, ptrSize) + src = add(src, ptrSize) + } } //go:nosplit -func writebarrierfat(typ *_type, dst, src unsafe.Pointer) { - memmove(dst, src, typ.size) +func writebarriercopy(typ *_type, dst, src slice) int { + n := dst.len + if n > src.len { + n = src.len + } + if n == 0 { + return 0 + } + dstp := unsafe.Pointer(dst.array) + srcp := unsafe.Pointer(src.array) + + if uintptr(srcp) < uintptr(dstp) && uintptr(srcp)+uintptr(n)*typ.size > uintptr(dstp) { + // Overlap with src before dst. + // Copy backward, being careful not to move dstp/srcp + // out of the array they point into. + dstp = add(dstp, uintptr(n-1)*typ.size) + srcp = add(srcp, uintptr(n-1)*typ.size) + i := uint(0) + for { + writebarrierfat(typ, dstp, srcp) + if i++; i >= n { + break + } + dstp = add(dstp, -typ.size) + srcp = add(srcp, -typ.size) + } + } else { + // Copy forward, being careful not to move dstp/srcp + // out of the array they point into. + i := uint(0) + for { + writebarrierfat(typ, dstp, srcp) + if i++; i >= n { + break + } + dstp = add(dstp, typ.size) + srcp = add(srcp, typ.size) + } + } + return int(n) } diff --git a/src/runtime/mgc0.h b/src/runtime/mgc0.h index 64f818914..16fbe4665 100644 --- a/src/runtime/mgc0.h +++ b/src/runtime/mgc0.h @@ -56,7 +56,7 @@ enum { BitsEface = 3, // 64 bytes cover objects of size 1024/512 on 64/32 bits, respectively. - MaxGCMask = 64, + MaxGCMask = 65536, // TODO(rsc): change back to 64 }; // Bits in per-word bitmap. 
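The generated file below provides a barrier for every pointer/scalar layout of two- to four-word values. The binary suffix encodes the layout, one digit per word with 1 marking a pointer slot, so a three-word value laid out (pointer, scalar, pointer) would be copied via (illustrative call; the middle argument is ignored):

	var dst, src [3]uintptr
	writebarrierfat101(&dst, nil, src) // ptr, scalar, ptr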
diff --git a/src/runtime/wbfat.go b/src/runtime/wbfat.go new file mode 100644 index 000000000..75c58b26b --- /dev/null +++ b/src/runtime/wbfat.go @@ -0,0 +1,190 @@ +// generated by wbfat_gen.go; use go generate + +package runtime + +//go:nosplit +func writebarrierfat01(dst *[2]uintptr, _ *byte, src [2]uintptr) { + dst[0] = src[0] + writebarrierptr(&dst[1], src[1]) +} + +//go:nosplit +func writebarrierfat10(dst *[2]uintptr, _ *byte, src [2]uintptr) { + writebarrierptr(&dst[0], src[0]) + dst[1] = src[1] +} + +//go:nosplit +func writebarrierfat11(dst *[2]uintptr, _ *byte, src [2]uintptr) { + writebarrierptr(&dst[0], src[0]) + writebarrierptr(&dst[1], src[1]) +} + +//go:nosplit +func writebarrierfat001(dst *[3]uintptr, _ *byte, src [3]uintptr) { + dst[0] = src[0] + dst[1] = src[1] + writebarrierptr(&dst[2], src[2]) +} + +//go:nosplit +func writebarrierfat010(dst *[3]uintptr, _ *byte, src [3]uintptr) { + dst[0] = src[0] + writebarrierptr(&dst[1], src[1]) + dst[2] = src[2] +} + +//go:nosplit +func writebarrierfat011(dst *[3]uintptr, _ *byte, src [3]uintptr) { + dst[0] = src[0] + writebarrierptr(&dst[1], src[1]) + writebarrierptr(&dst[2], src[2]) +} + +//go:nosplit +func writebarrierfat100(dst *[3]uintptr, _ *byte, src [3]uintptr) { + writebarrierptr(&dst[0], src[0]) + dst[1] = src[1] + dst[2] = src[2] +} + +//go:nosplit +func writebarrierfat101(dst *[3]uintptr, _ *byte, src [3]uintptr) { + writebarrierptr(&dst[0], src[0]) + dst[1] = src[1] + writebarrierptr(&dst[2], src[2]) +} + +//go:nosplit +func writebarrierfat110(dst *[3]uintptr, _ *byte, src [3]uintptr) { + writebarrierptr(&dst[0], src[0]) + writebarrierptr(&dst[1], src[1]) + dst[2] = src[2] +} + +//go:nosplit +func writebarrierfat111(dst *[3]uintptr, _ *byte, src [3]uintptr) { + writebarrierptr(&dst[0], src[0]) + writebarrierptr(&dst[1], src[1]) + writebarrierptr(&dst[2], src[2]) +} + +//go:nosplit +func writebarrierfat0001(dst *[4]uintptr, _ *byte, src [4]uintptr) { + dst[0] = src[0] + dst[1] = src[1] + dst[2] = src[2] + writebarrierptr(&dst[3], src[3]) +} + +//go:nosplit +func writebarrierfat0010(dst *[4]uintptr, _ *byte, src [4]uintptr) { + dst[0] = src[0] + dst[1] = src[1] + writebarrierptr(&dst[2], src[2]) + dst[3] = src[3] +} + +//go:nosplit +func writebarrierfat0011(dst *[4]uintptr, _ *byte, src [4]uintptr) { + dst[0] = src[0] + dst[1] = src[1] + writebarrierptr(&dst[2], src[2]) + writebarrierptr(&dst[3], src[3]) +} + +//go:nosplit +func writebarrierfat0100(dst *[4]uintptr, _ *byte, src [4]uintptr) { + dst[0] = src[0] + writebarrierptr(&dst[1], src[1]) + dst[2] = src[2] + dst[3] = src[3] +} + +//go:nosplit +func writebarrierfat0101(dst *[4]uintptr, _ *byte, src [4]uintptr) { + dst[0] = src[0] + writebarrierptr(&dst[1], src[1]) + dst[2] = src[2] + writebarrierptr(&dst[3], src[3]) +} + +//go:nosplit +func writebarrierfat0110(dst *[4]uintptr, _ *byte, src [4]uintptr) { + dst[0] = src[0] + writebarrierptr(&dst[1], src[1]) + writebarrierptr(&dst[2], src[2]) + dst[3] = src[3] +} + +//go:nosplit +func writebarrierfat0111(dst *[4]uintptr, _ *byte, src [4]uintptr) { + dst[0] = src[0] + writebarrierptr(&dst[1], src[1]) + writebarrierptr(&dst[2], src[2]) + writebarrierptr(&dst[3], src[3]) +} + +//go:nosplit +func writebarrierfat1000(dst *[4]uintptr, _ *byte, src [4]uintptr) { + writebarrierptr(&dst[0], src[0]) + dst[1] = src[1] + dst[2] = src[2] + dst[3] = src[3] +} + +//go:nosplit +func writebarrierfat1001(dst *[4]uintptr, _ *byte, src [4]uintptr) { + writebarrierptr(&dst[0], src[0]) + dst[1] = src[1] + dst[2] = src[2] + 
writebarrierptr(&dst[3], src[3]) +} + +//go:nosplit +func writebarrierfat1010(dst *[4]uintptr, _ *byte, src [4]uintptr) { + writebarrierptr(&dst[0], src[0]) + dst[1] = src[1] + writebarrierptr(&dst[2], src[2]) + dst[3] = src[3] +} + +//go:nosplit +func writebarrierfat1011(dst *[4]uintptr, _ *byte, src [4]uintptr) { + writebarrierptr(&dst[0], src[0]) + dst[1] = src[1] + writebarrierptr(&dst[2], src[2]) + writebarrierptr(&dst[3], src[3]) +} + +//go:nosplit +func writebarrierfat1100(dst *[4]uintptr, _ *byte, src [4]uintptr) { + writebarrierptr(&dst[0], src[0]) + writebarrierptr(&dst[1], src[1]) + dst[2] = src[2] + dst[3] = src[3] +} + +//go:nosplit +func writebarrierfat1101(dst *[4]uintptr, _ *byte, src [4]uintptr) { + writebarrierptr(&dst[0], src[0]) + writebarrierptr(&dst[1], src[1]) + dst[2] = src[2] + writebarrierptr(&dst[3], src[3]) +} + +//go:nosplit +func writebarrierfat1110(dst *[4]uintptr, _ *byte, src [4]uintptr) { + writebarrierptr(&dst[0], src[0]) + writebarrierptr(&dst[1], src[1]) + writebarrierptr(&dst[2], src[2]) + dst[3] = src[3] +} + +//go:nosplit +func writebarrierfat1111(dst *[4]uintptr, _ *byte, src [4]uintptr) { + writebarrierptr(&dst[0], src[0]) + writebarrierptr(&dst[1], src[1]) + writebarrierptr(&dst[2], src[2]) + writebarrierptr(&dst[3], src[3]) +} diff --git a/src/runtime/wbfat_gen.go b/src/runtime/wbfat_gen.go new file mode 100644 index 000000000..78d5b6271 --- /dev/null +++ b/src/runtime/wbfat_gen.go @@ -0,0 +1,41 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +package main + +import ( + "flag" + "fmt" + "log" + "os" +) + +func main() { + flag.Parse() + if flag.NArg() > 0 { + f, err := os.Create(flag.Arg(0)) + if err != nil { + log.Fatal(err) + } + os.Stdout = f + } + fmt.Printf("// generated by wbfat_gen.go; use go generate\n\n") + fmt.Printf("package runtime\n") + for i := uint(2); i <= 4; i++ { + for j := 1; j < 1< Date: Tue, 4 Nov 2014 13:31:34 -0500 Subject: [dev.garbage] runtime: Add gc mark verification pass. This adds an independent mark phase to the GC that can be used to verify the the default concurrent mark phase has found all reachable objects. It uses the upper 2 bits of the boundary nibble to encode the mark leaving the lower bits to encode the boundary and the normal mark bit. 
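In Go terms, the encoding used for the upper two bits of the boundary nibble is (a sketch; the values follow from the existing BitsScalar/BitsPointer encoding plus the mgc0.h change in this CL):

	const (
		BitsScalarMarked  = 0 // 00: scalar, marked
		BitsScalar        = 1 // 01: scalar, not marked
		BitsPointer       = 2 // 10: pointer, not marked
		BitsPointerMarked = 3 // 11: pointer, marked
		BitsCheckMarkXor  = 1 // XOR toggles marked <-> not marked
	)

	func ischeckmarked(tbits byte) bool {
		return tbits == BitsScalarMarked || tbits == BitsPointerMarked
	}

The pass is driven by the new GCcheckmarkenable/GCcheckmarkdisable hooks in malloc.go: with checkmarking enabled, gogc re-runs the mark with this encoding after the normal collection and throws if it reaches an object the concurrent mark left unmarked.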
LGTM=rsc R=rsc CC=golang-codereviews https://codereview.appspot.com/167130043 --- src/runtime/heapdump.c | 15 +- src/runtime/malloc.go | 28 +++- src/runtime/mgc0.c | 428 +++++++++++++++++++++++++++++++++++++++---------- src/runtime/mgc0.h | 8 +- src/runtime/stack.c | 40 +---- src/runtime/stubs.go | 6 + 6 files changed, 382 insertions(+), 143 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/heapdump.c b/src/runtime/heapdump.c index 71da419f1..5ac37803b 100644 --- a/src/runtime/heapdump.c +++ b/src/runtime/heapdump.c @@ -259,20 +259,7 @@ dumpbv(BitVector *bv, uintptr offset) dumpint(offset + i / BitsPerPointer * PtrSize); break; case BitsMultiWord: - switch(bv->bytedata[(i+BitsPerPointer)/8] >> (i+BitsPerPointer)%8 & 3) { - default: - runtime·throw("unexpected garbage collection bits"); - case BitsIface: - dumpint(FieldKindIface); - dumpint(offset + i / BitsPerPointer * PtrSize); - i += BitsPerPointer; - break; - case BitsEface: - dumpint(FieldKindEface); - dumpint(offset + i / BitsPerPointer * PtrSize); - i += BitsPerPointer; - break; - } + runtime·throw("bumpbv unexpected garbage collection bits"); } } } diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 56f4f7cd7..274bae9a3 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -306,6 +306,18 @@ func mallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer { } } marked: + + // GCmarkterminate allocates black + // All slots hold nil so no scanning is needed. + // This may be racing with GC so do it atomically if there can be + // a race marking the bit. + if gcphase == _GCmarktermination { + mp := acquirem() + mp.ptrarg[0] = x + onM(gcmarknewobject_m) + releasem(mp) + } + if raceenabled { racemalloc(x, size) } @@ -478,8 +490,12 @@ func gogc(force int32) { // Do a concurrent heap scan before we stop the world. onM(gcscan_m) + onM(gcinstallmarkwb_m) onM(stoptheworld) - + // onM(starttheworld) + // mark from roots scanned in gcscan_m. startthework when write barrier works + onM(gcmark_m) + // onM(stoptheworld) if mp != acquirem() { gothrow("gogc: rescheduled") } @@ -510,6 +526,8 @@ func gogc(force int32) { onM(gc_m) } + onM(gccheckmark_m) + // all done mp.gcing = 0 semrelease(&worldsema) @@ -524,6 +542,14 @@ func gogc(force int32) { } } +func GCcheckmarkenable() { + onM(gccheckmarkenable_m) +} + +func GCcheckmarkdisable() { + onM(gccheckmarkdisable_m) +} + // GC runs a garbage collection. func GC() { gogc(2) diff --git a/src/runtime/mgc0.c b/src/runtime/mgc0.c index f76d7c05c..e283f6ee8 100644 --- a/src/runtime/mgc0.c +++ b/src/runtime/mgc0.c @@ -155,12 +155,16 @@ extern int32 runtime·gcpercent; // uint32 runtime·worldsema = 1; +// It is a bug if bits does not have bitBoundary set but +// there are still some cases where this happens related +// to stack spans. typedef struct Markbits Markbits; struct Markbits { byte *bitp; // pointer to the byte holding xbits byte shift; // bits xbits needs to be shifted to get bits byte xbits; // byte holding all the bits from *bitp - byte bits; // bits relevant to corresponding slot. + byte bits; // mark and boundary bits relevant to corresponding slot. + byte tbits; // pointer||scalar bits relevant to corresponding slot. 
}; extern byte runtime·data[]; @@ -204,6 +208,11 @@ static bool inheap(byte*); static bool shaded(byte*); static void shade(byte*); static void slottombits(byte*, Markbits*); +static void atomicxor8(byte*, byte); +static bool ischeckmarked(Markbits*); +static bool ismarked(Markbits*); +static void clearcheckmarkbits(void); +static void clearcheckmarkbitsspan(MSpan*); void runtime·bgsweep(void); void runtime·finishsweep_m(void); @@ -228,6 +237,28 @@ struct WorkData { }; WorkData runtime·work; +// To help debug the concurrent GC we remark with the world +// stopped ensuring that any object encountered has their normal +// mark bit set. To do this we use an orthogonal bit +// pattern to indicate the object is marked. The following pattern +// uses the upper two bits in the object's bounday nibble. +// 01: scalar not marked +// 10: pointer not marked +// 11: pointer marked +// 00: scalar marked +// Xoring with 01 will flip the pattern from marked to unmarked and vica versa. +// The higher bit is 1 for pointers and 0 for scalars, whether the object +// is marked or not. +// The first nibble no longer holds the bitsDead pattern indicating that the +// there are no more pointers in the object. This information is held +// in the second nibble. + +// When marking an object if the bool checkmark is true one uses the above +// encoding, otherwise one uses the bitMarked bit in the lower two bits +// of the nibble. +static bool checkmark = false; +static bool gccheckmarkenable = false; + // Is address b in the known heap. If it doesn't have a valid gcmap // returns false. For example pointers into stacks will return false. static bool @@ -261,11 +292,14 @@ slottombits(byte *obj, Markbits *mbits) mbits->shift = (off % wordsPerBitmapByte) * gcBits; mbits->xbits = *mbits->bitp; mbits->bits = (mbits->xbits >> mbits->shift) & bitMask; + mbits->tbits = (mbits->xbits >> mbits->shift) & bitPtrMask; } // b is a pointer into the heap. // Find the start of the object refered to by b. // Set mbits to the associated bits from the bit map. +// If b is not a valid heap object return nil and +// undefined values in mbits. static byte* objectstart(byte *b, Markbits *mbits) { @@ -277,42 +311,27 @@ objectstart(byte *b, Markbits *mbits) obj = (byte*)((uintptr)b&~(PtrSize-1)); for(;;) { slottombits(obj, mbits); - if(mbits->bits&bitBoundary == bitBoundary) + if((mbits->bits&bitBoundary) == bitBoundary) break; - + // Not a beginning of a block, consult span table to find the block beginning. k = (uintptr)obj>>PageShift; x = k; x -= (uintptr)runtime·mheap.arena_start>>PageShift; s = runtime·mheap.spans[x]; if(s == nil || k < s->start || obj >= s->limit || s->state != MSpanInUse){ - if(s->state == MSpanStack) - break; // This is legit. - - // The following is catching some bugs left over from - // us not being rigerous about what data structures are - // hold valid pointers and different parts of the system - // considering different structures as roots. For example - // if there is a pointer into a stack that is left in - // a global data structure but that part of the runtime knows that - // those structures will be reinitialized before they are - // reused. Unfortunately the GC believes these roots are valid. - // Typically a stack gets moved and only the structures that part of - // the system knows are alive are updated. The span is freed - // after the stack copy and the pointer is still alive. 
This - // check is catching that bug but for now we will not throw, - // instead we will simply break out of this routine and depend - // on the caller to recognize that this pointer is not a valid - // heap pointer. I leave the code that catches the bug so that once - // resolved we can turn this check back on and throw. - - //runtime·printf("Runtime: Span weird: obj=%p, k=%p", obj, k); - //if (s == nil) - // runtime·printf(" s=nil\n"); - //else - // runtime·printf(" s->start=%p s->limit=%p, s->state=%d\n", s->start*PageSize, s->limit, s->state); - //runtime·throw("Blowup on weird span"); - break; // We are not in a real block throw?? + if(s != nil && s->state == MSpanStack) { + return nil; // This is legit. + } + + // The following ensures that we are rigorous about what data + // structures hold valid pointers + runtime·printf("runtime:objectstart Span weird: obj=%p, k=%p", obj, k); + if (s == nil) + runtime·printf(" s=nil\n"); + else + runtime·printf(" s->start=%p s->limit=%p, s->state=%d\n", s->start*PageSize, s->limit, s->state); + runtime·throw("objectstart: bad span"); } p = (byte*)((uintptr)s->start<sizeclass != 0) { @@ -333,6 +352,75 @@ objectstart(byte *b, Markbits *mbits) return obj; } +// Slow for now as we serialize this, since this is on a debug path +// speed is not critical at this point. +static Mutex xorlock; +static void +atomicxor8(byte *src, byte val) +{ + runtime·lock(&xorlock); + *src = *src^val; + runtime·unlock(&xorlock); +} + +// Mark using the checkmark scheme. +void +docheckmark(Markbits *mbits) +{ + // xor 01 moves 01(scalar unmarked) to 00(scalar marked) + // and 10(pointer unmarked) to 11(pointer marked) + atomicxor8(mbits->bitp, BitsCheckMarkXor<shift<<2); + return; +} + +// In the default scheme does mbits refer to a marked object. +static bool +ismarked(Markbits *mbits) +{ + if((mbits->bits&bitBoundary) != bitBoundary) + runtime·throw("ismarked: bits should have boundary bit set"); + return (mbits->bits&bitMarked) == bitMarked; +} + +// In the checkmark scheme does mbits refer to a marked object. +static bool +ischeckmarked(Markbits *mbits) +{ + if((mbits->bits&bitBoundary) != bitBoundary) + runtime·printf("runtime:ischeckmarked: bits should have boundary bit set\n"); + return mbits->tbits==BitsScalarMarked || mbits->tbits==BitsPointerMarked; +} + +// When in GCmarkterminate phase we allocate black. +void +runtime·gcmarknewobject_m(void) +{ + Markbits mbits; + byte *obj; + + if(runtime·gcphase != GCmarktermination) + runtime·throw("marking new object while not in mark termination phase"); + if(checkmark) // The world should be stopped so this should not happen. + runtime·throw("gcmarknewobject called while doing checkmark"); + + obj = g->m->ptrarg[0]; + slottombits((byte*)((uintptr)obj & (PtrSize-1)), &mbits); + + if((mbits.bits&bitMarked) != 0) + return; + + // Each byte of GC bitmap holds info for two words. + // If the current object is larger than two words, or if the object is one word + // but the object it shares the byte with is already marked, + // then all the possible concurrent updates are trying to set the same bit, + // so we can use a non-atomic update. + if((mbits.xbits&(bitMask|(bitMask<bits&bitMarked) != 0) - return wbuf; + if(checkmark) { + if(!ismarked(mbits)) { + runtime·printf("runtime:greyobject: checkmarks finds unexpected unmarked object obj=%p, mbits->bits=%x, *mbits->bitp=%x\n", obj, mbits->bits, *mbits->bitp); + } + if(ischeckmarked(mbits)) + return wbuf; + docheckmark(mbits); + } else { + // If marked we have nothing to do. 
+ if((mbits->bits&bitMarked) != 0) + return wbuf; + + // Each byte of GC bitmap holds info for two words. + // If the current object is larger than two words, or if the object is one word + // but the object it shares the byte with is already marked, + // then all the possible concurrent updates are trying to set the same bit, + // so we can use a non-atomic update. + if((mbits->xbits&(bitMask|(bitMask<bitp = mbits->xbits | (bitMarked<shift); + else + runtime·atomicor8(mbits->bitp, bitMarked<shift); + } - // Each byte of GC bitmap holds info for two words. - // If the current object is larger than two words, or if the object is one word - // but the object it shares the byte with is already marked, - // then all the possible concurrent updates are trying to set the same bit, - // so we can use a non-atomic update. - if((mbits->xbits&(bitMask|(bitMask<bitp = mbits->xbits | (bitMarked<shift); - else - runtime·atomicor8(mbits->bitp, bitMarked<shift); - - if(((mbits->xbits>>(mbits->shift+2))&BitsMask) == BitsDead) + if (!checkmark && (((mbits->xbits>>(mbits->shift+2))&BitsMask) == BitsDead)) return wbuf; // noscan object // Queue the obj for scanning. The PREFETCH(obj) logic has been removed but @@ -398,6 +495,8 @@ scanobject(byte *b, uintptr n, byte *ptrmask, Workbuf *wbuf) // Find bits of the beginning of the object. if(ptrmask == nil) { b = objectstart(b, &mbits); + if(b == nil) + return wbuf; ptrbitp = mbits.bitp; //arena_start - off/wordsPerBitmapByte - 1; } for(i = 0; i < n; i += PtrSize) { @@ -407,6 +506,7 @@ scanobject(byte *b, uintptr n, byte *ptrmask, Workbuf *wbuf) bits = (ptrmask[(i/PtrSize)/4]>>(((i/PtrSize)%4)*BitsPerPointer))&BitsMask; } else { // Check if we have reached end of span. + // n is an overestimate of the size of the object. if((((uintptr)b+i)%PageSize) == 0 && runtime·mheap.spans[(b-arena_start)>>PageShift] != runtime·mheap.spans[(b+i-arena_start)>>PageShift]) break; @@ -414,7 +514,7 @@ scanobject(byte *b, uintptr n, byte *ptrmask, Workbuf *wbuf) bits = *ptrbitp; if(wordsPerBitmapByte != 2) runtime·throw("alg doesn't work for wordsPerBitmapByte != 2"); - j = ((uintptr)b+i)/PtrSize & 1; + j = ((uintptr)b+i)/PtrSize & 1; // j indicates upper nibble or lower nibble bits >>= gcBits*j; if(i == 0) bits &= ~bitBoundary; @@ -422,15 +522,19 @@ scanobject(byte *b, uintptr n, byte *ptrmask, Workbuf *wbuf) if((bits&bitBoundary) != 0 && i != 0) break; // reached beginning of the next object - bits = (bits>>2)&BitsMask; - if(bits == BitsDead) + bits = (bits&bitPtrMask)>>2; // bits refer to the type bits. + + if(i != 0 && bits == BitsDead) // BitsDead in first nibble not valid during checkmark break; // reached no-scan part of the object - } + } - if(bits <= BitsScalar) // Bits Scalar || BitsDead - continue; - if(bits != BitsPointer) { - runtime·printf("gc bits=%x\n", bits); + if(bits <= BitsScalar) // Bits Scalar || + // BitsDead || // default encoding + // BitsScalarMarked // checkmark encoding + continue; + + if((bits&BitsPointer) != BitsPointer) { + runtime·printf("gc checkmark=%d, b=%p ptrmask=%p, mbits.bitp=%p, mbits.xbits=%x, bits=%x\n", checkmark, b, ptrmask, mbits.bitp, mbits.xbits, bits); runtime·throw("unexpected garbage collection bits"); } @@ -442,6 +546,11 @@ scanobject(byte *b, uintptr n, byte *ptrmask, Workbuf *wbuf) // Mark the object. return some important bits. // We we combine the following two rotines we don't have to pass mbits or obj around. 
obj = objectstart(obj, &mbits); + // In the case of the span being MSpan_Stack mbits is useless and will not have + // the boundary bit set. It does not need to be greyed since it will be + // scanned using the scan stack mechanism. + if(obj == nil) + continue; wbuf = greyobject(obj, &mbits, wbuf); } return wbuf; @@ -548,7 +657,8 @@ markroot(ParFor *desc, uint32 i) s = runtime·work.spans[spanidx]; if(s->state != MSpanInUse) continue; - if(s->sweepgen != sg) { + if(!checkmark && s->sweepgen != sg) { + // sweepgen was updated (+2) during non-checkmark GC pass runtime·printf("sweep %d %d\n", s->sweepgen, sg); runtime·throw("gc: unswept span"); } @@ -616,9 +726,6 @@ markroot(ParFor *desc, uint32 i) } } -// wblock is used for creating new empty work buffer blocks. -static Mutex wblock; - // Get an empty work buffer off the work.empty list, // allocating new buffers as needed. static Workbuf* @@ -636,10 +743,8 @@ getempty(Workbuf *b) runtime·throw("getempty: workbuffer not empty, b->nobj not 0"); } if(b == nil) { - runtime·lock(&wblock); b = runtime·persistentalloc(sizeof(*b), CacheLineSize, &mstats.gc_sys); b->nobj = 0; - runtime·unlock(&wblock); } return b; } @@ -692,17 +797,6 @@ putpartial(Workbuf *b) } } -void -runtime·gcworkbuffree(Workbuf *b) -{ - if(b == nil) - return; - if(b->nobj == 0) - putempty(b); - else - putfull(b); -} - // Get a full work buffer off the work.full or a partially // filled one off the work.partial list. If nothing is available // wait until all the other gc helpers have finished and then @@ -906,11 +1000,18 @@ static bool shaded(byte *slot) { Markbits mbits; + byte *valid; if(!inheap(slot)) // non-heap slots considered grey return true; - objectstart(slot, &mbits); + valid = objectstart(slot, &mbits); + if(valid == nil) + return true; + + if(checkmark) + return ischeckmarked(&mbits); + return (mbits.bits&bitMarked) != 0; } @@ -930,7 +1031,9 @@ shade(byte *b) // Mark the object, return some important bits. // If we combine the following two rotines we don't have to pass mbits or obj around. obj = objectstart(b, &mbits); - wbuf = greyobject(obj, &mbits, wbuf); // augments the wbuf + if(obj != nil) + wbuf = greyobject(obj, &mbits, wbuf); // augments the wbuf + putpartial(wbuf); return; } @@ -969,6 +1072,7 @@ runtime·gcphasework(G *gp) scanstack(gp); break; case GCmark: + break; case GCmarktermination: scanstack(gp); // All available mark work will be emptied before returning. @@ -1104,6 +1208,9 @@ runtime·MSpan_Sweep(MSpan *s, bool preserve) Special *special, **specialp, *y; bool res, sweepgenset; + if(checkmark) + runtime·throw("MSpan_Sweep: checkmark only runs in STW and after the sweep."); + // It's critical that we enter this function with preemption disabled, // GC must not start while we are in the middle of this function. if(g->m->locks == 0 && g->m->mallocing == 0 && g != g->m->g0) @@ -1547,6 +1654,134 @@ runtime·gc_m(void) runtime·casgstatus(gp, Gwaiting, Grunning); } +// Similar to clearcheckmarkbits but works on a single span. +// It preforms two tasks. +// 1. When used before the checkmark phase it converts BitsDead (00) to bitsScalar (01) +// for nibbles with the BoundaryBit set. +// 2. When used after the checkmark phase it converts BitsPointerMark (11) to BitsPointer 10 and +// BitsScalarMark (00) to BitsScalar (01), thus clearing the checkmark mark encoding. +// For the second case it is possible to restore the BitsDead pattern but since +// clearmark is a debug tool performance has a lower priority than simplicity. 
+// The span is MSpanInUse and the world is stopped.
+static void
+clearcheckmarkbitsspan(MSpan *s)
+{
+	int32 cl, n, npages, i;
+	uintptr size, off, step;
+	byte *p, *bitp, *arena_start, b;
+
+	if(!checkmark)
+		runtime·throw("clearcheckmarkbitsspan: checkmark not set.");
+
+	if(s->state != MSpanInUse) {
+		runtime·printf("runtime:clearcheckmarkbitsspan: state=%d\n",
+			s->state);
+		runtime·throw("clearcheckmarkbitsspan: bad span state");
+	}
+	arena_start = runtime·mheap.arena_start;
+	cl = s->sizeclass;
+	size = s->elemsize;
+	if(cl == 0) {
+		n = 1;
+	} else {
+		// Chunk full of small blocks.
+		npages = runtime·class_to_allocnpages[cl];
+		n = (npages << PageShift) / size;
+	}
+
+	// MSpan_Sweep has similar code but instead of overloading and
+	// complicating that routine we do a simpler walk here.
+	// Sweep through n objects of given size starting at p.
+	// This thread owns the span now, so it can manipulate
+	// the block bitmap without atomic operations.
+	p = (byte*)(s->start << PageShift);
+	// Find bits for the beginning of the span.
+	off = (uintptr*)p - (uintptr*)arena_start;
+	bitp = arena_start - off/wordsPerBitmapByte - 1;
+	step = size/(PtrSize*wordsPerBitmapByte);
+
+	if(step == 0) {
+		// updating top and bottom nibbles, all boundaries
+		for(i=0; i<n; i+=2, bitp--) {
+			if((*bitp & bitBoundary) != bitBoundary)
+				runtime·throw("missing bitBoundary");
+			b = (*bitp & bitPtrMask)>>2;
+			if(b == BitsScalarMarked || b == BitsPointerMarked)
+				*bitp ^= BitsCheckMarkXor<<2;
+
+			if(((*bitp>>gcBits) & bitBoundary) != bitBoundary)
+				runtime·throw("missing bitBoundary");
+			b = ((*bitp>>gcBits) & bitPtrMask)>>2;
+			if(b == BitsScalarMarked || b == BitsPointerMarked)
+				*bitp ^= BitsCheckMarkXor<<(2+gcBits);
+		}
+	} else {
+		// updating bottom nibble for first word of each object
+		for(i=0; i<n; i++, bitp -= step) {
+			if((*bitp & bitBoundary) != bitBoundary)
+				runtime·throw("missing bitBoundary");
+			b = (*bitp & bitPtrMask)>>2;
+			if(b == BitsScalarMarked || b == BitsPointerMarked)
+				*bitp ^= BitsCheckMarkXor<<2;
+		}
+	}
+}
+
+// clearcheckmarkbits performs two tasks.
+// 1. When used before the checkmark phase it converts BitsDead (00) to BitsScalar (01)
+//    for nibbles with the BoundaryBit set.
+// 2. When used after the checkmark phase it converts BitsPointerMark (11) to BitsPointer (10) and
+//    BitsScalarMark (00) to BitsScalar (01), thus clearing the checkmark mark encoding.
+// This is a bit expensive but preserves the BitsDead encoding during the normal marking.
+// BitsDead remains valid for every nibble except the ones with BitsBoundary set.
+static void
+clearcheckmarkbits(void)
+{
+	uint32 idx;
+	MSpan *s;
+	for(idx=0; idx<runtime·work.nspan; idx++) {
+		s = runtime·work.spans[idx];
+		if(s->state == MSpanInUse) {
+			clearcheckmarkbitsspan(s);
+		}
+	}
+}
+
+// Called from malloc.go using onM.
+// The world is stopped. Rerun the scan and mark phases
+// using the bitMarkedCheck bit instead of the
+// bitMarked bit. If the marking encounters a
+// bitMarked bit that is not set then we throw.
+void
+runtime·gccheckmark_m(void)
+{
+	if(!gccheckmarkenable)
+		return;
+
+	if(checkmark)
+		runtime·throw("gccheckmark_m, entered with checkmark already true.");
+
+	checkmark = true;
+	clearcheckmarkbits(); // Converts BitsDead to BitsScalar.
+	runtime·gc_m();
+	// Work done; fix up the GC bitmap to remove the checkmark bits.
+	clearcheckmarkbits();
+	checkmark = false;
+}
+
+// gccheckmarkenable is initially false
+void
+runtime·gccheckmarkenable_m(void)
+{
+	gccheckmarkenable = true;
+}
+
+void
+runtime·gccheckmarkdisable_m(void)
+{
+	gccheckmarkenable = false;
+}
+
 void
 runtime·finishsweep_m(void)
 {
@@ -1631,6 +1866,21 @@ runtime·gcscan_m(void)
 	// Let the g that called us continue to run.
 }
 
+// Mark all objects that are known about.
+void +runtime·gcmark_m(void) +{ + scanblock(nil, 0, nil); +} + +// For now this must be followed by a stoptheworld and a starttheworld to ensure +// all go routines see the new barrier. +void +runtime·gcinstallmarkwb_m(void) +{ + runtime·gcphase = GCmark; +} + static void gc(struct gc_args *args) { @@ -1652,7 +1902,8 @@ gc(struct gc_args *args) if(runtime·debug.gctrace) t1 = runtime·nanotime(); - runtime·finishsweep_m(); + if(!checkmark) + runtime·finishsweep_m(); // skip during checkmark debug phase. // Cache runtime·mheap.allspans in work.spans to avoid conflicts with // resizing/freeing allspans. @@ -1676,7 +1927,7 @@ gc(struct gc_args *args) runtime·work.nwait = 0; runtime·work.ndone = 0; runtime·work.nproc = runtime·gcprocs(); - runtime·gcphase = GCmark; + runtime·gcphase = GCmarktermination; // World is stopped so allglen will not change. for(i = 0; i < runtime·allglen; i++) { @@ -1774,21 +2025,24 @@ gc(struct gc_args *args) runtime·sweep.spanidx = 0; runtime·unlock(&runtime·mheap.lock); - if(ConcurrentSweep && !args->eagersweep) { - runtime·lock(&runtime·gclock); - if(runtime·sweep.g == nil) - runtime·sweep.g = runtime·newproc1(&bgsweepv, nil, 0, 0, gc); - else if(runtime·sweep.parked) { - runtime·sweep.parked = false; - runtime·ready(runtime·sweep.g); + // Start the sweep after the checkmark phase if there is one. + if(!gccheckmarkenable || checkmark) { + if(ConcurrentSweep && !args->eagersweep) { + runtime·lock(&runtime·gclock); + if(runtime·sweep.g == nil) + runtime·sweep.g = runtime·newproc1(&bgsweepv, nil, 0, 0, gc); + else if(runtime·sweep.parked) { + runtime·sweep.parked = false; + runtime·ready(runtime·sweep.g); + } + runtime·unlock(&runtime·gclock); + } else { + // Sweep all spans eagerly. + while(runtime·sweepone() != -1) + runtime·sweep.npausesweep++; + // Do an additional mProf_GC, because all 'free' events are now real as well. + runtime·mProf_GC(); } - runtime·unlock(&runtime·gclock); - } else { - // Sweep all spans eagerly. - while(runtime·sweepone() != -1) - runtime·sweep.npausesweep++; - // Do an additional mProf_GC, because all 'free' events are now real as well. - runtime·mProf_GC(); } runtime·mProf_GC(); diff --git a/src/runtime/mgc0.h b/src/runtime/mgc0.h index 16fbe4665..519d7206e 100644 --- a/src/runtime/mgc0.h +++ b/src/runtime/mgc0.h @@ -45,8 +45,12 @@ enum { // If you change these, also change scanblock. // scanblock does "if(bits == BitsScalar || bits == BitsDead)" as "if(bits <= BitsScalar)". BitsDead = 0, - BitsScalar = 1, - BitsPointer = 2, + BitsScalar = 1, // 01 + BitsPointer = 2, // 10 + BitsCheckMarkXor = 1, // 10 + BitsScalarMarked = BitsScalar ^ BitsCheckMarkXor, // 00 + BitsPointerMarked = BitsPointer ^ BitsCheckMarkXor, // 11 + BitsMultiWord = 3, // BitsMultiWord will be set for the first word of a multi-word item. // When it is set, one of the following will be set for the second word. 
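An aside before the next file in this diff: the enum above is the heart of the checkmark encoding. XORing the two type bits with BitsCheckMarkXor (01) turns an unmarked scalar (01) into a marked one (00) and an unmarked pointer (10) into a marked one (11). A minimal standalone sketch of that encoding, in ordinary Go rather than runtime C; the lower-case names merely mirror the constants above and are not the runtime's identifiers:

package main

import "fmt"

const (
	bitsDead          = 0 // 00
	bitsScalar        = 1 // 01
	bitsPointer       = 2 // 10
	bitsCheckMarkXor  = 1 // xor with 01 toggles the checkmark state
	bitsScalarMarked  = bitsScalar ^ bitsCheckMarkXor  // 00, same encoding as bitsDead
	bitsPointerMarked = bitsPointer ^ bitsCheckMarkXor // 11
)

// checkmarked reports whether a 2-bit type value carries a checkmark.
func checkmarked(tbits uint8) bool {
	return tbits == bitsScalarMarked || tbits == bitsPointerMarked
}

func main() {
	for _, t := range []uint8{bitsScalar, bitsPointer} {
		m := t ^ bitsCheckMarkXor // set the checkmark
		fmt.Printf("%02b -> %02b checkmarked=%v\n", t, m, checkmarked(m))
	}
}

Note that bitsScalarMarked shares the 00 encoding with bitsDead; the checkmark-bugs commit below deals with exactly that ambiguity.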
diff --git a/src/runtime/stack.c b/src/runtime/stack.c index f18171ea5..fb23cc1c3 100644 --- a/src/runtime/stack.c +++ b/src/runtime/stack.c @@ -382,8 +382,6 @@ adjustpointers(byte **scanp, BitVector *bv, AdjustInfo *adjinfo, Func *f) uintptr delta; int32 num, i; byte *p, *minp, *maxp; - Type *t; - Itab *tab; minp = (byte*)adjinfo->old.lo; maxp = (byte*)adjinfo->old.hi; @@ -415,43 +413,7 @@ adjustpointers(byte **scanp, BitVector *bv, AdjustInfo *adjinfo, Func *f) } break; case BitsMultiWord: - switch(bv->bytedata[(i+1) / (8 / BitsPerPointer)] >> ((i+1) * BitsPerPointer & 7) & 3) { - default: - runtime·throw("unexpected garbage collection bits"); - case BitsEface: - t = (Type*)scanp[i]; - if(t != nil && ((t->kind & KindDirectIface) == 0 || (t->kind & KindNoPointers) == 0)) { - p = scanp[i+1]; - if(minp <= p && p < maxp) { - if(StackDebug >= 3) - runtime·printf("adjust eface %p\n", p); - if(t->size > PtrSize) // currently we always allocate such objects on the heap - runtime·throw("large interface value found on stack"); - scanp[i+1] = p + delta; - } - } - i++; - break; - case BitsIface: - tab = (Itab*)scanp[i]; - if(tab != nil) { - t = tab->type; - //runtime·printf(" type=%p\n", t); - if((t->kind & KindDirectIface) == 0 || (t->kind & KindNoPointers) == 0) { - p = scanp[i+1]; - if(minp <= p && p < maxp) { - if(StackDebug >= 3) - runtime·printf("adjust iface %p\n", p); - if(t->size > PtrSize) // currently we always allocate such objects on the heap - runtime·throw("large interface value found on stack"); - scanp[i+1] = p + delta; - } - } - } - i++; - break; - } - break; + runtime·throw("adjustpointers: unexpected garbage collection bits"); } } } diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go index 2d5e41c1c..68f464f57 100644 --- a/src/runtime/stubs.go +++ b/src/runtime/stubs.go @@ -107,6 +107,12 @@ func mcacheRefill_m() func largeAlloc_m() func gc_m() func gcscan_m() +func gcmark_m() +func gccheckmark_m() +func gccheckmarkenable_m() +func gccheckmarkdisable_m() +func gcinstallmarkwb_m() +func gcmarknewobject_m() func finishsweep_m() func scavenge_m() func setFinalizer_m() -- cgit v1.2.1 From 494d936b2c70f1f8a1fa51efe419619c05f624e5 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Wed, 5 Nov 2014 11:09:08 -0500 Subject: [dev.garbage] runtime: fix 32-bit build TBR=crawshaw R=crawshaw CC=golang-codereviews https://codereview.appspot.com/168860046 --- src/runtime/mgc0.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mgc0.go b/src/runtime/mgc0.go index 75678c522..22e88494a 100644 --- a/src/runtime/mgc0.go +++ b/src/runtime/mgc0.go @@ -84,8 +84,8 @@ func bgsweep() { } const ( - _PoisonGC = 0xf969696969696969 & ^uintptr(0) - _PoisonStack = 0x6868686868686868 & ^uintptr(0) + _PoisonGC = 0xf969696969696969 & (1<<(8*ptrSize) - 1) + _PoisonStack = 0x6868686868686868 & (1<<(8*ptrSize) - 1) ) // NOTE: Really dst *unsafe.Pointer, src unsafe.Pointer, -- cgit v1.2.1 From c4bde225a28b8ce8e0b2075c912f7726b3756300 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Wed, 5 Nov 2014 13:37:34 -0500 Subject: [dev.garbage] runtime: fix a few checkmark bugs - Some sequencing issues with stopping the first gc_m round at the right place to set up correctly for the second round. - atomicxor8 is not idempotent; avoid xor. - Maintain BitsDead type bits correctly; see long comment added. - Enable checkmark phase by default for now. 
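The "atomicxor8 is not idempotent" bullet is worth spelling out: an xor-based mark toggles state, so a repeated or re-applied "mark" silently unmarks, while an or/and-based mark converges to the same state no matter how often it runs. A tiny standalone Go illustration (not runtime code; the mark value is an arbitrary stand-in for a bit in a bitmap byte):

package main

import "fmt"

func main() {
	const mark = byte(0x4) // hypothetical mark bit in a bitmap byte
	b := byte(0x4)         // bitmap byte that is already marked

	b ^= mark // xor-based "mark" applied a second time toggles it off
	fmt.Printf("after second xor: %#02x (mark lost)\n", b)

	b |= mark // or-based mark is idempotent
	b |= mark
	fmt.Printf("after repeated or: %#02x (still marked)\n", b)
}

This is why the fix below replaces atomicxor8 with atomicand8/atomicor8, choosing the operation by the current type-bit value instead of blindly flipping.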
LGTM=rlh
R=rlh
CC=golang-codereviews
https://codereview.appspot.com/171090043
---
 src/runtime/mgc0.c | 146 ++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 111 insertions(+), 35 deletions(-)

(limited to 'src/runtime')

diff --git a/src/runtime/mgc0.c b/src/runtime/mgc0.c
index e283f6ee8..77a6c9377 100644
--- a/src/runtime/mgc0.c
+++ b/src/runtime/mgc0.c
@@ -257,7 +257,7 @@ WorkData runtime·work;
 // encoding, otherwise one uses the bitMarked bit in the lower two bits
 // of the nibble.
 static bool checkmark = false;
-static bool gccheckmarkenable = false;
+static bool gccheckmarkenable = true;
 
 // Is address b in the known heap. If it doesn't have a valid gcmap
 // returns false. For example pointers into stacks will return false.
@@ -292,7 +292,7 @@ slottombits(byte *obj, Markbits *mbits)
 	mbits->shift = (off % wordsPerBitmapByte) * gcBits;
 	mbits->xbits = *mbits->bitp;
 	mbits->bits = (mbits->xbits >> mbits->shift) & bitMask;
-	mbits->tbits = (mbits->xbits >> mbits->shift) & bitPtrMask;
+	mbits->tbits = ((mbits->xbits >> mbits->shift) & bitPtrMask) >> 2;
 }
 
 // b is a pointer into the heap.
@@ -354,13 +354,13 @@ objectstart(byte *b, Markbits *mbits)
 
 // Slow for now as we serialize this, since this is on a debug path
 // speed is not critical at this point.
-static Mutex xorlock;
+static Mutex andlock;
 static void
-atomicxor8(byte *src, byte val)
+atomicand8(byte *src, byte val)
 {
-	runtime·lock(&xorlock);
-	*src = *src^val;
-	runtime·unlock(&xorlock);
+	runtime·lock(&andlock);
+	*src = *src&val;
+	runtime·unlock(&andlock);
 }
 
 // Mark using the checkmark scheme.
@@ -369,7 +369,16 @@ docheckmark(Markbits *mbits)
 {
 	// xor 01 moves 01(scalar unmarked) to 00(scalar marked)
 	// and 10(pointer unmarked) to 11(pointer marked)
-	atomicxor8(mbits->bitp, BitsCheckMarkXor<<mbits->shift<<2);
+	if(mbits->tbits == BitsScalar)
+		atomicand8(mbits->bitp, ~(byte)(BitsCheckMarkXor<<mbits->shift<<2));
+	else if(mbits->tbits == BitsPointer)
+		runtime·atomicor8(mbits->bitp, BitsCheckMarkXor<<mbits->shift<<2);
+
+	// reload bits for ischeckmarked
+	mbits->xbits = *mbits->bitp;
+	mbits->bits = (mbits->xbits >> mbits->shift) & bitMask;
+	mbits->tbits = ((mbits->xbits >> mbits->shift) & bitPtrMask) >> 2;
+
 	return;
 }
 
@@ -434,10 +443,15 @@ greyobject(byte *obj, Markbits *mbits, Workbuf *wbuf)
 	if(checkmark) {
 		if(!ismarked(mbits)) {
 			runtime·printf("runtime:greyobject: checkmarks finds unexpected unmarked object obj=%p, mbits->bits=%x, *mbits->bitp=%x\n", obj, mbits->bits, *mbits->bitp);
+			runtime·throw("checkmark found unmarked object");
 		}
 		if(ischeckmarked(mbits))
 			return wbuf;
 		docheckmark(mbits);
+		if(!ischeckmarked(mbits)) {
+			runtime·printf("mbits xbits=%x bits=%x tbits=%x shift=%d\n", mbits->xbits, mbits->bits, mbits->tbits, mbits->shift);
+			runtime·throw("docheckmark and ischeckmarked disagree");
+		}
 	} else {
 		// If marked we have nothing to do.
if((mbits->bits&bitMarked) != 0) @@ -1670,9 +1684,6 @@ clearcheckmarkbitsspan(MSpan *s) uintptr size, off, step; byte *p, *bitp, *arena_start, b; - if(!checkmark) - runtime·throw("clearcheckmarkbitsspan: checkmark not set."); - if(s->state != MSpanInUse) { runtime·printf("runtime:clearcheckmarkbitsspan: state=%d\n", s->state); @@ -1700,19 +1711,65 @@ clearcheckmarkbitsspan(MSpan *s) bitp = arena_start - off/wordsPerBitmapByte - 1; step = size/(PtrSize*wordsPerBitmapByte); + // The type bit values are: + // 00 - BitsDead, for us BitsScalarMarked + // 01 - BitsScalar + // 10 - BitsPointer + // 11 - unused, for us BitsPointerMarked + // + // When called to prepare for the checkmark phase (checkmark==1), + // we change BitsDead to BitsScalar, so that there are no BitsScalarMarked + // type bits anywhere. + // + // The checkmark phase marks by changing BitsScalar to BitsScalarMarked + // and BitsPointer to BitsPointerMarked. + // + // When called to clean up after the checkmark phase (checkmark==0), + // we unmark by changing BitsScalarMarked back to BitsScalar and + // BitsPointerMarked back to BitsPointer. + // + // There are two problems with the scheme as just described. + // First, the setup rewrites BitsDead to BitsScalar, but the type bits + // following a BitsDead are uninitialized and must not be used. + // Second, objects that are free are expected to have their type + // bits zeroed (BitsDead), so in the cleanup we need to restore + // any BitsDeads that were there originally. + // + // In a one-word object (8-byte allocation on 64-bit system), + // there is no difference between BitsScalar and BitsDead, because + // neither is a pointer and there are no more words in the object, + // so using BitsScalar during the checkmark is safe and mapping + // both back to BitsDead during cleanup is also safe. + // + // In a larger object, we need to be more careful. During setup, + // if the type of the first word is BitsDead, we change it to BitsScalar + // (as we must) but also initialize the type of the second + // word to BitsDead, so that a scan during the checkmark phase + // will still stop before seeing the uninitialized type bits in the + // rest of the object. The sequence 'BitsScalar BitsDead' never + // happens in real type bitmaps - BitsDead is always as early + // as possible, so immediately after the last BitsPointer. + // During cleanup, if we see a BitsScalar, we can check to see if it + // is followed by BitsDead. If so, it was originally BitsDead and + // we can change it back. 
+
 	if(step == 0) {
 		// updating top and bottom nibbles, all boundaries
 		for(i=0; i<n; i+=2, bitp--) {
 			if((*bitp & bitBoundary) != bitBoundary)
 				runtime·throw("missing bitBoundary");
 			b = (*bitp & bitPtrMask)>>2;
-			if(b == BitsScalarMarked || b == BitsPointerMarked)
+			if(!checkmark && (b == BitsScalar || b == BitsScalarMarked))
+				*bitp &= ~0x0c; // convert to BitsDead
+			else if(b == BitsScalarMarked || b == BitsPointerMarked)
 				*bitp ^= BitsCheckMarkXor<<2;
-
+
 			if(((*bitp>>gcBits) & bitBoundary) != bitBoundary)
 				runtime·throw("missing bitBoundary");
 			b = ((*bitp>>gcBits) & bitPtrMask)>>2;
-			if(b == BitsScalarMarked || b == BitsPointerMarked)
+			if(!checkmark && (b == BitsScalar || b == BitsScalarMarked))
+				*bitp &= ~0xc0; // convert to BitsDead
+			else if(b == BitsScalarMarked || b == BitsPointerMarked)
 				*bitp ^= BitsCheckMarkXor<<(2+gcBits);
 		}
 	} else {
@@ -1721,7 +1778,19 @@ clearcheckmarkbitsspan(MSpan *s)
 			if((*bitp & bitBoundary) != bitBoundary)
 				runtime·throw("missing bitBoundary");
 			b = (*bitp & bitPtrMask)>>2;
-			if(b == BitsScalarMarked || b == BitsPointerMarked)
+
+			if(checkmark && b == BitsDead) {
+				// move BitsDead into second word.
+				// set bits to BitsScalar in preparation for checkmark phase.
+				*bitp &= ~0xc0;
+				*bitp |= BitsScalar<<2;
+			} else if(!checkmark && (b == BitsScalar || b == BitsScalarMarked) && (*bitp & 0xc0) == 0) {
+				// Cleaning up after checkmark phase.
+				// First word is scalar or dead (we forgot)
+				// and second word is dead.
+				// First word might as well be dead too.
+				*bitp &= ~0x0c;
+			} else if(b == BitsScalarMarked || b == BitsPointerMarked)
 				*bitp ^= BitsCheckMarkXor<<2;
 		}
 	}
@@ -1763,10 +1832,9 @@ runtime·gccheckmark_m(void)
 
 	checkmark = true;
 	clearcheckmarkbits(); // Converts BitsDead to BitsScalar.
-	runtime·gc_m();
+	runtime·gc_m(); // turns off checkmark
 	// Work done; fix up the GC bitmap to remove the checkmark bits.
 	clearcheckmarkbits();
-	checkmark = false;
 }
 
 // gccheckmarkenable is initially false
@@ -2016,6 +2084,16 @@ gc(struct gc_args *args)
 	// Free the old cached mark array if necessary.
 	if(runtime·work.spans != nil && runtime·work.spans != runtime·mheap.allspans)
 		runtime·SysFree(runtime·work.spans, runtime·work.nspan*sizeof(runtime·work.spans[0]), &mstats.other_sys);
+
+	if(gccheckmarkenable) {
+		if(!checkmark) {
+			// first half of two-pass; don't set up sweep
+			runtime·unlock(&runtime·mheap.lock);
+			return;
+		}
+		checkmark = false; // done checking marks
+	}
+
 	// Cache the current array for sweeping.
 	runtime·mheap.gcspans = runtime·mheap.allspans;
 	runtime·mheap.sweepgen += 2;
@@ -2025,24 +2103,22 @@ gc(struct gc_args *args)
 	runtime·sweep.spanidx = 0;
 	runtime·unlock(&runtime·mheap.lock);
 
-	// Start the sweep after the checkmark phase if there is one.
-	if(!gccheckmarkenable || checkmark) {
-		if(ConcurrentSweep && !args->eagersweep) {
-			runtime·lock(&runtime·gclock);
-			if(runtime·sweep.g == nil)
-				runtime·sweep.g = runtime·newproc1(&bgsweepv, nil, 0, 0, gc);
-			else if(runtime·sweep.parked) {
-				runtime·sweep.parked = false;
-				runtime·ready(runtime·sweep.g);
+
+	if(ConcurrentSweep && !args->eagersweep) {
+		runtime·lock(&runtime·gclock);
+		if(runtime·sweep.g == nil)
+			runtime·sweep.g = runtime·newproc1(&bgsweepv, nil, 0, 0, gc);
+		else if(runtime·sweep.parked) {
+			runtime·sweep.parked = false;
+			runtime·ready(runtime·sweep.g);
 		}
+		runtime·unlock(&runtime·gclock);
+	} else {
+		// Sweep all spans eagerly.
+		while(runtime·sweepone() != -1)
+			runtime·sweep.npausesweep++;
+		// Do an additional mProf_GC, because all 'free' events are now real as well.
+		runtime·mProf_GC();
 	}
 
 	runtime·mProf_GC();
-- cgit v1.2.1


From 6d5e8c9c938b45ddcc62470a790408642a26218b Mon Sep 17 00:00:00 2001
From: Russ Cox
Date: Wed, 5 Nov 2014 14:42:24 -0500
Subject: [dev.garbage] runtime: ignore objects in dead spans

We still don't know why this is happening.

LGTM=rlh
R=rlh
CC=golang-codereviews
https://codereview.appspot.com/169990043
---
 src/runtime/mgc0.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'src/runtime')

diff --git a/src/runtime/mgc0.c b/src/runtime/mgc0.c
index 77a6c9377..3ebaf005f 100644
--- a/src/runtime/mgc0.c
+++ b/src/runtime/mgc0.c
@@ -326,12 +326,16 @@ objectstart(byte *b, Markbits *mbits)
 
 		// The following ensures that we are rigorous about what data
 		// structures hold valid pointers
-		runtime·printf("runtime:objectstart Span weird: obj=%p, k=%p", obj, k);
-		if (s == nil)
-			runtime·printf(" s=nil\n");
-		else
-			runtime·printf(" s->start=%p s->limit=%p, s->state=%d\n", s->start*PageSize, s->limit, s->state);
-		runtime·throw("objectstart: bad span");
+		if(0) {
+			// Still happens sometimes. We don't know why.
+			runtime·printf("runtime:objectstart Span weird: obj=%p, k=%p", obj, k);
+			if (s == nil)
+				runtime·printf(" s=nil\n");
+			else
+				runtime·printf(" s->start=%p s->limit=%p, s->state=%d\n", s->start*PageSize, s->limit, s->state);
+			runtime·throw("objectstart: bad pointer in unexpected span");
+		}
+		return nil;
 	}
 	p = (byte*)((uintptr)s->start<<PageShift);
 	if(s->sizeclass != 0) {
-- cgit v1.2.1


From aefaeb75f3eff323f212c5309d8ae65768ad9809 Mon Sep 17 00:00:00 2001
From: Russ Cox
Date: Wed, 5 Nov 2014 14:42:54 -0500
Subject: [dev.garbage] cmd/gc, runtime: add locks around print statements

Now each C printf, Go print, or Go println is guaranteed
not to be interleaved with other calls of those functions.
This should help when debugging concurrent failures.

LGTM=rlh
R=rlh
CC=golang-codereviews
https://codereview.appspot.com/169120043
---
 src/runtime/print1.go | 30 +++++++++++++++++++++++++++---
 src/runtime/runtime.h |  1 +
 2 files changed, 28 insertions(+), 3 deletions(-)

(limited to 'src/runtime')

diff --git a/src/runtime/print1.go b/src/runtime/print1.go
index 8f8268873..3d812bd04 100644
--- a/src/runtime/print1.go
+++ b/src/runtime/print1.go
@@ -41,7 +41,31 @@ func snprintf(dst *byte, n int32, s *byte) {
 	gp.writebuf = nil
 }
 
-//var debuglock mutex
+var debuglock mutex
+
+// The compiler emits calls to printlock and printunlock around
+// the multiple calls that implement a single Go print or println
+// statement. Some of the print helpers (printsp, for example)
+// call print recursively. There is also the problem of a crash
+// happening during the print routines and needing to acquire
+// the print lock to print information about the crash.
+// For both these reasons, let a thread acquire the printlock 'recursively'.
+
+func printlock() {
+	mp := getg().m
+	mp.printlock++
+	if mp.printlock == 1 {
+		lock(&debuglock)
+	}
+}
+
+func printunlock() {
+	mp := getg().m
+	mp.printlock--
+	if mp.printlock == 0 {
+		unlock(&debuglock)
+	}
+}
 
 // write to goroutine-local buffer if diverting output,
 // or else standard error.
@@ -80,7 +104,7 @@ func printnl() {
 
 // Very simple printf. Only for debugging prints.
 // Do not add to this without checking with Rob.
func vprintf(str string, arg unsafe.Pointer) { - //lock(&debuglock); + printlock() s := bytes(str) start := 0 @@ -160,7 +184,7 @@ func vprintf(str string, arg unsafe.Pointer) { gwrite(s[start:i]) } - //unlock(&debuglock); + printunlock() } func printpc(p unsafe.Pointer) { diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h index 6a02ef1d3..ee86f2d17 100644 --- a/src/runtime/runtime.h +++ b/src/runtime/runtime.h @@ -345,6 +345,7 @@ struct M int32 helpgc; bool spinning; // M is out of work and is actively looking for work bool blocked; // M is blocked on a Note + int8 printlock; uint32 fastrand; uint64 ncgocall; // number of cgo calls in total int32 ncgo; // number of cgo calls currently in progress -- cgit v1.2.1 From cb223591339d2e03283e21144057b30d5e9667dd Mon Sep 17 00:00:00 2001 From: Rick Hudson Date: Mon, 10 Nov 2014 13:42:34 -0500 Subject: [dev.garbage] runtime: Code to implement write barriers To turn concurrent gc on alter the if false in func gogc currently at line 489 in malloc.go LGTM=rsc R=rsc CC=golang-codereviews, rlh https://codereview.appspot.com/172190043 Committer: Russ Cox --- src/runtime/malloc.go | 21 +++++++++++---------- src/runtime/mgc0.c | 39 ++++++++++++++++++++++++++++++++++----- src/runtime/mgc0.go | 19 ++++++++++++++++++- src/runtime/runtime.h | 1 + src/runtime/stubs.go | 2 ++ 5 files changed, 66 insertions(+), 16 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 274bae9a3..a18e77421 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -486,16 +486,17 @@ func gogc(force int32) { onM(stoptheworld) onM(finishsweep_m) // finish sweep before we start concurrent scan. - onM(starttheworld) - - // Do a concurrent heap scan before we stop the world. - onM(gcscan_m) - onM(gcinstallmarkwb_m) - onM(stoptheworld) - // onM(starttheworld) - // mark from roots scanned in gcscan_m. startthework when write barrier works - onM(gcmark_m) - // onM(stoptheworld) + if false { // To turn on concurrent scan and mark set to true... + onM(starttheworld) + // Do a concurrent heap scan before we stop the world. + onM(gcscan_m) + onM(stoptheworld) + onM(gcinstallmarkwb_m) + onM(starttheworld) + onM(gcmark_m) + onM(stoptheworld) + onM(gcinstalloffwb_m) + } if mp != acquirem() { gothrow("gogc: rescheduled") } diff --git a/src/runtime/mgc0.c b/src/runtime/mgc0.c index 3ebaf005f..5300f554b 100644 --- a/src/runtime/mgc0.c +++ b/src/runtime/mgc0.c @@ -1061,13 +1061,34 @@ shade(byte *b) // // Shade indicates that it has seen a white pointer by adding the referent // to wbuf. +// slot is the destination (dst) in go code +// ptr is the value that goes into the slot (src) in the go code void -runtime·markwb(void **slot, void *ptr) +runtime·gcmarkwb_m() { - // initial nil check avoids some needlesss loads - if(ptr != nil && inheap(ptr) && shaded((void*)slot)) - shade(ptr); + byte **slot, *ptr; + slot = (byte**)g->m->scalararg[0]; + ptr = (byte*)g->m->scalararg[1]; + *slot = ptr; + switch(runtime·gcphase) { + default: + runtime·throw("gcphasework in bad gcphase"); + case GCoff: + case GCquiesce: + case GCstw: + case GCsweep: + case GCscan: + break; + case GCmark: + if(ptr != nil && inheap(ptr) && shaded((byte*)slot)) + shade(ptr); + break; + case GCmarktermination: + if(ptr != nil && inheap(ptr) && shaded((byte*)slot)) + shade(ptr); + break; + } } // The gp has been moved to a GC safepoint. 
GC phase specific @@ -1945,7 +1966,7 @@ runtime·gcmark_m(void) scanblock(nil, 0, nil); } -// For now this must be followed by a stoptheworld and a starttheworld to ensure +// For now this must be bracketed with a stoptheworld and a starttheworld to ensure // all go routines see the new barrier. void runtime·gcinstallmarkwb_m(void) @@ -1953,6 +1974,14 @@ runtime·gcinstallmarkwb_m(void) runtime·gcphase = GCmark; } +// For now this must be bracketed with a stoptheworld and a starttheworld to ensure +// all go routines see the new barrier. +void +runtime·gcinstalloffwb_m(void) +{ + runtime·gcphase = GCoff; +} + static void gc(struct gc_args *args) { diff --git a/src/runtime/mgc0.go b/src/runtime/mgc0.go index 22e88494a..ce5c290ef 100644 --- a/src/runtime/mgc0.go +++ b/src/runtime/mgc0.go @@ -95,7 +95,24 @@ func writebarrierptr(dst *uintptr, src uintptr) { if src != 0 && (src < _PageSize || src == _PoisonGC || src == _PoisonStack) { onM(func() { gothrow("bad pointer in write barrier") }) } - *dst = src + + mp := acquirem() + if mp.inwb { + *dst = src + releasem(mp) + return + } + mp.inwb = true + oldscalar0 := mp.scalararg[0] + oldscalar1 := mp.scalararg[1] + mp.scalararg[0] = uintptr(unsafe.Pointer(dst)) + mp.scalararg[1] = src + onM_signalok(gcmarkwb_m) + mp.scalararg[0] = oldscalar0 + mp.scalararg[1] = oldscalar1 + mp.inwb = false + releasem(mp) + // *dst = src is done inside of the write barrier. } //go:nosplit diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h index ee86f2d17..a0f1acc05 100644 --- a/src/runtime/runtime.h +++ b/src/runtime/runtime.h @@ -345,6 +345,7 @@ struct M int32 helpgc; bool spinning; // M is out of work and is actively looking for work bool blocked; // M is blocked on a Note + bool inwb; // M is executing a write barrier int8 printlock; uint32 fastrand; uint64 ncgocall; // number of cgo calls in total diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go index 68f464f57..852f4ddbb 100644 --- a/src/runtime/stubs.go +++ b/src/runtime/stubs.go @@ -112,7 +112,9 @@ func gccheckmark_m() func gccheckmarkenable_m() func gccheckmarkdisable_m() func gcinstallmarkwb_m() +func gcinstalloffwb_m() func gcmarknewobject_m() +func gcmarkwb_m() func finishsweep_m() func scavenge_m() func setFinalizer_m() -- cgit v1.2.1 From 11c9da27d8b3d09835a611edea424885af1ec650 Mon Sep 17 00:00:00 2001 From: Rick Hudson Date: Mon, 10 Nov 2014 14:32:02 -0500 Subject: [dev.garbage] runtime: Coarsen the write barrier to always grey the destination. LGTM=rsc R=rsc CC=golang-codereviews https://codereview.appspot.com/174820043 --- src/runtime/mgc0.c | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mgc0.c b/src/runtime/mgc0.c index 5300f554b..3f6cce5c0 100644 --- a/src/runtime/mgc0.c +++ b/src/runtime/mgc0.c @@ -1056,13 +1056,41 @@ shade(byte *b) return; } -// This is the Dijkstra barrier coarsened to shade grey to white whereas -// the original Dijkstra barrier only shaded black to white. +// This is the Dijkstra barrier coarsened to always shade the ptr (dst) object. +// The original Dijkstra barrier only shaded ptrs being placed in black slots. // // Shade indicates that it has seen a white pointer by adding the referent -// to wbuf. +// to wbuf as well as marking it. 
+//
 // slot is the destination (dst) in go code
 // ptr is the value that goes into the slot (src) in the go code
+//
+// Dijkstra pointed out that maintaining the no-black-to-white-pointer
+// invariant means that white-to-white pointers need not
+// be noted by the write barrier. Furthermore, if either
+// white object dies before it is reached by the
+// GC then the object can be collected during this GC cycle
+// instead of waiting for the next cycle. Unfortunately the cost of
+// ensuring that the object holding the slot doesn't concurrently
+// change to black without the mutator noticing seems prohibitive.
+//
+// Consider the following example where the mutator writes into
+// a slot and then loads the slot's mark bit while the GC thread
+// writes to the slot's mark bit and then as part of scanning reads
+// the slot.
+//
+// Initially both [slot] and [slotmark] are 0 (nil)
+// Mutator thread          GC thread
+// st [slot], ptr          st [slotmark], 1
+//
+// ld r1, [slotmark]       ld r2, [slot]
+//
+// This is a classic example of independent reads of independent writes,
+// aka IRIW. The question is whether r1==r2==0 is allowed, and for most HW
+// the answer is yes unless memory barriers are inserted between the st and the ld.
+// These barriers are expensive so we have decided that we will
+// always grey the ptr object regardless of the slot's color.
+//
 void
 runtime·gcmarkwb_m()
 {
@@ -1081,11 +1109,11 @@ runtime·gcmarkwb_m()
 	case GCscan:
 		break;
 	case GCmark:
-		if(ptr != nil && inheap(ptr) && shaded((byte*)slot))
+		if(ptr != nil && inheap(ptr))
 			shade(ptr);
 		break;
 	case GCmarktermination:
-		if(ptr != nil && inheap(ptr) && shaded((byte*)slot))
+		if(ptr != nil && inheap(ptr))
 			shade(ptr);
 		break;
 	}
-- cgit v1.2.1


From f4d66a3a39bd302d7fd96fc98c086813cfdaf755 Mon Sep 17 00:00:00 2001
From: Russ Cox
Date: Mon, 10 Nov 2014 14:59:36 -0500
Subject: [dev.garbage] runtime: add write barrier to casp

Also rewrite some casp that don't use real pointers
to use casuintptr instead.
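A standalone model of the coarsened barrier this commit arrives at: during marking, every pointer store shades the new referent regardless of the destination slot's color, which is what sidesteps the IRIW hazard sketched above. The tricolor state, shade, and worklist here are illustrative stand-ins, not the runtime's structures:

package main

import "fmt"

type color int

const (
	white color = iota
	grey
	black
)

type obj struct {
	c   color
	ref *obj
}

var worklist []*obj

// shade greys a white object and queues it for scanning.
func shade(o *obj) {
	if o != nil && o.c == white {
		o.c = grey
		worklist = append(worklist, o)
	}
}

// writeBarrier models the coarsened barrier during the mark phase:
// perform the store, then shade the stored pointer unconditionally,
// without consulting the destination slot's color.
func writeBarrier(slot **obj, ptr *obj) {
	*slot = ptr
	shade(ptr)
}

func main() {
	a := &obj{c: black}
	b := &obj{c: white}
	writeBarrier(&a.ref, b) // a black->white edge would otherwise be missed
	fmt.Println(b.c == grey, len(worklist) == 1) // true true
}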
LGTM=rlh R=rlh CC=golang-codereviews https://codereview.appspot.com/166440044 --- src/runtime/asm_386.s | 6 +++--- src/runtime/asm_amd64.s | 6 +++--- src/runtime/asm_amd64p32.s | 6 +++--- src/runtime/asm_power64x.s | 6 +++--- src/runtime/atomic.go | 38 ++++++++++++++++++++++++++++++++++---- src/runtime/mgc0.c | 1 - src/runtime/mgc0.go | 14 ++++++++++++-- src/runtime/proc.c | 14 +++++++------- src/runtime/runtime.h | 1 + src/runtime/string.c | 2 +- src/runtime/stubs.go | 3 --- 11 files changed, 67 insertions(+), 30 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s index 2d102b273..d456e6bca 100644 --- a/src/runtime/asm_386.s +++ b/src/runtime/asm_386.s @@ -502,7 +502,7 @@ fail: // return 1; // }else // return 0; -TEXT runtime·casp(SB), NOSPLIT, $0-13 +TEXT runtime·casp1(SB), NOSPLIT, $0-13 MOVL ptr+0(FP), BX MOVL old+4(FP), AX MOVL new+8(FP), CX @@ -537,7 +537,7 @@ TEXT runtime·xchg(SB), NOSPLIT, $0-12 MOVL AX, ret+8(FP) RET -TEXT runtime·xchgp(SB), NOSPLIT, $0-12 +TEXT runtime·xchgp1(SB), NOSPLIT, $0-12 MOVL ptr+0(FP), BX MOVL new+4(FP), AX XCHGL AX, 0(BX) @@ -555,7 +555,7 @@ again: JNZ again RET -TEXT runtime·atomicstorep(SB), NOSPLIT, $0-8 +TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-8 MOVL ptr+0(FP), BX MOVL val+4(FP), AX XCHGL AX, 0(BX) diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index ac9c58cf3..5d176575c 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -489,7 +489,7 @@ TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16 // return 1; // } else // return 0; -TEXT runtime·casp(SB), NOSPLIT, $0-25 +TEXT runtime·casp1(SB), NOSPLIT, $0-25 MOVQ ptr+0(FP), BX MOVQ old+8(FP), AX MOVQ new+16(FP), CX @@ -541,7 +541,7 @@ TEXT runtime·xchg64(SB), NOSPLIT, $0-24 MOVQ AX, ret+16(FP) RET -TEXT runtime·xchgp(SB), NOSPLIT, $0-24 +TEXT runtime·xchgp1(SB), NOSPLIT, $0-24 MOVQ ptr+0(FP), BX MOVQ new+8(FP), AX XCHGQ AX, 0(BX) @@ -559,7 +559,7 @@ again: JNZ again RET -TEXT runtime·atomicstorep(SB), NOSPLIT, $0-16 +TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16 MOVQ ptr+0(FP), BX MOVQ val+8(FP), AX XCHGQ AX, 0(BX) diff --git a/src/runtime/asm_amd64p32.s b/src/runtime/asm_amd64p32.s index de3ef3a23..2b2155753 100644 --- a/src/runtime/asm_amd64p32.s +++ b/src/runtime/asm_amd64p32.s @@ -460,7 +460,7 @@ fail: // return 1; // } else // return 0; -TEXT runtime·casp(SB), NOSPLIT, $0-17 +TEXT runtime·casp1(SB), NOSPLIT, $0-17 MOVL ptr+0(FP), BX MOVL old+4(FP), AX MOVL new+8(FP), CX @@ -512,7 +512,7 @@ TEXT runtime·xchg64(SB), NOSPLIT, $0-24 MOVQ AX, ret+16(FP) RET -TEXT runtime·xchgp(SB), NOSPLIT, $0-12 +TEXT runtime·xchgp1(SB), NOSPLIT, $0-12 MOVL ptr+0(FP), BX MOVL new+4(FP), AX XCHGL AX, 0(BX) @@ -530,7 +530,7 @@ again: JNZ again RET -TEXT runtime·atomicstorep(SB), NOSPLIT, $0-8 +TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-8 MOVL ptr+0(FP), BX MOVL val+4(FP), AX XCHGL AX, 0(BX) diff --git a/src/runtime/asm_power64x.s b/src/runtime/asm_power64x.s index f77658032..fd0c6be16 100644 --- a/src/runtime/asm_power64x.s +++ b/src/runtime/asm_power64x.s @@ -472,7 +472,7 @@ TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16 // return 1; // } else // return 0; -TEXT runtime·casp(SB), NOSPLIT, $0-25 +TEXT runtime·casp1(SB), NOSPLIT, $0-25 BR runtime·cas64(SB) // uint32 xadd(uint32 volatile *val, int32 delta) @@ -529,7 +529,7 @@ TEXT runtime·xchg64(SB), NOSPLIT, $0-24 MOVD R3, ret+16(FP) RETURN -TEXT runtime·xchgp(SB), NOSPLIT, $0-24 +TEXT runtime·xchgp1(SB), NOSPLIT, $0-24 BR runtime·xchg64(SB) TEXT runtime·xchguintptr(SB), NOSPLIT, 
$0-24 @@ -538,7 +538,7 @@ TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24 TEXT runtime·procyield(SB),NOSPLIT,$0-0 RETURN -TEXT runtime·atomicstorep(SB), NOSPLIT, $0-16 +TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16 BR runtime·atomicstore64(SB) TEXT runtime·atomicstore(SB), NOSPLIT, $0-12 diff --git a/src/runtime/atomic.go b/src/runtime/atomic.go index 7e9d9b3aa..a0e4d84e9 100644 --- a/src/runtime/atomic.go +++ b/src/runtime/atomic.go @@ -20,8 +20,16 @@ func xchg(ptr *uint32, new uint32) uint32 //go:noescape func xchg64(ptr *uint64, new uint64) uint64 -//go:noescape -func xchgp(ptr unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer +// Cannot use noescape here: ptr does not but new does escape. +// Instead use noescape(ptr) in wrapper below. +func xchgp1(ptr unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer + +//go:nosplit +func xchgp(ptr unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer { + old := xchgp1(noescape(ptr), new) + writebarrierptr_nostore((*uintptr)(ptr), uintptr(new)) + return old +} //go:noescape func xchguintptr(ptr *uintptr, new uintptr) uintptr @@ -47,5 +55,27 @@ func atomicstore(ptr *uint32, val uint32) //go:noescape func atomicstore64(ptr *uint64, val uint64) -//go:noescape -func atomicstorep(ptr unsafe.Pointer, val unsafe.Pointer) +// Cannot use noescape here: ptr does not but val does escape. +// Instead use noescape(ptr) in wrapper below. +func atomicstorep1(ptr unsafe.Pointer, val unsafe.Pointer) + +//go:nosplit +func atomicstorep(ptr unsafe.Pointer, val unsafe.Pointer) { + atomicstorep1(noescape(ptr), val) + // TODO(rsc): Why does the compiler think writebarrierptr_nostore's dst argument escapes? + writebarrierptr_nostore((*uintptr)(noescape(ptr)), uintptr(val)) +} + +// Cannot use noescape here: ptr does not but new does escape. +// Instead use noescape(ptr) in wrapper below. +func casp1(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool + +//go:nosplit +func casp(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool { + ok := casp1((*unsafe.Pointer)(noescape(unsafe.Pointer(ptr))), old, new) + if !ok { + return false + } + writebarrierptr_nostore((*uintptr)(unsafe.Pointer(ptr)), uintptr(new)) + return true +} diff --git a/src/runtime/mgc0.c b/src/runtime/mgc0.c index 3f6cce5c0..8d87107c7 100644 --- a/src/runtime/mgc0.c +++ b/src/runtime/mgc0.c @@ -1098,7 +1098,6 @@ runtime·gcmarkwb_m() slot = (byte**)g->m->scalararg[0]; ptr = (byte*)g->m->scalararg[1]; - *slot = ptr; switch(runtime·gcphase) { default: runtime·throw("gcphasework in bad gcphase"); diff --git a/src/runtime/mgc0.go b/src/runtime/mgc0.go index ce5c290ef..760d2a545 100644 --- a/src/runtime/mgc0.go +++ b/src/runtime/mgc0.go @@ -92,13 +92,24 @@ const ( // but if we do that, Go inserts a write barrier on *dst = src. //go:nosplit func writebarrierptr(dst *uintptr, src uintptr) { + *dst = src + writebarrierptr_nostore(dst, src) +} + +// Like writebarrierptr, but the store has already been applied. +// Do not reapply. +//go:nosplit +func writebarrierptr_nostore(dst *uintptr, src uintptr) { + if getg() == nil { // very low-level startup + return + } + if src != 0 && (src < _PageSize || src == _PoisonGC || src == _PoisonStack) { onM(func() { gothrow("bad pointer in write barrier") }) } mp := acquirem() if mp.inwb { - *dst = src releasem(mp) return } @@ -112,7 +123,6 @@ func writebarrierptr(dst *uintptr, src uintptr) { mp.scalararg[1] = oldscalar1 mp.inwb = false releasem(mp) - // *dst = src is done inside of the write barrier. 
} //go:nosplit diff --git a/src/runtime/proc.c b/src/runtime/proc.c index 9626bd101..e5e2df2e4 100644 --- a/src/runtime/proc.c +++ b/src/runtime/proc.c @@ -1060,7 +1060,7 @@ runtime·dropm(void) unlockextra(mp); } -#define MLOCKED ((M*)1) +#define MLOCKED 1 // lockextra locks the extra list and returns the list head. // The caller must unlock the list by storing a new list head @@ -1071,28 +1071,28 @@ runtime·dropm(void) static M* lockextra(bool nilokay) { - M *mp; + uintptr mpx; void (*yield)(void); for(;;) { - mp = runtime·atomicloadp(&runtime·extram); - if(mp == MLOCKED) { + mpx = runtime·atomicloaduintptr((uintptr*)&runtime·extram); + if(mpx == MLOCKED) { yield = runtime·osyield; yield(); continue; } - if(mp == nil && !nilokay) { + if(mpx == 0 && !nilokay) { runtime·usleep(1); continue; } - if(!runtime·casp(&runtime·extram, mp, MLOCKED)) { + if(!runtime·casuintptr((uintptr*)&runtime·extram, mpx, MLOCKED)) { yield = runtime·osyield; yield(); continue; } break; } - return mp; + return (M*)mpx; } #pragma textflag NOSPLIT diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h index a0f1acc05..a4186f450 100644 --- a/src/runtime/runtime.h +++ b/src/runtime/runtime.h @@ -894,6 +894,7 @@ int32 runtime·round2(int32 x); // round x up to a power of 2. bool runtime·cas(uint32*, uint32, uint32); bool runtime·cas64(uint64*, uint64, uint64); bool runtime·casp(void**, void*, void*); +bool runtime·casuintptr(uintptr*, uintptr, uintptr); // Don't confuse with XADD x86 instruction, // this one is actually 'addx', that is, add-and-fetch. uint32 runtime·xadd(uint32 volatile*, int32); diff --git a/src/runtime/string.c b/src/runtime/string.c index ed5debc33..475ea2de6 100644 --- a/src/runtime/string.c +++ b/src/runtime/string.c @@ -48,7 +48,7 @@ runtime·gostringnocopy(byte *str) s.len = runtime·findnull(str); while(true) { ms = runtime·maxstring; - if(s.len <= ms || runtime·casp((void**)&runtime·maxstring, (void*)ms, (void*)s.len)) + if(s.len <= ms || runtime·casuintptr(&runtime·maxstring, ms, s.len)) return s; } } diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go index 852f4ddbb..421ab04e5 100644 --- a/src/runtime/stubs.go +++ b/src/runtime/stubs.go @@ -213,9 +213,6 @@ func write(fd uintptr, p unsafe.Pointer, n int32) int32 //go:noescape func cas(ptr *uint32, old, new uint32) bool -//go:noescape -func casp(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool - //go:noescape func casuintptr(ptr *uintptr, old, new uintptr) bool -- cgit v1.2.1 From fc350a0d75c2df2eea560e66483dbb6f04d00e35 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Tue, 11 Nov 2014 16:54:50 -0500 Subject: [dev.garbage] runtime: concurrent mark fixes Add missing write barrier when initializing state for newly created goroutine. Add write barrier for same slot when preempting a goroutine. Disable write barrier during goroutine death, because dopanic does pointer writes. With concurrent mark enabled (not in this CL), all.bash passed once. The second time, TestGoexitCrash-2 failed. 
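A recurring pattern in these write-barrier commits is the re-entrancy guard: writebarrierptr refuses to recurse if the current M is already inside the barrier (or is dying), so barrier-internal pointer writes and crash-time prints cannot loop. A rough standalone Go model, with the M and its inwb flag reduced to a plain struct and shade as a hypothetical stand-in:

package main

import "fmt"

// mm models the runtime M with its inwb flag; the names are illustrative.
type mm struct {
	inwb bool // set while this thread is inside the write barrier
}

var curM = &mm{}

func shade(p uintptr) { fmt.Printf("shade %#x\n", p) }

// writebarrierptr stores and then runs the GC bookkeeping, but refuses
// to recurse: a nested invocation just performs the store.
func writebarrierptr(dst *uintptr, src uintptr) {
	*dst = src
	if curM.inwb {
		return
	}
	curM.inwb = true
	shade(src)
	curM.inwb = false
}

func main() {
	var slot uintptr
	writebarrierptr(&slot, 0x1000)
}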
LGTM=rlh R=rlh CC=golang-codereviews https://codereview.appspot.com/167610043 --- src/runtime/mgc0.c | 3 +-- src/runtime/mgc0.go | 2 +- src/runtime/runtime.h | 2 ++ src/runtime/stack.c | 8 ++++++++ src/runtime/sys_x86.c | 1 + 5 files changed, 13 insertions(+), 3 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mgc0.c b/src/runtime/mgc0.c index 8d87107c7..3c4d1afa5 100644 --- a/src/runtime/mgc0.c +++ b/src/runtime/mgc0.c @@ -1094,8 +1094,7 @@ shade(byte *b) void runtime·gcmarkwb_m() { - byte **slot, *ptr; - slot = (byte**)g->m->scalararg[0]; + byte *ptr; ptr = (byte*)g->m->scalararg[1]; switch(runtime·gcphase) { diff --git a/src/runtime/mgc0.go b/src/runtime/mgc0.go index 760d2a545..dc4eec519 100644 --- a/src/runtime/mgc0.go +++ b/src/runtime/mgc0.go @@ -109,7 +109,7 @@ func writebarrierptr_nostore(dst *uintptr, src uintptr) { } mp := acquirem() - if mp.inwb { + if mp.inwb || mp.dying > 0 { releasem(mp) return } diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h index a4186f450..fec224390 100644 --- a/src/runtime/runtime.h +++ b/src/runtime/runtime.h @@ -1121,6 +1121,8 @@ void runtime·osyield(void); void runtime·lockOSThread(void); void runtime·unlockOSThread(void); +void runtime·writebarrierptr_nostore(void*, void*); + bool runtime·showframe(Func*, G*); void runtime·printcreatedby(G*); diff --git a/src/runtime/stack.c b/src/runtime/stack.c index fb23cc1c3..a4947a53b 100644 --- a/src/runtime/stack.c +++ b/src/runtime/stack.c @@ -706,6 +706,14 @@ runtime·newstack(void) runtime·printf("runtime: split stack overflow: %p < %p\n", sp, gp->stack.lo); runtime·throw("runtime: split stack overflow"); } + + if(gp->sched.ctxt != nil) { + // morestack wrote sched.ctxt on its way in here, + // without a write barrier. Run the write barrier now. + // It is not possible to be preempted between then + // and now, so it's okay. + runtime·writebarrierptr_nostore(&gp->sched.ctxt, gp->sched.ctxt); + } if(gp->stackguard0 == (uintptr)StackPreempt) { if(gp == g->m->g0) diff --git a/src/runtime/sys_x86.c b/src/runtime/sys_x86.c index a450b3e58..edbe47ff4 100644 --- a/src/runtime/sys_x86.c +++ b/src/runtime/sys_x86.c @@ -20,6 +20,7 @@ runtime·gostartcall(Gobuf *gobuf, void (*fn)(void), void *ctxt) gobuf->sp = (uintptr)sp; gobuf->pc = (uintptr)fn; gobuf->ctxt = ctxt; + runtime·writebarrierptr_nostore(&gobuf->ctxt, ctxt); } // Called to rewind context saved during morestack back to beginning of function. -- cgit v1.2.1 From 4664f7441b495d8fa8aa5001755cb5f85e790b19 Mon Sep 17 00:00:00 2001 From: Rick Hudson Date: Wed, 12 Nov 2014 14:20:53 -0500 Subject: [dev.garbage] runtime: Add write barriers to c code Also improve missing GC mark diagnostics. LGTM=rsc R=rsc CC=golang-codereviews https://codereview.appspot.com/169450043 --- src/runtime/mgc0.c | 21 +++++++++++++++++++-- src/runtime/os_darwin.c | 3 +++ src/runtime/os_dragonfly.c | 3 +++ src/runtime/os_freebsd.c | 3 +++ src/runtime/os_linux.c | 3 +++ src/runtime/os_nacl.c | 3 +++ src/runtime/os_netbsd.c | 3 +++ src/runtime/os_openbsd.c | 3 +++ src/runtime/os_plan9.c | 6 ++++++ src/runtime/os_solaris.c | 3 +++ src/runtime/proc.c | 2 ++ 11 files changed, 51 insertions(+), 2 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/mgc0.c b/src/runtime/mgc0.c index 3c4d1afa5..214b9ebc2 100644 --- a/src/runtime/mgc0.c +++ b/src/runtime/mgc0.c @@ -29,8 +29,7 @@ // Preempted goroutines are scanned before P schedules next goroutine. // 3. Set phase = GCmark. // 4. Wait for all P's to acknowledge phase change. -// 5. 
Now write barrier marks and enqueues black or grey to white pointers. If a pointer is
-//    stored into a white slot, such pointer is not marked.
+// 5. Now write barrier marks and enqueues black, grey, or white to white pointers.
 // Malloc still allocates white (non-marked) objects.
 // 6. Meanwhile GC transitively walks the heap marking reachable objects.
 // 7. When GC finishes marking heap, it preempts P's one-by-one and
@@ -446,7 +445,25 @@ greyobject(byte *obj, Markbits *mbits, Workbuf *wbuf)
 
 	if(checkmark) {
 		if(!ismarked(mbits)) {
+			MSpan *s;
+			pageID k;
+			uintptr x, i;
+
 			runtime·printf("runtime:greyobject: checkmarks finds unexpected unmarked object obj=%p, mbits->bits=%x, *mbits->bitp=%x\n", obj, mbits->bits, *mbits->bitp);
+
+			k = (uintptr)obj>>PageShift;
+			x = k;
+			x -= (uintptr)runtime·mheap.arena_start>>PageShift;
+			s = runtime·mheap.spans[x];
+			runtime·printf("runtime:greyobject Span: obj=%p, k=%p", obj, k);
+			if (s == nil) {
+				runtime·printf(" s=nil\n");
+			} else {
+				runtime·printf(" s->start=%p s->limit=%p, s->state=%d, s->sizeclass=%d, s->elemsize=%D \n", s->start*PageSize, s->limit, s->state, s->sizeclass, s->elemsize);
+				for(i=0; i<s->sizeclass; i++) {
+					runtime·printf(" ((uintptr*)obj)[%D]=%p\n", i, ((uintptr*)obj)[i]);
+				}
+			}
 			runtime·throw("checkmark found unmarked object");
 		}
 		if(ischeckmarked(mbits))
diff --git a/src/runtime/os_darwin.c b/src/runtime/os_darwin.c
index bbd29282b..b866863d0 100644
--- a/src/runtime/os_darwin.c
+++ b/src/runtime/os_darwin.c
@@ -135,7 +135,10 @@ void
 runtime·mpreinit(M *mp)
 {
 	mp->gsignal = runtime·malg(32*1024);	// OS X wants >=8K, Linux >=2K
+	runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal);
+
 	mp->gsignal->m = mp;
+	runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m);
 }
 
 // Called to initialize a new m (including the bootstrap m).
diff --git a/src/runtime/os_dragonfly.c b/src/runtime/os_dragonfly.c
index e372205ec..051192ad3 100644
--- a/src/runtime/os_dragonfly.c
+++ b/src/runtime/os_dragonfly.c
@@ -195,7 +195,10 @@ void
 runtime·mpreinit(M *mp)
 {
 	mp->gsignal = runtime·malg(32*1024);
+	runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal);
+
 	mp->gsignal->m = mp;
+	runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m);
 }
 
 // Called to initialize a new m (including the bootstrap m).
diff --git a/src/runtime/os_freebsd.c b/src/runtime/os_freebsd.c
index a513cb604..1c126547a 100644
--- a/src/runtime/os_freebsd.c
+++ b/src/runtime/os_freebsd.c
@@ -203,7 +203,10 @@ void
 runtime·mpreinit(M *mp)
 {
 	mp->gsignal = runtime·malg(32*1024);
+	runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal);
+
 	mp->gsignal->m = mp;
+	runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m);
 }
 
 // Called to initialize a new m (including the bootstrap m).
diff --git a/src/runtime/os_linux.c b/src/runtime/os_linux.c
index 9bd123d59..cc23774e3 100644
--- a/src/runtime/os_linux.c
+++ b/src/runtime/os_linux.c
@@ -233,7 +233,10 @@ void
 runtime·mpreinit(M *mp)
 {
 	mp->gsignal = runtime·malg(32*1024);	// OS X wants >=8K, Linux >=2K
+	runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal);
+
 	mp->gsignal->m = mp;
+	runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m);
 }
 
 // Called to initialize a new m (including the bootstrap m).
diff --git a/src/runtime/os_nacl.c b/src/runtime/os_nacl.c index 14b558303..ad72cc7c6 100644 --- a/src/runtime/os_nacl.c +++ b/src/runtime/os_nacl.c @@ -20,7 +20,10 @@ void runtime·mpreinit(M *mp) { mp->gsignal = runtime·malg(32*1024); // OS X wants >=8K, Linux >=2K + runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal); + mp->gsignal->m = mp; + runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m); } // Called to initialize a new m (including the bootstrap m). diff --git a/src/runtime/os_netbsd.c b/src/runtime/os_netbsd.c index 58e5bedf2..28929ea57 100644 --- a/src/runtime/os_netbsd.c +++ b/src/runtime/os_netbsd.c @@ -271,7 +271,10 @@ void runtime·mpreinit(M *mp) { mp->gsignal = runtime·malg(32*1024); + runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal); + mp->gsignal->m = mp; + runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m); } // Called to initialize a new m (including the bootstrap m). diff --git a/src/runtime/os_openbsd.c b/src/runtime/os_openbsd.c index eebaa13ee..960aaffff 100644 --- a/src/runtime/os_openbsd.c +++ b/src/runtime/os_openbsd.c @@ -217,7 +217,10 @@ void runtime·mpreinit(M *mp) { mp->gsignal = runtime·malg(32*1024); + runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal); + mp->gsignal->m = mp; + runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m); } // Called to initialize a new m (including the bootstrap m). diff --git a/src/runtime/os_plan9.c b/src/runtime/os_plan9.c index f8c543f6f..18460fc12 100644 --- a/src/runtime/os_plan9.c +++ b/src/runtime/os_plan9.c @@ -20,12 +20,18 @@ runtime·mpreinit(M *mp) { // Initialize stack and goroutine for note handling. mp->gsignal = runtime·malg(32*1024); + runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal); + mp->gsignal->m = mp; + runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m); + mp->notesig = (int8*)runtime·mallocgc(ERRMAX*sizeof(int8), nil, FlagNoScan); + runtime·writebarrierptr_nostore(&mp->notesig, mp->notesig); // Initialize stack for handling strings from the // errstr system call, as used in package syscall. mp->errstr = (byte*)runtime·mallocgc(ERRMAX*sizeof(byte), nil, FlagNoScan); + runtime·writebarrierptr_nostore(&mp->errstr, mp->errstr); } // Called to initialize a new m (including the bootstrap m). diff --git a/src/runtime/os_solaris.c b/src/runtime/os_solaris.c index e16b8e637..bee91d8e6 100644 --- a/src/runtime/os_solaris.c +++ b/src/runtime/os_solaris.c @@ -176,7 +176,10 @@ void runtime·mpreinit(M *mp) { mp->gsignal = runtime·malg(32*1024); + runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal); + mp->gsignal->m = mp; + runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m); } // Called to initialize a new m (including the bootstrap m). diff --git a/src/runtime/proc.c b/src/runtime/proc.c index e5e2df2e4..c1df40d02 100644 --- a/src/runtime/proc.c +++ b/src/runtime/proc.c @@ -876,7 +876,9 @@ runtime·allocm(P *p) mp->g0 = runtime·malg(-1); else mp->g0 = runtime·malg(8192); + runtime·writebarrierptr_nostore(&mp->g0, mp->g0); mp->g0->m = mp; + runtime·writebarrierptr_nostore(&mp->g0->m, mp->g0->m); if(p == g->m->p) releasep(); -- cgit v1.2.1
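The final commit repeats one idiom across the os_*.c files and proc.c: C code that has already performed a pointer store runs only the bookkeeping half of the barrier afterwards, via writebarrierptr_nostore, since redoing the store is unnecessary (and, as the newstack ctxt case earlier shows, can even be unsafe). A hedged sketch of that store/bookkeeping split in Go; shade and markPhase are placeholders, not the runtime's implementation:

package main

var markPhase bool

// shade would grey the object src points at; stubbed here.
func shade(src uintptr) {}

// writebarrierptrNostore models writebarrierptr_nostore: the caller has
// already applied the store, so only the GC bookkeeping runs.
func writebarrierptrNostore(dst *uintptr, src uintptr) {
	if markPhase {
		shade(src)
	}
}

// writebarrierptr is the ordinary form: store, then bookkeeping.
func writebarrierptr(dst *uintptr, src uintptr) {
	*dst = src
	writebarrierptrNostore(dst, src)
}

func main() {
	var gsignal uintptr
	gsignal = 0xc000 // models: mp->gsignal = runtime·malg(...) already done in C
	writebarrierptrNostore(&gsignal, gsignal)
}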