diff options
author | Russ Cox <rsc@golang.org> | 2014-11-14 11:37:54 -0500 |
---|---|---|
committer | Russ Cox <rsc@golang.org> | 2014-11-14 11:37:54 -0500 |
commit | f4110c2e9cc8f316e14f3a4a35789bc821b326bf (patch) | |
tree | 3a2b461d76cc134f7b71e5e1ef63bc37e7d25059 /src/runtime | |
parent | 78d351d121615f1101f28f88920029f20884c689 (diff) | |
parent | 4664f7441b495d8fa8aa5001755cb5f85e790b19 (diff) | |
download | go-f4110c2e9cc8f316e14f3a4a35789bc821b326bf.tar.gz |
[dev.garbage] all: merge default (f38460037b72) into dev.garbage
This is the revision that dev.cc is branched from.
LGTM=austin
R=austin
CC=golang-codereviews
https://codereview.appspot.com/169590043
Diffstat (limited to 'src/runtime')
81 files changed, 4236 insertions, 790 deletions
diff --git a/src/runtime/arch_power64.go b/src/runtime/arch_power64.go new file mode 100644 index 000000000..270cd7b95 --- /dev/null +++ b/src/runtime/arch_power64.go @@ -0,0 +1,8 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +type uintreg uint64 +type intptr int64 // TODO(rsc): remove diff --git a/src/runtime/arch_power64.h b/src/runtime/arch_power64.h new file mode 100644 index 000000000..7cfb9da2f --- /dev/null +++ b/src/runtime/arch_power64.h @@ -0,0 +1,14 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +enum { + thechar = '9', + BigEndian = 1, + CacheLineSize = 64, + RuntimeGogoBytes = 64, + PhysPageSize = 65536, + PCQuantum = 4, + Int64Align = 8 +}; + diff --git a/src/runtime/arch_power64le.go b/src/runtime/arch_power64le.go new file mode 100644 index 000000000..270cd7b95 --- /dev/null +++ b/src/runtime/arch_power64le.go @@ -0,0 +1,8 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +type uintreg uint64 +type intptr int64 // TODO(rsc): remove diff --git a/src/runtime/arch_power64le.h b/src/runtime/arch_power64le.h new file mode 100644 index 000000000..684ac9953 --- /dev/null +++ b/src/runtime/arch_power64le.h @@ -0,0 +1,14 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +enum { + thechar = '9', + BigEndian = 0, + CacheLineSize = 64, + RuntimeGogoBytes = 64, + PhysPageSize = 65536, + PCQuantum = 4, + Int64Align = 8 +}; + diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s index b4b81d739..501e64b09 100644 --- a/src/runtime/asm_386.s +++ b/src/runtime/asm_386.s @@ -486,11 +486,11 @@ TEXT runtime·cas64(SB), NOSPLIT, $0-21 MOVL new_hi+16(FP), CX LOCK CMPXCHG8B 0(BP) - JNZ cas64_fail + JNZ fail MOVL $1, AX MOVB AX, ret+20(FP) RET -cas64_fail: +fail: MOVL $0, AX MOVB AX, ret+20(FP) RET @@ -502,7 +502,7 @@ cas64_fail: // return 1; // }else // return 0; -TEXT runtime·casp(SB), NOSPLIT, $0-13 +TEXT runtime·casp1(SB), NOSPLIT, $0-13 MOVL ptr+0(FP), BX MOVL old+4(FP), AX MOVL new+8(FP), CX @@ -537,7 +537,7 @@ TEXT runtime·xchg(SB), NOSPLIT, $0-12 MOVL AX, ret+8(FP) RET -TEXT runtime·xchgp(SB), NOSPLIT, $0-12 +TEXT runtime·xchgp1(SB), NOSPLIT, $0-12 MOVL ptr+0(FP), BX MOVL new+4(FP), AX XCHGL AX, 0(BX) @@ -555,7 +555,7 @@ again: JNZ again RET -TEXT runtime·atomicstorep(SB), NOSPLIT, $0-8 +TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-8 MOVL ptr+0(FP), BX MOVL val+4(FP), AX XCHGL AX, 0(BX) @@ -1356,29 +1356,29 @@ TEXT strings·IndexByte(SB),NOSPLIT,$0 // AX = 1/0/-1 TEXT runtime·cmpbody(SB),NOSPLIT,$0-0 CMPL SI, DI - JEQ cmp_allsame + JEQ allsame CMPL BX, DX MOVL DX, BP CMOVLLT BX, BP // BP = min(alen, blen) CMPL BP, $4 - JB cmp_small + JB small TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2 - JE cmp_mediumloop -cmp_largeloop: + JE mediumloop +largeloop: CMPL BP, $16 - JB cmp_mediumloop + JB mediumloop MOVOU (SI), X0 MOVOU (DI), X1 PCMPEQB X0, X1 PMOVMSKB X1, AX XORL $0xffff, AX // convert EQ to NE - JNE cmp_diff16 // branch if at least one byte is not equal + JNE diff16 // branch if at least one byte is not equal ADDL $16, SI ADDL $16, DI SUBL $16, BP - JMP cmp_largeloop + JMP largeloop -cmp_diff16: +diff16: BSFL AX, BX // index of first byte that differs XORL AX, AX MOVB (SI)(BX*1), CX @@ -1387,25 +1387,25 @@ cmp_diff16: 
LEAL -1(AX*2), AX // convert 1/0 to +1/-1 RET -cmp_mediumloop: +mediumloop: CMPL BP, $4 - JBE cmp_0through4 + JBE _0through4 MOVL (SI), AX MOVL (DI), CX CMPL AX, CX - JNE cmp_diff4 + JNE diff4 ADDL $4, SI ADDL $4, DI SUBL $4, BP - JMP cmp_mediumloop + JMP mediumloop -cmp_0through4: +_0through4: MOVL -4(SI)(BP*1), AX MOVL -4(DI)(BP*1), CX CMPL AX, CX - JEQ cmp_allsame + JEQ allsame -cmp_diff4: +diff4: BSWAPL AX // reverse order of bytes BSWAPL CX XORL AX, CX // find bit differences @@ -1416,37 +1416,37 @@ cmp_diff4: RET // 0-3 bytes in common -cmp_small: +small: LEAL (BP*8), CX NEGL CX - JEQ cmp_allsame + JEQ allsame // load si CMPB SI, $0xfc - JA cmp_si_high + JA si_high MOVL (SI), SI - JMP cmp_si_finish -cmp_si_high: + JMP si_finish +si_high: MOVL -4(SI)(BP*1), SI SHRL CX, SI -cmp_si_finish: +si_finish: SHLL CX, SI // same for di CMPB DI, $0xfc - JA cmp_di_high + JA di_high MOVL (DI), DI - JMP cmp_di_finish -cmp_di_high: + JMP di_finish +di_high: MOVL -4(DI)(BP*1), DI SHRL CX, DI -cmp_di_finish: +di_finish: SHLL CX, DI BSWAPL SI // reverse order of bytes BSWAPL DI XORL SI, DI // find bit differences - JEQ cmp_allsame + JEQ allsame BSRL DI, CX // index of highest bit difference SHRL CX, SI // move a's bit to bottom ANDL $1, SI // mask bit @@ -1455,7 +1455,7 @@ cmp_di_finish: // all the bytes in common are the same, so we just need // to compare the lengths. 
-cmp_allsame: +allsame: XORL AX, AX XORL CX, CX CMPL BX, DX diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index 39d7c78f2..1aa2d71a8 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -461,11 +461,11 @@ TEXT runtime·cas64(SB), NOSPLIT, $0-25 MOVQ new+16(FP), CX LOCK CMPXCHGQ CX, 0(BX) - JNZ cas64_fail + JNZ fail MOVL $1, AX MOVB AX, ret+24(FP) RET -cas64_fail: +fail: MOVL $0, AX MOVB AX, ret+24(FP) RET @@ -489,7 +489,7 @@ TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16 // return 1; // } else // return 0; -TEXT runtime·casp(SB), NOSPLIT, $0-25 +TEXT runtime·casp1(SB), NOSPLIT, $0-25 MOVQ ptr+0(FP), BX MOVQ old+8(FP), AX MOVQ new+16(FP), CX @@ -541,7 +541,7 @@ TEXT runtime·xchg64(SB), NOSPLIT, $0-24 MOVQ AX, ret+16(FP) RET -TEXT runtime·xchgp(SB), NOSPLIT, $0-24 +TEXT runtime·xchgp1(SB), NOSPLIT, $0-24 MOVQ ptr+0(FP), BX MOVQ new+8(FP), AX XCHGQ AX, 0(BX) @@ -559,7 +559,7 @@ again: JNZ again RET -TEXT runtime·atomicstorep(SB), NOSPLIT, $0-16 +TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16 MOVQ ptr+0(FP), BX MOVQ val+8(FP), AX XCHGQ AX, 0(BX) @@ -890,24 +890,24 @@ TEXT runtime·aeshashbody(SB),NOSPLIT,$0-32 MOVO runtime·aeskeysched+0(SB), X2 MOVO runtime·aeskeysched+16(SB), X3 CMPQ CX, $16 - JB aessmall -aesloop: + JB small +loop: CMPQ CX, $16 - JBE aesloopend + JBE loopend MOVOU (AX), X1 AESENC X2, X0 AESENC X1, X0 SUBQ $16, CX ADDQ $16, AX - JMP aesloop + JMP loop // 1-16 bytes remaining -aesloopend: +loopend: // This load may overlap with the previous load above. // We'll hash some bytes twice, but that's ok. 
MOVOU -16(AX)(CX*1), X1 JMP partial // 0-15 bytes -aessmall: +small: TESTQ CX, CX JE finalize // 0 bytes @@ -1050,18 +1050,18 @@ TEXT runtime·eqstring(SB),NOSPLIT,$0-33 MOVQ s1len+8(FP), AX MOVQ s2len+24(FP), BX CMPQ AX, BX - JNE different + JNE noteq MOVQ s1str+0(FP), SI MOVQ s2str+16(FP), DI CMPQ SI, DI - JEQ same + JEQ eq CALL runtime·memeqbody(SB) MOVB AX, v+32(FP) RET -same: +eq: MOVB $1, v+32(FP) RET -different: +noteq: MOVB $0, v+32(FP) RET @@ -1184,29 +1184,29 @@ TEXT runtime·cmpbytes(SB),NOSPLIT,$0-56 // AX = 1/0/-1 TEXT runtime·cmpbody(SB),NOSPLIT,$0-0 CMPQ SI, DI - JEQ cmp_allsame + JEQ allsame CMPQ BX, DX MOVQ DX, BP CMOVQLT BX, BP // BP = min(alen, blen) = # of bytes to compare CMPQ BP, $8 - JB cmp_small + JB small -cmp_loop: +loop: CMPQ BP, $16 - JBE cmp_0through16 + JBE _0through16 MOVOU (SI), X0 MOVOU (DI), X1 PCMPEQB X0, X1 PMOVMSKB X1, AX XORQ $0xffff, AX // convert EQ to NE - JNE cmp_diff16 // branch if at least one byte is not equal + JNE diff16 // branch if at least one byte is not equal ADDQ $16, SI ADDQ $16, DI SUBQ $16, BP - JMP cmp_loop + JMP loop // AX = bit mask of differences -cmp_diff16: +diff16: BSFQ AX, BX // index of first byte that differs XORQ AX, AX MOVB (SI)(BX*1), CX @@ -1216,21 +1216,21 @@ cmp_diff16: RET // 0 through 16 bytes left, alen>=8, blen>=8 -cmp_0through16: +_0through16: CMPQ BP, $8 - JBE cmp_0through8 + JBE _0through8 MOVQ (SI), AX MOVQ (DI), CX CMPQ AX, CX - JNE cmp_diff8 -cmp_0through8: + JNE diff8 +_0through8: MOVQ -8(SI)(BP*1), AX MOVQ -8(DI)(BP*1), CX CMPQ AX, CX - JEQ cmp_allsame + JEQ allsame // AX and CX contain parts of a and b that differ. 
-cmp_diff8: +diff8: BSWAPQ AX // reverse order of bytes BSWAPQ CX XORQ AX, CX @@ -1241,44 +1241,44 @@ cmp_diff8: RET // 0-7 bytes in common -cmp_small: +small: LEAQ (BP*8), CX // bytes left -> bits left NEGQ CX // - bits lift (== 64 - bits left mod 64) - JEQ cmp_allsame + JEQ allsame // load bytes of a into high bytes of AX CMPB SI, $0xf8 - JA cmp_si_high + JA si_high MOVQ (SI), SI - JMP cmp_si_finish -cmp_si_high: + JMP si_finish +si_high: MOVQ -8(SI)(BP*1), SI SHRQ CX, SI -cmp_si_finish: +si_finish: SHLQ CX, SI // load bytes of b in to high bytes of BX CMPB DI, $0xf8 - JA cmp_di_high + JA di_high MOVQ (DI), DI - JMP cmp_di_finish -cmp_di_high: + JMP di_finish +di_high: MOVQ -8(DI)(BP*1), DI SHRQ CX, DI -cmp_di_finish: +di_finish: SHLQ CX, DI BSWAPQ SI // reverse order of bytes BSWAPQ DI XORQ SI, DI // find bit differences - JEQ cmp_allsame + JEQ allsame BSRQ DI, CX // index of highest bit difference SHRQ CX, SI // move a's bit to bottom ANDQ $1, SI // mask bit LEAQ -1(SI*2), AX // 1/0 => +1/-1 RET -cmp_allsame: +allsame: XORQ AX, AX XORQ CX, CX CMPQ BX, DX @@ -1313,7 +1313,7 @@ TEXT runtime·indexbytebody(SB),NOSPLIT,$0 MOVQ SI, DI CMPQ BX, $16 - JLT indexbyte_small + JLT small // round up to first 16-byte boundary TESTQ $15, SI @@ -1371,7 +1371,7 @@ failure: RET // handle for lengths < 16 -indexbyte_small: +small: MOVQ BX, CX REPN; SCASB JZ success diff --git a/src/runtime/asm_amd64p32.s b/src/runtime/asm_amd64p32.s index a1116b5d4..153564b14 100644 --- a/src/runtime/asm_amd64p32.s +++ b/src/runtime/asm_amd64p32.s @@ -444,11 +444,11 @@ TEXT runtime·cas64(SB), NOSPLIT, $0-25 MOVQ new+16(FP), CX LOCK CMPXCHGQ CX, 0(BX) - JNZ cas64_fail + JNZ fail MOVL $1, AX MOVB AX, ret+24(FP) RET -cas64_fail: +fail: MOVL $0, AX MOVB AX, ret+24(FP) RET @@ -460,7 +460,7 @@ cas64_fail: // return 1; // } else // return 0; -TEXT runtime·casp(SB), NOSPLIT, $0-17 +TEXT runtime·casp1(SB), NOSPLIT, $0-17 MOVL ptr+0(FP), BX MOVL old+4(FP), AX MOVL new+8(FP), CX @@ -512,7 +512,7 @@ TEXT 
runtime·xchg64(SB), NOSPLIT, $0-24 MOVQ AX, ret+16(FP) RET -TEXT runtime·xchgp(SB), NOSPLIT, $0-12 +TEXT runtime·xchgp1(SB), NOSPLIT, $0-12 MOVL ptr+0(FP), BX MOVL new+4(FP), AX XCHGL AX, 0(BX) @@ -530,7 +530,7 @@ again: JNZ again RET -TEXT runtime·atomicstorep(SB), NOSPLIT, $0-8 +TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-8 MOVL ptr+0(FP), BX MOVL val+4(FP), AX XCHGL AX, 0(BX) @@ -834,29 +834,29 @@ TEXT runtime·cmpbytes(SB),NOSPLIT,$0-28 // AX = 1/0/-1 TEXT runtime·cmpbody(SB),NOSPLIT,$0-0 CMPQ SI, DI - JEQ cmp_allsame + JEQ allsame CMPQ BX, DX MOVQ DX, R8 CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare CMPQ R8, $8 - JB cmp_small + JB small -cmp_loop: +loop: CMPQ R8, $16 - JBE cmp_0through16 + JBE _0through16 MOVOU (SI), X0 MOVOU (DI), X1 PCMPEQB X0, X1 PMOVMSKB X1, AX XORQ $0xffff, AX // convert EQ to NE - JNE cmp_diff16 // branch if at least one byte is not equal + JNE diff16 // branch if at least one byte is not equal ADDQ $16, SI ADDQ $16, DI SUBQ $16, R8 - JMP cmp_loop + JMP loop // AX = bit mask of differences -cmp_diff16: +diff16: BSFQ AX, BX // index of first byte that differs XORQ AX, AX ADDQ BX, SI @@ -868,23 +868,23 @@ cmp_diff16: RET // 0 through 16 bytes left, alen>=8, blen>=8 -cmp_0through16: +_0through16: CMPQ R8, $8 - JBE cmp_0through8 + JBE _0through8 MOVQ (SI), AX MOVQ (DI), CX CMPQ AX, CX - JNE cmp_diff8 -cmp_0through8: + JNE diff8 +_0through8: ADDQ R8, SI ADDQ R8, DI MOVQ -8(SI), AX MOVQ -8(DI), CX CMPQ AX, CX - JEQ cmp_allsame + JEQ allsame // AX and CX contain parts of a and b that differ. 
-cmp_diff8: +diff8: BSWAPQ AX // reverse order of bytes BSWAPQ CX XORQ AX, CX @@ -895,46 +895,46 @@ cmp_diff8: RET // 0-7 bytes in common -cmp_small: +small: LEAQ (R8*8), CX // bytes left -> bits left NEGQ CX // - bits lift (== 64 - bits left mod 64) - JEQ cmp_allsame + JEQ allsame // load bytes of a into high bytes of AX CMPB SI, $0xf8 - JA cmp_si_high + JA si_high MOVQ (SI), SI - JMP cmp_si_finish -cmp_si_high: + JMP si_finish +si_high: ADDQ R8, SI MOVQ -8(SI), SI SHRQ CX, SI -cmp_si_finish: +si_finish: SHLQ CX, SI // load bytes of b in to high bytes of BX CMPB DI, $0xf8 - JA cmp_di_high + JA di_high MOVQ (DI), DI - JMP cmp_di_finish -cmp_di_high: + JMP di_finish +di_high: ADDQ R8, DI MOVQ -8(DI), DI SHRQ CX, DI -cmp_di_finish: +di_finish: SHLQ CX, DI BSWAPQ SI // reverse order of bytes BSWAPQ DI XORQ SI, DI // find bit differences - JEQ cmp_allsame + JEQ allsame BSRQ DI, CX // index of highest bit difference SHRQ CX, SI // move a's bit to bottom ANDQ $1, SI // mask bit LEAQ -1(SI*2), AX // 1/0 => +1/-1 RET -cmp_allsame: +allsame: XORQ AX, AX XORQ CX, CX CMPQ BX, DX @@ -969,7 +969,7 @@ TEXT runtime·indexbytebody(SB),NOSPLIT,$0 MOVL SI, DI CMPL BX, $16 - JLT indexbyte_small + JLT small // round up to first 16-byte boundary TESTL $15, SI @@ -1027,7 +1027,7 @@ failure: RET // handle for lengths < 16 -indexbyte_small: +small: MOVL BX, CX REPN; SCASB JZ success diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s index 0f3b5eeb8..58aebf388 100644 --- a/src/runtime/asm_arm.s +++ b/src/runtime/asm_arm.s @@ -492,7 +492,7 @@ TEXT asmcgocall<>(SB),NOSPLIT,$0-0 MOVW g_m(g), R8 MOVW m_g0(R8), R3 CMP R3, g - BEQ asmcgocall_g0 + BEQ g0 BL gosave<>(SB) MOVW R0, R5 MOVW R3, R0 @@ -501,7 +501,7 @@ TEXT asmcgocall<>(SB),NOSPLIT,$0-0 MOVW (g_sched+gobuf_sp)(g), R13 // Now on a scheduling stack (a pthread-created stack). 
-asmcgocall_g0: +g0: SUB $24, R13 BIC $0x7, R13 // alignment for gcc ABI MOVW R4, 20(R13) // save old g @@ -751,13 +751,13 @@ TEXT runtime·memeq(SB),NOSPLIT,$-4-13 ADD R1, R3, R6 MOVW $1, R0 MOVB R0, ret+12(FP) -_next2: +loop: CMP R1, R6 RET.EQ MOVBU.P 1(R1), R4 MOVBU.P 1(R2), R5 CMP R4, R5 - BEQ _next2 + BEQ loop MOVW $0, R0 MOVB R0, ret+12(FP) @@ -780,13 +780,13 @@ TEXT runtime·eqstring(SB),NOSPLIT,$-4-17 CMP R2, R3 RET.EQ ADD R2, R0, R6 -_eqnext: +loop: CMP R2, R6 RET.EQ MOVBU.P 1(R2), R4 MOVBU.P 1(R3), R5 CMP R4, R5 - BEQ _eqnext + BEQ loop MOVB R7, v+16(FP) RET @@ -801,26 +801,26 @@ TEXT bytes·Equal(SB),NOSPLIT,$0 MOVW b_len+16(FP), R3 CMP R1, R3 // unequal lengths are not equal - B.NE _notequal + B.NE notequal MOVW a+0(FP), R0 MOVW b+12(FP), R2 ADD R0, R1 // end -_byteseq_next: +loop: CMP R0, R1 - B.EQ _equal // reached the end + B.EQ equal // reached the end MOVBU.P 1(R0), R4 MOVBU.P 1(R2), R5 CMP R4, R5 - B.EQ _byteseq_next + B.EQ loop -_notequal: +notequal: MOVW $0, R0 MOVBU R0, ret+24(FP) RET -_equal: +equal: MOVW $1, R0 MOVBU R0, ret+24(FP) RET diff --git a/src/runtime/asm_power64x.s b/src/runtime/asm_power64x.s new file mode 100644 index 000000000..fd0c6be16 --- /dev/null +++ b/src/runtime/asm_power64x.s @@ -0,0 +1,981 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build power64 power64le + +#include "zasm_GOOS_GOARCH.h" +#include "funcdata.h" +#include "textflag.h" + +TEXT runtime·rt0_go(SB),NOSPLIT,$0 + // initialize essential registers + BL runtime·reginit(SB) + + SUB $24, R1 + MOVW R3, 8(R1) // argc + MOVD R4, 16(R1) // argv + + // create istack out of the given (operating system) stack. + // _cgo_init may update stackguard. 
+ MOVD $runtime·g0(SB), g + MOVD $(-64*1024), R31 + ADD R31, R1, R3 + MOVD R3, g_stackguard0(g) + MOVD R3, g_stackguard1(g) + MOVD R3, (g_stack+stack_lo)(g) + MOVD R1, (g_stack+stack_hi)(g) + + // TODO: if there is a _cgo_init, call it. + // TODO: add TLS + + // set the per-goroutine and per-mach "registers" + MOVD $runtime·m0(SB), R3 + + // save m->g0 = g0 + MOVD g, m_g0(R3) + // save m0 to g0->m + MOVD R3, g_m(g) + + BL runtime·check(SB) + + // args are already prepared + BL runtime·args(SB) + BL runtime·osinit(SB) + BL runtime·schedinit(SB) + + // create a new goroutine to start program + MOVD $runtime·main·f(SB), R3 // entry + MOVDU R3, -8(R1) + MOVDU R0, -8(R1) + MOVDU R0, -8(R1) + BL runtime·newproc(SB) + ADD $24, R1 + + // start this M + BL runtime·mstart(SB) + + MOVD R0, 1(R0) + RETURN + +DATA runtime·main·f+0(SB)/8,$runtime·main(SB) +GLOBL runtime·main·f(SB),RODATA,$8 + +TEXT runtime·breakpoint(SB),NOSPLIT,$-8-0 + MOVD R0, 2(R0) // TODO: TD + RETURN + +TEXT runtime·asminit(SB),NOSPLIT,$-8-0 + RETURN + +TEXT runtime·reginit(SB),NOSPLIT,$-8-0 + // set R0 to zero, it's expected by the toolchain + XOR R0, R0 + // initialize essential FP registers + FMOVD $4503601774854144.0, F27 + FMOVD $0.5, F29 + FSUB F29, F29, F28 + FADD F29, F29, F30 + FADD F30, F30, F31 + RETURN + +/* + * go-routine + */ + +// void gosave(Gobuf*) +// save state in Gobuf; setjmp +TEXT runtime·gosave(SB), NOSPLIT, $-8-8 + MOVD gobuf+0(FP), R3 + MOVD R1, gobuf_sp(R3) + MOVD LR, R31 + MOVD R31, gobuf_pc(R3) + MOVD g, gobuf_g(R3) + MOVD R0, gobuf_lr(R3) + MOVD R0, gobuf_ret(R3) + MOVD R0, gobuf_ctxt(R3) + RETURN + +// void gogo(Gobuf*) +// restore state from Gobuf; longjmp +TEXT runtime·gogo(SB), NOSPLIT, $-8-8 + MOVD gobuf+0(FP), R5 + MOVD gobuf_g(R5), g // make sure g is not nil + MOVD 0(g), R4 + MOVD gobuf_sp(R5), R1 + MOVD gobuf_lr(R5), R31 + MOVD R31, LR + MOVD gobuf_ret(R5), R3 + MOVD gobuf_ctxt(R5), R11 + MOVD R0, gobuf_sp(R5) + MOVD R0, gobuf_ret(R5) + MOVD R0, gobuf_lr(R5) + MOVD R0, 
gobuf_ctxt(R5) + CMP R0, R0 // set condition codes for == test, needed by stack split + MOVD gobuf_pc(R5), R31 + MOVD R31, CTR + BR (CTR) + +// void mcall(fn func(*g)) +// Switch to m->g0's stack, call fn(g). +// Fn must never return. It should gogo(&g->sched) +// to keep running g. +TEXT runtime·mcall(SB), NOSPLIT, $-8-8 + // Save caller state in g->sched + MOVD R1, (g_sched+gobuf_sp)(g) + MOVD LR, R31 + MOVD R31, (g_sched+gobuf_pc)(g) + MOVD R0, (g_sched+gobuf_lr)(g) + MOVD g, (g_sched+gobuf_g)(g) + + // Switch to m->g0 & its stack, call fn. + MOVD g, R3 + MOVD g_m(g), R8 + MOVD m_g0(R8), g + CMP g, R3 + BNE 2(PC) + BR runtime·badmcall(SB) + MOVD fn+0(FP), R11 // context + MOVD 0(R11), R4 // code pointer + MOVD R4, CTR + MOVD (g_sched+gobuf_sp)(g), R1 // sp = m->g0->sched.sp + MOVDU R3, -8(R1) + MOVDU R0, -8(R1) + BL (CTR) + BR runtime·badmcall2(SB) + +// switchtoM is a dummy routine that onM leaves at the bottom +// of the G stack. We need to distinguish the routine that +// lives at the bottom of the G stack from the one that lives +// at the top of the M stack because the one at the top of +// the M stack terminates the stack walk (see topofstack()). +TEXT runtime·switchtoM(SB), NOSPLIT, $0-0 + UNDEF + BL (LR) // make sure this function is not leaf + RETURN + +// func onM_signalok(fn func()) +TEXT runtime·onM_signalok(SB), NOSPLIT, $8-8 + MOVD g, R3 // R3 = g + MOVD g_m(R3), R4 // R4 = g->m + MOVD m_gsignal(R4), R4 // R4 = g->m->gsignal + MOVD fn+0(FP), R11 // context for call below + CMP R3, R4 + BEQ onsignal + MOVD R11, 8(R1) + BL runtime·onM(SB) + RETURN + +onsignal: + MOVD 0(R11), R3 // code pointer + MOVD R3, CTR + BL (CTR) + RETURN + +// void onM(fn func()) +TEXT runtime·onM(SB), NOSPLIT, $0-8 + MOVD fn+0(FP), R3 // R3 = fn + MOVD R3, R11 // context + MOVD g_m(g), R4 // R4 = m + + MOVD m_g0(R4), R5 // R5 = g0 + CMP g, R5 + BEQ onm + + MOVD m_curg(R4), R6 + CMP g, R6 + BEQ oncurg + + // Not g0, not curg. Must be gsignal, but that's not allowed. 
+ // Hide call from linker nosplit analysis. + MOVD $runtime·badonm(SB), R3 + MOVD R3, CTR + BL (CTR) + +oncurg: + // save our state in g->sched. Pretend to + // be switchtoM if the G stack is scanned. + MOVD $runtime·switchtoM(SB), R6 + ADD $8, R6 // get past prologue + MOVD R6, (g_sched+gobuf_pc)(g) + MOVD R1, (g_sched+gobuf_sp)(g) + MOVD R0, (g_sched+gobuf_lr)(g) + MOVD g, (g_sched+gobuf_g)(g) + + // switch to g0 + MOVD R5, g + MOVD (g_sched+gobuf_sp)(g), R3 + // make it look like mstart called onM on g0, to stop traceback + SUB $8, R3 + MOVD $runtime·mstart(SB), R4 + MOVD R4, 0(R3) + MOVD R3, R1 + + // call target function + MOVD 0(R11), R3 // code pointer + MOVD R3, CTR + BL (CTR) + + // switch back to g + MOVD g_m(g), R3 + MOVD m_curg(R3), g + MOVD (g_sched+gobuf_sp)(g), R1 + MOVD R0, (g_sched+gobuf_sp)(g) + RETURN + +onm: + // already on m stack, just call directly + MOVD 0(R11), R3 // code pointer + MOVD R3, CTR + BL (CTR) + RETURN + +/* + * support for morestack + */ + +// Called during function prolog when more stack is needed. +// Caller has already loaded: +// R3: framesize, R4: argsize, R5: LR +// +// The traceback routines see morestack on a g0 as being +// the top of a stack (for example, morestack calling newstack +// calling the scheduler calling newm calling gc), so we must +// record an argument size. For that purpose, it has no arguments. +TEXT runtime·morestack(SB),NOSPLIT,$-8-0 + // Cannot grow scheduler stack (m->g0). + MOVD g_m(g), R7 + MOVD m_g0(R7), R8 + CMP g, R8 + BNE 2(PC) + BL runtime·abort(SB) + + // Cannot grow signal stack (m->gsignal). + MOVD m_gsignal(R7), R8 + CMP g, R8 + BNE 2(PC) + BL runtime·abort(SB) + + // Called from f. + // Set g->sched to context in f. + MOVD R11, (g_sched+gobuf_ctxt)(g) + MOVD R1, (g_sched+gobuf_sp)(g) + MOVD LR, R8 + MOVD R8, (g_sched+gobuf_pc)(g) + MOVD R5, (g_sched+gobuf_lr)(g) + + // Called from f. + // Set m->morebuf to f's caller. 
+ MOVD R5, (m_morebuf+gobuf_pc)(R7) // f's caller's PC + MOVD R1, (m_morebuf+gobuf_sp)(R7) // f's caller's SP + MOVD g, (m_morebuf+gobuf_g)(R7) + + // Call newstack on m->g0's stack. + MOVD m_g0(R7), g + MOVD (g_sched+gobuf_sp)(g), R1 + BL runtime·newstack(SB) + + // Not reached, but make sure the return PC from the call to newstack + // is still in this function, and not the beginning of the next. + UNDEF + +TEXT runtime·morestack_noctxt(SB),NOSPLIT,$-8-0 + MOVD R0, R11 + BR runtime·morestack(SB) + +// reflectcall: call a function with the given argument list +// func call(f *FuncVal, arg *byte, argsize, retoffset uint32). +// we don't have variable-sized frames, so we use a small number +// of constant-sized-frame functions to encode a few bits of size in the pc. +// Caution: ugly multiline assembly macros in your future! + +#define DISPATCH(NAME,MAXSIZE) \ + MOVD $MAXSIZE, R31; \ + CMP R3, R31; \ + BGT 4(PC); \ + MOVD $NAME(SB), R31; \ + MOVD R31, CTR; \ + BR (CTR) +// Note: can't just "BR NAME(SB)" - bad inlining results. 
+ +TEXT ·reflectcall(SB), NOSPLIT, $-8-24 + MOVW argsize+16(FP), R3 + DISPATCH(runtime·call16, 16) + DISPATCH(runtime·call32, 32) + DISPATCH(runtime·call64, 64) + DISPATCH(runtime·call128, 128) + DISPATCH(runtime·call256, 256) + DISPATCH(runtime·call512, 512) + DISPATCH(runtime·call1024, 1024) + DISPATCH(runtime·call2048, 2048) + DISPATCH(runtime·call4096, 4096) + DISPATCH(runtime·call8192, 8192) + DISPATCH(runtime·call16384, 16384) + DISPATCH(runtime·call32768, 32768) + DISPATCH(runtime·call65536, 65536) + DISPATCH(runtime·call131072, 131072) + DISPATCH(runtime·call262144, 262144) + DISPATCH(runtime·call524288, 524288) + DISPATCH(runtime·call1048576, 1048576) + DISPATCH(runtime·call2097152, 2097152) + DISPATCH(runtime·call4194304, 4194304) + DISPATCH(runtime·call8388608, 8388608) + DISPATCH(runtime·call16777216, 16777216) + DISPATCH(runtime·call33554432, 33554432) + DISPATCH(runtime·call67108864, 67108864) + DISPATCH(runtime·call134217728, 134217728) + DISPATCH(runtime·call268435456, 268435456) + DISPATCH(runtime·call536870912, 536870912) + DISPATCH(runtime·call1073741824, 1073741824) + MOVD $runtime·badreflectcall(SB), R31 + MOVD R31, CTR + BR (CTR) + +#define CALLFN(NAME,MAXSIZE) \ +TEXT NAME(SB), WRAPPER, $MAXSIZE-24; \ + NO_LOCAL_POINTERS; \ + /* copy arguments to stack */ \ + MOVD argptr+8(FP), R3; \ + MOVW argsize+16(FP), R4; \ + MOVD R1, R5; \ + ADD $(8-1), R5; \ + SUB $1, R3; \ + ADD R5, R4; \ + CMP R5, R4; \ + BEQ 4(PC); \ + MOVBZU 1(R3), R6; \ + MOVBZU R6, 1(R5); \ + BR -4(PC); \ + /* call function */ \ + MOVD f+0(FP), R11; \ + MOVD (R11), R31; \ + MOVD R31, CTR; \ + PCDATA $PCDATA_StackMapIndex, $0; \ + BL (CTR); \ + /* copy return values back */ \ + MOVD argptr+8(FP), R3; \ + MOVW argsize+16(FP), R4; \ + MOVW retoffset+20(FP), R6; \ + MOVD R1, R5; \ + ADD R6, R5; \ + ADD R6, R3; \ + SUB R6, R4; \ + ADD $(8-1), R5; \ + SUB $1, R3; \ + ADD R5, R4; \ + CMP R5, R4; \ + BEQ 4(PC); \ + MOVBZU 1(R5), R6; \ + MOVBZU R6, 1(R3); \ + BR -4(PC); \ + RETURN + 
+CALLFN(·call16, 16) +CALLFN(·call32, 32) +CALLFN(·call64, 64) +CALLFN(·call128, 128) +CALLFN(·call256, 256) +CALLFN(·call512, 512) +CALLFN(·call1024, 1024) +CALLFN(·call2048, 2048) +CALLFN(·call4096, 4096) +CALLFN(·call8192, 8192) +CALLFN(·call16384, 16384) +CALLFN(·call32768, 32768) +CALLFN(·call65536, 65536) +CALLFN(·call131072, 131072) +CALLFN(·call262144, 262144) +CALLFN(·call524288, 524288) +CALLFN(·call1048576, 1048576) +CALLFN(·call2097152, 2097152) +CALLFN(·call4194304, 4194304) +CALLFN(·call8388608, 8388608) +CALLFN(·call16777216, 16777216) +CALLFN(·call33554432, 33554432) +CALLFN(·call67108864, 67108864) +CALLFN(·call134217728, 134217728) +CALLFN(·call268435456, 268435456) +CALLFN(·call536870912, 536870912) +CALLFN(·call1073741824, 1073741824) + +// bool cas(int32 *val, int32 old, int32 new) +// Atomically: +// if(*val == old){ +// *val = new; +// return 1; +// } else +// return 0; +TEXT runtime·cas(SB), NOSPLIT, $0-17 + MOVD p+0(FP), R3 + MOVW old+8(FP), R4 + MOVW new+12(FP), R5 +cas_again: + SYNC + LWAR (R3), R6 + CMPW R6, R4 + BNE cas_fail + STWCCC R5, (R3) + BNE cas_again + MOVD $1, R3 + SYNC + ISYNC + MOVB R3, ret+16(FP) + RETURN +cas_fail: + MOVD $0, R3 + BR -5(PC) + +// bool runtime·cas64(uint64 *val, uint64 old, uint64 new) +// Atomically: +// if(*val == *old){ +// *val = new; +// return 1; +// } else { +// return 0; +// } +TEXT runtime·cas64(SB), NOSPLIT, $0-25 + MOVD p+0(FP), R3 + MOVD old+8(FP), R4 + MOVD new+16(FP), R5 +cas64_again: + SYNC + LDAR (R3), R6 + CMP R6, R4 + BNE cas64_fail + STDCCC R5, (R3) + BNE cas64_again + MOVD $1, R3 + SYNC + ISYNC + MOVB R3, ret+24(FP) + RETURN +cas64_fail: + MOVD $0, R3 + BR -5(PC) + +TEXT runtime·casuintptr(SB), NOSPLIT, $0-25 + BR runtime·cas64(SB) + +TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $-8-16 + BR runtime·atomicload64(SB) + +TEXT runtime·atomicloaduint(SB), NOSPLIT, $-8-16 + BR runtime·atomicload64(SB) + +TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16 + BR runtime·atomicstore64(SB) + +// 
bool casp(void **val, void *old, void *new) +// Atomically: +// if(*val == old){ +// *val = new; +// return 1; +// } else +// return 0; +TEXT runtime·casp1(SB), NOSPLIT, $0-25 + BR runtime·cas64(SB) + +// uint32 xadd(uint32 volatile *val, int32 delta) +// Atomically: +// *val += delta; +// return *val; +TEXT runtime·xadd(SB), NOSPLIT, $0-20 + MOVD p+0(FP), R4 + MOVW delta+8(FP), R5 + SYNC + LWAR (R4), R3 + ADD R5, R3 + STWCCC R3, (R4) + BNE -4(PC) + SYNC + ISYNC + MOVW R3, ret+16(FP) + RETURN + +TEXT runtime·xadd64(SB), NOSPLIT, $0-24 + MOVD p+0(FP), R4 + MOVD delta+8(FP), R5 + SYNC + LDAR (R4), R3 + ADD R5, R3 + STDCCC R3, (R4) + BNE -4(PC) + SYNC + ISYNC + MOVD R3, ret+16(FP) + RETURN + +TEXT runtime·xchg(SB), NOSPLIT, $0-20 + MOVD p+0(FP), R4 + MOVW new+8(FP), R5 + SYNC + LWAR (R4), R3 + STWCCC R5, (R4) + BNE -3(PC) + SYNC + ISYNC + MOVW R3, ret+16(FP) + RETURN + +TEXT runtime·xchg64(SB), NOSPLIT, $0-24 + MOVD p+0(FP), R4 + MOVD new+8(FP), R5 + SYNC + LDAR (R4), R3 + STDCCC R5, (R4) + BNE -3(PC) + SYNC + ISYNC + MOVD R3, ret+16(FP) + RETURN + +TEXT runtime·xchgp1(SB), NOSPLIT, $0-24 + BR runtime·xchg64(SB) + +TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24 + BR runtime·xchg64(SB) + +TEXT runtime·procyield(SB),NOSPLIT,$0-0 + RETURN + +TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16 + BR runtime·atomicstore64(SB) + +TEXT runtime·atomicstore(SB), NOSPLIT, $0-12 + MOVD ptr+0(FP), R3 + MOVW val+8(FP), R4 + SYNC + MOVW R4, 0(R3) + RETURN + +TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16 + MOVD ptr+0(FP), R3 + MOVD val+8(FP), R4 + SYNC + MOVD R4, 0(R3) + RETURN + +// void runtime·atomicor8(byte volatile*, byte); +TEXT runtime·atomicor8(SB), NOSPLIT, $0-9 + MOVD ptr+0(FP), R3 + MOVBZ val+8(FP), R4 + // Align ptr down to 4 bytes so we can use 32-bit load/store. + // R5 = (R3 << 0) & ~3 + RLDCR $0, R3, $~3, R5 + // Compute val shift. +#ifdef GOARCH_power64 + // Big endian. 
ptr = ptr ^ 3 + XOR $3, R3 +#endif + // R6 = ((ptr & 3) * 8) = (ptr << 3) & (3*8) + RLDC $3, R3, $(3*8), R6 + // Shift val for aligned ptr. R4 = val << R6 + SLD R6, R4, R4 + +atomicor8_again: + SYNC + LWAR (R5), R6 + OR R4, R6 + STWCCC R6, (R5) + BNE atomicor8_again + SYNC + ISYNC + RETURN + +// void jmpdefer(fv, sp); +// called from deferreturn. +// 1. grab stored LR for caller +// 2. sub 4 bytes to get back to BL deferreturn +// 3. BR to fn +TEXT runtime·jmpdefer(SB), NOSPLIT, $-8-16 + MOVD 0(R1), R31 + SUB $4, R31 + MOVD R31, LR + + MOVD fv+0(FP), R11 + MOVD argp+8(FP), R1 + SUB $8, R1 + MOVD 0(R11), R3 + MOVD R3, CTR + BR (CTR) + +// Save state of caller into g->sched. Smashes R31. +TEXT gosave<>(SB),NOSPLIT,$-8 + MOVD LR, R31 + MOVD R31, (g_sched+gobuf_pc)(g) + MOVD R1, (g_sched+gobuf_sp)(g) + MOVD R0, (g_sched+gobuf_lr)(g) + MOVD R0, (g_sched+gobuf_ret)(g) + MOVD R0, (g_sched+gobuf_ctxt)(g) + RETURN + +// asmcgocall(void(*fn)(void*), void *arg) +// Call fn(arg) on the scheduler stack, +// aligned appropriately for the gcc ABI. +// See cgocall.c for more details. +TEXT ·asmcgocall(SB),NOSPLIT,$0-16 + MOVD R0, 21(R0) + +// cgocallback(void (*fn)(void*), void *frame, uintptr framesize) +// Turn the fn into a Go func (by taking its address) and call +// cgocallback_gofunc. +TEXT runtime·cgocallback(SB),NOSPLIT,$24-24 + MOVD R0, 22(R0) + +// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize) +// See cgocall.c for more details. +TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-24 + MOVD R0, 23(R0) + +// void setg(G*); set g. for use by needm. +TEXT runtime·setg(SB), NOSPLIT, $0-8 + MOVD R0, 24(R0) + +// void setg_gcc(G*); set g called from gcc. 
+TEXT setg_gcc<>(SB),NOSPLIT,$0 + MOVD R0, 25(R0) + +TEXT runtime·getcallerpc(SB),NOSPLIT,$-8-16 + MOVD 0(R1), R3 + MOVD R3, ret+8(FP) + RETURN + +TEXT runtime·gogetcallerpc(SB),NOSPLIT,$-8-16 + MOVD 0(R1), R3 + MOVD R3,ret+8(FP) + RETURN + +TEXT runtime·setcallerpc(SB),NOSPLIT,$-8-16 + MOVD pc+8(FP), R3 + MOVD R3, 0(R1) // set calling pc + RETURN + +TEXT runtime·getcallersp(SB),NOSPLIT,$0-16 + MOVD sp+0(FP), R3 + SUB $8, R3 + MOVD R3, ret+8(FP) + RETURN + +// func gogetcallersp(p unsafe.Pointer) uintptr +TEXT runtime·gogetcallersp(SB),NOSPLIT,$0-16 + MOVD sp+0(FP), R3 + SUB $8, R3 + MOVD R3,ret+8(FP) + RETURN + +TEXT runtime·abort(SB),NOSPLIT,$-8-0 + MOVW (R0), R0 + UNDEF + +#define TBRL 268 +#define TBRU 269 /* Time base Upper/Lower */ + +// int64 runtime·cputicks(void) +TEXT runtime·cputicks(SB),NOSPLIT,$0-8 + MOVW SPR(TBRU), R4 + MOVW SPR(TBRL), R3 + MOVW SPR(TBRU), R5 + CMPW R4, R5 + BNE -4(PC) + SLD $32, R5 + OR R5, R3 + MOVD R3, ret+0(FP) + RETURN + +// AES hashing not implemented for Power +TEXT runtime·aeshash(SB),NOSPLIT,$-8-0 + MOVW (R0), R1 +TEXT runtime·aeshash32(SB),NOSPLIT,$-8-0 + MOVW (R0), R1 +TEXT runtime·aeshash64(SB),NOSPLIT,$-8-0 + MOVW (R0), R1 +TEXT runtime·aeshashstr(SB),NOSPLIT,$-8-0 + MOVW (R0), R1 + +TEXT runtime·memeq(SB),NOSPLIT,$-8-25 + MOVD a+0(FP), R3 + MOVD b+8(FP), R4 + MOVD count+16(FP), R5 + SUB $1, R3 + SUB $1, R4 + ADD R3, R5, R8 +loop: + CMP R3, R8 + BNE 4(PC) + MOVD $1, R3 + MOVB R3, ret+24(FP) + RETURN + MOVBZU 1(R3), R6 + MOVBZU 1(R4), R7 + CMP R6, R7 + BEQ loop + + MOVB R0, ret+24(FP) + RETURN + +// eqstring tests whether two strings are equal. +// See runtime_test.go:eqstring_generic for +// equivalent Go code. 
+TEXT runtime·eqstring(SB),NOSPLIT,$0-33 + MOVD s1len+8(FP), R4 + MOVD s2len+24(FP), R5 + CMP R4, R5 + BNE noteq + + MOVD s1str+0(FP), R3 + MOVD s2str+16(FP), R4 + SUB $1, R3 + SUB $1, R4 + ADD R3, R5, R8 +loop: + CMP R3, R8 + BNE 4(PC) + MOVD $1, R3 + MOVB R3, ret+32(FP) + RETURN + MOVBZU 1(R3), R6 + MOVBZU 1(R4), R7 + CMP R6, R7 + BEQ loop +noteq: + MOVB R0, ret+32(FP) + RETURN + +// TODO: share code with memeq? +TEXT bytes·Equal(SB),NOSPLIT,$0-49 + MOVD a_len+8(FP), R3 + MOVD b_len+32(FP), R4 + + CMP R3, R4 // unequal lengths are not equal + BNE noteq + + MOVD a+0(FP), R5 + MOVD b+24(FP), R6 + SUB $1, R5 + SUB $1, R6 + ADD R5, R3 // end-1 + +loop: + CMP R5, R3 + BEQ equal // reached the end + MOVBZU 1(R5), R4 + MOVBZU 1(R6), R7 + CMP R4, R7 + BEQ loop + +noteq: + MOVBZ R0, ret+48(FP) + RETURN + +equal: + MOVD $1, R3 + MOVBZ R3, ret+48(FP) + RETURN + +TEXT bytes·IndexByte(SB),NOSPLIT,$0-40 + MOVD s+0(FP), R3 + MOVD s_len+8(FP), R4 + MOVBZ c+24(FP), R5 // byte to find + MOVD R3, R6 // store base for later + SUB $1, R3 + ADD R3, R4 // end-1 + +loop: + CMP R3, R4 + BEQ notfound + MOVBZU 1(R3), R7 + CMP R7, R5 + BNE loop + + SUB R6, R3 // remove base + MOVD R3, ret+32(FP) + RETURN + +notfound: + MOVD $-1, R3 + MOVD R3, ret+32(FP) + RETURN + +TEXT strings·IndexByte(SB),NOSPLIT,$0 + MOVD p+0(FP), R3 + MOVD b_len+8(FP), R4 + MOVBZ c+16(FP), R5 // byte to find + MOVD R3, R6 // store base for later + SUB $1, R3 + ADD R3, R4 // end-1 + +loop: + CMP R3, R4 + BEQ notfound + MOVBZU 1(R3), R7 + CMP R7, R5 + BNE loop + + SUB R6, R3 // remove base + MOVD R3, ret+24(FP) + RETURN + +notfound: + MOVD $-1, R3 + MOVD R3, ret+24(FP) + RETURN + + +// A Duff's device for zeroing memory. +// The compiler jumps to computed addresses within +// this routine to zero chunks of memory. Do not +// change this code without also changing the code +// in ../../cmd/9g/ggen.c:/^clearfat. +// R0: always zero +// R3 (aka REGRT1): ptr to memory to be zeroed - 8 +// R3 is updated as a side effect. 
+TEXT runtime·duffzero(SB), NOSPLIT, $-8-0
+ MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3)
+ MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3)
+ MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3)
+ MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3)
+ MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3)
+ MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3)
+ MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3)
+ MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3)
+ MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3)
+ MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3)
+ MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3)
+ MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3)
+ MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3)
+ MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3)
+ MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3)
+ MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3) + MOVDU R0, 8(R3)
+ RETURN
+
+TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
+ MOVD g_m(g), R4
+ MOVWZ m_fastrand(R4), R3
+ ADD R3, R3
+ CMPW R3, $0 // 32-bit signed compare: the state was loaded zero-extended, so a 64-bit CMP would never see it negative
+ BGE 2(PC)
+ XOR $0x88888eef, R3
+ MOVW R3, m_fastrand(R4)
+ MOVW R3, ret+0(FP)
+ RETURN
+ +TEXT runtime·return0(SB), NOSPLIT, $0 + MOVW $0, R3 + RETURN + +// Called from cgo wrappers, this function returns g->m->curg.stack.hi. +// Must obey the gcc calling convention. +TEXT _cgo_topofstack(SB),NOSPLIT,$0 + MOVD R0, 26(R0) diff --git a/src/runtime/atomic.go b/src/runtime/atomic.go index 7e9d9b3aa..a0e4d84e9 100644 --- a/src/runtime/atomic.go +++ b/src/runtime/atomic.go @@ -20,8 +20,16 @@ func xchg(ptr *uint32, new uint32) uint32 //go:noescape func xchg64(ptr *uint64, new uint64) uint64 -//go:noescape -func xchgp(ptr unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer +// Cannot use noescape here: ptr does not but new does escape. +// Instead use noescape(ptr) in wrapper below. +func xchgp1(ptr unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer + +//go:nosplit +func xchgp(ptr unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer { + old := xchgp1(noescape(ptr), new) + writebarrierptr_nostore((*uintptr)(ptr), uintptr(new)) + return old +} //go:noescape func xchguintptr(ptr *uintptr, new uintptr) uintptr @@ -47,5 +55,27 @@ func atomicstore(ptr *uint32, val uint32) //go:noescape func atomicstore64(ptr *uint64, val uint64) -//go:noescape -func atomicstorep(ptr unsafe.Pointer, val unsafe.Pointer) +// Cannot use noescape here: ptr does not but val does escape. +// Instead use noescape(ptr) in wrapper below.
+func atomicstorep1(ptr unsafe.Pointer, val unsafe.Pointer) + +//go:nosplit +func atomicstorep(ptr unsafe.Pointer, val unsafe.Pointer) { + atomicstorep1(noescape(ptr), val) + // TODO(rsc): Why does the compiler think writebarrierptr_nostore's dst argument escapes? + writebarrierptr_nostore((*uintptr)(noescape(ptr)), uintptr(val)) +} + +// Cannot use noescape here: ptr does not but new does escape. +// Instead use noescape(ptr) in wrapper below. +func casp1(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool + +//go:nosplit +func casp(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool { + ok := casp1((*unsafe.Pointer)(noescape(unsafe.Pointer(ptr))), old, new) + if !ok { + return false + } + writebarrierptr_nostore((*uintptr)(unsafe.Pointer(ptr)), uintptr(new)) + return true +} diff --git a/src/runtime/atomic_power64x.s b/src/runtime/atomic_power64x.s new file mode 100644 index 000000000..e72871761 --- /dev/null +++ b/src/runtime/atomic_power64x.s @@ -0,0 +1,40 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build power64 power64le + +#include "textflag.h" + +// uint32 runtime·atomicload(uint32 volatile* addr) +TEXT ·atomicload(SB),NOSPLIT,$-8-12 + MOVD 0(FP), R3 + SYNC + MOVWZ 0(R3), R3 + CMPW R3, R3, CR7 + BC 4, 30, 1(PC) // bne- cr7,0x4 + ISYNC + MOVW R3, ret+8(FP) + RETURN + +// uint64 runtime·atomicload64(uint64 volatile* addr) +TEXT ·atomicload64(SB),NOSPLIT,$-8-16 + MOVD 0(FP), R3 + SYNC + MOVD 0(R3), R3 + CMP R3, R3, CR7 + BC 4, 30, 1(PC) // bne- cr7,0x4 + ISYNC + MOVD R3, ret+8(FP) + RETURN + +// void *runtime·atomicloadp(void *volatile *addr) +TEXT ·atomicloadp(SB),NOSPLIT,$-8-16 + MOVD 0(FP), R3 + SYNC + MOVD 0(R3), R3 + CMP R3, R3, CR7 + BC 4, 30, 1(PC) // bne- cr7,0x4 + ISYNC + MOVD R3, ret+8(FP) + RETURN diff --git a/src/runtime/debug/stubs.s b/src/runtime/debug/stubs.s index d56274f2d..1e883b72c 100644 --- a/src/runtime/debug/stubs.s +++ b/src/runtime/debug/stubs.s @@ -7,6 +7,12 @@ #ifdef GOARCH_arm #define JMP B #endif +#ifdef GOARCH_power64 +#define JMP BR +#endif +#ifdef GOARCH_power64le +#define JMP BR +#endif TEXT ·setMaxStack(SB),NOSPLIT,$0-0 JMP runtime·setMaxStack(SB) diff --git a/src/runtime/defs1_linux.go b/src/runtime/defs1_linux.go index 392cc4ab5..87c6e02a4 100644 --- a/src/runtime/defs1_linux.go +++ b/src/runtime/defs1_linux.go @@ -15,12 +15,14 @@ package runtime /* #include <ucontext.h> #include <fcntl.h> +#include <asm/signal.h> */ import "C" const ( - O_RDONLY = C.O_RDONLY - O_CLOEXEC = C.O_CLOEXEC + O_RDONLY = C.O_RDONLY + O_CLOEXEC = C.O_CLOEXEC + SA_RESTORER = C.SA_RESTORER ) type Usigset C.__sigset_t diff --git a/src/runtime/defs3_linux.go b/src/runtime/defs3_linux.go new file mode 100644 index 000000000..3551a4fa9 --- /dev/null +++ b/src/runtime/defs3_linux.go @@ -0,0 +1,43 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build ignore + +/* +Input to cgo -cdefs + +GOARCH=power64 cgo -cdefs defs_linux.go defs3_linux.go > defs_linux_power64.h +*/ + +package runtime + +/* +#define size_t __kernel_size_t +#define sigset_t __sigset_t // rename the sigset_t here otherwise cgo will complain about "inconsistent definitions for C.sigset_t" +#define _SYS_TYPES_H // avoid inclusion of sys/types.h +#include <asm/ucontext.h> +#include <asm-generic/fcntl.h> +*/ +import "C" + +const ( + O_RDONLY = C.O_RDONLY + O_CLOEXEC = C.O_CLOEXEC + SA_RESTORER = 0 // unused +) + +type Usigset C.__sigset_t + +// types used in sigcontext +type Ptregs C.struct_pt_regs +type Gregset C.elf_gregset_t +type FPregset C.elf_fpregset_t +type Vreg C.elf_vrreg_t + +type SigaltstackT C.struct_sigaltstack + +// PPC64 uses sigcontext in place of mcontext in ucontext. +// see http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/arch/powerpc/include/uapi/asm/ucontext.h +type Sigcontext C.struct_sigcontext +type Ucontext C.struct_ucontext diff --git a/src/runtime/defs_linux.go b/src/runtime/defs_linux.go index 8657dbb0e..553366a50 100644 --- a/src/runtime/defs_linux.go +++ b/src/runtime/defs_linux.go @@ -20,6 +20,7 @@ package runtime // headers for things like ucontext_t, so that happens in // a separate file, defs1.go. 
+#define _SYS_TYPES_H // avoid inclusion of sys/types.h #include <asm/posix_types.h> #define size_t __kernel_size_t #include <asm/signal.h> @@ -28,7 +29,7 @@ package runtime #include <asm-generic/errno.h> #include <asm-generic/poll.h> #include <linux/eventpoll.h> -#undef size_t +#include <linux/time.h> */ import "C" @@ -48,10 +49,9 @@ const ( MADV_DONTNEED = C.MADV_DONTNEED - SA_RESTART = C.SA_RESTART - SA_ONSTACK = C.SA_ONSTACK - SA_RESTORER = C.SA_RESTORER - SA_SIGINFO = C.SA_SIGINFO + SA_RESTART = C.SA_RESTART + SA_ONSTACK = C.SA_ONSTACK + SA_SIGINFO = C.SA_SIGINFO SIGHUP = C.SIGHUP SIGINT = C.SIGINT @@ -116,6 +116,7 @@ const ( EPOLL_CTL_MOD = C.EPOLL_CTL_MOD ) +type Sigset C.sigset_t type Timespec C.struct_timespec type Timeval C.struct_timeval type Sigaction C.struct_sigaction diff --git a/src/runtime/defs_linux_power64.h b/src/runtime/defs_linux_power64.h new file mode 100644 index 000000000..93742fa34 --- /dev/null +++ b/src/runtime/defs_linux_power64.h @@ -0,0 +1,204 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_linux.go defs3_linux.go + + +enum { + EINTR = 0x4, + EAGAIN = 0xb, + ENOMEM = 0xc, + + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x20, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_DONTNEED = 0x4, + + SA_RESTART = 0x10000000, + SA_ONSTACK = 0x8000000, + SA_SIGINFO = 0x4, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGBUS = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGUSR1 = 0xa, + SIGSEGV = 0xb, + SIGUSR2 = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGSTKFLT = 0x10, + SIGCHLD = 0x11, + SIGCONT = 0x12, + SIGSTOP = 0x13, + SIGTSTP = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGURG = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGIO = 0x1d, + SIGPWR = 0x1e, + SIGSYS = 0x1f, + + FPE_INTDIV = 0x1, + FPE_INTOVF = 0x2, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 
0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, + + EPOLLIN = 0x1, + EPOLLOUT = 0x4, + EPOLLERR = 0x8, + EPOLLHUP = 0x10, + EPOLLRDHUP = 0x2000, + EPOLLET = -0x80000000, + EPOLL_CLOEXEC = 0x80000, + EPOLL_CTL_ADD = 0x1, + EPOLL_CTL_DEL = 0x2, + EPOLL_CTL_MOD = 0x3, +}; + +typedef struct Sigset Sigset; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct SigactionT SigactionT; +typedef struct Siginfo Siginfo; +typedef struct Itimerval Itimerval; +typedef struct EpollEvent EpollEvent; + +#pragma pack on + +//struct Sigset { +// uint64 sig[1]; +//}; +//typedef uint64 Sigset; + +struct Timespec { + int64 tv_sec; + int64 tv_nsec; +}; +struct Timeval { + int64 tv_sec; + int64 tv_usec; +}; +struct SigactionT { + void *sa_handler; + uint64 sa_flags; + void *sa_restorer; + uint64 sa_mask; +}; +struct Siginfo { + int32 si_signo; + int32 si_errno; + int32 si_code; + byte Pad_cgo_0[4]; + byte _sifields[112]; +}; +struct Itimerval { + Timeval it_interval; + Timeval it_value; +}; +struct EpollEvent { + uint32 events; + byte Pad_cgo_0[4]; + byte data[8]; // unaligned uintptr +}; + + +#pragma pack off +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_linux.go defs3_linux.go + + +enum { + O_RDONLY = 0x0, + O_CLOEXEC = 0x80000, + SA_RESTORER = 0, +}; + +typedef struct Ptregs Ptregs; +typedef struct Vreg Vreg; +typedef struct SigaltstackT SigaltstackT; +typedef struct Sigcontext Sigcontext; +typedef struct Ucontext Ucontext; + +#pragma pack on + +struct Ptregs { + uint64 gpr[32]; + uint64 nip; + uint64 msr; + uint64 orig_gpr3; + uint64 ctr; + uint64 link; + uint64 xer; + uint64 ccr; + uint64 softe; + uint64 trap; + uint64 dar; + uint64 dsisr; + uint64 result; +}; +typedef uint64 Gregset[48]; +typedef float64 FPregset[33]; +struct Vreg { + uint32 u[4]; +}; 
+ +struct SigaltstackT { + byte *ss_sp; + int32 ss_flags; + byte Pad_cgo_0[4]; + uint64 ss_size; +}; + +struct Sigcontext { + uint64 _unused[4]; + int32 signal; + int32 _pad0; + uint64 handler; + uint64 oldmask; + Ptregs *regs; + uint64 gp_regs[48]; + float64 fp_regs[33]; + Vreg *v_regs; + int64 vmx_reserve[101]; +}; +struct Ucontext { + uint64 uc_flags; + Ucontext *uc_link; + SigaltstackT uc_stack; + uint64 uc_sigmask; + uint64 __unused[15]; + Sigcontext uc_mcontext; +}; + + +#pragma pack off diff --git a/src/runtime/defs_linux_power64le.h b/src/runtime/defs_linux_power64le.h new file mode 100644 index 000000000..93742fa34 --- /dev/null +++ b/src/runtime/defs_linux_power64le.h @@ -0,0 +1,204 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_linux.go defs3_linux.go + + +enum { + EINTR = 0x4, + EAGAIN = 0xb, + ENOMEM = 0xc, + + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x20, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_DONTNEED = 0x4, + + SA_RESTART = 0x10000000, + SA_ONSTACK = 0x8000000, + SA_SIGINFO = 0x4, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGBUS = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGUSR1 = 0xa, + SIGSEGV = 0xb, + SIGUSR2 = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGSTKFLT = 0x10, + SIGCHLD = 0x11, + SIGCONT = 0x12, + SIGSTOP = 0x13, + SIGTSTP = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGURG = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGIO = 0x1d, + SIGPWR = 0x1e, + SIGSYS = 0x1f, + + FPE_INTDIV = 0x1, + FPE_INTOVF = 0x2, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, + + EPOLLIN = 0x1, + EPOLLOUT = 0x4, + EPOLLERR = 
0x8, + EPOLLHUP = 0x10, + EPOLLRDHUP = 0x2000, + EPOLLET = -0x80000000, + EPOLL_CLOEXEC = 0x80000, + EPOLL_CTL_ADD = 0x1, + EPOLL_CTL_DEL = 0x2, + EPOLL_CTL_MOD = 0x3, +}; + +typedef struct Sigset Sigset; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct SigactionT SigactionT; +typedef struct Siginfo Siginfo; +typedef struct Itimerval Itimerval; +typedef struct EpollEvent EpollEvent; + +#pragma pack on + +//struct Sigset { +// uint64 sig[1]; +//}; +//typedef uint64 Sigset; + +struct Timespec { + int64 tv_sec; + int64 tv_nsec; +}; +struct Timeval { + int64 tv_sec; + int64 tv_usec; +}; +struct SigactionT { + void *sa_handler; + uint64 sa_flags; + void *sa_restorer; + uint64 sa_mask; +}; +struct Siginfo { + int32 si_signo; + int32 si_errno; + int32 si_code; + byte Pad_cgo_0[4]; + byte _sifields[112]; +}; +struct Itimerval { + Timeval it_interval; + Timeval it_value; +}; +struct EpollEvent { + uint32 events; + byte Pad_cgo_0[4]; + byte data[8]; // unaligned uintptr +}; + + +#pragma pack off +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_linux.go defs3_linux.go + + +enum { + O_RDONLY = 0x0, + O_CLOEXEC = 0x80000, + SA_RESTORER = 0, +}; + +typedef struct Ptregs Ptregs; +typedef struct Vreg Vreg; +typedef struct SigaltstackT SigaltstackT; +typedef struct Sigcontext Sigcontext; +typedef struct Ucontext Ucontext; + +#pragma pack on + +struct Ptregs { + uint64 gpr[32]; + uint64 nip; + uint64 msr; + uint64 orig_gpr3; + uint64 ctr; + uint64 link; + uint64 xer; + uint64 ccr; + uint64 softe; + uint64 trap; + uint64 dar; + uint64 dsisr; + uint64 result; +}; +typedef uint64 Gregset[48]; +typedef float64 FPregset[33]; +struct Vreg { + uint32 u[4]; +}; + +struct SigaltstackT { + byte *ss_sp; + int32 ss_flags; + byte Pad_cgo_0[4]; + uint64 ss_size; +}; + +struct Sigcontext { + uint64 _unused[4]; + int32 signal; + int32 _pad0; + uint64 handler; + uint64 oldmask; + Ptregs *regs; + uint64 gp_regs[48]; + float64 fp_regs[33]; + Vreg *v_regs; 
+ int64 vmx_reserve[101]; +}; +struct Ucontext { + uint64 uc_flags; + Ucontext *uc_link; + SigaltstackT uc_stack; + uint64 uc_sigmask; + uint64 __unused[15]; + Sigcontext uc_mcontext; +}; + + +#pragma pack off diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index be352557f..65e918e84 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -26,7 +26,7 @@ var Exitsyscall = exitsyscall var LockedOSThread = lockedOSThread type LFNode struct { - Next *LFNode + Next uint64 Pushcnt uintptr } diff --git a/src/runtime/gcinfo_test.go b/src/runtime/gcinfo_test.go index 88f6703f9..1a33f3b3b 100644 --- a/src/runtime/gcinfo_test.go +++ b/src/runtime/gcinfo_test.go @@ -153,6 +153,12 @@ func infoBigStruct() []byte { BitsScalar, BitsScalar, BitsDead, BitsScalar, BitsScalar, // t int; y uint16; u uint64 BitsPointer, BitsDead, // i string } + case "power64", "power64le": + return []byte{ + BitsPointer, BitsScalar, BitsScalar, BitsScalar, + BitsMultiWord, BitsSlice, BitsScalar, BitsScalar, + BitsScalar, BitsScalar, BitsMultiWord, BitsString, + } default: panic("unknown arch") } @@ -188,6 +194,6 @@ var ( infoString = []byte{BitsPointer, BitsDead} infoSlice = []byte{BitsPointer, BitsDead, BitsDead} - infoEface = []byte{BitsMultiWord, BitsEface} - infoIface = []byte{BitsMultiWord, BitsIface} + infoEface = []byte{BitsPointer, BitsPointer} + infoIface = []byte{BitsPointer, BitsPointer} ) diff --git a/src/runtime/heapdump.c b/src/runtime/heapdump.c index eddbc1d1c..da14f2d24 100644 --- a/src/runtime/heapdump.c +++ b/src/runtime/heapdump.c @@ -261,20 +261,7 @@ dumpbv(BitVector *bv, uintptr offset) dumpint(offset + i / BitsPerPointer * PtrSize); break; case BitsMultiWord: - switch(bv->bytedata[(i+BitsPerPointer)/8] >> (i+BitsPerPointer)%8 & 3) { - default: - runtime·throw("unexpected garbage collection bits"); - case BitsIface: - dumpint(FieldKindIface); - dumpint(offset + i / BitsPerPointer * PtrSize); - i += BitsPerPointer; - break; - case BitsEface: 
- dumpint(FieldKindEface); - dumpint(offset + i / BitsPerPointer * PtrSize); - i += BitsPerPointer; - break; - } + runtime·throw("bumpbv unexpected garbage collection bits"); } } } diff --git a/src/runtime/lfstack.c b/src/runtime/lfstack.c index 57e0af282..0ced839c2 100644 --- a/src/runtime/lfstack.c +++ b/src/runtime/lfstack.c @@ -46,7 +46,7 @@ runtime·lfstackpush(uint64 *head, LFNode *node) new = (uint64)(uintptr)node|(((uint64)node->pushcnt&CNT_MASK)<<PTR_BITS); for(;;) { old = runtime·atomicload64(head); - node->next = (LFNode*)(uintptr)(old&PTR_MASK); + node->next = old; if(runtime·cas64(head, old, new)) break; } @@ -55,19 +55,17 @@ runtime·lfstackpush(uint64 *head, LFNode *node) LFNode* runtime·lfstackpop(uint64 *head) { - LFNode *node, *node2; - uint64 old, new; + LFNode *node; + uint64 old, next; for(;;) { old = runtime·atomicload64(head); if(old == 0) return nil; node = (LFNode*)(uintptr)(old&PTR_MASK); - node2 = runtime·atomicloadp(&node->next); - new = 0; - if(node2 != nil) - new = (uint64)(uintptr)node2|(((uint64)node2->pushcnt&CNT_MASK)<<PTR_BITS); - if(runtime·cas64(head, old, new)) + next = runtime·atomicload64(&node->next); + + if(runtime·cas64(head, old, next)) return node; } } diff --git a/src/runtime/lfstack_test.go b/src/runtime/lfstack_test.go index e51877704..68f221d6e 100644 --- a/src/runtime/lfstack_test.go +++ b/src/runtime/lfstack_test.go @@ -121,7 +121,7 @@ func TestLFStackStress(t *testing.T) { } cnt++ sum2 += node.data - node.Next = nil + node.Next = 0 } } if cnt != K { diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 8cf1c3d34..fab8cf269 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -4,9 +4,7 @@ package runtime -import ( - "unsafe" -) +import "unsafe" const ( debugMalloc = false @@ -247,6 +245,8 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { masksize = masksize * pointersPerByte / 8 // 4 bits per word masksize++ // unroll flag in the beginning if masksize > maxGCMask && 
typ.gc[1] != 0 { + // write barriers have not been updated to deal with this case yet. + gothrow("maxGCMask too small for now") // If the mask is too large, unroll the program directly // into the GC bitmap. It's 7 times slower than copying // from the pre-unrolled mask, but saves 1/16 of type size @@ -261,8 +261,10 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { goto marked } ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0]))) - // Check whether the program is already unrolled. - if uintptr(atomicloadp(unsafe.Pointer(ptrmask)))&0xff == 0 { + // Check whether the program is already unrolled + // by checking if the unroll flag byte is set + maskword := uintptr(atomicloadp(unsafe.Pointer(ptrmask))) + if *(*uint8)(unsafe.Pointer(&maskword)) == 0 { mp := acquirem() mp.ptrarg[0] = unsafe.Pointer(typ) onM(unrollgcprog_m) @@ -304,6 +306,18 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { } } marked: + + // GCmarkterminate allocates black + // All slots hold nil so no scanning is needed. + // This may be racing with GC so do it atomically if there can be + // a race marking the bit. + if gcphase == _GCmarktermination { + mp := acquirem() + mp.ptrarg[0] = x + onM(gcmarknewobject_m) + releasem(mp) + } + if raceenabled { racemalloc(x, size) } @@ -344,6 +358,37 @@ marked: return x } +func loadPtrMask(typ *_type) []uint8 { + var ptrmask *uint8 + nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize + if typ.kind&kindGCProg != 0 { + masksize := nptr + if masksize%2 != 0 { + masksize *= 2 // repeated + } + masksize = masksize * pointersPerByte / 8 // 4 bits per word + masksize++ // unroll flag in the beginning + if masksize > maxGCMask && typ.gc[1] != 0 { + // write barriers have not been updated to deal with this case yet. 
+ gothrow("maxGCMask too small for now") + } + ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0]))) + // Check whether the program is already unrolled + // by checking if the unroll flag byte is set + maskword := uintptr(atomicloadp(unsafe.Pointer(ptrmask))) + if *(*uint8)(unsafe.Pointer(&maskword)) == 0 { + mp := acquirem() + mp.ptrarg[0] = unsafe.Pointer(typ) + onM(unrollgcprog_m) + releasem(mp) + } + ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte + } else { + ptrmask = (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask + } + return (*[1 << 30]byte)(unsafe.Pointer(ptrmask))[:(nptr+1)/2] +} + // implementation of new builtin func newobject(typ *_type) unsafe.Pointer { flags := uint32(0) @@ -438,7 +483,20 @@ func gogc(force int32) { mp = acquirem() mp.gcing = 1 releasem(mp) + onM(stoptheworld) + onM(finishsweep_m) // finish sweep before we start concurrent scan. + if false { // To turn on concurrent scan and mark set to true... + onM(starttheworld) + // Do a concurrent heap scan before we stop the world. + onM(gcscan_m) + onM(stoptheworld) + onM(gcinstallmarkwb_m) + onM(starttheworld) + onM(gcmark_m) + onM(stoptheworld) + onM(gcinstalloffwb_m) + } if mp != acquirem() { gothrow("gogc: rescheduled") } @@ -469,6 +527,8 @@ func gogc(force int32) { onM(gc_m) } + onM(gccheckmark_m) + // all done mp.gcing = 0 semrelease(&worldsema) @@ -483,6 +543,14 @@ func gogc(force int32) { } } +func GCcheckmarkenable() { + onM(gccheckmarkenable_m) +} + +func GCcheckmarkdisable() { + onM(gccheckmarkdisable_m) +} + // GC runs a garbage collection. 
func GC() { gogc(2) diff --git a/src/runtime/malloc.h b/src/runtime/malloc.h index adb8d3d67..522b11bba 100644 --- a/src/runtime/malloc.h +++ b/src/runtime/malloc.h @@ -86,6 +86,7 @@ typedef struct MSpan MSpan; typedef struct MStats MStats; typedef struct MLink MLink; typedef struct GCStats GCStats; +typedef struct Workbuf Workbuf; enum { @@ -344,8 +345,6 @@ struct MCache SudoG* sudogcache; - void* gcworkbuf; - // Local allocator stats, flushed during GC. uintptr local_nlookup; // number of pointer lookups uintptr local_largefree; // bytes freed for large objects (>MaxSmallSize) @@ -356,7 +355,7 @@ struct MCache MSpan* runtime·MCache_Refill(MCache *c, int32 sizeclass); void runtime·MCache_ReleaseAll(MCache *c); void runtime·stackcache_clear(MCache *c); -void runtime·gcworkbuffree(void *b); +void runtime·gcworkbuffree(Workbuf *b); enum { diff --git a/src/runtime/mcache.c b/src/runtime/mcache.c index 5fdbe3266..95ddced3e 100644 --- a/src/runtime/mcache.c +++ b/src/runtime/mcache.c @@ -39,12 +39,12 @@ runtime·allocmcache(void) return c; } +// mheap.lock needs to be held to release the gcworkbuf. static void freemcache(MCache *c) { runtime·MCache_ReleaseAll(c); runtime·stackcache_clear(c); - runtime·gcworkbuffree(c->gcworkbuf); runtime·lock(&runtime·mheap.lock); runtime·purgecachedstats(c); runtime·FixAlloc_Free(&runtime·mheap.cachealloc, c); diff --git a/src/runtime/mem_linux.c b/src/runtime/mem_linux.c index bfb405607..52e02b34e 100644 --- a/src/runtime/mem_linux.c +++ b/src/runtime/mem_linux.c @@ -11,7 +11,7 @@ enum { - _PAGE_SIZE = 4096, + _PAGE_SIZE = PhysPageSize, EACCES = 13, }; @@ -36,8 +36,9 @@ addrspace_free(void *v, uintptr n) errval = runtime·mincore((int8*)v + off, chunk, vec); // ENOMEM means unmapped, which is what we want. // Anything else we assume means the pages are mapped. 
- if (errval != -ENOMEM) + if (errval != -ENOMEM && errval != ENOMEM) { return 0; + } } return 1; } @@ -48,12 +49,15 @@ mmap_fixed(byte *v, uintptr n, int32 prot, int32 flags, int32 fd, uint32 offset) void *p; p = runtime·mmap(v, n, prot, flags, fd, offset); - if(p != v && addrspace_free(v, n)) { + if(p != v) { + if(p > (void*)4096) { + runtime·munmap(p, n); + p = nil; + } // On some systems, mmap ignores v without // MAP_FIXED, so retry if the address space is free. - if(p > (void*)4096) - runtime·munmap(p, n); - p = runtime·mmap(v, n, prot, flags|MAP_FIXED, fd, offset); + if(addrspace_free(v, n)) + p = runtime·mmap(v, n, prot, flags|MAP_FIXED, fd, offset); } return p; } diff --git a/src/runtime/memclr_386.s b/src/runtime/memclr_386.s index 1520aea2e..3f20b69c8 100644 --- a/src/runtime/memclr_386.s +++ b/src/runtime/memclr_386.s @@ -15,31 +15,31 @@ TEXT runtime·memclr(SB), NOSPLIT, $0-8 XORL AX, AX // MOVOU seems always faster than REP STOSL. -clr_tail: +tail: TESTL BX, BX - JEQ clr_0 + JEQ _0 CMPL BX, $2 - JBE clr_1or2 + JBE _1or2 CMPL BX, $4 - JBE clr_3or4 + JBE _3or4 CMPL BX, $8 - JBE clr_5through8 + JBE _5through8 CMPL BX, $16 - JBE clr_9through16 + JBE _9through16 TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2 JEQ nosse2 PXOR X0, X0 CMPL BX, $32 - JBE clr_17through32 + JBE _17through32 CMPL BX, $64 - JBE clr_33through64 + JBE _33through64 CMPL BX, $128 - JBE clr_65through128 + JBE _65through128 CMPL BX, $256 - JBE clr_129through256 + JBE _129through256 // TODO: use branch table and BSR to make this just a single dispatch -clr_loop: +loop: MOVOU X0, 0(DI) MOVOU X0, 16(DI) MOVOU X0, 32(DI) @@ -59,40 +59,40 @@ clr_loop: SUBL $256, BX ADDL $256, DI CMPL BX, $256 - JAE clr_loop - JMP clr_tail + JAE loop + JMP tail -clr_1or2: +_1or2: MOVB AX, (DI) MOVB AX, -1(DI)(BX*1) RET -clr_0: +_0: RET -clr_3or4: +_3or4: MOVW AX, (DI) MOVW AX, -2(DI)(BX*1) RET -clr_5through8: +_5through8: MOVL AX, (DI) MOVL AX, -4(DI)(BX*1) RET -clr_9through16: +_9through16: MOVL AX, 
(DI) MOVL AX, 4(DI) MOVL AX, -8(DI)(BX*1) MOVL AX, -4(DI)(BX*1) RET -clr_17through32: +_17through32: MOVOU X0, (DI) MOVOU X0, -16(DI)(BX*1) RET -clr_33through64: +_33through64: MOVOU X0, (DI) MOVOU X0, 16(DI) MOVOU X0, -32(DI)(BX*1) MOVOU X0, -16(DI)(BX*1) RET -clr_65through128: +_65through128: MOVOU X0, (DI) MOVOU X0, 16(DI) MOVOU X0, 32(DI) @@ -102,7 +102,7 @@ clr_65through128: MOVOU X0, -32(DI)(BX*1) MOVOU X0, -16(DI)(BX*1) RET -clr_129through256: +_129through256: MOVOU X0, (DI) MOVOU X0, 16(DI) MOVOU X0, 32(DI) @@ -126,5 +126,5 @@ nosse2: REP STOSL ANDL $3, BX - JNE clr_tail + JNE tail RET diff --git a/src/runtime/memclr_amd64.s b/src/runtime/memclr_amd64.s index 94a2c7f23..ec24f1db2 100644 --- a/src/runtime/memclr_amd64.s +++ b/src/runtime/memclr_amd64.s @@ -15,30 +15,30 @@ TEXT runtime·memclr(SB), NOSPLIT, $0-16 XORQ AX, AX // MOVOU seems always faster than REP STOSQ. -clr_tail: +tail: TESTQ BX, BX - JEQ clr_0 + JEQ _0 CMPQ BX, $2 - JBE clr_1or2 + JBE _1or2 CMPQ BX, $4 - JBE clr_3or4 + JBE _3or4 CMPQ BX, $8 - JBE clr_5through8 + JBE _5through8 CMPQ BX, $16 - JBE clr_9through16 + JBE _9through16 PXOR X0, X0 CMPQ BX, $32 - JBE clr_17through32 + JBE _17through32 CMPQ BX, $64 - JBE clr_33through64 + JBE _33through64 CMPQ BX, $128 - JBE clr_65through128 + JBE _65through128 CMPQ BX, $256 - JBE clr_129through256 + JBE _129through256 // TODO: use branch table and BSR to make this just a single dispatch // TODO: for really big clears, use MOVNTDQ. 
-clr_loop: +loop: MOVOU X0, 0(DI) MOVOU X0, 16(DI) MOVOU X0, 32(DI) @@ -58,38 +58,38 @@ clr_loop: SUBQ $256, BX ADDQ $256, DI CMPQ BX, $256 - JAE clr_loop - JMP clr_tail + JAE loop + JMP tail -clr_1or2: +_1or2: MOVB AX, (DI) MOVB AX, -1(DI)(BX*1) RET -clr_0: +_0: RET -clr_3or4: +_3or4: MOVW AX, (DI) MOVW AX, -2(DI)(BX*1) RET -clr_5through8: +_5through8: MOVL AX, (DI) MOVL AX, -4(DI)(BX*1) RET -clr_9through16: +_9through16: MOVQ AX, (DI) MOVQ AX, -8(DI)(BX*1) RET -clr_17through32: +_17through32: MOVOU X0, (DI) MOVOU X0, -16(DI)(BX*1) RET -clr_33through64: +_33through64: MOVOU X0, (DI) MOVOU X0, 16(DI) MOVOU X0, -32(DI)(BX*1) MOVOU X0, -16(DI)(BX*1) RET -clr_65through128: +_65through128: MOVOU X0, (DI) MOVOU X0, 16(DI) MOVOU X0, 32(DI) @@ -99,7 +99,7 @@ clr_65through128: MOVOU X0, -32(DI)(BX*1) MOVOU X0, -16(DI)(BX*1) RET -clr_129through256: +_129through256: MOVOU X0, (DI) MOVOU X0, 16(DI) MOVOU X0, 32(DI) diff --git a/src/runtime/memclr_plan9_386.s b/src/runtime/memclr_plan9_386.s index b4b671f77..50f327b4e 100644 --- a/src/runtime/memclr_plan9_386.s +++ b/src/runtime/memclr_plan9_386.s @@ -10,40 +10,40 @@ TEXT runtime·memclr(SB), NOSPLIT, $0-8 MOVL n+4(FP), BX XORL AX, AX -clr_tail: +tail: TESTL BX, BX - JEQ clr_0 + JEQ _0 CMPL BX, $2 - JBE clr_1or2 + JBE _1or2 CMPL BX, $4 - JBE clr_3or4 + JBE _3or4 CMPL BX, $8 - JBE clr_5through8 + JBE _5through8 CMPL BX, $16 - JBE clr_9through16 + JBE _9through16 MOVL BX, CX SHRL $2, CX REP STOSL ANDL $3, BX - JNE clr_tail + JNE tail RET -clr_1or2: +_1or2: MOVB AX, (DI) MOVB AX, -1(DI)(BX*1) RET -clr_0: +_0: RET -clr_3or4: +_3or4: MOVW AX, (DI) MOVW AX, -2(DI)(BX*1) RET -clr_5through8: +_5through8: MOVL AX, (DI) MOVL AX, -4(DI)(BX*1) RET -clr_9through16: +_9through16: MOVL AX, (DI) MOVL AX, 4(DI) MOVL AX, -8(DI)(BX*1) diff --git a/src/runtime/memclr_power64x.s b/src/runtime/memclr_power64x.s new file mode 100644 index 000000000..dfad64b6f --- /dev/null +++ b/src/runtime/memclr_power64x.s @@ -0,0 +1,20 @@ +// Copyright 2014 The Go 
Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build power64 power64le + +#include "textflag.h" + +// void runtime·memclr(void*, uintptr) +TEXT runtime·memclr(SB),NOSPLIT,$0-16 + MOVD ptr+0(FP), R3 + MOVD n+8(FP), R4 + CMP R4, $0 + BEQ done + SUB $1, R3 + MOVD R4, CTR + MOVBU R0, 1(R3) + BC 25, 0, -1(PC) // bdnz+ $-4 +done: + RETURN diff --git a/src/runtime/memmove_power64x.s b/src/runtime/memmove_power64x.s new file mode 100644 index 000000000..2b04d8319 --- /dev/null +++ b/src/runtime/memmove_power64x.s @@ -0,0 +1,40 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build power64 power64le + +#include "textflag.h" + +// void runtime·memmove(void*, void*, uintptr) +TEXT runtime·memmove(SB), NOSPLIT, $-8-24 + MOVD to+0(FP), R3 + MOVD from+8(FP), R4 + MOVD n+16(FP), R5 + CMP R5, $0 + BNE check + RETURN + +check: + CMP R3, R4 + BGT backward + + SUB $1, R3 + ADD R3, R5 + SUB $1, R4 +loop: + MOVBU 1(R4), R6 + MOVBU R6, 1(R3) + CMP R3, R5 + BNE loop + RETURN + +backward: + ADD R5, R4 + ADD R3, R5 +loop1: + MOVBU -1(R4), R6 + MOVBU R6, -1(R5) + CMP R3, R5 + BNE loop1 + RETURN diff --git a/src/runtime/mgc0.c b/src/runtime/mgc0.c index 7754bad89..3248b0f49 100644 --- a/src/runtime/mgc0.c +++ b/src/runtime/mgc0.c @@ -4,22 +4,72 @@ // Garbage collector (GC). // -// GC is: -// - mark&sweep -// - mostly precise (with the exception of some C-allocated objects, assembly frames/arguments, etc) -// - parallel (up to MaxGcproc threads) -// - partially concurrent (mark is stop-the-world, while sweep is concurrent) -// - non-moving/non-compacting -// - full (non-partial) +// The GC runs concurrently with mutator threads, is type accurate (aka precise), allows multiple GC +// thread to run in parallel. It is a concurrent mark and sweep that uses a write barrier. 
It is +// non-generational and non-compacting. Allocation is done using size segregated per P allocation +// areas to minimize fragmentation while eliminating locks in the common case. // -// GC rate. -// Next GC is after we've allocated an extra amount of memory proportional to -// the amount already in use. The proportion is controlled by GOGC environment variable -// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M -// (this mark is tracked in next_gc variable). This keeps the GC cost in linear -// proportion to the allocation cost. Adjusting GOGC just changes the linear constant -// (and also the amount of extra memory used). +// The algorithm decomposes into several steps. +// This is a high level description of the algorithm being used. For an overview of GC a good +// place to start is Richard Jones' gchandbook.org. +// +// The algorithm's intellectual heritage includes Dijkstra's on-the-fly algorithm, see +// Edsger W. Dijkstra, Leslie Lamport, A. J. Martin, C. S. Scholten, and E. F. M. Steffens. 1978. +// On-the-fly garbage collection: an exercise in cooperation. Commun. ACM 21, 11 (November 1978), 966-975. +// For journal quality proofs that these steps are complete, correct, and terminate see +// Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world. +// Concurrency and Computation: Practice and Experience 15(3-5), 2003. // +// 0. Set phase = GCscan from GCoff. +// 1. Wait for all P's to acknowledge phase change. +// At this point all goroutines have passed through a GC safepoint and +// know we are in the GCscan phase. +// 2. GC scans all goroutine stacks, mark and enqueues all encountered pointers +// (marking avoids most duplicate enqueuing but races may produce duplication which is benign). +// Preempted goroutines are scanned before P schedules next goroutine. +// 3. Set phase = GCmark. +// 4. Wait for all P's to acknowledge phase change. +// 5. 
Now write barrier marks and enqueues black, grey, or white to white pointers. +// Malloc still allocates white (non-marked) objects. +// 6. Meanwhile GC transitively walks the heap marking reachable objects. +// 7. When GC finishes marking heap, it preempts P's one-by-one and +// retakes partial wbufs (filled by write barrier or during a stack scan of the goroutine +// currently scheduled on the P). +// 8. Once the GC has exhausted all available marking work it sets phase = GCmarktermination. +// 9. Wait for all P's to acknowledge phase change. +// 10. Malloc now allocates black objects, so number of unmarked reachable objects +// monotonically decreases. +// 11. GC preempts P's one-by-one taking partial wbufs and marks all unmarked yet reachable objects. +// 12. When GC completes a full cycle over P's and discovers no new grey +// objects (which means all reachable objects are marked), set phase = GCsweep. +// 13. Wait for all P's to acknowledge phase change. +// 14. Now malloc allocates white (but sweeps spans before use). +// Write barrier becomes nop. +// 15. GC does background sweeping, see description below. +// 16. When sweeping is complete set phase to GCoff. +// 17. When sufficient allocation has taken place replay the sequence starting at 0 above, +// see discussion of GC rate below. + +// Changing phases. +// Phases are changed by setting the gcphase to the next phase and possibly calling ackgcphase. +// All phase action must be benign in the presence of a change. +// Starting with GCoff +// GCoff to GCscan +// GCscan scans stacks and globals greying them and never marks an object black. +// Once all the P's are aware of the new phase they will scan gs on preemption. +// This means that the scanning of preempted gs can't start until all the Ps +// have acknowledged. +// GCscan to GCmark +// GCmark turns on the write barrier which also only greys objects. 
No scanning +// of objects (making them black) can happen until all the Ps have acknowledged +// the phase change. +// GCmark to GCmarktermination +// The only change here is that we start allocating black so the Ps must acknowledge +// the change before we begin the termination algorithm. +// GCmarktermination to GCsweep +// Objects currently on the freelist must be marked black for this to work. +// Are things on the free lists black or white? How does the sweep phase work? + // Concurrent sweep. // The sweep phase proceeds concurrently with normal program execution. // The heap is swept span-by-span both lazily (when a goroutine needs another span) @@ -50,6 +100,14 @@ // The finalizer goroutine is kicked off only when all spans are swept. // When the next GC starts, it sweeps all not-yet-swept spans (if any). +// GC rate. +// Next GC is after we've allocated an extra amount of memory proportional to +// the amount already in use. The proportion is controlled by GOGC environment variable +// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M +// (this mark is tracked in next_gc variable). This keeps the GC cost in linear +// proportion to the allocation cost. Adjusting GOGC just changes the linear constant +// (and also the amount of extra memory used). + #include "runtime.h" #include "arch_GOARCH.h" #include "malloc.h" @@ -64,10 +122,8 @@ enum { Debug = 0, - DebugPtrs = 0, // if 1, print trace of every pointer load during GC ConcurrentSweep = 1, - WorkbufSize = 4*1024, FinBlockSize = 4*1024, RootData = 0, RootBss = 1, @@ -80,7 +136,7 @@ enum { // ptrmask for an allocation containing a single pointer. static byte oneptr[] = {BitsPointer}; -// Initialized from $GOGC. GOGC=off means no gc. +// Initialized from $GOGC. GOGC=off means no GC. extern int32 runtime·gcpercent; // Holding worldsema grants an M the right to try to stop the world. 
@@ -98,12 +154,16 @@ extern int32 runtime·gcpercent; // uint32 runtime·worldsema = 1; -typedef struct Workbuf Workbuf; -struct Workbuf -{ - LFNode node; // must be first - uintptr nobj; - byte* obj[(WorkbufSize-sizeof(LFNode)-sizeof(uintptr))/PtrSize]; +// It is a bug if bits does not have bitBoundary set but +// there are still some cases where this happens related +// to stack spans. +typedef struct Markbits Markbits; +struct Markbits { + byte *bitp; // pointer to the byte holding xbits + byte shift; // bits xbits needs to be shifted to get bits + byte xbits; // byte holding all the bits from *bitp + byte bits; // mark and boundary bits relevant to corresponding slot. + byte tbits; // pointer||scalar bits relevant to corresponding slot. }; extern byte runtime·data[]; @@ -128,26 +188,40 @@ BitVector runtime·gcbssmask; Mutex runtime·gclock; -static uintptr badblock[1024]; -static int32 nbadblock; - +static Workbuf* getpartialorempty(void); +static void putpartial(Workbuf*); static Workbuf* getempty(Workbuf*); static Workbuf* getfull(Workbuf*); static void putempty(Workbuf*); +static void putfull(Workbuf*); static Workbuf* handoff(Workbuf*); static void gchelperstart(void); static void flushallmcaches(void); -static bool scanframe(Stkframe *frame, void *unused); -static void scanstack(G *gp); -static BitVector unrollglobgcprog(byte *prog, uintptr size); +static bool scanframe(Stkframe*, void*); +static void scanstack(G*); +static BitVector unrollglobgcprog(byte*, uintptr); +static void scanblock(byte*, uintptr, byte*); +static byte* objectstart(byte*, Markbits*); +static Workbuf* greyobject(byte*, Markbits*, Workbuf*); +static bool inheap(byte*); +static bool shaded(byte*); +static void shade(byte*); +static void slottombits(byte*, Markbits*); +static void atomicxor8(byte*, byte); +static bool ischeckmarked(Markbits*); +static bool ismarked(Markbits*); +static void clearcheckmarkbits(void); +static void clearcheckmarkbitsspan(MSpan*); void runtime·bgsweep(void); 
+void runtime·finishsweep_m(void); static FuncVal bgsweepv = {runtime·bgsweep}; typedef struct WorkData WorkData; struct WorkData { - uint64 full; // lock-free list of full blocks - uint64 empty; // lock-free list of empty blocks + uint64 full; // lock-free list of full blocks + uint64 empty; // lock-free list of empty blocks + uint64 partial; // lock-free list of partially filled blocks byte pad0[CacheLineSize]; // prevents false-sharing between full/empty and nproc/nwait uint32 nproc; int64 tstart; @@ -162,315 +236,422 @@ struct WorkData { }; WorkData runtime·work; -// Is _cgo_allocate linked into the binary? +// To help debug the concurrent GC we remark with the world +// stopped ensuring that any object encountered has their normal +// mark bit set. To do this we use an orthogonal bit +// pattern to indicate the object is marked. The following pattern +// uses the upper two bits in the object's bounday nibble. +// 01: scalar not marked +// 10: pointer not marked +// 11: pointer marked +// 00: scalar marked +// Xoring with 01 will flip the pattern from marked to unmarked and vica versa. +// The higher bit is 1 for pointers and 0 for scalars, whether the object +// is marked or not. +// The first nibble no longer holds the bitsDead pattern indicating that the +// there are no more pointers in the object. This information is held +// in the second nibble. + +// When marking an object if the bool checkmark is true one uses the above +// encoding, otherwise one uses the bitMarked bit in the lower two bits +// of the nibble. +static bool checkmark = false; +static bool gccheckmarkenable = true; + +// Is address b in the known heap. If it doesn't have a valid gcmap +// returns false. For example pointers into stacks will return false. 
static bool -have_cgo_allocate(void) +inheap(byte *b) { - extern byte go·weak·runtime·_cgo_allocate_internal[1]; - return go·weak·runtime·_cgo_allocate_internal != nil; + MSpan *s; + pageID k; + uintptr x; + + if(b == nil || b < runtime·mheap.arena_start || b >= runtime·mheap.arena_used) + return false; + // Not a beginning of a block, consult span table to find the block beginning. + k = (uintptr)b>>PageShift; + x = k; + x -= (uintptr)runtime·mheap.arena_start>>PageShift; + s = runtime·mheap.spans[x]; + if(s == nil || k < s->start || b >= s->limit || s->state != MSpanInUse) + return false; + return true; } -// scanblock scans a block of n bytes starting at pointer b for references -// to other objects, scanning any it finds recursively until there are no -// unscanned objects left. Instead of using an explicit recursion, it keeps -// a work list in the Workbuf* structures and loops in the main function -// body. Keeping an explicit work list is easier on the stack allocator and -// more efficient. +// Given an address in the heap return the relevant byte from the gcmap. This routine +// can be used on addresses to the start of an object or to the interior of the an object. static void -scanblock(byte *b, uintptr n, byte *ptrmask) +slottombits(byte *obj, Markbits *mbits) { - byte *obj, *obj0, *p, *arena_start, *arena_used, **wp, *scanbuf[8], *ptrbitp, *bitp; - uintptr i, j, nobj, size, idx, x, off, scanbufpos, bits, xbits, shift; - Workbuf *wbuf; - Iface *iface; - Eface *eface; - Type *typ; + uintptr off; + + off = (uintptr*)((uintptr)obj&~(PtrSize-1)) - (uintptr*)runtime·mheap.arena_start; + mbits->bitp = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1; + mbits->shift = (off % wordsPerBitmapByte) * gcBits; + mbits->xbits = *mbits->bitp; + mbits->bits = (mbits->xbits >> mbits->shift) & bitMask; + mbits->tbits = ((mbits->xbits >> mbits->shift) & bitPtrMask) >> 2; +} + +// b is a pointer into the heap. +// Find the start of the object refered to by b. 
+// Set mbits to the associated bits from the bit map. +// If b is not a valid heap object return nil and +// undefined values in mbits. +static byte* +objectstart(byte *b, Markbits *mbits) +{ + byte *obj, *p; MSpan *s; pageID k; - bool keepworking; + uintptr x, size, idx; - // Cache memory arena parameters in local vars. - arena_start = runtime·mheap.arena_start; - arena_used = runtime·mheap.arena_used; + obj = (byte*)((uintptr)b&~(PtrSize-1)); + for(;;) { + slottombits(obj, mbits); + if((mbits->bits&bitBoundary) == bitBoundary) + break; - wbuf = getempty(nil); - nobj = wbuf->nobj; - wp = &wbuf->obj[nobj]; - keepworking = b == nil; - scanbufpos = 0; - for(i = 0; i < nelem(scanbuf); i++) - scanbuf[i] = nil; + // Not a beginning of a block, consult span table to find the block beginning. + k = (uintptr)obj>>PageShift; + x = k; + x -= (uintptr)runtime·mheap.arena_start>>PageShift; + s = runtime·mheap.spans[x]; + if(s == nil || k < s->start || obj >= s->limit || s->state != MSpanInUse){ + if(s != nil && s->state == MSpanStack) { + return nil; // This is legit. + } + // The following ensures that we are rigorous about what data + // structures hold valid pointers + if(0) { + // Still happens sometimes. We don't know why. 
+ runtime·printf("runtime:objectstart Span weird: obj=%p, k=%p", obj, k); + if (s == nil) + runtime·printf(" s=nil\n"); + else + runtime·printf(" s->start=%p s->limit=%p, s->state=%d\n", s->start*PageSize, s->limit, s->state); + runtime·throw("objectstart: bad pointer in unexpected span"); + } + return nil; + } + p = (byte*)((uintptr)s->start<<PageShift); + if(s->sizeclass != 0) { + size = s->elemsize; + idx = ((byte*)obj - p)/size; + p = p+idx*size; + } + if(p == obj) { + runtime·printf("runtime: failed to find block beginning for %p s=%p s->limit=%p\n", + p, s->start*PageSize, s->limit); + runtime·throw("failed to find block beginning"); + } + obj = p; + } + // if size(obj.firstfield) < PtrSize, the &obj.secondfield could map to the boundary bit + // Clear any low bits to get to the start of the object. + // greyobject depends on this. + return obj; +} + +// Slow for now as we serialize this, since this is on a debug path +// speed is not critical at this point. +static Mutex andlock; +static void +atomicand8(byte *src, byte val) +{ + runtime·lock(&andlock); + *src = *src&val; + runtime·unlock(&andlock); +} + +// Mark using the checkmark scheme. +void +docheckmark(Markbits *mbits) +{ + // xor 01 moves 01(scalar unmarked) to 00(scalar marked) + // and 10(pointer unmarked) to 11(pointer marked) + if(mbits->tbits == BitsScalar) + atomicand8(mbits->bitp, ~(byte)(BitsCheckMarkXor<<mbits->shift<<2)); + else if(mbits->tbits == BitsPointer) + runtime·atomicor8(mbits->bitp, BitsCheckMarkXor<<mbits->shift<<2); + + // reload bits for ischeckmarked + mbits->xbits = *mbits->bitp; + mbits->bits = (mbits->xbits >> mbits->shift) & bitMask; + mbits->tbits = ((mbits->xbits >> mbits->shift) & bitPtrMask) >> 2; + + return; +} + +// In the default scheme does mbits refer to a marked object. 
+static bool +ismarked(Markbits *mbits) +{ + if((mbits->bits&bitBoundary) != bitBoundary) + runtime·throw("ismarked: bits should have boundary bit set"); + return (mbits->bits&bitMarked) == bitMarked; +} + +// In the checkmark scheme does mbits refer to a marked object. +static bool +ischeckmarked(Markbits *mbits) +{ + if((mbits->bits&bitBoundary) != bitBoundary) + runtime·printf("runtime:ischeckmarked: bits should have boundary bit set\n"); + return mbits->tbits==BitsScalarMarked || mbits->tbits==BitsPointerMarked; +} + +// When in GCmarkterminate phase we allocate black. +void +runtime·gcmarknewobject_m(void) +{ + Markbits mbits; + byte *obj; + + if(runtime·gcphase != GCmarktermination) + runtime·throw("marking new object while not in mark termination phase"); + if(checkmark) // The world should be stopped so this should not happen. + runtime·throw("gcmarknewobject called while doing checkmark"); + + obj = g->m->ptrarg[0]; + slottombits((byte*)((uintptr)obj & (PtrSize-1)), &mbits); + + if((mbits.bits&bitMarked) != 0) + return; + + // Each byte of GC bitmap holds info for two words. + // If the current object is larger than two words, or if the object is one word + // but the object it shares the byte with is already marked, + // then all the possible concurrent updates are trying to set the same bit, + // so we can use a non-atomic update. + if((mbits.xbits&(bitMask|(bitMask<<gcBits))) != (bitBoundary|(bitBoundary<<gcBits)) || runtime·work.nproc == 1) + *mbits.bitp = mbits.xbits | (bitMarked<<mbits.shift); + else + runtime·atomicor8(mbits.bitp, bitMarked<<mbits.shift); + return; +} + +// obj is the start of an object with mark mbits. +// If it isn't already marked, mark it and enqueue into workbuf. +// Return possibly new workbuf to use. +static Workbuf* +greyobject(byte *obj, Markbits *mbits, Workbuf *wbuf) +{ + // obj should be start of allocation, and so must be at least pointer-aligned. 
+ if(((uintptr)obj & (PtrSize-1)) != 0) + runtime·throw("greyobject: obj not pointer-aligned"); + + if(checkmark) { + if(!ismarked(mbits)) { + MSpan *s; + pageID k; + uintptr x, i; + + runtime·printf("runtime:greyobject: checkmarks finds unexpected unmarked object obj=%p, mbits->bits=%x, *mbits->bitp=%x\n", obj, mbits->bits, *mbits->bitp); + + k = (uintptr)obj>>PageShift; + x = k; + x -= (uintptr)runtime·mheap.arena_start>>PageShift; + s = runtime·mheap.spans[x]; + runtime·printf("runtime:greyobject Span: obj=%p, k=%p", obj, k); + if (s == nil) { + runtime·printf(" s=nil\n"); + } else { + runtime·printf(" s->start=%p s->limit=%p, s->state=%d, s->sizeclass=%d, s->elemsize=%D \n", s->start*PageSize, s->limit, s->state, s->sizeclass, s->elemsize); + for(i=0; i<s->sizeclass; i++) { + runtime·printf(" ((uintptr*)obj)[%D]=%p\n", i, ((uintptr*)obj)[i]); + } + } + runtime·throw("checkmark found unmarked object"); + } + if(ischeckmarked(mbits)) + return wbuf; + docheckmark(mbits); + if(!ischeckmarked(mbits)) { + runtime·printf("mbits xbits=%x bits=%x tbits=%x shift=%d\n", mbits->xbits, mbits->bits, mbits->tbits, mbits->shift); + runtime·throw("docheckmark and ischeckmarked disagree"); + } + } else { + // If marked we have nothing to do. + if((mbits->bits&bitMarked) != 0) + return wbuf; + + // Each byte of GC bitmap holds info for two words. + // If the current object is larger than two words, or if the object is one word + // but the object it shares the byte with is already marked, + // then all the possible concurrent updates are trying to set the same bit, + // so we can use a non-atomic update. 
+ if((mbits->xbits&(bitMask|(bitMask<<gcBits))) != (bitBoundary|(bitBoundary<<gcBits)) || runtime·work.nproc == 1) + *mbits->bitp = mbits->xbits | (bitMarked<<mbits->shift); + else + runtime·atomicor8(mbits->bitp, bitMarked<<mbits->shift); + } + + if (!checkmark && (((mbits->xbits>>(mbits->shift+2))&BitsMask) == BitsDead)) + return wbuf; // noscan object + + // Queue the obj for scanning. The PREFETCH(obj) logic has been removed but + // seems like a nice optimization that can be added back in. + // There needs to be time between the PREFETCH and the use. + // Previously we put the obj in an 8 element buffer that is drained at a rate + // to give the PREFETCH time to do its work. + // Use of PREFETCHNTA might be more appropriate than PREFETCH + + // If workbuf is full, obtain an empty one. + if(wbuf->nobj >= nelem(wbuf->obj)) { + wbuf = getempty(wbuf); + } + + wbuf->obj[wbuf->nobj] = obj; + wbuf->nobj++; + return wbuf; +} + +// Scan the object b of size n, adding pointers to wbuf. +// Return possibly new wbuf to use. +// If ptrmask != nil, it specifies where pointers are in b. +// If ptrmask == nil, the GC bitmap should be consulted. +// In this case, n may be an overestimate of the size; the GC bitmap +// must also be used to make sure the scan stops at the end of b. +static Workbuf* +scanobject(byte *b, uintptr n, byte *ptrmask, Workbuf *wbuf) +{ + byte *obj, *arena_start, *arena_used, *ptrbitp; + uintptr i, j; + int32 bits; + Markbits mbits; + + arena_start = (byte*)runtime·mheap.arena_start; + arena_used = runtime·mheap.arena_used; ptrbitp = nil; + // Find bits of the beginning of the object. + if(ptrmask == nil) { + b = objectstart(b, &mbits); + if(b == nil) + return wbuf; + ptrbitp = mbits.bitp; //arena_start - off/wordsPerBitmapByte - 1; + } + for(i = 0; i < n; i += PtrSize) { + // Find bits for this word. 
+ if(ptrmask != nil) { + // dense mask (stack or data) + bits = (ptrmask[(i/PtrSize)/4]>>(((i/PtrSize)%4)*BitsPerPointer))&BitsMask; + } else { + // Check if we have reached end of span. + // n is an overestimate of the size of the object. + if((((uintptr)b+i)%PageSize) == 0 && + runtime·mheap.spans[(b-arena_start)>>PageShift] != runtime·mheap.spans[(b+i-arena_start)>>PageShift]) + break; + // Consult GC bitmap. + bits = *ptrbitp; + if(wordsPerBitmapByte != 2) + runtime·throw("alg doesn't work for wordsPerBitmapByte != 2"); + j = ((uintptr)b+i)/PtrSize & 1; // j indicates upper nibble or lower nibble + bits >>= gcBits*j; + if(i == 0) + bits &= ~bitBoundary; + ptrbitp -= j; + + if((bits&bitBoundary) != 0 && i != 0) + break; // reached beginning of the next object + bits = (bits&bitPtrMask)>>2; // bits refer to the type bits. + + if(i != 0 && bits == BitsDead) // BitsDead in first nibble not valid during checkmark + break; // reached no-scan part of the object + } + + if(bits <= BitsScalar) // Bits Scalar || + // BitsDead || // default encoding + // BitsScalarMarked // checkmark encoding + continue; + + if((bits&BitsPointer) != BitsPointer) { + runtime·printf("gc checkmark=%d, b=%p ptrmask=%p, mbits.bitp=%p, mbits.xbits=%x, bits=%x\n", checkmark, b, ptrmask, mbits.bitp, mbits.xbits, bits); + runtime·throw("unexpected garbage collection bits"); + } + + obj = *(byte**)(b+i); + // At this point we have extracted the next potential pointer. + // Check if it points into heap. + if(obj == nil || obj < arena_start || obj >= arena_used) + continue; + // Mark the object. return some important bits. + // We we combine the following two rotines we don't have to pass mbits or obj around. + obj = objectstart(obj, &mbits); + // In the case of the span being MSpan_Stack mbits is useless and will not have + // the boundary bit set. It does not need to be greyed since it will be + // scanned using the scan stack mechanism. 
+ if(obj == nil) + continue; + wbuf = greyobject(obj, &mbits, wbuf); + } + return wbuf; +} + +// scanblock starts by scanning b as scanobject would. +// If the gcphase is GCscan, that's all scanblock does. +// Otherwise it traverses some fraction of the pointers it found in b, recursively. +// As a special case, scanblock(nil, 0, nil) means to scan previously queued work, +// stopping only when no work is left in the system. +static void +scanblock(byte *b, uintptr n, byte *ptrmask) +{ + Workbuf *wbuf; + bool keepworking; + + wbuf = getpartialorempty(); + if(b != nil) { + wbuf = scanobject(b, n, ptrmask, wbuf); + if(runtime·gcphase == GCscan) { + if(inheap(b) && !ptrmask) + // b is in heap, we are in GCscan so there should be a ptrmask. + runtime·throw("scanblock: In GCscan phase and inheap is true."); + // GCscan only goes one level deep since mark wb not turned on. + putpartial(wbuf); + return; + } + } + if(runtime·gcphase == GCscan) { + runtime·throw("scanblock: In GCscan phase but no b passed in."); + } + + keepworking = b == nil; + // ptrmask can have 2 possible values: // 1. nil - obtain pointer mask from GC bitmap. // 2. pointer to a compact mask (for stacks and data). - if(b != nil) - goto scanobj; for(;;) { - if(nobj == 0) { - // Out of work in workbuf. - // First, see is there is any work in scanbuf. - for(i = 0; i < nelem(scanbuf); i++) { - b = scanbuf[scanbufpos]; - scanbuf[scanbufpos++] = nil; - scanbufpos %= nelem(scanbuf); - if(b != nil) { - n = arena_used - b; // scan until bitBoundary or BitsDead - ptrmask = nil; // use GC bitmap for pointer info - goto scanobj; - } - } + if(wbuf->nobj == 0) { if(!keepworking) { putempty(wbuf); return; } // Refill workbuf from global queue. 
wbuf = getfull(wbuf); - if(wbuf == nil) + if(wbuf == nil) // nil means out of work barrier reached return; - nobj = wbuf->nobj; - wp = &wbuf->obj[nobj]; + + if(wbuf->nobj<=0) { + runtime·throw("runtime:scanblock getfull returns empty buffer"); + } + } // If another proc wants a pointer, give it some. - if(runtime·work.nwait > 0 && nobj > 4 && runtime·work.full == 0) { - wbuf->nobj = nobj; + if(runtime·work.nwait > 0 && wbuf->nobj > 4 && runtime·work.full == 0) { wbuf = handoff(wbuf); - nobj = wbuf->nobj; - wp = &wbuf->obj[nobj]; } - wp--; - nobj--; - b = *wp; - n = arena_used - b; // scan until next bitBoundary or BitsDead - ptrmask = nil; // use GC bitmap for pointer info - - scanobj: - if(DebugPtrs) - runtime·printf("scanblock %p +%p %p\n", b, n, ptrmask); - // Find bits of the beginning of the object. - if(ptrmask == nil) { - off = (uintptr*)b - (uintptr*)arena_start; - ptrbitp = arena_start - off/wordsPerBitmapByte - 1; - } - for(i = 0; i < n; i += PtrSize) { - obj = nil; - // Find bits for this word. - if(ptrmask == nil) { - // Check is we have reached end of span. - if((((uintptr)b+i)%PageSize) == 0 && - runtime·mheap.spans[(b-arena_start)>>PageShift] != runtime·mheap.spans[(b+i-arena_start)>>PageShift]) - break; - // Consult GC bitmap. - bits = *ptrbitp; - - if(wordsPerBitmapByte != 2) - runtime·throw("alg doesn't work for wordsPerBitmapByte != 2"); - j = ((uintptr)b+i)/PtrSize & 1; - ptrbitp -= j; - bits >>= gcBits*j; - - if((bits&bitBoundary) != 0 && i != 0) - break; // reached beginning of the next object - bits = (bits>>2)&BitsMask; - if(bits == BitsDead) - break; // reached no-scan part of the object - } else // dense mask (stack or data) - bits = (ptrmask[(i/PtrSize)/4]>>(((i/PtrSize)%4)*BitsPerPointer))&BitsMask; - - if(bits <= BitsScalar) // BitsScalar || BitsDead - continue; - if(bits == BitsPointer) { - obj = *(byte**)(b+i); - obj0 = obj; - goto markobj; - } - - // With those three out of the way, must be multi-word. 
- if(Debug && bits != BitsMultiWord) - runtime·throw("unexpected garbage collection bits"); - // Find the next pair of bits. - if(ptrmask == nil) { - bits = *ptrbitp; - j = ((uintptr)b+i+PtrSize)/PtrSize & 1; - ptrbitp -= j; - bits >>= gcBits*j; - bits = (bits>>2)&BitsMask; - } else - bits = (ptrmask[((i+PtrSize)/PtrSize)/4]>>((((i+PtrSize)/PtrSize)%4)*BitsPerPointer))&BitsMask; - - if(Debug && bits != BitsIface && bits != BitsEface) - runtime·throw("unexpected garbage collection bits"); - - if(bits == BitsIface) { - iface = (Iface*)(b+i); - if(iface->tab != nil) { - typ = iface->tab->type; - if(!(typ->kind&KindDirectIface) || !(typ->kind&KindNoPointers)) - obj = iface->data; - } - } else { - eface = (Eface*)(b+i); - typ = eface->type; - if(typ != nil) { - if(!(typ->kind&KindDirectIface) || !(typ->kind&KindNoPointers)) - obj = eface->data; - } - } - - i += PtrSize; - - obj0 = obj; - markobj: - // At this point we have extracted the next potential pointer. - // Check if it points into heap. - if(obj == nil) - continue; - if(obj < arena_start || obj >= arena_used) { - if((uintptr)obj < PhysPageSize && runtime·invalidptr) { - s = nil; - goto badobj; - } - continue; - } - // Mark the object. - obj = (byte*)((uintptr)obj & ~(PtrSize-1)); - off = (uintptr*)obj - (uintptr*)arena_start; - bitp = arena_start - off/wordsPerBitmapByte - 1; - shift = (off % wordsPerBitmapByte) * gcBits; - xbits = *bitp; - bits = (xbits >> shift) & bitMask; - if((bits&bitBoundary) == 0) { - // Not a beginning of a block, consult span table to find the block beginning. - k = (uintptr)obj>>PageShift; - x = k; - x -= (uintptr)arena_start>>PageShift; - s = runtime·mheap.spans[x]; - if(s == nil || k < s->start || obj >= s->limit || s->state != MSpanInUse) { - // Stack pointers lie within the arena bounds but are not part of the GC heap. - // Ignore them. 
- if(s != nil && s->state == MSpanStack) - continue; - - badobj: - // If cgo_allocate is linked into the binary, it can allocate - // memory as []unsafe.Pointer that may not contain actual - // pointers and must be scanned conservatively. - // In this case alone, allow the bad pointer. - if(have_cgo_allocate() && ptrmask == nil) - continue; - - // Anything else indicates a bug somewhere. - // If we're in the middle of chasing down a different bad pointer, - // don't confuse the trace by printing about this one. - if(nbadblock > 0) - continue; - - runtime·printf("runtime: garbage collector found invalid heap pointer *(%p+%p)=%p", b, i, obj); - if(s == nil) - runtime·printf(" s=nil\n"); - else - runtime·printf(" span=%p-%p-%p state=%d\n", (uintptr)s->start<<PageShift, s->limit, (uintptr)(s->start+s->npages)<<PageShift, s->state); - if(ptrmask != nil) - runtime·throw("invalid heap pointer"); - // Add to badblock list, which will cause the garbage collection - // to keep repeating until it has traced the chain of pointers - // leading to obj all the way back to a root. - if(nbadblock == 0) - badblock[nbadblock++] = (uintptr)b; - continue; - } - p = (byte*)((uintptr)s->start<<PageShift); - if(s->sizeclass != 0) { - size = s->elemsize; - idx = ((byte*)obj - p)/size; - p = p+idx*size; - } - if(p == obj) { - runtime·printf("runtime: failed to find block beginning for %p s=%p s->limit=%p\n", - p, s->start*PageSize, s->limit); - runtime·throw("failed to find block beginning"); - } - obj = p; - goto markobj; - } - if(DebugPtrs) - runtime·printf("scan *%p = %p => base %p\n", b+i, obj0, obj); - - if(nbadblock > 0 && (uintptr)obj == badblock[nbadblock-1]) { - // Running garbage collection again because - // we want to find the path from a root to a bad pointer. - // Found possible next step; extend or finish path. 
- for(j=0; j<nbadblock; j++) - if(badblock[j] == (uintptr)b) - goto AlreadyBad; - runtime·printf("runtime: found *(%p+%p) = %p+%p\n", b, i, obj0, (uintptr)(obj-obj0)); - if(ptrmask != nil) - runtime·throw("bad pointer"); - if(nbadblock >= nelem(badblock)) - runtime·throw("badblock trace too long"); - badblock[nbadblock++] = (uintptr)b; - AlreadyBad:; - } - - // Now we have bits, bitp, and shift correct for - // obj pointing at the base of the object. - // Only care about not marked objects. - if((bits&bitMarked) != 0) - continue; - // If obj size is greater than 8, then each byte of GC bitmap - // contains info for at most one object. In such case we use - // non-atomic byte store to mark the object. This can lead - // to double enqueue of the object for scanning, but scanning - // is an idempotent operation, so it is OK. This cannot lead - // to bitmap corruption because the single marked bit is the - // only thing that can change in the byte. - // For 8-byte objects we use non-atomic store, if the other - // quadruple is already marked. Otherwise we resort to CAS - // loop for marking. - if((xbits&(bitMask|(bitMask<<gcBits))) != (bitBoundary|(bitBoundary<<gcBits)) || - runtime·work.nproc == 1) - *bitp = xbits | (bitMarked<<shift); - else - runtime·atomicor8(bitp, bitMarked<<shift); - - if(((xbits>>(shift+2))&BitsMask) == BitsDead) - continue; // noscan object - - // Queue the obj for scanning. - PREFETCH(obj); - p = scanbuf[scanbufpos]; - scanbuf[scanbufpos++] = obj; - scanbufpos %= nelem(scanbuf); - if(p == nil) - continue; - - // If workbuf is full, obtain an empty one. - if(nobj >= nelem(wbuf->obj)) { - wbuf->nobj = nobj; - wbuf = getempty(wbuf); - nobj = wbuf->nobj; - wp = &wbuf->obj[nobj]; - } - *wp = p; - wp++; - nobj++; - } - if(DebugPtrs) - runtime·printf("end scanblock %p +%p %p\n", b, n, ptrmask); - - if(Debug && ptrmask == nil) { - // For heap objects ensure that we did not overscan. 
- n = 0; - p = nil; - if(!runtime·mlookup(b, &p, &n, nil) || b != p || i > n) { - runtime·printf("runtime: scanned (%p,%p), heap object (%p,%p)\n", b, i, p, n); - runtime·throw("scanblock: scanned invalid object"); - } - } + // This might be a good place to add prefetch code... + // if(wbuf->nobj > 4) { + // PREFETCH(wbuf->obj[wbuf->nobj - 3]; + // } + --wbuf->nobj; + b = wbuf->obj[wbuf->nobj]; + wbuf = scanobject(b, runtime·mheap.arena_used - b, nil, wbuf); } } @@ -484,7 +665,7 @@ markroot(ParFor *desc, uint32 i) void *p; uint32 status; bool restart; - + USED(&desc); // Note: if you add a case here, please also update heapdump.c:dumproots. switch(i) { @@ -511,7 +692,8 @@ markroot(ParFor *desc, uint32 i) s = runtime·work.spans[spanidx]; if(s->state != MSpanInUse) continue; - if(s->sweepgen != sg) { + if(!checkmark && s->sweepgen != sg) { + // sweepgen was updated (+2) during non-checkmark GC pass runtime·printf("sweep %d %d\n", s->sweepgen, sg); runtime·throw("gc: unswept span"); } @@ -523,14 +705,16 @@ markroot(ParFor *desc, uint32 i) spf = (SpecialFinalizer*)sp; // A finalizer can be set for an inner byte of an object, find object beginning. p = (void*)((s->start << PageShift) + spf->special.offset/s->elemsize*s->elemsize); - scanblock(p, s->elemsize, nil); + if(runtime·gcphase != GCscan) + scanblock(p, s->elemsize, nil); // Scanned during mark phase scanblock((void*)&spf->fn, PtrSize, oneptr); } } break; case RootFlushCaches: - flushallmcaches(); + if (runtime·gcphase != GCscan) // Do not flush mcaches during GCscan phase. 
+ flushallmcaches(); break; default: @@ -540,17 +724,37 @@ markroot(ParFor *desc, uint32 i) gp = runtime·allg[i - RootCount]; // remember when we've first observed the G blocked // needed only to output in traceback - status = runtime·readgstatus(gp); + status = runtime·readgstatus(gp); // We are not in a scan state if((status == Gwaiting || status == Gsyscall) && gp->waitsince == 0) gp->waitsince = runtime·work.tstart; - // Shrink a stack if not much of it is being used. - runtime·shrinkstack(gp); - if(runtime·readgstatus(gp) == Gdead) + // Shrink a stack if not much of it is being used but not in the scan phase. + if (runtime·gcphase != GCscan) // Do not shrink during GCscan phase. + runtime·shrinkstack(gp); + if(runtime·readgstatus(gp) == Gdead) gp->gcworkdone = true; else gp->gcworkdone = false; restart = runtime·stopg(gp); - scanstack(gp); + + // goroutine will scan its own stack when it stops running. + // Wait until it has. + while(runtime·readgstatus(gp) == Grunning && !gp->gcworkdone) { + } + + // scanstack(gp) is done as part of gcphasework + // But to make sure we finished we need to make sure that + // the stack traps have all responded so drop into + // this while loop until they respond. + while(!gp->gcworkdone){ + status = runtime·readgstatus(gp); + if(status == Gdead) { + gp->gcworkdone = true; // scan is a noop + break; + //do nothing, scan not needed. 
+ } + if(status == Gwaiting || status == Grunnable) + restart = runtime·stopg(gp); + } if(restart) runtime·restartg(gp); break; @@ -562,53 +766,95 @@ markroot(ParFor *desc, uint32 i) static Workbuf* getempty(Workbuf *b) { - MCache *c; - - if(b != nil) - runtime·lfstackpush(&runtime·work.full, &b->node); - b = nil; - c = g->m->mcache; - if(c->gcworkbuf != nil) { - b = c->gcworkbuf; - c->gcworkbuf = nil; + if(b != nil) { + putfull(b); + b = nil; } - if(b == nil) + if(runtime·work.empty) b = (Workbuf*)runtime·lfstackpop(&runtime·work.empty); - if(b == nil) + + if(b && b->nobj != 0) { + runtime·printf("m%d: getempty: popped b=%p with non-zero b->nobj=%d\n", g->m->id, b, (uint32)b->nobj); + runtime·throw("getempty: workbuffer not empty, b->nobj not 0"); + } + if(b == nil) { b = runtime·persistentalloc(sizeof(*b), CacheLineSize, &mstats.gc_sys); - b->nobj = 0; + b->nobj = 0; + } return b; } static void putempty(Workbuf *b) { - MCache *c; - - c = g->m->mcache; - if(c->gcworkbuf == nil) { - c->gcworkbuf = b; - return; + if(b->nobj != 0) { + runtime·throw("putempty: b->nobj not 0\n"); } runtime·lfstackpush(&runtime·work.empty, &b->node); } -void -runtime·gcworkbuffree(void *b) +// Put a full or partially full workbuf on the full list. +static void +putfull(Workbuf *b) { - if(b != nil) - putempty(b); + if(b->nobj <= 0) { + runtime·throw("putfull: b->nobj <= 0\n"); + } + runtime·lfstackpush(&runtime·work.full, &b->node); } -// Get a full work buffer off the work.full list, or return nil. +// Get an partially empty work buffer +// if none are available get an empty one. 
+static Workbuf* +getpartialorempty(void) +{ + Workbuf *b; + + b = (Workbuf*)runtime·lfstackpop(&runtime·work.partial); + if(b == nil) + b = getempty(nil); + return b; +} + +static void +putpartial(Workbuf *b) +{ + + if(b->nobj == 0) + runtime·lfstackpush(&runtime·work.empty, &b->node); + else if (b->nobj < nelem(b->obj)) + runtime·lfstackpush(&runtime·work.partial, &b->node); + else if (b->nobj == nelem(b->obj)) + runtime·lfstackpush(&runtime·work.full, &b->node); + else { + runtime·printf("b=%p, b->nobj=%d, nelem(b->obj)=%d\n", b, (uint32)b->nobj, (uint32)nelem(b->obj)); + runtime·throw("putpartial: bad Workbuf b->nobj"); + } +} + +// Get a full work buffer off the work.full or a partially +// filled one off the work.partial list. If nothing is available +// wait until all the other gc helpers have finished and then +// return nil. +// getfull acts as a barrier for work.nproc helpers. As long as one +// gchelper is actively marking objects it +// may create a workbuffer that the other helpers can work on. +// The for loop either exits when a work buffer is found +// or when _all_ of the work.nproc GC helpers are in the loop +// looking for work and thus not capable of creating new work. +// This is in fact the termination condition for the STW mark +// phase. 
static Workbuf* getfull(Workbuf *b) { int32 i; if(b != nil) - runtime·lfstackpush(&runtime·work.empty, &b->node); + putempty(b); + b = (Workbuf*)runtime·lfstackpop(&runtime·work.full); + if(b==nil) + b = (Workbuf*)runtime·lfstackpop(&runtime·work.partial); if(b != nil || runtime·work.nproc == 1) return b; @@ -617,7 +863,9 @@ getfull(Workbuf *b) if(runtime·work.full != 0) { runtime·xadd(&runtime·work.nwait, -1); b = (Workbuf*)runtime·lfstackpop(&runtime·work.full); - if(b != nil) + if(b==nil) + b = (Workbuf*)runtime·lfstackpop(&runtime·work.partial); + if(b != nil) return b; runtime·xadd(&runtime·work.nwait, +1); } @@ -737,7 +985,7 @@ scanframe(Stkframe *frame, void *unused) } bv = runtime·stackmapdata(stackmap, pcdata); } - scanblock((byte*)frame->argp, bv.n/BitsPerPointer*PtrSize, bv.bytedata); + scanblock((byte*)frame->argp, bv.n/BitsPerPointer*PtrSize, bv.bytedata); } return true; } @@ -760,8 +1008,7 @@ scanstack(G *gp) case Gdead: return; case Grunning: - runtime·printf("runtime: gp=%p, goid=%D, gp->atomicstatus=%d\n", gp, gp->goid, runtime·readgstatus(gp)); - runtime·throw("mark - world not stopped"); + runtime·throw("scanstack: - goroutine not stopped"); case Grunnable: case Gsyscall: case Gwaiting: @@ -778,8 +1025,117 @@ scanstack(G *gp) runtime·tracebackdefers(gp, &fn, nil); } -// The gp has been moved to a gc safepoint. If there is gcphase specific -// work it is done here. +// If the slot is grey or black return true, if white return false. +// If the slot is not in the known heap and thus does not have a valid GC bitmap then +// it is considered grey. Globals and stacks can hold such slots. +// The slot is grey if its mark bit is set and it is enqueued to be scanned. +// The slot is black if it has already been scanned. +// It is white if it has a valid mark bit and the bit is not set. 
+static bool +shaded(byte *slot) +{ + Markbits mbits; + byte *valid; + + if(!inheap(slot)) // non-heap slots considered grey + return true; + + valid = objectstart(slot, &mbits); + if(valid == nil) + return true; + + if(checkmark) + return ischeckmarked(&mbits); + + return (mbits.bits&bitMarked) != 0; +} + +// Shade the object if it isn't already. +// The object is not nil and known to be in the heap. +static void +shade(byte *b) +{ + byte *obj; + Workbuf *wbuf; + Markbits mbits; + + if(!inheap(b)) + runtime·throw("shade: passed an address not in the heap"); + + wbuf = getpartialorempty(); + // Mark the object, return some important bits. + // If we combine the following two rotines we don't have to pass mbits or obj around. + obj = objectstart(b, &mbits); + if(obj != nil) + wbuf = greyobject(obj, &mbits, wbuf); // augments the wbuf + + putpartial(wbuf); + return; +} + +// This is the Dijkstra barrier coarsened to always shade the ptr (dst) object. +// The original Dijkstra barrier only shaded ptrs being placed in black slots. +// +// Shade indicates that it has seen a white pointer by adding the referent +// to wbuf as well as marking it. +// +// slot is the destination (dst) in go code +// ptr is the value that goes into the slot (src) in the go code +// +// Dijkstra pointed out that maintaining the no black to white +// pointers means that white to white pointers not need +// to be noted by the write barrier. Furthermore if either +// white object dies before it is reached by the +// GC then the object can be collected during this GC cycle +// instead of waiting for the next cycle. Unfortunately the cost of +// ensure that the object holding the slot doesn't concurrently +// change to black without the mutator noticing seems prohibitive. +// +// Consider the following example where the mutator writes into +// a slot and then loads the slot's mark bit while the GC thread +// writes to the slot's mark bit and then as part of scanning reads +// the slot. 
+// +// Initially both [slot] and [slotmark] are 0 (nil) +// Mutator thread GC thread +// st [slot], ptr st [slotmark], 1 +// +// ld r1, [slotmark] ld r2, [slot] +// +// This is a classic example of independent reads of independent writes, +// aka IRIW. The question is if r1==r2==0 is allowed and for most HW the +// answer is yes without inserting a memory barriers between the st and the ld. +// These barriers are expensive so we have decided that we will +// always grey the ptr object regardless of the slot's color. +// +void +runtime·gcmarkwb_m() +{ + byte *ptr; + ptr = (byte*)g->m->scalararg[1]; + + switch(runtime·gcphase) { + default: + runtime·throw("gcphasework in bad gcphase"); + case GCoff: + case GCquiesce: + case GCstw: + case GCsweep: + case GCscan: + break; + case GCmark: + if(ptr != nil && inheap(ptr)) + shade(ptr); + break; + case GCmarktermination: + if(ptr != nil && inheap(ptr)) + shade(ptr); + break; + } +} + +// The gp has been moved to a GC safepoint. GC phase specific +// work is done here. void runtime·gcphasework(G *gp) { @@ -790,12 +1146,18 @@ runtime·gcphasework(G *gp) case GCquiesce: case GCstw: case GCsweep: - // No work for now. + // No work. + break; + case GCscan: + // scan the stack, mark the objects, put pointers in work buffers + // hanging off the P where this is being run. + scanstack(gp); break; case GCmark: - // Disabled until concurrent GC is implemented - // but indicate the scan has been done. - // scanstack(gp); + break; + case GCmarktermination: + scanstack(gp); + // All available mark work will be emptied before returning. break; } gp->gcworkdone = true; @@ -885,6 +1247,7 @@ runtime·iterate_finq(void (*callback)(FuncVal*, byte*, uintptr, Type*, PtrType* } } +// Returns only when span s has been swept. 
void runtime·MSpan_EnsureSwept(MSpan *s) { @@ -899,6 +1262,7 @@ runtime·MSpan_EnsureSwept(MSpan *s) sg = runtime·mheap.sweepgen; if(runtime·atomicload(&s->sweepgen) == sg) return; + // The caller must be sure that the span is a MSpanInUse span. if(runtime·cas(&s->sweepgen, sg-2, sg-1)) { runtime·MSpan_Sweep(s, false); return; @@ -926,6 +1290,9 @@ runtime·MSpan_Sweep(MSpan *s, bool preserve) Special *special, **specialp, *y; bool res, sweepgenset; + if(checkmark) + runtime·throw("MSpan_Sweep: checkmark only runs in STW and after the sweep."); + // It's critical that we enter this function with preemption disabled, // GC must not start while we are in the middle of this function. if(g->m->locks == 0 && g->m->mallocing == 0 && g != g->m->g0) @@ -1173,6 +1540,7 @@ runtime·gosweepdone(void) return runtime·mheap.sweepdone; } + void runtime·gchelper(void) { @@ -1181,13 +1549,11 @@ runtime·gchelper(void) g->m->traceback = 2; gchelperstart(); - // parallel mark for over gc roots + // parallel mark for over GC roots runtime·parfordo(runtime·work.markfor); - - // help other threads scan secondary blocks - scanblock(nil, 0, nil); - - nproc = runtime·work.nproc; // runtime·work.nproc can change right after we increment runtime·work.ndone + if(runtime·gcphase != GCscan) + scanblock(nil, 0, nil); // blocks in getfull + nproc = runtime·work.nproc; // work.nproc can change right after we increment work.ndone if(runtime·xadd(&runtime·work.ndone, +1) == nproc-1) runtime·notewakeup(&runtime·work.alldone); g->m->traceback = 0; @@ -1353,6 +1719,7 @@ runtime·gcinit(void) runtime·gcbssmask = unrollglobgcprog(runtime·gcbss, runtime·ebss - runtime·bss); } +// Called from malloc.go using onM, stopping and starting the world handled in caller. 
void runtime·gc_m(void) { @@ -1366,17 +1733,296 @@ runtime·gc_m(void) a.start_time = (uint64)(g->m->scalararg[0]) | ((uint64)(g->m->scalararg[1]) << 32); a.eagersweep = g->m->scalararg[2]; gc(&a); + runtime·casgstatus(gp, Gwaiting, Grunning); +} + +// Similar to clearcheckmarkbits but works on a single span. +// It preforms two tasks. +// 1. When used before the checkmark phase it converts BitsDead (00) to bitsScalar (01) +// for nibbles with the BoundaryBit set. +// 2. When used after the checkmark phase it converts BitsPointerMark (11) to BitsPointer 10 and +// BitsScalarMark (00) to BitsScalar (01), thus clearing the checkmark mark encoding. +// For the second case it is possible to restore the BitsDead pattern but since +// clearmark is a debug tool performance has a lower priority than simplicity. +// The span is MSpanInUse and the world is stopped. +static void +clearcheckmarkbitsspan(MSpan *s) +{ + int32 cl, n, npages, i; + uintptr size, off, step; + byte *p, *bitp, *arena_start, b; + + if(s->state != MSpanInUse) { + runtime·printf("runtime:clearcheckmarkbitsspan: state=%d\n", + s->state); + runtime·throw("clearcheckmarkbitsspan: bad span state"); + } + arena_start = runtime·mheap.arena_start; + cl = s->sizeclass; + size = s->elemsize; + if(cl == 0) { + n = 1; + } else { + // Chunk full of small blocks. + npages = runtime·class_to_allocnpages[cl]; + n = (npages << PageShift) / size; + } + + // MSpan_Sweep has similar code but instead of overloading and + // complicating that routine we do a simpler walk here. + // Sweep through n objects of given size starting at p. + // This thread owns the span now, so it can manipulate + // the block bitmap without atomic operations. + p = (byte*)(s->start << PageShift); + // Find bits for the beginning of the span. 
+ off = (uintptr*)p - (uintptr*)arena_start; + bitp = arena_start - off/wordsPerBitmapByte - 1; + step = size/(PtrSize*wordsPerBitmapByte); + + // The type bit values are: + // 00 - BitsDead, for us BitsScalarMarked + // 01 - BitsScalar + // 10 - BitsPointer + // 11 - unused, for us BitsPointerMarked + // + // When called to prepare for the checkmark phase (checkmark==1), + // we change BitsDead to BitsScalar, so that there are no BitsScalarMarked + // type bits anywhere. + // + // The checkmark phase marks by changing BitsScalar to BitsScalarMarked + // and BitsPointer to BitsPointerMarked. + // + // When called to clean up after the checkmark phase (checkmark==0), + // we unmark by changing BitsScalarMarked back to BitsScalar and + // BitsPointerMarked back to BitsPointer. + // + // There are two problems with the scheme as just described. + // First, the setup rewrites BitsDead to BitsScalar, but the type bits + // following a BitsDead are uninitialized and must not be used. + // Second, objects that are free are expected to have their type + // bits zeroed (BitsDead), so in the cleanup we need to restore + // any BitsDeads that were there originally. + // + // In a one-word object (8-byte allocation on 64-bit system), + // there is no difference between BitsScalar and BitsDead, because + // neither is a pointer and there are no more words in the object, + // so using BitsScalar during the checkmark is safe and mapping + // both back to BitsDead during cleanup is also safe. + // + // In a larger object, we need to be more careful. During setup, + // if the type of the first word is BitsDead, we change it to BitsScalar + // (as we must) but also initialize the type of the second + // word to BitsDead, so that a scan during the checkmark phase + // will still stop before seeing the uninitialized type bits in the + // rest of the object. 
The sequence 'BitsScalar BitsDead' never + // happens in real type bitmaps - BitsDead is always as early + // as possible, so immediately after the last BitsPointer. + // During cleanup, if we see a BitsScalar, we can check to see if it + // is followed by BitsDead. If so, it was originally BitsDead and + // we can change it back. - if(nbadblock > 0) { - // Work out path from root to bad block. - for(;;) { - gc(&a); - if(nbadblock >= nelem(badblock)) - runtime·throw("cannot find path to bad pointer"); + if(step == 0) { + // updating top and bottom nibbles, all boundaries + for(i=0; i<n/2; i++, bitp--) { + if((*bitp & bitBoundary) != bitBoundary) + runtime·throw("missing bitBoundary"); + b = (*bitp & bitPtrMask)>>2; + if(!checkmark && (b == BitsScalar || b == BitsScalarMarked)) + *bitp &= ~0x0c; // convert to BitsDead + else if(b == BitsScalarMarked || b == BitsPointerMarked) + *bitp ^= BitsCheckMarkXor<<2; + + if(((*bitp>>gcBits) & bitBoundary) != bitBoundary) + runtime·throw("missing bitBoundary"); + b = ((*bitp>>gcBits) & bitPtrMask)>>2; + if(!checkmark && (b == BitsScalar || b == BitsScalarMarked)) + *bitp &= ~0xc0; // convert to BitsDead + else if(b == BitsScalarMarked || b == BitsPointerMarked) + *bitp ^= BitsCheckMarkXor<<(2+gcBits); + } + } else { + // updating bottom nibble for first word of each object + for(i=0; i<n; i++, bitp -= step) { + if((*bitp & bitBoundary) != bitBoundary) + runtime·throw("missing bitBoundary"); + b = (*bitp & bitPtrMask)>>2; + + if(checkmark && b == BitsDead) { + // move BitsDead into second word. + // set bits to BitsScalar in preparation for checkmark phase. + *bitp &= ~0xc0; + *bitp |= BitsScalar<<2; + } else if(!checkmark && (b == BitsScalar || b == BitsScalarMarked) && (*bitp & 0xc0) == 0) { + // Cleaning up after checkmark phase. + // First word is scalar or dead (we forgot) + // and second word is dead. + // First word might as well be dead too. 
+ *bitp &= ~0x0c; + } else if(b == BitsScalarMarked || b == BitsPointerMarked) + *bitp ^= BitsCheckMarkXor<<2; } } +} - runtime·casgstatus(gp, Gwaiting, Grunning); +// clearcheckmarkbits preforms two tasks. +// 1. When used before the checkmark phase it converts BitsDead (00) to bitsScalar (01) +// for nibbles with the BoundaryBit set. +// 2. When used after the checkmark phase it converts BitsPointerMark (11) to BitsPointer 10 and +// BitsScalarMark (00) to BitsScalar (01), thus clearing the checkmark mark encoding. +// This is a bit expensive but preserves the BitsDead encoding during the normal marking. +// BitsDead remains valid for every nibble except the ones with BitsBoundary set. +static void +clearcheckmarkbits(void) +{ + uint32 idx; + MSpan *s; + for(idx=0; idx<runtime·work.nspan; idx++) { + s = runtime·work.spans[idx]; + if(s->state == MSpanInUse) { + clearcheckmarkbitsspan(s); + } + } +} + +// Called from malloc.go using onM. +// The world is stopped. Rerun the scan and mark phases +// using the bitMarkedCheck bit instead of the +// bitMarked bit. If the marking encounters an +// bitMarked bit that is not set then we throw. +void +runtime·gccheckmark_m(void) +{ + if(!gccheckmarkenable) + return; + + if(checkmark) + runtime·throw("gccheckmark_m, entered with checkmark already true."); + + checkmark = true; + clearcheckmarkbits(); // Converts BitsDead to BitsScalar. + runtime·gc_m(); // turns off checkmark + // Work done, fixed up the GC bitmap to remove the checkmark bits. + clearcheckmarkbits(); +} + +// checkmarkenable is initially false +void +runtime·gccheckmarkenable_m(void) +{ + gccheckmarkenable = true; +} + +void +runtime·gccheckmarkdisable_m(void) +{ + gccheckmarkenable = false; +} + +void +runtime·finishsweep_m(void) +{ + uint32 i, sg; + MSpan *s; + + // The world is stopped so we should be able to complete the sweeps + // quickly. 
+ while(runtime·sweepone() != -1) + runtime·sweep.npausesweep++; + + // There may be some other spans being swept concurrently that + // we need to wait for. If finishsweep_m is done with the world stopped + // this code is not required. + sg = runtime·mheap.sweepgen; + for(i=0; i<runtime·work.nspan; i++) { + s = runtime·work.spans[i]; + if(s->sweepgen == sg) { + continue; + } + if(s->state != MSpanInUse) // Span is not part of the GCed heap so no need to ensure it is swept. + continue; + runtime·MSpan_EnsureSwept(s); + } +} + +// Scan all of the stacks, greying (or graying if in America) the referents +// but not blackening them since the mark write barrier isn't installed. +void +runtime·gcscan_m(void) +{ + uint32 i, allglen, oldphase; + G *gp, *mastergp, **allg; + + // Grab the g that called us and potentially allow rescheduling. + // This allows it to be scanned like other goroutines. + mastergp = g->m->curg; + + runtime·casgstatus(mastergp, Grunning, Gwaiting); + mastergp->waitreason = runtime·gostringnocopy((byte*)"garbage collection scan"); + + // Span sweeping has been done by finishsweep_m. + // Long term we will want to make this goroutine runnable + // by placing it onto a scanenqueue state and then calling + // runtime·restartg(mastergp) to make it Grunnable. + // At the bottom we will want to return this p back to the scheduler. + + oldphase = runtime·gcphase; + + runtime·lock(&runtime·allglock); + allglen = runtime·allglen; + allg = runtime·allg; + // Prepare flag indicating that the scan has not been completed. + for(i = 0; i < allglen; i++) { + gp = allg[i]; + gp->gcworkdone = false; // set to true in gcphasework + } + runtime·unlock(&runtime·allglock); + + runtime·work.nwait = 0; + runtime·work.ndone = 0; + runtime·work.nproc = 1; // For now do not do this in parallel. + runtime·gcphase = GCscan; + // ackgcphase is not needed since we are not scanning running goroutines. 
+ runtime·parforsetup(runtime·work.markfor, runtime·work.nproc, RootCount + allglen, nil, false, markroot); + runtime·parfordo(runtime·work.markfor); + + runtime·lock(&runtime·allglock); + + allg = runtime·allg; + // Check that gc work is done. + for(i = 0; i < allglen; i++) { + gp = allg[i]; + if(!gp->gcworkdone) { + runtime·throw("scan missed a g"); + } + } + runtime·unlock(&runtime·allglock); + + runtime·gcphase = oldphase; + runtime·casgstatus(mastergp, Gwaiting, Grunning); + // Let the g that called us continue to run. +} + +// Mark all objects that are known about. +void +runtime·gcmark_m(void) +{ + scanblock(nil, 0, nil); +} + +// For now this must be bracketed with a stoptheworld and a starttheworld to ensure +// all go routines see the new barrier. +void +runtime·gcinstallmarkwb_m(void) +{ + runtime·gcphase = GCmark; +} + +// For now this must be bracketed with a stoptheworld and a starttheworld to ensure +// all go routines see the new barrier. +void +runtime·gcinstalloffwb_m(void) +{ + runtime·gcphase = GCoff; } static void @@ -1385,9 +2031,9 @@ gc(struct gc_args *args) int64 t0, t1, t2, t3, t4; uint64 heap0, heap1, obj; GCStats stats; - - if(DebugPtrs) - runtime·printf("GC start\n"); + uint32 oldphase; + uint32 i; + G *gp; if(runtime·debug.allocfreetrace) runtime·tracegc(); @@ -1400,11 +2046,10 @@ gc(struct gc_args *args) if(runtime·debug.gctrace) t1 = runtime·nanotime(); - // Sweep what is not sweeped by bgsweep. - while(runtime·sweepone() != -1) - runtime·sweep.npausesweep++; + if(!checkmark) + runtime·finishsweep_m(); // skip during checkmark debug phase. - // Cache runtime.mheap.allspans in work.spans to avoid conflicts with + // Cache runtime·mheap.allspans in work.spans to avoid conflicts with // resizing/freeing allspans. 
// New spans can be created while GC progresses, but they are not garbage for // this round: @@ -1421,10 +2066,19 @@ gc(struct gc_args *args) runtime·work.spans = runtime·mheap.allspans; runtime·work.nspan = runtime·mheap.nspan; runtime·unlock(&runtime·mheap.lock); + oldphase = runtime·gcphase; runtime·work.nwait = 0; runtime·work.ndone = 0; - runtime·work.nproc = runtime·gcprocs(); + runtime·work.nproc = runtime·gcprocs(); + runtime·gcphase = GCmarktermination; + + // World is stopped so allglen will not change. + for(i = 0; i < runtime·allglen; i++) { + gp = runtime·allg[i]; + gp->gcworkdone = false; // set to true in gcphasework + } + runtime·parforsetup(runtime·work.markfor, runtime·work.nproc, RootCount + runtime·allglen, nil, false, markroot); if(runtime·work.nproc > 1) { runtime·noteclear(&runtime·work.alldone); @@ -1437,8 +2091,15 @@ gc(struct gc_args *args) gchelperstart(); runtime·parfordo(runtime·work.markfor); + scanblock(nil, 0, nil); + if(runtime·work.full) + runtime·throw("runtime·work.full != nil"); + if(runtime·work.partial) + runtime·throw("runtime·work.partial != nil"); + + runtime·gcphase = oldphase; t3 = 0; if(runtime·debug.gctrace) t3 = runtime·nanotime(); @@ -1499,6 +2160,16 @@ gc(struct gc_args *args) // Free the old cached mark array if necessary. if(runtime·work.spans != nil && runtime·work.spans != runtime·mheap.allspans) runtime·SysFree(runtime·work.spans, runtime·work.nspan*sizeof(runtime·work.spans[0]), &mstats.other_sys); + + if(gccheckmarkenable) { + if(!checkmark) { + // first half of two-pass; don't set up sweep + runtime·unlock(&runtime·mheap.lock); + return; + } + checkmark = false; // done checking marks + } + // Cache the current array for sweeping. 
runtime·mheap.gcspans = runtime·mheap.allspans; runtime·mheap.sweepgen += 2; @@ -1508,6 +2179,7 @@ gc(struct gc_args *args) runtime·sweep.spanidx = 0; runtime·unlock(&runtime·mheap.lock); + if(ConcurrentSweep && !args->eagersweep) { runtime·lock(&runtime·gclock); if(runtime·sweep.g == nil) @@ -1527,9 +2199,6 @@ gc(struct gc_args *args) runtime·mProf_GC(); g->m->traceback = 0; - - if(DebugPtrs) - runtime·printf("GC end\n"); } extern uintptr runtime·sizeof_C_MStats; @@ -1784,7 +2453,7 @@ runtime·unrollgcprog_m(void) Type *typ; byte *mask, *prog; uintptr pos; - uint32 x; + uintptr x; typ = g->m->ptrarg[0]; g->m->ptrarg[0] = nil; @@ -1802,9 +2471,11 @@ runtime·unrollgcprog_m(void) prog = (byte*)typ->gc[1]; unrollgcprog1(mask, prog, &pos, false, true); } + // atomic way to say mask[0] = 1 - x = ((uint32*)mask)[0]; - runtime·atomicstore((uint32*)mask, x|1); + x = *(uintptr*)mask; + ((byte*)&x)[0] = 1; + runtime·atomicstorep((void**)mask, (void*)x); } runtime·unlock(&lock); } diff --git a/src/runtime/mgc0.go b/src/runtime/mgc0.go index 3a7204b54..dc4eec519 100644 --- a/src/runtime/mgc0.go +++ b/src/runtime/mgc0.go @@ -83,54 +83,139 @@ func bgsweep() { } } +const ( + _PoisonGC = 0xf969696969696969 & (1<<(8*ptrSize) - 1) + _PoisonStack = 0x6868686868686868 & (1<<(8*ptrSize) - 1) +) + // NOTE: Really dst *unsafe.Pointer, src unsafe.Pointer, // but if we do that, Go inserts a write barrier on *dst = src. //go:nosplit func writebarrierptr(dst *uintptr, src uintptr) { *dst = src + writebarrierptr_nostore(dst, src) +} + +// Like writebarrierptr, but the store has already been applied. +// Do not reapply. 
+//go:nosplit +func writebarrierptr_nostore(dst *uintptr, src uintptr) { + if getg() == nil { // very low-level startup + return + } + + if src != 0 && (src < _PageSize || src == _PoisonGC || src == _PoisonStack) { + onM(func() { gothrow("bad pointer in write barrier") }) + } + + mp := acquirem() + if mp.inwb || mp.dying > 0 { + releasem(mp) + return + } + mp.inwb = true + oldscalar0 := mp.scalararg[0] + oldscalar1 := mp.scalararg[1] + mp.scalararg[0] = uintptr(unsafe.Pointer(dst)) + mp.scalararg[1] = src + onM_signalok(gcmarkwb_m) + mp.scalararg[0] = oldscalar0 + mp.scalararg[1] = oldscalar1 + mp.inwb = false + releasem(mp) } //go:nosplit func writebarrierstring(dst *[2]uintptr, src [2]uintptr) { - dst[0] = src[0] + writebarrierptr(&dst[0], src[0]) dst[1] = src[1] } //go:nosplit func writebarrierslice(dst *[3]uintptr, src [3]uintptr) { - dst[0] = src[0] + writebarrierptr(&dst[0], src[0]) dst[1] = src[1] dst[2] = src[2] } //go:nosplit func writebarrieriface(dst *[2]uintptr, src [2]uintptr) { - dst[0] = src[0] - dst[1] = src[1] -} - -//go:nosplit -func writebarrierfat2(dst *[2]uintptr, _ *byte, src [2]uintptr) { - dst[0] = src[0] - dst[1] = src[1] + writebarrierptr(&dst[0], src[0]) + writebarrierptr(&dst[1], src[1]) } -//go:nosplit -func writebarrierfat3(dst *[3]uintptr, _ *byte, src [3]uintptr) { - dst[0] = src[0] - dst[1] = src[1] - dst[2] = src[2] -} +//go:generate go run wbfat_gen.go -- wbfat.go +// +// The above line generates multiword write barriers for +// all the combinations of ptr+scalar up to four words. +// The implementations are written to wbfat.go. 
//go:nosplit -func writebarrierfat4(dst *[4]uintptr, _ *byte, src [4]uintptr) { - dst[0] = src[0] - dst[1] = src[1] - dst[2] = src[2] - dst[3] = src[3] +func writebarrierfat(typ *_type, dst, src unsafe.Pointer) { + mask := loadPtrMask(typ) + nptr := typ.size / ptrSize + for i := uintptr(0); i < nptr; i += 2 { + bits := mask[i/2] + if (bits>>2)&_BitsMask == _BitsPointer { + writebarrierptr((*uintptr)(dst), *(*uintptr)(src)) + } else { + *(*uintptr)(dst) = *(*uintptr)(src) + } + dst = add(dst, ptrSize) + src = add(src, ptrSize) + if i+1 == nptr { + break + } + bits >>= 4 + if (bits>>2)&_BitsMask == _BitsPointer { + writebarrierptr((*uintptr)(dst), *(*uintptr)(src)) + } else { + *(*uintptr)(dst) = *(*uintptr)(src) + } + dst = add(dst, ptrSize) + src = add(src, ptrSize) + } } //go:nosplit -func writebarrierfat(typ *_type, dst, src unsafe.Pointer) { - memmove(dst, src, typ.size) +func writebarriercopy(typ *_type, dst, src slice) int { + n := dst.len + if n > src.len { + n = src.len + } + if n == 0 { + return 0 + } + dstp := unsafe.Pointer(dst.array) + srcp := unsafe.Pointer(src.array) + + if uintptr(srcp) < uintptr(dstp) && uintptr(srcp)+uintptr(n)*typ.size > uintptr(dstp) { + // Overlap with src before dst. + // Copy backward, being careful not to move dstp/srcp + // out of the array they point into. + dstp = add(dstp, uintptr(n-1)*typ.size) + srcp = add(srcp, uintptr(n-1)*typ.size) + i := uint(0) + for { + writebarrierfat(typ, dstp, srcp) + if i++; i >= n { + break + } + dstp = add(dstp, -typ.size) + srcp = add(srcp, -typ.size) + } + } else { + // Copy forward, being careful not to move dstp/srcp + // out of the array they point into. 
+ i := uint(0) + for { + writebarrierfat(typ, dstp, srcp) + if i++; i >= n { + break + } + dstp = add(dstp, typ.size) + srcp = add(srcp, typ.size) + } + } + return int(n) } diff --git a/src/runtime/mgc0.h b/src/runtime/mgc0.h index 64f818914..519d7206e 100644 --- a/src/runtime/mgc0.h +++ b/src/runtime/mgc0.h @@ -45,8 +45,12 @@ enum { // If you change these, also change scanblock. // scanblock does "if(bits == BitsScalar || bits == BitsDead)" as "if(bits <= BitsScalar)". BitsDead = 0, - BitsScalar = 1, - BitsPointer = 2, + BitsScalar = 1, // 01 + BitsPointer = 2, // 10 + BitsCheckMarkXor = 1, // 01 + BitsScalarMarked = BitsScalar ^ BitsCheckMarkXor, // 00 + BitsPointerMarked = BitsPointer ^ BitsCheckMarkXor, // 11 + BitsMultiWord = 3, // BitsMultiWord will be set for the first word of a multi-word item. // When it is set, one of the following will be set for the second word. @@ -56,7 +60,7 @@ enum { BitsEface = 3, // 64 bytes cover objects of size 1024/512 on 64/32 bits, respectively. - MaxGCMask = 64, + MaxGCMask = 65536, // TODO(rsc): change back to 64 }; // Bits in per-word bitmap. diff --git a/src/runtime/noasm_arm.go b/src/runtime/noasm.go index dd3ef8267..43c16860b 100644 --- a/src/runtime/noasm_arm.go +++ b/src/runtime/noasm.go @@ -5,6 +5,8 @@ // Routines that are implemented in assembly in asm_{amd64,386}.s // but are implemented in Go for arm. +// +build arm power64 power64le + package runtime func cmpstring(s1, s2 string) int { diff --git a/src/runtime/os_darwin.c b/src/runtime/os_darwin.c index bbd29282b..b866863d0 100644 --- a/src/runtime/os_darwin.c +++ b/src/runtime/os_darwin.c @@ -135,7 +135,10 @@ void runtime·mpreinit(M *mp) { mp->gsignal = runtime·malg(32*1024); // OS X wants >=8K, Linux >=2K + runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal); + mp->gsignal->m = mp; + runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m); } // Called to initialize a new m (including the bootstrap m). 
diff --git a/src/runtime/os_dragonfly.c b/src/runtime/os_dragonfly.c index e372205ec..051192ad3 100644 --- a/src/runtime/os_dragonfly.c +++ b/src/runtime/os_dragonfly.c @@ -195,7 +195,10 @@ void runtime·mpreinit(M *mp) { mp->gsignal = runtime·malg(32*1024); + runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal); + mp->gsignal->m = mp; + runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m); } // Called to initialize a new m (including the bootstrap m). diff --git a/src/runtime/os_freebsd.c b/src/runtime/os_freebsd.c index a513cb604..1c126547a 100644 --- a/src/runtime/os_freebsd.c +++ b/src/runtime/os_freebsd.c @@ -203,7 +203,10 @@ void runtime·mpreinit(M *mp) { mp->gsignal = runtime·malg(32*1024); + runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal); + mp->gsignal->m = mp; + runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m); } // Called to initialize a new m (including the bootstrap m). diff --git a/src/runtime/os_linux.c b/src/runtime/os_linux.c index 0d8ffc995..cc23774e3 100644 --- a/src/runtime/os_linux.c +++ b/src/runtime/os_linux.c @@ -49,9 +49,22 @@ runtime·futexsleep(uint32 *addr, uint32 val, int64 ns) runtime·futex(addr, FUTEX_WAIT, val, nil, nil, 0); return; } - // NOTE: tv_nsec is int64 on amd64, so this assumes a little-endian system. + + // It's difficult to live within the no-split stack limits here. + // On ARM and 386, a 64-bit divide invokes a general software routine + // that needs more stack than we can afford. So we use timediv instead. + // But on real 64-bit systems, where words are larger but the stack limit + // is not, even timediv is too heavy, and we really need to use just an + // ordinary machine instruction. + // Sorry for the #ifdef. + // For what it's worth, the #ifdef eliminated an implicit little-endian assumption. 
+#ifdef _64BIT + ts.tv_sec = ns / 1000000000LL; + ts.tv_nsec = ns % 1000000000LL; +#else ts.tv_nsec = 0; ts.tv_sec = runtime·timediv(ns, 1000000000LL, (int32*)&ts.tv_nsec); +#endif runtime·futex(addr, FUTEX_WAIT, val, &ts, nil, 0); } @@ -98,19 +111,22 @@ static int32 getproccount(void) { uintptr buf[16], t; - int32 r, cnt, i; + int32 r, n, i; - cnt = 0; r = runtime·sched_getaffinity(0, sizeof(buf), buf); - if(r > 0) + if(r <= 0) + return 1; + n = 0; for(i = 0; i < r/sizeof(buf[0]); i++) { t = buf[i]; - t = t - ((t >> 1) & 0x5555555555555555ULL); - t = (t & 0x3333333333333333ULL) + ((t >> 2) & 0x3333333333333333ULL); - cnt += (int32)((((t + (t >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); + while(t != 0) { + n += t&1; + t >>= 1; + } } - - return cnt ? cnt : 1; + if(n < 1) + n = 1; + return n; } // Clone, the Linux rfork. @@ -217,7 +233,10 @@ void runtime·mpreinit(M *mp) { mp->gsignal = runtime·malg(32*1024); // OS X wants >=8K, Linux >=2K + runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal); + mp->gsignal->m = mp; + runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m); } // Called to initialize a new m (including the bootstrap m). @@ -298,7 +317,8 @@ runtime·setsig(int32 i, GoSighandler *fn, bool restart) if(fn == runtime·sighandler) fn = (void*)runtime·sigtramp; sa.sa_handler = fn; - if(runtime·rt_sigaction(i, &sa, nil, sizeof(sa.sa_mask)) != 0) + // Qemu rejects rt_sigaction of SIGRTMAX (64). 
+ if(runtime·rt_sigaction(i, &sa, nil, sizeof(sa.sa_mask)) != 0 && i != 64) runtime·throw("rt_sigaction failure"); } diff --git a/src/runtime/os_nacl.c b/src/runtime/os_nacl.c index 14b558303..ad72cc7c6 100644 --- a/src/runtime/os_nacl.c +++ b/src/runtime/os_nacl.c @@ -20,7 +20,10 @@ void runtime·mpreinit(M *mp) { mp->gsignal = runtime·malg(32*1024); // OS X wants >=8K, Linux >=2K + runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal); + mp->gsignal->m = mp; + runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m); } // Called to initialize a new m (including the bootstrap m). diff --git a/src/runtime/os_netbsd.c b/src/runtime/os_netbsd.c index 58e5bedf2..28929ea57 100644 --- a/src/runtime/os_netbsd.c +++ b/src/runtime/os_netbsd.c @@ -271,7 +271,10 @@ void runtime·mpreinit(M *mp) { mp->gsignal = runtime·malg(32*1024); + runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal); + mp->gsignal->m = mp; + runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m); } // Called to initialize a new m (including the bootstrap m). diff --git a/src/runtime/os_openbsd.c b/src/runtime/os_openbsd.c index eebaa13ee..960aaffff 100644 --- a/src/runtime/os_openbsd.c +++ b/src/runtime/os_openbsd.c @@ -217,7 +217,10 @@ void runtime·mpreinit(M *mp) { mp->gsignal = runtime·malg(32*1024); + runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal); + mp->gsignal->m = mp; + runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m); } // Called to initialize a new m (including the bootstrap m). diff --git a/src/runtime/os_plan9.c b/src/runtime/os_plan9.c index f8c543f6f..18460fc12 100644 --- a/src/runtime/os_plan9.c +++ b/src/runtime/os_plan9.c @@ -20,12 +20,18 @@ runtime·mpreinit(M *mp) { // Initialize stack and goroutine for note handling. 
mp->gsignal = runtime·malg(32*1024); + runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal); + mp->gsignal->m = mp; + runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m); + mp->notesig = (int8*)runtime·mallocgc(ERRMAX*sizeof(int8), nil, FlagNoScan); + runtime·writebarrierptr_nostore(&mp->notesig, mp->notesig); // Initialize stack for handling strings from the // errstr system call, as used in package syscall. mp->errstr = (byte*)runtime·mallocgc(ERRMAX*sizeof(byte), nil, FlagNoScan); + runtime·writebarrierptr_nostore(&mp->errstr, mp->errstr); } // Called to initialize a new m (including the bootstrap m). diff --git a/src/runtime/os_solaris.c b/src/runtime/os_solaris.c index e16b8e637..bee91d8e6 100644 --- a/src/runtime/os_solaris.c +++ b/src/runtime/os_solaris.c @@ -176,7 +176,10 @@ void runtime·mpreinit(M *mp) { mp->gsignal = runtime·malg(32*1024); + runtime·writebarrierptr_nostore(&mp->gsignal, mp->gsignal); + mp->gsignal->m = mp; + runtime·writebarrierptr_nostore(&mp->gsignal->m, mp->gsignal->m); } // Called to initialize a new m (including the bootstrap m). diff --git a/src/runtime/panic.c b/src/runtime/panic.c index 24eb6dbfe..46683b2b0 100644 --- a/src/runtime/panic.c +++ b/src/runtime/panic.c @@ -70,7 +70,7 @@ runtime·recovery_m(G *gp) // (The pc we're returning to does pop pop // before it tests the return value.) // On the arm there are 2 saved LRs mixed in too. - if(thechar == '5') + if(thechar == '5' || thechar == '9') gp->sched.sp = (uintptr)argp - 4*sizeof(uintptr); else gp->sched.sp = (uintptr)argp - 2*sizeof(uintptr); diff --git a/src/runtime/panic.go b/src/runtime/panic.go index 685ff5ca0..91b5da294 100644 --- a/src/runtime/panic.go +++ b/src/runtime/panic.go @@ -61,7 +61,7 @@ func deferproc(siz int32, fn *funcval) { // arguments of fn follow fn // we can only call nosplit routines. 
argp := uintptr(unsafe.Pointer(&fn)) argp += unsafe.Sizeof(fn) - if GOARCH == "arm" { + if GOARCH == "arm" || GOARCH == "power64" || GOARCH == "power64le" { argp += ptrSize // skip caller's saved link register } mp := acquirem() @@ -494,12 +494,12 @@ func throw(s *byte) { //go:nosplit func gothrow(s string) { + print("fatal error: ", s, "\n") gp := getg() if gp.m.throwing == 0 { gp.m.throwing = 1 } startpanic() - print("fatal error: ", s, "\n") dopanic(0) *(*int)(nil) = 0 // not reached } diff --git a/src/runtime/print1.go b/src/runtime/print1.go index 8f8268873..3d812bd04 100644 --- a/src/runtime/print1.go +++ b/src/runtime/print1.go @@ -41,7 +41,31 @@ func snprintf(dst *byte, n int32, s *byte) { gp.writebuf = nil } -//var debuglock mutex +var debuglock mutex + +// The compiler emits calls to printlock and printunlock around +// the multiple calls that implement a single Go print or println +// statement. Some of the print helpers (printsp, for example) +// call print recursively. There is also the problem of a crash +// happening during the print routines and needing to acquire +// the print lock to print information about the crash. +// For both these reasons, let a thread acquire the printlock 'recursively'. + +func printlock() { + mp := getg().m + mp.printlock++ + if mp.printlock == 1 { + lock(&debuglock) + } +} + +func printunlock() { + mp := getg().m + mp.printlock-- + if mp.printlock == 0 { + unlock(&debuglock) + } +} // write to goroutine-local buffer if diverting output, // or else standard error. @@ -80,7 +104,7 @@ func printnl() { // Very simple printf. Only for debugging prints. // Do not add to this without checking with Rob. 
func vprintf(str string, arg unsafe.Pointer) { - //lock(&debuglock); + printlock() s := bytes(str) start := 0 @@ -160,7 +184,7 @@ func vprintf(str string, arg unsafe.Pointer) { gwrite(s[start:i]) } - //unlock(&debuglock); + printunlock() } func printpc(p unsafe.Pointer) { diff --git a/src/runtime/proc.c b/src/runtime/proc.c index 91e3fe16d..ce39db4ab 100644 --- a/src/runtime/proc.c +++ b/src/runtime/proc.c @@ -423,13 +423,7 @@ runtime·casgstatus(G *gp, uint32 oldval, uint32 newval) // loop if gp->atomicstatus is in a scan state giving // GC time to finish and change the state to oldval. while(!runtime·cas(&gp->atomicstatus, oldval, newval)) { - // Help GC if needed. - if(gp->preemptscan && !gp->gcworkdone && (oldval == Grunning || oldval == Gsyscall)) { - gp->preemptscan = false; - g->m->ptrarg[0] = gp; - fn = helpcasgstatus; - runtime·onM(&fn); - } + } } @@ -504,6 +498,13 @@ runtime·stopg(G *gp) return false; case Grunning: + if(runtime·gcphase == GCscan) { + gp->gcworkdone = true; + return false; + // Running routines not scanned during + // GCscan phase, we only scan non-running routines. + } + // Claim goroutine, so we aren't racing with a status // transition away from Grunning. if(!runtime·castogscanstatus(gp, Grunning, Gscanrunning)) @@ -581,9 +582,10 @@ mquiesce(G *gpmaster) uint32 status; uint32 activeglen; - activeglen = runtime·allglen; // enqueue the calling goroutine. runtime·restartg(gpmaster); + + activeglen = runtime·allglen; for(i = 0; i < activeglen; i++) { gp = runtime·allg[i]; if(runtime·readgstatus(gp) == Gdead) @@ -874,7 +876,9 @@ runtime·allocm(P *p) mp->g0 = runtime·malg(-1); else mp->g0 = runtime·malg(8192); + runtime·writebarrierptr_nostore(&mp->g0, mp->g0); mp->g0->m = mp; + runtime·writebarrierptr_nostore(&mp->g0->m, mp->g0->m); if(p == g->m->p) releasep(); @@ -1058,7 +1062,7 @@ runtime·dropm(void) unlockextra(mp); } -#define MLOCKED ((M*)1) +#define MLOCKED 1 // lockextra locks the extra list and returns the list head. 
// The caller must unlock the list by storing a new list head @@ -1069,28 +1073,28 @@ runtime·dropm(void) static M* lockextra(bool nilokay) { - M *mp; + uintptr mpx; void (*yield)(void); for(;;) { - mp = runtime·atomicloadp(&runtime·extram); - if(mp == MLOCKED) { + mpx = runtime·atomicloaduintptr((uintptr*)&runtime·extram); + if(mpx == MLOCKED) { yield = runtime·osyield; yield(); continue; } - if(mp == nil && !nilokay) { + if(mpx == 0 && !nilokay) { runtime·usleep(1); continue; } - if(!runtime·casp(&runtime·extram, mp, MLOCKED)) { + if(!runtime·casuintptr((uintptr*)&runtime·extram, mpx, MLOCKED)) { yield = runtime·osyield; yield(); continue; } break; } - return mp; + return (M*)mpx; } #pragma textflag NOSPLIT @@ -1915,6 +1919,7 @@ exitsyscallfast(void) // Freezetheworld sets stopwait but does not retake P's. if(runtime·sched.stopwait) { + g->m->mcache = nil; g->m->p = nil; return false; } @@ -1927,6 +1932,7 @@ exitsyscallfast(void) return true; } // Try to get any other idle P. + g->m->mcache = nil; g->m->p = nil; if(runtime·sched.pidle) { fn = exitsyscallfast_pidle; @@ -2122,7 +2128,7 @@ runtime·newproc(int32 siz, FuncVal* fn, ...) byte *argp; void (*mfn)(void); - if(thechar == '5') + if(thechar == '5' || thechar == '9') argp = (byte*)(&fn+2); // skip caller's saved LR else argp = (byte*)(&fn+1); @@ -2182,7 +2188,7 @@ runtime·newproc1(FuncVal *fn, byte *argp, int32 narg, int32 nret, void *callerp sp -= 4*sizeof(uintreg); // extra space in case of reads slightly beyond frame sp -= siz; runtime·memmove(sp, argp, narg); - if(thechar == '5') { + if(thechar == '5' || thechar == '9') { // caller's LR sp -= sizeof(void*); *(void**)sp = nil; @@ -2615,6 +2621,8 @@ runtime·setcpuprofilerate_m(void) P *runtime·newP(void); // Change number of processors. The world is stopped, sched is locked. +// gcworkbufs are not being modified by either the GC or +// the write barrier code. 
static void procresize(int32 new) { diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 5b8c7d8ae..f41ffbff3 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -165,6 +165,9 @@ func acquireSudog() *sudog { // which keeps the garbage collector from being invoked. mp := acquirem() p := new(sudog) + if p.elem != nil { + gothrow("acquireSudog: found p.elem != nil after new") + } releasem(mp) return p } diff --git a/src/runtime/race_amd64.s b/src/runtime/race_amd64.s index bdea28c7c..15b18ff8f 100644 --- a/src/runtime/race_amd64.s +++ b/src/runtime/race_amd64.s @@ -140,20 +140,20 @@ TEXT racecalladdr<>(SB), NOSPLIT, $0-0 MOVQ g_racectx(R14), RARG0 // goroutine context // Check that addr is within [arenastart, arenaend) or within [noptrdata, enoptrbss). CMPQ RARG1, runtime·racearenastart(SB) - JB racecalladdr_data + JB data CMPQ RARG1, runtime·racearenaend(SB) - JB racecalladdr_call -racecalladdr_data: + JB call +data: MOVQ $runtime·noptrdata(SB), R13 CMPQ RARG1, R13 - JB racecalladdr_ret + JB ret MOVQ $runtime·enoptrbss(SB), R13 CMPQ RARG1, R13 - JAE racecalladdr_ret -racecalladdr_call: + JAE ret +call: MOVQ AX, AX // w/o this 6a miscompiles this function JMP racecall<>(SB) -racecalladdr_ret: +ret: RET // func runtime·racefuncenter(pc uintptr) @@ -335,9 +335,9 @@ TEXT racecall<>(SB), NOSPLIT, $0-0 MOVQ SP, R12 // callee-saved, preserved across the CALL MOVQ m_g0(R13), R10 CMPQ R10, R14 - JE racecall_cont // already on g0 + JE call // already on g0 MOVQ (g_sched+gobuf_sp)(R10), SP -racecall_cont: +call: ANDQ $~15, SP // alignment for gcc ABI CALL AX MOVQ R12, SP diff --git a/src/runtime/rt0_linux_power64.s b/src/runtime/rt0_linux_power64.s new file mode 100644 index 000000000..970b6a673 --- /dev/null +++ b/src/runtime/rt0_linux_power64.s @@ -0,0 +1,17 @@ +#include "textflag.h" + +// actually a function descriptor for _main<>(SB) +TEXT _rt0_power64_linux(SB),NOSPLIT,$0 + DWORD $_main<>(SB) + DWORD $0 + DWORD $0 + +TEXT _main<>(SB),NOSPLIT,$-8 + MOVD 
0(R1), R3 // argc + ADD $8, R1, R4 // argv + BR main(SB) + +TEXT main(SB),NOSPLIT,$-8 + MOVD $runtime·rt0_go(SB), R31 + MOVD R31, CTR + BR (CTR) diff --git a/src/runtime/rt0_linux_power64le.s b/src/runtime/rt0_linux_power64le.s new file mode 100644 index 000000000..85ce84733 --- /dev/null +++ b/src/runtime/rt0_linux_power64le.s @@ -0,0 +1,14 @@ +#include "textflag.h" + +TEXT _rt0_power64le_linux(SB),NOSPLIT,$0 + BR _main<>(SB) + +TEXT _main<>(SB),NOSPLIT,$-8 + MOVD 0(R1), R3 // argc + ADD $8, R1, R4 // argv + BR main(SB) + +TEXT main(SB),NOSPLIT,$-8 + MOVD $runtime·rt0_go(SB), R31 + MOVD R31, CTR + BR (CTR) diff --git a/src/runtime/runtime.c b/src/runtime/runtime.c index c823691ec..f19f8e4be 100644 --- a/src/runtime/runtime.c +++ b/src/runtime/runtime.c @@ -185,6 +185,7 @@ runtime·check(void) float64 j, j1; byte *k, *k1; uint16* l; + byte m[4]; struct x1 { byte x; }; @@ -236,6 +237,11 @@ runtime·check(void) if(k != k1) runtime·throw("casp3"); + m[0] = m[1] = m[2] = m[3] = 0x1; + runtime·atomicor8(&m[1], 0xf0); + if (m[0] != 0x1 || m[1] != 0xf1 || m[2] != 0x1 || m[3] != 0x1) + runtime·throw("atomicor8"); + *(uint64*)&j = ~0ULL; if(j == j) runtime·throw("float64nan"); diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h index 977c4547d..330ed429b 100644 --- a/src/runtime/runtime.h +++ b/src/runtime/runtime.h @@ -94,6 +94,7 @@ typedef struct PollDesc PollDesc; typedef struct DebugVars DebugVars; typedef struct ForceGCState ForceGCState; typedef struct Stack Stack; +typedef struct Workbuf Workbuf; /* * Per-CPU declaration. 
@@ -304,7 +305,7 @@ struct G bool paniconfault; // panic (instead of crash) on unexpected fault address bool preemptscan; // preempted g does scan for GC bool gcworkdone; // debug: cleared at begining of gc work phase cycle, set by gcphasework, tested at end of cycle - bool throwsplit; // must not split stack + bool throwsplit; // must not split stack int8 raceignore; // ignore race detection events M* m; // for debuggers, but offset not hard-coded M* lockedm; @@ -344,6 +345,8 @@ struct M int32 helpgc; bool spinning; // M is out of work and is actively looking for work bool blocked; // M is blocked on a Note + bool inwb; // M is executing a write barrier + int8 printlock; uint32 fastrand; uint64 ncgocall; // number of cgo calls in total int32 ncgo; // number of cgo calls currently in progress @@ -570,9 +573,10 @@ enum { #endif // Lock-free stack node. +// Also known to export_test.go. struct LFNode { - LFNode *next; + uint64 next; uintptr pushcnt; }; @@ -598,6 +602,16 @@ struct ParFor uint64 nsleep; }; +enum { + WorkbufSize = 4*1024, +}; +struct Workbuf +{ + LFNode node; // must be first + uintptr nobj; + byte* obj[(WorkbufSize-sizeof(LFNode)-sizeof(uintptr))/PtrSize]; +}; + // Track memory allocated by code not written in Go during a cgo call, // so that the garbage collector can see them. struct CgoMal @@ -620,12 +634,14 @@ struct DebugVars // Indicates to write barrier and sychronization task to preform. enum -{ // Synchronization Write barrier - GCoff, // stop and start nop - GCquiesce, // stop and start nop - GCstw, // stop the ps nop - GCmark, // scan the stacks and start no white to black - GCsweep, // stop and start nop +{ // Action WB installation + GCoff = 0, // stop and start no wb + GCquiesce, // stop and start no wb + GCstw, // stop the ps nop + GCscan, // scan the stacks prior to marking + GCmark, // mark use wbufs from GCscan and globals, scan the stacks, then go to GCmarktermination + GCmarktermination, // mark termination detection. 
Allocate black, Ps help out GC + GCsweep, // stop and start nop }; struct ForceGCState @@ -636,6 +652,7 @@ struct ForceGCState }; extern uint32 runtime·gcphase; +extern Mutex runtime·allglock; /* * defined macros @@ -666,6 +683,7 @@ enum { uint32 runtime·readgstatus(G*); void runtime·casgstatus(G*, uint32, uint32); +bool runtime·castogscanstatus(G*, uint32, uint32); void runtime·quiesce(G*); bool runtime·stopg(G*); void runtime·restartg(G*); @@ -882,6 +900,7 @@ int32 runtime·round2(int32 x); // round x up to a power of 2. bool runtime·cas(uint32*, uint32, uint32); bool runtime·cas64(uint64*, uint64, uint64); bool runtime·casp(void**, void*, void*); +bool runtime·casuintptr(uintptr*, uintptr, uintptr); // Don't confuse with XADD x86 instruction, // this one is actually 'addx', that is, add-and-fetch. uint32 runtime·xadd(uint32 volatile*, int32); @@ -1108,6 +1127,8 @@ void runtime·osyield(void); void runtime·lockOSThread(void); void runtime·unlockOSThread(void); +void runtime·writebarrierptr_nostore(void*, void*); + bool runtime·showframe(Func*, G*); void runtime·printcreatedby(G*); diff --git a/src/runtime/select.go b/src/runtime/select.go index efe68c1f5..d703e1d79 100644 --- a/src/runtime/select.go +++ b/src/runtime/select.go @@ -377,12 +377,7 @@ loop: // iterating through the linked list they are in reverse order. cas = nil sglist = gp.waiting - // Clear all selectdone and elem before unlinking from gp.waiting. - // They must be cleared before being put back into the sudog cache. - // Clear before unlinking, because if a stack copy happens after the unlink, - // they will not be updated, they will be left pointing to the old stack, - // which creates dangling pointers, which may be detected by the - // garbage collector. + // Clear all elem before unlinking from gp.waiting. 
for sg1 := gp.waiting; sg1 != nil; sg1 = sg1.waitlink { sg1.selectdone = nil sg1.elem = nil diff --git a/src/runtime/signal_linux_power64.h b/src/runtime/signal_linux_power64.h new file mode 100644 index 000000000..840648920 --- /dev/null +++ b/src/runtime/signal_linux_power64.h @@ -0,0 +1,49 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#define SIG_REGS(ctxt) (*((Sigcontext*)&((Ucontext*)(ctxt))->uc_mcontext)->regs) + +#define SIG_R0(info, ctxt) (SIG_REGS(ctxt).gpr[0]) +#define SIG_R1(info, ctxt) (SIG_REGS(ctxt).gpr[1]) +#define SIG_R2(info, ctxt) (SIG_REGS(ctxt).gpr[2]) +#define SIG_R3(info, ctxt) (SIG_REGS(ctxt).gpr[3]) +#define SIG_R4(info, ctxt) (SIG_REGS(ctxt).gpr[4]) +#define SIG_R5(info, ctxt) (SIG_REGS(ctxt).gpr[5]) +#define SIG_R6(info, ctxt) (SIG_REGS(ctxt).gpr[6]) +#define SIG_R7(info, ctxt) (SIG_REGS(ctxt).gpr[7]) +#define SIG_R8(info, ctxt) (SIG_REGS(ctxt).gpr[8]) +#define SIG_R9(info, ctxt) (SIG_REGS(ctxt).gpr[9]) +#define SIG_R10(info, ctxt) (SIG_REGS(ctxt).gpr[10]) +#define SIG_R11(info, ctxt) (SIG_REGS(ctxt).gpr[11]) +#define SIG_R12(info, ctxt) (SIG_REGS(ctxt).gpr[12]) +#define SIG_R13(info, ctxt) (SIG_REGS(ctxt).gpr[13]) +#define SIG_R14(info, ctxt) (SIG_REGS(ctxt).gpr[14]) +#define SIG_R15(info, ctxt) (SIG_REGS(ctxt).gpr[15]) +#define SIG_R16(info, ctxt) (SIG_REGS(ctxt).gpr[16]) +#define SIG_R17(info, ctxt) (SIG_REGS(ctxt).gpr[17]) +#define SIG_R18(info, ctxt) (SIG_REGS(ctxt).gpr[18]) +#define SIG_R19(info, ctxt) (SIG_REGS(ctxt).gpr[19]) +#define SIG_R20(info, ctxt) (SIG_REGS(ctxt).gpr[20]) +#define SIG_R21(info, ctxt) (SIG_REGS(ctxt).gpr[21]) +#define SIG_R22(info, ctxt) (SIG_REGS(ctxt).gpr[22]) +#define SIG_R23(info, ctxt) (SIG_REGS(ctxt).gpr[23]) +#define SIG_R24(info, ctxt) (SIG_REGS(ctxt).gpr[24]) +#define SIG_R25(info, ctxt) (SIG_REGS(ctxt).gpr[25]) +#define SIG_R26(info, ctxt) (SIG_REGS(ctxt).gpr[26]) +#define 
SIG_R27(info, ctxt) (SIG_REGS(ctxt).gpr[27]) +#define SIG_R28(info, ctxt) (SIG_REGS(ctxt).gpr[28]) +#define SIG_R29(info, ctxt) (SIG_REGS(ctxt).gpr[29]) +#define SIG_R30(info, ctxt) (SIG_REGS(ctxt).gpr[30]) +#define SIG_R31(info, ctxt) (SIG_REGS(ctxt).gpr[31]) + +#define SIG_SP(info, ctxt) (SIG_REGS(ctxt).gpr[1]) +#define SIG_PC(info, ctxt) (SIG_REGS(ctxt).nip) +#define SIG_TRAP(info, ctxt) (SIG_REGS(ctxt).trap) +#define SIG_CTR(info, ctxt) (SIG_REGS(ctxt).ctr) +#define SIG_LINK(info, ctxt) (SIG_REGS(ctxt).link) +#define SIG_XER(info, ctxt) (SIG_REGS(ctxt).xer) +#define SIG_CCR(info, ctxt) (SIG_REGS(ctxt).ccr) + +#define SIG_CODE0(info, ctxt) ((uintptr)(info)->si_code) +#define SIG_FAULT(info, ctxt) (SIG_REGS(ctxt).dar) diff --git a/src/runtime/signal_linux_power64le.h b/src/runtime/signal_linux_power64le.h new file mode 100644 index 000000000..840648920 --- /dev/null +++ b/src/runtime/signal_linux_power64le.h @@ -0,0 +1,49 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#define SIG_REGS(ctxt) (*((Sigcontext*)&((Ucontext*)(ctxt))->uc_mcontext)->regs) + +#define SIG_R0(info, ctxt) (SIG_REGS(ctxt).gpr[0]) +#define SIG_R1(info, ctxt) (SIG_REGS(ctxt).gpr[1]) +#define SIG_R2(info, ctxt) (SIG_REGS(ctxt).gpr[2]) +#define SIG_R3(info, ctxt) (SIG_REGS(ctxt).gpr[3]) +#define SIG_R4(info, ctxt) (SIG_REGS(ctxt).gpr[4]) +#define SIG_R5(info, ctxt) (SIG_REGS(ctxt).gpr[5]) +#define SIG_R6(info, ctxt) (SIG_REGS(ctxt).gpr[6]) +#define SIG_R7(info, ctxt) (SIG_REGS(ctxt).gpr[7]) +#define SIG_R8(info, ctxt) (SIG_REGS(ctxt).gpr[8]) +#define SIG_R9(info, ctxt) (SIG_REGS(ctxt).gpr[9]) +#define SIG_R10(info, ctxt) (SIG_REGS(ctxt).gpr[10]) +#define SIG_R11(info, ctxt) (SIG_REGS(ctxt).gpr[11]) +#define SIG_R12(info, ctxt) (SIG_REGS(ctxt).gpr[12]) +#define SIG_R13(info, ctxt) (SIG_REGS(ctxt).gpr[13]) +#define SIG_R14(info, ctxt) (SIG_REGS(ctxt).gpr[14]) +#define SIG_R15(info, ctxt) (SIG_REGS(ctxt).gpr[15]) +#define SIG_R16(info, ctxt) (SIG_REGS(ctxt).gpr[16]) +#define SIG_R17(info, ctxt) (SIG_REGS(ctxt).gpr[17]) +#define SIG_R18(info, ctxt) (SIG_REGS(ctxt).gpr[18]) +#define SIG_R19(info, ctxt) (SIG_REGS(ctxt).gpr[19]) +#define SIG_R20(info, ctxt) (SIG_REGS(ctxt).gpr[20]) +#define SIG_R21(info, ctxt) (SIG_REGS(ctxt).gpr[21]) +#define SIG_R22(info, ctxt) (SIG_REGS(ctxt).gpr[22]) +#define SIG_R23(info, ctxt) (SIG_REGS(ctxt).gpr[23]) +#define SIG_R24(info, ctxt) (SIG_REGS(ctxt).gpr[24]) +#define SIG_R25(info, ctxt) (SIG_REGS(ctxt).gpr[25]) +#define SIG_R26(info, ctxt) (SIG_REGS(ctxt).gpr[26]) +#define SIG_R27(info, ctxt) (SIG_REGS(ctxt).gpr[27]) +#define SIG_R28(info, ctxt) (SIG_REGS(ctxt).gpr[28]) +#define SIG_R29(info, ctxt) (SIG_REGS(ctxt).gpr[29]) +#define SIG_R30(info, ctxt) (SIG_REGS(ctxt).gpr[30]) +#define SIG_R31(info, ctxt) (SIG_REGS(ctxt).gpr[31]) + +#define SIG_SP(info, ctxt) (SIG_REGS(ctxt).gpr[1]) +#define SIG_PC(info, ctxt) (SIG_REGS(ctxt).nip) +#define SIG_TRAP(info, ctxt) (SIG_REGS(ctxt).trap) +#define SIG_CTR(info, ctxt) (SIG_REGS(ctxt).ctr) 
+#define SIG_LINK(info, ctxt) (SIG_REGS(ctxt).link) +#define SIG_XER(info, ctxt) (SIG_REGS(ctxt).xer) +#define SIG_CCR(info, ctxt) (SIG_REGS(ctxt).ccr) + +#define SIG_CODE0(info, ctxt) ((uintptr)(info)->si_code) +#define SIG_FAULT(info, ctxt) (SIG_REGS(ctxt).dar) diff --git a/src/runtime/signal_power64x.c b/src/runtime/signal_power64x.c new file mode 100644 index 000000000..89c5c7848 --- /dev/null +++ b/src/runtime/signal_power64x.c @@ -0,0 +1,137 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build linux +// +build power64 power64le + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "signal_GOOS_GOARCH.h" +#include "signals_GOOS.h" + +void +runtime·dumpregs(Siginfo *info, void *ctxt) +{ + USED(info); USED(ctxt); + runtime·printf("r0 %X\t", SIG_R0(info, ctxt)); + runtime·printf("r1 %X\n", SIG_R1(info, ctxt)); + runtime·printf("r2 %X\t", SIG_R2(info, ctxt)); + runtime·printf("r3 %X\n", SIG_R3(info, ctxt)); + runtime·printf("r4 %X\t", SIG_R4(info, ctxt)); + runtime·printf("r5 %X\n", SIG_R5(info, ctxt)); + runtime·printf("r6 %X\t", SIG_R6(info, ctxt)); + runtime·printf("r7 %X\n", SIG_R7(info, ctxt)); + runtime·printf("r8 %X\t", SIG_R8(info, ctxt)); + runtime·printf("r9 %X\n", SIG_R9(info, ctxt)); + runtime·printf("r10 %X\t", SIG_R10(info, ctxt)); + runtime·printf("r11 %X\n", SIG_R11(info, ctxt)); + runtime·printf("r12 %X\t", SIG_R12(info, ctxt)); + runtime·printf("r13 %X\n", SIG_R13(info, ctxt)); + runtime·printf("r14 %X\t", SIG_R14(info, ctxt)); + runtime·printf("r15 %X\n", SIG_R15(info, ctxt)); + runtime·printf("r16 %X\t", SIG_R16(info, ctxt)); + runtime·printf("r17 %X\n", SIG_R17(info, ctxt)); + runtime·printf("r18 %X\t", SIG_R18(info, ctxt)); + runtime·printf("r19 %X\n", SIG_R19(info, ctxt)); + runtime·printf("r20 %X\t", SIG_R20(info, ctxt)); + runtime·printf("r21 %X\n", SIG_R21(info, ctxt)); + 
runtime·printf("r22 %X\t", SIG_R22(info, ctxt)); + runtime·printf("r23 %X\n", SIG_R23(info, ctxt)); + runtime·printf("r24 %X\t", SIG_R24(info, ctxt)); + runtime·printf("r25 %X\n", SIG_R25(info, ctxt)); + runtime·printf("r26 %X\t", SIG_R26(info, ctxt)); + runtime·printf("r27 %X\n", SIG_R27(info, ctxt)); + runtime·printf("r28 %X\t", SIG_R28(info, ctxt)); + runtime·printf("r29 %X\n", SIG_R29(info, ctxt)); + runtime·printf("r30 %X\t", SIG_R30(info, ctxt)); + runtime·printf("r31 %X\n", SIG_R31(info, ctxt)); + runtime·printf("pc %X\t", SIG_PC(info, ctxt)); + runtime·printf("ctr %X\n", SIG_CTR(info, ctxt)); + runtime·printf("link %X\t", SIG_LINK(info, ctxt)); + runtime·printf("xer %X\n", SIG_XER(info, ctxt)); + runtime·printf("ccr %X\t", SIG_CCR(info, ctxt)); + runtime·printf("trap %X\n", SIG_TRAP(info, ctxt)); +} + +void +runtime·sighandler(int32 sig, Siginfo *info, void *ctxt, G *gp) +{ + SigTab *t; + bool crash; + + if(sig == SIGPROF) { + runtime·sigprof((uint8*)SIG_PC(info, ctxt), (uint8*)SIG_SP(info, ctxt), (uint8*)SIG_LINK(info, ctxt), gp, g->m); + return; + } + t = &runtime·sigtab[sig]; + if(SIG_CODE0(info, ctxt) != SI_USER && (t->flags & SigPanic)) { + // Make it look like a call to the signal func. + // Have to pass arguments out of band since + // augmenting the stack frame would break + // the unwinding code. + gp->sig = sig; + gp->sigcode0 = SIG_CODE0(info, ctxt); + gp->sigcode1 = SIG_FAULT(info, ctxt); + gp->sigpc = SIG_PC(info, ctxt); + + // We arrange link, and pc to pretend the panicking + // function calls sigpanic directly. + // Always save LINK to stack so that panics in leaf + // functions are correctly handled. This smashes + // the stack frame but we're not going back there + // anyway. + SIG_SP(info, ctxt) -= sizeof(uintptr); + *(uintptr*)SIG_SP(info, ctxt) = SIG_LINK(info, ctxt); + // Don't bother saving PC if it's zero, which is + // probably a call to a nil func: the old link register + // is more useful in the stack trace. 
+ if(gp->sigpc != 0) + SIG_LINK(info, ctxt) = gp->sigpc; + // In case we are panicking from external C code + SIG_R0(info, ctxt) = 0; + SIG_R30(info, ctxt) = (uintptr)gp; + SIG_PC(info, ctxt) = (uintptr)runtime·sigpanic; + return; + } + + if(SIG_CODE0(info, ctxt) == SI_USER || (t->flags & SigNotify)) + if(runtime·sigsend(sig)) + return; + if(t->flags & SigKill) + runtime·exit(2); + if(!(t->flags & SigThrow)) + return; + + g->m->throwing = 1; + g->m->caughtsig = gp; + if(runtime·panicking) // traceback already printed + runtime·exit(2); + runtime·panicking = 1; + + if(sig < 0 || sig >= NSIG) + runtime·printf("Signal %d\n", sig); + else + runtime·printf("%s\n", runtime·sigtab[sig].name); + + runtime·printf("PC=%x\n", SIG_PC(info, ctxt)); + if(g->m->lockedg != nil && g->m->ncgo > 0 && gp == g->m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = g->m->lockedg; + } + runtime·printf("\n"); + + if(runtime·gotraceback(&crash)){ + runtime·goroutineheader(gp); + runtime·traceback(SIG_PC(info, ctxt), SIG_SP(info, ctxt), SIG_LINK(info, ctxt), gp); + runtime·tracebackothers(gp); + runtime·printf("\n"); + runtime·dumpregs(info, ctxt); + } + + if(crash) + runtime·crash(); + + runtime·exit(2); +} diff --git a/src/runtime/stack.c b/src/runtime/stack.c index 072bc242b..ffae73a2a 100644 --- a/src/runtime/stack.c +++ b/src/runtime/stack.c @@ -382,8 +382,6 @@ adjustpointers(byte **scanp, BitVector *bv, AdjustInfo *adjinfo, Func *f) uintptr delta; int32 num, i; byte *p, *minp, *maxp; - Type *t; - Itab *tab; minp = (byte*)adjinfo->old.lo; maxp = (byte*)adjinfo->old.hi; @@ -415,43 +413,7 @@ adjustpointers(byte **scanp, BitVector *bv, AdjustInfo *adjinfo, Func *f) } break; case BitsMultiWord: - switch(bv->bytedata[(i+1) / (8 / BitsPerPointer)] >> ((i+1) * BitsPerPointer & 7) & 3) { - default: - runtime·throw("unexpected garbage collection bits"); - case BitsEface: - t = (Type*)scanp[i]; - if(t != nil && ((t->kind & KindDirectIface) == 0 || (t->kind & KindNoPointers) 
== 0)) { - p = scanp[i+1]; - if(minp <= p && p < maxp) { - if(StackDebug >= 3) - runtime·printf("adjust eface %p\n", p); - if(t->size > PtrSize) // currently we always allocate such objects on the heap - runtime·throw("large interface value found on stack"); - scanp[i+1] = p + delta; - } - } - i++; - break; - case BitsIface: - tab = (Itab*)scanp[i]; - if(tab != nil) { - t = tab->type; - //runtime·printf(" type=%p\n", t); - if((t->kind & KindDirectIface) == 0 || (t->kind & KindNoPointers) == 0) { - p = scanp[i+1]; - if(minp <= p && p < maxp) { - if(StackDebug >= 3) - runtime·printf("adjust iface %p\n", p); - if(t->size > PtrSize) // currently we always allocate such objects on the heap - runtime·throw("large interface value found on stack"); - scanp[i+1] = p + delta; - } - } - } - i++; - break; - } - break; + runtime·throw("adjustpointers: unexpected garbage collection bits"); } } } @@ -587,13 +549,13 @@ adjustsudogs(G *gp, AdjustInfo *adjinfo) } // Copies gp's stack to a new stack of a different size. +// Caller must have changed gp status to Gcopystack. 
static void copystack(G *gp, uintptr newsize) { Stack old, new; uintptr used; AdjustInfo adjinfo; - uint32 oldstatus; bool (*cb)(Stkframe*, void*); byte *p, *ep; @@ -637,20 +599,11 @@ copystack(G *gp, uintptr newsize) } runtime·memmove((byte*)new.hi - used, (byte*)old.hi - used, used); - oldstatus = runtime·readgstatus(gp); - oldstatus &= ~Gscan; - if(oldstatus == Gwaiting || oldstatus == Grunnable) - runtime·casgstatus(gp, oldstatus, Gcopystack); // oldstatus is Gwaiting or Grunnable - else - runtime·throw("copystack: bad status, not Gwaiting or Grunnable"); - // Swap out old stack for new one gp->stack = new; gp->stackguard0 = new.lo + StackGuard; // NOTE: might clobber a preempt request gp->sched.sp = new.hi - used; - runtime·casgstatus(gp, Gcopystack, oldstatus); // oldstatus is Gwaiting or Grunnable - // free old stack if(StackPoisonCopy) { p = (byte*)old.lo; @@ -700,6 +653,7 @@ void runtime·newstack(void) { int32 oldsize, newsize; + uint32 oldstatus; uintptr sp; G *gp; Gobuf morebuf; @@ -752,6 +706,14 @@ runtime·newstack(void) runtime·printf("runtime: split stack overflow: %p < %p\n", sp, gp->stack.lo); runtime·throw("runtime: split stack overflow"); } + + if(gp->sched.ctxt != nil) { + // morestack wrote sched.ctxt on its way in here, + // without a write barrier. Run the write barrier now. + // It is not possible to be preempted between then + // and now, so it's okay. + runtime·writebarrierptr_nostore(&gp->sched.ctxt, gp->sched.ctxt); + } if(gp->stackguard0 == (uintptr)StackPreempt) { if(gp == g->m->g0) @@ -789,12 +751,15 @@ runtime·newstack(void) runtime·throw("stack overflow"); } - // Note that the concurrent GC might be scanning the stack as we try to replace it. - // copystack takes care of the appropriate coordination with the stack scanner. 
+ oldstatus = runtime·readgstatus(gp); + oldstatus &= ~Gscan; + runtime·casgstatus(gp, oldstatus, Gcopystack); // oldstatus is Gwaiting or Grunnable + // The concurrent GC will not scan the stack while we are doing the copy since + // the gp is in a Gcopystack status. copystack(gp, newsize); if(StackDebug >= 1) runtime·printf("stack grow done\n"); - runtime·casgstatus(gp, Gwaiting, Grunning); + runtime·casgstatus(gp, Gcopystack, Grunning); runtime·gogo(&gp->sched); } @@ -825,6 +790,7 @@ void runtime·shrinkstack(G *gp) { uintptr used, oldsize, newsize; + uint32 oldstatus; if(runtime·readgstatus(gp) == Gdead) { if(gp->stack.lo != 0) { @@ -858,8 +824,19 @@ runtime·shrinkstack(G *gp) #endif if(StackDebug > 0) runtime·printf("shrinking stack %D->%D\n", (uint64)oldsize, (uint64)newsize); + // This is being done in a Gscan state and was initiated by the GC so no need to move to + // the Gcopystate. + // The world is stopped, so the goroutine must be Gwaiting or Grunnable, + // and what it is is not changing underfoot. + + oldstatus = runtime·readgstatus(gp); + oldstatus &= ~Gscan; + if(oldstatus != Gwaiting && oldstatus != Grunnable) + runtime·throw("status is not Gwaiting or Grunnable"); + runtime·casgstatus(gp, oldstatus, Gcopystack); copystack(gp, newsize); -} + runtime·casgstatus(gp, Gcopystack, oldstatus); + } // Do any delayed stack freeing that was queued up during GC. 
void diff --git a/src/runtime/string.c b/src/runtime/string.c index ed5debc33..475ea2de6 100644 --- a/src/runtime/string.c +++ b/src/runtime/string.c @@ -48,7 +48,7 @@ runtime·gostringnocopy(byte *str) s.len = runtime·findnull(str); while(true) { ms = runtime·maxstring; - if(s.len <= ms || runtime·casp((void**)&runtime·maxstring, (void*)ms, (void*)s.len)) + if(s.len <= ms || runtime·casuintptr(&runtime·maxstring, ms, s.len)) return s; } } diff --git a/src/runtime/string.go b/src/runtime/string.go index 0809f89bc..882281605 100644 --- a/src/runtime/string.go +++ b/src/runtime/string.go @@ -39,22 +39,18 @@ func concatstrings(a []string) string { return s } -//go:nosplit func concatstring2(a [2]string) string { return concatstrings(a[:]) } -//go:nosplit func concatstring3(a [3]string) string { return concatstrings(a[:]) } -//go:nosplit func concatstring4(a [4]string) string { return concatstrings(a[:]) } -//go:nosplit func concatstring5(a [5]string) string { return concatstrings(a[:]) } diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go index fe8f9c922..9889567d6 100644 --- a/src/runtime/stubs.go +++ b/src/runtime/stubs.go @@ -106,6 +106,16 @@ func recovery_m(*g) func mcacheRefill_m() func largeAlloc_m() func gc_m() +func gcscan_m() +func gcmark_m() +func gccheckmark_m() +func gccheckmarkenable_m() +func gccheckmarkdisable_m() +func gcinstallmarkwb_m() +func gcinstalloffwb_m() +func gcmarknewobject_m() +func gcmarkwb_m() +func finishsweep_m() func scavenge_m() func setFinalizer_m() func removeFinalizer_m() @@ -204,9 +214,6 @@ func write(fd uintptr, p unsafe.Pointer, n int32) int32 func cas(ptr *uint32, old, new uint32) bool //go:noescape -func casp(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool - -//go:noescape func casuintptr(ptr *uintptr, old, new uintptr) bool //go:noescape diff --git a/src/runtime/sys_darwin_386.s b/src/runtime/sys_darwin_386.s index a961c71a8..3bf8b1d41 100644 --- a/src/runtime/sys_darwin_386.s +++ b/src/runtime/sys_darwin_386.s @@ 
-248,7 +248,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$40 MOVL BX, 0(SP) MOVL $runtime·badsignal(SB), AX CALL AX - JMP sigtramp_ret + JMP ret // save g MOVL DI, 20(SP) @@ -275,7 +275,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$40 MOVL 20(SP), DI MOVL DI, g(CX) -sigtramp_ret: +ret: // call sigreturn MOVL context+16(FP), CX MOVL style+4(FP), BX diff --git a/src/runtime/sys_darwin_amd64.s b/src/runtime/sys_darwin_amd64.s index bd397d72a..8a8928e06 100644 --- a/src/runtime/sys_darwin_amd64.s +++ b/src/runtime/sys_darwin_amd64.s @@ -211,7 +211,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$64 MOVL DX, 0(SP) MOVQ $runtime·badsignal(SB), AX CALL AX - JMP sigtramp_ret + JMP ret // save g MOVQ R10, 48(SP) @@ -233,7 +233,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$64 MOVQ 48(SP), R10 MOVQ R10, g(BX) -sigtramp_ret: +ret: // call sigreturn MOVL $(0x2000000+184), AX // sigreturn(ucontext, infostyle) MOVQ 32(SP), DI // saved ucontext diff --git a/src/runtime/sys_dragonfly_386.s b/src/runtime/sys_dragonfly_386.s index 161eaec19..71ece9ecb 100644 --- a/src/runtime/sys_dragonfly_386.s +++ b/src/runtime/sys_dragonfly_386.s @@ -217,7 +217,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$44 MOVL BX, 0(SP) MOVL $runtime·badsignal(SB), AX CALL AX - JMP sigtramp_ret + JMP ret // save g MOVL DI, 20(SP) @@ -243,7 +243,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$44 MOVL 20(SP), BX MOVL BX, g(CX) -sigtramp_ret: +ret: // call sigreturn MOVL context+8(FP), AX MOVL $0, 0(SP) // syscall gap diff --git a/src/runtime/sys_freebsd_386.s b/src/runtime/sys_freebsd_386.s index 2c40fc433..66d03c27d 100644 --- a/src/runtime/sys_freebsd_386.s +++ b/src/runtime/sys_freebsd_386.s @@ -197,7 +197,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$44 MOVL BX, 0(SP) MOVL $runtime·badsignal(SB), AX CALL AX - JMP sigtramp_ret + JMP ret // save g MOVL DI, 20(SP) @@ -223,7 +223,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$44 MOVL 20(SP), BX MOVL BX, g(CX) -sigtramp_ret: +ret: // call sigreturn MOVL context+8(FP), AX MOVL $0, 0(SP) // syscall gap diff --git 
a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s index 33b91e872..d8d86ffad 100644 --- a/src/runtime/sys_linux_amd64.s +++ b/src/runtime/sys_linux_amd64.s @@ -115,7 +115,7 @@ TEXT time·now(SB),NOSPLIT,$16 // That leaves 104 for the gettime code to use. Hope that's enough! MOVQ runtime·__vdso_clock_gettime_sym(SB), AX CMPQ AX, $0 - JEQ fallback_gtod + JEQ fallback MOVL $0, DI // CLOCK_REALTIME LEAQ 0(SP), SI CALL AX @@ -124,7 +124,7 @@ TEXT time·now(SB),NOSPLIT,$16 MOVQ AX, sec+0(FP) MOVL DX, nsec+8(FP) RET -fallback_gtod: +fallback: LEAQ 0(SP), DI MOVQ $0, SI MOVQ runtime·__vdso_gettimeofday_sym(SB), AX @@ -141,7 +141,7 @@ TEXT runtime·nanotime(SB),NOSPLIT,$16 // See comment above in time.now. MOVQ runtime·__vdso_clock_gettime_sym(SB), AX CMPQ AX, $0 - JEQ fallback_gtod_nt + JEQ fallback MOVL $1, DI // CLOCK_MONOTONIC LEAQ 0(SP), SI CALL AX @@ -153,7 +153,7 @@ TEXT runtime·nanotime(SB),NOSPLIT,$16 ADDQ DX, AX MOVQ AX, ret+0(FP) RET -fallback_gtod_nt: +fallback: LEAQ 0(SP), DI MOVQ $0, SI MOVQ runtime·__vdso_gettimeofday_sym(SB), AX diff --git a/src/runtime/sys_linux_arm.s b/src/runtime/sys_linux_arm.s index bd285f399..033a03642 100644 --- a/src/runtime/sys_linux_arm.s +++ b/src/runtime/sys_linux_arm.s @@ -373,20 +373,20 @@ TEXT cas<>(SB),NOSPLIT,$0 TEXT runtime·cas(SB),NOSPLIT,$0 MOVW ptr+0(FP), R2 MOVW old+4(FP), R0 -casagain: +loop: MOVW new+8(FP), R1 BL cas<>(SB) - BCC cascheck + BCC check MOVW $1, R0 MOVB R0, ret+12(FP) RET -cascheck: +check: // Kernel lies; double-check. MOVW ptr+0(FP), R2 MOVW old+4(FP), R0 MOVW 0(R2), R3 CMP R0, R3 - BEQ casagain + BEQ loop MOVW $0, R0 MOVB R0, ret+12(FP) RET diff --git a/src/runtime/sys_linux_power64x.s b/src/runtime/sys_linux_power64x.s new file mode 100644 index 000000000..fb24d3e79 --- /dev/null +++ b/src/runtime/sys_linux_power64x.s @@ -0,0 +1,383 @@ +// Copyright 2014 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build linux +// +build power64 power64le + +// +// System calls and other sys.stuff for Power64, Linux +// + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +#define SYS_exit 1 +#define SYS_read 3 +#define SYS_write 4 +#define SYS_open 5 +#define SYS_close 6 +#define SYS_fcntl 55 +#define SYS_gettimeofday 78 +#define SYS_select 82 // always return -ENOSYS +#define SYS_mmap 90 +#define SYS_munmap 91 +#define SYS_setitimer 104 +#define SYS_clone 120 +#define SYS_newselect 142 +#define SYS_sched_yield 158 +#define SYS_rt_sigreturn 172 +#define SYS_rt_sigaction 173 +#define SYS_rt_sigprocmask 174 +#define SYS_sigaltstack 185 +#define SYS_ugetrlimit 190 +#define SYS_madvise 205 +#define SYS_mincore 206 +#define SYS_gettid 207 +#define SYS_tkill 208 +#define SYS_futex 221 +#define SYS_sched_getaffinity 223 +#define SYS_exit_group 234 +#define SYS_epoll_create 236 +#define SYS_epoll_ctl 237 +#define SYS_epoll_wait 238 +#define SYS_clock_gettime 246 +#define SYS_epoll_create1 315 + +TEXT runtime·exit(SB),NOSPLIT,$-8-4 + MOVW code+0(FP), R3 + SYSCALL $SYS_exit_group + RETURN + +TEXT runtime·exit1(SB),NOSPLIT,$-8-4 + MOVW code+0(FP), R3 + SYSCALL $SYS_exit + RETURN + +TEXT runtime·open(SB),NOSPLIT,$-8-20 + MOVD name+0(FP), R3 + MOVW mode+8(FP), R4 + MOVW perm+12(FP), R5 + SYSCALL $SYS_open + MOVW R3, ret+16(FP) + RETURN + +TEXT runtime·close(SB),NOSPLIT,$-8-12 + MOVW fd+0(FP), R3 + SYSCALL $SYS_close + MOVW R3, ret+8(FP) + RETURN + +TEXT runtime·write(SB),NOSPLIT,$-8-28 + MOVD fd+0(FP), R3 + MOVD p+8(FP), R4 + MOVW n+16(FP), R5 + SYSCALL $SYS_write + MOVW R3, ret+24(FP) + RETURN + +TEXT runtime·read(SB),NOSPLIT,$-8-28 + MOVW fd+0(FP), R3 + MOVD p+8(FP), R4 + MOVW n+16(FP), R5 + SYSCALL $SYS_read + MOVW R3, ret+24(FP) + RETURN + +TEXT runtime·getrlimit(SB),NOSPLIT,$-8-20 + MOVW kind+0(FP), R3 + MOVD limit+8(FP), R4 + SYSCALL $SYS_ugetrlimit + MOVW R3, 
ret+16(FP) + RETURN + +TEXT runtime·usleep(SB),NOSPLIT,$16-4 + MOVW usec+0(FP), R3 + MOVD R3, R5 + MOVW $1000000, R4 + DIVD R4, R3 + MOVD R3, 8(R1) + MULLD R3, R4 + SUB R4, R5 + MOVD R5, 16(R1) + + // select(0, 0, 0, 0, &tv) + MOVW $0, R3 + MOVW $0, R4 + MOVW $0, R5 + MOVW $0, R6 + ADD $8, R1, R7 + SYSCALL $SYS_newselect + RETURN + +TEXT runtime·raise(SB),NOSPLIT,$-8 + SYSCALL $SYS_gettid + MOVW R3, R3 // arg 1 tid + MOVW sig+0(FP), R4 // arg 2 + SYSCALL $SYS_tkill + RETURN + +TEXT runtime·setitimer(SB),NOSPLIT,$-8-24 + MOVW mode+0(FP), R3 + MOVD new+8(FP), R4 + MOVD old+16(FP), R5 + SYSCALL $SYS_setitimer + RETURN + +TEXT runtime·mincore(SB),NOSPLIT,$-8-28 + MOVD addr+0(FP), R3 + MOVD n+8(FP), R4 + MOVD dst+16(FP), R5 + SYSCALL $SYS_mincore + MOVW R3, ret+24(FP) + RETURN + +// func now() (sec int64, nsec int32) +TEXT time·now(SB),NOSPLIT,$16 + MOVD $0(R1), R3 + MOVD $0, R4 + SYSCALL $SYS_gettimeofday + MOVD 0(R1), R3 // sec + MOVD 8(R1), R5 // usec + MOVD $1000, R4 + MULLD R4, R5 + MOVD R3, sec+0(FP) + MOVW R5, nsec+8(FP) + RETURN + +TEXT runtime·nanotime(SB),NOSPLIT,$16 + MOVW $1, R3 // CLOCK_MONOTONIC + MOVD $0(R1), R4 + SYSCALL $SYS_clock_gettime + MOVD 0(R1), R3 // sec + MOVD 8(R1), R5 // nsec + // sec is in R3, nsec in R5 + // return nsec in R3 + MOVD $1000000000, R4 + MULLD R4, R3 + ADD R5, R3 + MOVD R3, ret+0(FP) + RETURN + +TEXT runtime·rtsigprocmask(SB),NOSPLIT,$-8-28 + MOVW sig+0(FP), R3 + MOVD new+8(FP), R4 + MOVD old+16(FP), R5 + MOVW size+24(FP), R6 + SYSCALL $SYS_rt_sigprocmask + BVC 2(PC) + MOVD R0, 0xf1(R0) // crash + RETURN + +TEXT runtime·rt_sigaction(SB),NOSPLIT,$-8-36 + MOVD sig+0(FP), R3 + MOVD new+8(FP), R4 + MOVD old+16(FP), R5 + MOVD size+24(FP), R6 + SYSCALL $SYS_rt_sigaction + MOVW R3, ret+32(FP) + RETURN + +#ifdef GOARCH_power64le +// power64le doesn't need function descriptors +TEXT runtime·sigtramp(SB),NOSPLIT,$64 +#else +// function descriptor for the real sigtramp +TEXT runtime·sigtramp(SB),NOSPLIT,$-8 + DWORD $runtime·_sigtramp(SB) 
+ DWORD $0 + DWORD $0 +TEXT runtime·_sigtramp(SB),NOSPLIT,$64 +#endif + // initialize essential registers (just in case) + BL runtime·reginit(SB) + + // check that g exists + CMP g, $0 + BNE 6(PC) + MOVD R3, 8(R1) + MOVD $runtime·badsignal(SB), R31 + MOVD R31, CTR + BL (CTR) + RETURN + + // save g + MOVD g, 40(R1) + MOVD g, R6 + + // g = m->gsignal + MOVD g_m(g), R7 + MOVD m_gsignal(R7), g + + MOVW R3, 8(R1) + MOVD R4, 16(R1) + MOVD R5, 24(R1) + MOVD R6, 32(R1) + + BL runtime·sighandler(SB) + + // restore g + MOVD 40(R1), g + + RETURN + +TEXT runtime·mmap(SB),NOSPLIT,$-8 + MOVD addr+0(FP), R3 + MOVD n+8(FP), R4 + MOVW prot+16(FP), R5 + MOVW flags+20(FP), R6 + MOVW fd+24(FP), R7 + MOVW off+28(FP), R8 + + SYSCALL $SYS_mmap + MOVD R3, ret+32(FP) + RETURN + +TEXT runtime·munmap(SB),NOSPLIT,$-8 + MOVD addr+0(FP), R3 + MOVD n+8(FP), R4 + SYSCALL $SYS_munmap + BVC 2(PC) + MOVD R0, 0xf3(R0) + RETURN + +TEXT runtime·madvise(SB),NOSPLIT,$-8 + MOVD addr+0(FP), R3 + MOVD n+8(FP), R4 + MOVW flags+16(FP), R5 + SYSCALL $SYS_madvise + // ignore failure - maybe pages are locked + RETURN + +// int64 futex(int32 *uaddr, int32 op, int32 val, +// struct timespec *timeout, int32 *uaddr2, int32 val2); +TEXT runtime·futex(SB),NOSPLIT,$-8 + MOVD addr+0(FP), R3 + MOVW op+8(FP), R4 + MOVW val+12(FP), R5 + MOVD ts+16(FP), R6 + MOVD addr2+24(FP), R7 + MOVW val3+32(FP), R8 + SYSCALL $SYS_futex + MOVW R3, ret+40(FP) + RETURN + +// int64 clone(int32 flags, void *stk, M *mp, G *gp, void (*fn)(void)); +TEXT runtime·clone(SB),NOSPLIT,$-8 + MOVW flags+0(FP), R3 + MOVD stk+8(FP), R4 + + // Copy mp, gp, fn off parent stack for use by child. + // Careful: Linux system call clobbers ???. + MOVD mm+16(FP), R7 + MOVD gg+24(FP), R8 + MOVD fn+32(FP), R12 + + MOVD R7, -8(R4) + MOVD R8, -16(R4) + MOVD R12, -24(R4) + MOVD $1234, R7 + MOVD R7, -32(R4) + + SYSCALL $SYS_clone + + // In parent, return. + CMP R3, $0 + BEQ 3(PC) + MOVW R3, ret+40(FP) + RETURN + + // In child, on new stack. 
+ // initialize essential registers + BL runtime·reginit(SB) + MOVD -32(R1), R7 + CMP R7, $1234 + BEQ 2(PC) + MOVD R0, 0(R0) + + // Initialize m->procid to Linux tid + SYSCALL $SYS_gettid + + MOVD -24(R1), R12 + MOVD -16(R1), R8 + MOVD -8(R1), R7 + + MOVD R3, m_procid(R7) + + // TODO: setup TLS. + + // In child, set up new stack + MOVD R7, g_m(R8) + MOVD R8, g + //CALL runtime·stackcheck(SB) + + // Call fn + MOVD R12, CTR + BL (CTR) + + // It shouldn't return. If it does, exit + MOVW $111, R3 + SYSCALL $SYS_exit_group + BR -2(PC) // keep exiting + +TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 + MOVD new+0(FP), R3 + MOVD old+8(FP), R4 + SYSCALL $SYS_sigaltstack + BVC 2(PC) + MOVD R0, 0xf1(R0) // crash + RETURN + +TEXT runtime·osyield(SB),NOSPLIT,$-8 + SYSCALL $SYS_sched_yield + RETURN + +TEXT runtime·sched_getaffinity(SB),NOSPLIT,$-8 + MOVD pid+0(FP), R3 + MOVD len+8(FP), R4 + MOVD buf+16(FP), R5 + SYSCALL $SYS_sched_getaffinity + MOVW R3, ret+24(FP) + RETURN + +// int32 runtime·epollcreate(int32 size); +TEXT runtime·epollcreate(SB),NOSPLIT,$-8 + MOVW size+0(FP), R3 + SYSCALL $SYS_epoll_create + MOVW R3, ret+8(FP) + RETURN + +// int32 runtime·epollcreate1(int32 flags); +TEXT runtime·epollcreate1(SB),NOSPLIT,$-8 + MOVW flags+0(FP), R3 + SYSCALL $SYS_epoll_create1 + MOVW R3, ret+8(FP) + RETURN + +// func epollctl(epfd, op, fd int32, ev *epollEvent) int +TEXT runtime·epollctl(SB),NOSPLIT,$-8 + MOVW epfd+0(FP), R3 + MOVW op+4(FP), R4 + MOVW fd+8(FP), R5 + MOVD ev+16(FP), R6 + SYSCALL $SYS_epoll_ctl + MOVW R3, ret+24(FP) + RETURN + +// int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout); +TEXT runtime·epollwait(SB),NOSPLIT,$-8 + MOVW epfd+0(FP), R3 + MOVD ev+8(FP), R4 + MOVW nev+16(FP), R5 + MOVW timeout+20(FP), R6 + SYSCALL $SYS_epoll_wait + MOVW R3, ret+24(FP) + RETURN + +// void runtime·closeonexec(int32 fd); +TEXT runtime·closeonexec(SB),NOSPLIT,$-8 + MOVW fd+0(FP), R3 // fd + MOVD $2, R4 // F_SETFD + MOVD $1, R5 // FD_CLOEXEC + SYSCALL 
$SYS_fcntl + RETURN diff --git a/src/runtime/sys_nacl_386.s b/src/runtime/sys_nacl_386.s index 47985f31f..16cd721d9 100644 --- a/src/runtime/sys_nacl_386.s +++ b/src/runtime/sys_nacl_386.s @@ -293,7 +293,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$0 MOVL $0, 0(SP) MOVL $runtime·badsignal(SB), AX CALL AX - JMP sigtramp_ret + JMP ret // save g MOVL DI, 20(SP) @@ -317,7 +317,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$0 MOVL 20(SP), BX MOVL BX, g(CX) -sigtramp_ret: +ret: // Enable exceptions again. NACL_SYSCALL(SYS_exception_clear_flag) diff --git a/src/runtime/sys_nacl_amd64p32.s b/src/runtime/sys_nacl_amd64p32.s index 4eb4aacdd..9cfbef6ef 100644 --- a/src/runtime/sys_nacl_amd64p32.s +++ b/src/runtime/sys_nacl_amd64p32.s @@ -338,7 +338,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$80 MOVL 20(SP), BX MOVL BX, g(CX) -sigtramp_ret: // Enable exceptions again. NACL_SYSCALL(SYS_exception_clear_flag) diff --git a/src/runtime/sys_nacl_arm.s b/src/runtime/sys_nacl_arm.s index d354ab483..432deadf4 100644 --- a/src/runtime/sys_nacl_arm.s +++ b/src/runtime/sys_nacl_arm.s @@ -269,7 +269,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$80 // restore g MOVW 20(R13), g -sigtramp_ret: // Enable exceptions again. NACL_SYSCALL(SYS_exception_clear_flag) diff --git a/src/runtime/sys_openbsd_386.s b/src/runtime/sys_openbsd_386.s index 5cda7768a..b1ae5ecee 100644 --- a/src/runtime/sys_openbsd_386.s +++ b/src/runtime/sys_openbsd_386.s @@ -186,7 +186,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$44 MOVL BX, 0(SP) MOVL $runtime·badsignal(SB), AX CALL AX - JMP sigtramp_ret + JMP ret // save g MOVL DI, 20(SP) @@ -212,7 +212,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$44 MOVL 20(SP), BX MOVL BX, g(CX) -sigtramp_ret: +ret: // call sigreturn MOVL context+8(FP), AX MOVL $0, 0(SP) // syscall gap diff --git a/src/runtime/sys_power64x.c b/src/runtime/sys_power64x.c new file mode 100644 index 000000000..79d976255 --- /dev/null +++ b/src/runtime/sys_power64x.c @@ -0,0 +1,38 @@ +// Copyright 2014 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build power64 power64le + +#include "runtime.h" + +// adjust Gobuf as if it executed a call to fn with context ctxt +// and then did an immediate Gosave. +void +runtime·gostartcall(Gobuf *gobuf, void (*fn)(void), void *ctxt) +{ + if(gobuf->lr != 0) + runtime·throw("invalid use of gostartcall"); + gobuf->lr = gobuf->pc; + gobuf->pc = (uintptr)fn; + gobuf->ctxt = ctxt; +} + +// Called to rewind context saved during morestack back to beginning of function. +// To help us, the linker emits a jmp back to the beginning right after the +// call to morestack. We just have to decode and apply that jump. +void +runtime·rewindmorestack(Gobuf *gobuf) +{ + uint32 inst; + + inst = *(uint32*)gobuf->pc; + if((gobuf->pc&3) == 0 && (inst>>24) == 0x4b && (inst&3) == 0) { + //runtime·printf("runtime: rewind pc=%p to pc=%p\n", gobuf->pc, gobuf->pc + ((int32)(inst<<8)>>8)); + gobuf->pc += (int32)(inst<<8)>>8; + return; + } + runtime·printf("runtime: pc=%p %x\n", gobuf->pc, inst); + runtime·throw("runtime: misuse of rewindmorestack"); +} + diff --git a/src/runtime/sys_solaris_amd64.s b/src/runtime/sys_solaris_amd64.s index 0ebdab6ee..3981893b0 100644 --- a/src/runtime/sys_solaris_amd64.s +++ b/src/runtime/sys_solaris_amd64.s @@ -287,24 +287,24 @@ TEXT runtime·usleep1(SB),NOSPLIT,$0 // Execute call on m->g0. get_tls(R15) CMPQ R15, $0 - JE usleep1_noswitch + JE noswitch MOVQ g(R15), R13 CMPQ R13, $0 - JE usleep1_noswitch + JE noswitch MOVQ g_m(R13), R13 CMPQ R13, $0 - JE usleep1_noswitch + JE noswitch // TODO(aram): do something about the cpu profiler here. MOVQ m_g0(R13), R14 CMPQ g(R15), R14 - JNE usleep1_switch + JNE switch // executing on m->g0 already CALL AX RET -usleep1_switch: +switch: // Switch to m->g0 stack and back. 
MOVQ (g_sched+gobuf_sp)(R14), R14 MOVQ SP, -8(R14) @@ -313,7 +313,7 @@ usleep1_switch: MOVQ 0(SP), SP RET -usleep1_noswitch: +noswitch: // Not a Go-managed thread. Do not switch stack. CALL AX RET diff --git a/src/runtime/sys_windows_386.s b/src/runtime/sys_windows_386.s index 932fe9dd2..13fb5bdc9 100644 --- a/src/runtime/sys_windows_386.s +++ b/src/runtime/sys_windows_386.s @@ -106,7 +106,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$0-0 MOVL g_m(DX), BX MOVL m_g0(BX), BX CMPL DX, BX - JEQ sigtramp_g0 + JEQ g0 // switch to the g0 stack get_tls(BP) @@ -123,7 +123,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$0-0 MOVL SP, 36(DI) MOVL DI, SP -sigtramp_g0: +g0: MOVL 0(CX), BX // ExceptionRecord* MOVL 4(CX), CX // Context* MOVL BX, 0(SP) @@ -383,12 +383,12 @@ TEXT runtime·usleep1(SB),NOSPLIT,$0 MOVL m_g0(BP), SI CMPL g(CX), SI - JNE usleep1_switch + JNE switch // executing on m->g0 already CALL AX - JMP usleep1_ret + JMP ret -usleep1_switch: +switch: // Switch to m->g0 stack and back. MOVL (g_sched+gobuf_sp)(SI), SI MOVL SP, -4(SI) @@ -396,7 +396,7 @@ usleep1_switch: CALL AX MOVL 0(SP), SP -usleep1_ret: +ret: get_tls(CX) MOVL g(CX), BP MOVL g_m(BP), BP diff --git a/src/runtime/sys_windows_amd64.s b/src/runtime/sys_windows_amd64.s index e6190ce68..8b95f6d6c 100644 --- a/src/runtime/sys_windows_amd64.s +++ b/src/runtime/sys_windows_amd64.s @@ -138,7 +138,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$0-0 MOVQ g_m(DX), BX MOVQ m_g0(BX), BX CMPQ DX, BX - JEQ sigtramp_g0 + JEQ g0 // switch to g0 stack get_tls(BP) @@ -157,7 +157,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$0-0 MOVQ SP, 104(DI) MOVQ DI, SP -sigtramp_g0: +g0: MOVQ 0(CX), BX // ExceptionRecord* MOVQ 8(CX), CX // Context* MOVQ BX, 0(SP) @@ -407,12 +407,12 @@ TEXT runtime·usleep1(SB),NOSPLIT,$0 MOVQ m_g0(R13), R14 CMPQ g(R15), R14 - JNE usleep1_switch + JNE switch // executing on m->g0 already CALL AX - JMP usleep1_ret + JMP ret -usleep1_switch: +switch: // Switch to m->g0 stack and back. 
MOVQ (g_sched+gobuf_sp)(R14), R14 MOVQ SP, -8(R14) @@ -420,7 +420,7 @@ usleep1_switch: CALL AX MOVQ 0(SP), SP -usleep1_ret: +ret: MOVQ $0, m_libcallsp(R13) RET diff --git a/src/runtime/sys_x86.c b/src/runtime/sys_x86.c index a450b3e58..edbe47ff4 100644 --- a/src/runtime/sys_x86.c +++ b/src/runtime/sys_x86.c @@ -20,6 +20,7 @@ runtime·gostartcall(Gobuf *gobuf, void (*fn)(void), void *ctxt) gobuf->sp = (uintptr)sp; gobuf->pc = (uintptr)fn; gobuf->ctxt = ctxt; + runtime·writebarrierptr_nostore(&gobuf->ctxt, ctxt); } // Called to rewind context saved during morestack back to beginning of function. diff --git a/src/runtime/thunk.s b/src/runtime/thunk.s index 0a0f147c4..1a5b65502 100644 --- a/src/runtime/thunk.s +++ b/src/runtime/thunk.s @@ -10,6 +10,12 @@ #ifdef GOARCH_arm #define JMP B #endif +#ifdef GOARCH_power64 +#define JMP BR +#endif +#ifdef GOARCH_power64le +#define JMP BR +#endif TEXT net·runtimeNano(SB),NOSPLIT,$0-0 JMP runtime·nanotime(SB) diff --git a/src/runtime/wbfat.go b/src/runtime/wbfat.go new file mode 100644 index 000000000..75c58b26b --- /dev/null +++ b/src/runtime/wbfat.go @@ -0,0 +1,190 @@ +// generated by wbfat_gen.go; use go generate + +package runtime + +//go:nosplit +func writebarrierfat01(dst *[2]uintptr, _ *byte, src [2]uintptr) { + dst[0] = src[0] + writebarrierptr(&dst[1], src[1]) +} + +//go:nosplit +func writebarrierfat10(dst *[2]uintptr, _ *byte, src [2]uintptr) { + writebarrierptr(&dst[0], src[0]) + dst[1] = src[1] +} + +//go:nosplit +func writebarrierfat11(dst *[2]uintptr, _ *byte, src [2]uintptr) { + writebarrierptr(&dst[0], src[0]) + writebarrierptr(&dst[1], src[1]) +} + +//go:nosplit +func writebarrierfat001(dst *[3]uintptr, _ *byte, src [3]uintptr) { + dst[0] = src[0] + dst[1] = src[1] + writebarrierptr(&dst[2], src[2]) +} + +//go:nosplit +func writebarrierfat010(dst *[3]uintptr, _ *byte, src [3]uintptr) { + dst[0] = src[0] + writebarrierptr(&dst[1], src[1]) + dst[2] = src[2] +} + +//go:nosplit +func writebarrierfat011(dst 
// The writebarrierfatNNN functions below copy every word of src into *dst,
// in ascending index order. The binary suffix of each name has one digit per
// array word, leftmost digit for index 0: a 1 means that word is stored by
// calling writebarrierptr, a 0 means it is stored with a plain assignment.
// The unnamed *byte parameter is not referenced by any of the bodies.
// NOTE(review): these match the shape emitted by wbfat_gen.go, so changes
// presumably belong in the generator rather than here; confirm.

// writebarrierfat100 copies 3 words; word 0 goes through writebarrierptr,
// words 1 and 2 are plain stores.
//
//go:nosplit
func writebarrierfat100(dst *[3]uintptr, _ *byte, src [3]uintptr) {
	writebarrierptr(&dst[0], src[0])
	dst[1] = src[1]
	dst[2] = src[2]
}

// writebarrierfat101 copies 3 words; words 0 and 2 go through
// writebarrierptr, word 1 is a plain store.
//
//go:nosplit
func writebarrierfat101(dst *[3]uintptr, _ *byte, src [3]uintptr) {
	writebarrierptr(&dst[0], src[0])
	dst[1] = src[1]
	writebarrierptr(&dst[2], src[2])
}

// writebarrierfat110 copies 3 words; words 0 and 1 go through
// writebarrierptr, word 2 is a plain store.
//
//go:nosplit
func writebarrierfat110(dst *[3]uintptr, _ *byte, src [3]uintptr) {
	writebarrierptr(&dst[0], src[0])
	writebarrierptr(&dst[1], src[1])
	dst[2] = src[2]
}

// writebarrierfat111 copies 3 words; all three go through writebarrierptr.
//
//go:nosplit
func writebarrierfat111(dst *[3]uintptr, _ *byte, src [3]uintptr) {
	writebarrierptr(&dst[0], src[0])
	writebarrierptr(&dst[1], src[1])
	writebarrierptr(&dst[2], src[2])
}

// writebarrierfat0001 copies 4 words; only word 3 goes through
// writebarrierptr.
//
//go:nosplit
func writebarrierfat0001(dst *[4]uintptr, _ *byte, src [4]uintptr) {
	dst[0] = src[0]
	dst[1] = src[1]
	dst[2] = src[2]
	writebarrierptr(&dst[3], src[3])
}

// writebarrierfat0010 copies 4 words; only word 2 goes through
// writebarrierptr.
//
//go:nosplit
func writebarrierfat0010(dst *[4]uintptr, _ *byte, src [4]uintptr) {
	dst[0] = src[0]
	dst[1] = src[1]
	writebarrierptr(&dst[2], src[2])
	dst[3] = src[3]
}

// writebarrierfat0011 copies 4 words; words 2 and 3 go through
// writebarrierptr.
//
//go:nosplit
func writebarrierfat0011(dst *[4]uintptr, _ *byte, src [4]uintptr) {
	dst[0] = src[0]
	dst[1] = src[1]
	writebarrierptr(&dst[2], src[2])
	writebarrierptr(&dst[3], src[3])
}

// writebarrierfat0100 copies 4 words; only word 1 goes through
// writebarrierptr.
//
//go:nosplit
func writebarrierfat0100(dst *[4]uintptr, _ *byte, src [4]uintptr) {
	dst[0] = src[0]
	writebarrierptr(&dst[1], src[1])
	dst[2] = src[2]
	dst[3] = src[3]
}

// writebarrierfat0101 copies 4 words; words 1 and 3 go through
// writebarrierptr.
//
//go:nosplit
func writebarrierfat0101(dst *[4]uintptr, _ *byte, src [4]uintptr) {
	dst[0] = src[0]
	writebarrierptr(&dst[1], src[1])
	dst[2] = src[2]
	writebarrierptr(&dst[3], src[3])
}

// writebarrierfat0110 copies 4 words; words 1 and 2 go through
// writebarrierptr.
//
//go:nosplit
func writebarrierfat0110(dst *[4]uintptr, _ *byte, src [4]uintptr) {
	dst[0] = src[0]
	writebarrierptr(&dst[1], src[1])
	writebarrierptr(&dst[2], src[2])
	dst[3] = src[3]
}

// writebarrierfat0111 copies 4 words; words 1, 2 and 3 go through
// writebarrierptr, word 0 is a plain store.
//
//go:nosplit
func writebarrierfat0111(dst *[4]uintptr, _ *byte, src [4]uintptr) {
	dst[0] = src[0]
	writebarrierptr(&dst[1], src[1])
	writebarrierptr(&dst[2], src[2])
	writebarrierptr(&dst[3], src[3])
}

// writebarrierfat1000 copies 4 words; only word 0 goes through
// writebarrierptr.
//
//go:nosplit
func writebarrierfat1000(dst *[4]uintptr, _ *byte, src [4]uintptr) {
	writebarrierptr(&dst[0], src[0])
	dst[1] = src[1]
	dst[2] = src[2]
	dst[3] = src[3]
}

// writebarrierfat1001 copies 4 words; words 0 and 3 go through
// writebarrierptr.
//
//go:nosplit
func writebarrierfat1001(dst *[4]uintptr, _ *byte, src [4]uintptr) {
	writebarrierptr(&dst[0], src[0])
	dst[1] = src[1]
	dst[2] = src[2]
	writebarrierptr(&dst[3], src[3])
}

// writebarrierfat1010 copies 4 words; words 0 and 2 go through
// writebarrierptr.
//
//go:nosplit
func writebarrierfat1010(dst *[4]uintptr, _ *byte, src [4]uintptr) {
	writebarrierptr(&dst[0], src[0])
	dst[1] = src[1]
	writebarrierptr(&dst[2], src[2])
	dst[3] = src[3]
}

// writebarrierfat1011 copies 4 words; words 0, 2 and 3 go through
// writebarrierptr, word 1 is a plain store.
//
//go:nosplit
func writebarrierfat1011(dst *[4]uintptr, _ *byte, src [4]uintptr) {
	writebarrierptr(&dst[0], src[0])
	dst[1] = src[1]
	writebarrierptr(&dst[2], src[2])
	writebarrierptr(&dst[3], src[3])
}

// writebarrierfat1100 copies 4 words; words 0 and 1 go through
// writebarrierptr.
//
//go:nosplit
func writebarrierfat1100(dst *[4]uintptr, _ *byte, src [4]uintptr) {
	writebarrierptr(&dst[0], src[0])
	writebarrierptr(&dst[1], src[1])
	dst[2] = src[2]
	dst[3] = src[3]
}

// writebarrierfat1101 copies 4 words; words 0, 1 and 3 go through
// writebarrierptr, word 2 is a plain store.
//
//go:nosplit
func writebarrierfat1101(dst *[4]uintptr, _ *byte, src [4]uintptr) {
	writebarrierptr(&dst[0], src[0])
	writebarrierptr(&dst[1], src[1])
	dst[2] = src[2]
	writebarrierptr(&dst[3], src[3])
}

// writebarrierfat1110 copies 4 words; words 0, 1 and 2 go through
// writebarrierptr, word 3 is a plain store.
//
//go:nosplit
func writebarrierfat1110(dst *[4]uintptr, _ *byte, src [4]uintptr) {
	writebarrierptr(&dst[0], src[0])
	writebarrierptr(&dst[1], src[1])
	writebarrierptr(&dst[2], src[2])
	dst[3] = src[3]
}

// writebarrierfat1111 copies 4 words; all four go through writebarrierptr.
//
//go:nosplit
func writebarrierfat1111(dst *[4]uintptr, _ *byte, src [4]uintptr) {
	writebarrierptr(&dst[0], src[0])
	writebarrierptr(&dst[1], src[1])
	writebarrierptr(&dst[2], src[2])
	writebarrierptr(&dst[3], src[3])
}
// main writes the generated write-barrier helpers to the file named by the
// first command-line argument, or to standard output when no argument is
// given.
//
// The text is rendered in memory first and written with a single call, so a
// failed write or close is reported through log.Fatal instead of silently
// leaving a truncated file behind.
func main() {
	flag.Parse()

	out := os.Stdout
	if flag.NArg() > 0 {
		f, err := os.Create(flag.Arg(0))
		if err != nil {
			log.Fatal(err)
		}
		out = f
	}

	if _, err := out.Write(generate()); err != nil {
		log.Fatal(err)
	}
	// Only close a file we created ourselves; Close is where a deferred
	// write error on the output file would surface.
	if out != os.Stdout {
		if err := out.Close(); err != nil {
			log.Fatal(err)
		}
	}
}

// generate returns the full source text of the generated file: a header
// comment, the "package runtime" clause, and one writebarrierfat function for
// every non-zero bit pattern over arrays of 2, 3 and 4 words.
func generate() []byte {
	var buf []byte
	emit := func(format string, args ...interface{}) {
		buf = append(buf, fmt.Sprintf(format, args...)...)
	}

	emit("// generated by wbfat_gen.go; use go generate\n\n")
	emit("package runtime\n")
	for i := uint(2); i <= 4; i++ {
		// j enumerates the bit patterns; pattern 0 (no barrier at all)
		// is deliberately not generated.
		for j := 1; j < 1<<i; j++ {
			emit("\n//go:nosplit\n")
			// %0*b prints j in binary, zero-padded to i digits.
			emit("func writebarrierfat%0*b(dst *[%d]uintptr, _ *byte, src [%d]uintptr) {\n", int(i), j, i, i)
			for k := uint(0); k < i; k++ {
				// The leftmost digit of the name corresponds to word 0.
				if j&(1<<(i-1-k)) != 0 {
					emit("\twritebarrierptr(&dst[%d], src[%d])\n", k, k)
				} else {
					emit("\tdst[%d] = src[%d]\n", k, k)
				}
			}
			emit("}\n")
		}
	}
	return buf
}