diff options
author | Keith Randall <khr@golang.org> | 2014-04-01 12:51:02 -0700 |
---|---|---|
committer | Keith Randall <khr@golang.org> | 2014-04-01 12:51:02 -0700 |
commit | 8595a9617da50fc99ca953101e332952378a3b6e (patch) | |
tree | 203c4f89df716609c8743e8bfb93249bdd353282 /src/liblink/asm6.c | |
parent | cf0e6253c7847a5d5bda488fb5dbadd58f17c86b (diff) | |
download | go-8595a9617da50fc99ca953101e332952378a3b6e.tar.gz |
runtime: get rid of most uses of REP for copying/zeroing.
REP MOVSQ and REP STOSQ have a really high startup overhead.
Use a Duff's device to do the repetition instead.
benchmark                 old ns/op     new ns/op     delta
BenchmarkClearFat32       7.20          1.60          -77.78%
BenchmarkCopyFat32        6.88          2.38          -65.41%
BenchmarkClearFat64       7.15          3.20          -55.24%
BenchmarkCopyFat64        6.88          3.44          -50.00%
BenchmarkClearFat128      9.53          5.34          -43.97%
BenchmarkCopyFat128       9.27          5.56          -40.02%
BenchmarkClearFat256      13.8          9.53          -30.94%
BenchmarkCopyFat256       13.5          10.3          -23.70%
BenchmarkClearFat512      22.3          18.0          -19.28%
BenchmarkCopyFat512       22.0          19.7          -10.45%
BenchmarkCopyFat1024      36.5          38.4          +5.21%
BenchmarkClearFat1024     35.1          35.0          -0.28%
TODO: use for stack frame zeroing
TODO: REP prefixes are still used for "reverse" copying when src/dst
regions overlap. Might be worth fixing.
LGTM=rsc
R=golang-codereviews, rsc
CC=golang-codereviews, r
https://codereview.appspot.com/81370046
Diffstat (limited to 'src/liblink/asm6.c')
-rw-r--r-- | src/liblink/asm6.c | 9 |
1 files changed, 9 insertions, 0 deletions
diff --git a/src/liblink/asm6.c b/src/liblink/asm6.c
index b2690bf0e..040366521 100644
--- a/src/liblink/asm6.c
+++ b/src/liblink/asm6.c
@@ -507,6 +507,11 @@ static uchar ycall[] =
 	Ynone, Ybr, Zcall, 1,
 	0
 };
+static uchar yduff[] =
+{
+	Ynone, Yi32, Zcall, 1,
+	0
+};
 static uchar yjmp[] =
 {
 	Ynone, Yml, Zo_m64, 2,
@@ -1519,6 +1524,9 @@ Optab optab[] =
 	{ APCDATA, ypcdata, Px, 0,0 },
 	{ ACHECKNIL },
 	{ AVARDEF },
+	{ AVARKILL },
+	{ ADUFFCOPY, yduff, Px, 0xe8 },
+	{ ADUFFZERO, yduff, Px, 0xe8 },
 
 	{ AEND },
 	0
@@ -3030,6 +3038,7 @@ found:
 	r = addrel(ctxt->cursym);
 	r->off = p->pc + ctxt->andptr - ctxt->and;
 	r->sym = p->to.sym;
+	r->add = p->to.offset;
 	r->type = D_PCREL;
 	r->siz = 4;
 	put4(ctxt, 0);