author     Russ Cox <rsc@golang.org>  2014-10-15 19:33:15 -0400
committer  Russ Cox <rsc@golang.org>  2014-10-15 19:33:15 -0400
commit     fb2b1f6303e348ba79527474231e03968674dc77
tree       7ab5c2618d83d5cb012722214414a083f3513aaf /src
parent     0f01edffa62961e2f23eb08f6dfff545437c67b9
download   go-fb2b1f6303e348ba79527474231e03968674dc77.tar.gz
cmd/gc: simplify compiled code for explicit zeroing
Among other things, *x = T{} does not need a write barrier.
The changes here avoid an unnecessary copy even when no pointers
are involved, so it may have larger effects.

In 6g and 8g, avoid manually repeated STOSQ in favor of writing
explicit MOVs, under the theory that the MOVs should have fewer
dependencies and pipeline better.

Benchmarks compare best of 5 on a 2012 MacBook Pro Core i5 with
TurboBoost disabled. Most improvements can be explained by the
changes in this CL. The effect in Revcomp is real but harder to
explain: none of the instructions in the inner loop changed.
I suspect loop alignment but really have no idea.

benchmark                              old           new      delta
BenchmarkBinaryTree17           3809027371    3819907076     +0.29%
BenchmarkFannkuch11             3607547556    3686983012     +2.20%
BenchmarkFmtFprintfEmpty               118           103    -12.71%
BenchmarkFmtFprintfString              289           277     -4.15%
BenchmarkFmtFprintfInt                 304           290     -4.61%
BenchmarkFmtFprintfIntInt              507           458     -9.66%
BenchmarkFmtFprintfPrefixedInt         425           408     -4.00%
BenchmarkFmtFprintfFloat               555           555     +0.00%
BenchmarkFmtManyArgs                  1835          1733     -5.56%
BenchmarkGobDecode                14738209      14639331     -0.67%
BenchmarkGobEncode                14239039      13703571     -3.76%
BenchmarkGzip                    538211054     538701315     +0.09%
BenchmarkGunzip                  135430877     134818459     -0.45%
BenchmarkHTTPClientServer           116488        116618     +0.11%
BenchmarkJSONEncode               28923406      29294334     +1.28%
BenchmarkJSONDecode              105779820     104289543     -1.41%
BenchmarkMandelbrot200             5791758       5771964     -0.34%
BenchmarkGoParse                   5376642       5310943     -1.22%
BenchmarkRegexpMatchEasy0_32           195           190     -2.56%
BenchmarkRegexpMatchEasy0_1K           477           455     -4.61%
BenchmarkRegexpMatchEasy1_32           170           165     -2.94%
BenchmarkRegexpMatchEasy1_1K          1410          1394     -1.13%
BenchmarkRegexpMatchMedium_32          336           329     -2.08%
BenchmarkRegexpMatchMedium_1K       108979        106328     -2.43%
BenchmarkRegexpMatchHard_32           5854          5821     -0.56%
BenchmarkRegexpMatchHard_1K         185089        182838     -1.22%
BenchmarkRevcomp                 834920364     780202624     -6.55%
BenchmarkTemplate                137046937     129728756     -5.34%
BenchmarkTimeParse                     600           594     -1.00%
BenchmarkTimeFormat                    559           539     -3.58%

LGTM=r
R=r
CC=golang-codereviews, iant, khr, rlh
https://codereview.appspot.com/157910047
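[Editorial note: a minimal Go sketch, not part of the commit, of the pattern the
message describes; the type T and the function name are invented for illustration.]

	package zeroing

	type T struct {
		a, b, c int64
	}

	// reset stores an explicit zero value through a pointer. Per this CL,
	// the compiler clears *x in place: no temporary copy, and (since the
	// stored value is zero) no write barrier.
	func reset(x *T) {
		*x = T{}
	}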
Diffstat (limited to 'src')
-rw-r--r--  src/cmd/6g/ggen.c      | 40
-rw-r--r--  src/cmd/8g/ggen.c      | 28
-rw-r--r--  src/cmd/gc/gen.c       |  2
-rw-r--r--  src/cmd/gc/go.h        |  1
-rw-r--r--  src/cmd/gc/mparith2.c  |  4
-rw-r--r--  src/cmd/gc/sinit.c     |  4
-rw-r--r--  src/cmd/gc/walk.c      | 11
7 files changed, 74 insertions, 16 deletions
diff --git a/src/cmd/6g/ggen.c b/src/cmd/6g/ggen.c
index 987473cca..363620769 100644
--- a/src/cmd/6g/ggen.c
+++ b/src/cmd/6g/ggen.c
@@ -1102,26 +1102,54 @@ clearfat(Node *nl)
 	c = w % 8;	// bytes
 	q = w / 8;	// quads
 
+	if(q < 4) {
+		// Write sequence of MOV 0, off(base) instead of using STOSQ.
+		// The hope is that although the code will be slightly longer,
+		// the MOVs will have no dependencies and pipeline better
+		// than the unrolled STOSQ loop.
+		// NOTE: Must use agen, not igen, so that optimizer sees address
+		// being taken. We are not writing on field boundaries.
+		agenr(nl, &n1, N);
+		n1.op = OINDREG;
+		nodconst(&z, types[TUINT64], 0);
+		while(q-- > 0) {
+			n1.type = z.type;
+			gins(AMOVQ, &z, &n1);
+			n1.xoffset += 8;
+		}
+		if(c >= 4) {
+			nodconst(&z, types[TUINT32], 0);
+			n1.type = z.type;
+			gins(AMOVL, &z, &n1);
+			n1.xoffset += 4;
+			c -= 4;
+		}
+		nodconst(&z, types[TUINT8], 0);
+		while(c-- > 0) {
+			n1.type = z.type;
+			gins(AMOVB, &z, &n1);
+			n1.xoffset++;
+		}
+		regfree(&n1);
+		return;
+	}
+
 	savex(D_DI, &n1, &oldn1, N, types[tptr]);
 	agen(nl, &n1);
 	savex(D_AX, &ax, &oldax, N, types[tptr]);
 	gconreg(AMOVL, 0, D_AX);
 
-	if(q > 128 || (q >= 4 && nacl)) {
+	if(q > 128 || nacl) {
 		gconreg(movptr, q, D_CX);
 		gins(AREP, N, N);	// repeat
 		gins(ASTOSQ, N, N);	// STOQ AL,*(DI)+
-	} else if(q >= 4) {
+	} else {
 		p = gins(ADUFFZERO, N, N);
 		p->to.type = D_ADDR;
 		p->to.sym = linksym(pkglookup("duffzero", runtimepkg));
 		// 2 and 128 = magic constants: see ../../runtime/asm_amd64.s
 		p->to.offset = 2*(128-q);
-	} else
-	while(q > 0) {
-		gins(ASTOSQ, N, N);	// STOQ AL,*(DI)+
-		q--;
 	}
 
 	z = ax;
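[Editorial note: a hedged sketch, not from the commit, of what the new q < 4 path
in 6g emits on amd64 for a 20-byte value: w = 20 gives q = 2 quads and c = 4
trailing bytes, so c >= 4 triggers the single MOVL. The variable name, register
choice, and offsets are illustrative.]

	package zeroing

	// buf is 20 bytes: two 8-byte quads plus a 4-byte tail.
	var buf [20]byte

	func clearBuf() {
		// Expected shape of the generated amd64 code (illustrative):
		//	MOVQ	$0, 0(AX)
		//	MOVQ	$0, 8(AX)
		//	MOVL	$0, 16(AX)
		buf = [20]byte{}
	}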
diff --git a/src/cmd/8g/ggen.c b/src/cmd/8g/ggen.c
index 7c986cc64..6333a60bb 100644
--- a/src/cmd/8g/ggen.c
+++ b/src/cmd/8g/ggen.c
@@ -157,7 +157,7 @@ void
 clearfat(Node *nl)
 {
 	uint32 w, c, q;
-	Node n1;
+	Node n1, z;
 	Prog *p;
 
 	/* clear a fat object */
@@ -172,6 +172,32 @@ clearfat(Node *nl)
 	c = w % 4;	// bytes
 	q = w / 4;	// quads
 
+	if(q < 4) {
+		// Write sequence of MOV 0, off(base) instead of using STOSL.
+		// The hope is that although the code will be slightly longer,
+		// the MOVs will have no dependencies and pipeline better
+		// than the unrolled STOSL loop.
+		// NOTE: Must use agen, not igen, so that optimizer sees address
+		// being taken. We are not writing on field boundaries.
+		regalloc(&n1, types[tptr], N);
+		agen(nl, &n1);
+		n1.op = OINDREG;
+		nodconst(&z, types[TUINT64], 0);
+		while(q-- > 0) {
+			n1.type = z.type;
+			gins(AMOVL, &z, &n1);
+			n1.xoffset += 4;
+		}
+		nodconst(&z, types[TUINT8], 0);
+		while(c-- > 0) {
+			n1.type = z.type;
+			gins(AMOVB, &z, &n1);
+			n1.xoffset++;
+		}
+		regfree(&n1);
+		return;
+	}
+
 	nodreg(&n1, types[tptr], D_DI);
 	agen(nl, &n1);
 	gconreg(AMOVL, 0, D_AX);
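[Editorial note: the 8g change is the 386 counterpart; the quantum there is 4
bytes (q = w/4), so the inline-MOV path covers values smaller than 16 bytes,
while larger ones keep the STOSL-based clearing. A hedged sketch with an
invented type:]

	package zeroing

	// small is 12 bytes on 386: q = 3 < 4, so zeroing it takes the new
	// inline path (three MOVL stores) instead of the STOSL loop.
	type small struct {
		a, b, c int32
	}

	func zeroSmall(p *small) {
		*p = small{}
	}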
diff --git a/src/cmd/gc/gen.c b/src/cmd/gc/gen.c
index eb9eacca8..a7db833a1 100644
--- a/src/cmd/gc/gen.c
+++ b/src/cmd/gc/gen.c
@@ -731,7 +731,7 @@ cgen_as(Node *nl, Node *nr)
 		return;
 	}
 
-	if(nr == N || isnil(nr)) {
+	if(nr == N || iszero(nr)) {
 		// externals and heaps should already be clear
 		if(nr == N) {
 			if(nl->class == PEXTERN)
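[Editorial note: in cgen_as, swapping isnil for iszero lets an explicit
zero composite literal take the clearing path previously reserved for
implicit zeroing. A sketch of the affected form; the names are invented:]

	package zeroing

	var g [64]byte

	// After this change, the assignment below is recognized as zeroing
	// and compiled as a direct clear of g, not construct-then-copy.
	func resetG() {
		g = [64]byte{}
	}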
diff --git a/src/cmd/gc/go.h b/src/cmd/gc/go.h
index 8178f7272..475754145 100644
--- a/src/cmd/gc/go.h
+++ b/src/cmd/gc/go.h
@@ -1374,6 +1374,7 @@ int	isnilinter(Type *t);
 int	isptrto(Type *t, int et);
 int	isslice(Type *t);
 int	istype(Type *t, int et);
+int	iszero(Node *n);
 void	linehist(char *file, int32 off, int relative);
 NodeList*	list(NodeList *l, Node *n);
 NodeList*	list1(Node *n);
diff --git a/src/cmd/gc/mparith2.c b/src/cmd/gc/mparith2.c
index 5cf98c62c..fd9f591ce 100644
--- a/src/cmd/gc/mparith2.c
+++ b/src/cmd/gc/mparith2.c
@@ -656,7 +656,7 @@ mpdivmodfixfix(Mpint *q, Mpint *r, Mpint *n, Mpint *d)
 }
 
 static int
-iszero(Mpint *a)
+mpiszero(Mpint *a)
 {
 	long *a1;
 	int i;
@@ -687,7 +687,7 @@ mpdivfract(Mpint *a, Mpint *b)
 		for(j=0; j<Mpscale; j++) {
 			x <<= 1;
 			if(mpcmp(&d, &n) <= 0) {
-				if(!iszero(&d))
+				if(!mpiszero(&d))
 					x |= 1;
 				mpsubfixfix(&n, &d);
 			}
diff --git a/src/cmd/gc/sinit.c b/src/cmd/gc/sinit.c
index f050026d9..2a811513c 100644
--- a/src/cmd/gc/sinit.c
+++ b/src/cmd/gc/sinit.c
@@ -17,7 +17,6 @@ enum
 	InitPending = 2,
 };
 
-static int	iszero(Node*);
 static void	initplan(Node*);
 static NodeList *initlist;
 static void init2(Node*, NodeList**);
@@ -1356,7 +1355,6 @@ no:
 	return 0;
 }
 
-static int iszero(Node*);
 static int isvaluelit(Node*);
 static InitEntry* entry(InitPlan*);
 static void addvalue(InitPlan*, vlong, Node*, Node*);
@@ -1440,7 +1438,7 @@ addvalue(InitPlan *p, vlong xoffset, Node *key, Node *n)
 	e->expr = n;
 }
 
-static int
+int
 iszero(Node *n)
 {
 	NodeList *l;
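[Editorial note: for context, iszero (defined in sinit.c, now exported via go.h)
reports whether a node is a zero-valued literal. Hedged Go examples of values such
a check would and would not accept; the exact rules live in the function body,
which this hunk does not show, and the type here is invented:]

	package zeroing

	type pair struct {
		n int
		s string
	}

	var (
		a = pair{}            // zero literal
		b = [4]int{}          // zero literal
		c = pair{n: 0, s: ""} // all elements zero: still a zero literal
		d = pair{n: 1}        // not zero: has a nonzero field
	)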
diff --git a/src/cmd/gc/walk.c b/src/cmd/gc/walk.c
index 241d7d74a..7f2748c66 100644
--- a/src/cmd/gc/walk.c
+++ b/src/cmd/gc/walk.c
@@ -1390,7 +1390,12 @@ walkexpr(Node **np, NodeList **init)
 	case OMAPLIT:
 	case OSTRUCTLIT:
 	case OPTRLIT:
-		// XXX TODO do we need to clear var?
+		// NOTE(rsc): Race detector cannot handle seeing
+		// a STRUCTLIT or ARRAYLIT representing a zero value,
+		// so make a temporary for those always in race mode.
+		// Otherwise, leave zero values in place.
+		if(iszero(n) && !flag_race)
+			goto ret;
 		var = temp(n->type);
 		anylit(0, n, var, init);
 		n = var;
@@ -2009,8 +2014,8 @@ needwritebarrier(Node *l, Node *r)
 	if(isstack(l))
 		return 0;
 
-	// No write barrier for zeroing.
-	if(r == N)
+	// No write barrier for implicit or explicit zeroing.
+	if(r == N || iszero(r))
 		return 0;
 
 	// No write barrier for initialization to constant.
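[Editorial note: a closing sketch of the write-barrier claim from the commit
message; the type and function names are invented. The first store is explicit
zeroing and needs no barrier after this CL; the second stores a non-zero
pointer and still does:]

	package zeroing

	type node struct {
		next *node
	}

	func clearNode(p *node)   { *p = node{} }          // zeroing: barrier elided
	func linkNode(p, q *node) { *p = node{next: q} }   // pointer store: barrier kept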