summaryrefslogtreecommitdiff
path: root/compiler/x86
diff options
context:
space:
mode:
authorflorian <florian@3ad0048d-3df7-0310-abae-a5850022a9f2>2011-08-20 12:34:37 +0000
committerflorian <florian@3ad0048d-3df7-0310-abae-a5850022a9f2>2011-08-20 12:34:37 +0000
commit94a8e89cc18939917dfce9743d2d3c851544835b (patch)
treec84dba0538ad585af0b2bf3c2e7afd65b27a73a9 /compiler/x86
parentf43fb9eea65b9e38a60d847161438fda0d0481ab (diff)
downloadfpc-94a8e89cc18939917dfce9743d2d3c851544835b.tar.gz
* make use of mulps/mulpd and haddps/haddpd/hsubpd/hsubps to optimize x*x+y*y and x*x-y*y where x and y might be single or double
git-svn-id: http://svn.freepascal.org/svn/fpc/trunk@18790 3ad0048d-3df7-0310-abae-a5850022a9f2
Diffstat (limited to 'compiler/x86')
-rw-r--r--compiler/x86/nx86add.pas68
1 files changed, 66 insertions, 2 deletions
diff --git a/compiler/x86/nx86add.pas b/compiler/x86/nx86add.pas
index 5f7ee2c751..f7241fe29d 100644
--- a/compiler/x86/nx86add.pas
+++ b/compiler/x86/nx86add.pas
@@ -66,7 +66,7 @@ unit nx86add;
symconst,symdef,
cgobj,cgx86,cga,cgutils,
paramgr,tgobj,ncgutil,
- ncon,nset,
+ ncon,nset,ninl,
defutil;
@@ -660,7 +660,28 @@ unit nx86add;
procedure tx86addnode.second_addfloatsse;
var
op : topcg;
+ sqr_sum : boolean;
+ tmp : tnode;
begin
+ sqr_sum:=false;
+ if (current_settings.fputype>=fpu_sse3) and
+ use_vectorfpu(resultdef) and
+ (nodetype in [addn,subn]) and
+ (left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) and
+ (right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real) then
+ begin
+ sqr_sum:=true;
+ tmp:=tinlinenode(left).left;
+ tinlinenode(left).left:=nil;
+ left.free;
+ left:=tmp;
+
+ tmp:=tinlinenode(right).left;
+ tinlinenode(right).left:=nil;
+ right.free;
+ right:=tmp;
+ end;
+
pass_left_right;
check_left_and_right_fpureg(false);
@@ -687,8 +708,51 @@ unit nx86add;
end;
location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
+
+ if sqr_sum then
+ begin
+ if nf_swapped in flags then
+ swapleftright;
+
+ location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,false);
+ location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,true);
+ location:=left.location;
+ if is_double(resultdef) then
+ begin
+ current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,right.location.register,location.register));
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPD,S_NO,location.register,location.register));
+ case nodetype of
+ addn:
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPD,S_NO,location.register,location.register));
+ subn:
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPD,S_NO,location.register,location.register));
+ else
+ internalerror(201108162);
+ end;
+ end
+ else
+ begin
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_UNPCKLPS,S_NO,right.location.register,location.register));
+ { ensure that bits 64..127 contain valid values }
+ current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,location.register,location.register));
+ { both operands are now in bits 0..63 (left in 0..31, right in 32..63) and duplicated in bits 64..127 }
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPS,S_NO,location.register,location.register));
+ case nodetype of
+ addn:
+ begin
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPS,S_NO,location.register,location.register));
+ end;
+ subn:
+ begin
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPS,S_NO,location.register,location.register));
+ end;
+ else
+ internalerror(201108163);
+ end;
+ end
+ end
{ we can use only right as left operand if the operation is commutative }
- if (right.location.loc=LOC_MMREGISTER) and (op in [OP_ADD,OP_MUL]) then
+ else if (right.location.loc=LOC_MMREGISTER) and (op in [OP_ADD,OP_MUL]) then
begin
location.register:=right.location.register;
{ force floating point reg. location to be written to memory,