summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author florian <florian@3ad0048d-3df7-0310-abae-a5850022a9f2> 2021-04-29 19:48:54 +0000
committer florian <florian@3ad0048d-3df7-0310-abae-a5850022a9f2> 2021-04-29 19:48:54 +0000
commit 3953e9448c2fda000fddd0e624d9fa7313d3f0f9 (patch)
tree 765282702a1aef5fb9beb8436d622ee518044ecd
parent 8afe96f6b109298f49cf74bbeda8faa9820fe098 (diff)
download fpc-3953e9448c2fda000fddd0e624d9fa7313d3f0f9.tar.gz
* patch by J. Gareth Moreton: AArch64 "magic division"
(replace division by constant with multiplication), part of #38806
git-svn-id: https://svn.freepascal.org/svn/fpc/trunk@49290 3ad0048d-3df7-0310-abae-a5850022a9f2
-rw-r--r-- compiler/aarch64/ncpumat.pas 403
1 files changed, 319 insertions, 84 deletions
diff --git a/compiler/aarch64/ncpumat.pas b/compiler/aarch64/ncpumat.pas
index 1c8d45524f..6395a7f009 100644
--- a/compiler/aarch64/ncpumat.pas
+++ b/compiler/aarch64/ncpumat.pas
@@ -71,20 +71,35 @@ implementation
var
op : tasmop;
tmpreg,
+ zeroreg,
numerator,
divider,
+ largernumreg,
+ largerresreg,
resultreg : tregister;
- hl : tasmlabel;
+ hl : tasmlabel;
overflowloc: tlocation;
- power: longint;
+ power : longint;
+ opsize : tcgsize;
+
+ dividend : Int64;
+ high_bit,
+ reciprocal : QWord;
+ { Just to save on stack space and the like }
+ reciprocal_signed : Int64 absolute reciprocal;
+
+ expandword,
+ magic_add : Boolean;
+ shift : byte;
+
+ shifterop : tshifterop;
+ hp : taicpu;
procedure genOrdConstNodeDiv;
var
helper1, helper2: TRegister;
so: tshifterop;
- opsize: TCgSize;
begin
- opsize:=def_cgsize(resultdef);
if tordconstnode(right).value=0 then
internalerror(2020021601)
else if tordconstnode(right).value=1 then
@@ -98,7 +113,7 @@ implementation
current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_NEG,
resultreg,numerator),toppostfix(ord(cs_check_overflow in current_settings.localswitches)*ord(PF_S))));
end
- else if ispowerof2(tordconstnode(right).value,power) then
+ else if isabspowerof2(tordconstnode(right).value,power) then
begin
if (is_signed(right.resultdef)) then
begin
@@ -115,98 +130,318 @@ implementation
so.shiftimm:=resultdef.size*8-power;
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,helper2,numerator,helper1,so));
cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SAR,def_cgsize(resultdef),power,helper2,resultreg);
+
+ if (tordconstnode(right).value < 0) then
+ { Invert the result }
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_NEG,resultreg,resultreg));
end
- else
- cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SHR,opsize,power,numerator,resultreg)
+ else
+ cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SHR,opsize,power,numerator,resultreg)
end
else
- { Everything else is handled in the generic code }
- cg.g_div_const_reg_reg(current_asmdata.CurrAsmList,opsize,
- tordconstnode(right).value.svalue,numerator,resultreg);
+ { Generic division }
+ begin
+ if is_signed(left.resultdef) then
+ op:=A_SDIV
+ else
+ op:=A_UDIV;
+
+ { If we didn't acquire the original divisor earlier, grab it now }
+ if divider = NR_NO then
+ begin
+ divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
+ cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
+ end;
+
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,resultreg,numerator,divider));
+ end;
end;
- begin
- secondpass(left);
- secondpass(right);
- { avoid warning }
- divider:=NR_NO;
-
- { set result location }
- location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
- location.register:=cg.getintregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
- resultreg:=location.register;
-
- { put numerator in register }
- hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true);
- numerator:=left.location.register;
-
- if (right.nodetype=ordconstn) and
- ((tordconstnode(right).value=1) or
- (tordconstnode(right).value=int64(-1)) or
- (tordconstnode(right).value=0) or
- ispowerof2(tordconstnode(right).value,power)) then
+ procedure genOverflowCheck;
begin
- genOrdConstNodeDiv;
- if nodetype=modn then
+ { when overflow checking is on, the left operand is signed and the divisor is not a
+ constant other than -1: check for low(int64) div (-1) (no hardware support for this either) }
+ if (cs_check_overflow in current_settings.localswitches) and
+ is_signed(left.resultdef) and
+ ((right.nodetype<>ordconstn) or
+ (tordconstnode(right).value=-1)) then
begin
- divider:=cg.getintregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
- cg.a_load_const_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),int64(tordconstnode(right).value),divider);
+ { comment as originally written: num=ffff... and div=8000... <=>
+ num xor not(div xor 8000...) = 0 (the "eon" operation performs "xor not(...)").
+ NOTE(review): both the XOR and the EON below read numerator, never divider - confirm }
+ tmpreg:=hlcg.getintregister(current_asmdata.CurrAsmList,left.resultdef);
+ hlcg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.resultdef,low(int64),numerator,tmpreg);
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_EON,
+ tmpreg,numerator,tmpreg));
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_CMP,tmpreg,0));
+ { the comparison against 0 above is meant to leave the zero/equal flag set in case
+ we divided low(int64) by (-1); F_EQ is then handed to the overflow check below }
+ location_reset(overflowloc,LOC_FLAGS,OS_NO);
+ overflowloc.resflags:=F_EQ;
+ cg.g_overflowcheck_loc(current_asmdata.CurrAsmList,location,resultdef,overflowloc);
end;
- end
- else
- begin
- { load divider in a register }
- hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true);
- divider:=right.location.register;
-
- { start division }
- if is_signed(left.resultdef) then
- op:=A_SDIV
- else
- op:=A_UDIV;
- current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,location.register,numerator,divider));
end;
- { no divide-by-zero detection available in hardware, emulate (if it's a
- constant, this will have been detected earlier already) }
- if (right.nodetype<>ordconstn) then
- begin
- current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_CMP,
- right.location.register,0));
+ begin
+ secondpass(left);
+ secondpass(right);
+ { avoid warning }
+ divider := NR_NO;
+ largernumreg := NR_NO;
+ expandword := False;
- current_asmdata.getjumplabel(hl);
- current_asmdata.CurrAsmList.concat(taicpu.op_cond_sym(A_B,C_NE,hl));
- cg.a_call_name(current_asmdata.CurrAsmList,'FPC_DIVBYZERO',false);
- cg.a_label(current_asmdata.CurrAsmList,hl);
- end;
+ opsize := def_cgsize(resultdef);
- { in case of overflow checking, also check for low(int64) div (-1)
- (no hardware support for this either) }
- if (cs_check_overflow in current_settings.localswitches) and
- is_signed(left.resultdef) and
- ((right.nodetype<>ordconstn) or
- (tordconstnode(right).value=-1)) then
- begin
- { num=ffff... and div=8000... <=>
- num xor not(div xor 8000...) = 0
- (and we have the "eon" operation, which performs "xor not(...)" }
- tmpreg:=hlcg.getintregister(current_asmdata.CurrAsmList,left.resultdef);
- hlcg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.resultdef,low(int64),left.location.register,tmpreg);
- current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_EON,
- tmpreg,left.location.register,tmpreg));
- current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_CMP,tmpreg,0));
- { now the zero/equal flag is set in case we divided low(int64) by
- (-1) }
- location_reset(overflowloc,LOC_FLAGS,OS_NO);
- overflowloc.resflags:=F_EQ;
- cg.g_overflowcheck_loc(current_asmdata.CurrAsmList,location,resultdef,overflowloc);
- end;
+ { set result location }
+ location_reset(location,LOC_REGISTER,opsize);
+ location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
+ resultreg:=location.register;
+
+ { put numerator in register }
+ hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true);
+ numerator:=left.location.register;
+
+ if (right.nodetype=ordconstn) then
+ begin
+ { If optimising for size, just use regular division operations }
+ if (cs_opt_size in current_settings.optimizerswitches) or
+ ((tordconstnode(right).value=1) or
+ (tordconstnode(right).value=int64(-1)) or
+ isabspowerof2(tordconstnode(right).value,power)) then
+ begin
+
+ { Store divisor for later (and executed at the same time as the multiplication) }
+ if (nodetype=modn) then
+ begin
+ if (tordconstnode(right).value = 1) or (tordconstnode(right).value = int64(-1)) then
+ begin
+ { Just evaluates to zero }
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_MOVZ,resultreg, 0));
+ Exit;
+ end
+ { "not cs_opt_size" saves from checking the value of the divisor again
+ (if cs_opt_size is not set, then the divisor is a power of 2) }
+ else if not (cs_opt_size in current_settings.optimizerswitches) then
+ begin
+ divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
+ cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
+ end
+ end;
+
+ genOrdConstNodeDiv;
+ genOverflowCheck;
+
+ { in case of modulo, multiply result again by the divider and subtract
+ from the numerator }
+ if (nodetype=modn) then
+ begin
+ if ispowerof2(tordconstnode(right).value,power) then
+ begin
+ shifterop.shiftmode := SM_LSL;
+ shifterop.shiftimm := power;
+
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_SUB,resultreg,numerator,resultreg,shifterop));
+ end
+ else
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_reg(A_MSUB,resultreg,
+ resultreg,divider,numerator));
+ end;
+
+ Exit;
+ end
+ else
+ begin
+ if is_signed(left.resultdef) then
+ begin
+ if (nodetype=modn) then { Signed mod doesn't work properly }
+ begin
+ divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
+ cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
+ genOrdConstNodeDiv;
+ end
+ else
+ begin
+ { Read signed value to avoid Internal Error 200706094 }
+ dividend := tordconstnode(right).value.svalue;
+
+ calc_divconst_magic_signed(resultdef.size * 8, dividend, reciprocal_signed, shift);
+ cg.a_load_const_reg(current_asmdata.CurrAsmList, opsize, reciprocal_signed, resultreg);
+
+ { SMULH is only available for the full 64-bit registers }
+ if opsize in [OS_64, OS_S64] then
+ begin
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_SMULH,resultreg,resultreg,numerator));
+ largerresreg := resultreg;
+ end
+ else
+ begin
+ largerresreg := newreg(getregtype(resultreg), getsupreg(resultreg), R_SUBWHOLE);
+ largernumreg := newreg(getregtype(numerator), getsupreg(numerator), R_SUBWHOLE);
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_MUL,largerresreg,largerresreg,largernumreg));
+ expandword := True; { Merge the shift operation with something below }
+ end;
+
+ { Store divisor for later (and executed at the same time as the multiplication) }
+ if nodetype=modn then
+ begin
+ divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
+ cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,dividend,divider);
+ end;
+
+ { add or subtract dividend }
+ if (dividend > 0) and (reciprocal_signed < 0) then
+ begin
+ if expandword then
+ begin
+ shifterop.shiftmode := SM_ASR;
+ shifterop.shiftimm := 32;
+ expandword := False;
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,largerresreg,largernumreg,largerresreg,shifterop));
+ end
+ else
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_ADD,resultreg,resultreg,numerator));
+ end
+ else if (dividend < 0) and (reciprocal_signed > 0) then
+ begin
+ if expandword then
+ begin
+ { We can't append ASR to the SUB below because it's on the wrong operand }
+ expandword := False;
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_ASR,largerresreg,largerresreg,32));
+ end;
+
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_SUB,resultreg,resultreg,numerator));
+ end
+ else if expandword then
+ Inc(shift,32);
+
+ { shift if necessary }
+ if (shift <> 0) then
+ begin
+ if expandword then
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_ASR,largerresreg,largerresreg,shift))
+ else
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_ASR,resultreg,resultreg,shift));
+ end;
+
+ { extract and add the sign bit }
+ shifterop.shiftmode := SM_LSR;
+ shifterop.shiftimm := left.resultdef.size*8 - 1;
+
+ if (dividend < 0) then
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,resultreg,resultreg,resultreg,shifterop))
+ else
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,resultreg,resultreg,numerator,shifterop));
+ end;
+ end
+ else
+ begin
+ calc_divconst_magic_unsigned(resultdef.size * 8, tordconstnode(right).value, reciprocal, magic_add, shift);
+ cg.a_load_const_reg(current_asmdata.CurrAsmList, opsize, reciprocal, resultreg);
+
+ { UMULH is only available for the full 64-bit registers }
+ if opsize in [OS_64, OS_S64] then
+ begin
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_UMULH,resultreg,resultreg,numerator));
+ largerresreg := resultreg;
+ end
+ else
+ begin
+ largerresreg := newreg(getregtype(resultreg), getsupreg(resultreg), R_SUBWHOLE);
+ largernumreg := newreg(getregtype(numerator), getsupreg(numerator), R_SUBWHOLE);
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_MUL,largerresreg,largerresreg,largernumreg));
+ expandword := True; { Try to merge the shift operation with something below }
+ end;
+
+ { Store divisor for later (and executed at the same time as the multiplication) }
+ if (nodetype=modn) then
+ begin
+ divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
+ cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
+ end;
+
+ if magic_add then
+ begin
+ { We can't append LSR to the ADD below because it would require extending the registers
+ and interfere with the carry bit }
+ if expandword then
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_LSR,largerresreg,largerresreg,32));
+
+ { Add the numerator to the high-order word of the product, tracking the carry bit, shift, then
+ insert the carry bit via CSEL and ORR }
- { in case of modulo, multiply result again by the divider and subtract
- from the numerator }
- if nodetype=modn then
- current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_reg(A_MSUB,resultreg,
- resultreg,divider,numerator));
+ if opsize in [OS_64,OS_S64] then
+ zeroreg := NR_XZR
+ else
+ zeroreg := NR_WZR;
+
+ high_bit := QWord(1) shl ((resultdef.size * 8) - shift);
+
+ tmpreg := cg.getintregister(current_asmdata.CurrAsmList, opsize);
+ cg.a_load_const_reg(current_asmdata.CurrAsmList, opsize, high_bit, tmpreg);
+
+ { Generate ADDS instruction }
+ hp := taicpu.op_reg_reg_reg(A_ADD,resultreg,resultreg,numerator);
+ hp.oppostfix := PF_S;
+ current_asmdata.CurrAsmList.concat(hp);
+
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_cond(A_CSEL,tmpreg,tmpreg,zeroreg, C_CS));
+
+ shifterop.shiftmode := SM_LSR;
+ shifterop.shiftimm := shift;
+
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,resultreg,tmpreg,resultreg,shifterop));
+ end
+ else if expandword then
+ { Include the right-shift by 32 to get the high-order DWord }
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_LSR,largerresreg,largerresreg,shift + 32))
+ else
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_LSR,resultreg,resultreg,shift));
+ end;
+
+ end;
+
+ end
+ { no divide-by-zero detection available in hardware, emulate (if it's a
+ constant, this will have been detected earlier already) }
+ else
+ begin
+ { load divider in a register }
+ hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true);
+ divider:=right.location.register;
+
+ { ARM-64 developer guides recommend checking for division by zero conditions
+ AFTER the division, since the check and the division can be done in tandem }
+ if is_signed(left.resultdef) then
+ op:=A_SDIV
+ else
+ op:=A_UDIV;
+
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,resultreg,numerator,divider));
+
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_CMP,divider,0));
+ current_asmdata.getjumplabel(hl);
+ current_asmdata.CurrAsmList.concat(taicpu.op_cond_sym(A_B,C_NE,hl));
+ cg.a_call_name(current_asmdata.CurrAsmList,'FPC_DIVBYZERO',false);
+ cg.a_label(current_asmdata.CurrAsmList,hl);
+ end;
+
+ genOverflowCheck;
+
+ { in case of modulo, multiply result again by the divider and subtract
+ from the numerator }
+ if (nodetype=modn) then
+ begin
+ { If we didn't acquire the original divisor earlier, grab it now }
+ if divider = NR_NO then
+ begin
+ divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
+ cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
+ end;
+
+ current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_reg(A_MSUB,resultreg,
+ resultreg,divider,numerator));
+ end;
end;