From 3953e9448c2fda000fddd0e624d9fa7313d3f0f9 Mon Sep 17 00:00:00 2001 From: florian Date: Thu, 29 Apr 2021 19:48:54 +0000 Subject: * patch by J. Gareth Moreton: AArch64 "magic division" (replace division by constant with multiplication), part of #38806 git-svn-id: https://svn.freepascal.org/svn/fpc/trunk@49290 3ad0048d-3df7-0310-abae-a5850022a9f2 --- compiler/aarch64/ncpumat.pas | 403 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 319 insertions(+), 84 deletions(-) diff --git a/compiler/aarch64/ncpumat.pas b/compiler/aarch64/ncpumat.pas index 1c8d45524f..6395a7f009 100644 --- a/compiler/aarch64/ncpumat.pas +++ b/compiler/aarch64/ncpumat.pas @@ -71,20 +71,35 @@ implementation var op : tasmop; tmpreg, + zeroreg, numerator, divider, + largernumreg, + largerresreg, resultreg : tregister; - hl : tasmlabel; + hl : tasmlabel; overflowloc: tlocation; - power: longint; + power : longint; + opsize : tcgsize; + + dividend : Int64; + high_bit, + reciprocal : QWord; + { Just to save on stack space and the like } + reciprocal_signed : Int64 absolute reciprocal; + + expandword, + magic_add : Boolean; + shift : byte; + + shifterop : tshifterop; + hp : taicpu; procedure genOrdConstNodeDiv; var helper1, helper2: TRegister; so: tshifterop; - opsize: TCgSize; begin - opsize:=def_cgsize(resultdef); if tordconstnode(right).value=0 then internalerror(2020021601) else if tordconstnode(right).value=1 then @@ -98,7 +113,7 @@ implementation current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_NEG, resultreg,numerator),toppostfix(ord(cs_check_overflow in current_settings.localswitches)*ord(PF_S)))); end - else if ispowerof2(tordconstnode(right).value,power) then + else if isabspowerof2(tordconstnode(right).value,power) then begin if (is_signed(right.resultdef)) then begin @@ -115,98 +130,318 @@ implementation so.shiftimm:=resultdef.size*8-power; current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,helper2,numerator,helper1,so)); cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SAR,def_cgsize(resultdef),power,helper2,resultreg); + + if (tordconstnode(right).value < 0) then + { Invert the result } + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_NEG,resultreg,resultreg)); end - else - cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SHR,opsize,power,numerator,resultreg) + else + cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SHR,opsize,power,numerator,resultreg) end else - { Everything else is handled in the generic code } - cg.g_div_const_reg_reg(current_asmdata.CurrAsmList,opsize, - tordconstnode(right).value.svalue,numerator,resultreg); + { Generic division } + begin + if is_signed(left.resultdef) then + op:=A_SDIV + else + op:=A_UDIV; + + { If we didn't acquire the original divisor earlier, grab it now } + if divider = NR_NO then + begin + divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize); + cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider); + end; + + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,resultreg,numerator,divider)); + end; end; - begin - secondpass(left); - secondpass(right); - { avoid warning } - divider:=NR_NO; - - { set result location } - location_reset(location,LOC_REGISTER,def_cgsize(resultdef)); - location.register:=cg.getintregister(current_asmdata.CurrAsmList,def_cgsize(resultdef)); - resultreg:=location.register; - - { put numerator in register } - hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true); 
- numerator:=left.location.register; - - if (right.nodetype=ordconstn) and - ((tordconstnode(right).value=1) or - (tordconstnode(right).value=int64(-1)) or - (tordconstnode(right).value=0) or - ispowerof2(tordconstnode(right).value,power)) then + procedure genOverflowCheck; begin - genOrdConstNodeDiv; - if nodetype=modn then + { in case of overflow checking, also check for low(int64) div (-1) + (no hardware support for this either) } + if (cs_check_overflow in current_settings.localswitches) and + is_signed(left.resultdef) and + ((right.nodetype<>ordconstn) or + (tordconstnode(right).value=-1)) then begin - divider:=cg.getintregister(current_asmdata.CurrAsmList,def_cgsize(resultdef)); - cg.a_load_const_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),int64(tordconstnode(right).value),divider); + { num=ffff... and div=8000... <=> + num xor not(div xor 8000...) = 0 + (and we have the "eon" operation, which performs "xor not(...)" } + tmpreg:=hlcg.getintregister(current_asmdata.CurrAsmList,left.resultdef); + hlcg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.resultdef,low(int64),numerator,tmpreg); + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_EON, + tmpreg,numerator,tmpreg)); + current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_CMP,tmpreg,0)); + { now the zero/equal flag is set in case we divided low(int64) by + (-1) } + location_reset(overflowloc,LOC_FLAGS,OS_NO); + overflowloc.resflags:=F_EQ; + cg.g_overflowcheck_loc(current_asmdata.CurrAsmList,location,resultdef,overflowloc); end; - end - else - begin - { load divider in a register } - hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true); - divider:=right.location.register; - - { start division } - if is_signed(left.resultdef) then - op:=A_SDIV - else - op:=A_UDIV; - current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,location.register,numerator,divider)); end; - { no divide-by-zero detection available in hardware, emulate (if it's a - constant, this will have been detected earlier already) } - if (right.nodetype<>ordconstn) then - begin - current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_CMP, - right.location.register,0)); + begin + secondpass(left); + secondpass(right); + { avoid warning } + divider := NR_NO; + largernumreg := NR_NO; + expandword := False; - current_asmdata.getjumplabel(hl); - current_asmdata.CurrAsmList.concat(taicpu.op_cond_sym(A_B,C_NE,hl)); - cg.a_call_name(current_asmdata.CurrAsmList,'FPC_DIVBYZERO',false); - cg.a_label(current_asmdata.CurrAsmList,hl); - end; + opsize := def_cgsize(resultdef); - { in case of overflow checking, also check for low(int64) div (-1) - (no hardware support for this either) } - if (cs_check_overflow in current_settings.localswitches) and - is_signed(left.resultdef) and - ((right.nodetype<>ordconstn) or - (tordconstnode(right).value=-1)) then - begin - { num=ffff... and div=8000... <=> - num xor not(div xor 8000...) 
= 0 - (and we have the "eon" operation, which performs "xor not(...)" } - tmpreg:=hlcg.getintregister(current_asmdata.CurrAsmList,left.resultdef); - hlcg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.resultdef,low(int64),left.location.register,tmpreg); - current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_EON, - tmpreg,left.location.register,tmpreg)); - current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_CMP,tmpreg,0)); - { now the zero/equal flag is set in case we divided low(int64) by - (-1) } - location_reset(overflowloc,LOC_FLAGS,OS_NO); - overflowloc.resflags:=F_EQ; - cg.g_overflowcheck_loc(current_asmdata.CurrAsmList,location,resultdef,overflowloc); - end; + { set result location } + location_reset(location,LOC_REGISTER,opsize); + location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize); + resultreg:=location.register; + + { put numerator in register } + hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true); + numerator:=left.location.register; + + if (right.nodetype=ordconstn) then + begin + { If optimising for size, just use regular division operations } + if (cs_opt_size in current_settings.optimizerswitches) or + ((tordconstnode(right).value=1) or + (tordconstnode(right).value=int64(-1)) or + isabspowerof2(tordconstnode(right).value,power)) then + begin + + { Store divisor for later (and executed at the same time as the multiplication) } + if (nodetype=modn) then + begin + if (tordconstnode(right).value = 1) or (tordconstnode(right).value = int64(-1)) then + begin + { Just evaluates to zero } + current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_MOVZ,resultreg, 0)); + Exit; + end + { "not cs_opt_size" saves from checking the value of the divisor again + (if cs_opt_size is not set, then the divisor is a power of 2) } + else if not (cs_opt_size in current_settings.optimizerswitches) then + begin + divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize); + cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider); + end + end; + + genOrdConstNodeDiv; + genOverflowCheck; + + { in case of modulo, multiply result again by the divider and subtract + from the numerator } + if (nodetype=modn) then + begin + if ispowerof2(tordconstnode(right).value,power) then + begin + shifterop.shiftmode := SM_LSL; + shifterop.shiftimm := power; + + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_SUB,resultreg,numerator,resultreg,shifterop)); + end + else + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_reg(A_MSUB,resultreg, + resultreg,divider,numerator)); + end; + + Exit; + end + else + begin + if is_signed(left.resultdef) then + begin + if (nodetype=modn) then { Signed mod doesn't work properly } + begin + divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize); + cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider); + genOrdConstNodeDiv; + end + else + begin + { Read signed value to avoid Internal Error 200706094 } + dividend := tordconstnode(right).value.svalue; + + calc_divconst_magic_signed(resultdef.size * 8, dividend, reciprocal_signed, shift); + cg.a_load_const_reg(current_asmdata.CurrAsmList, opsize, reciprocal_signed, resultreg); + + { SMULH is only available for the full 64-bit registers } + if opsize in [OS_64, OS_S64] then + begin + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_SMULH,resultreg,resultreg,numerator)); + largerresreg := resultreg; + end + else 
+ begin + largerresreg := newreg(getregtype(resultreg), getsupreg(resultreg), R_SUBWHOLE); + largernumreg := newreg(getregtype(numerator), getsupreg(numerator), R_SUBWHOLE); + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_MUL,largerresreg,largerresreg,largernumreg)); + expandword := True; { Merge the shift operation with something below } + end; + + { Store divisor for later (and executed at the same time as the multiplication) } + if nodetype=modn then + begin + divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize); + cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,dividend,divider); + end; + + { add or subtract dividend } + if (dividend > 0) and (reciprocal_signed < 0) then + begin + if expandword then + begin + shifterop.shiftmode := SM_ASR; + shifterop.shiftimm := 32; + expandword := False; + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,largerresreg,largernumreg,largerresreg,shifterop)); + end + else + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_ADD,resultreg,resultreg,numerator)); + end + else if (dividend < 0) and (reciprocal_signed > 0) then + begin + if expandword then + begin + { We can't append LSR to the SUB below because it's on the wrong operand } + expandword := False; + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_ASR,largerresreg,largerresreg,32)); + end; + + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_SUB,resultreg,resultreg,numerator)); + end + else if expandword then + Inc(shift,32); + + { shift if necessary } + if (shift <> 0) then + begin + if expandword then + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_ASR,largerresreg,largerresreg,shift)) + else + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_ASR,resultreg,resultreg,shift)); + end; + + { extract and add the sign bit } + shifterop.shiftmode := SM_LSR; + shifterop.shiftimm := left.resultdef.size*8 - 1; + + if (dividend < 0) then + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,resultreg,resultreg,resultreg,shifterop)) + else + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,resultreg,resultreg,numerator,shifterop)); + end; + end + else + begin + calc_divconst_magic_unsigned(resultdef.size * 8, tordconstnode(right).value, reciprocal, magic_add, shift); + cg.a_load_const_reg(current_asmdata.CurrAsmList, opsize, reciprocal, resultreg); + + { UMULH is only available for the full 64-bit registers } + if opsize in [OS_64, OS_S64] then + begin + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_UMULH,resultreg,resultreg,numerator)); + largerresreg := resultreg; + end + else + begin + largerresreg := newreg(getregtype(resultreg), getsupreg(resultreg), R_SUBWHOLE); + largernumreg := newreg(getregtype(numerator), getsupreg(numerator), R_SUBWHOLE); + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_MUL,largerresreg,largerresreg,largernumreg)); + expandword := True; { Try to merge the shift operation with something below } + end; + + { Store divisor for later (and executed at the same time as the multiplication) } + if (nodetype=modn) then + begin + divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize); + cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider); + end; + + if magic_add then + begin + { We can't append LSR to the ADD below because it would require extending the registers + and interfere with the carry bit } + if expandword then + 
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_LSR,largerresreg,largerresreg,32)); + + { Add the reciprocal to the high-order word, tracking the carry bit, shift, then + insert the carry bit via CSEL and ORR } - { in case of modulo, multiply result again by the divider and subtract - from the numerator } - if nodetype=modn then - current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_reg(A_MSUB,resultreg, - resultreg,divider,numerator)); + if opsize in [OS_64,OS_S64] then + zeroreg := NR_XZR + else + zeroreg := NR_WZR; + + high_bit := QWord(1) shl ((resultdef.size * 8) - shift); + + tmpreg := cg.getintregister(current_asmdata.CurrAsmList, opsize); + cg.a_load_const_reg(current_asmdata.CurrAsmList, opsize, high_bit, tmpreg); + + { Generate ADDS instruction } + hp := taicpu.op_reg_reg_reg(A_ADD,resultreg,resultreg,numerator); + hp.oppostfix := PF_S; + current_asmdata.CurrAsmList.concat(hp); + + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_cond(A_CSEL,tmpreg,tmpreg,zeroreg, C_CS)); + + shifterop.shiftmode := SM_LSR; + shifterop.shiftimm := shift; + + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,resultreg,tmpreg,resultreg,shifterop)); + end + else if expandword then + { Include the right-shift by 32 to get the high-order DWord } + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_LSR,largerresreg,largerresreg,shift + 32)) + else + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_LSR,resultreg,resultreg,shift)); + end; + + end; + + end + { no divide-by-zero detection available in hardware, emulate (if it's a + constant, this will have been detected earlier already) } + else + begin + { load divider in a register } + hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true); + divider:=right.location.register; + + { ARM-64 developer guides recommend checking for division by zero conditions + AFTER the division, since the check and the division can be done in tandem } + if is_signed(left.resultdef) then + op:=A_SDIV + else + op:=A_UDIV; + + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,resultreg,numerator,divider)); + + current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_CMP,divider,0)); + current_asmdata.getjumplabel(hl); + current_asmdata.CurrAsmList.concat(taicpu.op_cond_sym(A_B,C_NE,hl)); + cg.a_call_name(current_asmdata.CurrAsmList,'FPC_DIVBYZERO',false); + cg.a_label(current_asmdata.CurrAsmList,hl); + end; + + genOverflowCheck; + + { in case of modulo, multiply result again by the divider and subtract + from the numerator } + if (nodetype=modn) then + begin + { If we didn't acquire the original divisor earlier, grab it now } + if divider = NR_NO then + begin + divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize); + cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider); + end; + + current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_reg(A_MSUB,resultreg, + resultreg,divider,numerator)); + end; end; -- cgit v1.2.1
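
The core of the change is the unsigned "magic division" identity: dividing by a constant d can be done by multiplying by a rounded-up fixed-point reciprocal and keeping only the high bits of the product, which is what the emitted UMULH (64-bit operands) or 64-bit MUL followed by LSR by shift+32 (32-bit operands) computes. Below is a minimal stand-alone sketch of that identity in plain Free Pascal, not compiler code; it hard-codes the classic pair for dividing an unsigned 32-bit value by 10 (reciprocal $CCCCCCCD = ceil(2^35/10), post-shift 3), whereas calc_divconst_magic_unsigned derives the pair, plus the magic_add fixup, for an arbitrary divisor.

program umagicdiv10;
{$mode objfpc}

{ Unsigned magic division sketch: for every 32-bit n,
  n div 10 = (n * $CCCCCCCD) shr 35,
  i.e. multiply by ceil(2^35 / 10) and keep bits 63..35 of the product. }

const
  Reciprocal: QWord = $CCCCCCCD;  { ceil(2^35 / 10) }
  PostShift  = 3;                 { total shift = 32 (high word) + 3 }

  Samples: array[0..4] of Cardinal = (0, 9, 10, 123456789, High(Cardinal));

var
  i: Integer;
  n: Cardinal;
begin
  for i := 0 to High(Samples) do
  begin
    n := Samples[i];
    WriteLn(n:10, ' div 10 = ', n div 10:9,
            '   magic = ', Cardinal((QWord(n) * Reciprocal) shr (32 + PostShift)):9);
  end;
end.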
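The signed branch follows the same idea but fixes the quotient up afterwards: after the high multiply (SMULH, or MUL on the 64-bit register view for 32-bit operands) it adds the numerator when the divisor is positive and the reciprocal negative (and subtracts it in the mirrored case), arithmetic-shifts the result, and finally adds the numerator's sign bit so the quotient truncates towards zero, matching Pascal's div. The sketch below, again plain Free Pascal rather than compiler code, walks those steps for 32-bit signed division by 7 using the classic pair reciprocal = $92492493 (that is, -1840700269) and shift = 2; the helper name MagicDiv7 is invented for the example.

program smagicdiv7;
{$mode objfpc}

{ Signed magic division sketch for n div 7 on 32-bit operands:
    q := high32(n * reciprocal);   (the SMULH step)
    q := q + n;                    (divisor > 0, reciprocal < 0)
    q := q asr 2;                  (arithmetic shift right)
    q := q + (n shr 31);           (add sign bit -> truncate towards zero) }

function MagicDiv7(n: Longint): Longint;
const
  Reciprocal = -1840700269;  { $92492493 }
  PostShift  = 2;
var
  prod: Int64;
  q: Longint;
begin
  prod := Int64(n) * Reciprocal;              { full 64-bit signed product }
  q := Longint(QWord(prod) shr 32);           { high 32 bits of the product }
  q := q + n;                                 { fixup: reciprocal is negative }
  q := SarLongint(q, PostShift);              { arithmetic shift right }
  Result := q + Longint(Cardinal(n) shr 31);  { +1 for negative numerators }
end;

const
  Samples: array[0..5] of Longint = (7, -7, 6, -6, High(Longint), Low(Longint));
var
  i: Integer;
begin
  for i := 0 to High(Samples) do
    WriteLn(Samples[i]:12, ' div 7 = ', Samples[i] div 7:11,
            '   magic = ', MagicDiv7(Samples[i]):11);
end.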
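For modn both the constant and the generic branches reuse the quotient: the remainder is reconstructed as numerator - quotient * divisor with a single MSUB, or, when the divisor is a power of two, as numerator - (quotient shl power) via SUB with an LSL shifter operand. A tiny Free Pascal illustration of that identity follows (the function name is invented for the example; Pascal's div and mod both truncate towards zero, so the identity also holds for negative numerators):

program modfromquot;
{$mode objfpc}

{ Remainder recovered from the quotient, as the emitted MSUB
  (or SUB with LSL for power-of-two divisors) does:
  n mod d = n - (n div d) * d. }

function ModFromQuot(n, d: Int64): Int64;
begin
  Result := n - (n div d) * d;
end;

begin
  WriteLn('  100 mod 7  = ', 100 mod 7:3,    '   via quotient = ', ModFromQuot(100, 7):3);
  WriteLn('(-100) mod 7 = ', (-100) mod 7:3, '   via quotient = ', ModFromQuot(-100, 7):3);
  WriteLn('  100 mod 16 = ', 100 mod 16:3,   '   via quotient = ', ModFromQuot(100, 16):3);
end.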