diff options
Diffstat (limited to 'compiler/nadd.pas')
-rw-r--r-- | compiler/nadd.pas | 238 |
1 files changed, 217 insertions, 21 deletions
diff --git a/compiler/nadd.pas b/compiler/nadd.pas index bf7b7f2a4e..ab89377400 100644 --- a/compiler/nadd.pas +++ b/compiler/nadd.pas @@ -73,6 +73,10 @@ interface { full 64 bit multiplies. } function use_generic_mul64bit: boolean; virtual; + { shall be overridden if the target cpu supports + an fma instruction + } + function use_fma : boolean; virtual; { This routine calls internal runtime library helpers for all floating point arithmetic in the case where the emulation switches is on. Otherwise @@ -80,18 +84,22 @@ interface the code generation phase. } function first_addfloat : tnode; virtual; - private - { checks whether a muln can be calculated as a 32bit } - { * 32bit -> 64 bit } - function try_make_mul32to64: boolean; - { Match against the ranges, i.e.: - var a:1..10; - begin - if a>0 then - ... - always evaluates to true. (DM) - } - function cmp_of_disjunct_ranges(var res : boolean) : boolean; + private + { checks whether a muln can be calculated as a 32bit } + { * 32bit -> 64 bit } + function try_make_mul32to64: boolean; + + { Match against the ranges, i.e.: + var a:1..10; + begin + if a>0 then + ... + always evaluates to true. 
(DM) + } + function cmp_of_disjunct_ranges(var res : boolean) : boolean; + + { tries to replace the current node by an fma node } + function try_fma(ld,rd : tdef) : tnode; end; taddnodeclass = class of taddnode; @@ -401,8 +409,7 @@ implementation end; { both are int constants } - if ( - ( + if ( is_constintnode(left) and is_constintnode(right) ) or @@ -414,7 +421,7 @@ implementation ( is_constenumnode(left) and is_constenumnode(right) and - allowenumop(nodetype)) + (allowenumop(nodetype) or (nf_internal in flags)) ) or ( (lt = pointerconstn) and @@ -676,6 +683,50 @@ implementation result:=t; exit; end; +{$if FPC_FULLVERSION>20700} + { bestrealrec is 2.7.1+ only } + + { replace .../const by a multiplication, but only if fastmath is enabled or + the division is done by a power of 2, do not mess with special floating point values like Inf etc. + + do this after constant folding to avoid unnecessary precision loss if + a slash expression would be first converted into a multiplication and later + folded } + if (nodetype=slashn) and + { do not mess with currency types } + (not(is_currency(right.resultdef))) and + (((cs_opt_fastmath in current_settings.optimizerswitches) and (rt=ordconstn)) or + ((cs_opt_fastmath in current_settings.optimizerswitches) and (rt=realconstn) and + (bestrealrec(trealconstnode(right).value_real).SpecialType in [fsPositive,fsNegative]) + ) or + ((rt=realconstn) and + (bestrealrec(trealconstnode(right).value_real).SpecialType in [fsPositive,fsNegative]) and + { mantissa returns the mantissa/fraction without the hidden 1, so power of two means only the hidden + bit is set => mantissa must be 0 } + (bestrealrec(trealconstnode(right).value_real).Mantissa=0) + ) + ) then + case rt of + ordconstn: + begin + { the normal code handles div/0 } + if (tordconstnode(right).value<>0) then + begin + nodetype:=muln; + t:=crealconstnode.create(1/tordconstnode(right).value,resultdef); + right.free; + right:=t; + exit; + end; + end; + realconstn: + begin + 
nodetype:=muln; + trealconstnode(right).value_real:=1.0/trealconstnode(right).value_real; + exit; + end; + end; +{$endif FPC_FULLVERSION>20700} { first, we handle widestrings, so we can check later for } { stringconstn only } @@ -1013,6 +1064,14 @@ implementation change : boolean; {$endif} + function maybe_cast_ordconst(var n: tnode; adef: tdef): boolean; { returns true if the ordinal constant n lies within low..high of adef; in that case n is also type converted to adef } + begin + result:=(tordconstnode(n).value>=torddef(adef).low) and + (tordconstnode(n).value<=torddef(adef).high); + if result then + inserttypeconv(n,adef); + end; + begin result:=nil; rlow:=0; @@ -1420,6 +1479,18 @@ implementation inserttypeconv(right,nd); end; end + { don't extend (sign-mismatched) comparisons if either side is a constant + whose value is within the range of the opposite side } + else if is_integer(ld) and is_integer(rd) and + (nodetype in [equaln,unequaln,gtn,gten,ltn,lten]) and + (is_signed(ld)<>is_signed(rd)) and + ( + ((lt=ordconstn) and maybe_cast_ordconst(left,rd)) or + ((rt=ordconstn) and maybe_cast_ordconst(right,ld)) + ) then + begin + { done here: maybe_cast_ordconst in the condition above already inserted the type conversion } + end { is there a signed 64 bit type ? 
} else if ((torddef(rd).ordtype=s64bit) or (torddef(ld).ordtype=s64bit)) then begin @@ -1967,7 +2038,7 @@ implementation end else resultdef:=right.resultdef; - inserttypeconv(left,get_int_type_for_pointer_arithmetic(rd)); + inserttypeconv(left,tpointerdef(right.resultdef).pointer_arithmetic_int_type); if nodetype=addn then begin if (rt=niln) then @@ -1981,7 +2052,7 @@ implementation (tpointerdef(rd).pointeddef.size>1) then begin left:=caddnode.create(muln,left, - cordconstnode.create(tpointerdef(rd).pointeddef.size,get_int_type_for_pointer_arithmetic(rd),true)); + cordconstnode.create(tpointerdef(rd).pointeddef.size,tpointerdef(right.resultdef).pointer_arithmetic_int_type,true)); typecheckpass(left); end; end @@ -2000,7 +2071,7 @@ implementation else resultdef:=left.resultdef; - inserttypeconv(right,get_int_type_for_pointer_arithmetic(ld)); + inserttypeconv(right,tpointerdef(left.resultdef).pointer_arithmetic_int_type); if nodetype in [addn,subn] then begin if (lt=niln) then @@ -2017,7 +2088,7 @@ implementation if (tpointerdef(ld).pointeddef.size>1) then begin right:=caddnode.create(muln,right, - cordconstnode.create(tpointerdef(ld).pointeddef.size,get_int_type_for_pointer_arithmetic(ld),true)); + cordconstnode.create(tpointerdef(ld).pointeddef.size,tpointerdef(left.resultdef).pointer_arithmetic_int_type,true)); typecheckpass(right); end end else @@ -2025,7 +2096,7 @@ implementation (tarraydef(ld).elementdef.size>1) then begin right:=caddnode.create(muln,right, - cordconstnode.create(tarraydef(ld).elementdef.size,get_int_type_for_pointer_arithmetic(ld),true)); + cordconstnode.create(tarraydef(ld).elementdef.size,tpointerdef(left.resultdef).pointer_arithmetic_int_type,true)); typecheckpass(right); end; end @@ -2068,7 +2139,7 @@ implementation { enums } else if (ld.typ=enumdef) and (rd.typ=enumdef) then begin - if allowenumop(nodetype) then + if allowenumop(nodetype) or (nf_internal in flags) then inserttypeconv(right,left.resultdef) else 
CGMessage3(type_e_operator_not_supported_for_types,node2opstr(nodetype),ld.typename,rd.typename); @@ -2592,6 +2663,127 @@ implementation end; + function taddnode.use_fma : boolean; { base implementation: always returns false } + begin + result:=false; + end; + + + function taddnode.try_fma(ld,rd : tdef) : tnode; { returns nil unless the condition below holds; otherwise returns a new fma inline node built from the operands of this add/sub node } + var + inlinennr : Integer; + begin + result:=nil; + if (cs_opt_fastmath in current_settings.optimizerswitches) and + use_fma and + (nodetype in [addn,subn]) and + (rd.typ=floatdef) and (ld.typ=floatdef) and + (is_single(rd) or is_double(rd)) and + equal_defs(rd,ld) and + { transforming a*b+c into fma(a,b,c) makes only sense if c can be + calculated easily. Consider a*b+c*d which results in + + fmul + fmul + fadd + + and in + + fmul + fma + + when using the fma optimization. On a super scalar architecture, the first instruction + sequence requires clock_cycles(fmul)+clock_cycles(fadd) clock cycles because the fmuls can be executed in parallel. + The second sequence requires clock_cycles(fmul)+clock_cycles(fma) because the fma has to wait for the + result of the fmul. Since typically clock_cycles(fma)>clock_cycles(fadd) applies, the first sequence is better. 
+ } + (((left.nodetype=muln) and (node_complexity(right)<3)) or + ((right.nodetype=muln) and (node_complexity(left)<3)) or + ((left.nodetype=inlinen) and + (tinlinenode(left).inlinenumber=in_sqr_real) and + (node_complexity(right)<3)) or + ((right.nodetype=inlinen) and + (tinlinenode(right).inlinenumber=in_sqr_real) and + (node_complexity(left)<3)) + ) then + begin + case tfloatdef(ld).floattype of + s32real: + inlinennr:=in_fma_single; + s64real: + inlinennr:=in_fma_double; + s80real: + inlinennr:=in_fma_extended; + s128real: + inlinennr:=in_fma_float128; + else + internalerror(2014042601); + end; + if left.nodetype=muln then + begin + if nodetype=subn then + result:=cinlinenode.create(inlinennr,false,ccallparanode.create(cunaryminusnode.create(right), + ccallparanode.create(taddnode(left).right, + ccallparanode.create(taddnode(left).left,nil + )))) + else + result:=cinlinenode.create(inlinennr,false,ccallparanode.create(right, + ccallparanode.create(taddnode(left).right, + ccallparanode.create(taddnode(left).left,nil + )))); + right:=nil; + taddnode(left).right:=nil; + taddnode(left).left:=nil; + end + else if right.nodetype=muln then + begin + if nodetype=subn then + result:=cinlinenode.create(inlinennr,false,ccallparanode.create(left, + ccallparanode.create(cunaryminusnode.create(taddnode(right).right), + ccallparanode.create(taddnode(right).left,nil + )))) + else + result:=cinlinenode.create(inlinennr,false,ccallparanode.create(left, + ccallparanode.create(taddnode(right).right, + ccallparanode.create(taddnode(right).left,nil + )))); + left:=nil; + taddnode(right).right:=nil; + taddnode(right).left:=nil; + end + else if (left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) then + begin + if nodetype=subn then + result:=cinlinenode.create(inlinennr,false,ccallparanode.create(cunaryminusnode.create(right), + ccallparanode.create(tinlinenode(left).left.getcopy, + ccallparanode.create(tinlinenode(left).left.getcopy,nil + )))) + else + 
result:=cinlinenode.create(inlinennr,false,ccallparanode.create(right, + ccallparanode.create(tinlinenode(left).left.getcopy, + ccallparanode.create(tinlinenode(left).left.getcopy,nil + )))); + right:=nil; + end + { we get here only if right is a sqr node } + else if (right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real) then + begin + if nodetype=subn then + result:=cinlinenode.create(inlinennr,false,ccallparanode.create(left, + ccallparanode.create(cunaryminusnode.create(tinlinenode(right).left.getcopy), + ccallparanode.create(tinlinenode(right).left.getcopy,nil + )))) + else + result:=cinlinenode.create(inlinennr,false,ccallparanode.create(left, + ccallparanode.create(tinlinenode(right).left.getcopy, + ccallparanode.create(tinlinenode(right).left.getcopy,nil + )))); + left:=nil; + end; + end; + end; + + function taddnode.first_add64bitint: tnode; var procname: string[31]; @@ -3089,6 +3281,10 @@ implementation expectloc:=LOC_FPUREGISTER else expectloc:=LOC_FLAGS; + + result:=try_fma(ld,rd); + if assigned(result) then + exit; end { pointer comperation and subtraction } |