{
    Copyright (c) 1998-2002, 2014 by Florian Klaempfl and Jonas Maebe

    Generate AArch64 assembler for math nodes

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

 ****************************************************************************
}
unit ncpumat;

{$i fpcdefs.inc}

interface

    uses
      node,nmat,ncgmat;

    type
      { AArch64 variant of the integer div/mod node: overrides the first pass
        and the code generation pass of tmoddivnode }
      taarch64moddivnode = class(tmoddivnode)
        function pass_1: tnode; override;
        procedure pass_generate_code; override;
      end;

      { AArch64 variant of the "not" node: only the boolean code generation
        of tcgnotnode is overridden }
      taarch64notnode = class(tcgnotnode)
        procedure second_boolean; override;
      end;

      { AArch64 variant of the unary minus node: only the floating point code
        generation of tcgunaryminusnode is overridden }
      taarch64unaryminusnode = class(tcgunaryminusnode)
        procedure second_float; override;
      end;

implementation

    uses
      globtype,systems,constexp,
      cutils,verbose,globals,
      symconst,symdef,
      aasmbase,aasmcpu,aasmtai,aasmdata,
      defutil,
      cgbase,cgobj,hlcgobj,pass_2,procinfo,
      ncon,
      cpubase,
      ncgutil,cgcpu,cgutils;

{*****************************************************************************
                             taarch64moddivnode
*****************************************************************************}

    { First pass of the div/mod node.  Runs the inherited pass and, when that
      did not hand back a replacement node, flags the current procedure with
      pi_do_call (pass_generate_code may emit a call to FPC_DIVBYZERO). }
    function taarch64moddivnode.pass_1: tnode;
      begin
        Result := inherited pass_1;

        { the inherited pass returned a node: leave the procedure flags alone }
        if Assigned(Result) then
          Exit;

        Include(current_procinfo.flags, pi_do_call);
      end;


    { Generates the AArch64 instructions for an integer "div" or "mod" node.

      Both operands are evaluated with secondpass; the numerator is forced
      into a register and the result is produced in a freshly allocated
      integer register (location = LOC_REGISTER).

      Constant divisor:
        - 1, -1, +/- a power of two, or any constant when optimising for size
          (cs_opt_size): handled by genOrdConstNodeDiv (move, NEG, shifts or a
          plain SDIV/UDIV), after which this routine exits early;
        - any other constant: multiplication by the reciprocal computed by
          calc_divconst_magic_signed/calc_divconst_magic_unsigned, except for
          signed "mod", which uses a plain division.
      Non-constant divisor:
        - SDIV/UDIV followed by a comparison of the divisor with zero and a
          call to FPC_DIVBYZERO when it is zero.

      For "mod" the remainder is derived from the quotient as
      numerator - quotient*divisor (MSUB, or SUB with a shifted operand for a
      positive power of two). }
    procedure taarch64moddivnode.pass_generate_code;
      var
         op         : tasmop;
         tmpreg,
         zeroreg,
         numerator,
         divider,
         largernumreg,
         largerresreg,
         resultreg  : tregister;
         hl         : tasmlabel;
         overflowloc: tlocation;
         power      : longint;
         opsize     : tcgsize;

         dividend   : Int64;
         high_bit,
         reciprocal : QWord;
         { Just to save on stack space and the like }
         reciprocal_signed : Int64 absolute reciprocal;

         expandword,
         magic_add  : Boolean;
         shift      : byte;

         shifterop  : tshifterop;
         hp         : taicpu;

       { Emits the quotient of numerator by the constant divisor into
         resultreg.  Special cases: 1 (register move), -1 (NEG, with the S
         postfix when overflow checking is on) and +/- powers of two (shifts).
         Everything else uses SDIV/UDIV, loading the constant into "divider"
         first when the caller has not done so already. }
       procedure genOrdConstNodeDiv;
         var
           helper1, helper2: TRegister;
           so: tshifterop;
         begin
           if tordconstnode(right).value=0 then
             internalerror(2020021601)
           else if tordconstnode(right).value=1 then
             cg.a_load_reg_reg(current_asmdata.CurrAsmList, opsize, opsize, numerator, resultreg)
           else if (tordconstnode(right).value = int64(-1)) then
             begin
               // note: only in the signed case possible..., may overflow
               if cs_check_overflow in current_settings.localswitches then
                 cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);

               { the postfix evaluates to PF_S when overflow checking is
                 enabled and to the postfix with ordinal value 0 otherwise }
               current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_NEG,
                 resultreg,numerator),toppostfix(ord(cs_check_overflow in current_settings.localswitches)*ord(PF_S))));
             end
           else if isabspowerof2(tordconstnode(right).value,power) then
             begin
               if (is_signed(right.resultdef)) then
                 begin
                    { Signed division by 2^power: add a correction taken from
                      the sign bits of the numerator (helper1 shifted right
                      logically by size-power bits), then shift right
                      arithmetically by power }
                    helper2:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                    if power = 1 then
                      helper1:=numerator
                    else
                      begin
                        helper1:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                        cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SAR,opsize,resultdef.size*8-1,numerator,helper1);
                      end;
                    shifterop_reset(so);
                    so.shiftmode:=SM_LSR;
                    so.shiftimm:=resultdef.size*8-power;
                    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,helper2,numerator,helper1,so));
                    cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SAR,def_cgsize(resultdef),power,helper2,resultreg);

                    if (tordconstnode(right).value < 0) then
                      { Invert the result }
                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_NEG,resultreg,resultreg));
                  end
                else
                  cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SHR,opsize,power,numerator,resultreg)
             end
           else
             { Generic division }
             begin
               if is_signed(left.resultdef) then
                 op:=A_SDIV
               else
                 op:=A_UDIV;

               { If we didn't acquire the original divisor earlier, grab it now }
               if divider = NR_NO then
                 begin
                   divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                   cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
                 end;

               current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,resultreg,numerator,divider));
             end;
         end;

       { Emits the overflow test for signed division when overflow checking
         is enabled and the divisor is either not a constant or the constant
         -1.  The outcome is handed to cg.g_overflowcheck_loc as a flags
         location with F_EQ. }
       procedure genOverflowCheck;
         begin
           { in case of overflow checking, also check for low(int64) div (-1)
             (no hardware support for this either) }
           if (cs_check_overflow in current_settings.localswitches) and
              is_signed(left.resultdef) and
              ((right.nodetype<>ordconstn) or
               (tordconstnode(right).value=-1)) then
             begin
               { num=ffff... and div=8000... <=>
                 num xor not(div xor 8000...) = 0
                 (and we have the "eon" operation, which performs "xor not(...)" }
               { NOTE(review): as written, both the XOR and the EON below read
                 "numerator"; the divisor register is never consulted, so the
                 value compared against zero does not depend on the divisor.
                 This does not match the formula in the comment above --
                 confirm the intended operands before relying on this check. }
               tmpreg:=hlcg.getintregister(current_asmdata.CurrAsmList,left.resultdef);
               hlcg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.resultdef,low(int64),numerator,tmpreg);
               current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_EON,
                 tmpreg,numerator,tmpreg));
               current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_CMP,tmpreg,0));
               { now the zero/equal flag is set in case we divided low(int64) by
                 (-1) }
               location_reset(overflowloc,LOC_FLAGS,OS_NO);
               overflowloc.resflags:=F_EQ;
               cg.g_overflowcheck_loc(current_asmdata.CurrAsmList,location,resultdef,overflowloc);
             end;
         end;

      begin
        secondpass(left);
        secondpass(right);
        { avoid warning }
        divider := NR_NO;
        largernumreg := NR_NO;
        expandword := False;
        { start from a cleared shifter operand, like genOrdConstNodeDiv does
          for its own record; only shiftmode/shiftimm are filled in below }
        shifterop_reset(shifterop);

        opsize := def_cgsize(resultdef);

        { set result location }
        location_reset(location,LOC_REGISTER,opsize);
        location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
        resultreg:=location.register;

        { put numerator in register }
        hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true);
        numerator:=left.location.register;

        if (right.nodetype=ordconstn) then
          begin
            { If optimising for size, just use regular division operations }
            if (cs_opt_size in current_settings.optimizerswitches) or
              ((tordconstnode(right).value=1) or
              (tordconstnode(right).value=int64(-1)) or
              isabspowerof2(tordconstnode(right).value,power)) then
              begin

                { Store divisor for later (and executed at the same time as the multiplication) }
                if (nodetype=modn) then
                  begin
                    if (tordconstnode(right).value = 1) or (tordconstnode(right).value = int64(-1)) then
                      begin
                        { Just evaluates to zero }
                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_MOVZ,resultreg, 0));
                        Exit;
                      end
                    { "not cs_opt_size" saves from checking the value of the divisor again
                      (if cs_opt_size is not set, then the divisor is a power of 2) }
                    else if not (cs_opt_size in current_settings.optimizerswitches) then
                      begin
                        divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                        cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
                      end
                  end;

                genOrdConstNodeDiv;
                genOverflowCheck;

                { in case of modulo, multiply result again by the divider and subtract
                  from the numerator }
                if (nodetype=modn) then
                  begin
                    if ispowerof2(tordconstnode(right).value,power) then
                      begin
                        { numerator - (quotient shl power) }
                        shifterop.shiftmode := SM_LSL;
                        shifterop.shiftimm := power;

                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_SUB,resultreg,numerator,resultreg,shifterop));
                      end
                    else
                      begin
                        { When optimising for size the divisor is not preloaded
                          above, and genOrdConstNodeDiv only loads it on its
                          generic division path.  A negative power of two
                          therefore arrives here without a divisor register,
                          so grab it now (same guard as at the end of this
                          routine). }
                        if divider = NR_NO then
                          begin
                            divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                            cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
                          end;

                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_reg(A_MSUB,resultreg,
                          resultreg,divider,numerator));
                      end;
                  end;

                Exit;
              end
            else
              begin
                if is_signed(left.resultdef) then
                  begin
                    if (nodetype=modn) then { Signed mod doesn't work properly }
                      begin
                        divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                        cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
                        genOrdConstNodeDiv;
                      end
                    else
                      begin
                        { Read signed value to avoid Internal Error 200706094 }
                        dividend := tordconstnode(right).value.svalue;

                        calc_divconst_magic_signed(resultdef.size * 8, dividend, reciprocal_signed, shift);
                        cg.a_load_const_reg(current_asmdata.CurrAsmList, opsize, reciprocal_signed, resultreg);

                        { SMULH is only available for the full 64-bit registers }
                        if opsize in [OS_64, OS_S64] then
                          begin
                            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_SMULH,resultreg,resultreg,numerator));
                            largerresreg := resultreg;
                          end
                        else
                          begin
                            { multiply in the whole-register views of the same
                              registers; the high-order word is extracted by a
                              shift by 32 further down }
                            largerresreg := newreg(getregtype(resultreg), getsupreg(resultreg), R_SUBWHOLE);
                            largernumreg := newreg(getregtype(numerator), getsupreg(numerator), R_SUBWHOLE);
                            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_MUL,largerresreg,largerresreg,largernumreg));
                            expandword := True; { Merge the shift operation with something below }
                          end;

                        { Store divisor for later (and executed at the same time as the multiplication) }
                        if nodetype=modn then
                          begin
                            divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                            cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,dividend,divider);
                          end;

                        { add or subtract dividend }
                        if (dividend > 0) and (reciprocal_signed < 0) then
                          begin
                            if expandword then
                              begin
                                shifterop.shiftmode := SM_ASR;
                                shifterop.shiftimm := 32;
                                expandword := False;
                                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,largerresreg,largernumreg,largerresreg,shifterop));
                              end
                            else
                              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_ADD,resultreg,resultreg,numerator));
                          end
                        else if (dividend < 0) and (reciprocal_signed > 0) then
                          begin
                            if expandword then
                              begin
                                { We can't append LSR to the SUB below because it's on the wrong operand }
                                expandword := False;
                                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_ASR,largerresreg,largerresreg,32));
                              end;

                            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_SUB,resultreg,resultreg,numerator));
                          end
                        else if expandword then
                          Inc(shift,32);

                        { shift if necessary }
                        if (shift <> 0) then
                          begin
                            if expandword then
                              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_ASR,largerresreg,largerresreg,shift))
                            else
                              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_ASR,resultreg,resultreg,shift));
                          end;

                        { extract and add the sign bit }
                        shifterop.shiftmode := SM_LSR;
                        shifterop.shiftimm := left.resultdef.size*8 - 1;

                        if (dividend < 0) then
                          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,resultreg,resultreg,resultreg,shifterop))
                        else
                          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,resultreg,resultreg,numerator,shifterop));
                      end;
                  end
                else
                  begin
                    calc_divconst_magic_unsigned(resultdef.size * 8, tordconstnode(right).value, reciprocal, magic_add, shift);
                    { Pass the reciprocal through its Int64 overlay: same bits,
                      but no implicit QWord -> signed conversion that a
                      range-checked compiler build could reject }
                    cg.a_load_const_reg(current_asmdata.CurrAsmList, opsize, reciprocal_signed, resultreg);

                    { UMULH is only available for the full 64-bit registers }
                    if opsize in [OS_64, OS_S64] then
                      begin
                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_UMULH,resultreg,resultreg,numerator));
                        largerresreg := resultreg;
                      end
                    else
                      begin
                        largerresreg := newreg(getregtype(resultreg), getsupreg(resultreg), R_SUBWHOLE);
                        largernumreg := newreg(getregtype(numerator), getsupreg(numerator), R_SUBWHOLE);
                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_MUL,largerresreg,largerresreg,largernumreg));
                        expandword := True; { Try to merge the shift operation with something below }
                      end;

                    { Store divisor for later (and executed at the same time as the multiplication) }
                    if (nodetype=modn) then
                      begin
                        divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                        cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
                      end;

                    if magic_add then
                      begin
                        { We can't append LSR to the ADD below because it would require extending the registers
                          and interfere with the carry bit }
                        if expandword then
                          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_LSR,largerresreg,largerresreg,32));

                        { Add the reciprocal to the high-order word, tracking the carry bit, shift, then
                          insert the carry bit via CSEL and ORR }

                        if opsize in [OS_64,OS_S64] then
                          zeroreg := NR_XZR
                        else
                          zeroreg := NR_WZR;

                        high_bit := QWord(1) shl ((resultdef.size * 8) - shift);

                        tmpreg := cg.getintregister(current_asmdata.CurrAsmList, opsize);
                        { explicit cast for the same reason as the reciprocal
                          above: high_bit may have its top bit set }
                        cg.a_load_const_reg(current_asmdata.CurrAsmList, opsize, Int64(high_bit), tmpreg);

                        { Generate ADDS instruction }
                        hp := taicpu.op_reg_reg_reg(A_ADD,resultreg,resultreg,numerator);
                        hp.oppostfix := PF_S;
                        current_asmdata.CurrAsmList.concat(hp);

                        { keep high_bit only when the ADDS produced a carry }
                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_cond(A_CSEL,tmpreg,tmpreg,zeroreg, C_CS));

                        shifterop.shiftmode := SM_LSR;
                        shifterop.shiftimm := shift;

                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,resultreg,tmpreg,resultreg,shifterop));
                      end
                    else if expandword then
                      { Include the right-shift by 32 to get the high-order DWord }
                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_LSR,largerresreg,largerresreg,shift + 32))
                    else
                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_LSR,resultreg,resultreg,shift));
                  end;

              end;

          end
        { no divide-by-zero detection available in hardware, emulate (if it's a
          constant, this will have been detected earlier already) }
        else
          begin
            { load divider in a register }
            hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true);
            divider:=right.location.register;

            { ARM-64 developer guides recommend checking for division by zero conditions
              AFTER the division, since the check and the division can be done in tandem }
            if is_signed(left.resultdef) then
              op:=A_SDIV
            else
              op:=A_UDIV;

            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,resultreg,numerator,divider));

            { skip the FPC_DIVBYZERO call when the divisor is non-zero }
            current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_CMP,divider,0));
            current_asmdata.getjumplabel(hl);
            current_asmdata.CurrAsmList.concat(taicpu.op_cond_sym(A_B,C_NE,hl));
            cg.a_call_name(current_asmdata.CurrAsmList,'FPC_DIVBYZERO',false);
            cg.a_label(current_asmdata.CurrAsmList,hl);
          end;

        genOverflowCheck;

        { in case of modulo, multiply result again by the divider and subtract
          from the numerator }
        if (nodetype=modn) then
          begin
            { If we didn't acquire the original divisor earlier, grab it now }
            if divider = NR_NO then
              begin
                divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
              end;

            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_reg(A_MSUB,resultreg,
              resultreg,divider,numerator));
          end;
    end;


{*****************************************************************************
                               taarch64notnode
*****************************************************************************}

    { Generates code for a boolean "not".  Unless handle_locjump reports that
      nothing is left to do, the result is always a flags location: either the
      operand's flags inverted, or F_EQ after comparing the operand's value
      with zero. }
    procedure taarch64notnode.second_boolean;
      var
        operandreg: tregister;
      begin
        secondpass(left);

        { nothing more to emit here when handle_locjump returns true }
        if handle_locjump then
          exit;

        if left.location.loc=LOC_FLAGS then
          begin
            { operand already lives in the flags: reuse it with the
              condition inverted }
            location_copy(location,left.location);
            inverse_flags(location.resflags);
          end
        else if left.location.loc in [LOC_REGISTER,LOC_CREGISTER,
                                      LOC_REFERENCE,LOC_CREFERENCE,
                                      LOC_SUBSETREG,LOC_CSUBSETREG,
                                      LOC_SUBSETREF,LOC_CSUBSETREF] then
          begin
            { bring the operand into a register and test it against zero;
              "equal" then stands for the negated value }
            hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true);
            operandreg:=left.location.register;
            current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_CMP,operandreg,0));
            location_reset(location,LOC_FLAGS,OS_NO);
            location.resflags:=F_EQ;
          end
        else
          internalerror(2003042401);
      end;


{*****************************************************************************
                                   taarch64unaryminusnode
*****************************************************************************}

    { Generates code for a floating point unary minus: the operand is forced
      into a scalar MM register and negated with FNEG into a newly allocated
      MM register, followed by the optional FPU exception check. }
    procedure taarch64unaryminusnode.second_float;
      var
        operandreg: tregister;
      begin
        { evaluate the operand and make sure it sits in an MM register }
        secondpass(left);
        hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
        operandreg:=left.location.register;

        { allocate the destination }
        location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
        location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);

        { negate }
        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_FNEG,location.register,operandreg));
        cg.maybe_check_for_fpu_exception(current_asmdata.CurrAsmList);
      end;

begin
   { install the AArch64 node classes defined in this unit }
   cunaryminusnode:=taarch64unaryminusnode;
   cnotnode:=taarch64notnode;
   cmoddivnode:=taarch64moddivnode;
end.