4 files changed, 115 insertions, 22 deletions
diff --git a/compiler/aarch64/agcpugas.pas b/compiler/aarch64/agcpugas.pas
index 2487fa1cb1..8026765fc3 100644
--- a/compiler/aarch64/agcpugas.pas
+++ b/compiler/aarch64/agcpugas.pas
@@ -41,10 +41,12 @@ unit agcpugas;
 
       TAArch64Assembler=class(TGNUassembler)
         constructor CreateWithWriter(info: pasminfo; wr: TExternalAssemblerOutputFile; freewriter, smart: boolean); override;
+        function MakeCmdLine: TCmdStr; override; { fills in the $MARCHOPT placeholder of the assembler command line }
       end;
 
       TAArch64AppleAssembler=class(TAppleGNUassembler)
         constructor CreateWithWriter(info: pasminfo; wr: TExternalAssemblerOutputFile; freewriter, smart: boolean); override;
+        function MakeCmdLine: TCmdStr; override; { fills in the $MARCHOPT placeholder of the assembler command line }
       end;
 
       TAArch64ClangGASAssembler=class(TAArch64Assembler)
@@ -53,6 +55,7 @@ unit agcpugas;
       protected
         function sectionflags(secflags:TSectionFlags):string;override;
       public
+        function MakeCmdLine: TCmdStr; override;
         procedure WriteAsmList; override;
       end;
 
@@ -65,6 +68,18 @@ unit agcpugas;
     const
       cputype_to_gas_march : array[tcputype] of string = (
         '', // cpu_none
+        '', // plain 'armv8' is not accepted by the GNU assembler; an empty entry means no -march option is passed
+        'armv8-a',
+        'armv8.1-a',
+        'armv8.2-a',
+        'armv8.3-a',
+        'armv8.4-a',
+        'armv8.5-a',
+        'armv8.6-a'
+      );
+
+      cputype_to_clang_march : array[tcputype] of string = (
+        '', // cpu_none
         'armv8',
         'armv8-a',
         'armv8.1-a',
@@ -94,6 +109,15 @@ unit agcpugas;
         InstrWriter := TAArch64InstrWriter.create(self);
       end;
 
+    function TAArch64Assembler.MakeCmdLine: TCmdStr; { expands the $MARCHOPT placeholder for the GNU assembler }
+      begin
+        Result:=inherited MakeCmdLine;
+        if cputype_to_gas_march[current_settings.cputype]='' then
+          Replace(Result,'$MARCHOPT','') { no -march name for this CPU type: just drop the placeholder }
+        else
+          Replace(Result,'$MARCHOPT','-march='+cputype_to_gas_march[current_settings.cputype]);
+      end;
+
 {****************************************************************************}
 {                      Apple AArch64 Assembler writer                        }
 {****************************************************************************}
@@ -105,10 +129,24 @@ unit agcpugas;
       end;
 
 
+    function TAArch64AppleAssembler.MakeCmdLine: TCmdStr; { expands the $MARCHOPT placeholder for the Apple (clang) assembler }
+      begin
+        result:=inherited MakeCmdLine;
+        if cputype_to_clang_march[current_settings.cputype] <> '' then { iOS/Darwin command line runs clang, so use the clang -march names; NOTE(review): confirm class registration }
+          Replace(result,'$MARCHOPT','-march='+cputype_to_clang_march[current_settings.cputype])
+        else
+          Replace(result,'$MARCHOPT','');
+      end;
+
 {****************************************************************************}
 {                      CLang AArch64 Assembler writer                        }
 {****************************************************************************}
 
+    function TAArch64ClangGASAssembler.MakeCmdLine: TCmdStr;
+      begin
+        result:=inherited MakeCmdLine; { NOTE(review): parent TAArch64Assembler fills $MARCHOPT from cputype_to_gas_march, not cputype_to_clang_march - confirm this is intended }
+      end;
+
     procedure TAArch64ClangGASAssembler.TransformSEHDirectives(list:TAsmList);
 
       function convert_unwinddata(list:tasmlist):tdynamicarray;
@@ -773,7 +811,7 @@ unit agcpugas;
             id     : as_gas;
             idtxt  : 'AS';
             asmbin : 'as';
-            asmcmd : '-o $OBJ $EXTRAOPT $ASM';
+            asmcmd : '-o $OBJ $MARCHOPT $EXTRAOPT $ASM';
             supported_targets : [system_aarch64_freebsd,system_aarch64_linux,system_aarch64_android];
             flags : [af_needar,af_smartlink_sections];
             labelprefix : '.L';
@@ -787,7 +825,7 @@ unit agcpugas;
             id     : as_clang_asdarwin;
             idtxt  : 'CLANG';
             asmbin : 'clang';
-            asmcmd : '-x assembler -c -target $TRIPLET -o $OBJ $EXTRAOPT -x assembler $ASM';
+            asmcmd : '-x assembler -c -target $TRIPLET -o $OBJ $MARCHOPT $EXTRAOPT -x assembler $ASM';
             supported_targets : [system_aarch64_ios,system_aarch64_darwin];
             flags : [af_needar,af_smartlink_sections,af_supports_dwarf,af_llvm,af_supports_hlcfi];
             labelprefix : 'L';
@@ -801,7 +839,7 @@ unit agcpugas;
             id     : as_clang_gas;
             idtxt  : 'CLANG';
             asmbin : 'clang';
-            asmcmd : '-x assembler -c -target $TRIPLET -o $OBJ $EXTRAOPT -x assembler $ASM';
+            asmcmd : '-x assembler -c -target $TRIPLET -o $OBJ $MARCHOPT $EXTRAOPT -x assembler $ASM';
             supported_targets : [system_aarch64_win64];
             flags : [af_needar,af_smartlink_sections,af_supports_dwarf,af_llvm,af_supports_hlcfi];
             labelprefix : '.L';
diff --git a/compiler/aarch64/aoptcpu.pas b/compiler/aarch64/aoptcpu.pas
index 4ef898284e..dc5e327cf2 100644
--- a/compiler/aarch64/aoptcpu.pas
+++ b/compiler/aarch64/aoptcpu.pas
@@ -379,15 +379,23 @@ Implementation
                          taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
                          shifterop);
 
+                { Make sure the register used in the shifting is tracked all
+                  the way through, otherwise it may become deallocated while
+                  it's still live and cause incorrect optimisations later }
+                if (taicpu(hp1).oper[0]^.reg <> taicpu(p).oper[1]^.reg) then
+                  begin
+                    TransferUsedRegs(TmpUsedRegs);
+                    UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
+                    ALlocRegBetween(taicpu(p).oper[1]^.reg, p, hp1, TmpUsedRegs);
+                  end;
+
                 taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
                 asml.insertbefore(hp2, hp1);
-                GetNextInstruction(p, hp2);
-                asml.remove(p);
-                asml.remove(hp1);
-                p.free;
-                hp1.free;
-                p:=hp2;
-                DebugMsg('Peephole FoldShiftProcess done', p);
+
+                RemoveInstruction(hp1);
+                RemoveCurrentp(p);
+
+                DebugMsg('Peephole FoldShiftProcess done', hp2);
                 Result:=true;
                 break;
               end;
diff --git a/compiler/aarch64/hlcgcpu.pas b/compiler/aarch64/hlcgcpu.pas
index 593c202ef8..9de4e55ad7 100644
--- a/compiler/aarch64/hlcgcpu.pas
+++ b/compiler/aarch64/hlcgcpu.pas
@@ -210,7 +210,8 @@ implementation
       if slopt in [SL_SETZERO,SL_SETMAX] then
         inherited
       else if not(sreg.bitlen in [32,64]) or
-              (sreg.startbit<>0) then
+              (sreg.startbit<>0) or
+              (getsubreg(fromreg)<getsubreg(sreg.subsetreg)) then
         begin
           makeregssamesize(list,def_cgsize(fromsize),sreg.subsetregsize,fromreg,sreg.subsetreg,fromreg,toreg);
           list.concat(taicpu.op_reg_reg_const_const(A_BFI,toreg,fromreg,sreg.startbit,sreg.bitlen))
diff --git a/compiler/aarch64/ncpuinl.pas b/compiler/aarch64/ncpuinl.pas
index a2e5f1352f..1ea4537b3b 100644
--- a/compiler/aarch64/ncpuinl.pas
+++ b/compiler/aarch64/ncpuinl.pas
@@ -1,7 +1,7 @@
 {
     Copyright (c) 1998-2002 by Florian Klaempfl
 
-    Generates ARM inline nodes
+    Generates AArch64 inline nodes
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -35,6 +35,8 @@ interface
         function first_sqrt_real: tnode; override;
         function first_round_real: tnode; override;
         function first_trunc_real: tnode; override;
+        function first_int_real: tnode; override;
+        function first_frac_real: tnode; override;
         function first_fma : tnode; override;
         procedure second_abs_real; override;
         procedure second_sqr_real; override;
@@ -42,6 +44,8 @@ interface
         procedure second_abs_long; override;
         procedure second_round_real; override;
         procedure second_trunc_real; override;
+        procedure second_int_real; override;
+        procedure second_frac_real; override;
         procedure second_get_frame; override;
         procedure second_fma; override;
         procedure second_prefetch; override;
@@ -108,16 +112,31 @@ implementation
       end;
 
 
-     function taarch64inlinenode.first_fma : tnode;
-       begin
-         if ((is_double(resultdef)) or (is_single(resultdef))) then
-           begin
-             expectloc:=LOC_MMREGISTER;
-             Result:=nil;
-           end
-         else
-           Result:=inherited first_fma;
-       end;
+    function taarch64inlinenode.first_int_real : tnode;
+      begin
+        result:=nil; { nil result: no replacement node is handed back }
+        expectloc:=LOC_MMREGISTER; { the value is expected in an MM register }
+      end;
+
+
+    function taarch64inlinenode.first_frac_real : tnode;
+      begin
+        result:=nil; { nil result: no replacement node is handed back }
+        expectloc:=LOC_MMREGISTER; { the value is expected in an MM register }
+      end;
+
+
+    function taarch64inlinenode.first_fma : tnode;
+      begin
+        if not(is_double(resultdef) or is_single(resultdef)) then
+          Result:=inherited first_fma { any other result type: defer to the parent class }
+        else
+          begin
+            Result:=nil;
+            expectloc:=LOC_MMREGISTER; { single/double: the value is expected in an MM register }
+          end;
+      end;
+
 
     procedure taarch64inlinenode.second_abs_real;
       begin
@@ -187,6 +206,33 @@ implementation
       end;
 
 
+    { Second pass for the "int" inline: forces the operand into a scalar MM register
+      and emits FRINTZ (round to integral, toward zero) into a freshly allocated MM register. }
+    procedure taarch64inlinenode.second_int_real;
+      begin
+        secondpass(left);
+        hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
+        location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
+        location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
+        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_FRINTZ,location.register,left.location.register));
+        cg.maybe_check_for_fpu_exception(current_asmdata.CurrAsmList);
+      end;
+
+
+    { Second pass for the "frac" inline: FRINTZ puts the integral part of the operand into
+      the result register, then FSUB overwrites it with operand minus that integral part. }
+    procedure taarch64inlinenode.second_frac_real;
+      begin
+        secondpass(left);
+        hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
+        location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
+        location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
+        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_FRINTZ,location.register,left.location.register));
+        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_FSUB,location.register,left.location.register,location.register));
+        cg.maybe_check_for_fpu_exception(current_asmdata.CurrAsmList);
+      end;
+
+
     procedure taarch64inlinenode.second_get_frame;
       begin
         location_reset(location,LOC_CREGISTER,OS_ADDR);