author     bstarynk <bstarynk@138bc75d-0d04-0410-961f-82ee72b054a4>   2011-11-28 14:03:29 +0000
committer  bstarynk <bstarynk@138bc75d-0d04-0410-961f-82ee72b054a4>   2011-11-28 14:03:29 +0000
commit     f50671fe14ad06fc6793753f05c03600c2fb9903 (patch)
tree       48f977d57ea2f4163dd60d917bfd1230174a7219 /gcc/config
parent     506cd07ff0a6af7ed5de17de608e7d4ca48480c8 (diff)
download   gcc-f50671fe14ad06fc6793753f05c03600c2fb9903.tar.gz
2011-11-28  Basile Starynkevitch  <basile@starynkevitch.net>

    MELT branch merged with trunk rev 181775 using svnmerge

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/melt-branch@181777 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/config')
-rw-r--r--  gcc/config/arm/arm.c           |    4
-rw-r--r--  gcc/config/avr/avr-c.c         |    2
-rw-r--r--  gcc/config/avr/avr-log.c       |   86
-rw-r--r--  gcc/config/avr/avr-protos.h    |    2
-rw-r--r--  gcc/config/avr/avr.c           |  473
-rw-r--r--  gcc/config/avr/avr.md          |   42
-rw-r--r--  gcc/config/i386/i386-opts.h    |    3
-rw-r--r--  gcc/config/i386/i386.c         | 1171
-rw-r--r--  gcc/config/i386/i386.h         |    8
-rw-r--r--  gcc/config/i386/i386.opt       |    3
-rw-r--r--  gcc/config/i386/sse.md         |   20
-rw-r--r--  gcc/config/i386/sync.md        |    4
-rw-r--r--  gcc/config/m68k/linux.h        |    4
-rw-r--r--  gcc/config/m68k/m68k.c         |    8
-rw-r--r--  gcc/config/m68k/m68k.md        |    9
-rw-r--r--  gcc/config/m68k/sync.md        |   80
-rw-r--r--  gcc/config/mips/mips-protos.h  |    3
-rw-r--r--  gcc/config/mips/mips.c         |   70
-rw-r--r--  gcc/config/mips/mips.md        |  332
-rw-r--r--  gcc/config/mips/predicates.md  |    9
-rw-r--r--  gcc/config/pa/pa-linux.h       |    1
21 files changed, 1217 insertions, 1117 deletions
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index e3b0b883ec8..ee26c51b0a0 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -1096,6 +1096,10 @@ arm_set_fixed_conv_libfunc (convert_optab optable, enum machine_mode to,
static void
arm_init_libfuncs (void)
{
+ /* For Linux, we have access to kernel support for atomic operations. */
+ if (arm_abi == ARM_ABI_AAPCS_LINUX)
+ init_sync_libfuncs (2 * UNITS_PER_WORD);
+
/* There are no special library functions unless we are using the
ARM BPABI. */
if (!TARGET_BPABI)
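
For reference, init_sync_libfuncs registers out-of-line __sync_* helpers for operand sizes up to 2 * UNITS_PER_WORD, which on arm-linux are backed by kernel-assisted atomics. A minimal sketch of user code that benefits (hypothetical example, not part of the patch):

    /* With the hunk above, cores without native ldrex/strex still get
       atomics: GCC emits a call to the __sync_fetch_and_add_4 libfunc.  */
    #include <stdint.h>

    static int32_t counter;

    int32_t
    bump (void)
    {
      return __sync_fetch_and_add (&counter, 1);
    }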
diff --git a/gcc/config/avr/avr-c.c b/gcc/config/avr/avr-c.c
index f0b3a628c2c..fd03b361b5e 100644
--- a/gcc/config/avr/avr-c.c
+++ b/gcc/config/avr/avr-c.c
@@ -136,6 +136,8 @@ avr_cpu_cpp_builtins (struct cpp_reader *pfile)
cpp_define (pfile, "__BUILTIN_AVR_WDR");
cpp_define (pfile, "__BUILTIN_AVR_SLEEP");
cpp_define (pfile, "__BUILTIN_AVR_SWAP");
+ cpp_define (pfile, "__BUILTIN_AVR_MAP8");
+ cpp_define (pfile, "__BUILTIN_AVR_MAP16");
cpp_define (pfile, "__BUILTIN_AVR_DELAY_CYCLES");
cpp_define (pfile, "__BUILTIN_AVR_FMUL");
diff --git a/gcc/config/avr/avr-log.c b/gcc/config/avr/avr-log.c
index 2f6b0aa6519..3c4bccfa282 100644
--- a/gcc/config/avr/avr-log.c
+++ b/gcc/config/avr/avr-log.c
@@ -49,6 +49,8 @@
C: enum rtx_code
m: enum machine_mode
R: enum reg_class
+ D: double_int (signed decimal)
+ X: double_int (unsigned hex)
L: insn list
H: location_t
@@ -82,9 +84,9 @@ static void avr_log_vadump (FILE*, const char*, va_list);
/* As we have no variadic macros, avr_edump maps to a call to
avr_log_set_caller_e which saves __FUNCTION__ to avr_log_caller and
- returns a function pointer to avr_log_fdump_e. avr_fdump_e
+ returns a function pointer to avr_log_fdump_e. avr_log_fdump_e
gets the printf-like arguments and calls avr_log_vadump, the
- worker function. avr_fdump works the same way. */
+ worker function. avr_fdump works the same way. */
/* Provide avr_log_fdump_e/f so that avr_log_set_caller_e/_f can return
their address. */
@@ -135,6 +137,49 @@ avr_log_set_caller_f (const char *caller)
return avr_log_fdump_f;
}
+
+/* Copy-paste from double-int.c:double_int_split_digit (it's static there).
+ Splits last digit of *CST (taken as unsigned) in BASE and returns it. */
+
+static unsigned
+avr_double_int_pop_digit (double_int *cst, unsigned base)
+{
+ unsigned HOST_WIDE_INT resl, reml;
+ HOST_WIDE_INT resh, remh;
+
+ div_and_round_double (FLOOR_DIV_EXPR, true, cst->low, cst->high, base, 0,
+ &resl, &resh, &reml, &remh);
+ cst->high = resh;
+ cst->low = resl;
+
+ return reml;
+}
+
+
+/* Dump VAL as hex value to FILE. */
+
+static void
+avr_dump_double_int_hex (FILE *file, double_int val)
+{
+ unsigned digit[4];
+
+ digit[0] = avr_double_int_pop_digit (&val, 1 << 16);
+ digit[1] = avr_double_int_pop_digit (&val, 1 << 16);
+ digit[2] = avr_double_int_pop_digit (&val, 1 << 16);
+ digit[3] = avr_double_int_pop_digit (&val, 1 << 16);
+
+ fprintf (file, "0x");
+
+ if (digit[3] | digit[2])
+ fprintf (file, "%04x%04x", digit[3], digit[2]);
+
+ if (digit[3] | digit[2] | digit[1] | digit[0])
+ fprintf (file, "%04x%04x", digit[1], digit[0]);
+ else
+ fprintf (file, "0");
+}
+
+
/* Worker function implementing the %-codes and forwarding to
respective print/dump function. */
@@ -189,6 +234,14 @@ avr_log_vadump (FILE *file, const char *fmt, va_list ap)
fprintf (file, "%d", va_arg (ap, int));
break;
+ case 'D':
+ dump_double_int (file, va_arg (ap, double_int), false);
+ break;
+
+ case 'X':
+ avr_dump_double_int_hex (file, va_arg (ap, double_int));
+ break;
+
case 'x':
fprintf (file, "%x", va_arg (ap, int));
break;
@@ -251,7 +304,7 @@ avr_log_vadump (FILE *file, const char *fmt, va_list ap)
location_t loc = va_arg (ap, location_t);
if (BUILTINS_LOCATION == loc)
- fprintf (file, "<BUILTIN-LOCATION");
+ fprintf (file, "<BUILTIN-LOCATION>");
else if (UNKNOWN_LOCATION == loc)
fprintf (file, "<UNKNOWN-LOCATION>");
else
@@ -306,21 +359,33 @@ avr_log_vadump (FILE *file, const char *fmt, va_list ap)
void
avr_log_set_avr_log (void)
{
- if (avr_log_details)
+ bool all = TARGET_ALL_DEBUG != 0;
+
+ if (all || avr_log_details)
{
/* Adding , at beginning and end of string makes searching easier. */
char *str = (char*) alloca (3 + strlen (avr_log_details));
+ bool info;
str[0] = ',';
strcat (stpcpy (str+1, avr_log_details), ",");
-
-#define SET_DUMP_DETAIL(S) \
- avr_log.S = (TARGET_ALL_DEBUG \
- || NULL != strstr (str, "," #S ",") \
- || NULL != strstr (str, ",all,"))
+
+ all |= NULL != strstr (str, ",all,");
+ info = NULL != strstr (str, ",?,");
+
+ if (info)
+ fprintf (stderr, "\n-mlog=");
+
+#define SET_DUMP_DETAIL(S) \
+ do { \
+ avr_log.S = (all || NULL != strstr (str, "," #S ",")); \
+ if (info) \
+ fprintf (stderr, #S ","); \
+ } while (0)
SET_DUMP_DETAIL (address_cost);
+ SET_DUMP_DETAIL (builtin);
SET_DUMP_DETAIL (constraints);
SET_DUMP_DETAIL (legitimate_address_p);
SET_DUMP_DETAIL (legitimize_address);
@@ -329,5 +394,8 @@ avr_log_set_avr_log (void)
SET_DUMP_DETAIL (rtx_costs);
#undef SET_DUMP_DETAIL
+
+ if (info)
+ fprintf (stderr, "?\n\n");
}
}
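
The %X code above pops four 16-bit digits off a double_int and suppresses the upper 32 bits when they are zero. A standalone model using plain uint64_t (illustration only, not compiler source):

    #include <stdio.h>
    #include <stdint.h>

    /* Model of avr_double_int_pop_digit: split off the last base-B digit.  */
    static unsigned
    pop_digit (uint64_t *cst, unsigned base)
    {
      unsigned rem = (unsigned) (*cst % base);
      *cst /= base;
      return rem;
    }

    /* Model of avr_dump_double_int_hex.  */
    static void
    dump_hex (FILE *file, uint64_t val)
    {
      unsigned digit[4];
      int i;

      for (i = 0; i < 4; i++)
        digit[i] = pop_digit (&val, 1u << 16);

      fprintf (file, "0x");

      if (digit[3] | digit[2])
        fprintf (file, "%04x%04x", digit[3], digit[2]);

      if (digit[3] | digit[2] | digit[1] | digit[0])
        fprintf (file, "%04x%04x", digit[1], digit[0]);
      else
        fprintf (file, "0");
    }

    int
    main (void)
    {
      dump_hex (stdout, 0xdeadbeefull);   /* prints "0xdeadbeef" */
      putchar ('\n');
      return 0;
    }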
diff --git a/gcc/config/avr/avr-protos.h b/gcc/config/avr/avr-protos.h
index 22b1548ed66..bafd794a302 100644
--- a/gcc/config/avr/avr-protos.h
+++ b/gcc/config/avr/avr-protos.h
@@ -92,6 +92,7 @@ extern const char* avr_out_plus_noclobber (rtx*, int*, int*);
extern const char* avr_out_addto_sp (rtx*, int*);
extern const char* avr_out_xload (rtx, rtx*, int*);
extern const char* avr_out_movmem (rtx, rtx*, int*);
+extern const char* avr_out_map_bits (rtx, rtx*, int*);
extern bool avr_popcount_each_byte (rtx, int, int);
extern int extra_constraint_Q (rtx x);
@@ -142,6 +143,7 @@ extern void avr_log_set_avr_log (void);
typedef struct
{
unsigned address_cost :1;
+ unsigned builtin :1;
unsigned constraints :1;
unsigned legitimate_address_p :1;
unsigned legitimize_address :1;
diff --git a/gcc/config/avr/avr.c b/gcc/config/avr/avr.c
index 543bb9c56c1..bb4a08dad71 100644
--- a/gcc/config/avr/avr.c
+++ b/gcc/config/avr/avr.c
@@ -1795,8 +1795,9 @@ print_operand_address (FILE *file, rtx addr)
}
-/* Output X as assembler operand to file FILE. */
-
+/* Output X as assembler operand to file FILE.
+ For a description of supported %-codes, see top of avr.md. */
+
void
print_operand (FILE *file, rtx x, int code)
{
@@ -1815,6 +1816,31 @@ print_operand (FILE *file, rtx x, int code)
if (AVR_HAVE_EIJMP_EICALL)
fputc ('e', file);
}
+ else if (code == 't'
+ || code == 'T')
+ {
+ static int t_regno = -1;
+ static int t_nbits = -1;
+
+ if (REG_P (x) && t_regno < 0 && code == 'T')
+ {
+ t_regno = REGNO (x);
+ t_nbits = GET_MODE_BITSIZE (GET_MODE (x));
+ }
+ else if (CONST_INT_P (x) && t_regno >= 0
+ && IN_RANGE (INTVAL (x), 0, t_nbits - 1))
+ {
+ int bpos = INTVAL (x);
+
+ fprintf (file, "%s", reg_names[t_regno + bpos / 8]);
+ if (code == 'T')
+ fprintf (file, ",%d", bpos % 8);
+
+ t_regno = -1;
+ }
+ else
+ fatal_insn ("operands to %T/%t must be reg + const_int:", x);
+ }
else if (REG_P (x))
{
if (x == zero_reg_rtx)
@@ -1822,9 +1848,32 @@ print_operand (FILE *file, rtx x, int code)
else
fprintf (file, reg_names[true_regnum (x) + abcd]);
}
- else if (GET_CODE (x) == CONST_INT)
- fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x) + abcd);
- else if (GET_CODE (x) == MEM)
+ else if (CONST_INT_P (x))
+ {
+ HOST_WIDE_INT ival = INTVAL (x);
+
+ if ('i' != code)
+ fprintf (file, HOST_WIDE_INT_PRINT_DEC, ival + abcd);
+ else if (low_io_address_operand (x, VOIDmode)
+ || high_io_address_operand (x, VOIDmode))
+ {
+ switch (ival)
+ {
+ case RAMPZ_ADDR: fprintf (file, "__RAMPZ__"); break;
+ case SREG_ADDR: fprintf (file, "__SREG__"); break;
+ case SP_ADDR: fprintf (file, "__SP_L__"); break;
+ case SP_ADDR+1: fprintf (file, "__SP_H__"); break;
+
+ default:
+ fprintf (file, HOST_WIDE_INT_PRINT_HEX,
+ ival - avr_current_arch->sfr_offset);
+ break;
+ }
+ }
+ else
+ fatal_insn ("bad address, not an I/O address:", x);
+ }
+ else if (MEM_P (x))
{
rtx addr = XEXP (x, 0);
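
The %i case above maps a CONST_INT data-space address to the I/O address expected by IN/OUT, either by name for well-known SFRs or by subtracting the architecture's sfr_offset. A toy model (the 0x20 offset and the 0x5f SREG address are assumptions for classic AVR cores, not taken from the patch):

    #include <stdio.h>

    #define MY_SREG_ADDR 0x5f   /* data-space SREG on many classic AVRs */

    static void
    print_io_addr (unsigned ival)
    {
      const unsigned sfr_offset = 0x20;   /* assumed classic-core offset */

      if (ival == MY_SREG_ADDR)
        printf ("__SREG__");
      else
        printf ("0x%x", ival - sfr_offset);
    }

    int
    main (void)
    {
      print_io_addr (0x3e);   /* -> "0x1e", usable in IN/OUT */
      putchar ('\n');
      return 0;
    }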
@@ -1844,21 +1893,7 @@ print_operand (FILE *file, rtx x, int code)
}
else if (code == 'i')
{
- if (!io_address_operand (addr, GET_MODE (x)))
- fatal_insn ("bad address, not an I/O address:", addr);
-
- switch (INTVAL (addr))
- {
- case RAMPZ_ADDR: fprintf (file, "__RAMPZ__"); break;
- case SREG_ADDR: fprintf (file, "__SREG__"); break;
- case SP_ADDR: fprintf (file, "__SP_L__"); break;
- case SP_ADDR+1: fprintf (file, "__SP_H__"); break;
-
- default:
- fprintf (file, HOST_WIDE_INT_PRINT_HEX,
- UINTVAL (addr) - avr_current_arch->sfr_offset);
- break;
- }
+ print_operand (file, addr, 'i');
}
else if (code == 'o')
{
@@ -1889,6 +1924,10 @@ print_operand (FILE *file, rtx x, int code)
else
print_operand_address (file, addr);
}
+ else if (code == 'i')
+ {
+ fatal_insn ("bad address, not an I/O address:", x);
+ }
else if (code == 'x')
{
/* Constant progmem address - like used in jmp or call */
@@ -6390,6 +6429,8 @@ adjust_insn_length (rtx insn, int len)
case ADJUST_LEN_CALL: len = AVR_HAVE_JMP_CALL ? 2 : 1; break;
+ case ADJUST_LEN_MAP_BITS: avr_out_map_bits (insn, op, &len); break;
+
default:
gcc_unreachable();
}
@@ -9844,6 +9885,345 @@ avr_expand_delay_cycles (rtx operands0)
}
}
+
+/* Return VAL * BASE + DIGIT. BASE = 0 is shortcut for BASE = 2^{32} */
+
+static double_int
+avr_double_int_push_digit (double_int val, int base,
+ unsigned HOST_WIDE_INT digit)
+{
+ val = 0 == base
+ ? double_int_lshift (val, 32, 64, false)
+ : double_int_mul (val, uhwi_to_double_int (base));
+
+ return double_int_add (val, uhwi_to_double_int (digit));
+}
+
+
+/* Compute the image of x under f, i.e. perform x --> f(x) */
+
+static int
+avr_map (double_int f, int x)
+{
+ return 0xf & double_int_to_uhwi (double_int_rshift (f, 4*x, 64, false));
+}
+
+
+/* Return the map R that reverses the bits of byte B.
+
+ R(0) = (0 7) o (1 6) o (2 5) o (3 4)
+ R(1) = (8 15) o (9 14) o (10 13) o (11 12)
+
+ Notice that R o R = id. */
+
+static double_int
+avr_revert_map (int b)
+{
+ int i;
+ double_int r = double_int_zero;
+
+ for (i = 16-1; i >= 0; i--)
+ r = avr_double_int_push_digit (r, 16, i >> 3 == b ? i ^ 7 : i);
+
+ return r;
+}
+
+
+/* Return the map R that swaps bit-chunks of size SIZE in byte B.
+
+ R(1,0) = (0 1) o (2 3) o (4 5) o (6 7)
+ R(1,1) = (8 9) o (10 11) o (12 13) o (14 15)
+
+ R(4,0) = (0 4) o (1 5) o (2 6) o (3 7)
+ R(4,1) = (8 12) o (9 13) o (10 14) o (11 15)
+
+ Notice that R o R = id. */
+
+static double_int
+avr_swap_map (int size, int b)
+{
+ int i;
+ double_int r = double_int_zero;
+
+ for (i = 16-1; i >= 0; i--)
+ r = avr_double_int_push_digit (r, 16, i ^ (i >> 3 == b ? size : 0));
+
+ return r;
+}
+
+
+/* Return Identity. */
+
+static double_int
+avr_id_map (void)
+{
+ int i;
+ double_int r = double_int_zero;
+
+ for (i = 16-1; i >= 0; i--)
+ r = avr_double_int_push_digit (r, 16, i);
+
+ return r;
+}
+
+
+enum
+ {
+ SIG_ID = 0,
+ /* for QI and HI */
+ SIG_ROL = 0xf,
+ SIG_REVERT_0 = 1 << 4,
+ SIG_SWAP1_0 = 1 << 5,
+ /* HI only */
+ SIG_REVERT_1 = 1 << 6,
+ SIG_SWAP1_1 = 1 << 7,
+ SIG_SWAP4_0 = 1 << 8,
+ SIG_SWAP4_1 = 1 << 9
+ };
+
+
+/* Return basic map with signature SIG. */
+
+static double_int
+avr_sig_map (int n ATTRIBUTE_UNUSED, int sig)
+{
+ if (sig == SIG_ID) return avr_id_map ();
+ else if (sig == SIG_REVERT_0) return avr_revert_map (0);
+ else if (sig == SIG_REVERT_1) return avr_revert_map (1);
+ else if (sig == SIG_SWAP1_0) return avr_swap_map (1, 0);
+ else if (sig == SIG_SWAP1_1) return avr_swap_map (1, 1);
+ else if (sig == SIG_SWAP4_0) return avr_swap_map (4, 0);
+ else if (sig == SIG_SWAP4_1) return avr_swap_map (4, 1);
+ else
+ gcc_unreachable();
+}
+
+
+/* Return the Hamming distance between the B-th byte of A and C. */
+
+static int
+avr_map_hamming_byte (int n, int b, double_int a, double_int c, bool strict)
+{
+ int i, hamming = 0;
+
+ for (i = 8*b; i < n && i < 8*b + 8; i++)
+ {
+ int ai = avr_map (a, i);
+ int ci = avr_map (c, i);
+
+ hamming += ai != ci && (strict || (ai < n && ci < n));
+ }
+
+ return hamming;
+}
+
+
+/* Return the non-strict Hamming distance between A and B. */
+
+#define avr_map_hamming_nonstrict(N,A,B) \
+ (+ avr_map_hamming_byte (N, 0, A, B, false) \
+ + avr_map_hamming_byte (N, 1, A, B, false))
+
+
+/* Return TRUE iff A and B represent the same mapping. */
+
+#define avr_map_equal_p(N,A,B) (0 == avr_map_hamming_nonstrict (N, A, B))
+
+
+/* Return TRUE iff A is a map of signature S. Notice that there is no
+ 1:1 correspondence between maps and signatures and thus this is
+ only supported for basic signatures recognized by avr_sig_map(). */
+
+#define avr_map_sig_p(N,A,S) avr_map_equal_p (N, A, avr_sig_map (N, S))
+
+
+/* Swap odd/even bits of ld-reg %0: %0 = bit-swap (%0) */
+
+static const char*
+avr_out_swap_bits (rtx *xop, int *plen)
+{
+ xop[1] = tmp_reg_rtx;
+
+ return avr_asm_len ("mov %1,%0" CR_TAB
+ "andi %0,0xaa" CR_TAB
+ "eor %1,%0" CR_TAB
+ "lsr %0" CR_TAB
+ "lsl %1" CR_TAB
+ "or %0,%1", xop, plen, 6);
+}
+
+/* Revert bit order: %0 = Revert (%1) with %0 != %1 and clobber %1 */
+
+static const char*
+avr_out_revert_bits (rtx *xop, int *plen)
+{
+ return avr_asm_len ("inc __zero_reg__" "\n"
+ "0:\tror %1" CR_TAB
+ "rol %0" CR_TAB
+ "lsl __zero_reg__" CR_TAB
+ "brne 0b", xop, plen, 5);
+}
+
+
+/* If OUT_P = true: Output BST/BLD instruction according to MAP.
+ If OUT_P = false: Just dry-run and fix XOP[1] to resolve
+ early-clobber conflicts if XOP[0] = XOP[1]. */
+
+static void
+avr_move_bits (rtx *xop, double_int map, int n_bits, bool out_p, int *plen)
+{
+ int bit_dest, b, clobber = 0;
+
+ /* T-flag contains this bit of the source, i.e. of XOP[1] */
+ int t_bit_src = -1;
+
+ if (!optimize && !out_p)
+ {
+ avr_asm_len ("mov __tmp_reg__,%1", xop, plen, 1);
+ xop[1] = tmp_reg_rtx;
+ return;
+ }
+
+ /* We order the operations according to the requested source bit b. */
+
+ for (b = 0; b < n_bits; b++)
+ for (bit_dest = 0; bit_dest < n_bits; bit_dest++)
+ {
+ int bit_src = avr_map (map, bit_dest);
+
+ if (b != bit_src
+ /* Same position: No need to copy as the caller did MOV. */
+ || bit_dest == bit_src
+ /* Accessing bits 8..f for 8-bit version is void. */
+ || bit_src >= n_bits)
+ continue;
+
+ if (t_bit_src != bit_src)
+ {
+ /* Source bit is not yet in T: Store it to T. */
+
+ t_bit_src = bit_src;
+
+ if (out_p)
+ {
+ xop[2] = GEN_INT (bit_src);
+ avr_asm_len ("bst %T1%T2", xop, plen, 1);
+ }
+ else if (clobber & (1 << bit_src))
+ {
+ /* Bit to be read was written already: Backup input
+ to resolve early-clobber conflict. */
+
+ avr_asm_len ("mov __tmp_reg__,%1", xop, plen, 1);
+ xop[1] = tmp_reg_rtx;
+ return;
+ }
+ }
+
+ /* Load destination bit with T. */
+
+ if (out_p)
+ {
+ xop[2] = GEN_INT (bit_dest);
+ avr_asm_len ("bld %T0%T2", xop, plen, 1);
+ }
+
+ clobber |= 1 << bit_dest;
+ }
+}
+
+
+/* Print assembler code for `map_bitsqi' and `map_bitshi'. */
+
+const char*
+avr_out_map_bits (rtx insn, rtx *operands, int *plen)
+{
+ bool copy_0, copy_1;
+ int n_bits = GET_MODE_BITSIZE (GET_MODE (operands[0]));
+ double_int map = rtx_to_double_int (operands[1]);
+ rtx xop[3];
+
+ xop[0] = operands[0];
+ xop[1] = operands[2];
+
+ if (plen)
+ *plen = 0;
+ else if (flag_print_asm_name)
+ avr_fdump (asm_out_file, ASM_COMMENT_START "%X\n", map);
+
+ switch (n_bits)
+ {
+ default:
+ gcc_unreachable();
+
+ case 8:
+ if (avr_map_sig_p (n_bits, map, SIG_SWAP1_0))
+ {
+ return avr_out_swap_bits (xop, plen);
+ }
+ else if (avr_map_sig_p (n_bits, map, SIG_REVERT_0))
+ {
+ if (REGNO (xop[0]) == REGNO (xop[1])
+ || !reg_unused_after (insn, xop[1]))
+ {
+ avr_asm_len ("mov __tmp_reg__,%1", xop, plen, 1);
+ xop[1] = tmp_reg_rtx;
+ }
+
+ return avr_out_revert_bits (xop, plen);
+ }
+
+ break; /* 8 */
+
+ case 16:
+
+ break; /* 16 */
+ }
+
+ /* Copying a whole byte is cheaper than moving bits that stay at the same
+ position. Some bits in a byte stay at the same position iff the
+ strict Hamming distance to Identity is not 8. */
+
+ copy_0 = 8 != avr_map_hamming_byte (n_bits, 0, map, avr_id_map(), true);
+ copy_1 = 8 != avr_map_hamming_byte (n_bits, 1, map, avr_id_map(), true);
+
+ /* Perform the move(s) just worked out. */
+
+ if (n_bits == 8)
+ {
+ if (REGNO (xop[0]) == REGNO (xop[1]))
+ {
+ /* Fix early-clobber clashes.
+ Notice XOP[0] has no early-clobber in its constraint. */
+
+ avr_move_bits (xop, map, n_bits, false, plen);
+ }
+ else if (copy_0)
+ {
+ avr_asm_len ("mov %0,%1", xop, plen, 1);
+ }
+ }
+ else if (AVR_HAVE_MOVW && copy_0 && copy_1)
+ {
+ avr_asm_len ("movw %A0,%A1", xop, plen, 1);
+ }
+ else
+ {
+ if (copy_0)
+ avr_asm_len ("mov %A0,%A1", xop, plen, 1);
+
+ if (copy_1)
+ avr_asm_len ("mov %B0,%B1", xop, plen, 1);
+ }
+
+ /* Move individual bits. */
+
+ avr_move_bits (xop, map, n_bits, true, plen);
+
+ return "";
+}
+
+
/* IDs for all the AVR builtins. */
enum avr_builtin_id
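
Summing up the machinery above: a map is a 64-bit constant of 16 nibbles, where nibble X names the source bit that lands in destination bit X, and source bits >= n_bits are not moved. A standalone model on uint64_t (my reading of the encoding, illustration only):

    #include <stdio.h>
    #include <stdint.h>

    /* Model of avr_map: nibble X of F.  */
    static int
    map_bit (uint64_t f, int x)
    {
      return 0xf & (int) (f >> (4 * x));
    }

    /* Apply map F to SRC for an N_BITS-wide operand; out-of-range source
       bits leave the destination bit untouched (0 here).  */
    static unsigned
    apply_map (uint64_t f, unsigned src, int n_bits)
    {
      unsigned dest = 0;
      int i;

      for (i = 0; i < n_bits; i++)
        {
          int s = map_bit (f, i);
          if (s < n_bits)
            dest |= ((src >> s) & 1u) << i;
        }
      return dest;
    }

    int
    main (void)
    {
      /* 0x76543210 is the 8-bit identity; 0x01234567 reverses a byte.  */
      printf ("%02x\n", apply_map (0x01234567u, 0x01, 8));   /* -> 80 */
      return 0;
    }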
@@ -9854,6 +10234,8 @@ enum avr_builtin_id
AVR_BUILTIN_WDR,
AVR_BUILTIN_SLEEP,
AVR_BUILTIN_SWAP,
+ AVR_BUILTIN_MAP8,
+ AVR_BUILTIN_MAP16,
AVR_BUILTIN_FMUL,
AVR_BUILTIN_FMULS,
AVR_BUILTIN_FMULSU,
@@ -9910,6 +10292,18 @@ avr_init_builtins (void)
long_unsigned_type_node,
NULL_TREE);
+ tree uchar_ftype_ulong_uchar
+ = build_function_type_list (unsigned_char_type_node,
+ long_unsigned_type_node,
+ unsigned_char_type_node,
+ NULL_TREE);
+
+ tree uint_ftype_ullong_uint
+ = build_function_type_list (unsigned_type_node,
+ long_long_unsigned_type_node,
+ unsigned_type_node,
+ NULL_TREE);
+
DEF_BUILTIN ("__builtin_avr_nop", void_ftype_void, AVR_BUILTIN_NOP);
DEF_BUILTIN ("__builtin_avr_sei", void_ftype_void, AVR_BUILTIN_SEI);
DEF_BUILTIN ("__builtin_avr_cli", void_ftype_void, AVR_BUILTIN_CLI);
@@ -9926,6 +10320,11 @@ avr_init_builtins (void)
DEF_BUILTIN ("__builtin_avr_fmulsu", int_ftype_char_uchar,
AVR_BUILTIN_FMULSU);
+ DEF_BUILTIN ("__builtin_avr_map8", uchar_ftype_ulong_uchar,
+ AVR_BUILTIN_MAP8);
+ DEF_BUILTIN ("__builtin_avr_map16", uint_ftype_ullong_uint,
+ AVR_BUILTIN_MAP16);
+
avr_init_builtin_int24 ();
}
@@ -9949,7 +10348,9 @@ bdesc_2arg[] =
{
{ CODE_FOR_fmul, "__builtin_avr_fmul", AVR_BUILTIN_FMUL },
{ CODE_FOR_fmuls, "__builtin_avr_fmuls", AVR_BUILTIN_FMULS },
- { CODE_FOR_fmulsu, "__builtin_avr_fmulsu", AVR_BUILTIN_FMULSU }
+ { CODE_FOR_fmulsu, "__builtin_avr_fmulsu", AVR_BUILTIN_FMULSU },
+ { CODE_FOR_map_bitsqi, "__builtin_avr_map8", AVR_BUILTIN_MAP8 },
+ { CODE_FOR_map_bitshi, "__builtin_avr_map16", AVR_BUILTIN_MAP16 }
};
/* Subroutine of avr_expand_builtin to take care of unop insns. */
@@ -10065,6 +10466,7 @@ avr_expand_builtin (tree exp, rtx target,
size_t i;
const struct avr_builtin_description *d;
tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
+ const char* bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
unsigned int id = DECL_FUNCTION_CODE (fndecl);
tree arg0;
rtx op0;
@@ -10097,12 +10499,37 @@ avr_expand_builtin (tree exp, rtx target,
op0 = expand_expr (arg0, NULL_RTX, VOIDmode, EXPAND_NORMAL);
if (! CONST_INT_P (op0))
- error ("__builtin_avr_delay_cycles expects a"
- " compile time integer constant.");
+ error ("%s expects a compile time integer constant", bname);
avr_expand_delay_cycles (op0);
return 0;
}
+
+ case AVR_BUILTIN_MAP8:
+ {
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_expr (arg0, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+
+ if (!CONST_INT_P (op0))
+ {
+ error ("%s expects a compile time long integer constant"
+ " as first argument", bname);
+ return target;
+ }
+ }
+
+ case AVR_BUILTIN_MAP16:
+ {
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_expr (arg0, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+
+ if (!const_double_operand (op0, VOIDmode))
+ {
+ error ("%s expects a compile time long long integer constant"
+ " as first argument", bname);
+ return target;
+ }
+ }
}
for (i = 0, d = bdesc_1arg; i < ARRAY_SIZE (bdesc_1arg); i++, d++)
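
With the expanders wired up, the new builtins can be used from AVR code like this (hypothetical user code; the map constants follow the nibble encoding modelled earlier):

    /* Hypothetical avr-gcc user code for __builtin_avr_map8/map16.  */
    unsigned char
    rev8 (unsigned char x)
    {
      /* Destination bit 0 takes source bit 7, ..., bit 7 takes bit 0.  */
      return __builtin_avr_map8 (0x01234567ul, x);
    }

    unsigned int
    swap_low_nibbles (unsigned int x)
    {
      /* Swap the two nibbles of the low byte; high byte stays put.  */
      return __builtin_avr_map16 (0xfedcba9832107654ull, x);
    }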
diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md
index 73632d880f7..bddfe933ee6 100644
--- a/gcc/config/avr/avr.md
+++ b/gcc/config/avr/avr.md
@@ -28,11 +28,21 @@
;; j Branch condition.
;; k Reverse branch condition.
;;..m..Constant Direct Data memory address.
-;; i Print the SFR address quivalent of a CONST_INT RAM address.
-;; The resulting addres is suitable to be used in IN/OUT.
+;; i Print the SFR address equivalent of a CONST_INT or a CONST_INT
+;;   RAM address.  The resulting address is suitable to be used in IN/OUT.
;; o Displacement for (mem (plus (reg) (const_int))) operands.
;; p POST_INC or PRE_DEC address as a pointer (X, Y, Z)
;; r POST_INC or PRE_DEC address as a register (r26, r28, r30)
+;; T/T Print operand suitable for BLD/BST instruction, i.e. register and
+;;     bit number.  This gets 2 operands: the first %T gets a REG_P and
+;;     just caches the operand for the next %T.  The second %T gets
+;;     a CONST_INT that represents a bit position.
+;;     Example: with %0 = (reg:HI 18) and %1 = (const_int 13),
+;;     "%T0%T1" will print "r19,5".
+;;     Notice that you must not write a comma between %T0 and %T1.
+;; T/t Similar to above, but don't print the comma and the bit number.
+;;     Example: with %0 = (reg:HI 18) and %1 = (const_int 13),
+;;     "%T0%t1" will print "r19".
;;..x..Constant Direct Program memory address.
;; ~ Output 'r' if not AVR_HAVE_JMP_CALL.
;; ! Output 'e' if AVR_HAVE_EIJMP_EICALL.
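
A quick model of the %T/%t example given above (assuming the usual rN register naming):

    #include <stdio.h>

    /* Bit position BPOS in an operand starting at register T_REGNO selects
       register T_REGNO + BPOS/8 and bit BPOS%8 within it.  */
    static void
    print_T (int t_regno, int bpos)
    {
      printf ("r%d,%d", t_regno + bpos / 8, bpos % 8);
    }

    int
    main (void)
    {
      print_T (18, 13);   /* prints "r19,5", as in the example above */
      putchar ('\n');
      return 0;
    }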
@@ -64,6 +74,7 @@
UNSPEC_FMULSU
UNSPEC_COPYSIGN
UNSPEC_IDENTITY
+ UNSPEC_MAP_BITS
])
(define_c_enum "unspecv"
@@ -139,6 +150,7 @@
ashlhi, ashrhi, lshrhi,
ashlsi, ashrsi, lshrsi,
ashlpsi, ashrpsi, lshrpsi,
+ map_bits,
no"
(const_string "no"))
@@ -3020,7 +3032,7 @@
(const_int 15)))]
""
"bst %A0,0\;ror %B0\;ror %A0\;bld %B0,7"
- [(set_attr "length" "3")
+ [(set_attr "length" "4")
(set_attr "cc" "clobber")])
(define_insn "*rotlpsi2.1"
@@ -5093,6 +5105,30 @@
[(set_attr "length" "9")
(set_attr "cc" "clobber")])
+(define_insn "map_bitsqi"
+ [(set (match_operand:QI 0 "register_operand" "=d")
+ (unspec:QI [(match_operand:SI 1 "const_int_operand" "n")
+ (match_operand:QI 2 "register_operand" "r")]
+ UNSPEC_MAP_BITS))]
+ ""
+ {
+ return avr_out_map_bits (insn, operands, NULL);
+ }
+ [(set_attr "adjust_len" "map_bits")
+ (set_attr "cc" "clobber")])
+
+(define_insn "map_bitshi"
+ [(set (match_operand:HI 0 "register_operand" "=&r")
+ (unspec:HI [(match_operand:DI 1 "const_double_operand" "n")
+ (match_operand:HI 2 "register_operand" "r")]
+ UNSPEC_MAP_BITS))]
+ ""
+ {
+ return avr_out_map_bits (insn, operands, NULL);
+ }
+ [(set_attr "adjust_len" "map_bits")
+ (set_attr "cc" "clobber")])
+
;; Parity
diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h
index 07f58b99e35..3cc2253c3c2 100644
--- a/gcc/config/i386/i386-opts.h
+++ b/gcc/config/i386/i386-opts.h
@@ -37,8 +37,7 @@ enum stringop_alg
rep_prefix_8_byte,
loop_1_byte,
loop,
- unrolled_loop,
- sse_loop
+ unrolled_loop
};
/* Available call abi. */
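
For orientation in the large i386.c hunks below: a stringop strategy is a list of (max size, algorithm) pairs, scanned until the first entry whose max covers the block, with -1 meaning "everything larger". A minimal model (struct and field names paraphrase i386.h and are assumptions, not the exact declaration):

    enum stringop_alg { no_stringop, libcall, rep_prefix_1_byte,
                        rep_prefix_4_byte, rep_prefix_8_byte,
                        loop_1_byte, loop, unrolled_loop };

    struct stringop_strategy { int max; enum stringop_alg alg; };

    /* "{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}" then reads:
       unknown sizes default to a libcall; known blocks up to 256 bytes
       use rep movsl; anything larger uses a libcall.  */
    static const struct stringop_strategy memcpy_strategy[] = {
      { 256, rep_prefix_4_byte },
      { -1,  libcall },
    };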
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 866cdd63f7a..1b871be8480 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -561,14 +561,10 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
COSTS_N_BYTES (2), /* cost of FABS instruction. */
COSTS_N_BYTES (2), /* cost of FCHS instruction. */
COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
- {{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
+ {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
- {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
- {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}},
- {{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
+ {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
- {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
- {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}},
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@@ -636,14 +632,10 @@ struct processor_costs i386_cost = { /* 386 specific costs */
COSTS_N_INSNS (22), /* cost of FABS instruction. */
COSTS_N_INSNS (24), /* cost of FCHS instruction. */
COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
- {{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
+ {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
DUMMY_STRINGOP_ALGS},
- {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
- DUMMY_STRINGOP_ALGS}},
- {{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
+ {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
DUMMY_STRINGOP_ALGS},
- {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
- DUMMY_STRINGOP_ALGS}},
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@@ -712,14 +704,10 @@ struct processor_costs i486_cost = { /* 486 specific costs */
COSTS_N_INSNS (3), /* cost of FABS instruction. */
COSTS_N_INSNS (3), /* cost of FCHS instruction. */
COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
- {{{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
+ {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
DUMMY_STRINGOP_ALGS},
- {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
- DUMMY_STRINGOP_ALGS}},
- {{{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
+ {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
DUMMY_STRINGOP_ALGS},
- {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
- DUMMY_STRINGOP_ALGS}},
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@@ -786,14 +774,10 @@ struct processor_costs pentium_cost = {
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
- {{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+ {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
DUMMY_STRINGOP_ALGS},
- {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
- DUMMY_STRINGOP_ALGS}},
- {{{libcall, {{-1, rep_prefix_4_byte}}},
+ {{libcall, {{-1, rep_prefix_4_byte}}},
DUMMY_STRINGOP_ALGS},
- {{libcall, {{-1, rep_prefix_4_byte}}},
- DUMMY_STRINGOP_ALGS}},
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@@ -865,18 +849,12 @@ struct processor_costs pentiumpro_cost = {
noticeable win, for bigger blocks either rep movsl or rep movsb is
way to go. Rep movsb has apparently more expensive startup time in CPU,
but after 4K the difference is down in the noise. */
- {{{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
+ {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
{8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
DUMMY_STRINGOP_ALGS},
- {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
- {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
- DUMMY_STRINGOP_ALGS}},
- {{{rep_prefix_4_byte, {{1024, unrolled_loop},
- {8192, rep_prefix_4_byte}, {-1, libcall}}},
+ {{rep_prefix_4_byte, {{1024, unrolled_loop},
+ {8192, rep_prefix_4_byte}, {-1, libcall}}},
DUMMY_STRINGOP_ALGS},
- {{rep_prefix_4_byte, {{1024, unrolled_loop},
- {8192, rep_prefix_4_byte}, {-1, libcall}}},
- DUMMY_STRINGOP_ALGS}},
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@@ -944,14 +922,10 @@ struct processor_costs geode_cost = {
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
- {{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+ {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
DUMMY_STRINGOP_ALGS},
- {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
- DUMMY_STRINGOP_ALGS}},
- {{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+ {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
DUMMY_STRINGOP_ALGS},
- {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
- DUMMY_STRINGOP_ALGS}},
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@@ -1021,14 +995,10 @@ struct processor_costs k6_cost = {
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
- {{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+ {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
DUMMY_STRINGOP_ALGS},
- {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
- DUMMY_STRINGOP_ALGS}},
- {{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+ {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
DUMMY_STRINGOP_ALGS},
- {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
- DUMMY_STRINGOP_ALGS}},
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@@ -1098,14 +1068,10 @@ struct processor_costs athlon_cost = {
/* For some reason, Athlon deals better with REP prefix (relative to loops)
compared to K8. Alignment becomes important after 8 bytes for memcpy and
128 bytes for memset. */
- {{{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
+ {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
DUMMY_STRINGOP_ALGS},
- {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
- DUMMY_STRINGOP_ALGS}},
- {{{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
+ {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
DUMMY_STRINGOP_ALGS},
- {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
- DUMMY_STRINGOP_ALGS}},
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@@ -1180,16 +1146,11 @@ struct processor_costs k8_cost = {
/* K8 has optimized REP instruction for medium sized blocks, but for very
small blocks it is better to use loop. For large blocks, libcall can
do nontemporary accesses and beat inline considerably. */
- {{{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
+ {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
{libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
- {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
- {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}},
- {{{libcall, {{8, loop}, {24, unrolled_loop},
+ {{libcall, {{8, loop}, {24, unrolled_loop},
{2048, rep_prefix_4_byte}, {-1, libcall}}},
{libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
- {{libcall, {{8, loop}, {24, unrolled_loop},
- {2048, rep_prefix_4_byte}, {-1, libcall}}},
- {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}},
4, /* scalar_stmt_cost. */
2, /* scalar load_cost. */
2, /* scalar_store_cost. */
@@ -1272,16 +1233,11 @@ struct processor_costs amdfam10_cost = {
/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall can
do nontemporary accesses and beat inline considerably. */
- {{{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
- {libcall, {{16, loop}, {512, rep_prefix_8_byte}, {-1, libcall}}}},
- {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
- {libcall, {{16, loop}, {512, rep_prefix_8_byte}, {-1, libcall}}}}},
- {{{libcall, {{8, loop}, {24, unrolled_loop},
+ {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
+ {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ {{libcall, {{8, loop}, {24, unrolled_loop},
{2048, rep_prefix_4_byte}, {-1, libcall}}},
{libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
- {{libcall, {{8, loop}, {24, unrolled_loop},
- {2048, rep_prefix_4_byte}, {-1, libcall}}},
- {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}},
4, /* scalar_stmt_cost. */
2, /* scalar load_cost. */
2, /* scalar_store_cost. */
@@ -1364,16 +1320,11 @@ struct processor_costs bdver1_cost = {
/* BDVER1 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall
can do nontemporary accesses and beat inline considerably. */
- {{{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
+ {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
{libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
- {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
- {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}},
- {{{libcall, {{8, loop}, {24, unrolled_loop},
+ {{libcall, {{8, loop}, {24, unrolled_loop},
{2048, rep_prefix_4_byte}, {-1, libcall}}},
{libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
- {{libcall, {{8, loop}, {24, unrolled_loop},
- {2048, rep_prefix_4_byte}, {-1, libcall}}},
- {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}},
6, /* scalar_stmt_cost. */
4, /* scalar load_cost. */
4, /* scalar_store_cost. */
@@ -1456,16 +1407,11 @@ struct processor_costs bdver2_cost = {
/* BDVER2 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall
can do nontemporary accesses and beat inline considerably. */
- {{{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
- {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
{{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
- {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}},
- {{{libcall, {{8, loop}, {24, unrolled_loop},
- {2048, rep_prefix_4_byte}, {-1, libcall}}},
- {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
{{libcall, {{8, loop}, {24, unrolled_loop},
{2048, rep_prefix_4_byte}, {-1, libcall}}},
- {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}},
+ {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
6, /* scalar_stmt_cost. */
4, /* scalar load_cost. */
4, /* scalar_store_cost. */
@@ -1543,16 +1489,11 @@ struct processor_costs btver1_cost = {
/* BTVER1 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall can
do nontemporary accesses and beat inline considerably. */
- {{{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
+ {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
{libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
- {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
- {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}},
- {{{libcall, {{8, loop}, {24, unrolled_loop},
+ {{libcall, {{8, loop}, {24, unrolled_loop},
{2048, rep_prefix_4_byte}, {-1, libcall}}},
{libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
- {{libcall, {{8, loop}, {24, unrolled_loop},
- {2048, rep_prefix_4_byte}, {-1, libcall}}},
- {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}},
4, /* scalar_stmt_cost. */
2, /* scalar load_cost. */
2, /* scalar_store_cost. */
@@ -1619,18 +1560,11 @@ struct processor_costs pentium4_cost = {
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
-
- {{{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
+ {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
DUMMY_STRINGOP_ALGS},
- {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
- DUMMY_STRINGOP_ALGS}},
-
- {{{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
+ {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
{-1, libcall}}},
DUMMY_STRINGOP_ALGS},
- {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
- {-1, libcall}}},
- DUMMY_STRINGOP_ALGS}},
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@@ -1697,22 +1631,13 @@ struct processor_costs nocona_cost = {
COSTS_N_INSNS (3), /* cost of FABS instruction. */
COSTS_N_INSNS (3), /* cost of FCHS instruction. */
COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
-
- {{{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
+ {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
{libcall, {{32, loop}, {20000, rep_prefix_8_byte},
{100000, unrolled_loop}, {-1, libcall}}}},
- {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
- {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
- {100000, unrolled_loop}, {-1, libcall}}}}},
-
- {{{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
+ {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
{-1, libcall}}},
{libcall, {{24, loop}, {64, unrolled_loop},
{8192, rep_prefix_8_byte}, {-1, libcall}}}},
- {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
- {-1, libcall}}},
- {libcall, {{24, loop}, {64, unrolled_loop},
- {8192, rep_prefix_8_byte}, {-1, libcall}}}}},
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@@ -1779,108 +1704,13 @@ struct processor_costs atom_cost = {
COSTS_N_INSNS (8), /* cost of FABS instruction. */
COSTS_N_INSNS (8), /* cost of FCHS instruction. */
COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
-
- /* stringop_algs for memcpy.
- SSE loops works best on Atom, but fall back into non-SSE unrolled loop variant
- if that fails. */
- {{{libcall, {{4096, unrolled_loop}, {-1, libcall}}}, /* Known alignment. */
- {libcall, {{4096, unrolled_loop}, {-1, libcall}}}},
- {{libcall, {{2048, unrolled_loop}, {-1, libcall}}}, /* Unknown alignment. */
- {libcall, {{2048, unrolled_loop},
- {-1, libcall}}}}},
-
- /* stringop_algs for memset. */
- {{{libcall, {{4096, unrolled_loop}, {-1, libcall}}}, /* Known alignment. */
- {libcall, {{4096, unrolled_loop}, {-1, libcall}}}},
- {{libcall, {{1024, unrolled_loop}, /* Unknown alignment. */
- {-1, libcall}}},
- {libcall, {{2048, unrolled_loop},
- {-1, libcall}}}}},
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
-/* Core should produce code tuned for core variants. */
-static const
-struct processor_costs core_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- /* On all chips taken into consideration lea is 2 cycles and more. With
- this cost however our current implementation of synth_mult results in
- use of unnecessary temporary registers causing regression on several
- SPECfp benchmarks. */
- COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (2)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (26), /* HI */
- COSTS_N_INSNS (42), /* SI */
- COSTS_N_INSNS (74), /* DI */
- COSTS_N_INSNS (74)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 17, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {4, 4, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {12, 12, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {8, 8}, /* cost of loading MMX registers
- in SImode and DImode */
- {8, 8}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {8, 8, 8}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {8, 8, 8}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 5, /* MMX or SSE register to integer */
- 32, /* size of l1 cache. */
- 512, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 6, /* number of parallel prefetches */
- /* Benchmarks shows large regressions on K8 sixtrack benchmark when this
- value is increased to perhaps more appropriate value of 5. */
- 3, /* Branch cost */
- COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (8), /* cost of FMUL instruction. */
- COSTS_N_INSNS (20), /* cost of FDIV instruction. */
- COSTS_N_INSNS (8), /* cost of FABS instruction. */
- COSTS_N_INSNS (8), /* cost of FCHS instruction. */
- COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
-
- /* stringop_algs for memcpy. */
- {{{libcall, {{16, loop}, {24, unrolled_loop}, {1024, rep_prefix_4_byte}, {-1, libcall}}}, /* Known alignment. */
- {libcall, {{16, loop}, {24, unrolled_loop}, {1024, rep_prefix_8_byte}, {-1, libcall}}}},
- {{libcall, {{16, loop}, {24, unrolled_loop}, {1024, rep_prefix_4_byte}, {-1, libcall}}}, /* Unknown alignment. */
- {libcall, {{16, loop}, {24, unrolled_loop}, {1024, rep_prefix_8_byte}, {-1, libcall}}}}},
-
- /* stringop_algs for memset. */
- {{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, /* Known alignment. */
- {libcall, {{256, rep_prefix_8_byte}, {-1, libcall}}}},
- {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, /* Unknown alignment. */
- {libcall, {{256, rep_prefix_8_byte}, {-1, libcall}}}}},
+ {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
+ {libcall, {{32, loop}, {64, rep_prefix_4_byte},
+ {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ {{libcall, {{8, loop}, {15, unrolled_loop},
+ {2048, rep_prefix_4_byte}, {-1, libcall}}},
+ {libcall, {{24, loop}, {32, unrolled_loop},
+ {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@@ -1894,7 +1724,7 @@ struct processor_costs core_cost = {
1, /* cond_not_taken_branch_cost. */
};
-/* Generic64 should produce code tuned for Nocona, Core, K8, Amdfam10 and buldozer. */
+/* Generic64 should produce code tuned for Nocona and K8. */
static const
struct processor_costs generic64_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
@@ -1954,16 +1784,10 @@ struct processor_costs generic64_cost = {
COSTS_N_INSNS (8), /* cost of FABS instruction. */
COSTS_N_INSNS (8), /* cost of FCHS instruction. */
COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
-
- {{DUMMY_STRINGOP_ALGS,
- {libcall, {{16, rep_prefix_4_byte}, {128, rep_prefix_8_byte}, {4096, rep_prefix_1_byte}, {-1, libcall}}}},
- {DUMMY_STRINGOP_ALGS,
- {libcall, {{128, rep_prefix_4_byte}, {4096, rep_prefix_1_byte}, {-1, libcall}}}}},
-
- {{DUMMY_STRINGOP_ALGS,
- {libcall, {{16, rep_prefix_4_byte}, {512, unrolled_loop}, {4096, rep_prefix_1_byte}, {-1, libcall}}}},
- {DUMMY_STRINGOP_ALGS,
- {libcall, {{16, rep_prefix_4_byte}, {512, unrolled_loop}, {4096, rep_prefix_1_byte}, {-1, libcall}}}}},
+ {DUMMY_STRINGOP_ALGS,
+ {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ {DUMMY_STRINGOP_ALGS,
+ {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@@ -1977,8 +1801,8 @@ struct processor_costs generic64_cost = {
1, /* cond_not_taken_branch_cost. */
};
-/* Generic32 should produce code tuned for PPro, Pentium4, Nocona, Core
- Athlon, K8, amdfam10, buldozer. */
+/* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
+ Athlon and K8. */
static const
struct processor_costs generic32_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
@@ -2032,16 +1856,10 @@ struct processor_costs generic32_cost = {
COSTS_N_INSNS (8), /* cost of FABS instruction. */
COSTS_N_INSNS (8), /* cost of FCHS instruction. */
COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
- /* stringop_algs for memcpy. */
- {{{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
+ {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
DUMMY_STRINGOP_ALGS},
- {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
- DUMMY_STRINGOP_ALGS}},
- /* stringop_algs for memset. */
- {{{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
+ {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
DUMMY_STRINGOP_ALGS},
- {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
- DUMMY_STRINGOP_ALGS}},
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@@ -2718,8 +2536,6 @@ static void ix86_set_current_function (tree);
static unsigned int ix86_minimum_incoming_stack_boundary (bool);
static enum calling_abi ix86_function_abi (const_tree);
-static rtx promote_duplicated_reg (enum machine_mode, rtx);
-static rtx promote_duplicated_reg_to_size (rtx, int, int, int);
#ifndef SUBTARGET32_DEFAULT_CPU
@@ -2766,13 +2582,13 @@ static const struct ptt processor_target_table[PROCESSOR_max] =
{&k8_cost, 16, 7, 16, 7, 16},
{&nocona_cost, 0, 0, 0, 0, 0},
/* Core 2 32-bit. */
- {&core_cost, 16, 10, 16, 10, 16},
+ {&generic32_cost, 16, 10, 16, 10, 16},
/* Core 2 64-bit. */
- {&core_cost, 16, 10, 16, 10, 16},
+ {&generic64_cost, 16, 10, 16, 10, 16},
/* Core i7 32-bit. */
- {&core_cost, 16, 10, 16, 10, 16},
+ {&generic32_cost, 16, 10, 16, 10, 16},
/* Core i7 64-bit. */
- {&core_cost, 16, 10, 16, 10, 16},
+ {&generic64_cost, 16, 10, 16, 10, 16},
{&generic32_cost, 16, 7, 16, 7, 16},
{&generic64_cost, 16, 10, 16, 10, 16},
{&amdfam10_cost, 32, 24, 32, 7, 32},
@@ -16457,7 +16273,6 @@ distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
rtx prev = start;
rtx next = NULL;
- enum attr_type insn_type;
*found = false;
@@ -16470,8 +16285,8 @@ distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
distance = increase_distance (prev, next, distance);
if (insn_defines_reg (regno1, regno2, prev))
{
- insn_type = get_attr_type (prev);
- if (insn_type != TYPE_LEA)
+ if (recog_memoized (prev) < 0
+ || get_attr_type (prev) != TYPE_LEA)
{
*found = true;
return distance;
@@ -21111,37 +20926,22 @@ counter_mode (rtx count_exp)
return SImode;
}
-/* Helper function for expand_set_or_movmem_via_loop.
-
- When SRCPTR is non-NULL, output simple loop to move memory
+/* When SRCPTR is non-NULL, output simple loop to move memory
pointer to SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times,
overall size is COUNT specified in bytes. When SRCPTR is NULL, output the
equivalent loop to set memory by VALUE (supposed to be in MODE).
The size is rounded down to whole number of chunk size moved at once.
- SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info.
-
- If ITER isn't NULL, than it'll be used in the generated loop without
- initialization (that allows to generate several consequent loops using the
- same iterator).
- If CHANGE_PTRS is specified, DESTPTR and SRCPTR would be increased by
- iterator value at the end of the function (as if they iterate in the loop).
- Otherwise, their vaules'll stay unchanged.
+ SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */
- If EXPECTED_SIZE isn't -1, than it's used to compute branch-probabilities on
- the loop backedge. When expected size is unknown (it's -1), the probability
- is set to 80%.
- Return value is rtx of iterator, used in the loop - it could be reused in
- consequent calls of this function. */
-static rtx
-expand_set_or_movmem_via_loop_with_iter (rtx destmem, rtx srcmem,
- rtx destptr, rtx srcptr, rtx value,
- rtx count, rtx iter,
- enum machine_mode mode, int unroll,
- int expected_size, bool change_ptrs)
+static void
+expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
+ rtx destptr, rtx srcptr, rtx value,
+ rtx count, enum machine_mode mode, int unroll,
+ int expected_size)
{
- rtx out_label, top_label, tmp;
+ rtx out_label, top_label, iter, tmp;
enum machine_mode iter_mode = counter_mode (count);
rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
@@ -21149,42 +20949,32 @@ expand_set_or_movmem_via_loop_with_iter (rtx destmem, rtx srcmem,
rtx x_addr;
rtx y_addr;
int i;
- bool reuse_iter = (iter != NULL_RTX);
top_label = gen_label_rtx ();
out_label = gen_label_rtx ();
+ iter = gen_reg_rtx (iter_mode);
+
size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
- NULL, 1, OPTAB_DIRECT);
- if (!reuse_iter)
+ NULL, 1, OPTAB_DIRECT);
+ /* Those two should combine. */
+ if (piece_size == const1_rtx)
{
- iter = gen_reg_rtx (iter_mode);
- /* Those two should combine. */
- if (piece_size == const1_rtx)
- {
- emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
- true, out_label);
- predict_jump (REG_BR_PROB_BASE * 10 / 100);
- }
- emit_move_insn (iter, const0_rtx);
- }
- else
- {
- emit_cmp_and_jump_insns (iter, size, GE, NULL_RTX, iter_mode,
+ emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
true, out_label);
+ predict_jump (REG_BR_PROB_BASE * 10 / 100);
}
+ emit_move_insn (iter, const0_rtx);
emit_label (top_label);
tmp = convert_modes (Pmode, iter_mode, iter, true);
x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
- destmem =
- adjust_automodify_address_nv (copy_rtx (destmem), mode, x_addr, 0);
+ destmem = change_address (destmem, mode, x_addr);
if (srcmem)
{
y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
- srcmem =
- adjust_automodify_address_nv (copy_rtx (srcmem), mode, y_addr, 0);
+ srcmem = change_address (srcmem, mode, y_addr);
/* When unrolling for chips that reorder memory reads and writes,
we can save registers by using single temporary.
@@ -21256,43 +21046,19 @@ expand_set_or_movmem_via_loop_with_iter (rtx destmem, rtx srcmem,
}
else
predict_jump (REG_BR_PROB_BASE * 80 / 100);
- if (change_ptrs)
+ iter = ix86_zero_extend_to_Pmode (iter);
+ tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != destptr)
+ emit_move_insn (destptr, tmp);
+ if (srcptr)
{
- iter = ix86_zero_extend_to_Pmode (iter);
- tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
+ tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
true, OPTAB_LIB_WIDEN);
- if (tmp != destptr)
- emit_move_insn (destptr, tmp);
- if (srcptr)
- {
- tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
- true, OPTAB_LIB_WIDEN);
- if (tmp != srcptr)
- emit_move_insn (srcptr, tmp);
- }
+ if (tmp != srcptr)
+ emit_move_insn (srcptr, tmp);
}
emit_label (out_label);
- return iter;
-}
-
-/* When SRCPTR is non-NULL, output simple loop to move memory
- pointer to SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times,
- overall size is COUNT specified in bytes. When SRCPTR is NULL, output the
- equivalent loop to set memory by VALUE (supposed to be in MODE).
-
- The size is rounded down to whole number of chunk size moved at once.
- SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */
-
-static void
-expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
- rtx destptr, rtx srcptr, rtx value,
- rtx count, enum machine_mode mode, int unroll,
- int expected_size)
-{
- expand_set_or_movmem_via_loop_with_iter (destmem, srcmem,
- destptr, srcptr, value,
- count, NULL_RTX, mode, unroll,
- expected_size, true);
}
/* Output "rep; mov" instruction.
@@ -21396,18 +21162,7 @@ emit_strmov (rtx destmem, rtx srcmem,
emit_insn (gen_strmov (destptr, dest, srcptr, src));
}
-/* Emit strset instuction. If RHS is constant, and vector mode will be used,
- then move this constant to a vector register before emitting strset. */
-static void
-emit_strset (rtx destmem, rtx value,
- rtx destptr, enum machine_mode mode, int offset)
-{
- rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
- emit_insn (gen_strset (destptr, dest, value));
-}
-
-/* Output code to copy (COUNT % MAX_SIZE) bytes from SRCPTR to DESTPTR.
- SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */
+/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
static void
expand_movmem_epilogue (rtx destmem, rtx srcmem,
rtx destptr, rtx srcptr, rtx count, int max_size)
@@ -21418,58 +21173,46 @@ expand_movmem_epilogue (rtx destmem, rtx srcmem,
HOST_WIDE_INT countval = INTVAL (count);
int offset = 0;
- int remainder_size = countval % max_size;
- enum machine_mode move_mode = Pmode;
-
- /* Firstly, try to move data with the widest possible mode.
- Remaining part we'll move using Pmode and narrower modes. */
- if (TARGET_SSE)
+ if ((countval & 0x10) && max_size > 16)
{
- if (max_size >= GET_MODE_SIZE (V4SImode))
- move_mode = V4SImode;
- else if (max_size >= GET_MODE_SIZE (DImode))
- move_mode = DImode;
- }
-
- while (remainder_size >= GET_MODE_SIZE (move_mode))
- {
- emit_strmov (destmem, srcmem, destptr, srcptr, move_mode, offset);
- offset += GET_MODE_SIZE (move_mode);
- remainder_size -= GET_MODE_SIZE (move_mode);
+ if (TARGET_64BIT)
+ {
+ emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
+ emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
+ }
+ else
+ gcc_unreachable ();
+ offset += 16;
}
-
- /* Move the remaining part of epilogue - its size might be
- a size of the widest mode. */
- move_mode = Pmode;
- while (remainder_size >= GET_MODE_SIZE (move_mode))
+ if ((countval & 0x08) && max_size > 8)
{
- emit_strmov (destmem, srcmem, destptr, srcptr, move_mode, offset);
- offset += GET_MODE_SIZE (move_mode);
- remainder_size -= GET_MODE_SIZE (move_mode);
+ if (TARGET_64BIT)
+ emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
+ else
+ {
+ emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
+ emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
+ }
+ offset += 8;
}
-
- if (remainder_size >= 4)
+ if ((countval & 0x04) && max_size > 4)
{
- emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
+ emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
offset += 4;
- remainder_size -= 4;
}
- if (remainder_size >= 2)
+ if ((countval & 0x02) && max_size > 2)
{
- emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
+ emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
offset += 2;
- remainder_size -= 2;
}
- if (remainder_size >= 1)
+ if ((countval & 0x01) && max_size > 1)
{
- emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
+ emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
offset += 1;
- remainder_size -= 1;
}
- gcc_assert (remainder_size == 0);
return;
}
- if (max_size > 16)
+ if (max_size > 8)
{
count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
count, 1, OPTAB_DIRECT);
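
The restored constant-count path tests each bit of the residual count and emits one fixed-size move per set bit, largest first. The same logic in plain C (illustration only):

    #include <string.h>

    static void
    model_epilogue (char *dest, const char *src, unsigned countval, int max_size)
    {
      int offset = 0;
      int size;

      for (size = 16; size >= 1; size /= 2)
        if ((countval & (unsigned) size) && max_size > size)
          {
            memcpy (dest + offset, src + offset, (size_t) size);
            offset += size;
          }
    }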
@@ -21484,25 +21227,6 @@ expand_movmem_epilogue (rtx destmem, rtx srcmem,
*/
if (TARGET_SINGLE_STRINGOP)
{
- if (max_size > 8)
- {
- rtx label = ix86_expand_aligntest (count, 8, true);
- if (TARGET_64BIT)
- {
- src = change_address (srcmem, DImode, srcptr);
- dest = change_address (destmem, DImode, destptr);
- emit_insn (gen_strmov (destptr, dest, srcptr, src));
- }
- else
- {
- src = change_address (srcmem, SImode, srcptr);
- dest = change_address (destmem, SImode, destptr);
- emit_insn (gen_strmov (destptr, dest, srcptr, src));
- emit_insn (gen_strmov (destptr, dest, srcptr, src));
- }
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
if (max_size > 4)
{
rtx label = ix86_expand_aligntest (count, 4, true);
@@ -21536,35 +21260,6 @@ expand_movmem_epilogue (rtx destmem, rtx srcmem,
rtx offset = force_reg (Pmode, const0_rtx);
rtx tmp;
- if (max_size > 8)
- {
- rtx label = ix86_expand_aligntest (count, 8, true);
- if (TARGET_64BIT)
- {
- src = change_address (srcmem, DImode, srcptr);
- dest = change_address (destmem, DImode, destptr);
- emit_move_insn (dest, src);
- tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (8), NULL,
- true, OPTAB_LIB_WIDEN);
- }
- else
- {
- src = change_address (srcmem, SImode, srcptr);
- dest = change_address (destmem, SImode, destptr);
- emit_move_insn (dest, src);
- tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
- true, OPTAB_LIB_WIDEN);
- if (tmp != offset)
- emit_move_insn (offset, tmp);
- tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
- true, OPTAB_LIB_WIDEN);
- emit_move_insn (dest, src);
- }
- if (tmp != offset)
- emit_move_insn (offset, tmp);
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
if (max_size > 4)
{
rtx label = ix86_expand_aligntest (count, 4, true);
@@ -21620,132 +21315,87 @@ expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
1, max_size / 2);
}
-/* Output code to set with VALUE at most (COUNT % MAX_SIZE) bytes starting from
- DESTPTR.
- DESTMEM provides the MEM rtx to feed proper aliasing info.
- PROMOTED_TO_GPR_VALUE is an rtx for a GPR containing the broadcast VALUE.
- PROMOTED_TO_VECTOR_VALUE is an rtx for a vector register containing the
- broadcast VALUE.
- PROMOTED_TO_GPR_VALUE and PROMOTED_TO_VECTOR_VALUE may be NULL if the
- promotion hasn't been generated before. */
+/* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
static void
-expand_setmem_epilogue (rtx destmem, rtx destptr, rtx promoted_to_vector_value,
- rtx promoted_to_gpr_value, rtx value, rtx count,
- int max_size)
+expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
{
+ rtx dest;
+
if (CONST_INT_P (count))
{
HOST_WIDE_INT countval = INTVAL (count);
int offset = 0;
- int remainder_size = countval % max_size;
- enum machine_mode move_mode = Pmode;
-
- /* First, try to move data with the widest possible mode.
- The remaining part is moved using Pmode and narrower modes. */
-
- if (promoted_to_vector_value)
+ if ((countval & 0x10) && max_size > 16)
{
- if (promoted_to_vector_value)
- {
- if (max_size >= GET_MODE_SIZE (V4SImode))
- move_mode = V4SImode;
- else if (max_size >= GET_MODE_SIZE (DImode))
- move_mode = DImode;
- }
- while (remainder_size >= GET_MODE_SIZE (move_mode))
+ if (TARGET_64BIT)
{
- if (GET_MODE (destmem) != move_mode)
- destmem = adjust_automodify_address_nv (destmem, move_mode,
- destptr, offset);
- emit_strset (destmem,
- promoted_to_vector_value,
- destptr,
- move_mode, offset);
-
- offset += GET_MODE_SIZE (move_mode);
- remainder_size -= GET_MODE_SIZE (move_mode);
+ dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
+ emit_insn (gen_strset (destptr, dest, value));
+ dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
+ emit_insn (gen_strset (destptr, dest, value));
}
+ else
+ gcc_unreachable ();
+ offset += 16;
}
-
- /* Move the remaining part of the epilogue - its size might be
- as large as the widest mode. */
- while (remainder_size >= GET_MODE_SIZE (Pmode))
+ if ((countval & 0x08) && max_size > 8)
{
- if (!promoted_to_gpr_value)
- promoted_to_gpr_value = promote_duplicated_reg (Pmode, value);
- emit_strset (destmem, promoted_to_gpr_value, destptr, Pmode, offset);
- offset += GET_MODE_SIZE (Pmode);
- remainder_size -= GET_MODE_SIZE (Pmode);
+ if (TARGET_64BIT)
+ {
+ dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
+ emit_insn (gen_strset (destptr, dest, value));
+ }
+ else
+ {
+ dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
+ emit_insn (gen_strset (destptr, dest, value));
+ dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
+ emit_insn (gen_strset (destptr, dest, value));
+ }
+ offset += 8;
}
-
- if (!promoted_to_gpr_value && remainder_size > 1)
- promoted_to_gpr_value = promote_duplicated_reg (remainder_size >= 4
- ? SImode : HImode, value);
- if (remainder_size >= 4)
+ if ((countval & 0x04) && max_size > 4)
{
- emit_strset (destmem, gen_lowpart (SImode, promoted_to_gpr_value), destptr,
- SImode, offset);
+ dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
+ emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
offset += 4;
- remainder_size -= 4;
}
- if (remainder_size >= 2)
+ if ((countval & 0x02) && max_size > 2)
{
- emit_strset (destmem, gen_lowpart (HImode, promoted_to_gpr_value), destptr,
- HImode, offset);
- offset +=2;
- remainder_size -= 2;
+ dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
+ emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
+ offset += 2;
}
- if (remainder_size >= 1)
+ if ((countval & 0x01) && max_size > 1)
{
- emit_strset (destmem,
- promoted_to_gpr_value ? gen_lowpart (QImode, promoted_to_gpr_value) : value,
- destptr,
- QImode, offset);
+ dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
+ emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
offset += 1;
- remainder_size -= 1;
}
- gcc_assert (remainder_size == 0);
return;
}
-
- /* count isn't const. */
if (max_size > 32)
{
- expand_setmem_epilogue_via_loop (destmem, destptr, value, count,
- max_size);
+ expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
return;
}
-
- if (!promoted_to_gpr_value)
- promoted_to_gpr_value = promote_duplicated_reg_to_size (value,
- GET_MODE_SIZE (Pmode),
- GET_MODE_SIZE (Pmode),
- GET_MODE_SIZE (Pmode));
-
if (max_size > 16)
{
rtx label = ix86_expand_aligntest (count, 16, true);
- if (TARGET_SSE && promoted_to_vector_value)
- {
- destmem = change_address (destmem,
- GET_MODE (promoted_to_vector_value),
- destptr);
- emit_insn (gen_strset (destptr, destmem, promoted_to_vector_value));
- }
- else if (TARGET_64BIT)
+ if (TARGET_64BIT)
{
- destmem = change_address (destmem, DImode, destptr);
- emit_insn (gen_strset (destptr, destmem, promoted_to_gpr_value));
- emit_insn (gen_strset (destptr, destmem, promoted_to_gpr_value));
+ dest = change_address (destmem, DImode, destptr);
+ emit_insn (gen_strset (destptr, dest, value));
+ emit_insn (gen_strset (destptr, dest, value));
}
else
{
- destmem = change_address (destmem, SImode, destptr);
- emit_insn (gen_strset (destptr, destmem, promoted_to_gpr_value));
- emit_insn (gen_strset (destptr, destmem, promoted_to_gpr_value));
- emit_insn (gen_strset (destptr, destmem, promoted_to_gpr_value));
- emit_insn (gen_strset (destptr, destmem, promoted_to_gpr_value));
+ dest = change_address (destmem, SImode, destptr);
+ emit_insn (gen_strset (destptr, dest, value));
+ emit_insn (gen_strset (destptr, dest, value));
+ emit_insn (gen_strset (destptr, dest, value));
+ emit_insn (gen_strset (destptr, dest, value));
}
emit_label (label);
LABEL_NUSES (label) = 1;
@@ -21755,22 +21405,14 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx promoted_to_vector_value,
rtx label = ix86_expand_aligntest (count, 8, true);
if (TARGET_64BIT)
{
- destmem = change_address (destmem, DImode, destptr);
- emit_insn (gen_strset (destptr, destmem, promoted_to_gpr_value));
- }
- /* FIXME: When this hunk it output, IRA classifies promoted_to_vector_value
- as NO_REGS. */
- else if (TARGET_SSE && promoted_to_vector_value && 0)
- {
- destmem = change_address (destmem, V2SImode, destptr);
- emit_insn (gen_strset (destptr, destmem,
- gen_lowpart (V2SImode, promoted_to_vector_value)));
+ dest = change_address (destmem, DImode, destptr);
+ emit_insn (gen_strset (destptr, dest, value));
}
else
{
- destmem = change_address (destmem, SImode, destptr);
- emit_insn (gen_strset (destptr, destmem, promoted_to_gpr_value));
- emit_insn (gen_strset (destptr, destmem, promoted_to_gpr_value));
+ dest = change_address (destmem, SImode, destptr);
+ emit_insn (gen_strset (destptr, dest, value));
+ emit_insn (gen_strset (destptr, dest, value));
}
emit_label (label);
LABEL_NUSES (label) = 1;
@@ -21778,27 +21420,24 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx promoted_to_vector_value,
if (max_size > 4)
{
rtx label = ix86_expand_aligntest (count, 4, true);
- destmem = change_address (destmem, SImode, destptr);
- emit_insn (gen_strset (destptr, destmem,
- gen_lowpart (SImode, promoted_to_gpr_value)));
+ dest = change_address (destmem, SImode, destptr);
+ emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
emit_label (label);
LABEL_NUSES (label) = 1;
}
if (max_size > 2)
{
rtx label = ix86_expand_aligntest (count, 2, true);
- destmem = change_address (destmem, HImode, destptr);
- emit_insn (gen_strset (destptr, destmem,
- gen_lowpart (HImode, promoted_to_gpr_value)));
+ dest = change_address (destmem, HImode, destptr);
+ emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
emit_label (label);
LABEL_NUSES (label) = 1;
}
if (max_size > 1)
{
rtx label = ix86_expand_aligntest (count, 1, true);
- destmem = change_address (destmem, QImode, destptr);
- emit_insn (gen_strset (destptr, destmem,
- gen_lowpart (QImode, promoted_to_gpr_value)));
+ dest = change_address (destmem, QImode, destptr);
+ emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
emit_label (label);
LABEL_NUSES (label) = 1;
}
@@ -21814,8 +21453,8 @@ expand_movmem_prologue (rtx destmem, rtx srcmem,
if (align <= 1 && desired_alignment > 1)
{
rtx label = ix86_expand_aligntest (destptr, 1, false);
- srcmem = adjust_automodify_address_nv (srcmem, QImode, srcptr, 0);
- destmem = adjust_automodify_address_nv (destmem, QImode, destptr, 0);
+ srcmem = change_address (srcmem, QImode, srcptr);
+ destmem = change_address (destmem, QImode, destptr);
emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
ix86_adjust_counter (count, 1);
emit_label (label);
@@ -21824,8 +21463,8 @@ expand_movmem_prologue (rtx destmem, rtx srcmem,
if (align <= 2 && desired_alignment > 2)
{
rtx label = ix86_expand_aligntest (destptr, 2, false);
- srcmem = adjust_automodify_address_nv (srcmem, HImode, srcptr, 0);
- destmem = adjust_automodify_address_nv (destmem, HImode, destptr, 0);
+ srcmem = change_address (srcmem, HImode, srcptr);
+ destmem = change_address (destmem, HImode, destptr);
emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
ix86_adjust_counter (count, 2);
emit_label (label);
@@ -21834,34 +21473,14 @@ expand_movmem_prologue (rtx destmem, rtx srcmem,
if (align <= 4 && desired_alignment > 4)
{
rtx label = ix86_expand_aligntest (destptr, 4, false);
- srcmem = adjust_automodify_address_nv (srcmem, SImode, srcptr, 0);
- destmem = adjust_automodify_address_nv (destmem, SImode, destptr, 0);
+ srcmem = change_address (srcmem, SImode, srcptr);
+ destmem = change_address (destmem, SImode, destptr);
emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
ix86_adjust_counter (count, 4);
emit_label (label);
LABEL_NUSES (label) = 1;
}
- if (align <= 8 && desired_alignment > 8)
- {
- rtx label = ix86_expand_aligntest (destptr, 8, false);
- if (TARGET_64BIT || TARGET_SSE)
- {
- srcmem = adjust_automodify_address_nv (srcmem, DImode, srcptr, 0);
- destmem = adjust_automodify_address_nv (destmem, DImode, destptr, 0);
- emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
- }
- else
- {
- srcmem = adjust_automodify_address_nv (srcmem, SImode, srcptr, 0);
- destmem = adjust_automodify_address_nv (destmem, SImode, destptr, 0);
- emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
- emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
- }
- ix86_adjust_counter (count, 8);
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
- gcc_assert (desired_alignment <= 16);
+ gcc_assert (desired_alignment <= 8);
}
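The runtime alignment prologue above now stops at 8-byte alignment (the DImode step is gone and desired_alignment is asserted <= 8). A hedged sketch of the underlying arithmetic, with hypothetical names:

/* Peeling 1, 2 and then 4 bytes whenever the corresponding address bit
   is set leaves DST aligned to DESIRED_ALIGN; the real code does each
   test at run time behind a label.  */
static unsigned long
bytes_to_align (unsigned long dst, int desired_align)
{
  unsigned long copied = 0;
  int size;

  for (size = 1; size < desired_align; size <<= 1)
    if ((dst + copied) & size)
      copied += size;		/* one SIZE-byte strmov */
  return copied;
}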
/* Copy enough from DST to SRC to align DST known to DESIRED_ALIGN.
@@ -21916,37 +21535,6 @@ expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
off = 4;
emit_insn (gen_strmov (destreg, dst, srcreg, src));
}
- if (align_bytes & 8)
- {
- if (TARGET_64BIT || TARGET_SSE)
- {
- dst = adjust_automodify_address_nv (dst, DImode, destreg, off);
- src = adjust_automodify_address_nv (src, DImode, srcreg, off);
- emit_insn (gen_strmov (destreg, dst, srcreg, src));
- }
- else
- {
- dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
- src = adjust_automodify_address_nv (src, SImode, srcreg, off);
- emit_insn (gen_strmov (destreg, dst, srcreg, src));
- emit_insn (gen_strmov (destreg, dst, srcreg, src));
- }
- if (MEM_ALIGN (dst) < 8 * BITS_PER_UNIT)
- set_mem_align (dst, 8 * BITS_PER_UNIT);
- if (src_align_bytes >= 0)
- {
- unsigned int src_align = 0;
- if ((src_align_bytes & 7) == (align_bytes & 7))
- src_align = 8;
- else if ((src_align_bytes & 3) == (align_bytes & 3))
- src_align = 4;
- else if ((src_align_bytes & 1) == (align_bytes & 1))
- src_align = 2;
- if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
- set_mem_align (src, src_align * BITS_PER_UNIT);
- }
- off = 8;
- }
dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
@@ -21954,9 +21542,7 @@ expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
if (src_align_bytes >= 0)
{
unsigned int src_align = 0;
- if ((src_align_bytes & 15) == (align_bytes & 15))
- src_align = 16;
- else if ((src_align_bytes & 7) == (align_bytes & 7))
+ if ((src_align_bytes & 7) == (align_bytes & 7))
src_align = 8;
else if ((src_align_bytes & 3) == (align_bytes & 3))
src_align = 4;
@@ -21984,7 +21570,7 @@ expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
if (align <= 1 && desired_alignment > 1)
{
rtx label = ix86_expand_aligntest (destptr, 1, false);
- destmem = adjust_automodify_address_nv (destmem, QImode, destptr, 0);
+ destmem = change_address (destmem, QImode, destptr);
emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
ix86_adjust_counter (count, 1);
emit_label (label);
@@ -21993,7 +21579,7 @@ expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
if (align <= 2 && desired_alignment > 2)
{
rtx label = ix86_expand_aligntest (destptr, 2, false);
- destmem = adjust_automodify_address_nv (destmem, HImode, destptr, 0);
+ destmem = change_address (destmem, HImode, destptr);
emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
ix86_adjust_counter (count, 2);
emit_label (label);
@@ -22002,23 +21588,13 @@ expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
if (align <= 4 && desired_alignment > 4)
{
rtx label = ix86_expand_aligntest (destptr, 4, false);
- destmem = adjust_automodify_address_nv (destmem, SImode, destptr, 0);
+ destmem = change_address (destmem, SImode, destptr);
emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
ix86_adjust_counter (count, 4);
emit_label (label);
LABEL_NUSES (label) = 1;
}
- if (align <= 8 && desired_alignment > 8)
- {
- rtx label = ix86_expand_aligntest (destptr, 8, false);
- destmem = adjust_automodify_address_nv (destmem, SImode, destptr, 0);
- emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
- emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
- ix86_adjust_counter (count, 8);
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
- gcc_assert (desired_alignment <= 16);
+ gcc_assert (desired_alignment <= 8);
}
/* Set enough from DST to align DST known to by aligned by ALIGN to
@@ -22054,19 +21630,6 @@ expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
emit_insn (gen_strset (destreg, dst,
gen_lowpart (SImode, value)));
}
- if (align_bytes & 8)
- {
- dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
- emit_insn (gen_strset (destreg, dst,
- gen_lowpart (SImode, value)));
- off = 4;
- dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
- emit_insn (gen_strset (destreg, dst,
- gen_lowpart (SImode, value)));
- if (MEM_ALIGN (dst) < 8 * BITS_PER_UNIT)
- set_mem_align (dst, 8 * BITS_PER_UNIT);
- off = 4;
- }
dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
set_mem_align (dst, desired_align * BITS_PER_UNIT);
@@ -22078,7 +21641,7 @@ expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
static enum stringop_alg
decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
- int *dynamic_check, bool align_unknown)
+ int *dynamic_check)
{
const struct stringop_algs * algs;
bool optimize_for_speed;
@@ -22087,44 +21650,40 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
consider such algorithms if the user has appropriated those
registers for their own purposes. */
bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
- || (memset
+ || (memset
? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
-#define ALG_USABLE_P(alg) ((rep_prefix_usable \
- || (alg != rep_prefix_1_byte \
- && alg != rep_prefix_4_byte \
- && alg != rep_prefix_8_byte)) \
- && (TARGET_SSE2 || alg != sse_loop))
+#define ALG_USABLE_P(alg) (rep_prefix_usable \
+ || (alg != rep_prefix_1_byte \
+ && alg != rep_prefix_4_byte \
+ && alg != rep_prefix_8_byte))
const struct processor_costs *cost;
/* Even if the string operation call is cold, we still might spend a lot
of time processing large blocks. */
if (optimize_function_for_size_p (cfun)
|| (optimize_insn_for_size_p ()
- && expected_size != -1 && expected_size < 256))
+ && expected_size != -1 && expected_size < 256))
optimize_for_speed = false;
else
optimize_for_speed = true;
- *dynamic_check = -1;
- if (!optimize)
- return (rep_prefix_usable ? rep_prefix_1_byte : libcall);
-
cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
+ *dynamic_check = -1;
if (memset)
- algs = &cost->memset[align_unknown][TARGET_64BIT != 0];
+ algs = &cost->memset[TARGET_64BIT != 0];
else
- algs = &cost->memcpy[align_unknown][TARGET_64BIT != 0];
+ algs = &cost->memcpy[TARGET_64BIT != 0];
if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
return ix86_stringop_alg;
/* rep; movq or rep; movl is the smallest variant. */
else if (!optimize_for_speed)
{
- if (!count || (count & 3) || memset)
- return rep_prefix_usable ? rep_prefix_1_byte : libcall;
+ if (!count || (count & 3))
+ return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
else
- return rep_prefix_usable ? rep_prefix_4_byte : libcall;
+ return rep_prefix_usable ? rep_prefix_4_byte : loop;
}
/* Very tiny blocks are best handled via the loop; REP is expensive to set up. */
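For reference, a C-level paraphrase of the simplified ALG_USABLE_P above; the enum is a stand-in for the real stringop_alg in i386.opt. With the sse_loop clause gone, only the rep-prefix algorithms depend on the implicit string registers being free.

enum stringop_alg_sketch { rep_prefix_1_byte, rep_prefix_4_byte,
			   rep_prefix_8_byte, loop, loop_1_byte, libcall };

static int
alg_usable_p (enum stringop_alg_sketch alg, int rep_prefix_usable)
{
  return rep_prefix_usable
	 || (alg != rep_prefix_1_byte
	     && alg != rep_prefix_4_byte
	     && alg != rep_prefix_8_byte);
}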
@@ -22178,32 +21737,30 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
int max = -1;
enum stringop_alg alg;
int i;
- bool only_libcall_fits = true;
+ bool any_alg_usable_p = true;
for (i = 0; i < MAX_STRINGOP_ALGS; i++)
- {
- enum stringop_alg candidate = algs->size[i].alg;
+ {
+ enum stringop_alg candidate = algs->size[i].alg;
+ any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
- if (candidate != libcall && candidate
- && ALG_USABLE_P (candidate))
- {
- max = algs->size[i].max;
- only_libcall_fits = false;
- }
- }
+ if (candidate != libcall && candidate
+ && ALG_USABLE_P (candidate))
+ max = algs->size[i].max;
+ }
/* If there aren't any usable algorithms, then recursing on
- smaller sizes isn't going to find anything. Just return the
- simple byte-at-a-time copy loop. */
- if (only_libcall_fits)
- {
- /* Pick something reasonable. */
- if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
- *dynamic_check = 128;
- return loop_1_byte;
- }
+ smaller sizes isn't going to find anything. Just return the
+ simple byte-at-a-time copy loop. */
+ if (!any_alg_usable_p)
+ {
+ /* Pick something reasonable. */
+ if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
+ *dynamic_check = 128;
+ return loop_1_byte;
+ }
if (max == -1)
max = 4096;
- alg = decide_alg (count, max / 2, memset, dynamic_check, align_unknown);
+ alg = decide_alg (count, max / 2, memset, dynamic_check);
gcc_assert (*dynamic_check == -1);
gcc_assert (alg != libcall);
if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
@@ -22227,14 +21784,9 @@ decide_alignment (int align,
case no_stringop:
gcc_unreachable ();
case loop:
- desired_align = GET_MODE_SIZE (Pmode);
- break;
case unrolled_loop:
desired_align = GET_MODE_SIZE (Pmode);
break;
- case sse_loop:
- desired_align = 16;
- break;
case rep_prefix_8_byte:
desired_align = 8;
break;
@@ -22322,11 +21874,6 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
enum stringop_alg alg;
int dynamic_check;
bool need_zero_guard = false;
- bool align_unknown;
- unsigned int unroll_factor;
- enum machine_mode move_mode;
- rtx loop_iter = NULL_RTX;
- int dst_offset, src_offset;
if (CONST_INT_P (align_exp))
align = INTVAL (align_exp);
@@ -22350,17 +21897,9 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
/* Step 0: Decide on preferred algorithm, desired alignment and
size of chunks to be copied by main loop. */
- dst_offset = get_mem_align_offset (dst, MOVE_MAX*BITS_PER_UNIT);
- src_offset = get_mem_align_offset (src, MOVE_MAX*BITS_PER_UNIT);
- align_unknown = (dst_offset < 0
- || src_offset < 0
- || src_offset != dst_offset);
- alg = decide_alg (count, expected_size, false, &dynamic_check, align_unknown);
+
+ alg = decide_alg (count, expected_size, false, &dynamic_check);
desired_align = decide_alignment (align, alg, expected_size);
- if (align_unknown)
- desired_align = align;
- unroll_factor = 1;
- move_mode = Pmode;
if (!TARGET_ALIGN_STRINGOPS)
align = desired_align;
@@ -22379,36 +21918,11 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
gcc_unreachable ();
case loop:
need_zero_guard = true;
- move_mode = Pmode;
- unroll_factor = 1;
- size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
+ size_needed = GET_MODE_SIZE (Pmode);
break;
case unrolled_loop:
need_zero_guard = true;
- move_mode = Pmode;
- unroll_factor = 1;
- /* Select maximal available 1,2 or 4 unroll factor.
- In 32bit we can not afford to use 4 registers inside the loop. */
- if (!count)
- unroll_factor = TARGET_64BIT ? 4 : 2;
- else
- while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
- && unroll_factor < (TARGET_64BIT ? 4 :2))
- unroll_factor *= 2;
- size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
- break;
- case sse_loop:
- need_zero_guard = true;
- /* Use SSE instructions, if possible. */
- move_mode = V4SImode;
- /* Select maximal available 1,2 or 4 unroll factor. */
- if (!count)
- unroll_factor = 4;
- else
- while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
- && unroll_factor < 4)
- unroll_factor *= 2;
- size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
+ size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2);
break;
case rep_prefix_8_byte:
size_needed = 8;
@@ -22469,12 +21983,6 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
}
else
{
- /* SSE and unrolled algs re-use iteration counter in the epilogue. */
- if (alg == sse_loop || alg == unrolled_loop)
- {
- loop_iter = gen_reg_rtx (counter_mode (count_exp));
- emit_move_insn (loop_iter, const0_rtx);
- }
label = gen_label_rtx ();
emit_cmp_and_jump_insns (count_exp,
GEN_INT (epilogue_size_needed),
@@ -22526,8 +22034,6 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
dst = change_address (dst, BLKmode, destreg);
expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
desired_align);
- set_mem_align (src, desired_align*BITS_PER_UNIT);
- set_mem_align (dst, desired_align*BITS_PER_UNIT);
}
else
{
@@ -22584,16 +22090,12 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
count_exp, Pmode, 1, expected_size);
break;
- case sse_loop:
case unrolled_loop:
- /* In some cases we want to use the same iterator in several adjacent
- loops, so here we save loop iterator rtx and don't update addresses. */
- loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, src, destreg,
- srcreg, NULL,
- count_exp, loop_iter,
- move_mode,
- unroll_factor,
- expected_size, false);
+ /* Unroll only by a factor of 2 in 32-bit mode, since we don't have enough
+ registers for 4 temporaries anyway. */
+ expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
+ count_exp, Pmode, TARGET_64BIT ? 4 : 2,
+ expected_size);
break;
case rep_prefix_8_byte:
expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
@@ -22644,47 +22146,9 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
LABEL_NUSES (label) = 1;
}
- /* We haven't updated addresses, so we'll do it now.
- Also, if the epilogue seems to be big, we'll generate a loop (not
- unrolled) in it. We do this only if alignment is unknown, because in
- that case the epilogue has to move the data byte by byte, which is very
- slow. */
- if (alg == sse_loop || alg == unrolled_loop)
- {
- rtx tmp;
- int remainder_size = epilogue_size_needed;
-
- /* We may not need the epilogue loop at all when the count is known
- and alignment is not adjusted. */
- if (count && desired_align <= align)
- remainder_size = count % epilogue_size_needed;
- if (remainder_size > 31)
- {
- /* Reduce the epilogue's size by creating a non-unrolled loop. If we don't
- do this, we can end up with a very big epilogue - when alignment is statically
- unknown the epilogue runs byte by byte, which may be very slow. */
- loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, src, destreg,
- srcreg, NULL, count_exp,
- loop_iter, move_mode, 1,
- expected_size, false);
- src = change_address (src, BLKmode, srcreg);
- dst = change_address (dst, BLKmode, destreg);
- epilogue_size_needed = GET_MODE_SIZE (move_mode);
- }
- tmp = expand_simple_binop (Pmode, PLUS, destreg, loop_iter, destreg,
- true, OPTAB_LIB_WIDEN);
- if (tmp != destreg)
- emit_move_insn (destreg, tmp);
-
- tmp = expand_simple_binop (Pmode, PLUS, srcreg, loop_iter, srcreg,
- true, OPTAB_LIB_WIDEN);
- if (tmp != srcreg)
- emit_move_insn (srcreg, tmp);
- }
if (count_exp != const0_rtx && epilogue_size_needed > 1)
expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
epilogue_size_needed);
-
if (jump_around_label)
emit_label (jump_around_label);
return true;
@@ -22702,37 +22166,7 @@ promote_duplicated_reg (enum machine_mode mode, rtx val)
rtx tmp;
int nops = mode == DImode ? 3 : 2;
- if (VECTOR_MODE_P (mode))
- {
- enum machine_mode inner = GET_MODE_INNER (mode);
- rtx promoted_val, vec_reg;
- if (CONST_INT_P (val))
- return ix86_build_const_vector (mode, true, val);
-
- promoted_val = promote_duplicated_reg (inner, val);
- vec_reg = gen_reg_rtx (mode);
- switch (mode)
- {
- case V2DImode:
- emit_insn (gen_vec_dupv2di (vec_reg, promoted_val));
- break;
- case V4SImode:
- emit_insn (gen_vec_dupv4si (vec_reg, promoted_val));
- break;
- default:
- gcc_unreachable ();
- break;
- }
-
- return vec_reg;
- }
gcc_assert (mode == SImode || mode == DImode);
- if (mode == DImode && !TARGET_64BIT)
- {
- rtx vec_reg = promote_duplicated_reg (V4SImode, val);
- vec_reg = convert_to_mode (V2DImode, vec_reg, 1);
- return vec_reg;
- }
if (val == const0_rtx)
return copy_to_mode_reg (mode, const0_rtx);
if (CONST_INT_P (val))
@@ -22798,27 +22232,11 @@ promote_duplicated_reg (enum machine_mode mode, rtx val)
static rtx
promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
{
- rtx promoted_val = NULL_RTX;
+ rtx promoted_val;
- if (size_needed > 8)
- {
- /* We want to promote to vector register, so we expect that at least SSE
- is available. */
- gcc_assert (TARGET_SSE);
-
- /* In case of promotion to vector register, we expect that val is a
- constant or already promoted to GPR value. */
- gcc_assert (GET_MODE (val) == Pmode || CONSTANT_P (val));
- if (TARGET_64BIT)
- promoted_val = promote_duplicated_reg (V2DImode, val);
- else
- promoted_val = promote_duplicated_reg (V4SImode, val);
- }
- else if (size_needed > 4)
- {
- gcc_assert (TARGET_64BIT);
- promoted_val = promote_duplicated_reg (DImode, val);
- }
+ if (TARGET_64BIT
+ && (size_needed > 4 || (desired_align > align && desired_align > 4)))
+ promoted_val = promote_duplicated_reg (DImode, val);
else if (size_needed > 2 || (desired_align > align && desired_align > 2))
promoted_val = promote_duplicated_reg (SImode, val);
else if (size_needed > 1 || (desired_align > align && desired_align > 1))
@@ -22846,15 +22264,10 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
int size_needed = 0, epilogue_size_needed;
int desired_align = 0, align_bytes = 0;
enum stringop_alg alg;
- rtx gpr_promoted_val = NULL;
- rtx vec_promoted_val = NULL;
+ rtx promoted_val = NULL;
+ bool force_loopy_epilogue = false;
int dynamic_check;
bool need_zero_guard = false;
- bool align_unknown;
- unsigned int unroll_factor;
- enum machine_mode move_mode;
- rtx loop_iter = NULL_RTX;
- bool early_jump = false;
if (CONST_INT_P (align_exp))
align = INTVAL (align_exp);
@@ -22874,11 +22287,8 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
/* Step 0: Decide on preferred algorithm, desired alignment and
size of chunks to be copied by main loop. */
- align_unknown = !(CONST_INT_P (align_exp) && INTVAL (align_exp) > 0);
- alg = decide_alg (count, expected_size, true, &dynamic_check, align_unknown);
+ alg = decide_alg (count, expected_size, true, &dynamic_check);
desired_align = decide_alignment (align, alg, expected_size);
- unroll_factor = 1;
- move_mode = Pmode;
if (!TARGET_ALIGN_STRINGOPS)
align = desired_align;
@@ -22896,34 +22306,11 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
gcc_unreachable ();
case loop:
need_zero_guard = true;
- move_mode = Pmode;
- size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
+ size_needed = GET_MODE_SIZE (Pmode);
break;
case unrolled_loop:
need_zero_guard = true;
- move_mode = Pmode;
- unroll_factor = 1;
- /* Select maximal available 1,2 or 4 unroll factor. */
- if (!count)
- unroll_factor = 4;
- else
- while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
- && unroll_factor < 4)
- unroll_factor *= 2;
- size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
- break;
- case sse_loop:
- need_zero_guard = true;
- move_mode = TARGET_64BIT ? V2DImode : V4SImode;
- unroll_factor = 1;
- /* Select maximal available 1,2 or 4 unroll factor. */
- if (!count)
- unroll_factor = 4;
- else
- while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
- && unroll_factor < 4)
- unroll_factor *= 2;
- size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
+ size_needed = GET_MODE_SIZE (Pmode) * 4;
break;
case rep_prefix_8_byte:
size_needed = 8;
@@ -22968,10 +22355,8 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
main loop and epilogue (i.e. one load of the big constant in
front of all the code). */
if (CONST_INT_P (val_exp))
- gpr_promoted_val = promote_duplicated_reg_to_size (val_exp,
- GET_MODE_SIZE (Pmode),
- GET_MODE_SIZE (Pmode),
- align);
+ promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
+ desired_align, align);
/* Ensure that alignment prologue won't copy past end of block. */
if (size_needed > 1 || (desired_align > 1 && desired_align > align))
{
@@ -22980,6 +22365,12 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
Make sure it is a power of 2. */
epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
+ /* To improve performance of small blocks, we jump around the VAL
+ promoting code. This means that if the promoted VAL is not constant,
+ we might not use it in the epilogue and have to fall back to the byte
+ loop variant. */
+ if (epilogue_size_needed > 2 && !promoted_val)
+ force_loopy_epilogue = true;
if (count)
{
if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
@@ -22994,14 +22385,7 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
}
else
{
- /* SSE and unrolled_loop algs re-use iteration counter in the epilogue. */
- if (alg == sse_loop || alg == unrolled_loop)
- {
- loop_iter = gen_reg_rtx (counter_mode (count_exp));
- emit_move_insn (loop_iter, const0_rtx);
- }
label = gen_label_rtx ();
- early_jump = true;
emit_cmp_and_jump_insns (count_exp,
GEN_INT (epilogue_size_needed),
LTU, 0, counter_mode (count_exp), 1, label);
@@ -23026,11 +22410,9 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
/* Step 2: Alignment prologue. */
/* Do the expensive promotion once we branched off the small blocks. */
- if (!gpr_promoted_val)
- gpr_promoted_val = promote_duplicated_reg_to_size (val_exp,
- GET_MODE_SIZE (Pmode),
- GET_MODE_SIZE (Pmode),
- align);
+ if (!promoted_val)
+ promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
+ desired_align, align);
gcc_assert (desired_align >= 1 && align >= 1);
if (desired_align > align)
@@ -23042,20 +22424,17 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
the pain to maintain it for the first move, so throw away
the info early. */
dst = change_address (dst, BLKmode, destreg);
- expand_setmem_prologue (dst, destreg, gpr_promoted_val, count_exp, align,
+ expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
desired_align);
- set_mem_align (dst, desired_align*BITS_PER_UNIT);
}
else
{
/* If we know how many bytes need to be stored before dst is
sufficiently aligned, maintain aliasing info accurately. */
- dst = expand_constant_setmem_prologue (dst, destreg, gpr_promoted_val,
+ dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
desired_align, align_bytes);
count_exp = plus_constant (count_exp, -align_bytes);
count -= align_bytes;
- if (count < (unsigned HOST_WIDE_INT) size_needed)
- goto epilogue;
}
if (need_zero_guard
&& (count < (unsigned HOST_WIDE_INT) size_needed
@@ -23083,7 +22462,7 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
emit_label (label);
LABEL_NUSES (label) = 1;
label = NULL;
- gpr_promoted_val = val_exp;
+ promoted_val = val_exp;
epilogue_size_needed = 1;
}
else if (label == NULL_RTX)
@@ -23097,40 +22476,27 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
case no_stringop:
gcc_unreachable ();
case loop_1_byte:
- expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, val_exp,
+ expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
count_exp, QImode, 1, expected_size);
break;
case loop:
- expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, gpr_promoted_val,
+ expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
count_exp, Pmode, 1, expected_size);
break;
case unrolled_loop:
- loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, NULL, destreg,
- NULL, gpr_promoted_val, count_exp,
- loop_iter, move_mode, unroll_factor,
- expected_size, false);
- break;
- case sse_loop:
- vec_promoted_val =
- promote_duplicated_reg_to_size (gpr_promoted_val,
- GET_MODE_SIZE (move_mode),
- GET_MODE_SIZE (move_mode), align);
- loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, NULL, destreg,
- NULL, vec_promoted_val, count_exp,
- loop_iter, move_mode, unroll_factor,
- expected_size, false);
+ expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
+ count_exp, Pmode, 4, expected_size);
break;
case rep_prefix_8_byte:
- gcc_assert (TARGET_64BIT);
- expand_setmem_via_rep_stos (dst, destreg, gpr_promoted_val, count_exp,
+ expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
DImode, val_exp);
break;
case rep_prefix_4_byte:
- expand_setmem_via_rep_stos (dst, destreg, gpr_promoted_val, count_exp,
+ expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
SImode, val_exp);
break;
case rep_prefix_1_byte:
- expand_setmem_via_rep_stos (dst, destreg, gpr_promoted_val, count_exp,
+ expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
QImode, val_exp);
break;
}
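The promoted_val handed to the rep-stos expanders above is the fill byte replicated across a word. A self-contained sketch of that broadcast; promote_duplicated_reg emits more careful rtl (shift/or sequences on some CPUs), so this is only the arithmetic idea:

/* Replicate byte C across WORD_BYTES bytes, e.g. 0xab -> 0xabababab.  */
static unsigned long long
broadcast_byte (unsigned char c, int word_bytes)
{
  unsigned long long ones = 0;
  int i;

  for (i = 0; i < word_bytes; i++)
    ones = (ones << 8) | 1;	/* 0x0101...01 */
  return (unsigned long long) c * ones;
}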
@@ -23161,46 +22527,17 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
}
emit_label (label);
LABEL_NUSES (label) = 1;
- /* We cannot rely on the fact that the promoted value is known. */
- vec_promoted_val = 0;
- if (early_jump)
- gpr_promoted_val = 0;
}
epilogue:
- if (alg == unrolled_loop || alg == sse_loop)
+ if (count_exp != const0_rtx && epilogue_size_needed > 1)
{
- rtx tmp;
- int remainder_size = epilogue_size_needed;
- if (count && desired_align <= align)
- remainder_size = count % epilogue_size_needed;
- /* We may not need the epilogue loop at all when the count is known
- and alignment is not adjusted. */
- if (remainder_size > 31
- && (alg == sse_loop ? vec_promoted_val : gpr_promoted_val))
- {
- /* Reduce the epilogue's size by creating a non-unrolled loop. If we don't
- do this, we can end up with a very big epilogue - when alignment is statically
- unknown the epilogue runs byte by byte, which may be very slow. */
- loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, NULL, destreg,
- NULL, (alg == sse_loop ? vec_promoted_val : gpr_promoted_val), count_exp,
- loop_iter, move_mode, 1,
- expected_size, false);
- dst = change_address (dst, BLKmode, destreg);
- epilogue_size_needed = GET_MODE_SIZE (move_mode);
- }
- tmp = expand_simple_binop (Pmode, PLUS, destreg, loop_iter, destreg,
- true, OPTAB_LIB_WIDEN);
- if (tmp != destreg)
- emit_move_insn (destreg, tmp);
+ if (force_loopy_epilogue)
+ expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
+ epilogue_size_needed);
+ else
+ expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
+ epilogue_size_needed);
}
- if (count_exp == const0_rtx || epilogue_size_needed <= 1)
- ;
- else if (!gpr_promoted_val)
- expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
- epilogue_size_needed);
- else
- expand_setmem_epilogue (dst, destreg, vec_promoted_val, gpr_promoted_val,
- val_exp, count_exp, epilogue_size_needed);
if (jump_around_label)
emit_label (jump_around_label);
return true;
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index b10a13cad8a..7721c465832 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -159,12 +159,8 @@ struct processor_costs {
const int fchs; /* cost of FCHS instruction. */
const int fsqrt; /* cost of FSQRT instruction. */
/* Specify what algorithm
- to use for stringops on unknown size.
- First index is used to specify whether
- alignment is known or not.
- Second - to specify whether 32 or 64 bits
- are used. */
- struct stringop_algs memcpy[2][2], memset[2][2];
+ to use for stringops on unknown size. */
+ struct stringop_algs memcpy[2], memset[2];
const int scalar_stmt_cost; /* Cost of any scalar operation, excluding
load and store. */
const int scalar_load_cost; /* Cost of scalar load. */
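With the alignment dimension removed, the cost tables are again indexed by word size only. A minimal stand-in (hypothetical types, mirroring the field shapes above) showing the lookup done in decide_alg:

struct stringop_algs_sketch { int unused; };
struct processor_costs_sketch {
  struct stringop_algs_sketch memcpy[2], memset[2];  /* [TARGET_64BIT] */
};

static const struct stringop_algs_sketch *
pick_table (const struct processor_costs_sketch *cost,
	    int memset_p, int bits64)
{
  return memset_p ? &cost->memset[bits64 != 0]
		  : &cost->memcpy[bits64 != 0];
}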
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index e797362d2dc..6c516e7b869 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -324,9 +324,6 @@ Enum(stringop_alg) String(loop) Value(loop)
EnumValue
Enum(stringop_alg) String(unrolled_loop) Value(unrolled_loop)
-EnumValue
-Enum(stringop_alg) String(sse_loop) Value(sse_loop)
-
mtls-dialect=
Target RejectNegative Joined Var(ix86_tls_dialect) Enum(tls_dialect) Init(TLS_DIALECT_GNU)
Use given thread-local storage dialect
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 97fc333c253..8c434b79bb8 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -7616,16 +7616,6 @@
(set_attr "prefix" "maybe_vex,orig,vex,maybe_vex,orig,orig")
(set_attr "mode" "V2SF,TI,TI,TI,V4SF,V2SF")])
-(define_expand "vec_dupv4si"
- [(set (match_operand:V4SI 0 "register_operand" "")
- (vec_duplicate:V4SI
- (match_operand:SI 1 "nonimmediate_operand" "")))]
- "TARGET_SSE"
-{
- if (!TARGET_AVX)
- operands[1] = force_reg (V4SImode, operands[1]);
-})
-
(define_insn "*vec_dupv4si"
[(set (match_operand:V4SI 0 "register_operand" "=x,x,x")
(vec_duplicate:V4SI
@@ -7642,16 +7632,6 @@
(set_attr "prefix" "maybe_vex,vex,orig")
(set_attr "mode" "TI,V4SF,V4SF")])
-(define_expand "vec_dupv2di"
- [(set (match_operand:V2DI 0 "register_operand" "")
- (vec_duplicate:V2DI
- (match_operand:DI 1 "nonimmediate_operand" "")))]
- "TARGET_SSE"
-{
- if (!TARGET_AVX)
- operands[1] = force_reg (V2DImode, operands[1]);
-})
-
(define_insn "*vec_dupv2di"
[(set (match_operand:V2DI 0 "register_operand" "=x,x,x,x")
(vec_duplicate:V2DI
diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md
index 542d3b87882..5799b0aca50 100644
--- a/gcc/config/i386/sync.md
+++ b/gcc/config/i386/sync.md
@@ -123,7 +123,7 @@
DONE;
})
-;; ??? From volume 3 section 7.1.1 Guaranteed Atomic Operations,
+;; ??? From volume 3 section 8.1.1 Guaranteed Atomic Operations,
;; Only beginning at Pentium family processors do we get any guarantee of
;; atomicity in aligned 64-bit quantities. Beginning at P6, we get a
;; guarantee for 64-bit accesses that do not cross a cacheline boundary.
@@ -281,7 +281,7 @@
(unspec:DI [(match_operand:DI 1 "memory_operand" "m")] UNSPEC_MOVA))
(clobber (match_operand:DF 2 "register_operand" "=f"))]
"TARGET_80387"
- "fild\t%1\;fistp\t%0"
+ "fild%Z1\t%1\;fistp%Z0\t%0"
[(set_attr "type" "multi")
;; Worst case based on full sib+offset32 addressing modes
(set_attr "length" "14")])
diff --git a/gcc/config/m68k/linux.h b/gcc/config/m68k/linux.h
index 70738d221bf..325faf73a75 100644
--- a/gcc/config/m68k/linux.h
+++ b/gcc/config/m68k/linux.h
@@ -235,3 +235,7 @@ along with GCC; see the file COPYING3. If not see
#undef WCHAR_TYPE_SIZE
#define WCHAR_TYPE_SIZE BITS_PER_WORD
+
+/* Install the __sync libcalls. */
+#undef TARGET_INIT_LIBFUNCS
+#define TARGET_INIT_LIBFUNCS m68k_init_sync_libfuncs
diff --git a/gcc/config/m68k/m68k.c b/gcc/config/m68k/m68k.c
index 51a04ed9c15..61267a80497 100644
--- a/gcc/config/m68k/m68k.c
+++ b/gcc/config/m68k/m68k.c
@@ -47,6 +47,7 @@ along with GCC; see the file COPYING3. If not see
#include "insn-codes.h"
#include "ggc.h"
#include "opts.h"
+#include "optabs.h"
enum reg_class regno_reg_class[] =
{
@@ -164,6 +165,7 @@ static rtx m68k_function_arg (cumulative_args_t, enum machine_mode,
const_tree, bool);
static bool m68k_cannot_force_const_mem (enum machine_mode mode, rtx x);
static bool m68k_output_addr_const_extra (FILE *, rtx);
+static void m68k_init_sync_libfuncs (void) ATTRIBUTE_UNUSED;
/* Initialize the GCC target structure. */
@@ -6524,4 +6526,10 @@ m68k_conditional_register_usage (void)
fixed_regs[PIC_REG] = call_used_regs[PIC_REG] = 1;
}
+static void
+m68k_init_sync_libfuncs (void)
+{
+ init_sync_libfuncs (UNITS_PER_WORD);
+}
+
#include "gt-m68k.h"
diff --git a/gcc/config/m68k/m68k.md b/gcc/config/m68k/m68k.md
index 672ef0db6c0..e4b4b59afc2 100644
--- a/gcc/config/m68k/m68k.md
+++ b/gcc/config/m68k/m68k.md
@@ -124,6 +124,10 @@
(define_constants
[(UNSPECV_BLOCKAGE 0)
+ (UNSPECV_CAS_1 1)
+ (UNSPECV_CAS_2 2)
+ (UNSPECV_TAS_1 3)
+ (UNSPECV_TAS_2 4)
])
;; Registers by name.
@@ -255,6 +259,10 @@
(const_int 0)]
(const_int 1)))
+;; Mode macros for integer operations.
+(define_mode_iterator I [QI HI SI])
+(define_mode_attr sz [(QI "%.b") (HI "%.w") (SI "%.l")])
+
;; Mode macros for floating point operations.
;; Valid floating point modes
(define_mode_iterator FP [SF DF (XF "TARGET_68881")])
@@ -7806,3 +7814,4 @@
[(set_attr "type" "ib")])
(include "cf.md")
+(include "sync.md")
diff --git a/gcc/config/m68k/sync.md b/gcc/config/m68k/sync.md
new file mode 100644
index 00000000000..9a5bcda4c04
--- /dev/null
+++ b/gcc/config/m68k/sync.md
@@ -0,0 +1,80 @@
+;; GCC machine description for m68k synchronization instructions.
+;; Copyright (C) 2011
+;; Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+
+
+(define_expand "atomic_compare_and_swap<mode>"
+ [(match_operand:QI 0 "register_operand" "") ;; bool success output
+ (match_operand:I 1 "register_operand" "") ;; oldval output
+ (match_operand:I 2 "memory_operand" "") ;; memory
+ (match_operand:I 3 "register_operand" "") ;; expected input
+ (match_operand:I 4 "register_operand" "") ;; newval input
+ (match_operand:SI 5 "const_int_operand" "") ;; is_weak
+ (match_operand:SI 6 "const_int_operand" "") ;; success model
+ (match_operand:SI 7 "const_int_operand" "")] ;; failure model
+ "TARGET_68020 || TARGET_68040"
+{
+ emit_insn (gen_atomic_compare_and_swap<mode>_1
+ (operands[0], operands[1], operands[2],
+ operands[3], operands[4]));
+ emit_insn (gen_negqi2 (operands[0], operands[0]));
+ DONE;
+})
+
+(define_insn "atomic_compare_and_swap<mode>_1"
+ [(set (match_operand:I 1 "register_operand" "=d")
+ (unspec_volatile:I
+ [(match_operand:I 2 "memory_operand" "+m")
+ (match_operand:I 3 "register_operand" "0")
+ (match_operand:I 4 "register_operand" "d")]
+ UNSPECV_CAS_1))
+ (set (match_dup 2)
+ (unspec_volatile:I
+ [(match_dup 2) (match_dup 3) (match_dup 4)]
+ UNSPECV_CAS_2))
+ (set (match_operand:QI 0 "register_operand" "=d")
+ (unspec_volatile:QI
+ [(match_dup 2) (match_dup 3) (match_dup 4)]
+ UNSPECV_CAS_2))]
+ "TARGET_68020 || TARGET_68040"
+ ;; Elide the seq if operands[0] is dead.
+ "cas<sz> %1,%4,%2\;seq %0")
+
+(define_expand "sync_test_and_setqi"
+ [(match_operand:QI 0 "register_operand" "")
+ (match_operand:QI 1 "memory_operand" "")
+ (match_operand:QI 2 "general_operand" "")]
+ "!(TARGET_68020 || TARGET_68040)"
+{
+ if (operands[2] != const1_rtx)
+ FAIL;
+ emit_insn (gen_sync_test_and_setqi_1 (operands[0], operands[1]));
+ emit_insn (gen_negqi2 (operands[0], operands[0]));
+ DONE;
+})
+
+(define_insn "sync_test_and_setqi_1"
+ [(set (match_operand:QI 0 "register_operand" "=d")
+ (unspec_volatile:QI
+ [(match_operand:QI 1 "memory_operand" "+m")]
+ UNSPECV_TAS_1))
+ (set (match_dup 1)
+ (unspec_volatile:QI [(match_dup 1)] UNSPECV_TAS_2))]
+ "!(TARGET_68020 || TARGET_68040)"
+ "tas %1\;sne %0")
diff --git a/gcc/config/mips/mips-protos.h b/gcc/config/mips/mips-protos.h
index b28b0b39165..dbabdffaef0 100644
--- a/gcc/config/mips/mips-protos.h
+++ b/gcc/config/mips/mips-protos.h
@@ -191,6 +191,9 @@ extern int mips_split_const_insns (rtx);
extern int mips_load_store_insns (rtx, rtx);
extern int mips_idiv_insns (void);
extern rtx mips_emit_move (rtx, rtx);
+#ifdef RTX_CODE
+extern void mips_emit_binary (enum rtx_code, rtx, rtx, rtx);
+#endif
extern rtx mips_pic_base_register (rtx);
extern rtx mips_got_load (rtx, rtx, enum mips_symbol_type);
extern bool mips_split_symbol (rtx, rtx, enum machine_mode, rtx *);
diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
index 75e73bda2a1..d3fd7097f57 100644
--- a/gcc/config/mips/mips.c
+++ b/gcc/config/mips/mips.c
@@ -2163,7 +2163,7 @@ static bool
mips16_unextended_reference_p (enum machine_mode mode, rtx base,
unsigned HOST_WIDE_INT offset)
{
- if (offset % GET_MODE_SIZE (mode) == 0)
+ if (mode != BLKmode && offset % GET_MODE_SIZE (mode) == 0)
{
if (GET_MODE_SIZE (mode) == 4 && base == stack_pointer_rtx)
return offset < 256U * GET_MODE_SIZE (mode);
@@ -2398,7 +2398,7 @@ mips_force_unary (enum machine_mode mode, enum rtx_code code, rtx op0)
/* Emit an instruction of the form (set TARGET (CODE OP0 OP1)). */
-static void
+void
mips_emit_binary (enum rtx_code code, rtx target, rtx op0, rtx op1)
{
emit_insn (gen_rtx_SET (VOIDmode, target,
@@ -3777,6 +3777,16 @@ mips_rtx_costs (rtx x, int code, int outer_code, int opno ATTRIBUTE_UNUSED,
return false;
case ZERO_EXTEND:
+ if (outer_code == SET
+ && ISA_HAS_BADDU
+ && (GET_CODE (XEXP (x, 0)) == TRUNCATE
+ || GET_CODE (XEXP (x, 0)) == SUBREG)
+ && GET_MODE (XEXP (x, 0)) == QImode
+ && GET_CODE (XEXP (XEXP (x, 0), 0)) == PLUS)
+ {
+ *total = set_src_cost (XEXP (XEXP (x, 0), 0), speed);
+ return true;
+ }
*total = mips_zero_extend_cost (mode, XEXP (x, 0));
return false;
@@ -15250,6 +15260,11 @@ mips_set_mips16_mode (int mips16_p)
/* MIPS16 has no BAL instruction. */
target_flags &= ~MASK_RELAX_PIC_CALLS;
+ /* The R4000 errata don't apply to any known MIPS16 cores.
+ It's simpler to make the R4000 fixes and MIPS16 mode
+ mutually exclusive. */
+ target_flags &= ~MASK_FIX_R4000;
+
if (flag_pic && !TARGET_OLDABI)
sorry ("MIPS16 PIC for ABIs other than o32 and o64");
@@ -15835,38 +15850,33 @@ mips_conditional_register_usage (void)
global_regs[CCDSP_PO_REGNUM] = 1;
global_regs[CCDSP_SC_REGNUM] = 1;
}
- else
- {
- int regno;
+ else
+ AND_COMPL_HARD_REG_SET (accessible_reg_set,
+ reg_class_contents[(int) DSP_ACC_REGS]);
- for (regno = DSP_ACC_REG_FIRST; regno <= DSP_ACC_REG_LAST; regno++)
- fixed_regs[regno] = call_used_regs[regno] = 1;
- }
if (!TARGET_HARD_FLOAT)
{
- int regno;
-
- for (regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
- fixed_regs[regno] = call_used_regs[regno] = 1;
- for (regno = ST_REG_FIRST; regno <= ST_REG_LAST; regno++)
- fixed_regs[regno] = call_used_regs[regno] = 1;
+ AND_COMPL_HARD_REG_SET (accessible_reg_set,
+ reg_class_contents[(int) FP_REGS]);
+ AND_COMPL_HARD_REG_SET (accessible_reg_set,
+ reg_class_contents[(int) ST_REGS]);
}
- else if (! ISA_HAS_8CC)
+ else if (!ISA_HAS_8CC)
{
- int regno;
-
/* We only have a single condition-code register. We implement
this by fixing all the condition-code registers and generating
RTL that refers directly to ST_REG_FIRST. */
- for (regno = ST_REG_FIRST; regno <= ST_REG_LAST; regno++)
- fixed_regs[regno] = call_used_regs[regno] = 1;
+ AND_COMPL_HARD_REG_SET (accessible_reg_set,
+ reg_class_contents[(int) ST_REGS]);
+ SET_HARD_REG_BIT (accessible_reg_set, FPSW_REGNUM);
+ fixed_regs[FPSW_REGNUM] = call_used_regs[FPSW_REGNUM] = 1;
}
- /* In MIPS16 mode, we permit the $t temporary registers to be used
- for reload. We prohibit the unused $s registers, since they
- are call-saved, and saving them via a MIPS16 register would
- probably waste more time than just reloading the value. */
if (TARGET_MIPS16)
{
+ /* In MIPS16 mode, we permit the $t temporary registers to be used
+ for reload. We prohibit the unused $s registers, since they
+ are call-saved, and saving them via a MIPS16 register would
+ probably waste more time than just reloading the value. */
fixed_regs[18] = call_used_regs[18] = 1;
fixed_regs[19] = call_used_regs[19] = 1;
fixed_regs[20] = call_used_regs[20] = 1;
@@ -15876,6 +15886,12 @@ mips_conditional_register_usage (void)
fixed_regs[26] = call_used_regs[26] = 1;
fixed_regs[27] = call_used_regs[27] = 1;
fixed_regs[30] = call_used_regs[30] = 1;
+
+ /* Do not allow HI and LO to be treated as register operands.
+ There are no MTHI or MTLO instructions (or any real need
+ for them) and one-way registers cannot easily be reloaded. */
+ AND_COMPL_HARD_REG_SET (operand_reg_set,
+ reg_class_contents[(int) MD_REGS]);
}
/* $f20-$f23 are call-clobbered for n64. */
if (mips_abi == ABI_64)
@@ -16061,12 +16077,20 @@ mips_mulsidi3_gen_fn (enum rtx_code ext_code)
case we still expand mulsidi3 for DMUL. */
if (ISA_HAS_DMUL3)
return signed_p ? gen_mulsidi3_64bit_dmul : NULL;
+ if (TARGET_MIPS16)
+ return (signed_p
+ ? gen_mulsidi3_64bit_mips16
+ : gen_umulsidi3_64bit_mips16);
if (TARGET_FIX_R4000)
return NULL;
return signed_p ? gen_mulsidi3_64bit : gen_umulsidi3_64bit;
}
else
{
+ if (TARGET_MIPS16)
+ return (signed_p
+ ? gen_mulsidi3_32bit_mips16
+ : gen_umulsidi3_32bit_mips16);
if (TARGET_FIX_R4000 && !ISA_HAS_DSP)
return signed_p ? gen_mulsidi3_32bit_r4000 : gen_umulsidi3_32bit_r4000;
return signed_p ? gen_mulsidi3_32bit : gen_umulsidi3_32bit;
diff --git a/gcc/config/mips/mips.md b/gcc/config/mips/mips.md
index c2211a3c170..55b7fffb665 100644
--- a/gcc/config/mips/mips.md
+++ b/gcc/config/mips/mips.md
@@ -1331,11 +1331,19 @@
(match_operand:GPR 2 "register_operand")))]
""
{
+ rtx lo;
+
if (TARGET_LOONGSON_2EF || TARGET_LOONGSON_3A)
emit_insn (gen_mul<mode>3_mul3_loongson (operands[0], operands[1],
operands[2]));
else if (ISA_HAS_<D>MUL3)
emit_insn (gen_mul<mode>3_mul3 (operands[0], operands[1], operands[2]));
+ else if (TARGET_MIPS16)
+ {
+ lo = gen_rtx_REG (<MODE>mode, LO_REGNUM);
+ emit_insn (gen_mul<mode>3_internal (lo, operands[1], operands[2]));
+ emit_move_insn (operands[0], lo);
+ }
else if (TARGET_FIX_R4000)
emit_insn (gen_mul<mode>3_r4000 (operands[0], operands[1], operands[2]));
else
@@ -1398,7 +1406,7 @@
(clobber (match_dup 0))])])
(define_insn "mul<mode>3_internal"
- [(set (match_operand:GPR 0 "register_operand" "=l")
+ [(set (match_operand:GPR 0 "muldiv_target_operand" "=l")
(mult:GPR (match_operand:GPR 1 "register_operand" "d")
(match_operand:GPR 2 "register_operand" "d")))]
"!TARGET_FIX_R4000"
@@ -1575,7 +1583,7 @@
;; Patterns generated by the define_peephole2 below.
(define_insn "*macc2"
- [(set (match_operand:SI 0 "register_operand" "=l")
+ [(set (match_operand:SI 0 "muldiv_target_operand" "=l")
(plus:SI (mult:SI (match_operand:SI 1 "register_operand" "d")
(match_operand:SI 2 "register_operand" "d"))
(match_dup 0)))
@@ -1589,7 +1597,7 @@
(set_attr "mode" "SI")])
(define_insn "*msac2"
- [(set (match_operand:SI 0 "register_operand" "=l")
+ [(set (match_operand:SI 0 "muldiv_target_operand" "=l")
(minus:SI (match_dup 0)
(mult:SI (match_operand:SI 1 "register_operand" "d")
(match_operand:SI 2 "register_operand" "d"))))
@@ -1744,11 +1752,25 @@
DONE;
})
+(define_expand "<u>mulsidi3_32bit_mips16"
+ [(set (match_operand:DI 0 "register_operand")
+ (mult:DI (any_extend:DI (match_operand:SI 1 "register_operand"))
+ (any_extend:DI (match_operand:SI 2 "register_operand"))))]
+ "!TARGET_64BIT && TARGET_MIPS16"
+{
+ rtx hilo;
+
+ hilo = gen_rtx_REG (DImode, MD_REG_FIRST);
+ emit_insn (gen_<u>mulsidi3_32bit (hilo, operands[1], operands[2]));
+ emit_move_insn (operands[0], hilo);
+ DONE;
+})
+
;; As well as being named patterns, these instructions are used by the
;; __builtin_mips_mult<u>() functions. We must always make those functions
;; available if !TARGET_64BIT && ISA_HAS_DSP.
(define_insn "<u>mulsidi3_32bit"
- [(set (match_operand:DI 0 "register_operand" "=ka")
+ [(set (match_operand:DI 0 "muldiv_target_operand" "=ka")
(mult:DI (any_extend:DI (match_operand:SI 1 "register_operand" "d"))
(any_extend:DI (match_operand:SI 2 "register_operand" "d"))))]
"!TARGET_64BIT && (!TARGET_FIX_R4000 || ISA_HAS_DSP)"
@@ -1766,20 +1788,27 @@
(mult:DI (any_extend:DI (match_operand:SI 1 "register_operand" "d"))
(any_extend:DI (match_operand:SI 2 "register_operand" "d"))))
(clobber (match_scratch:DI 3 "=x"))]
- "!TARGET_64BIT && TARGET_FIX_R4000"
+ "!TARGET_64BIT && TARGET_FIX_R4000 && !ISA_HAS_DSP"
"mult<u>\t%1,%2\;mflo\t%L0\;mfhi\t%M0"
[(set_attr "type" "imul")
(set_attr "mode" "SI")
(set_attr "length" "12")])
-(define_insn "<u>mulsidi3_64bit"
+(define_insn_and_split "<u>mulsidi3_64bit"
[(set (match_operand:DI 0 "register_operand" "=d")
(mult:DI (any_extend:DI (match_operand:SI 1 "register_operand" "d"))
(any_extend:DI (match_operand:SI 2 "register_operand" "d"))))
(clobber (match_scratch:TI 3 "=x"))
(clobber (match_scratch:DI 4 "=d"))]
- "TARGET_64BIT && !TARGET_FIX_R4000 && !ISA_HAS_DMUL3"
+ "TARGET_64BIT && !TARGET_FIX_R4000 && !ISA_HAS_DMUL3 && !TARGET_MIPS16"
"#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ emit_insn (gen_<u>mulsidi3_64bit_split (operands[0], operands[1],
+ operands[2], operands[4]));
+ DONE;
+}
[(set_attr "type" "imul")
(set_attr "mode" "SI")
(set (attr "length")
@@ -1787,63 +1816,52 @@
(const_int 16)
(const_int 28)))])
-(define_split
- [(set (match_operand:DI 0 "d_operand")
- (mult:DI (any_extend:DI (match_operand:SI 1 "d_operand"))
- (any_extend:DI (match_operand:SI 2 "d_operand"))))
- (clobber (match_operand:TI 3 "hilo_operand"))
- (clobber (match_operand:DI 4 "d_operand"))]
- "TARGET_64BIT && !TARGET_FIX_R4000 && ISA_HAS_EXT_INS && reload_completed"
- [(set (match_dup 3)
- (unspec:TI [(mult:DI (any_extend:DI (match_dup 1))
- (any_extend:DI (match_dup 2)))]
- UNSPEC_SET_HILO))
-
- ;; OP0 <- LO, OP4 <- HI
- (set (match_dup 0) (match_dup 5))
- (set (match_dup 4) (unspec:DI [(match_dup 3)] UNSPEC_MFHI))
-
- (set (zero_extract:DI (match_dup 0) (const_int 32) (const_int 32))
- (match_dup 4))]
- { operands[5] = gen_rtx_REG (DImode, LO_REGNUM); })
+(define_expand "<u>mulsidi3_64bit_mips16"
+ [(set (match_operand:DI 0 "register_operand")
+ (mult:DI (any_extend:DI (match_operand:SI 1 "register_operand"))
+ (any_extend:DI (match_operand:SI 2 "register_operand"))))]
+ "TARGET_64BIT && TARGET_MIPS16"
+{
+ emit_insn (gen_<u>mulsidi3_64bit_split (operands[0], operands[1],
+ operands[2], gen_reg_rtx (DImode)));
+ DONE;
+})
-(define_split
- [(set (match_operand:DI 0 "d_operand")
- (mult:DI (any_extend:DI (match_operand:SI 1 "d_operand"))
- (any_extend:DI (match_operand:SI 2 "d_operand"))))
- (clobber (match_operand:TI 3 "hilo_operand"))
- (clobber (match_operand:DI 4 "d_operand"))]
- "TARGET_64BIT && !TARGET_FIX_R4000 && !ISA_HAS_EXT_INS && reload_completed"
- [(set (match_dup 3)
- (unspec:TI [(mult:DI (any_extend:DI (match_dup 1))
- (any_extend:DI (match_dup 2)))]
- UNSPEC_SET_HILO))
-
- ;; OP0 <- LO, OP4 <- HI
- (set (match_dup 0) (match_dup 5))
- (set (match_dup 4) (unspec:DI [(match_dup 3)] UNSPEC_MFHI))
-
- ;; Zero-extend OP0.
- (set (match_dup 0)
- (ashift:DI (match_dup 0)
- (const_int 32)))
- (set (match_dup 0)
- (lshiftrt:DI (match_dup 0)
- (const_int 32)))
+(define_expand "<u>mulsidi3_64bit_split"
+ [(set (match_operand:DI 0 "register_operand")
+ (mult:DI (any_extend:DI (match_operand:SI 1 "register_operand"))
+ (any_extend:DI (match_operand:SI 2 "register_operand"))))
+ (clobber (match_operand:DI 3 "register_operand"))]
+ ""
+{
+ rtx hilo;

- ;; Shift OP4 into place.
- (set (match_dup 4)
- (ashift:DI (match_dup 4)
- (const_int 32)))
+ hilo = gen_rtx_REG (TImode, MD_REG_FIRST);
+ emit_insn (gen_<u>mulsidi3_64bit_hilo (hilo, operands[1], operands[2]));

- ;; OR the two halves together
- (set (match_dup 0)
- (ior:DI (match_dup 0)
- (match_dup 4)))]
- { operands[5] = gen_rtx_REG (DImode, LO_REGNUM); })
+ emit_move_insn (operands[0], gen_rtx_REG (DImode, LO_REGNUM));
+ emit_insn (gen_mfhidi_ti (operands[3], hilo));
+
+ if (ISA_HAS_EXT_INS)
+ emit_insn (gen_insvdi (operands[0], GEN_INT (32), GEN_INT (32),
+ operands[3]));
+ else
+ {
+ /* Zero-extend the low part. */
+ mips_emit_binary (ASHIFT, operands[0], operands[0], GEN_INT (32));
+ mips_emit_binary (LSHIFTRT, operands[0], operands[0], GEN_INT (32));
+
+ /* Shift the high part into place. */
+ mips_emit_binary (ASHIFT, operands[3], operands[3], GEN_INT (32));
+
+ /* OR the two halves together. */
+ mips_emit_binary (IOR, operands[0], operands[0], operands[3]);
+ }
+ DONE;
+})
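The else branch of the split above recombines the product's two 32-bit halves by hand. The same arithmetic as a hedged C sketch (function and variable names are illustrative only):

/* Sketch of the non-EXT/INS fallback: zero-extend the low word,
   shift the high word into place, then OR the halves together.  */
unsigned long long
combine_halves (unsigned long long lo, unsigned long long hi)
{
  lo = (lo << 32) >> 32;   /* zero-extend the low part */
  hi <<= 32;               /* shift the high part into place */
  return lo | hi;          /* OR the two halves together */
}

When ISA_HAS_EXT_INS holds, the single insvdi above has the same effect, inserting the high word directly into bits 32..63 of the result.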
(define_insn "<u>mulsidi3_64bit_hilo"
- [(set (match_operand:TI 0 "register_operand" "=x")
+ [(set (match_operand:TI 0 "muldiv_target_operand" "=x")
(unspec:TI
[(mult:DI
(any_extend:DI (match_operand:SI 1 "register_operand" "d"))
@@ -1867,7 +1885,7 @@
;; Widening multiply with negation.
(define_insn "*muls<u>_di"
- [(set (match_operand:DI 0 "register_operand" "=x")
+ [(set (match_operand:DI 0 "muldiv_target_operand" "=x")
(neg:DI
(mult:DI
(any_extend:DI (match_operand:SI 1 "register_operand" "d"))
@@ -1885,9 +1903,9 @@
;; in GENERATE_MADD_MSUB for -mno-dsp, but always ignore them for -mdsp,
;; even if !ISA_HAS_DSP_MULT.
(define_insn "<u>msubsidi4"
- [(set (match_operand:DI 0 "register_operand" "=ka")
+ [(set (match_operand:DI 0 "muldiv_target_operand" "=ka")
(minus:DI
- (match_operand:DI 3 "register_operand" "0")
+ (match_operand:DI 3 "muldiv_target_operand" "0")
(mult:DI
(any_extend:DI (match_operand:SI 1 "register_operand" "d"))
(any_extend:DI (match_operand:SI 2 "register_operand" "d")))))]
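As a rough C-level picture of the operation <u>msubsidi4 describes (a sketch; whether a multiply-subtract instruction is actually emitted depends on the GENERATE_MADD_MSUB/ISA_HAS_DSP conditions in the comment above):

/* Sketch: multiply-subtract on a 64-bit accumulator, the operation
   the <u>msubsidi4 named pattern provides to the middle end.  */
long long
msub_acc (long long acc, int a, int b)
{
  return acc - (long long) a * b;
}

The <u>maddsidi4 pattern later in this file is the analogous multiply-accumulate, acc + (long long) a * b.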
@@ -1918,6 +1936,9 @@
emit_insn (gen_<su>mulsi3_highpart_mulhi_internal (operands[0],
operands[1],
operands[2]));
+ else if (TARGET_MIPS16)
+ emit_insn (gen_<su>mulsi3_highpart_split (operands[0], operands[1],
+ operands[2]));
else
emit_insn (gen_<su>mulsi3_highpart_internal (operands[0], operands[1],
operands[2]));
@@ -1932,11 +1953,28 @@
(any_extend:DI (match_operand:SI 2 "register_operand" "d")))
(const_int 32))))
(clobber (match_scratch:SI 3 "=l"))]
- "!ISA_HAS_MULHI"
+ "!ISA_HAS_MULHI && !TARGET_MIPS16"
{ return TARGET_FIX_R4000 ? "mult<u>\t%1,%2\n\tmfhi\t%0" : "#"; }
"&& reload_completed && !TARGET_FIX_R4000"
[(const_int 0)]
{
+ emit_insn (gen_<su>mulsi3_highpart_split (operands[0], operands[1],
+ operands[2]));
+ DONE;
+}
+ [(set_attr "type" "imul")
+ (set_attr "mode" "SI")
+ (set_attr "length" "8")])
+
+(define_expand "<su>mulsi3_highpart_split"
+ [(set (match_operand:SI 0 "register_operand")
+ (truncate:SI
+ (lshiftrt:DI
+ (mult:DI (any_extend:DI (match_operand:SI 1 "register_operand"))
+ (any_extend:DI (match_operand:SI 2 "register_operand")))
+ (const_int 32))))]
+ ""
+{
rtx hilo;
if (TARGET_64BIT)
@@ -1952,10 +1990,7 @@
emit_insn (gen_mfhisi_di (operands[0], hilo));
}
DONE;
-}
- [(set_attr "type" "imul")
- (set_attr "mode" "SI")
- (set_attr "length" "8")])
+})
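In C terms, the operation this highpart split feeds through HI is the following (sketch only; the names are not from the port):

/* Sketch: <su>mulsi3_highpart yields the upper 32 bits of the
   widened 64-bit product; the split above reads them with mfhi.  */
int
smul_highpart (int a, int b)
{
  return (int) (((long long) a * b) >> 32);
}

unsigned int
umul_highpart (unsigned int a, unsigned int b)
{
  return (unsigned int) (((unsigned long long) a * b) >> 32);
}

The <su>muldi3_highpart patterns below are the same operation at DImode width, with the product widened to TImode.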
(define_insn "<su>mulsi3_highpart_mulhi_internal"
[(set (match_operand:SI 0 "register_operand" "=d")
@@ -1989,7 +2024,25 @@
;; Disable unsigned multiplication for -mfix-vr4120. This is for VR4120
;; errata MD(0), which says that dmultu does not always produce the
;; correct result.
-(define_insn_and_split "<su>muldi3_highpart"
+(define_expand "<su>muldi3_highpart"
+ [(set (match_operand:DI 0 "register_operand")
+ (truncate:DI
+ (lshiftrt:TI
+ (mult:TI (any_extend:TI (match_operand:DI 1 "register_operand"))
+ (any_extend:TI (match_operand:DI 2 "register_operand")))
+ (const_int 64))))]
+ "TARGET_64BIT && !(<CODE> == ZERO_EXTEND && TARGET_FIX_VR4120)"
+{
+ if (TARGET_MIPS16)
+ emit_insn (gen_<su>muldi3_highpart_split (operands[0], operands[1],
+ operands[2]));
+ else
+ emit_insn (gen_<su>muldi3_highpart_internal (operands[0], operands[1],
+ operands[2]));
+ DONE;
+})
+
+(define_insn_and_split "<su>muldi3_highpart_internal"
[(set (match_operand:DI 0 "register_operand" "=d")
(truncate:DI
(lshiftrt:TI
@@ -1997,21 +2050,37 @@
(any_extend:TI (match_operand:DI 2 "register_operand" "d")))
(const_int 64))))
(clobber (match_scratch:DI 3 "=l"))]
- "TARGET_64BIT && !(<CODE> == ZERO_EXTEND && TARGET_FIX_VR4120)"
+ "TARGET_64BIT
+ && !TARGET_MIPS16
+ && !(<CODE> == ZERO_EXTEND && TARGET_FIX_VR4120)"
{ return TARGET_FIX_R4000 ? "dmult<u>\t%1,%2\n\tmfhi\t%0" : "#"; }
"&& reload_completed && !TARGET_FIX_R4000"
[(const_int 0)]
{
+ emit_insn (gen_<su>muldi3_highpart_split (operands[0], operands[1],
+ operands[2]));
+ DONE;
+}
+ [(set_attr "type" "imul")
+ (set_attr "mode" "DI")
+ (set_attr "length" "8")])
+
+(define_expand "<su>muldi3_highpart_split"
+ [(set (match_operand:DI 0 "register_operand")
+ (truncate:DI
+ (lshiftrt:TI
+ (mult:TI (any_extend:TI (match_operand:DI 1 "register_operand"))
+ (any_extend:TI (match_operand:DI 2 "register_operand")))
+ (const_int 64))))]
+ ""
+{
rtx hilo;
hilo = gen_rtx_REG (TImode, MD_REG_FIRST);
emit_insn (gen_<u>mulditi3_internal (hilo, operands[1], operands[2]));
emit_insn (gen_mfhidi_ti (operands[0], hilo));
DONE;
-}
- [(set_attr "type" "imul")
- (set_attr "mode" "DI")
- (set_attr "length" "8")])
+})
(define_expand "<u>mulditi3"
[(set (match_operand:TI 0 "register_operand")
@@ -2019,7 +2088,15 @@
(any_extend:TI (match_operand:DI 2 "register_operand"))))]
"TARGET_64BIT && !(<CODE> == ZERO_EXTEND && TARGET_FIX_VR4120)"
{
- if (TARGET_FIX_R4000)
+ rtx hilo;
+
+ if (TARGET_MIPS16)
+ {
+ hilo = gen_rtx_REG (TImode, MD_REG_FIRST);
+ emit_insn (gen_<u>mulditi3_internal (hilo, operands[1], operands[2]));
+ emit_move_insn (operands[0], hilo);
+ }
+ else if (TARGET_FIX_R4000)
emit_insn (gen_<u>mulditi3_r4000 (operands[0], operands[1], operands[2]));
else
emit_insn (gen_<u>mulditi3_internal (operands[0], operands[1],
@@ -2028,7 +2105,7 @@
})
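For reference, the operation mulditi3 expands, as a hedged C sketch; it assumes a 64-bit target where GCC provides __int128, which is not something this hunk establishes:

/* Sketch: full 64x64->128 widening multiply.  On MIPS16 the TImode
   result is first formed in the HI/LO pair and then moved out, as in
   the expander above.  */
unsigned __int128
mul_full_64 (unsigned long long a, unsigned long long b)
{
  return (unsigned __int128) a * b;
}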
(define_insn "<u>mulditi3_internal"
- [(set (match_operand:TI 0 "register_operand" "=x")
+ [(set (match_operand:TI 0 "muldiv_target_operand" "=x")
(mult:TI (any_extend:TI (match_operand:DI 1 "register_operand" "d"))
(any_extend:TI (match_operand:DI 2 "register_operand" "d"))))]
"TARGET_64BIT
@@ -2067,11 +2144,11 @@
;; See the comment above <u>msubsidi4 for the relationship between
;; ISA_HAS_DSP and ISA_HAS_DSP_MULT.
(define_insn "<u>maddsidi4"
- [(set (match_operand:DI 0 "register_operand" "=ka")
+ [(set (match_operand:DI 0 "muldiv_target_operand" "=ka")
(plus:DI
(mult:DI (any_extend:DI (match_operand:SI 1 "register_operand" "d"))
(any_extend:DI (match_operand:SI 2 "register_operand" "d")))
- (match_operand:DI 3 "register_operand" "0")))]
+ (match_operand:DI 3 "muldiv_target_operand" "0")))]
"(TARGET_MAD || ISA_HAS_MACC || GENERATE_MADD_MSUB || ISA_HAS_DSP)
&& !TARGET_64BIT"
{
@@ -2311,72 +2388,113 @@
;; VR4120 errata MD(A1): signed division instructions do not work correctly
;; with negative operands. We use special libgcc functions instead.
-(define_insn_and_split "divmod<mode>4"
- [(set (match_operand:GPR 0 "register_operand" "=l")
+(define_expand "divmod<mode>4"
+ [(set (match_operand:GPR 0 "register_operand")
+ (div:GPR (match_operand:GPR 1 "register_operand")
+ (match_operand:GPR 2 "register_operand")))
+ (set (match_operand:GPR 3 "register_operand")
+ (mod:GPR (match_dup 1)
+ (match_dup 2)))]
+ "!TARGET_FIX_VR4120"
+{
+ if (TARGET_MIPS16)
+ {
+ emit_insn (gen_divmod<mode>4_split (operands[3], operands[1],
+ operands[2]));
+ emit_move_insn (operands[0], gen_rtx_REG (<MODE>mode, LO_REGNUM));
+ }
+ else
+ emit_insn (gen_divmod<mode>4_internal (operands[0], operands[1],
+ operands[2], operands[3]));
+ DONE;
+})
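A hedged C sketch of the divmod<mode>4 contract implemented above: a single division leaves the quotient in LO and the remainder in HI, and the expander hands both back (the struct and names are illustrative, not from the port):

/* Sketch: one hardware div produces both results.  The MIPS16 path
   above reads the remainder through the _split pattern and then
   copies the quotient out of LO.  */
struct quotrem { int quot; int rem; };

struct quotrem
divmod32 (int n, int d)
{
  struct quotrem r;
  r.quot = n / d;   /* taken from LO */
  r.rem  = n % d;   /* taken from HI */
  return r;
}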
+
+(define_insn_and_split "divmod<mode>4_internal"
+ [(set (match_operand:GPR 0 "muldiv_target_operand" "=l")
(div:GPR (match_operand:GPR 1 "register_operand" "d")
(match_operand:GPR 2 "register_operand" "d")))
(set (match_operand:GPR 3 "register_operand" "=d")
(mod:GPR (match_dup 1)
(match_dup 2)))]
- "!TARGET_FIX_VR4120"
+ "!TARGET_FIX_VR4120 && !TARGET_MIPS16"
"#"
"&& reload_completed"
[(const_int 0)]
{
- rtx hilo;
-
- if (TARGET_64BIT)
- {
- hilo = gen_rtx_REG (TImode, MD_REG_FIRST);
- emit_insn (gen_divmod<mode>4_hilo_ti (hilo, operands[1], operands[2]));
- emit_insn (gen_mfhi<mode>_ti (operands[3], hilo));
- }
- else
- {
- hilo = gen_rtx_REG (DImode, MD_REG_FIRST);
- emit_insn (gen_divmod<mode>4_hilo_di (hilo, operands[1], operands[2]));
- emit_insn (gen_mfhi<mode>_di (operands[3], hilo));
- }
+ emit_insn (gen_divmod<mode>4_split (operands[3], operands[1], operands[2]));
DONE;
}
[(set_attr "type" "idiv")
(set_attr "mode" "<MODE>")
(set_attr "length" "8")])
-(define_insn_and_split "udivmod<mode>4"
- [(set (match_operand:GPR 0 "register_operand" "=l")
+(define_expand "udivmod<mode>4"
+ [(set (match_operand:GPR 0 "register_operand")
+ (udiv:GPR (match_operand:GPR 1 "register_operand")
+ (match_operand:GPR 2 "register_operand")))
+ (set (match_operand:GPR 3 "register_operand")
+ (umod:GPR (match_dup 1)
+ (match_dup 2)))]
+ ""
+{
+ if (TARGET_MIPS16)
+ {
+ emit_insn (gen_udivmod<mode>4_split (operands[3], operands[1],
+ operands[2]));
+ emit_move_insn (operands[0], gen_rtx_REG (<MODE>mode, LO_REGNUM));
+ }
+ else
+ emit_insn (gen_udivmod<mode>4_internal (operands[0], operands[1],
+ operands[2], operands[3]));
+ DONE;
+})
+
+(define_insn_and_split "udivmod<mode>4_internal"
+ [(set (match_operand:GPR 0 "muldiv_target_operand" "=l")
(udiv:GPR (match_operand:GPR 1 "register_operand" "d")
(match_operand:GPR 2 "register_operand" "d")))
(set (match_operand:GPR 3 "register_operand" "=d")
(umod:GPR (match_dup 1)
(match_dup 2)))]
- ""
+ "!TARGET_MIPS16"
"#"
"reload_completed"
[(const_int 0)]
{
+ emit_insn (gen_udivmod<mode>4_split (operands[3], operands[1], operands[2]));
+ DONE;
+}
+ [(set_attr "type" "idiv")
+ (set_attr "mode" "<MODE>")
+ (set_attr "length" "8")])
+
+(define_expand "<u>divmod<mode>4_split"
+ [(set (match_operand:GPR 0 "register_operand")
+ (any_mod:GPR (match_operand:GPR 1 "register_operand")
+ (match_operand:GPR 2 "register_operand")))]
+ ""
+{
rtx hilo;
if (TARGET_64BIT)
{
hilo = gen_rtx_REG (TImode, MD_REG_FIRST);
- emit_insn (gen_udivmod<mode>4_hilo_ti (hilo, operands[1], operands[2]));
- emit_insn (gen_mfhi<mode>_ti (operands[3], hilo));
+ emit_insn (gen_<u>divmod<mode>4_hilo_ti (hilo, operands[1],
+ operands[2]));
+ emit_insn (gen_mfhi<mode>_ti (operands[0], hilo));
}
else
{
hilo = gen_rtx_REG (DImode, MD_REG_FIRST);
- emit_insn (gen_udivmod<mode>4_hilo_di (hilo, operands[1], operands[2]));
- emit_insn (gen_mfhi<mode>_di (operands[3], hilo));
+ emit_insn (gen_<u>divmod<mode>4_hilo_di (hilo, operands[1],
+ operands[2]));
+ emit_insn (gen_mfhi<mode>_di (operands[0], hilo));
}
DONE;
-}
- [(set_attr "type" "idiv")
- (set_attr "mode" "<MODE>")
- (set_attr "length" "8")])
+})
(define_insn "<u>divmod<GPR:mode>4_hilo_<HILO:mode>"
- [(set (match_operand:HILO 0 "register_operand" "=x")
+ [(set (match_operand:HILO 0 "muldiv_target_operand" "=x")
(unspec:HILO
[(any_div:GPR (match_operand:GPR 1 "register_operand" "d")
(match_operand:GPR 2 "register_operand" "d"))]
@@ -4590,7 +4708,7 @@
;; and the errata related to -mfix-vr4130.
(define_insn "mfhi<GPR:mode>_<HILO:mode>"
[(set (match_operand:GPR 0 "register_operand" "=d")
- (unspec:GPR [(match_operand:HILO 1 "register_operand" "x")]
+ (unspec:GPR [(match_operand:HILO 1 "hilo_operand" "x")]
UNSPEC_MFHI))]
""
{ return ISA_HAS_MACCHI ? "<GPR:d>macchi\t%0,%.,%." : "mfhi\t%0"; }
diff --git a/gcc/config/mips/predicates.md b/gcc/config/mips/predicates.md
index dd5148067cd..5e9398e69f3 100644
--- a/gcc/config/mips/predicates.md
+++ b/gcc/config/mips/predicates.md
@@ -127,6 +127,11 @@
(and (match_code "reg,subreg")
(match_test "ST_REG_P (true_regnum (op))")))
+(define_predicate "muldiv_target_operand"
+ (if_then_else (match_test "TARGET_MIPS16")
+ (match_operand 0 "hilo_operand")
+ (match_operand 0 "register_operand")))
+
(define_special_predicate "pc_or_label_operand"
(match_code "pc,label_ref"))
@@ -189,7 +194,9 @@
})
(define_predicate "move_operand"
- (match_operand 0 "general_operand")
+ ;; Allow HI and LO to be used as the source of a MIPS16 move.
+ (ior (match_operand 0 "general_operand")
+ (match_operand 0 "hilo_operand"))
{
enum mips_symbol_type symbol_type;
diff --git a/gcc/config/pa/pa-linux.h b/gcc/config/pa/pa-linux.h
index addc0e18c8b..6cc68a5c6e1 100644
--- a/gcc/config/pa/pa-linux.h
+++ b/gcc/config/pa/pa-linux.h
@@ -133,7 +133,6 @@ along with GCC; see the file COPYING3. If not see
} \
while (0)
-/* Linux always uses gas. */
#undef TARGET_GAS
#define TARGET_GAS 1