diff options
-rw-r--r-- | gcc/ChangeLog | 30 | ||||
-rw-r--r-- | gcc/config/rs6000/rs6000-c.c | 62 | ||||
-rw-r--r-- | gcc/config/rs6000/rs6000-protos.h | 1 | ||||
-rw-r--r-- | gcc/config/rs6000/rs6000.c | 121 | ||||
-rw-r--r-- | gcc/config/rs6000/rs6000.h | 9 | ||||
-rw-r--r-- | gcc/config/rs6000/vsx.md | 39 | ||||
-rw-r--r-- | gcc/testsuite/ChangeLog | 4 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/powerpc/vec-extract-1.c | 27 |
8 files changed, 269 insertions, 24 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index cad90e6c92f..e51cc011e53 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,33 @@ +2016-07-28 Michael Meissner <meissner@linux.vnet.ibm.com> + + * config/rs6000/rs6000-protos.h (rs6000_split_vec_extract_var): + New declaration. + * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin): + Add support for vec_extract of vector double or vector long having + a variable element number on 64-bit ISA 2.07 systems or newer. + * config/rs6000/rs6000.c (rs6000_expand_vector_extract): + Likewise. + (rs6000_split_vec_extract_var): New function to split a + vec_extract built-in function with variable element number. + (rtx_is_swappable_p): Variable vec_extracts and shifts are not + swappable. + * config/rs6000/vsx.md (UNSPEC_VSX_VSLO): New unspec. + (UNSPEC_VSX_EXTRACT): Likewise. + (vsx_extract_<mode>, VSX_D iterator): Fix constraints to allow + direct move instructions to be generated on 64-bit ISA 2.07 + systems and newer, and to take advantage of the ISA 3.0 MFVSRLD + instruction. + (vsx_vslo_<mode>): New insn to do VSLO on V2DFmode and V2DImode + arguments for vec_extract variable element. + (vsx_extract_<mode>_var, VSX_D iterator): New insn to support + vec_extract with variable element on V2DFmode and V2DImode + vectors. + * config/rs6000/rs6000.h (TARGET_VEXTRACTUB): Remove + -mupper-regs-df requirement, since it isn't needed. + (TARGET_DIRECT_MOVE_64BIT): New macro to say whether we can + do direct moves on 64-bit systems, which allows optimization of + vec_extract on 64-bit ISA 2.07 systems and newer. + 2016-07-28 Kristina Martsenko <kristina.martsenko@arm.com> 2016-07-28 Wilco Dijkstra <wdijkstr@arm.com> diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c index b8ca0e9d3bc..2a60262f034 100644 --- a/gcc/config/rs6000/rs6000-c.c +++ b/gcc/config/rs6000/rs6000-c.c @@ -5105,29 +5105,61 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, arg2); } - /* If we can use the VSX xxpermdi instruction, use that for extract. */ + /* See if we can optimize vec_extracts with the current VSX instruction + set. */ mode = TYPE_MODE (arg1_type); - if ((mode == V2DFmode || mode == V2DImode) && VECTOR_MEM_VSX_P (mode) - && TREE_CODE (arg2) == INTEGER_CST - && wi::ltu_p (arg2, 2)) + if (VECTOR_MEM_VSX_P (mode)) + { tree call = NULL_TREE; + int nunits = GET_MODE_NUNITS (mode); - if (mode == V2DFmode) - call = rs6000_builtin_decls[VSX_BUILTIN_VEC_EXT_V2DF]; - else if (mode == V2DImode) - call = rs6000_builtin_decls[VSX_BUILTIN_VEC_EXT_V2DI]; + /* If the second argument is an integer constant, if the value is in + the expected range, generate the built-in code if we can. We need + 64-bit and direct move to extract the small integer vectors. */ + if (TREE_CODE (arg2) == INTEGER_CST && wi::ltu_p (arg2, nunits)) + { + switch (mode) + { + default: + break; + + case V1TImode: + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_EXT_V1TI]; + break; + + case V2DFmode: + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_EXT_V2DF]; + break; + + case V2DImode: + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_EXT_V2DI]; + break; + } + } + + /* If the second argument is variable, we can optimize it if we are + generating 64-bit code on a machine with direct move. */ + else if (TREE_CODE (arg2) != INTEGER_CST && TARGET_DIRECT_MOVE_64BIT) + { + switch (mode) + { + default: + break; + + case V2DFmode: + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_EXT_V2DF]; + break; + + case V2DImode: + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_EXT_V2DI]; + break; + } + } if (call) return build_call_expr (call, 2, arg1, arg2); } - else if (mode == V1TImode && VECTOR_MEM_VSX_P (mode) - && TREE_CODE (arg2) == INTEGER_CST - && wi::eq_p (arg2, 0)) - { - tree call = rs6000_builtin_decls[VSX_BUILTIN_VEC_EXT_V1TI]; - return build_call_expr (call, 2, arg1, arg2); - } /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2). */ arg1_inner_type = TREE_TYPE (arg1_type); diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index fefe7049804..fdf5c6885a7 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -62,6 +62,7 @@ extern void rs6000_expand_vector_init (rtx, rtx); extern void paired_expand_vector_init (rtx, rtx); extern void rs6000_expand_vector_set (rtx, rtx, int); extern void rs6000_expand_vector_extract (rtx, rtx, rtx); +extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx); extern bool altivec_expand_vec_perm_const (rtx op[4]); extern void altivec_expand_vec_perm_le (rtx op[4]); extern bool rs6000_expand_vec_perm_const (rtx op[4]); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index e5d8ad07198..e9447f7a8b9 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -6959,8 +6959,31 @@ rs6000_expand_vector_extract (rtx target, rtx vec, rtx elt) emit_insn (gen_vsx_extract_v4si (target, vec, elt)); return; } - else - break; + break; + } + } + else if (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (elt) + && TARGET_DIRECT_MOVE_64BIT) + { + if (GET_MODE (elt) != DImode) + { + rtx tmp = gen_reg_rtx (DImode); + convert_move (tmp, elt, 0); + elt = tmp; + } + + switch (mode) + { + case V2DFmode: + emit_insn (gen_vsx_extract_v2df_var (target, vec, elt)); + return; + + case V2DImode: + emit_insn (gen_vsx_extract_v2di_var (target, vec, elt)); + return; + + default: + gcc_unreachable (); } } @@ -6978,6 +7001,99 @@ rs6000_expand_vector_extract (rtx target, rtx vec, rtx elt) emit_move_insn (target, adjust_address_nv (mem, inner_mode, 0)); } +/* Split a variable vec_extract operation into the component instructions. */ + +void +rs6000_split_vec_extract_var (rtx dest, rtx src, rtx element, rtx tmp_gpr, + rtx tmp_altivec) +{ + machine_mode mode = GET_MODE (src); + machine_mode scalar_mode = GET_MODE (dest); + unsigned scalar_size = GET_MODE_SIZE (scalar_mode); + int byte_shift = exact_log2 (scalar_size); + + gcc_assert (byte_shift >= 0); + + if (REG_P (src) || SUBREG_P (src)) + { + int bit_shift = byte_shift + 3; + rtx element2; + + gcc_assert (REG_P (tmp_gpr) && REG_P (tmp_altivec)); + + /* For little endian, adjust element ordering. For V2DI/V2DF, we can use + an XOR, otherwise we need to subtract. The shift amount is so VSLO + will shift the element into the upper position (adding 3 to convert a + byte shift into a bit shift). */ + if (scalar_size == 8) + { + if (!VECTOR_ELT_ORDER_BIG) + { + emit_insn (gen_xordi3 (tmp_gpr, element, const1_rtx)); + element2 = tmp_gpr; + } + else + element2 = element; + + /* Generate RLDIC directly to shift left 6 bits and retrieve 1 + bit. */ + emit_insn (gen_rtx_SET (tmp_gpr, + gen_rtx_AND (DImode, + gen_rtx_ASHIFT (DImode, + element2, + GEN_INT (6)), + GEN_INT (64)))); + } + else + { + if (!VECTOR_ELT_ORDER_BIG) + { + rtx num_ele_m1 = GEN_INT (GET_MODE_NUNITS (mode) - 1); + + emit_insn (gen_anddi3 (tmp_gpr, element, num_ele_m1)); + emit_insn (gen_subdi3 (tmp_gpr, num_ele_m1, tmp_gpr)); + element2 = tmp_gpr; + } + else + element2 = element; + + emit_insn (gen_ashldi3 (tmp_gpr, element2, GEN_INT (bit_shift))); + } + + /* Get the value into the lower byte of the Altivec register where VSLO + expects it. */ + if (TARGET_P9_VECTOR) + emit_insn (gen_vsx_splat_v2di (tmp_altivec, tmp_gpr)); + else if (can_create_pseudo_p ()) + emit_insn (gen_vsx_concat_v2di (tmp_altivec, tmp_gpr, tmp_gpr)); + else + { + rtx tmp_di = gen_rtx_REG (DImode, REGNO (tmp_altivec)); + emit_move_insn (tmp_di, tmp_gpr); + emit_insn (gen_vsx_concat_v2di (tmp_altivec, tmp_di, tmp_di)); + } + + /* Do the VSLO to get the value into the final location. */ + switch (mode) + { + case V2DFmode: + emit_insn (gen_vsx_vslo_v2df (dest, src, tmp_altivec)); + return; + + case V2DImode: + emit_insn (gen_vsx_vslo_v2di (dest, src, tmp_altivec)); + return; + + default: + gcc_unreachable (); + } + + return; + } + else + gcc_unreachable (); + } + /* Return TRUE if OP is an invalid SUBREG operation on the e500. */ bool @@ -38640,6 +38756,7 @@ rtx_is_swappable_p (rtx op, unsigned int *special) case UNSPEC_VSX_CVDPSPN: case UNSPEC_VSX_CVSPDP: case UNSPEC_VSX_CVSPDPN: + case UNSPEC_VSX_EXTRACT: return 0; case UNSPEC_VSPLT_DIRECT: *special = SH_SPLAT; diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h index a2b16d9c73b..9b3e83bcd30 100644 --- a/gcc/config/rs6000/rs6000.h +++ b/gcc/config/rs6000/rs6000.h @@ -602,7 +602,6 @@ extern int rs6000_vector_align[]; #define TARGET_DIRECT_MOVE_128 (TARGET_P9_VECTOR && TARGET_DIRECT_MOVE \ && TARGET_POWERPC64) #define TARGET_VEXTRACTUB (TARGET_P9_VECTOR && TARGET_DIRECT_MOVE \ - && TARGET_UPPER_REGS_DF \ && TARGET_UPPER_REGS_DI && TARGET_POWERPC64) /* Byte/char syncs were added as phased in for ISA 2.06B, but are not present @@ -761,6 +760,14 @@ extern int rs6000_vector_align[]; && TARGET_SINGLE_FLOAT \ && TARGET_DOUBLE_FLOAT) +/* Macro to say whether we can do optimization where we need to do parts of the + calculation in 64-bit GPRs and then is transfered to the vector + registers. */ +#define TARGET_DIRECT_MOVE_64BIT (TARGET_DIRECT_MOVE \ + && TARGET_P8_VECTOR \ + && TARGET_POWERPC64 \ + && TARGET_UPPER_REGS_DI) + /* Whether the various reciprocal divide/square root estimate instructions exist, and whether we should automatically generate code for the instruction by default. */ diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index 1fddebcf5c3..ca569a2cbf1 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -309,6 +309,8 @@ UNSPEC_VSX_XVCVDPUXDS UNSPEC_VSX_SIGN_EXTEND UNSPEC_P9_MEMORY + UNSPEC_VSX_VSLO + UNSPEC_VSX_EXTRACT ]) ;; VSX moves @@ -2118,16 +2120,13 @@ ;; register was picked. Limit the scalar value to FPRs for now. (define_insn "vsx_extract_<mode>" - [(set (match_operand:<VS_scalar> 0 "gpc_reg_operand" - "=d, wm, wo, d") + [(set (match_operand:<VS_scalar> 0 "gpc_reg_operand" "=d, d, wr, wr") (vec_select:<VS_scalar> - (match_operand:VSX_D 1 "gpc_reg_operand" - "<VSa>, <VSa>, <VSa>, <VSa>") + (match_operand:VSX_D 1 "gpc_reg_operand" "<VSa>, <VSa>, wm, wo") (parallel - [(match_operand:QI 2 "const_0_to_1_operand" - "wD, wD, wL, n")])))] + [(match_operand:QI 2 "const_0_to_1_operand" "wD, n, wD, n")])))] "VECTOR_MEM_VSX_P (<MODE>mode)" { int element = INTVAL (operands[2]); @@ -2205,6 +2204,34 @@ [(set_attr "type" "fpstore") (set_attr "length" "4")]) +;; Variable V2DI/V2DF extract shift +(define_insn "vsx_vslo_<mode>" + [(set (match_operand:<VS_scalar> 0 "gpc_reg_operand" "=v") + (unspec:<VS_scalar> [(match_operand:VSX_D 1 "gpc_reg_operand" "v") + (match_operand:V2DI 2 "gpc_reg_operand" "v")] + UNSPEC_VSX_VSLO))] + "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_DIRECT_MOVE_64BIT" + "vslo %0,%1,%2" + [(set_attr "type" "vecperm")]) + +;; Variable V2DI/V2DF extract +(define_insn_and_split "vsx_extract_<mode>_var" + [(set (match_operand:<VS_scalar> 0 "gpc_reg_operand" "=v") + (unspec:<VS_scalar> [(match_operand:VSX_D 1 "input_operand" "v") + (match_operand:DI 2 "gpc_reg_operand" "r")] + UNSPEC_VSX_EXTRACT)) + (clobber (match_scratch:DI 3 "=r")) + (clobber (match_scratch:V2DI 4 "=&v"))] + "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_DIRECT_MOVE_64BIT" + "#" + "&& reload_completed" + [(const_int 0)] +{ + rs6000_split_vec_extract_var (operands[0], operands[1], operands[2], + operands[3], operands[4]); + DONE; +}) + ;; Extract a SF element from V4SF (define_insn_and_split "vsx_extract_v4sf" [(set (match_operand:SF 0 "vsx_register_operand" "=f,f") diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index fc764c97e97..239cf67c12d 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2016-07-28 Michael Meissner <meissner@linux.vnet.ibm.com> + + * gcc.target/powerpc/vec-extract-1.c: New test. + 2016-07-28 Steven G. Kargl <kargl@gcc.gnu.org> PR fortran/71799 diff --git a/gcc/testsuite/gcc.target/powerpc/vec-extract-1.c b/gcc/testsuite/gcc.target/powerpc/vec-extract-1.c new file mode 100644 index 00000000000..ef34e2bec2c --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vec-extract-1.c @@ -0,0 +1,27 @@ +/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */ +/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */ +/* { dg-require-effective-target powerpc_p8vector_ok } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ +/* { dg-options "-mcpu=power8 -O2 -mupper-regs-df -mupper-regs-di" } */ + +#include <altivec.h> + +double +add_double (vector double a, int n) +{ + return vec_extract (a, n) + 1.0; +} + +long +add_long (vector long a, int n) +{ + return vec_extract (a, n) + 1; +} + +/* { dg-final { scan-assembler "vslo" } } */ +/* { dg-final { scan-assembler "mtvsrd" } } */ +/* { dg-final { scan-assembler "mfvsrd" } } */ +/* { dg-final { scan-assembler-not "stxvd2x" } } */ +/* { dg-final { scan-assembler-not "stxvx" } } */ +/* { dg-final { scan-assembler-not "stxv" } } */ +/* { dg-final { scan-assembler-not "ldx" } } */ |