summaryrefslogtreecommitdiff
path: root/libgcc/config/spu/multi3.c
blob: b8b0e90ee252ac4658b00433eca70eef19cc4fac (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
/* Copyright (C) 2008, 2009 Free Software Foundation, Inc.
 
   This file is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3 of the License, or (at your option)
   any later version.
 
   This file is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.
 
   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <spu_intrinsics.h>

/* Signed 128-bit integer type: the mode (TI) attribute asks GCC to give
   the declared `int' the sixteen-byte TImode instead of its default
   width.  */
typedef int TItype __attribute__ ((mode (TI)));

/* Overlay of a quadword and a TItype.  si_from_TItype and si_to_TItype
   store through one member and load through the other to reinterpret
   a value without any conversion.  */
union qword_TItype
  {
    qword q;	/* The value viewed as an SPU quadword.  */
    TItype t;	/* The same storage viewed as a TItype integer.  */
  };
  
/* Reinterpret the TItype value T as a quadword.  The value is stored
   into the integer member of union qword_TItype and read back through
   the quadword member, so no conversion takes place.  */
static inline qword
si_from_TItype (TItype t)
{
  union qword_TItype pun;

  pun.t = t;
  return pun.q;
}

/* Reinterpret the quadword Q as a TItype value.  The value is stored
   into the quadword member of union qword_TItype and read back through
   the integer member, so no conversion takes place.  */
static inline TItype
si_to_TItype (qword q)
{
  union qword_TItype pun;

  pun.q = q;
  return pun.t;
}

/* Return the product of L and R, truncated to the width of TItype.
 *
 * A straightforward vectorization and unrolling of
 *   short l[8], r[8];
 *   TItype total = 0;
 *   for (i = 0; i < 8; i++)
 *     for (j = 0; j < 8; j++)
 *       total += (TItype)((l[7-i] * r[7-j]) << (16 * (i + j)));
 * where l[] and r[] are the eight 16-bit digits of each operand and
 * index 7 is the digit of weight zero.
 *
 * NOTE(review): the si_* intrinsics are declared in <spu_intrinsics.h>,
 * not in this file.  What the comments below say about them follows the
 * SPU instruction set documentation and should be confirmed there.
 */
TItype
__multi3 (TItype l, TItype r)
{
  /* View both operands as quadwords so the si_* intrinsics apply.  */
  qword u = si_from_TItype (l);
  qword v = si_from_TItype (r);
  /* splatK shuffles V with a control word built from byte indices 2K
     and 2K+1.  Expected effect: si_ilh repeats that index pair in every
     halfword, so si_shufb yields 16-bit digit K of R replicated across
     all eight halfword slots (digit 0 being the most significant).  */
  qword splat0 = si_shufb (v, v, si_ilh (0x0001));
  qword splat1 = si_shufb (v, v, si_ilh (0x0203));
  qword splat2 = si_shufb (v, v, si_ilh (0x0405));
  qword splat3 = si_shufb (v, v, si_ilh (0x0607));
  qword splat4 = si_shufb (v, v, si_ilh (0x0809));
  qword splat5 = si_shufb (v, v, si_ilh (0x0a0b));
  qword splat6 = si_shufb (v, v, si_ilh (0x0c0d));
  qword splat7 = si_shufb (v, v, si_ilh (0x0e0f));

  /* Partial products of U with a single digit of R.  partKl uses
     si_mpyu (expected: unsigned 16x16->32 product of the low halfword
     of each word) and partKh uses si_mpyhhu (expected: the same for the
     high halfword of each word).  si_shlqbyi then moves each result
     left by whole bytes to the weight of digit K: 2 * (7 - K) bytes for
     partKl and two bytes more for partKh.  Bytes shifted out at the top
     are dropped, which is what truncates the product.  There is no
     part0h: following the pattern it would need a 16-byte shift, i.e.
     it lies entirely above the result.  */
  qword part0l = si_shlqbyi (si_mpyu   (u, splat0), 14);
  qword part1h = si_shlqbyi (si_mpyhhu (u, splat1), 14);
  qword part1l = si_shlqbyi (si_mpyu   (u, splat1), 12);
  qword part2h = si_shlqbyi (si_mpyhhu (u, splat2), 12);
  qword part2l = si_shlqbyi (si_mpyu   (u, splat2), 10);
  qword part3h = si_shlqbyi (si_mpyhhu (u, splat3), 10);
  qword part3l = si_shlqbyi (si_mpyu   (u, splat3), 8);
  qword part4h = si_shlqbyi (si_mpyhhu (u, splat4), 8);
  qword part4l = si_shlqbyi (si_mpyu   (u, splat4), 6);
  qword part5h = si_shlqbyi (si_mpyhhu (u, splat5), 6);
  qword part5l = si_shlqbyi (si_mpyu   (u, splat5), 4);
  qword part6h = si_shlqbyi (si_mpyhhu (u, splat6), 4);
  qword part6l = si_shlqbyi (si_mpyu   (u, splat6), 2);
  qword part7h = si_shlqbyi (si_mpyhhu (u, splat7), 2);
  qword part7l = si_mpyu (u, splat7);

  qword carry, total0, total1, total2, total3, total4;
  qword total5, total6, total7, total8, total9, total10;
  qword total;

  /* Sum all fifteen partial products with si_a.  si_a is expected to
     add the four 32-bit words independently, so carries between words
     are not propagated here; they are gathered separately below.

     total0 is the only sum without matching si_cg calls.  Presumably
     that is safe because part0l, part1h, part1l and part2h were shifted
     by 12 or 14 bytes and are therefore non-zero only in the most
     significant word, whose carry-out is outside the result, while the
     remaining words just receive part7l plus zero -- TODO confirm.  */
  total0 = si_a (si_a (si_a (part0l, part1h), si_a (part1l, part2h)), part7l);
  total1 = si_a (part2l, part3h);
  total2 = si_a (part3l, part4h);
  total3 = si_a (part4l, part5h);
  total4 = si_a (part5l, part6h);
  total5 = si_a (part6l, part7h);
  total6 = si_a (total0, total1);
  total7 = si_a (total2, total3);
  total8 = si_a (total4, total5);
  total9 = si_a (total6, total7);
  total10 = si_a (total8, total9);

  /* Re-run every addition above (except those inside total0) through
     si_cg, which is expected to give the carry-out of each word, and
     accumulate the per-word carry counts in CARRY.  */
  carry = si_cg (part2l, part3h);
  carry = si_a (carry, si_cg (part3l, part4h));
  carry = si_a (carry, si_cg (part4l, part5h));
  carry = si_a (carry, si_cg (part5l, part6h));
  carry = si_a (carry, si_cg (part6l, part7h));
  carry = si_a (carry, si_cg (total0, total1));
  carry = si_a (carry, si_cg (total2, total3));
  carry = si_a (carry, si_cg (total4, total5));
  carry = si_a (carry, si_cg (total6, total7));
  carry = si_a (carry, si_cg (total8, total9));
  /* Move each count one word (4 bytes) up so it lines up with the word
     it has to be added to; the count from the top word is dropped.  */
  carry = si_shlqbyi (carry, 4);

  /* Final total10 + carry with carries rippling between words.
     Expected roles: si_cg gives the first-level carry-outs, si_cgx the
     carry-outs once those are fed in as carry-in, and si_addx the sum
     with carry-in; each 4-byte shift hands a carry to the next more
     significant word.  Two levels look sufficient because the shifted
     CARRY is zero in the least significant word, so that word cannot
     produce a carry -- TODO confirm.  */
  total = si_cg (total10, carry);
  total = si_shlqbyi (total, 4);
  total = si_cgx (total10, carry, total);
  total = si_shlqbyi (total, 4);
  total = si_addx (total10, carry, total);
  return si_to_TItype (total);
}