1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
|
/* Copyright (C) 2011-2021 Free Software Foundation, Inc.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This header is distributed to simplify porting x86_64 code that
makes explicit use of Intel intrinsics to powerpc64le.
It is the user's responsibility to determine if the results are
acceptable and make additional changes as necessary.
Note that much code that uses Intel intrinsics can be rewritten in
standard C or GNU C extensions, which are more portable and better
optimized across multiple targets. */
#if !defined _X86INTRIN_H_INCLUDED
# error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef _BMI2INTRIN_H_INCLUDED
#define _BMI2INTRIN_H_INCLUDED
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u32 (unsigned int __X, unsigned int __Y)
{
return ((__X << (32 - __Y)) >> (32 - __Y));
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
{
unsigned long long __res = (unsigned long long) __X * __Y;
*__P = (unsigned int) (__res >> 32);
return (unsigned int) __res;
}
#ifdef __PPC64__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
{
return ((__X << (64 - __Y)) >> (64 - __Y));
}
/* __int128 requires base 64-bit. */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u64 (unsigned long long __X, unsigned long long __Y,
unsigned long long *__P)
{
unsigned __int128 __res = (unsigned __int128) __X * __Y;
*__P = (unsigned long long) (__res >> 64);
return (unsigned long long) __res;
}
#ifdef _ARCH_PWR7
/* popcount and bpermd require power7 minimum. */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u64 (unsigned long long __X, unsigned long long __M)
{
unsigned long result = 0x0UL;
const unsigned long mask = 0x8000000000000000UL;
unsigned long m = __M;
unsigned long c, t;
unsigned long p;
/* The pop-count of the mask gives the number of the bits from
source to process. This is also needed to shift bits from the
source into the correct position for the result. */
p = 64 - __builtin_popcountl (__M);
/* The loop is for the number of '1' bits in the mask and clearing
each mask bit as it is processed. */
while (m != 0)
{
c = __builtin_clzl (m);
t = __X << (p - c);
m ^= (mask >> c);
result |= (t & (mask >> c));
p++;
}
return (result);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u64 (unsigned long long __X, unsigned long long __M)
{
unsigned long p = 0x4040404040404040UL; // initial bit permute control
const unsigned long mask = 0x8000000000000000UL;
unsigned long m = __M;
unsigned long c;
unsigned long result;
/* if the mask is constant and selects 8 bits or less we can use
the Power8 Bit permute instruction. */
if (__builtin_constant_p (__M) && (__builtin_popcountl (__M) <= 8))
{
/* Also if the pext mask is constant, then the popcount is
constant, we can evaluate the following loop at compile
time and use a constant bit permute vector. */
for (long i = 0; i < __builtin_popcountl (__M); i++)
{
c = __builtin_clzl (m);
p = (p << 8) | c;
m ^= (mask >> c);
}
result = __builtin_bpermd (p, __X);
}
else
{
p = 64 - __builtin_popcountl (__M);
result = 0;
/* We could a use a for loop here, but that combined with
-funroll-loops can expand to a lot of code. The while
loop avoids unrolling and the compiler commons the xor
from clearing the mask bit with the (m != 0) test. The
result is a more compact loop setup and body. */
while (m != 0)
{
unsigned long t;
c = __builtin_clzl (m);
t = (__X & (mask >> c)) >> (p - c);
m ^= (mask >> c);
result |= (t);
p++;
}
}
return (result);
}
/* these 32-bit implementations depend on 64-bit pdep/pext
which depend on _ARCH_PWR7. */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u32 (unsigned int __X, unsigned int __Y)
{
return _pdep_u64 (__X, __Y);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u32 (unsigned int __X, unsigned int __Y)
{
return _pext_u64 (__X, __Y);
}
#endif /* _ARCH_PWR7 */
#endif /* __PPC64__ */
#endif /* _BMI2INTRIN_H_INCLUDED */
|