// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/objects/simd.h"

#include "src/base/cpu.h"
#include "src/objects/compressed-slots.h"
#include "src/objects/fixed-array-inl.h"
#include "src/objects/heap-number-inl.h"
#include "src/objects/smi-inl.h"

#ifdef _MSC_VER
// MSVC doesn't define __SSE3__. However, it does define __AVX__, and AVX
// implies SSE3.
#ifdef __AVX__
#ifndef __SSE3__
#define __SSE3__
#endif
#endif
#endif

#ifdef __SSE3__
#include <immintrin.h>
#endif

#ifdef V8_HOST_ARCH_ARM64
// We use Neon only on 64-bit ARM (because on 32-bit, some instructions and some
// types are not available). Note that ARM64 is guaranteed to have Neon.
#define NEON64
#include <arm_neon.h>
#endif

namespace v8 {
namespace internal {

namespace {

enum class SimdKinds { kSSE, kNeon, kAVX2, kNone };

inline SimdKinds get_vectorization_kind() {
#ifdef __SSE3__
  static base::CPU cpu;
  if (cpu.has_avx2()) {
    return SimdKinds::kAVX2;
  } else {
    // No need for a runtime check since we do not support x86/x64 CPUs without
    // SSE3.
    return SimdKinds::kSSE;
  }
#elif defined(NEON64)
  // No need for a runtime check since all Arm64 CPUs have Neon.
  return SimdKinds::kNeon;
#else
  return SimdKinds::kNone;
#endif
}

// Searches for |search_element| in |array| using a simple non-vectorized linear
// search. This is used as a fallback when SIMD is not available, and to process
// the tail of arrays that SIMD cannot process.
template <typename T>
inline uintptr_t slow_search(T* array, uintptr_t array_len, uintptr_t index,
                             T search_element) {
  for (; index < array_len; index++) {
    if (array[index] == search_element) {
      return index;
    }
  }
  return -1;
}

#ifdef NEON64
// extract_first_nonzero_index returns the first non-zero index in |v|. |v| is a
// Neon vector that can be either 32x4 (the return is then 0, 1, 2 or 3) or 64x2
// (the return is then 0 or 1). This is more or less equivalent to doing a
// movemask followed by a tzcnt on Intel.
//
// The input |v| should be a vector of -1 or 0 (for instance {0, 0},
// {0, -1, 0, -1}, {0, -1, 0, 0}), where -1 represents a match (and 0 a
// non-match), that was obtained by doing a vceqq. This function extracts the
// index of the first non-zero item of the vector. To do so, we "and" the vector
// with {4, 3, 2, 1} (each number is "4 - the index of the item it's in"), which
// produces a vector of "indices or 0". Then, we extract the maximum of this
// vector, which is the index of the 1st match. An example:
//
//   v = {-1, 0, 0, -1}
//   mask = {4, 3, 2, 1}
//   v & mask = {4, 0, 0, 1}
//   max(v & mask) = 4
//   index of the first match = 4-max = 4-4 = 0
//

// With MSVC, uint32x4_t and uint64x2_t typedef to a union whose first member
// is uint64_t[2], not uint32_t[4].
// The C++ standard dictates that a union can only be initialized through its
// first member, which forces us to express the initializer as uint64_t[2].
#if defined(_MSC_VER) && !defined(__clang__)
#define PACK32x4(w, x, y, z) \
  { ((w) + (uint64_t(x) << 32)), ((y) + (uint64_t(z) << 32)) }
#else
#define PACK32x4(w, x, y, z) \
  { (w), (x), (y), (z) }
#endif  // MSVC workaround
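// For illustration, with the MSVC workaround PACK32x4(4, 3, 2, 1) expands to
// { 4 + (uint64_t(3) << 32), 2 + (uint64_t(1) << 32) }, i.e.
// { 0x0000000300000004, 0x0000000100000002 }, which on little-endian ARM64
// yields the same 32-bit lanes {4, 3, 2, 1} as the plain initializer.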

V8_ALLOW_UNUSED inline int extract_first_nonzero_index_uint32x4_t(
    uint32x4_t v) {
  uint32x4_t mask = PACK32x4(4, 3, 2, 1);
  mask = vandq_u32(mask, v);
  return 4 - vmaxvq_u32(mask);
}

inline int extract_first_nonzero_index_uint64x2_t(uint64x2_t v) {
  uint32x4_t mask =
      PACK32x4(2, 0, 1, 0);  // Could also be {2,2,1,1} or {0,2,0,1}
  mask = vandq_u32(mask, vreinterpretq_u32_u64(v));
  return 2 - vmaxvq_u32(mask);
}
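// A worked example for extract_first_nonzero_index_uint64x2_t: if
// v = {0, -1} (match in lane 1), reinterpreting v as uint32x4_t gives
// {0, 0, -1, -1}; and-ing with the mask {2, 0, 1, 0} gives {0, 0, 1, 0};
// the maximum is 1, so the result is 2 - 1 = 1.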

inline int32_t reinterpret_vmaxvq_u64(uint64x2_t v) {
  return vmaxvq_u32(vreinterpretq_u32_u64(v));
}
#endif

#define VECTORIZED_LOOP_Neon(type_load, type_eq, set1, cmp, movemask)        \
  {                                                                          \
    constexpr int elems_in_vector = sizeof(type_load) / sizeof(T);           \
    type_load search_element_vec = set1(search_element);                     \
                                                                             \
    for (; index + elems_in_vector <= array_len; index += elems_in_vector) { \
      type_load vector = *reinterpret_cast<type_load*>(&array[index]);       \
      type_eq eq = cmp(vector, search_element_vec);                          \
      if (movemask(eq)) {                                                    \
        return index + extract_first_nonzero_index_##type_eq(eq);            \
      }                                                                      \
    }                                                                        \
  }
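// For reference, a rough sketch of what the is_uint32 instantiation
// VECTORIZED_LOOP_Neon(uint32x4_t, uint32x4_t, vdupq_n_u32, vceqq_u32,
// vmaxvq_u32) used below expands to:
//
//   constexpr int elems_in_vector = 4;  // sizeof(uint32x4_t) / sizeof(T)
//   uint32x4_t search_element_vec = vdupq_n_u32(search_element);
//   for (; index + 4 <= array_len; index += 4) {
//     uint32x4_t vector = *reinterpret_cast<uint32x4_t*>(&array[index]);
//     uint32x4_t eq = vceqq_u32(vector, search_element_vec);
//     if (vmaxvq_u32(eq)) {
//       return index + extract_first_nonzero_index_uint32x4_t(eq);
//     }
//   }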

#define VECTORIZED_LOOP_x86(type_load, type_eq, set1, cmp, movemask, extract) \
  {                                                                           \
    constexpr int elems_in_vector = sizeof(type_load) / sizeof(T);            \
    type_load search_element_vec = set1(search_element);                      \
                                                                              \
    for (; index + elems_in_vector <= array_len; index += elems_in_vector) {  \
      type_load vector = *reinterpret_cast<type_load*>(&array[index]);        \
      type_eq eq = cmp(vector, search_element_vec);                           \
      int eq_mask = movemask(eq);                                             \
      if (eq_mask) {                                                          \
        return index + extract(eq_mask);                                      \
      }                                                                       \
    }                                                                         \
  }
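// In the x86 loops, |movemask| collapses the per-lane comparison result into a
// small bitmask (one bit per lane) and |extract| recovers the first matching
// lane via count-trailing-zeros. For example, with four 32-bit lanes and a
// match in lane 2 only, eq_mask == 0b0100 and CountTrailingZeros32(0b0100)
// returns 2.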

// Uses SIMD to vectorize the search loop. This function should only be called
// for large-ish arrays. Note that nothing will break if |array_len| is less
// than vectorization_threshold: things will just be slower than necessary.
template <typename T>
inline uintptr_t fast_search_noavx(T* array, uintptr_t array_len,
                                   uintptr_t index, T search_element) {
  static constexpr bool is_uint32 =
      sizeof(T) == sizeof(uint32_t) && std::is_integral<T>::value;
  static constexpr bool is_uint64 =
      sizeof(T) == sizeof(uint64_t) && std::is_integral<T>::value;
  static constexpr bool is_double =
      sizeof(T) == sizeof(double) && std::is_floating_point<T>::value;

  static_assert(is_uint32 || is_uint64 || is_double);

#if !(defined(__SSE3__) || defined(NEON64))
  // No SIMD available.
  return slow_search(array, array_len, index, search_element);
#endif

#ifdef __SSE3__
  const int target_align = 16;
#elif defined(NEON64)
  const int target_align = 16;
#else
  const int target_align = 4;
  UNREACHABLE();
#endif

  // Scalar loop to reach desired alignment
  for (;
       index < array_len &&
       (reinterpret_cast<std::uintptr_t>(&(array[index])) % target_align) != 0;
       index++) {
    if (array[index] == search_element) {
      return index;
    }
  }

  // Insert one of the vectorized loops, depending on the element type
#ifdef __SSE3__
  if constexpr (is_uint32) {
#define MOVEMASK(x) _mm_movemask_ps(_mm_castsi128_ps(x))
#define EXTRACT(x) base::bits::CountTrailingZeros32(x)
    VECTORIZED_LOOP_x86(__m128i, __m128i, _mm_set1_epi32, _mm_cmpeq_epi32,
                        MOVEMASK, EXTRACT)
#undef MOVEMASK
#undef EXTRACT
  } else if constexpr (is_uint64) {
#define SET1(x) _mm_castsi128_ps(_mm_set1_epi64x(x))
#define CMP(a, b) _mm_cmpeq_pd(_mm_castps_pd(a), _mm_castps_pd(b))
#define EXTRACT(x) base::bits::CountTrailingZeros32(x)
    VECTORIZED_LOOP_x86(__m128, __m128d, SET1, CMP, _mm_movemask_pd, EXTRACT)
#undef SET1
#undef CMP
#undef EXTRACT
  } else if constexpr (is_double) {
#define EXTRACT(x) base::bits::CountTrailingZeros32(x)
    VECTORIZED_LOOP_x86(__m128d, __m128d, _mm_set1_pd, _mm_cmpeq_pd,
                        _mm_movemask_pd, EXTRACT)
#undef EXTRACT
  }
#elif defined(NEON64)
  if constexpr (is_uint32) {
    VECTORIZED_LOOP_Neon(uint32x4_t, uint32x4_t, vdupq_n_u32, vceqq_u32,
                         vmaxvq_u32)
  } else if constexpr (is_uint64) {
    VECTORIZED_LOOP_Neon(uint64x2_t, uint64x2_t, vdupq_n_u64, vceqq_u64,
                         reinterpret_vmaxvq_u64)
  } else if constexpr (is_double) {
    VECTORIZED_LOOP_Neon(float64x2_t, uint64x2_t, vdupq_n_f64, vceqq_f64,
                         reinterpret_vmaxvq_u64)
  }
#else
  UNREACHABLE();
#endif

  // The vectorized loop stops when there are not enough items left in the array
  // to fill a vector register. The slow_search function will take care of
  // iterating through the few remaining items.
  return slow_search(array, array_len, index, search_element);
}

#if defined(_MSC_VER) && defined(__clang__)
// Generating AVX2 code with Clang on Windows without the /arch:AVX2 flag does
// not seem possible at the moment.
#define IS_CLANG_WIN 1
#endif

// Since we don't compile with -mavx or -mavx2 (or /arch:AVX2 on MSVC), Clang
// and MSVC do not define __AVX__ nor __AVX2__. Thus, if __SSE3__ is defined, we
// generate the AVX2 code, and, at runtime, we'll decide to call it or not,
// depending on whether the CPU supports AVX2.
#if defined(__SSE3__) && !defined(_M_IX86) && !defined(IS_CLANG_WIN)
#ifdef _MSC_VER
#define TARGET_AVX2
#else
#define TARGET_AVX2 __attribute__((target("avx2")))
#endif
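// A minimal sketch of the resulting runtime dispatch (see search() below): the
// AVX2 path is compiled with the "avx2" target attribute, but is only called
// when base::CPU reports AVX2 support:
//
//   if (get_vectorization_kind() == SimdKinds::kAVX2) {
//     return fast_search_avx(array, array_len, index, search_element);
//   } else {
//     return fast_search_noavx(array, array_len, index, search_element);
//   }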
template <typename T>
TARGET_AVX2 inline uintptr_t fast_search_avx(T* array, uintptr_t array_len,
                                             uintptr_t index,
                                             T search_element) {
  static constexpr bool is_uint32 =
      sizeof(T) == sizeof(uint32_t) && std::is_integral<T>::value;
  static constexpr bool is_uint64 =
      sizeof(T) == sizeof(uint64_t) && std::is_integral<T>::value;
  static constexpr bool is_double =
      sizeof(T) == sizeof(double) && std::is_floating_point<T>::value;

  static_assert(is_uint32 || is_uint64 || is_double);

  const int target_align = 32;
  // Scalar loop to reach desired alignment
  for (;
       index < array_len &&
       (reinterpret_cast<std::uintptr_t>(&(array[index])) % target_align) != 0;
       index++) {
    if (array[index] == search_element) {
      return index;
    }
  }

  // Insert the vectorized loop, depending on the element type
  if constexpr (is_uint32) {
#define MOVEMASK(x) _mm256_movemask_ps(_mm256_castsi256_ps(x))
#define EXTRACT(x) base::bits::CountTrailingZeros32(x)
    VECTORIZED_LOOP_x86(__m256i, __m256i, _mm256_set1_epi32, _mm256_cmpeq_epi32,
                        MOVEMASK, EXTRACT)
#undef MOVEMASK
#undef EXTRACT
  } else if constexpr (is_uint64) {
#define MOVEMASK(x) _mm256_movemask_pd(_mm256_castsi256_pd(x))
#define EXTRACT(x) base::bits::CountTrailingZeros32(x)
    VECTORIZED_LOOP_x86(__m256i, __m256i, _mm256_set1_epi64x,
                        _mm256_cmpeq_epi64, MOVEMASK, EXTRACT)
#undef MOVEMASK
#undef EXTRACT
  } else if constexpr (is_double) {
#define CMP(a, b) _mm256_cmp_pd(a, b, _CMP_EQ_OQ)
#define EXTRACT(x) base::bits::CountTrailingZeros32(x)
    VECTORIZED_LOOP_x86(__m256d, __m256d, _mm256_set1_pd, CMP,
                        _mm256_movemask_pd, EXTRACT)
#undef CMP
#undef EXTRACT
  }

  // The vectorized loop stops when there are not enough items left in the array
  // to fill a vector register. The slow_search function will take care of
  // iterating through the few remaining items.
  return slow_search(array, array_len, index, search_element);
}

#undef TARGET_AVX2
#elif defined(IS_CLANG_WIN)
template <typename T>
inline uintptr_t fast_search_avx(T* array, uintptr_t array_len, uintptr_t index,
                                 T search_element) {
  // Fall back to the SSE version.
  return fast_search_noavx(array, array_len, index, search_element);
}
#else
template <typename T>
uintptr_t fast_search_avx(T* array, uintptr_t array_len, uintptr_t index,
                          T search_element) {
  UNREACHABLE();
}
#endif  // ifdef __SSE3__

#undef IS_CLANG_WIN
#undef VECTORIZED_LOOP_Neon
#undef VECTORIZED_LOOP_x86

template <typename T>
inline uintptr_t search(T* array, uintptr_t array_len, uintptr_t index,
                        T search_element) {
  if (get_vectorization_kind() == SimdKinds::kAVX2) {
    return fast_search_avx(array, array_len, index, search_element);
  } else {
    return fast_search_noavx(array, array_len, index, search_element);
  }
}

enum class ArrayIndexOfIncludesKind { DOUBLE, OBJECTORSMI };

// ArrayIndexOfIncludes only handles cases that can be efficiently
// vectorized:
//
//   * Searching for a Smi in a Smi array
//
//   * Searching for a Smi or Double in a Double array
//
//   * Searching for an object in an object array.
//
// Other cases should be dealt with either with the CSA builtin or with the
// inlined optimized code.
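// For illustration, the vectorized paths roughly correspond to JavaScript
// calls such as (the array literals are just examples):
//
//   [1, 2, 3].indexOf(2)         // Smi search in a Smi array
//   [1.5, 2, 2.5].includes(2)    // Smi or Double search in a Double array
//   [a, b, c].indexOf(b)         // object search in an object array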
template <ArrayIndexOfIncludesKind kind>
Address ArrayIndexOfIncludes(Address array_start, uintptr_t array_len,
                             uintptr_t from_index, Address search_element) {
  if (array_len == 0) {
    return Smi::FromInt(-1).ptr();
  }

  if constexpr (kind == ArrayIndexOfIncludesKind::DOUBLE) {
    FixedDoubleArray fixed_array = FixedDoubleArray::cast(Object(array_start));
    double* array = static_cast<double*>(
        fixed_array.RawField(FixedDoubleArray::OffsetOfElementAt(0))
            .ToVoidPtr());

    double search_num;
    if (Object(search_element).IsSmi()) {
      search_num = Object(search_element).ToSmi().value();
    } else {
      DCHECK(Object(search_element).IsHeapNumber());
      search_num = HeapNumber::cast(Object(search_element)).value();
    }

    DCHECK(!std::isnan(search_num));

    if (reinterpret_cast<uintptr_t>(array) % sizeof(double) != 0) {
      // Slow scalar search for unaligned double array.
      for (; from_index < array_len; from_index++) {
        if (fixed_array.is_the_hole(static_cast<int>(from_index))) {
          // |search_num| cannot be NaN, so there is no need to check against
          // holes.
          continue;
        }
        if (fixed_array.get_scalar(static_cast<int>(from_index)) ==
            search_num) {
          return from_index;
        }
      }
      return Smi::FromInt(-1).ptr();
    }

    return search<double>(array, array_len, from_index, search_num);
  }

  if constexpr (kind == ArrayIndexOfIncludesKind::OBJECTORSMI) {
    FixedArray fixed_array = FixedArray::cast(Object(array_start));
    Tagged_t* array =
        static_cast<Tagged_t*>(fixed_array.data_start().ToVoidPtr());

    DCHECK(!Object(search_element).IsHeapNumber());
    DCHECK(!Object(search_element).IsBigInt());
    DCHECK(!Object(search_element).IsString());

    return search<Tagged_t>(array, array_len, from_index,
                            static_cast<Tagged_t>(search_element));
  }
}

}  // namespace

uintptr_t ArrayIndexOfIncludesSmiOrObject(Address array_start,
                                          uintptr_t array_len,
                                          uintptr_t from_index,
                                          Address search_element) {
  return ArrayIndexOfIncludes<ArrayIndexOfIncludesKind::OBJECTORSMI>(
      array_start, array_len, from_index, search_element);
}

uintptr_t ArrayIndexOfIncludesDouble(Address array_start, uintptr_t array_len,
                                     uintptr_t from_index,
                                     Address search_element) {
  return ArrayIndexOfIncludes<ArrayIndexOfIncludesKind::DOUBLE>(
      array_start, array_len, from_index, search_element);
}

#ifdef NEON64
#undef NEON64
#endif

}  // namespace internal
}  // namespace v8