summaryrefslogtreecommitdiff
path: root/configure.ac
diff options
context:
space:
mode:
authorLasse Collin <lasse.collin@tukaani.org>2022-11-14 21:34:57 +0200
committerLasse Collin <lasse.collin@tukaani.org>2022-11-14 23:05:46 +0200
commitf644473a211394447824ea00518d0a214ff3f7f2 (patch)
tree8fc19fa7f3811b090f3f73398ce8bd0708d10a53 /configure.ac
parent3b466bc79672bb2b06d1245a500588e6026e0ba0 (diff)
downloadxz-f644473a211394447824ea00518d0a214ff3f7f2.tar.gz
liblzma: Add fast CRC64 for 32/64-bit x86 using SSSE3 + SSE4.1 + CLMUL.
It also works on E2K as it supports these intrinsics. On x86-64 runtime detection is used so the code keeps working on older processors too. A CLMUL-only build can be done by using -msse4.1 -mpclmul in CFLAGS and this will reduce the library size since the generic implementation and its 8 KiB lookup table will be omitted. On 32-bit x86 this isn't used by default for now because by default on 32-bit x86 the separate assembly file crc64_x86.S is used. If --disable-assembler is used then this new CLMUL code is used the same way as on 64-bit x86. However, a CLMUL-only build (-msse4.1 -mpclmul) won't omit the 8 KiB lookup table on 32-bit x86 due to a currently-missing check for disabled assembler usage. The configure.ac check should be such that the code won't be built if something in the toolchain doesn't support it but --disable-clmul-crc option can be used to unconditionally disable this feature. CLMUL speeds up decompression of files that have compressed very well (assuming CRC64 is used as a check type). It is know that the CLMUL code is significantly slower than the generic code for tiny inputs (especially 1-8 bytes but up to 16 bytes). If that is a real-world problem then there is already a commented-out variant that uses the generic version for small inputs. Thanks to Ilya Kurdyukov for the original patch which was derived from a white paper from Intel [1] (published in 2009) and public domain code from [2] (released in 2016). [1] https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf [2] https://github.com/rawrunprotected/crc
Diffstat (limited to 'configure.ac')
-rw-r--r--configure.ac59
1 files changed, 57 insertions, 2 deletions
diff --git a/configure.ac b/configure.ac
index ddc82a3..60a6382 100644
--- a/configure.ac
+++ b/configure.ac
@@ -370,6 +370,16 @@ esac
AM_CONDITIONAL(COND_ASM_X86, test "x$enable_assembler" = xx86)
+#############
+# CLMUL CRC #
+#############
+
+AC_ARG_ENABLE([clmul-crc], AS_HELP_STRING([--disable-clmul-crc],
+ [Do not use carryless multiplication for CRC calculation
+ even if support for it is detected.]),
+ [], [enable_clmul_crc=yes])
+
+
#####################
# Size optimization #
#####################
@@ -733,8 +743,9 @@ AC_CHECK_HEADERS([fcntl.h limits.h sys/time.h],
[],
[AC_MSG_ERROR([Required header file(s) are missing.])])
-# This allows the use of the intrinsic functions if they are available.
-AC_CHECK_HEADERS([immintrin.h])
+# immintrin.h allows the use of the intrinsic functions if they are available.
+# cpuid.h may be used for detecting x86 processor features at runtime.
+AC_CHECK_HEADERS([immintrin.h cpuid.h])
###############################################################################
@@ -874,6 +885,50 @@ AC_CHECK_DECL([_mm_movemask_epi8],
#include <immintrin.h>
#endif])
+# For faster CRC on 32/64-bit x86 and E2K (see also crc64_fast.c):
+#
+# - Check for the CLMUL intrinsic _mm_clmulepi64_si128 in <immintrin.h>.
+#
+# - Check that __attribute__((__target__("ssse3,sse4.1,pclmul"))) works
+# together with _mm_clmulepi64_si128 from <immintrin.h>. The attribute
+# was added in GCC 4.4 but some GCC 4.x versions don't allow intrinsics
+# with it. Exception: it must be not be used with EDG-based compilers
+# like ICC and the compiler on E2K.
+#
+# If everything above is supported, runtime detection will be used to keep the
+# binaries working on systems that don't support the required extensions.
+AC_MSG_CHECKING([if _mm_clmulepi64_si128 is usable])
+if test "x$enable_clmul_crc" = xno ; then
+ AC_MSG_RESULT([no, --disable-clmul-crc was used])
+else
+ AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#include <immintrin.h>
+
+// CLMUL works on older E2K instruction set but it is slow due to emulation.
+#if defined(__e2k__) && __iset__ < 6
+# error
+#endif
+
+// Intel's old compiler (ICC) can define __GNUC__ but the attribute must not
+// be used with it. The new Clang-based ICX needs the attribute.
+// Checking for !defined(__EDG__) catches ICC and other EDG-based compilers.
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
+__attribute__((__target__("ssse3,sse4.1,pclmul")))
+#endif
+__m128i my_clmul(__m128i a, __m128i b)
+{
+ return _mm_clmulepi64_si128(a, b, 0);
+}
+ ]])], [
+ AC_DEFINE([HAVE_USABLE_CLMUL], [1],
+ [Define to 1 if _mm_clmulepi64_si128 is usable.
+ See configure.ac for details.])
+ AC_MSG_RESULT([yes])
+ ], [
+ AC_MSG_RESULT([no])
+ ])
+fi
+
# Check for sandbox support. If one is found, set enable_sandbox=found.
case $enable_sandbox in
auto | capsicum)