diff options
-rw-r--r-- | configure.ac | 51 | ||||
-rw-r--r-- | po/POTFILES.in | 1 | ||||
-rw-r--r-- | src/cksum.c | 108 | ||||
-rw-r--r-- | src/cksum.h | 13 | ||||
-rw-r--r-- | src/cksum_pclmul.c | 189 | ||||
-rw-r--r-- | src/local.mk | 7 | ||||
-rwxr-xr-x | tests/misc/cksum.sh | 39 |
7 files changed, 381 insertions, 27 deletions
diff --git a/configure.ac b/configure.ac index 6351dd708..7fbecbf8d 100644 --- a/configure.ac +++ b/configure.ac @@ -524,6 +524,57 @@ CFLAGS=$ac_save_CFLAGS LDFLAGS=$ac_save_LDFLAGS ac_c_werror_flag=$cu_save_c_werror_flag +AC_MSG_CHECKING([if __get_cpuid available]) +AC_COMPILE_IFELSE( + [AC_LANG_SOURCE([[ + #include <cpuid.h> + + int main(void) + { + unsigned int eax, ebx, ecx, edx; + __get_cpuid(1, &eax, &ebx, &ecx, &edx); + return 1; + } + ]]) + ],[ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_CPUID], [1], [__get_cpuid available]) + cpuid_exists=yes + ],[ + AC_MSG_RESULT([no]) + ]) + +ac_save_CFLAGS=$CFLAGS +CFLAGS="-mavx -mpclmul $CFLAGS" +AC_MSG_CHECKING([if pclmul intrinsic exists]) +AC_COMPILE_IFELSE( + [AC_LANG_SOURCE([[ + #include <x86intrin.h> + + int main(void) + { + __m128i a, b; + a = _mm_clmulepi64_si128(a, b, 0x00); + return 1; + } + ]]) + ],[ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_PCLMUL_INTRINSIC], [1], [pclmul intrinsic exists]) + pclmul_intrinsic_exists=yes + ],[ + AC_MSG_RESULT([no]) + ]) +if test "x$cpuid_exists" = "xyes" && + test "x$pclmul_intrinsic_exists" = "xyes"; then + AC_DEFINE([USE_PCLMUL_CRC32], [1], + [CRC32 calculation by pclmul hardware instruction enabled]) +fi +AM_CONDITIONAL([USE_PCLMUL_CRC32], + [test "x$cpuid_exists" = "xyes" && + test "x$pclmul_intrinsic_exists" = "xyes"]) +CFLAGS=$ac_save_CFLAGS + ############################################################################ dnl Autogenerated by the 'gen-lists-of-programs.sh' auxiliary script. diff --git a/po/POTFILES.in b/po/POTFILES.in index 5f9c8fc50..b5f5bbff1 100644 --- a/po/POTFILES.in +++ b/po/POTFILES.in @@ -44,6 +44,7 @@ src/chown-core.c src/chown.c src/chroot.c src/cksum.c +src/cksum_pclmul.c src/comm.c src/copy.c src/coreutils.c diff --git a/src/cksum.c b/src/cksum.c index 00e260149..a38f03ca8 100644 --- a/src/cksum.c +++ b/src/cksum.c @@ -122,7 +122,7 @@ main (void) } } - printf ("static uint_fast32_t const crctab[8][256] = {\n"); + printf ("uint_fast32_t const crctab[8][256] = {\n"); for (int y = 0; y < 8; y++) { printf ("{\n 0x%08x", crctab[y][0]); @@ -146,6 +146,9 @@ main (void) # include "error.h" # include "cksum.h" +# if USE_PCLMUL_CRC32 +# include "cpuid.h" +# endif /* USE_PCLMUL_CRC32 */ /* Number of bytes to read at once. */ # define BUFLEN (1 << 16) @@ -153,39 +156,46 @@ main (void) /* Nonzero if any of the files read were the standard input. */ static bool have_read_stdin; -/* Calculate and print the checksum and length in bytes - of file FILE, or of the standard input if FILE is "-". - If PRINT_NAME is true, print FILE next to the checksum and size. - Return true if successful. */ +static bool +cksum_slice8 (FILE *fp, const char *file, uint_fast32_t *crc_out, + uintmax_t *length_out); +static bool + (*cksum_fp)(FILE *, const char *, uint_fast32_t *, + uintmax_t *) = cksum_slice8; +# if USE_PCLMUL_CRC32 static bool -cksum (const char *file, bool print_name) +pclmul_supported (void) +{ + unsigned int eax = 0; + unsigned int ebx = 0; + unsigned int ecx = 0; + unsigned int edx = 0; + + if (! __get_cpuid (1, &eax, &ebx, &ecx, &edx)) + return false; + + if (! (ecx & bit_PCLMUL)) + return false; + + if (! (ecx & bit_AVX)) + return false; + + return true; +} +# endif /* USE_PCLMUL_CRC32 */ + +static bool +cksum_slice8 (FILE *fp, const char *file, uint_fast32_t *crc_out, + uintmax_t *length_out) { uint32_t buf[BUFLEN/sizeof (uint32_t)]; uint_fast32_t crc = 0; uintmax_t length = 0; size_t bytes_read; - FILE *fp; - char length_buf[INT_BUFSIZE_BOUND (uintmax_t)]; - char const *hp; - if (STREQ (file, "-")) - { - fp = stdin; - have_read_stdin = true; - xset_binary_mode (STDIN_FILENO, O_BINARY); - } - else - { - fp = fopen (file, (O_BINARY ? "rb" : "r")); - if (fp == NULL) - { - error (0, errno, "%s", quotef (file)); - return false; - } - } - - fadvise (fp, FADVISE_SEQUENTIAL); + if (!fp || !file || !crc_out || !length_out) + return false; while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0) { @@ -221,6 +231,47 @@ cksum (const char *file, bool print_name) break; } + *crc_out = crc; + *length_out = length; + + return true; +} + +/* Calculate and print the checksum and length in bytes + of file FILE, or of the standard input if FILE is "-". + If PRINT_NAME is true, print FILE next to the checksum and size. + Return true if successful. */ + +static bool +cksum (const char *file, bool print_name) +{ + uint_fast32_t crc = 0; + uintmax_t length = 0; + FILE *fp; + char length_buf[INT_BUFSIZE_BOUND (uintmax_t)]; + char const *hp; + + if (STREQ (file, "-")) + { + fp = stdin; + have_read_stdin = true; + xset_binary_mode (STDIN_FILENO, O_BINARY); + } + else + { + fp = fopen (file, (O_BINARY ? "rb" : "r")); + if (fp == NULL) + { + error (0, errno, "%s", quotef (file)); + return false; + } + } + + fadvise (fp, FADVISE_SEQUENTIAL); + + if (! cksum_fp (fp, file, &crc, &length)) + return false; + if (ferror (fp)) { error (0, errno, "%s", quotef (file)); @@ -299,6 +350,11 @@ main (int argc, char **argv) have_read_stdin = false; +# if USE_PCLMUL_CRC32 + if (pclmul_supported ()) + cksum_fp = cksum_pclmul; +# endif /* USE_PCLMUL_CRC32 */ + if (optind == argc) ok = cksum ("-", false); else diff --git a/src/cksum.h b/src/cksum.h index 590a8aabe..b3174722f 100644 --- a/src/cksum.h +++ b/src/cksum.h @@ -1,4 +1,13 @@ -static uint_fast32_t const crctab[8][256] = { +#ifndef __CKSUM_H__ +# define __CKSUM_H__ + +extern bool +cksum_pclmul (FILE *fp, const char *file, uint_fast32_t *crc_out, + uintmax_t *length_out); + +extern uint_fast32_t const crctab[8][256]; + +uint_fast32_t const crctab[8][256] = { { 0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b, @@ -432,3 +441,5 @@ static uint_fast32_t const crctab[8][256] = { 0x0d26bcfb, 0x8b82b73a, 0xd0236bf0, 0x3cc10eae, 0x6760d264 }, }; + +#endif diff --git a/src/cksum_pclmul.c b/src/cksum_pclmul.c new file mode 100644 index 000000000..9a1b760fe --- /dev/null +++ b/src/cksum_pclmul.c @@ -0,0 +1,189 @@ +/* cksum -- calculate and print POSIX checksums and sizes of files + Copyright (C) 1992-2021 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +#include <config.h> + +#include <stdio.h> +#include <sys/types.h> +#include <stdint.h> +#include <x86intrin.h> +#include "system.h" +#include "die.h" + +/* Number of bytes to read at once. */ +#define BUFLEN (1 << 16) + +extern uint_fast32_t const crctab[8][256]; + +extern bool +cksum_pclmul (FILE *fp, const char *file, uint_fast32_t *crc_out, + uintmax_t *length_out); + +/* Calculate CRC32 using PCLMULQDQ CPU instruction found in x86/x64 CPUs */ + +bool +cksum_pclmul (FILE *fp, const char *file, uint_fast32_t *crc_out, + uintmax_t *length_out) +{ + __m128i buf[BUFLEN / sizeof (__m128i)]; + uint_fast32_t crc = 0; + uintmax_t length = 0; + size_t bytes_read; + __m128i single_mult_constant; + __m128i four_mult_constant; + __m128i shuffle_constant; + + if (!fp || !file || !crc_out || !length_out) + return false; + + /* These constants and general algorithms are taken from the Intel whitepaper + "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" + */ + single_mult_constant = _mm_set_epi64x (0xC5B9CD4C, 0xE8A45605); + four_mult_constant = _mm_set_epi64x (0x8833794C, 0xE6228B11); + + /* Constant to byteswap a full SSE register */ + shuffle_constant = _mm_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15); + + while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0) + { + __m128i *datap; + __m128i data; + __m128i data2; + __m128i data3; + __m128i data4; + __m128i data5; + __m128i data6; + __m128i data7; + __m128i data8; + __m128i fold_data; + __m128i xor_crc; + + if (length + bytes_read < length) + die (EXIT_FAILURE, 0, _("%s: file too long"), quotef (file)); + length += bytes_read; + + datap = (__m128i *)buf; + + /* Fold in parallel eight 16-byte blocks into four 16-byte blocks */ + if (bytes_read >= 16*8) + { + data = _mm_loadu_si128 (datap); + data = _mm_shuffle_epi8 (data, shuffle_constant); + /* XOR in initial CRC value (for us 0 so no effect), or CRC value + calculated for previous BUFLEN buffer from fread */ + xor_crc = _mm_set_epi32 (crc, 0, 0, 0); + crc = 0; + data = _mm_xor_si128 (data, xor_crc); + data3 = _mm_loadu_si128 (datap+1); + data3 = _mm_shuffle_epi8 (data3, shuffle_constant); + data5 = _mm_loadu_si128 (datap+2); + data5 = _mm_shuffle_epi8 (data5, shuffle_constant); + data7 = _mm_loadu_si128 (datap+3); + data7 = _mm_shuffle_epi8 (data7, shuffle_constant); + + + while (bytes_read >= 16*8) + { + datap += 4; + + /* Do multiplication here for four consecutive 16 byte blocks */ + data2 = _mm_clmulepi64_si128 (data, four_mult_constant, 0x00); + data = _mm_clmulepi64_si128 (data, four_mult_constant, 0x11); + data4 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x00); + data3 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x11); + data6 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x00); + data5 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x11); + data8 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x00); + data7 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x11); + + /* Now multiplication results for the four blocks is xor:ed with + next four 16 byte blocks from the buffer. This effectively + "consumes" the first four blocks from the buffer. + Keep xor result in variables for multiplication in next + round of loop. */ + data = _mm_xor_si128 (data, data2); + data2 = _mm_loadu_si128 (datap); + data2 = _mm_shuffle_epi8 (data2, shuffle_constant); + data = _mm_xor_si128 (data, data2); + + data3 = _mm_xor_si128 (data3, data4); + data4 = _mm_loadu_si128 (datap+1); + data4 = _mm_shuffle_epi8 (data4, shuffle_constant); + data3 = _mm_xor_si128 (data3, data4); + + data5 = _mm_xor_si128 (data5, data6); + data6 = _mm_loadu_si128 (datap+2); + data6 = _mm_shuffle_epi8 (data6, shuffle_constant); + data5 = _mm_xor_si128 (data5, data6); + + data7 = _mm_xor_si128 (data7, data8); + data8 = _mm_loadu_si128 (datap+3); + data8 = _mm_shuffle_epi8 (data8, shuffle_constant); + data7 = _mm_xor_si128 (data7, data8); + + bytes_read -= (16 * 4); + } + /* At end of loop we write out results from variables back into + the buffer, for use in single fold loop */ + data = _mm_shuffle_epi8 (data, shuffle_constant); + _mm_storeu_si128 (datap, data); + data3 = _mm_shuffle_epi8 (data3, shuffle_constant); + _mm_storeu_si128 (datap+1, data3); + data5 = _mm_shuffle_epi8 (data5, shuffle_constant); + _mm_storeu_si128 (datap+2, data5); + data7 = _mm_shuffle_epi8 (data7, shuffle_constant); + _mm_storeu_si128 (datap+3, data7); + } + + /* Fold two 16-byte blocks into one 16-byte block */ + if (bytes_read >= 32) + { + data = _mm_loadu_si128 (datap); + data = _mm_shuffle_epi8 (data, shuffle_constant); + xor_crc = _mm_set_epi32 (crc, 0, 0, 0); + crc = 0; + data = _mm_xor_si128 (data, xor_crc); + while (bytes_read >= 32) + { + datap++; + + data2 = _mm_clmulepi64_si128 (data, single_mult_constant, 0x00); + data = _mm_clmulepi64_si128 (data, single_mult_constant, 0x11); + fold_data = _mm_loadu_si128 (datap); + fold_data = _mm_shuffle_epi8 (fold_data, shuffle_constant); + data = _mm_xor_si128 (data, data2); + data = _mm_xor_si128 (data, fold_data); + bytes_read -= 16; + } + data = _mm_shuffle_epi8 (data, shuffle_constant); + _mm_storeu_si128 (datap, data); + } + + /* And finish up last 0-31 bytes in a byte by byte fashion */ + unsigned char *cp = (unsigned char *)datap; + while (bytes_read--) + crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF]; + if (feof (fp)) + break; + } + + *crc_out = crc; + *length_out = length; + + return true; +} diff --git a/src/local.mk b/src/local.mk index b9e81d9a3..8c8479a53 100644 --- a/src/local.mk +++ b/src/local.mk @@ -359,6 +359,13 @@ nodist_src_coreutils_SOURCES = src/coreutils.h src_coreutils_SOURCES = src/coreutils.c src_cksum_SOURCES = src/cksum.c src/cksum.h +if USE_PCLMUL_CRC32 +noinst_LIBRARIES += src/libcksum_pclmul.a +src_libcksum_pclmul_a_SOURCES = src/cksum_pclmul.c src/cksum.h +cksum_pclmul_ldadd = src/libcksum_pclmul.a +src_cksum_LDADD += $(cksum_pclmul_ldadd) +src_libcksum_pclmul_a_CFLAGS = -mavx -mpclmul $(AM_CFLAGS) +endif src_cp_SOURCES = src/cp.c $(copy_sources) $(selinux_sources) src_dir_SOURCES = src/ls.c src/ls-dir.c src_env_SOURCES = src/env.c src/operand2sig.c diff --git a/tests/misc/cksum.sh b/tests/misc/cksum.sh index 3b141c111..950b5d561 100755 --- a/tests/misc/cksum.sh +++ b/tests/misc/cksum.sh @@ -32,4 +32,43 @@ cksum in > out || fail=1 printf '%s\n' '4097727897 2077 in' > exp || framework_failure_ compare exp out || fail=1 +# Make sure crc is correct for files larger than 128 bytes (4 fold pclmul) +{ + env printf $(env printf '\\%03o' $(seq 0 130)); +} > in || framework_failure_ + +cksum in > out || fail=1 +printf '%s\n' '3800919234 131 in' > exp || framework_failure_ +compare exp out || fail=1 + +# Make sure crc is correct for files larger than 32 bytes +# but <128 bytes (1 fold pclmul) +{ + env printf $(env printf '\\%03o' $(seq 0 64)); +} > in || framework_failure_ + +cksum in > out || fail=1 +printf '%s\n' '796287823 65 in' > exp || framework_failure_ +compare exp out || fail=1 + +# Make sure crc is still handled correctly when next 65k buffer is read +# (>32 bytes more than 65k) +{ + seq 1 12780 +} > in || framework_failure_ + +cksum in > out || fail=1 +printf '%s\n' '3720986905 65574 in' > exp || framework_failure_ +compare exp out || fail=1 + +# Make sure crc is still handled correctly when next 65k buffer is read +# (>=128 bytes more than 65k) +{ + seq 1 12795 +} > in || framework_failure_ + +cksum in > out || fail=1 +printf '%s\n' '4278270357 65664 in' > exp || framework_failure_ +compare exp out || fail=1 + Exit $fail |