diff options
author | Anton Blanchard <anton@ozlabs.org> | 2018-07-07 14:21:46 +1000 |
---|---|---|
committer | Erik de Castro Lopo <erikd@mega-nerd.com> | 2018-08-20 18:16:34 +1000 |
commit | cdb030cd3749b4547399a049ce3cae5e974ccd48 (patch) | |
tree | 1ae2e9befc7da38c7c44f27c5ac33ff9afdddb72 /src | |
parent | 8e1796b91a1893609b8e2f5813d08f7df3f011cf (diff) | |
download | flac-cdb030cd3749b4547399a049ce3cae5e974ccd48.tar.gz |
Add VSX optimised versions of autocorrelation loops
Add a POWER8 and POWER9 version of the autocorrelation functions.
flac --best is about 3.3x faster on POWER9 with this patch.
Signed-off-by: Anton Blanchard <anton@ozlabs.org>
Diffstat (limited to 'src')
-rw-r--r-- | src/libFLAC/Makefile.am | 1 | ||||
-rw-r--r-- | src/libFLAC/include/private/lpc.h | 14 | ||||
-rw-r--r-- | src/libFLAC/lpc_intrin_vsx.c | 942 | ||||
-rw-r--r-- | src/libFLAC/stream_encoder.c | 30 |
4 files changed, 987 insertions, 0 deletions
diff --git a/src/libFLAC/Makefile.am b/src/libFLAC/Makefile.am index 863f7f95..f0f32f04 100644 --- a/src/libFLAC/Makefile.am +++ b/src/libFLAC/Makefile.am @@ -114,6 +114,7 @@ libFLAC_sources = \ lpc_intrin_sse2.c \ lpc_intrin_sse41.c \ lpc_intrin_avx2.c \ + lpc_intrin_vsx.c \ md5.c \ memory.c \ metadata_iterators.c \ diff --git a/src/libFLAC/include/private/lpc.h b/src/libFLAC/include/private/lpc.h index 63d64324..64dfd1f8 100644 --- a/src/libFLAC/include/private/lpc.h +++ b/src/libFLAC/include/private/lpc.h @@ -91,6 +91,20 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new(const FLAC__real da void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]); # endif # endif +#if defined(FLAC__CPU_PPC64) && defined(FLAC__USE_VSX) +#ifdef FLAC__HAS_TARGET_POWER9 +void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_4(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]); +#endif +#ifdef FLAC__HAS_TARGET_POWER8 +void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_4(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]); +#endif +#endif #endif /* diff --git a/src/libFLAC/lpc_intrin_vsx.c b/src/libFLAC/lpc_intrin_vsx.c new file mode 100644 index 00000000..48c82182 --- /dev/null +++ b/src/libFLAC/lpc_intrin_vsx.c @@ -0,0 +1,942 @@ +/* libFLAC - Free Lossless Audio Codec library + * Copyright (C) 2000-2009 Josh Coalson + * Copyright (C) 2011-2016 Xiph.Org Foundation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * - Neither the name of the Xiph.org Foundation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +#ifndef FLAC__INTEGER_ONLY_LIBRARY +#ifndef FLAC__NO_ASM +#if defined(FLAC__CPU_PPC64) && defined(FLAC__USE_VSX) + +#include "private/cpu.h" +#include "private/lpc.h" +#include "FLAC/assert.h" +#include "FLAC/format.h" + +#include <altivec.h> + +#ifdef FLAC__HAS_TARGET_POWER8 +__attribute__((target("cpu=power8"))) +void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]) +{ + long i; + long limit = (long)data_len - 16; + const FLAC__real *base; + vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum1 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum2 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum3 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum11 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum12 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum13 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum21 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum22 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum23 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum31 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum32 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum33 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float d0, d1, d2, d3, d4; +#if WORDS_BIGENDIAN + vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }; + vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }; + vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; + vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 }; + vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 }; + vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B }; +#else + vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }; + vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }; + vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }; + vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 }; + vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 }; + vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 }; +#endif + + (void) lag; + FLAC__ASSERT(lag <= 16); + FLAC__ASSERT(lag <= data_len); + + base = data; + + d0 = vec_vsx_ld(0, base); + d1 = vec_vsx_ld(16, base); + d2 = vec_vsx_ld(32, base); + d3 = vec_vsx_ld(48, base); + + base += 16; + + for (i = 0; i <= (limit-4); i += 4) { + vector float d, d0_orig = d0; + + d4 = vec_vsx_ld(0, base); + base += 4; + + d = vec_splat(d0_orig, 0); + sum0 += d0 * d; + sum1 += d1 * d; + sum2 += d2 * d; + sum3 += d3 * d; + + d = vec_splat(d0_orig, 1); + d0 = vec_sel(d0_orig, d4, vsel1); + sum10 += d0 * d; + sum11 += d1 * d; + sum12 += d2 * d; + sum13 += d3 * d; + + d = vec_splat(d0_orig, 2); + d0 = vec_sel(d0_orig, d4, vsel2); + sum20 += d0 * d; + sum21 += d1 * d; + sum22 += d2 * d; + sum23 += d3 * d; + + d = vec_splat(d0_orig, 3); + d0 = vec_sel(d0_orig, d4, vsel3); + sum30 += d0 * d; + sum31 += d1 * d; + sum32 += d2 * d; + sum33 += d3 * d; + + d0 = d1; + d1 = d2; + d2 = d3; + d3 = d4; + } + + sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm1); + sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm1); + sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm1); + sum3 += vec_perm(sum13, sum10, (vector unsigned char)vperm1); + + sum0 += vec_perm(sum20, sum21, (vector unsigned char)vperm2); + sum1 += vec_perm(sum21, sum22, (vector unsigned char)vperm2); + sum2 += vec_perm(sum22, sum23, (vector unsigned char)vperm2); + sum3 += vec_perm(sum23, sum20, (vector unsigned char)vperm2); + + sum0 += vec_perm(sum30, sum31, (vector unsigned char)vperm3); + sum1 += vec_perm(sum31, sum32, (vector unsigned char)vperm3); + sum2 += vec_perm(sum32, sum33, (vector unsigned char)vperm3); + sum3 += vec_perm(sum33, sum30, (vector unsigned char)vperm3); + + for (; i <= limit; i++) { + vector float d; + + d0 = vec_vsx_ld(0, data+i); + d1 = vec_vsx_ld(16, data+i); + d2 = vec_vsx_ld(32, data+i); + d3 = vec_vsx_ld(48, data+i); + + d = vec_splat(d0, 0); + sum0 += d0 * d; + sum1 += d1 * d; + sum2 += d2 * d; + sum3 += d3 * d; + } + + vec_vsx_st(sum0, 0, autoc); + vec_vsx_st(sum1, 16, autoc); + vec_vsx_st(sum2, 32, autoc); + vec_vsx_st(sum3, 48, autoc); + + for (; i < (long)data_len; i++) { + uint32_t coeff; + + FLAC__real d = data[i]; + for (coeff = 0; coeff < data_len - i; coeff++) + autoc[coeff] += d * data[i+coeff]; + } +} + +__attribute__((target("cpu=power8"))) +void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]) +{ + long i; + long limit = (long)data_len - 12; + const FLAC__real *base; + vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum1 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum2 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum11 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum12 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum21 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum22 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum31 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum32 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float d0, d1, d2, d3; +#if WORDS_BIGENDIAN + vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }; + vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }; + vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; + vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 }; + vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 }; + vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B }; +#else + vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }; + vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }; + vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }; + vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 }; + vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 }; + vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 }; +#endif + + (void) lag; + FLAC__ASSERT(lag <= 12); + FLAC__ASSERT(lag <= data_len); + + base = data; + + d0 = vec_vsx_ld(0, base); + d1 = vec_vsx_ld(16, base); + d2 = vec_vsx_ld(32, base); + + base += 12; + + for (i = 0; i <= (limit-3); i += 4) { + vector float d, d0_orig = d0; + + d3 = vec_vsx_ld(0, base); + base += 4; + + d = vec_splat(d0_orig, 0); + sum0 += d0 * d; + sum1 += d1 * d; + sum2 += d2 * d; + + d = vec_splat(d0_orig, 1); + d0 = vec_sel(d0_orig, d3, vsel1); + sum10 += d0 * d; + sum11 += d1 * d; + sum12 += d2 * d; + + d = vec_splat(d0_orig, 2); + d0 = vec_sel(d0_orig, d3, vsel2); + sum20 += d0 * d; + sum21 += d1 * d; + sum22 += d2 * d; + + d = vec_splat(d0_orig, 3); + d0 = vec_sel(d0_orig, d3, vsel3); + sum30 += d0 * d; + sum31 += d1 * d; + sum32 += d2 * d; + + d0 = d1; + d1 = d2; + d2 = d3; + } + + sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm1); + sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm1); + sum2 += vec_perm(sum12, sum10, (vector unsigned char)vperm1); + + sum0 += vec_perm(sum20, sum21, (vector unsigned char)vperm2); + sum1 += vec_perm(sum21, sum22, (vector unsigned char)vperm2); + sum2 += vec_perm(sum22, sum20, (vector unsigned char)vperm2); + + sum0 += vec_perm(sum30, sum31, (vector unsigned char)vperm3); + sum1 += vec_perm(sum31, sum32, (vector unsigned char)vperm3); + sum2 += vec_perm(sum32, sum30, (vector unsigned char)vperm3); + + for (; i <= limit; i++) { + vector float d; + + d0 = vec_vsx_ld(0, data+i); + d1 = vec_vsx_ld(16, data+i); + d2 = vec_vsx_ld(32, data+i); + + d = vec_splat(d0, 0); + sum0 += d0 * d; + sum1 += d1 * d; + sum2 += d2 * d; + } + + vec_vsx_st(sum0, 0, autoc); + vec_vsx_st(sum1, 16, autoc); + vec_vsx_st(sum2, 32, autoc); + + for (; i < (long)data_len; i++) { + uint32_t coeff; + + FLAC__real d = data[i]; + for (coeff = 0; coeff < data_len - i; coeff++) + autoc[coeff] += d * data[i+coeff]; + } +} + +__attribute__((target("cpu=power8"))) +void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]) +{ + long i; + long limit = (long)data_len - 8; + const FLAC__real *base; + vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum1 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum11 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum21 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum31 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float d0, d1, d2; +#if WORDS_BIGENDIAN + vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }; + vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }; + vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; + vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 }; + vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 }; + vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B }; +#else + vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }; + vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }; + vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }; + vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 }; + vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 }; + vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 }; +#endif + + (void) lag; + FLAC__ASSERT(lag <= 8); + FLAC__ASSERT(lag <= data_len); + + base = data; + + d0 = vec_vsx_ld(0, base); + d1 = vec_vsx_ld(16, base); + + base += 8; + + for (i = 0; i <= (limit-2); i += 4) { + vector float d, d0_orig = d0; + + d2 = vec_vsx_ld(0, base); + base += 4; + + d = vec_splat(d0_orig, 0); + sum0 += d0 * d; + sum1 += d1 * d; + + d = vec_splat(d0_orig, 1); + d0 = vec_sel(d0_orig, d2, vsel1); + sum10 += d0 * d; + sum11 += d1 * d; + + d = vec_splat(d0_orig, 2); + d0 = vec_sel(d0_orig, d2, vsel2); + sum20 += d0 * d; + sum21 += d1 * d; + + d = vec_splat(d0_orig, 3); + d0 = vec_sel(d0_orig, d2, vsel3); + sum30 += d0 * d; + sum31 += d1 * d; + + d0 = d1; + d1 = d2; + } + + sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm1); + sum1 += vec_perm(sum11, sum10, (vector unsigned char)vperm1); + + sum0 += vec_perm(sum20, sum21, (vector unsigned char)vperm2); + sum1 += vec_perm(sum21, sum20, (vector unsigned char)vperm2); + + sum0 += vec_perm(sum30, sum31, (vector unsigned char)vperm3); + sum1 += vec_perm(sum31, sum30, (vector unsigned char)vperm3); + + for (; i <= limit; i++) { + vector float d; + + d0 = vec_vsx_ld(0, data+i); + d1 = vec_vsx_ld(16, data+i); + + d = vec_splat(d0, 0); + sum0 += d0 * d; + sum1 += d1 * d; + } + + vec_vsx_st(sum0, 0, autoc); + vec_vsx_st(sum1, 16, autoc); + + for (; i < (long)data_len; i++) { + uint32_t coeff; + + FLAC__real d = data[i]; + for (coeff = 0; coeff < data_len - i; coeff++) + autoc[coeff] += d * data[i+coeff]; + } +} + +__attribute__((target("cpu=power8"))) +void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_4(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]) +{ + long i; + long limit = (long)data_len - 4; + const FLAC__real *base; + vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float d0, d1; +#if WORDS_BIGENDIAN + vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }; + vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }; + vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; + vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 }; + vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 }; + vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B }; +#else + vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }; + vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }; + vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }; + vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 }; + vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 }; + vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 }; +#endif + + (void) lag; + FLAC__ASSERT(lag <= 4); + FLAC__ASSERT(lag <= data_len); + + base = data; + + d0 = vec_vsx_ld(0, base); + + base += 4; + + for (i = 0; i <= (limit-1); i += 4) { + vector float d, d0_orig = d0; + + d1 = vec_vsx_ld(0, base); + base += 4; + + d = vec_splat(d0_orig, 0); + sum0 += d0 * d; + + d = vec_splat(d0_orig, 1); + d0 = vec_sel(d0_orig, d1, vsel1); + sum10 += d0 * d; + + d = vec_splat(d0_orig, 2); + d0 = vec_sel(d0_orig, d1, vsel2); + sum20 += d0 * d; + + d = vec_splat(d0_orig, 3); + d0 = vec_sel(d0_orig, d1, vsel3); + sum30 += d0 * d; + + d0 = d1; + } + + sum0 += vec_perm(sum10, sum10, (vector unsigned char)vperm1); + + sum0 += vec_perm(sum20, sum20, (vector unsigned char)vperm2); + + sum0 += vec_perm(sum30, sum30, (vector unsigned char)vperm3); + + for (; i <= limit; i++) { + vector float d; + + d0 = vec_vsx_ld(0, data+i); + + d = vec_splat(d0, 0); + sum0 += d0 * d; + } + + vec_vsx_st(sum0, 0, autoc); + + for (; i < (long)data_len; i++) { + uint32_t coeff; + + FLAC__real d = data[i]; + for (coeff = 0; coeff < data_len - i; coeff++) + autoc[coeff] += d * data[i+coeff]; + } +} +#endif /* FLAC__HAS_TARGET_POWER8 */ + +#ifdef FLAC__HAS_TARGET_POWER9 +__attribute__((target("cpu=power9"))) +void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]) +{ + long i; + long limit = (long)data_len - 16; + const FLAC__real *base; + vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum1 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum2 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum3 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum11 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum12 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum13 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum21 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum22 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum23 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum31 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum32 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum33 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float d0, d1, d2, d3, d4; +#if WORDS_BIGENDIAN + vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }; + vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }; + vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; + vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 }; + vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 }; + vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B }; +#else + vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }; + vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }; + vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }; + vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 }; + vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 }; + vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 }; +#endif + + (void) lag; + FLAC__ASSERT(lag <= 16); + FLAC__ASSERT(lag <= data_len); + + base = data; + + d0 = vec_vsx_ld(0, base); + d1 = vec_vsx_ld(16, base); + d2 = vec_vsx_ld(32, base); + d3 = vec_vsx_ld(48, base); + + base += 16; + + for (i = 0; i <= (limit-4); i += 4) { + vector float d, d0_orig = d0; + + d4 = vec_vsx_ld(0, base); + base += 4; + + d = vec_splat(d0_orig, 0); + sum0 += d0 * d; + sum1 += d1 * d; + sum2 += d2 * d; + sum3 += d3 * d; + + d = vec_splat(d0_orig, 1); + d0 = vec_sel(d0_orig, d4, vsel1); + sum10 += d0 * d; + sum11 += d1 * d; + sum12 += d2 * d; + sum13 += d3 * d; + + d = vec_splat(d0_orig, 2); + d0 = vec_sel(d0_orig, d4, vsel2); + sum20 += d0 * d; + sum21 += d1 * d; + sum22 += d2 * d; + sum23 += d3 * d; + + d = vec_splat(d0_orig, 3); + d0 = vec_sel(d0_orig, d4, vsel3); + sum30 += d0 * d; + sum31 += d1 * d; + sum32 += d2 * d; + sum33 += d3 * d; + + d0 = d1; + d1 = d2; + d2 = d3; + d3 = d4; + } + + sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm1); + sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm1); + sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm1); + sum3 += vec_perm(sum13, sum10, (vector unsigned char)vperm1); + + sum0 += vec_perm(sum20, sum21, (vector unsigned char)vperm2); + sum1 += vec_perm(sum21, sum22, (vector unsigned char)vperm2); + sum2 += vec_perm(sum22, sum23, (vector unsigned char)vperm2); + sum3 += vec_perm(sum23, sum20, (vector unsigned char)vperm2); + + sum0 += vec_perm(sum30, sum31, (vector unsigned char)vperm3); + sum1 += vec_perm(sum31, sum32, (vector unsigned char)vperm3); + sum2 += vec_perm(sum32, sum33, (vector unsigned char)vperm3); + sum3 += vec_perm(sum33, sum30, (vector unsigned char)vperm3); + + for (; i <= limit; i++) { + vector float d; + + d0 = vec_vsx_ld(0, data+i); + d1 = vec_vsx_ld(16, data+i); + d2 = vec_vsx_ld(32, data+i); + d3 = vec_vsx_ld(48, data+i); + + d = vec_splat(d0, 0); + sum0 += d0 * d; + sum1 += d1 * d; + sum2 += d2 * d; + sum3 += d3 * d; + } + + vec_vsx_st(sum0, 0, autoc); + vec_vsx_st(sum1, 16, autoc); + vec_vsx_st(sum2, 32, autoc); + vec_vsx_st(sum3, 48, autoc); + + for (; i < (long)data_len; i++) { + uint32_t coeff; + + FLAC__real d = data[i]; + for (coeff = 0; coeff < data_len - i; coeff++) + autoc[coeff] += d * data[i+coeff]; + } +} + +__attribute__((target("cpu=power9"))) +void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]) +{ + long i; + long limit = (long)data_len - 12; + const FLAC__real *base; + vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum1 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum2 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum11 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum12 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum21 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum22 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum31 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum32 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float d0, d1, d2, d3; +#if WORDS_BIGENDIAN + vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }; + vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }; + vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; + vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 }; + vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 }; + vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B }; +#else + vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }; + vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }; + vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }; + vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 }; + vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 }; + vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 }; +#endif + + (void) lag; + FLAC__ASSERT(lag <= 12); + FLAC__ASSERT(lag <= data_len); + + base = data; + + d0 = vec_vsx_ld(0, base); + d1 = vec_vsx_ld(16, base); + d2 = vec_vsx_ld(32, base); + + base += 12; + + for (i = 0; i <= (limit-3); i += 4) { + vector float d, d0_orig = d0; + + d3 = vec_vsx_ld(0, base); + base += 4; + + d = vec_splat(d0_orig, 0); + sum0 += d0 * d; + sum1 += d1 * d; + sum2 += d2 * d; + + d = vec_splat(d0_orig, 1); + d0 = vec_sel(d0_orig, d3, vsel1); + sum10 += d0 * d; + sum11 += d1 * d; + sum12 += d2 * d; + + d = vec_splat(d0_orig, 2); + d0 = vec_sel(d0_orig, d3, vsel2); + sum20 += d0 * d; + sum21 += d1 * d; + sum22 += d2 * d; + + d = vec_splat(d0_orig, 3); + d0 = vec_sel(d0_orig, d3, vsel3); + sum30 += d0 * d; + sum31 += d1 * d; + sum32 += d2 * d; + + d0 = d1; + d1 = d2; + d2 = d3; + } + + sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm1); + sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm1); + sum2 += vec_perm(sum12, sum10, (vector unsigned char)vperm1); + + sum0 += vec_perm(sum20, sum21, (vector unsigned char)vperm2); + sum1 += vec_perm(sum21, sum22, (vector unsigned char)vperm2); + sum2 += vec_perm(sum22, sum20, (vector unsigned char)vperm2); + + sum0 += vec_perm(sum30, sum31, (vector unsigned char)vperm3); + sum1 += vec_perm(sum31, sum32, (vector unsigned char)vperm3); + sum2 += vec_perm(sum32, sum30, (vector unsigned char)vperm3); + + for (; i <= limit; i++) { + vector float d; + + d0 = vec_vsx_ld(0, data+i); + d1 = vec_vsx_ld(16, data+i); + d2 = vec_vsx_ld(32, data+i); + + d = vec_splat(d0, 0); + sum0 += d0 * d; + sum1 += d1 * d; + sum2 += d2 * d; + } + + vec_vsx_st(sum0, 0, autoc); + vec_vsx_st(sum1, 16, autoc); + vec_vsx_st(sum2, 32, autoc); + + for (; i < (long)data_len; i++) { + uint32_t coeff; + + FLAC__real d = data[i]; + for (coeff = 0; coeff < data_len - i; coeff++) + autoc[coeff] += d * data[i+coeff]; + } +} + +__attribute__((target("cpu=power9"))) +void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]) +{ + long i; + long limit = (long)data_len - 8; + const FLAC__real *base; + vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum1 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum11 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum21 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum31 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float d0, d1, d2; +#if WORDS_BIGENDIAN + vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }; + vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }; + vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; + vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 }; + vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 }; + vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B }; +#else + vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }; + vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }; + vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }; + vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 }; + vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 }; + vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 }; +#endif + + (void) lag; + FLAC__ASSERT(lag <= 8); + FLAC__ASSERT(lag <= data_len); + + base = data; + + d0 = vec_vsx_ld(0, base); + d1 = vec_vsx_ld(16, base); + + base += 8; + + for (i = 0; i <= (limit-2); i += 4) { + vector float d, d0_orig = d0; + + d2 = vec_vsx_ld(0, base); + base += 4; + + d = vec_splat(d0_orig, 0); + sum0 += d0 * d; + sum1 += d1 * d; + + d = vec_splat(d0_orig, 1); + d0 = vec_sel(d0_orig, d2, vsel1); + sum10 += d0 * d; + sum11 += d1 * d; + + d = vec_splat(d0_orig, 2); + d0 = vec_sel(d0_orig, d2, vsel2); + sum20 += d0 * d; + sum21 += d1 * d; + + d = vec_splat(d0_orig, 3); + d0 = vec_sel(d0_orig, d2, vsel3); + sum30 += d0 * d; + sum31 += d1 * d; + + d0 = d1; + d1 = d2; + } + + sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm1); + sum1 += vec_perm(sum11, sum10, (vector unsigned char)vperm1); + + sum0 += vec_perm(sum20, sum21, (vector unsigned char)vperm2); + sum1 += vec_perm(sum21, sum20, (vector unsigned char)vperm2); + + sum0 += vec_perm(sum30, sum31, (vector unsigned char)vperm3); + sum1 += vec_perm(sum31, sum30, (vector unsigned char)vperm3); + + for (; i <= limit; i++) { + vector float d; + + d0 = vec_vsx_ld(0, data+i); + d1 = vec_vsx_ld(16, data+i); + + d = vec_splat(d0, 0); + sum0 += d0 * d; + sum1 += d1 * d; + } + + vec_vsx_st(sum0, 0, autoc); + vec_vsx_st(sum1, 16, autoc); + + for (; i < (long)data_len; i++) { + uint32_t coeff; + + FLAC__real d = data[i]; + for (coeff = 0; coeff < data_len - i; coeff++) + autoc[coeff] += d * data[i+coeff]; + } +} + +__attribute__((target("cpu=power9"))) +void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_4(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[]) +{ + long i; + long limit = (long)data_len - 4; + const FLAC__real *base; + vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f}; + vector float d0, d1; +#if WORDS_BIGENDIAN + vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }; + vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }; + vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; + vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 }; + vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 }; + vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B }; +#else + vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }; + vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }; + vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }; + vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 }; + vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 }; + vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 }; +#endif + + (void) lag; + FLAC__ASSERT(lag <= 4); + FLAC__ASSERT(lag <= data_len); + + base = data; + + d0 = vec_vsx_ld(0, base); + + base += 4; + + for (i = 0; i <= (limit-1); i += 4) { + vector float d, d0_orig = d0; + + d1 = vec_vsx_ld(0, base); + base += 4; + + d = vec_splat(d0_orig, 0); + sum0 += d0 * d; + + d = vec_splat(d0_orig, 1); + d0 = vec_sel(d0_orig, d1, vsel1); + sum10 += d0 * d; + + d = vec_splat(d0_orig, 2); + d0 = vec_sel(d0_orig, d1, vsel2); + sum20 += d0 * d; + + d = vec_splat(d0_orig, 3); + d0 = vec_sel(d0_orig, d1, vsel3); + sum30 += d0 * d; + + d0 = d1; + } + + sum0 += vec_perm(sum10, sum10, (vector unsigned char)vperm1); + + sum0 += vec_perm(sum20, sum20, (vector unsigned char)vperm2); + + sum0 += vec_perm(sum30, sum30, (vector unsigned char)vperm3); + + for (; i <= limit; i++) { + vector float d; + + d0 = vec_vsx_ld(0, data+i); + + d = vec_splat(d0, 0); + sum0 += d0 * d; + } + + vec_vsx_st(sum0, 0, autoc); + + for (; i < (long)data_len; i++) { + uint32_t coeff; + + FLAC__real d = data[i]; + for (coeff = 0; coeff < data_len - i; coeff++) + autoc[coeff] += d * data[i+coeff]; + } +} +#endif /* FLAC__HAS_TARGET_POWER9 */ + +#endif /* FLAC__CPU_PPC64 && FLAC__USE_VSX */ +#endif /* FLAC__NO_ASM */ +#endif /* FLAC__INTEGER_ONLY_LIBRARY */ diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c index 87cfb580..74387ec3 100644 --- a/src/libFLAC/stream_encoder.c +++ b/src/libFLAC/stream_encoder.c @@ -885,6 +885,36 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( /* now override with asm where appropriate */ #ifndef FLAC__INTEGER_ONLY_LIBRARY # ifndef FLAC__NO_ASM +#if defined(FLAC__CPU_PPC64) && defined(FLAC__USE_VSX) +#ifdef FLAC__HAS_TARGET_POWER8 +#ifdef FLAC__HAS_TARGET_POWER9 + if (encoder->private_->cpuinfo.ppc.arch_3_00) { + if(encoder->protected_->max_lpc_order < 4) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_4; + else if(encoder->protected_->max_lpc_order < 8) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8; + else if(encoder->protected_->max_lpc_order < 12) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_12; + else if(encoder->protected_->max_lpc_order < 16) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_16; + else + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation; + } else +#endif + if (encoder->private_->cpuinfo.ppc.arch_2_07) { + if(encoder->protected_->max_lpc_order < 4) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_4; + else if(encoder->protected_->max_lpc_order < 8) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_8; + else if(encoder->protected_->max_lpc_order < 12) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_12; + else if(encoder->protected_->max_lpc_order < 16) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_16; + else + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation; + } +#endif +#endif if(encoder->private_->cpuinfo.use_asm) { # ifdef FLAC__CPU_IA32 FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32); |