From 20ce6f9bf294f54e09a422bf78757cf821bb2f13 Mon Sep 17 00:00:00 2001 From: Martijn van Beurden Date: Mon, 10 Oct 2022 08:37:46 +0200 Subject: Do not let small blocksizes be handled by intrinsics autocorrelation calculation. Also, fix a bug in which apodization windows were not recalculated when blocksize was shrunk --- src/libFLAC/lpc.c | 34 +++++----- src/libFLAC/stream_encoder.c | 145 +++++++++++++++++++++++-------------------- 2 files changed, 95 insertions(+), 84 deletions(-) diff --git a/src/libFLAC/lpc.c b/src/libFLAC/lpc.c index 1814fef5..a760b121 100644 --- a/src/libFLAC/lpc.c +++ b/src/libFLAC/lpc.c @@ -130,23 +130,7 @@ void FLAC__lpc_compute_autocorrelation(const FLAC__real data[], uint32_t data_le autoc[lag] = d; } #endif - if(lag <= 8) { - #undef MAX_LAG - #define MAX_LAG 8 - #include "deduplication/lpc_compute_autocorrelation_intrin.c" - } - else if(lag <= 12) { - #undef MAX_LAG - #define MAX_LAG 12 - #include "deduplication/lpc_compute_autocorrelation_intrin.c" - } - else if(lag <= 16) { - #undef MAX_LAG - #define MAX_LAG 16 - #include "deduplication/lpc_compute_autocorrelation_intrin.c" - } - else { - + if (data_len < FLAC__MAX_LPC_ORDER || lag > 16) { /* * this version tends to run faster because of better data locality * ('data_len' is usually much larger than 'lag') @@ -171,6 +155,22 @@ void FLAC__lpc_compute_autocorrelation(const FLAC__real data[], uint32_t data_le autoc[coeff] += d * data[sample+coeff]; } } + else if(lag <= 8) { + #undef MAX_LAG + #define MAX_LAG 8 + #include "deduplication/lpc_compute_autocorrelation_intrin.c" + } + else if(lag <= 12) { + #undef MAX_LAG + #define MAX_LAG 12 + #include "deduplication/lpc_compute_autocorrelation_intrin.c" + } + else if(lag <= 16) { + #undef MAX_LAG + #define MAX_LAG 16 + #include "deduplication/lpc_compute_autocorrelation_intrin.c" + } + } void FLAC__lpc_compute_lp_coefficients(const double autoc[], uint32_t *max_order, FLAC__real lp_coeff[][FLAC__MAX_LPC_ORDER], double error[]) diff --git 
a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c index 2879c51c..b33847dc 100644 --- a/src/libFLAC/stream_encoder.c +++ b/src/libFLAC/stream_encoder.c @@ -1486,6 +1486,10 @@ FLAC_API FLAC__bool FLAC__stream_encoder_finish(FLAC__StreamEncoder *encoder) if(encoder->protected_->state == FLAC__STREAM_ENCODER_OK && !encoder->private_->is_being_deleted) { if(encoder->private_->current_sample_number != 0) { encoder->protected_->blocksize = encoder->private_->current_sample_number; + if(!resize_buffers_(encoder, encoder->protected_->blocksize)) { + /* the above function sets the state for us in case of an error */ + return FLAC__STREAM_ENCODER_INIT_STATUS_ENCODER_ERROR; + } if(!process_frame_(encoder, /*is_last_block=*/true)) error = true; } @@ -2562,81 +2566,88 @@ FLAC__bool resize_buffers_(FLAC__StreamEncoder *encoder, uint32_t new_blocksize) FLAC__ASSERT(encoder->protected_->state == FLAC__STREAM_ENCODER_OK); FLAC__ASSERT(encoder->private_->current_sample_number == 0); - /* To avoid excessive malloc'ing, we only grow the buffer; no shrinking. */ - if(new_blocksize <= encoder->private_->input_capacity) - return true; - ok = true; - /* WATCHOUT: FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx() and ..._intrin_sse2() - * require that the input arrays (in our case the integer signals) - * have a buffer of up to 3 zeroes in front (at negative indices) for - * alignment purposes; we use 4 in front to keep the data well-aligned. - */ + /* To avoid excessive malloc'ing, we only grow the buffer; no shrinking. */ + if(new_blocksize > encoder->private_->input_capacity) { + + /* WATCHOUT: FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx() and ..._intrin_sse2() + * require that the input arrays (in our case the integer signals) + * have a buffer of up to 3 zeroes in front (at negative indices) for + * alignment purposes; we use 4 in front to keep the data well-aligned. 
+ */ - for(i = 0; ok && i < encoder->protected_->channels; i++) { - ok = ok && FLAC__memory_alloc_aligned_int32_array(new_blocksize+4+OVERREAD_, &encoder->private_->integer_signal_unaligned[i], &encoder->private_->integer_signal[i]); - if(ok) { - memset(encoder->private_->integer_signal[i], 0, sizeof(FLAC__int32)*4); - encoder->private_->integer_signal[i] += 4; + for(i = 0; ok && i < encoder->protected_->channels; i++) { + ok = ok && FLAC__memory_alloc_aligned_int32_array(new_blocksize+4+OVERREAD_, &encoder->private_->integer_signal_unaligned[i], &encoder->private_->integer_signal[i]); + if(ok) { + memset(encoder->private_->integer_signal[i], 0, sizeof(FLAC__int32)*4); + encoder->private_->integer_signal[i] += 4; + } } - } - for(i = 0; ok && i < 2; i++) { - ok = ok && FLAC__memory_alloc_aligned_int32_array(new_blocksize+4+OVERREAD_, &encoder->private_->integer_signal_mid_side_unaligned[i], &encoder->private_->integer_signal_mid_side[i]); - if(ok) { - memset(encoder->private_->integer_signal_mid_side[i], 0, sizeof(FLAC__int32)*4); - encoder->private_->integer_signal_mid_side[i] += 4; + for(i = 0; ok && i < 2; i++) { + ok = ok && FLAC__memory_alloc_aligned_int32_array(new_blocksize+4+OVERREAD_, &encoder->private_->integer_signal_mid_side_unaligned[i], &encoder->private_->integer_signal_mid_side[i]); + if(ok) { + memset(encoder->private_->integer_signal_mid_side[i], 0, sizeof(FLAC__int32)*4); + encoder->private_->integer_signal_mid_side[i] += 4; + } } - } - ok = ok && FLAC__memory_alloc_aligned_int64_array(new_blocksize+4+OVERREAD_, &encoder->private_->integer_signal_33bit_side_unaligned, &encoder->private_->integer_signal_33bit_side); + ok = ok && FLAC__memory_alloc_aligned_int64_array(new_blocksize+4+OVERREAD_, &encoder->private_->integer_signal_33bit_side_unaligned, &encoder->private_->integer_signal_33bit_side); #ifndef FLAC__INTEGER_ONLY_LIBRARY - if(ok && encoder->protected_->max_lpc_order > 0) { - for(i = 0; ok && i < encoder->protected_->num_apodizations; i++) 
- ok = ok && FLAC__memory_alloc_aligned_real_array(new_blocksize, &encoder->private_->window_unaligned[i], &encoder->private_->window[i]); - ok = ok && FLAC__memory_alloc_aligned_real_array(new_blocksize, &encoder->private_->windowed_signal_unaligned, &encoder->private_->windowed_signal); - } + if(ok && encoder->protected_->max_lpc_order > 0) { + for(i = 0; ok && i < encoder->protected_->num_apodizations; i++) + ok = ok && FLAC__memory_alloc_aligned_real_array(new_blocksize, &encoder->private_->window_unaligned[i], &encoder->private_->window[i]); + ok = ok && FLAC__memory_alloc_aligned_real_array(new_blocksize, &encoder->private_->windowed_signal_unaligned, &encoder->private_->windowed_signal); + } #endif - for(channel = 0; ok && channel < encoder->protected_->channels; channel++) { - for(i = 0; ok && i < 2; i++) { - ok = ok && FLAC__memory_alloc_aligned_int32_array(new_blocksize, &encoder->private_->residual_workspace_unaligned[channel][i], &encoder->private_->residual_workspace[channel][i]); + for(channel = 0; ok && channel < encoder->protected_->channels; channel++) { + for(i = 0; ok && i < 2; i++) { + ok = ok && FLAC__memory_alloc_aligned_int32_array(new_blocksize, &encoder->private_->residual_workspace_unaligned[channel][i], &encoder->private_->residual_workspace[channel][i]); + } } - } - for(channel = 0; ok && channel < encoder->protected_->channels; channel++) { - for(i = 0; ok && i < 2; i++) { - ok = ok && FLAC__format_entropy_coding_method_partitioned_rice_contents_ensure_size(&encoder->private_->partitioned_rice_contents_workspace[channel][i], encoder->protected_->max_residual_partition_order); - ok = ok && FLAC__format_entropy_coding_method_partitioned_rice_contents_ensure_size(&encoder->private_->partitioned_rice_contents_workspace[channel][i], encoder->protected_->max_residual_partition_order); + for(channel = 0; ok && channel < encoder->protected_->channels; channel++) { + for(i = 0; ok && i < 2; i++) { + ok = ok && 
FLAC__format_entropy_coding_method_partitioned_rice_contents_ensure_size(&encoder->private_->partitioned_rice_contents_workspace[channel][i], encoder->protected_->max_residual_partition_order); + ok = ok && FLAC__format_entropy_coding_method_partitioned_rice_contents_ensure_size(&encoder->private_->partitioned_rice_contents_workspace[channel][i], encoder->protected_->max_residual_partition_order); + } } - } - for(channel = 0; ok && channel < 2; channel++) { - for(i = 0; ok && i < 2; i++) { - ok = ok && FLAC__memory_alloc_aligned_int32_array(new_blocksize, &encoder->private_->residual_workspace_mid_side_unaligned[channel][i], &encoder->private_->residual_workspace_mid_side[channel][i]); + for(channel = 0; ok && channel < 2; channel++) { + for(i = 0; ok && i < 2; i++) { + ok = ok && FLAC__memory_alloc_aligned_int32_array(new_blocksize, &encoder->private_->residual_workspace_mid_side_unaligned[channel][i], &encoder->private_->residual_workspace_mid_side[channel][i]); + } + } + + for(channel = 0; ok && channel < 2; channel++) { + for(i = 0; ok && i < 2; i++) { + ok = ok && FLAC__format_entropy_coding_method_partitioned_rice_contents_ensure_size(&encoder->private_->partitioned_rice_contents_workspace_mid_side[channel][i], encoder->protected_->max_residual_partition_order); + } } - } - for(channel = 0; ok && channel < 2; channel++) { for(i = 0; ok && i < 2; i++) { - ok = ok && FLAC__format_entropy_coding_method_partitioned_rice_contents_ensure_size(&encoder->private_->partitioned_rice_contents_workspace_mid_side[channel][i], encoder->protected_->max_residual_partition_order); + ok = ok && FLAC__format_entropy_coding_method_partitioned_rice_contents_ensure_size(&encoder->private_->partitioned_rice_contents_extra[i], encoder->protected_->max_residual_partition_order); } - } - for(i = 0; ok && i < 2; i++) { - ok = ok && FLAC__format_entropy_coding_method_partitioned_rice_contents_ensure_size(&encoder->private_->partitioned_rice_contents_extra[i], 
encoder->protected_->max_residual_partition_order); - } + /* the *2 is an approximation to the series 1 + 1/2 + 1/4 + ... that sums tree occupies in a flat array */ + /*@@@ new_blocksize*2 is too pessimistic, but to fix, we need smarter logic because a smaller new_blocksize can actually increase the # of partitions; would require moving this out into a separate function, then checking its capacity against the need of the current blocksize&min/max_partition_order (and maybe predictor order) */ + ok = ok && FLAC__memory_alloc_aligned_uint64_array(new_blocksize * 2, &encoder->private_->abs_residual_partition_sums_unaligned, &encoder->private_->abs_residual_partition_sums); + if(encoder->protected_->do_escape_coding) + ok = ok && FLAC__memory_alloc_aligned_unsigned_array(new_blocksize * 2, &encoder->private_->raw_bits_per_partition_unaligned, &encoder->private_->raw_bits_per_partition); +} + if(ok) + encoder->private_->input_capacity = new_blocksize; + else { + encoder->protected_->state = FLAC__STREAM_ENCODER_MEMORY_ALLOCATION_ERROR; + return ok; + } - /* the *2 is an approximation to the series 1 + 1/2 + 1/4 + ... 
that sums tree occupies in a flat array */ - /*@@@ new_blocksize*2 is too pessimistic, but to fix, we need smarter logic because a smaller new_blocksize can actually increase the # of partitions; would require moving this out into a separate function, then checking its capacity against the need of the current blocksize&min/max_partition_order (and maybe predictor order) */ - ok = ok && FLAC__memory_alloc_aligned_uint64_array(new_blocksize * 2, &encoder->private_->abs_residual_partition_sums_unaligned, &encoder->private_->abs_residual_partition_sums); - if(encoder->protected_->do_escape_coding) - ok = ok && FLAC__memory_alloc_aligned_unsigned_array(new_blocksize * 2, &encoder->private_->raw_bits_per_partition_unaligned, &encoder->private_->raw_bits_per_partition); /* now adjust the windows if the blocksize has changed */ #ifndef FLAC__INTEGER_ONLY_LIBRARY - if(ok && new_blocksize != encoder->private_->input_capacity && encoder->protected_->max_lpc_order > 0) { - for(i = 0; ok && i < encoder->protected_->num_apodizations; i++) { + if(encoder->protected_->max_lpc_order > 0) { + for(i = 0; i < encoder->protected_->num_apodizations; i++) { switch(encoder->protected_->apodizations[i].type) { case FLAC__APODIZATION_BARTLETT: FLAC__window_bartlett(encoder->private_->window[i], new_blocksize); @@ -2700,14 +2711,14 @@ FLAC__bool resize_buffers_(FLAC__StreamEncoder *encoder, uint32_t new_blocksize) } } } + if (new_blocksize < FLAC__MAX_LPC_ORDER) { + /* intrinsics autocorrelation routines do not all handle cases in which lag might be + * larger than data_len */ + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation; + } #endif - if(ok) - encoder->private_->input_capacity = new_blocksize; - else - encoder->protected_->state = FLAC__STREAM_ENCODER_MEMORY_ALLOCATION_ERROR; - - return ok; + return true; } FLAC__bool write_bitbuffer_(FLAC__StreamEncoder *encoder, uint32_t samples, FLAC__bool is_last_block) @@ -3734,14 +3745,14 @@ FLAC__bool 
process_subframe_( } else { /* window part of subblock */ - if(max_lpc_order_this_apodization >= frame_header->blocksize/b) { - max_lpc_order_this_apodization = frame_header->blocksize/b - 1; - if(frame_header->blocksize/b > 0) - max_lpc_order_this_apodization = frame_header->blocksize/b - 1; - else { - set_next_subdivide_tukey(encoder->protected_->apodizations[a].parameters.subdivide_tukey.parts, &a, &b, &c); - continue; - } + if(frame_header->blocksize/b < FLAC__MAX_LPC_ORDER) { + /* intrinsics autocorrelation routines do not all handle cases in which lag might be + * larger than data_len, and some routines round lag up to the nearest multiple of 4 + * As little gain is expected from using LPC on part of a signal as small as 32 samples + * and to enable widening this rounding up to larger values in the future, windowing + * parts smaller than FLAC__MAX_LPC_ORDER (which is 32) samples is not supported */ + set_next_subdivide_tukey(encoder->protected_->apodizations[a].parameters.subdivide_tukey.parts, &a, &b, &c); + continue; } if(!(c % 2)){ /* on even c, evaluate the (c/2)th partial window of size blocksize/b */ -- cgit v1.2.1