From aeb5de55a9c646ca334e4f6252af28536ca22349 Mon Sep 17 00:00:00 2001 From: Jun He Date: Tue, 17 Aug 2021 16:36:43 +0800 Subject: decompress: refine data dependency The final ip advance value doesn't have to wait for the result of offset to load *tag. It can be computed along with the offset, so the codegen will use one csinc in parallel with ldrb. This will improve the throughput. With this change, an uplift of ~4.2% is observed in UFlat/10 and ~3.7% in UFlatMedley. Signed-off-by: Jun He Change-Id: I20ab211235bbf578c6c978f2bbd9160a49e920da --- snappy.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/snappy.cc b/snappy.cc index 4008e76..670b87e 100644 --- a/snappy.cc +++ b/snappy.cc @@ -1015,12 +1015,16 @@ size_t AdvanceToNextTagARMOptimized(const uint8_t** ip_p, size_t* tag) { // delta2 = ((c >> 2) + 1) ip++ // This is different from X86 optimizations because ARM has conditional add // instruction (csinc) and it removes several register moves. - const size_t literal_tag_offset = (*tag >> 2) + 1; const size_t tag_type = *tag & 3; const bool is_literal = (tag_type == 0); - *tag = is_literal ? ip[literal_tag_offset] : ip[tag_type]; - ip += is_literal ? literal_tag_offset : tag_type; - ip++; + if (is_literal) { + size_t next_literal_tag = (*tag >> 2) + 1; + *tag = ip[next_literal_tag]; + ip += next_literal_tag + 1; + } else { + *tag = ip[tag_type]; + ip += tag_type + 1; + } return tag_type; } -- cgit v1.2.1