summaryrefslogtreecommitdiff
path: root/driver
diff options
context:
space:
mode:
author Peter Marheine <pmarheine@chromium.org> 2023-02-09 14:14:30 +1100
committer Chromeos LUCI <chromeos-scoped@luci-project-accounts.iam.gserviceaccount.com> 2023-02-09 23:45:25 +0000
commit 76b9b1ae5cdc15d5b62c2bafe4db1aefd7964e6f (patch)
tree 14252246f6cb508c5586b6bc79c91bba31e42f1e /driver
parent d2582f3b416a72465d72676dfb0f3a10f45758e2 (diff)
download chrome-ec-76b9b1ae5cdc15d5b62c2bafe4db1aefd7964e6f.tar.gz
it8xxx2: micro-optimize SHA256 computation
Hash performance is a significant contributor to EC boot-up time, so it's valuable to optimize the computation of firmware hashes. This change improves RW image hashing performance on Nereid, reducing time taken for `hash rw` on the console by about 15ms on average (from 1263ms to 1244ms for 384kB of data).

This primarily replaces the byte swapping in SHA256_update with an inline assembly version that saves 6 instructions per word of data hashed. Calling htobe32() turns into a call into libgcc's __bswapsi2 which does implement an efficient swap, but spends several instructions reloading constant values used by the function and incurs function call overhead. This inline version saves 4 instructions by allowing the constants to be kept in registers through the entire SHA256_update (effectively hoisting the constant loads outside the loop), and 2 more in procedure call overhead.

SHA256_chip_calculation is also optimized slightly: declaring the hash_ctrl local as volatile forces it to be allocated a stack slot and the read value be written to the stack (then ignored), which is unnecessary because the macro referring to the SHA1HASHCTRLR register is already a volatile read. Removing the unneeded qualifier saves 3 instructions in total, split between the stack adjustment that is no longer needed (because the function now operates only on values that fit in registers) and the unneeded store that is now removed.

BUG=b:260762509
TEST=`hash 0xbfffc 32` has the same value before and after this change, with manual verification that the 32 bytes of memory at 0x800bfffc are unchanged between the two builds (`md 0x800bfffc 32`).
BRANCH=nissa
LOW_COVERAGE_REASON=uncovered line can only run on actual hardware, not native_posix where coverage is gathered

Change-Id: I99fcb2278518f6a57046985ef03a58ebb2f307a1
Signed-off-by: Peter Marheine <pmarheine@chromium.org>
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/ec/+/4235298
Reviewed-by: Adam Mills <adamjmills@google.com>
Diffstat (limited to 'driver')
-rw-r--r-- driver/sha256/sha256_it8xxx2.c 32
1 files changed, 30 insertions, 2 deletions
diff --git a/driver/sha256/sha256_it8xxx2.c b/driver/sha256/sha256_it8xxx2.c
index 32fda2c714..eb9d83e948 100644
--- a/driver/sha256/sha256_it8xxx2.c
+++ b/driver/sha256/sha256_it8xxx2.c
@@ -66,7 +66,7 @@ void SHA256_init(struct sha256_ctx *ctx)
static void SHA256_chip_calculation(struct sha256_ctx *ctx)
{
- volatile uint8_t hash_ctrl __unused;
+ uint8_t hash_ctrl __unused;
uint32_t key;
key = irq_lock();
@@ -85,7 +85,35 @@ void SHA256_update(struct sha256_ctx *ctx, const uint8_t *data, uint32_t len)
ASSERT(len % 4 == 0);
while (rem_len) {
- ctx->w[ctx->w_index++] = htobe32(p[data_index++]);
+ uint32_t tmp, x = p[data_index++];
+
+ /*
+ * htobe32(x); manually inlining this saves several instructions
+ * when compared to a call to __bswapsi2, saving function call
+ * overhead and reloading of the mask constants (because there
+ * are registers to spare in this function). It's written as
+ * inline assembly to ensure that it won't get lowered to a
+ * __builtin_bswap32 that might not be inlined.
+ *
+ * x = ((x << 8) & 0xFF00FF00) | ((x >> 8) & 0xFF00FF);
+ * x = (x << 16) | (x >> 16)
+ */
+ if (IS_ENABLED(CONFIG_RISCV) &&
+ IS_ENABLED(CONFIG_LITTLE_ENDIAN)) {
+ __asm__(" slli %[scratch], %[x], 8\n"
+ " srli %[x], %[x], 8\n"
+ " and %[scratch], %[scratch], %[hi]\n"
+ " and %[x], %[x], %[lo]\n"
+ " or %[x], %[scratch], %[x]\n"
+ " slli %[scratch], %[x], 16\n"
+ " srli %[x], %[x], 16\n"
+ " or %[x], %[x], %[scratch]\n"
+ : [x] "+r"(x), [scratch] "=&r"(tmp)
+ : [hi] "r"(0xFF00FF00), [lo] "r"(0x00FF00FF));
+ } else {
+ x = htobe32(x);
+ }
+ ctx->w[ctx->w_index++] = x;
if (ctx->w_index >= 16) {
SHA256_chip_calculation(ctx);
}