diff --git a/crypto/Makefile b/crypto/Makefile index 99a9fa9087d1..0d2cdd523fd9 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -98,7 +98,14 @@ CFLAGS_aegis128-neon-inner.o += -mfpu=crypto-neon-fp-armv8 aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o endif ifeq ($(ARCH),arm64) -CFLAGS_aegis128-neon-inner.o += -ffreestanding -mcpu=generic+crypto +aegis128-cflags-y := -ffreestanding -mcpu=generic+crypto +aegis128-cflags-$(CONFIG_CC_IS_GCC) += -ffixed-q16 -ffixed-q17 -ffixed-q18 \ + -ffixed-q19 -ffixed-q20 -ffixed-q21 \ + -ffixed-q22 -ffixed-q23 -ffixed-q24 \ + -ffixed-q25 -ffixed-q26 -ffixed-q27 \ + -ffixed-q28 -ffixed-q29 -ffixed-q30 \ + -ffixed-q31 +CFLAGS_aegis128-neon-inner.o += $(aegis128-cflags-y) CFLAGS_REMOVE_aegis128-neon-inner.o += -mgeneral-regs-only aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o endif diff --git a/crypto/aegis128-neon-inner.c b/crypto/aegis128-neon-inner.c index 3d8043c4832b..ed55568afd1b 100644 --- a/crypto/aegis128-neon-inner.c +++ b/crypto/aegis128-neon-inner.c @@ -17,6 +17,8 @@ #include +extern int aegis128_have_aes_insn; + void *memcpy(void *dest, const void *src, size_t n); void *memset(void *s, int c, size_t n); @@ -24,6 +26,8 @@ struct aegis128_state { uint8x16_t v[5]; }; +extern const uint8x16x4_t crypto_aes_sbox[]; + static struct aegis128_state aegis128_load_state_neon(const void *state) { return (struct aegis128_state){ { @@ -49,6 +53,46 @@ uint8x16_t aegis_aes_round(uint8x16_t w) { uint8x16_t z = {}; +#ifdef CONFIG_ARM64 + if (!__builtin_expect(aegis128_have_aes_insn, 1)) { + static const uint8x16_t shift_rows = { + 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3, + 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, + }; + static const uint8x16_t ror32by8 = { + 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, + }; + uint8x16_t v; + + // shift rows + w = vqtbl1q_u8(w, shift_rows); + + // sub bytes + if (!IS_ENABLED(CONFIG_CC_IS_GCC)) { + v = vqtbl4q_u8(crypto_aes_sbox[0], w); + v = vqtbx4q_u8(v, crypto_aes_sbox[1], w - 0x40); + v = vqtbx4q_u8(v, crypto_aes_sbox[2], w - 0x80); + v = vqtbx4q_u8(v, crypto_aes_sbox[3], w - 0xc0); + } else { + asm("tbl %0.16b, {v16.16b-v19.16b}, %1.16b" : "=w"(v) : "w"(w)); + w -= 0x40; + asm("tbx %0.16b, {v20.16b-v23.16b}, %1.16b" : "+w"(v) : "w"(w)); + w -= 0x40; + asm("tbx %0.16b, {v24.16b-v27.16b}, %1.16b" : "+w"(v) : "w"(w)); + w -= 0x40; + asm("tbx %0.16b, {v28.16b-v31.16b}, %1.16b" : "+w"(v) : "w"(w)); + } + + // mix columns + w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b); + w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v); + w ^= vqtbl1q_u8(v ^ w, ror32by8); + + return w; + } +#endif + /* * We use inline asm here instead of the vaeseq_u8/vaesmcq_u8 intrinsics * to force the compiler to issue the aese/aesmc instructions in pairs. @@ -73,10 +117,27 @@ struct aegis128_state aegis128_update_neon(struct aegis128_state st, return st; } +static inline __attribute__((always_inline)) +void preload_sbox(void) +{ + if (!IS_ENABLED(CONFIG_ARM64) || + !IS_ENABLED(CONFIG_CC_IS_GCC) || + __builtin_expect(aegis128_have_aes_insn, 1)) + return; + + asm("ld1 {v16.16b-v19.16b}, [%0], #64 \n\t" + "ld1 {v20.16b-v23.16b}, [%0], #64 \n\t" + "ld1 {v24.16b-v27.16b}, [%0], #64 \n\t" + "ld1 {v28.16b-v31.16b}, [%0] \n\t" + :: "r"(crypto_aes_sbox)); +} + void crypto_aegis128_update_neon(void *state, const void *msg) { struct aegis128_state st = aegis128_load_state_neon(state); + preload_sbox(); + st = aegis128_update_neon(st, vld1q_u8(msg)); aegis128_save_state_neon(st, state); @@ -88,6 +149,8 @@ void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src, struct aegis128_state st = aegis128_load_state_neon(state); uint8x16_t msg; + preload_sbox(); + while (size >= AEGIS_BLOCK_SIZE) { uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4]; @@ -120,6 +183,8 @@ void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src, struct aegis128_state st = aegis128_load_state_neon(state); uint8x16_t msg; + preload_sbox(); + while (size >= AEGIS_BLOCK_SIZE) { msg = vld1q_u8(src) ^ st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4]; st = aegis128_update_neon(st, msg); diff --git a/crypto/aegis128-neon.c b/crypto/aegis128-neon.c index c1c0a1686f67..751f9c195aa4 100644 --- a/crypto/aegis128-neon.c +++ b/crypto/aegis128-neon.c @@ -14,9 +14,15 @@ void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src, void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src, unsigned int size); +int aegis128_have_aes_insn __ro_after_init; + bool crypto_aegis128_have_simd(void) { - return cpu_have_feature(cpu_feature(AES)); + if (cpu_have_feature(cpu_feature(AES))) { + aegis128_have_aes_insn = 1; + return true; + } + return IS_ENABLED(CONFIG_ARM64); } void crypto_aegis128_update_simd(union aegis_block *state, const void *msg)