summaryrefslogtreecommitdiff
path: root/arch/x86/crypto/aesni-intel_asm.S
diff options
context:
space:
mode:
authorArd Biesheuvel <ardb@kernel.org>2020-12-08 00:34:02 +0100
committerHerbert Xu <herbert@gondor.apana.org.au>2021-01-03 08:41:34 +1100
commitddf169a98f01d6fd46295ec0dd4c1d6385be65d4 (patch)
tree67811b959d2e4af9d06d09be2b60774cba7404d8 /arch/x86/crypto/aesni-intel_asm.S
parenta417178abc4ae2517231ee67a1291d58929fade1 (diff)
crypto: aesni - implement support for cts(cbc(aes))
Follow the same approach as the arm64 driver for implementing a version of AES-NI in CBC mode that supports ciphertext stealing. This results in a ~2x speed increase for relatively short inputs (less than 256 bytes), which is relevant given that AES-CBC with ciphertext stealing is used for filename encryption in the fscrypt layer. For larger inputs, the speedup is still significant (~25% on decryption, ~6% on encryption) Tested-by: Eric Biggers <ebiggers@google.com> # x86_64 Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto/aesni-intel_asm.S')
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S129
1 files changed, 128 insertions, 1 deletions
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index d1436c37008b..a2710f76862f 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -2577,13 +2577,140 @@ SYM_FUNC_START(aesni_cbc_dec)
ret
SYM_FUNC_END(aesni_cbc_dec)
-#ifdef __x86_64__
+/*
+ * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+SYM_FUNC_START(aesni_cts_cbc_enc)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
+ lea .Lcts_permute_table, T1
+#else
+ lea .Lcts_permute_table(%rip), T1
+#endif
+ mov 480(KEYP), KLEN
+ movups (IVP), STATE
+ sub $16, LEN
+ mov T1, IVP
+ add $32, IVP
+ add LEN, T1
+ sub LEN, IVP
+ movups (T1), %xmm4
+ movups (IVP), %xmm5
+
+ movups (INP), IN1
+ add LEN, INP
+ movups (INP), IN2
+
+ pxor IN1, STATE
+ call _aesni_enc1
+
+ pshufb %xmm5, IN2
+ pxor STATE, IN2
+ pshufb %xmm4, STATE
+ add OUTP, LEN
+ movups STATE, (LEN)
+
+ movaps IN2, STATE
+ call _aesni_enc1
+ movups STATE, (OUTP)
+
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
+ FRAME_END
+ ret
+SYM_FUNC_END(aesni_cts_cbc_enc)
+
+/*
+ * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+SYM_FUNC_START(aesni_cts_cbc_dec)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
+ lea .Lcts_permute_table, T1
+#else
+ lea .Lcts_permute_table(%rip), T1
+#endif
+ mov 480(KEYP), KLEN
+ add $240, KEYP
+ movups (IVP), IV
+ sub $16, LEN
+ mov T1, IVP
+ add $32, IVP
+ add LEN, T1
+ sub LEN, IVP
+ movups (T1), %xmm4
+
+ movups (INP), STATE
+ add LEN, INP
+ movups (INP), IN1
+
+ call _aesni_dec1
+ movaps STATE, IN2
+ pshufb %xmm4, STATE
+ pxor IN1, STATE
+
+ add OUTP, LEN
+ movups STATE, (LEN)
+
+ movups (IVP), %xmm0
+ pshufb %xmm0, IN1
+ pblendvb IN2, IN1
+ movaps IN1, STATE
+ call _aesni_dec1
+
+ pxor IV, STATE
+ movups STATE, (OUTP)
+
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
+ FRAME_END
+ ret
+SYM_FUNC_END(aesni_cts_cbc_dec)
+
.pushsection .rodata
.align 16
+.Lcts_permute_table:
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+#ifdef __x86_64__
.Lbswap_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+#endif
.popsection
+#ifdef __x86_64__
/*
* _aesni_inc_init: internal ABI
* setup registers used by _aesni_inc