diff options
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/crypto/Makefile | 12 | ||||
-rw-r--r-- | arch/x86/crypto/aes-i586-asm_32.S | 89 | ||||
-rw-r--r-- | arch/x86/crypto/aes-x86_64-asm_64.S | 68 | ||||
-rw-r--r-- | arch/x86/crypto/aes_32.c | 515 | ||||
-rw-r--r-- | arch/x86/crypto/aes_64.c | 336 | ||||
-rw-r--r-- | arch/x86/crypto/aes_glue.c | 57 | ||||
-rw-r--r-- | arch/x86/crypto/salsa20-i586-asm_32.S | 1114 | ||||
-rw-r--r-- | arch/x86/crypto/salsa20-x86_64-asm_64.S | 920 | ||||
-rw-r--r-- | arch/x86/crypto/salsa20_glue.c | 129 | ||||
-rw-r--r-- | arch/x86/crypto/twofish_64.c | 97 | ||||
-rw-r--r-- | arch/x86/crypto/twofish_glue.c (renamed from arch/x86/crypto/twofish_32.c) | 8 |
11 files changed, 2309 insertions, 1036 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 46bb609e2444..3874c2de5403 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -4,12 +4,16 @@ obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o +obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o +obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o -aes-i586-y := aes-i586-asm_32.o aes_32.o -twofish-i586-y := twofish-i586-asm_32.o twofish_32.o +aes-i586-y := aes-i586-asm_32.o aes_glue.o +twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o +salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o -aes-x86_64-y := aes-x86_64-asm_64.o aes_64.o -twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_64.o +aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o +twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o +salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o diff --git a/arch/x86/crypto/aes-i586-asm_32.S b/arch/x86/crypto/aes-i586-asm_32.S index f942f0c8f630..1093bede3e0a 100644 --- a/arch/x86/crypto/aes-i586-asm_32.S +++ b/arch/x86/crypto/aes-i586-asm_32.S @@ -46,9 +46,9 @@ #define in_blk 16 /* offsets in crypto_tfm structure */ -#define ekey (crypto_tfm_ctx_offset + 0) -#define nrnd (crypto_tfm_ctx_offset + 256) -#define dkey (crypto_tfm_ctx_offset + 260) +#define klen (crypto_tfm_ctx_offset + 0) +#define ekey (crypto_tfm_ctx_offset + 4) +#define dkey (crypto_tfm_ctx_offset + 244) // register mapping for encrypt and decrypt subroutines @@ -221,8 +221,8 @@ .global aes_enc_blk -.extern ft_tab -.extern fl_tab +.extern crypto_ft_tab +.extern crypto_fl_tab .align 4 @@ -236,7 +236,7 @@ aes_enc_blk: 1: push %ebx mov in_blk+4(%esp),%r2 push %esi - mov nrnd(%ebp),%r3 // number of rounds + mov klen(%ebp),%r3 // key size push %edi #if ekey != 0 lea ekey(%ebp),%ebp // key pointer @@ -255,26 +255,26 @@ aes_enc_blk: sub $8,%esp // space for register saves on stack add $16,%ebp // increment to next round key - cmp $12,%r3 + cmp $24,%r3 jb 4f // 10 rounds for 128-bit key lea 32(%ebp),%ebp je 3f // 12 rounds for 192-bit key lea 32(%ebp),%ebp -2: fwd_rnd1( -64(%ebp) ,ft_tab) // 14 rounds for 256-bit key - fwd_rnd2( -48(%ebp) ,ft_tab) -3: fwd_rnd1( -32(%ebp) ,ft_tab) // 12 rounds for 192-bit key - fwd_rnd2( -16(%ebp) ,ft_tab) -4: fwd_rnd1( (%ebp) ,ft_tab) // 10 rounds for 128-bit key - fwd_rnd2( +16(%ebp) ,ft_tab) - fwd_rnd1( +32(%ebp) ,ft_tab) - fwd_rnd2( +48(%ebp) ,ft_tab) - fwd_rnd1( +64(%ebp) ,ft_tab) - fwd_rnd2( +80(%ebp) ,ft_tab) - fwd_rnd1( +96(%ebp) ,ft_tab) - fwd_rnd2(+112(%ebp) ,ft_tab) - fwd_rnd1(+128(%ebp) ,ft_tab) - fwd_rnd2(+144(%ebp) ,fl_tab) // last round uses a different table +2: fwd_rnd1( -64(%ebp), crypto_ft_tab) // 14 rounds for 256-bit key + fwd_rnd2( -48(%ebp), crypto_ft_tab) +3: fwd_rnd1( -32(%ebp), crypto_ft_tab) // 12 rounds for 192-bit key + fwd_rnd2( -16(%ebp), crypto_ft_tab) +4: fwd_rnd1( (%ebp), crypto_ft_tab) // 10 rounds for 128-bit key + fwd_rnd2( +16(%ebp), crypto_ft_tab) + fwd_rnd1( +32(%ebp), crypto_ft_tab) + fwd_rnd2( +48(%ebp), crypto_ft_tab) + fwd_rnd1( +64(%ebp), crypto_ft_tab) + fwd_rnd2( +80(%ebp), crypto_ft_tab) + fwd_rnd1( +96(%ebp), crypto_ft_tab) + fwd_rnd2(+112(%ebp), crypto_ft_tab) + fwd_rnd1(+128(%ebp), crypto_ft_tab) + fwd_rnd2(+144(%ebp), crypto_fl_tab) // last round uses a different table // move final values to the output array. CAUTION: the // order of these assigns rely on the register mappings @@ -297,8 +297,8 @@ aes_enc_blk: .global aes_dec_blk -.extern it_tab -.extern il_tab +.extern crypto_it_tab +.extern crypto_il_tab .align 4 @@ -312,14 +312,11 @@ aes_dec_blk: 1: push %ebx mov in_blk+4(%esp),%r2 push %esi - mov nrnd(%ebp),%r3 // number of rounds + mov klen(%ebp),%r3 // key size push %edi #if dkey != 0 lea dkey(%ebp),%ebp // key pointer #endif - mov %r3,%r0 - shl $4,%r0 - add %r0,%ebp // input four columns and xor in first round key @@ -333,27 +330,27 @@ aes_dec_blk: xor 12(%ebp),%r5 sub $8,%esp // space for register saves on stack - sub $16,%ebp // increment to next round key - cmp $12,%r3 + add $16,%ebp // increment to next round key + cmp $24,%r3 jb 4f // 10 rounds for 128-bit key - lea -32(%ebp),%ebp + lea 32(%ebp),%ebp je 3f // 12 rounds for 192-bit key - lea -32(%ebp),%ebp - -2: inv_rnd1( +64(%ebp), it_tab) // 14 rounds for 256-bit key - inv_rnd2( +48(%ebp), it_tab) -3: inv_rnd1( +32(%ebp), it_tab) // 12 rounds for 192-bit key - inv_rnd2( +16(%ebp), it_tab) -4: inv_rnd1( (%ebp), it_tab) // 10 rounds for 128-bit key - inv_rnd2( -16(%ebp), it_tab) - inv_rnd1( -32(%ebp), it_tab) - inv_rnd2( -48(%ebp), it_tab) - inv_rnd1( -64(%ebp), it_tab) - inv_rnd2( -80(%ebp), it_tab) - inv_rnd1( -96(%ebp), it_tab) - inv_rnd2(-112(%ebp), it_tab) - inv_rnd1(-128(%ebp), it_tab) - inv_rnd2(-144(%ebp), il_tab) // last round uses a different table + lea 32(%ebp),%ebp + +2: inv_rnd1( -64(%ebp), crypto_it_tab) // 14 rounds for 256-bit key + inv_rnd2( -48(%ebp), crypto_it_tab) +3: inv_rnd1( -32(%ebp), crypto_it_tab) // 12 rounds for 192-bit key + inv_rnd2( -16(%ebp), crypto_it_tab) +4: inv_rnd1( (%ebp), crypto_it_tab) // 10 rounds for 128-bit key + inv_rnd2( +16(%ebp), crypto_it_tab) + inv_rnd1( +32(%ebp), crypto_it_tab) + inv_rnd2( +48(%ebp), crypto_it_tab) + inv_rnd1( +64(%ebp), crypto_it_tab) + inv_rnd2( +80(%ebp), crypto_it_tab) + inv_rnd1( +96(%ebp), crypto_it_tab) + inv_rnd2(+112(%ebp), crypto_it_tab) + inv_rnd1(+128(%ebp), crypto_it_tab) + inv_rnd2(+144(%ebp), crypto_il_tab) // last round uses a different table // move final values to the output array. CAUTION: the // order of these assigns rely on the register mappings diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S index 26b40de4d0b0..a120f526c3df 100644 --- a/arch/x86/crypto/aes-x86_64-asm_64.S +++ b/arch/x86/crypto/aes-x86_64-asm_64.S @@ -8,10 +8,10 @@ * including this sentence is retained in full. */ -.extern aes_ft_tab -.extern aes_it_tab -.extern aes_fl_tab -.extern aes_il_tab +.extern crypto_ft_tab +.extern crypto_it_tab +.extern crypto_fl_tab +.extern crypto_il_tab .text @@ -56,13 +56,13 @@ .align 8; \ FUNC: movq r1,r2; \ movq r3,r4; \ - leaq BASE+KEY+52(r8),r9; \ + leaq BASE+KEY+48+4(r8),r9; \ movq r10,r11; \ movl (r7),r5 ## E; \ movl 4(r7),r1 ## E; \ movl 8(r7),r6 ## E; \ movl 12(r7),r7 ## E; \ - movl BASE(r8),r10 ## E; \ + movl BASE+0(r8),r10 ## E; \ xorl -48(r9),r5 ## E; \ xorl -44(r9),r1 ## E; \ xorl -40(r9),r6 ## E; \ @@ -154,37 +154,37 @@ FUNC: movq r1,r2; \ /* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */ entry(aes_enc_blk,0,enc128,enc192) - encrypt_round(aes_ft_tab,-96) - encrypt_round(aes_ft_tab,-80) -enc192: encrypt_round(aes_ft_tab,-64) - encrypt_round(aes_ft_tab,-48) -enc128: encrypt_round(aes_ft_tab,-32) - encrypt_round(aes_ft_tab,-16) - encrypt_round(aes_ft_tab, 0) - encrypt_round(aes_ft_tab, 16) - encrypt_round(aes_ft_tab, 32) - encrypt_round(aes_ft_tab, 48) - encrypt_round(aes_ft_tab, 64) - encrypt_round(aes_ft_tab, 80) - encrypt_round(aes_ft_tab, 96) - encrypt_final(aes_fl_tab,112) + encrypt_round(crypto_ft_tab,-96) + encrypt_round(crypto_ft_tab,-80) +enc192: encrypt_round(crypto_ft_tab,-64) + encrypt_round(crypto_ft_tab,-48) +enc128: encrypt_round(crypto_ft_tab,-32) + encrypt_round(crypto_ft_tab,-16) + encrypt_round(crypto_ft_tab, 0) + encrypt_round(crypto_ft_tab, 16) + encrypt_round(crypto_ft_tab, 32) + encrypt_round(crypto_ft_tab, 48) + encrypt_round(crypto_ft_tab, 64) + encrypt_round(crypto_ft_tab, 80) + encrypt_round(crypto_ft_tab, 96) + encrypt_final(crypto_fl_tab,112) return /* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */ entry(aes_dec_blk,240,dec128,dec192) - decrypt_round(aes_it_tab,-96) - decrypt_round(aes_it_tab,-80) -dec192: decrypt_round(aes_it_tab,-64) - decrypt_round(aes_it_tab,-48) -dec128: decrypt_round(aes_it_tab,-32) - decrypt_round(aes_it_tab,-16) - decrypt_round(aes_it_tab, 0) - decrypt_round(aes_it_tab, 16) - decrypt_round(aes_it_tab, 32) - decrypt_round(aes_it_tab, 48) - decrypt_round(aes_it_tab, 64) - decrypt_round(aes_it_tab, 80) - decrypt_round(aes_it_tab, 96) - decrypt_final(aes_il_tab,112) + decrypt_round(crypto_it_tab,-96) + decrypt_round(crypto_it_tab,-80) +dec192: decrypt_round(crypto_it_tab,-64) + decrypt_round(crypto_it_tab,-48) +dec128: decrypt_round(crypto_it_tab,-32) + decrypt_round(crypto_it_tab,-16) + decrypt_round(crypto_it_tab, 0) + decrypt_round(crypto_it_tab, 16) + decrypt_round(crypto_it_tab, 32) + decrypt_round(crypto_it_tab, 48) + decrypt_round(crypto_it_tab, 64) + decrypt_round(crypto_it_tab, 80) + decrypt_round(crypto_it_tab, 96) + decrypt_final(crypto_il_tab,112) return diff --git a/arch/x86/crypto/aes_32.c b/arch/x86/crypto/aes_32.c deleted file mode 100644 index 49aad9397f10..000000000000 --- a/arch/x86/crypto/aes_32.c +++ /dev/null @@ -1,515 +0,0 @@ -/* - * - * Glue Code for optimized 586 assembler version of AES - * - * Copyright (c) 2002, Dr Brian Gladman <>, Worcester, UK. - * All rights reserved. - * - * LICENSE TERMS - * - * The free distribution and use of this software in both source and binary - * form is allowed (with or without changes) provided that: - * - * 1. distributions of this source code include the above copyright - * notice, this list of conditions and the following disclaimer; - * - * 2. distributions in binary form include the above copyright - * notice, this list of conditions and the following disclaimer - * in the documentation and/or other associated materials; - * - * 3. the copyright holder's name is not used to endorse products - * built using this software without specific written permission. - * - * ALTERNATIVELY, provided that this notice is retained in full, this product - * may be distributed under the terms of the GNU General Public License (GPL), - * in which case the provisions of the GPL apply INSTEAD OF those given above. - * - * DISCLAIMER - * - * This software is provided 'as is' with no explicit or implied warranties - * in respect of its properties, including, but not limited to, correctness - * and/or fitness for purpose. - * - * Copyright (c) 2003, Adam J. Richter <adam@yggdrasil.com> (conversion to - * 2.5 API). - * Copyright (c) 2003, 2004 Fruhwirth Clemens <clemens@endorphin.org> - * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> - * - */ - -#include <asm/byteorder.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/crypto.h> -#include <linux/linkage.h> - -asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); -asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); - -#define AES_MIN_KEY_SIZE 16 -#define AES_MAX_KEY_SIZE 32 -#define AES_BLOCK_SIZE 16 -#define AES_KS_LENGTH 4 * AES_BLOCK_SIZE -#define RC_LENGTH 29 - -struct aes_ctx { - u32 ekey[AES_KS_LENGTH]; - u32 rounds; - u32 dkey[AES_KS_LENGTH]; -}; - -#define WPOLY 0x011b -#define bytes2word(b0, b1, b2, b3) \ - (((u32)(b3) << 24) | ((u32)(b2) << 16) | ((u32)(b1) << 8) | (b0)) - -/* define the finite field multiplies required for Rijndael */ -#define f2(x) ((x) ? pow[log[x] + 0x19] : 0) -#define f3(x) ((x) ? pow[log[x] + 0x01] : 0) -#define f9(x) ((x) ? pow[log[x] + 0xc7] : 0) -#define fb(x) ((x) ? pow[log[x] + 0x68] : 0) -#define fd(x) ((x) ? pow[log[x] + 0xee] : 0) -#define fe(x) ((x) ? pow[log[x] + 0xdf] : 0) -#define fi(x) ((x) ? pow[255 - log[x]]: 0) - -static inline u32 upr(u32 x, int n) -{ - return (x << 8 * n) | (x >> (32 - 8 * n)); -} - -static inline u8 bval(u32 x, int n) -{ - return x >> 8 * n; -} - -/* The forward and inverse affine transformations used in the S-box */ -#define fwd_affine(x) \ - (w = (u32)x, w ^= (w<<1)^(w<<2)^(w<<3)^(w<<4), 0x63^(u8)(w^(w>>8))) - -#define inv_affine(x) \ - (w = (u32)x, w = (w<<1)^(w<<3)^(w<<6), 0x05^(u8)(w^(w>>8))) - -static u32 rcon_tab[RC_LENGTH]; - -u32 ft_tab[4][256]; -u32 fl_tab[4][256]; -static u32 im_tab[4][256]; -u32 il_tab[4][256]; -u32 it_tab[4][256]; - -static void gen_tabs(void) -{ - u32 i, w; - u8 pow[512], log[256]; - - /* - * log and power tables for GF(2^8) finite field with - * WPOLY as modular polynomial - the simplest primitive - * root is 0x03, used here to generate the tables. - */ - i = 0; w = 1; - - do { - pow[i] = (u8)w; - pow[i + 255] = (u8)w; - log[w] = (u8)i++; - w ^= (w << 1) ^ (w & 0x80 ? WPOLY : 0); - } while (w != 1); - - for(i = 0, w = 1; i < RC_LENGTH; ++i) { - rcon_tab[i] = bytes2word(w, 0, 0, 0); - w = f2(w); - } - - for(i = 0; i < 256; ++i) { - u8 b; - - b = fwd_affine(fi((u8)i)); - w = bytes2word(f2(b), b, b, f3(b)); - - /* tables for a normal encryption round */ - ft_tab[0][i] = w; - ft_tab[1][i] = upr(w, 1); - ft_tab[2][i] = upr(w, 2); - ft_tab[3][i] = upr(w, 3); - w = bytes2word(b, 0, 0, 0); - - /* - * tables for last encryption round - * (may also be used in the key schedule) - */ - fl_tab[0][i] = w; - fl_tab[1][i] = upr(w, 1); - fl_tab[2][i] = upr(w, 2); - fl_tab[3][i] = upr(w, 3); - - b = fi(inv_affine((u8)i)); - w = bytes2word(fe(b), f9(b), fd(b), fb(b)); - - /* tables for the inverse mix column operation */ - im_tab[0][b] = w; - im_tab[1][b] = upr(w, 1); - im_tab[2][b] = upr(w, 2); - im_tab[3][b] = upr(w, 3); - - /* tables for a normal decryption round */ - it_tab[0][i] = w; - it_tab[1][i] = upr(w,1); - it_tab[2][i] = upr(w,2); - it_tab[3][i] = upr(w,3); - - w = bytes2word(b, 0, 0, 0); - - /* tables for last decryption round */ - il_tab[0][i] = w; - il_tab[1][i] = upr(w,1); - il_tab[2][i] = upr(w,2); - il_tab[3][i] = upr(w,3); - } -} - -#define four_tables(x,tab,vf,rf,c) \ -( tab[0][bval(vf(x,0,c),rf(0,c))] ^ \ - tab[1][bval(vf(x,1,c),rf(1,c))] ^ \ - tab[2][bval(vf(x,2,c),rf(2,c))] ^ \ - tab[3][bval(vf(x,3,c),rf(3,c))] \ -) - -#define vf1(x,r,c) (x) -#define rf1(r,c) (r) -#define rf2(r,c) ((r-c)&3) - -#define inv_mcol(x) four_tables(x,im_tab,vf1,rf1,0) -#define ls_box(x,c) four_tables(x,fl_tab,vf1,rf2,c) - -#define ff(x) inv_mcol(x) - -#define ke4(k,i) \ -{ \ - k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i]; \ - k[4*(i)+5] = ss[1] ^= ss[0]; \ - k[4*(i)+6] = ss[2] ^= ss[1]; \ - k[4*(i)+7] = ss[3] ^= ss[2]; \ -} - -#define kel4(k,i) \ -{ \ - k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i]; \ - k[4*(i)+5] = ss[1] ^= ss[0]; \ - k[4*(i)+6] = ss[2] ^= ss[1]; k[4*(i)+7] = ss[3] ^= ss[2]; \ -} - -#define ke6(k,i) \ -{ \ - k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \ - k[6*(i)+ 7] = ss[1] ^= ss[0]; \ - k[6*(i)+ 8] = ss[2] ^= ss[1]; \ - k[6*(i)+ 9] = ss[3] ^= ss[2]; \ - k[6*(i)+10] = ss[4] ^= ss[3]; \ - k[6*(i)+11] = ss[5] ^= ss[4]; \ -} - -#define kel6(k,i) \ -{ \ - k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \ - k[6*(i)+ 7] = ss[1] ^= ss[0]; \ - k[6*(i)+ 8] = ss[2] ^= ss[1]; \ - k[6*(i)+ 9] = ss[3] ^= ss[2]; \ -} - -#define ke8(k,i) \ -{ \ - k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \ - k[8*(i)+ 9] = ss[1] ^= ss[0]; \ - k[8*(i)+10] = ss[2] ^= ss[1]; \ - k[8*(i)+11] = ss[3] ^= ss[2]; \ - k[8*(i)+12] = ss[4] ^= ls_box(ss[3],0); \ - k[8*(i)+13] = ss[5] ^= ss[4]; \ - k[8*(i)+14] = ss[6] ^= ss[5]; \ - k[8*(i)+15] = ss[7] ^= ss[6]; \ -} - -#define kel8(k,i) \ -{ \ - k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \ - k[8*(i)+ 9] = ss[1] ^= ss[0]; \ - k[8*(i)+10] = ss[2] ^= ss[1]; \ - k[8*(i)+11] = ss[3] ^= ss[2]; \ -} - -#define kdf4(k,i) \ -{ \ - ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; \ - ss[1] = ss[1] ^ ss[3]; \ - ss[2] = ss[2] ^ ss[3]; \ - ss[3] = ss[3]; \ - ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \ - ss[i % 4] ^= ss[4]; \ - ss[4] ^= k[4*(i)]; \ - k[4*(i)+4] = ff(ss[4]); \ - ss[4] ^= k[4*(i)+1]; \ - k[4*(i)+5] = ff(ss[4]); \ - ss[4] ^= k[4*(i)+2]; \ - k[4*(i)+6] = ff(ss[4]); \ - ss[4] ^= k[4*(i)+3]; \ - k[4*(i)+7] = ff(ss[4]); \ -} - -#define kd4(k,i) \ -{ \ - ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \ - ss[i % 4] ^= ss[4]; \ - ss[4] = ff(ss[4]); \ - k[4*(i)+4] = ss[4] ^= k[4*(i)]; \ - k[4*(i)+5] = ss[4] ^= k[4*(i)+1]; \ - k[4*(i)+6] = ss[4] ^= k[4*(i)+2]; \ - k[4*(i)+7] = ss[4] ^= k[4*(i)+3]; \ -} - -#define kdl4(k,i) \ -{ \ - ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \ - ss[i % 4] ^= ss[4]; \ - k[4*(i)+4] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; \ - k[4*(i)+5] = ss[1] ^ ss[3]; \ - k[4*(i)+6] = ss[0]; \ - k[4*(i)+7] = ss[1]; \ -} - -#define kdf6(k,i) \ -{ \ - ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \ - k[6*(i)+ 6] = ff(ss[0]); \ - ss[1] ^= ss[0]; \ - k[6*(i)+ 7] = ff(ss[1]); \ - ss[2] ^= ss[1]; \ - k[6*(i)+ 8] = ff(ss[2]); \ - ss[3] ^= ss[2]; \ - k[6*(i)+ 9] = ff(ss[3]); \ - ss[4] ^= ss[3]; \ - k[6*(i)+10] = ff(ss[4]); \ - ss[5] ^= ss[4]; \ - k[6*(i)+11] = ff(ss[5]); \ -} - -#define kd6(k,i) \ -{ \ - ss[6] = ls_box(ss[5],3) ^ rcon_tab[i]; \ - ss[0] ^= ss[6]; ss[6] = ff(ss[6]); \ - k[6*(i)+ 6] = ss[6] ^= k[6*(i)]; \ - ss[1] ^= ss[0]; \ - k[6*(i)+ 7] = ss[6] ^= k[6*(i)+ 1]; \ - ss[2] ^= ss[1]; \ - k[6*(i)+ 8] = ss[6] ^= k[6*(i)+ 2]; \ - ss[3] ^= ss[2]; \ - k[6*(i)+ 9] = ss[6] ^= k[6*(i)+ 3]; \ - ss[4] ^= ss[3]; \ - k[6*(i)+10] = ss[6] ^= k[6*(i)+ 4]; \ - ss[5] ^= ss[4]; \ - k[6*(i)+11] = ss[6] ^= k[6*(i)+ 5]; \ -} - -#define kdl6(k,i) \ -{ \ - ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \ - k[6*(i)+ 6] = ss[0]; \ - ss[1] ^= ss[0]; \ - k[6*(i)+ 7] = ss[1]; \ - ss[2] ^= ss[1]; \ - k[6*(i)+ 8] = ss[2]; \ - ss[3] ^= ss[2]; \ - k[6*(i)+ 9] = ss[3]; \ -} - -#define kdf8(k,i) \ -{ \ - ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \ - k[8*(i)+ 8] = ff(ss[0]); \ - ss[1] ^= ss[0]; \ - k[8*(i)+ 9] = ff(ss[1]); \ - ss[2] ^= ss[1]; \ - k[8*(i)+10] = ff(ss[2]); \ - ss[3] ^= ss[2]; \ - k[8*(i)+11] = ff(ss[3]); \ - ss[4] ^= ls_box(ss[3],0); \ - k[8*(i)+12] = ff(ss[4]); \ - ss[5] ^= ss[4]; \ - k[8*(i)+13] = ff(ss[5]); \ - ss[6] ^= ss[5]; \ - k[8*(i)+14] = ff(ss[6]); \ - ss[7] ^= ss[6]; \ - k[8*(i)+15] = ff(ss[7]); \ -} - -#define kd8(k,i) \ -{ \ - u32 __g = ls_box(ss[7],3) ^ rcon_tab[i]; \ - ss[0] ^= __g; \ - __g = ff(__g); \ - k[8*(i)+ 8] = __g ^= k[8*(i)]; \ - ss[1] ^= ss[0]; \ - k[8*(i)+ 9] = __g ^= k[8*(i)+ 1]; \ - ss[2] ^= ss[1]; \ - k[8*(i)+10] = __g ^= k[8*(i)+ 2]; \ - ss[3] ^= ss[2]; \ - k[8*(i)+11] = __g ^= k[8*(i)+ 3]; \ - __g = ls_box(ss[3],0); \ - ss[4] ^= __g; \ - __g = ff(__g); \ - k[8*(i)+12] = __g ^= k[8*(i)+ 4]; \ - ss[5] ^= ss[4]; \ - k[8*(i)+13] = __g ^= k[8*(i)+ 5]; \ - ss[6] ^= ss[5]; \ - k[8*(i)+14] = __g ^= k[8*(i)+ 6]; \ - ss[7] ^= ss[6]; \ - k[8*(i)+15] = __g ^= k[8*(i)+ 7]; \ -} - -#define kdl8(k,i) \ -{ \ - ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \ - k[8*(i)+ 8] = ss[0]; \ - ss[1] ^= ss[0]; \ - k[8*(i)+ 9] = ss[1]; \ - ss[2] ^= ss[1]; \ - k[8*(i)+10] = ss[2]; \ - ss[3] ^= ss[2]; \ - k[8*(i)+11] = ss[3]; \ -} - -static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, - unsigned int key_len) -{ - int i; - u32 ss[8]; - struct aes_ctx *ctx = crypto_tfm_ctx(tfm); - const __le32 *key = (const __le32 *)in_key; - u32 *flags = &tfm->crt_flags; - - /* encryption schedule */ - - ctx->ekey[0] = ss[0] = le32_to_cpu(key[0]); - ctx->ekey[1] = ss[1] = le32_to_cpu(key[1]); - ctx->ekey[2] = ss[2] = le32_to_cpu(key[2]); - ctx->ekey[3] = ss[3] = le32_to_cpu(key[3]); - - switch(key_len) { - case 16: - for (i = 0; i < 9; i++) - ke4(ctx->ekey, i); - kel4(ctx->ekey, 9); - ctx->rounds = 10; - break; - - case 24: - ctx->ekey[4] = ss[4] = le32_to_cpu(key[4]); - ctx->ekey[5] = ss[5] = le32_to_cpu(key[5]); - for (i = 0; i < 7; i++) - ke6(ctx->ekey, i); - kel6(ctx->ekey, 7); - ctx->rounds = 12; - break; - - case 32: - ctx->ekey[4] = ss[4] = le32_to_cpu(key[4]); - ctx->ekey[5] = ss[5] = le32_to_cpu(key[5]); - ctx->ekey[6] = ss[6] = le32_to_cpu(key[6]); - ctx->ekey[7] = ss[7] = le32_to_cpu(key[7]); - for (i = 0; i < 6; i++) - ke8(ctx->ekey, i); - kel8(ctx->ekey, 6); - ctx->rounds = 14; - break; - - default: - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } - - /* decryption schedule */ - - ctx->dkey[0] = ss[0] = le32_to_cpu(key[0]); - ctx->dkey[1] = ss[1] = le32_to_cpu(key[1]); - ctx->dkey[2] = ss[2] = le32_to_cpu(key[2]); - ctx->dkey[3] = ss[3] = le32_to_cpu(key[3]); - - switch (key_len) { - case 16: - kdf4(ctx->dkey, 0); - for (i = 1; i < 9; i++) - kd4(ctx->dkey, i); - kdl4(ctx->dkey, 9); - break; - - case 24: - ctx->dkey[4] = ff(ss[4] = le32_to_cpu(key[4])); - ctx->dkey[5] = ff(ss[5] = le32_to_cpu(key[5])); - kdf6(ctx->dkey, 0); - for (i = 1; i < 7; i++) - kd6(ctx->dkey, i); - kdl6(ctx->dkey, 7); - break; - - case 32: - ctx->dkey[4] = ff(ss[4] = le32_to_cpu(key[4])); - ctx->dkey[5] = ff(ss[5] = le32_to_cpu(key[5])); - ctx->dkey[6] = ff(ss[6] = le32_to_cpu(key[6])); - ctx->dkey[7] = ff(ss[7] = le32_to_cpu(key[7])); - kdf8(ctx->dkey, 0); - for (i = 1; i < 6; i++) - kd8(ctx->dkey, i); - kdl8(ctx->dkey, 6); - break; - } - return 0; -} - -static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - aes_enc_blk(tfm, dst, src); -} - -static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - aes_dec_blk(tfm, dst, src); -} - -static struct crypto_alg aes_alg = { - .cra_name = "aes", - .cra_driver_name = "aes-i586", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct aes_ctx), - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(aes_alg.cra_list), - .cra_u = { - .cipher = { - .cia_min_keysize = AES_MIN_KEY_SIZE, - .cia_max_keysize = AES_MAX_KEY_SIZE, - .cia_setkey = aes_set_key, - .cia_encrypt = aes_encrypt, - .cia_decrypt = aes_decrypt - } - } -}; - -static int __init aes_init(void) -{ - gen_tabs(); - return crypto_register_alg(&aes_alg); -} - -static void __exit aes_fini(void) -{ - crypto_unregister_alg(&aes_alg); -} - -module_init(aes_init); -module_exit(aes_fini); - -MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, i586 asm optimized"); -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_AUTHOR("Fruhwirth Clemens, James Morris, Brian Gladman, Adam Richter"); -MODULE_ALIAS("aes"); diff --git a/arch/x86/crypto/aes_64.c b/arch/x86/crypto/aes_64.c deleted file mode 100644 index 5cdb13ea5cc2..000000000000 --- a/arch/x86/crypto/aes_64.c +++ /dev/null @@ -1,336 +0,0 @@ -/* - * Cryptographic API. - * - * AES Cipher Algorithm. - * - * Based on Brian Gladman's code. - * - * Linux developers: - * Alexander Kjeldaas <astor@fast.no> - * Herbert Valerio Riedel <hvr@hvrlab.org> - * Kyle McMartin <kyle@debian.org> - * Adam J. Richter <adam@yggdrasil.com> (conversion to 2.5 API). - * Andreas Steinmetz <ast@domdv.de> (adapted to x86_64 assembler) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * --------------------------------------------------------------------------- - * Copyright (c) 2002, Dr Brian Gladman <brg@gladman.me.uk>, Worcester, UK. - * All rights reserved. - * - * LICENSE TERMS - * - * The free distribution and use of this software in both source and binary - * form is allowed (with or without changes) provided that: - * - * 1. distributions of this source code include the above copyright - * notice, this list of conditions and the following disclaimer; - * - * 2. distributions in binary form include the above copyright - * notice, this list of conditions and the following disclaimer - * in the documentation and/or other associated materials; - * - * 3. the copyright holder's name is not used to endorse products - * built using this software without specific written permission. - * - * ALTERNATIVELY, provided that this notice is retained in full, this product - * may be distributed under the terms of the GNU General Public License (GPL), - * in which case the provisions of the GPL apply INSTEAD OF those given above. - * - * DISCLAIMER - * - * This software is provided 'as is' with no explicit or implied warranties - * in respect of its properties, including, but not limited to, correctness - * and/or fitness for purpose. - * --------------------------------------------------------------------------- - */ - -/* Some changes from the Gladman version: - s/RIJNDAEL(e_key)/E_KEY/g - s/RIJNDAEL(d_key)/D_KEY/g -*/ - -#include <asm/byteorder.h> -#include <linux/bitops.h> -#include <linux/crypto.h> -#include <linux/errno.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/types.h> - -#define AES_MIN_KEY_SIZE 16 -#define AES_MAX_KEY_SIZE 32 - -#define AES_BLOCK_SIZE 16 - -/* - * #define byte(x, nr) ((unsigned char)((x) >> (nr*8))) - */ -static inline u8 byte(const u32 x, const unsigned n) -{ - return x >> (n << 3); -} - -struct aes_ctx -{ - u32 key_length; - u32 buf[120]; -}; - -#define E_KEY (&ctx->buf[0]) -#define D_KEY (&ctx->buf[60]) - -static u8 pow_tab[256] __initdata; -static u8 log_tab[256] __initdata; -static u8 sbx_tab[256] __initdata; -static u8 isb_tab[256] __initdata; -static u32 rco_tab[10]; -u32 aes_ft_tab[4][256]; -u32 aes_it_tab[4][256]; - -u32 aes_fl_tab[4][256]; -u32 aes_il_tab[4][256]; - -static inline u8 f_mult(u8 a, u8 b) -{ - u8 aa = log_tab[a], cc = aa + log_tab[b]; - - return pow_tab[cc + (cc < aa ? 1 : 0)]; -} - -#define ff_mult(a, b) (a && b ? f_mult(a, b) : 0) - -#define ls_box(x) \ - (aes_fl_tab[0][byte(x, 0)] ^ \ - aes_fl_tab[1][byte(x, 1)] ^ \ - aes_fl_tab[2][byte(x, 2)] ^ \ - aes_fl_tab[3][byte(x, 3)]) - -static void __init gen_tabs(void) -{ - u32 i, t; - u8 p, q; - - /* log and power tables for GF(2**8) finite field with - 0x011b as modular polynomial - the simplest primitive - root is 0x03, used here to generate the tables */ - - for (i = 0, p = 1; i < 256; ++i) { - pow_tab[i] = (u8)p; - log_tab[p] = (u8)i; - - p ^= (p << 1) ^ (p & 0x80 ? 0x01b : 0); - } - - log_tab[1] = 0; - - for (i = 0, p = 1; i < 10; ++i) { - rco_tab[i] = p; - - p = (p << 1) ^ (p & 0x80 ? 0x01b : 0); - } - - for (i = 0; i < 256; ++i) { - p = (i ? pow_tab[255 - log_tab[i]] : 0); - q = ((p >> 7) | (p << 1)) ^ ((p >> 6) | (p << 2)); - p ^= 0x63 ^ q ^ ((q >> 6) | (q << 2)); - sbx_tab[i] = p; - isb_tab[p] = (u8)i; - } - - for (i = 0; i < 256; ++i) { - p = sbx_tab[i]; - - t = p; - aes_fl_tab[0][i] = t; - aes_fl_tab[1][i] = rol32(t, 8); - aes_fl_tab[2][i] = rol32(t, 16); - aes_fl_tab[3][i] = rol32(t, 24); - - t = ((u32)ff_mult(2, p)) | - ((u32)p << 8) | - ((u32)p << 16) | ((u32)ff_mult(3, p) << 24); - - aes_ft_tab[0][i] = t; - aes_ft_tab[1][i] = rol32(t, 8); - aes_ft_tab[2][i] = rol32(t, 16); - aes_ft_tab[3][i] = rol32(t, 24); - - p = isb_tab[i]; - - t = p; - aes_il_tab[0][i] = t; - aes_il_tab[1][i] = rol32(t, 8); - aes_il_tab[2][i] = rol32(t, 16); - aes_il_tab[3][i] = rol32(t, 24); - - t = ((u32)ff_mult(14, p)) | - ((u32)ff_mult(9, p) << 8) | - ((u32)ff_mult(13, p) << 16) | - ((u32)ff_mult(11, p) << 24); - - aes_it_tab[0][i] = t; - aes_it_tab[1][i] = rol32(t, 8); - aes_it_tab[2][i] = rol32(t, 16); - aes_it_tab[3][i] = rol32(t, 24); - } -} - -#define star_x(x) (((x) & 0x7f7f7f7f) << 1) ^ ((((x) & 0x80808080) >> 7) * 0x1b) - -#define imix_col(y, x) \ - u = star_x(x); \ - v = star_x(u); \ - w = star_x(v); \ - t = w ^ (x); \ - (y) = u ^ v ^ w; \ - (y) ^= ror32(u ^ t, 8) ^ \ - ror32(v ^ t, 16) ^ \ - ror32(t, 24) - -/* initialise the key schedule from the user supplied key */ - -#define loop4(i) \ -{ \ - t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \ - t ^= E_KEY[4 * i]; E_KEY[4 * i + 4] = t; \ - t ^= E_KEY[4 * i + 1]; E_KEY[4 * i + 5] = t; \ - t ^= E_KEY[4 * i + 2]; E_KEY[4 * i + 6] = t; \ - t ^= E_KEY[4 * i + 3]; E_KEY[4 * i + 7] = t; \ -} - -#define loop6(i) \ -{ \ - t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \ - t ^= E_KEY[6 * i]; E_KEY[6 * i + 6] = t; \ - t ^= E_KEY[6 * i + 1]; E_KEY[6 * i + 7] = t; \ - t ^= E_KEY[6 * i + 2]; E_KEY[6 * i + 8] = t; \ - t ^= E_KEY[6 * i + 3]; E_KEY[6 * i + 9] = t; \ - t ^= E_KEY[6 * i + 4]; E_KEY[6 * i + 10] = t; \ - t ^= E_KEY[6 * i + 5]; E_KEY[6 * i + 11] = t; \ -} - -#define loop8(i) \ -{ \ - t = ror32(t, 8); ; t = ls_box(t) ^ rco_tab[i]; \ - t ^= E_KEY[8 * i]; E_KEY[8 * i + 8] = t; \ - t ^= E_KEY[8 * i + 1]; E_KEY[8 * i + 9] = t; \ - t ^= E_KEY[8 * i + 2]; E_KEY[8 * i + 10] = t; \ - t ^= E_KEY[8 * i + 3]; E_KEY[8 * i + 11] = t; \ - t = E_KEY[8 * i + 4] ^ ls_box(t); \ - E_KEY[8 * i + 12] = t; \ - t ^= E_KEY[8 * i + 5]; E_KEY[8 * i + 13] = t; \ - t ^= E_KEY[8 * i + 6]; E_KEY[8 * i + 14] = t; \ - t ^= E_KEY[8 * i + 7]; E_KEY[8 * i + 15] = t; \ -} - -static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, - unsigned int key_len) -{ - struct aes_ctx *ctx = crypto_tfm_ctx(tfm); - const __le32 *key = (const __le32 *)in_key; - u32 *flags = &tfm->crt_flags; - u32 i, j, t, u, v, w; - - if (key_len % 8) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } - - ctx->key_length = key_len; - - D_KEY[key_len + 24] = E_KEY[0] = le32_to_cpu(key[0]); - D_KEY[key_len + 25] = E_KEY[1] = le32_to_cpu(key[1]); - D_KEY[key_len + 26] = E_KEY[2] = le32_to_cpu(key[2]); - D_KEY[key_len + 27] = E_KEY[3] = le32_to_cpu(key[3]); - - switch (key_len) { - case 16: - t = E_KEY[3]; - for (i = 0; i < 10; ++i) - loop4(i); - break; - - case 24: - E_KEY[4] = le32_to_cpu(key[4]); - t = E_KEY[5] = le32_to_cpu(key[5]); - for (i = 0; i < 8; ++i) - loop6 (i); - break; - - case 32: - E_KEY[4] = le32_to_cpu(key[4]); - E_KEY[5] = le32_to_cpu(key[5]); - E_KEY[6] = le32_to_cpu(key[6]); - t = E_KEY[7] = le32_to_cpu(key[7]); - for (i = 0; i < 7; ++i) - loop8(i); - break; - } - - D_KEY[0] = E_KEY[key_len + 24]; - D_KEY[1] = E_KEY[key_len + 25]; - D_KEY[2] = E_KEY[key_len + 26]; - D_KEY[3] = E_KEY[key_len + 27]; - - for (i = 4; i < key_len + 24; ++i) { - j = key_len + 24 - (i & ~3) + (i & 3); - imix_col(D_KEY[j], E_KEY[i]); - } - - return 0; -} - -asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); -asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); - -static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - aes_enc_blk(tfm, dst, src); -} - -static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - aes_dec_blk(tfm, dst, src); -} - -static struct crypto_alg aes_alg = { - .cra_name = "aes", - .cra_driver_name = "aes-x86_64", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct aes_ctx), - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(aes_alg.cra_list), - .cra_u = { - .cipher = { - .cia_min_keysize = AES_MIN_KEY_SIZE, - .cia_max_keysize = AES_MAX_KEY_SIZE, - .cia_setkey = aes_set_key, - .cia_encrypt = aes_encrypt, - .cia_decrypt = aes_decrypt - } - } -}; - -static int __init aes_init(void) -{ - gen_tabs(); - return crypto_register_alg(&aes_alg); -} - -static void __exit aes_fini(void) -{ - crypto_unregister_alg(&aes_alg); -} - -module_init(aes_init); -module_exit(aes_fini); - -MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("aes"); diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c new file mode 100644 index 000000000000..71f457827116 --- /dev/null +++ b/arch/x86/crypto/aes_glue.c @@ -0,0 +1,57 @@ +/* + * Glue Code for the asm optimized version of the AES Cipher Algorithm + * + */ + +#include <crypto/aes.h> + +asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); +asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); + +static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + aes_enc_blk(tfm, dst, src); +} + +static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + aes_dec_blk(tfm, dst, src); +} + +static struct crypto_alg aes_alg = { + .cra_name = "aes", + .cra_driver_name = "aes-asm", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(aes_alg.cra_list), + .cra_u = { + .cipher = { + .cia_min_keysize = AES_MIN_KEY_SIZE, + .cia_max_keysize = AES_MAX_KEY_SIZE, + .cia_setkey = crypto_aes_set_key, + .cia_encrypt = aes_encrypt, + .cia_decrypt = aes_decrypt + } + } +}; + +static int __init aes_init(void) +{ + return crypto_register_alg(&aes_alg); +} + +static void __exit aes_fini(void) +{ + crypto_unregister_alg(&aes_alg); +} + +module_init(aes_init); +module_exit(aes_fini); + +MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, asm optimized"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("aes"); +MODULE_ALIAS("aes-asm"); diff --git a/arch/x86/crypto/salsa20-i586-asm_32.S b/arch/x86/crypto/salsa20-i586-asm_32.S new file mode 100644 index 000000000000..72eb306680b2 --- /dev/null +++ b/arch/x86/crypto/salsa20-i586-asm_32.S @@ -0,0 +1,1114 @@ +# salsa20_pm.s version 20051229 +# D. J. Bernstein +# Public domain. + +# enter ECRYPT_encrypt_bytes +.text +.p2align 5 +.globl ECRYPT_encrypt_bytes +ECRYPT_encrypt_bytes: + mov %esp,%eax + and $31,%eax + add $256,%eax + sub %eax,%esp + # eax_stack = eax + movl %eax,80(%esp) + # ebx_stack = ebx + movl %ebx,84(%esp) + # esi_stack = esi + movl %esi,88(%esp) + # edi_stack = edi + movl %edi,92(%esp) + # ebp_stack = ebp + movl %ebp,96(%esp) + # x = arg1 + movl 4(%esp,%eax),%edx + # m = arg2 + movl 8(%esp,%eax),%esi + # out = arg3 + movl 12(%esp,%eax),%edi + # bytes = arg4 + movl 16(%esp,%eax),%ebx + # bytes -= 0 + sub $0,%ebx + # goto done if unsigned<= + jbe ._done +._start: + # in0 = *(uint32 *) (x + 0) + movl 0(%edx),%eax + # in1 = *(uint32 *) (x + 4) + movl 4(%edx),%ecx + # in2 = *(uint32 *) (x + 8) + movl 8(%edx),%ebp + # j0 = in0 + movl %eax,164(%esp) + # in3 = *(uint32 *) (x + 12) + movl 12(%edx),%eax + # j1 = in1 + movl %ecx,168(%esp) + # in4 = *(uint32 *) (x + 16) + movl 16(%edx),%ecx + # j2 = in2 + movl %ebp,172(%esp) + # in5 = *(uint32 *) (x + 20) + movl 20(%edx),%ebp + # j3 = in3 + movl %eax,176(%esp) + # in6 = *(uint32 *) (x + 24) + movl 24(%edx),%eax + # j4 = in4 + movl %ecx,180(%esp) + # in7 = *(uint32 *) (x + 28) + movl 28(%edx),%ecx + # j5 = in5 + movl %ebp,184(%esp) + # in8 = *(uint32 *) (x + 32) + movl 32(%edx),%ebp + # j6 = in6 + movl %eax,188(%esp) + # in9 = *(uint32 *) (x + 36) + movl 36(%edx),%eax + # j7 = in7 + movl %ecx,192(%esp) + # in10 = *(uint32 *) (x + 40) + movl 40(%edx),%ecx + # j8 = in8 + movl %ebp,196(%esp) + # in11 = *(uint32 *) (x + 44) + movl 44(%edx),%ebp + # j9 = in9 + movl %eax,200(%esp) + # in12 = *(uint32 *) (x + 48) + movl 48(%edx),%eax + # j10 = in10 + movl %ecx,204(%esp) + # in13 = *(uint32 *) (x + 52) + movl 52(%edx),%ecx + # j11 = in11 + movl %ebp,208(%esp) + # in14 = *(uint32 *) (x + 56) + movl 56(%edx),%ebp + # j12 = in12 + movl %eax,212(%esp) + # in15 = *(uint32 *) (x + 60) + movl 60(%edx),%eax + # j13 = in13 + movl %ecx,216(%esp) + # j14 = in14 + movl %ebp,220(%esp) + # j15 = in15 + movl %eax,224(%esp) + # x_backup = x + movl %edx,64(%esp) +._bytesatleast1: + # bytes - 64 + cmp $64,%ebx + # goto nocopy if unsigned>= + jae ._nocopy + # ctarget = out + movl %edi,228(%esp) + # out = &tmp + leal 0(%esp),%edi + # i = bytes + mov %ebx,%ecx + # while (i) { *out++ = *m++; --i } + rep movsb + # out = &tmp + leal 0(%esp),%edi + # m = &tmp + leal 0(%esp),%esi +._nocopy: + # out_backup = out + movl %edi,72(%esp) + # m_backup = m + movl %esi,68(%esp) + # bytes_backup = bytes + movl %ebx,76(%esp) + # in0 = j0 + movl 164(%esp),%eax + # in1 = j1 + movl 168(%esp),%ecx + # in2 = j2 + movl 172(%esp),%edx + # in3 = j3 + movl 176(%esp),%ebx + # x0 = in0 + movl %eax,100(%esp) + # x1 = in1 + movl %ecx,104(%esp) + # x2 = in2 + movl %edx,108(%esp) + # x3 = in3 + movl %ebx,112(%esp) + # in4 = j4 + movl 180(%esp),%eax + # in5 = j5 + movl 184(%esp),%ecx + # in6 = j6 + movl 188(%esp),%edx + # in7 = j7 + movl 192(%esp),%ebx + # x4 = in4 + movl %eax,116(%esp) + # x5 = in5 + movl %ecx,120(%esp) + # x6 = in6 + movl %edx,124(%esp) + # x7 = in7 + movl %ebx,128(%esp) + # in8 = j8 + movl 196(%esp),%eax + # in9 = j9 + movl 200(%esp),%ecx + # in10 = j10 + movl 204(%esp),%edx + # in11 = j11 + movl 208(%esp),%ebx + # x8 = in8 + movl %eax,132(%esp) + # x9 = in9 + movl %ecx,136(%esp) + # x10 = in10 + movl %edx,140(%esp) + # x11 = in11 + movl %ebx,144(%esp) + # in12 = j12 + movl 212(%esp),%eax + # in13 = j13 + movl 216(%esp),%ecx + # in14 = j14 + movl 220(%esp),%edx + # in15 = j15 + movl 224(%esp),%ebx + # x12 = in12 + movl %eax,148(%esp) + # x13 = in13 + movl %ecx,152(%esp) + # x14 = in14 + movl %edx,156(%esp) + # x15 = in15 + movl %ebx,160(%esp) + # i = 20 + mov $20,%ebp + # p = x0 + movl 100(%esp),%eax + # s = x5 + movl 120(%esp),%ecx + # t = x10 + movl 140(%esp),%edx + # w = x15 + movl 160(%esp),%ebx +._mainloop: + # x0 = p + movl %eax,100(%esp) + # x10 = t + movl %edx,140(%esp) + # p += x12 + addl 148(%esp),%eax + # x5 = s + movl %ecx,120(%esp) + # t += x6 + addl 124(%esp),%edx + # x15 = w + movl %ebx,160(%esp) + # r = x1 + movl 104(%esp),%esi + # r += s + add %ecx,%esi + # v = x11 + movl 144(%esp),%edi + # v += w + add %ebx,%edi + # p <<<= 7 + rol $7,%eax + # p ^= x4 + xorl 116(%esp),%eax + # t <<<= 7 + rol $7,%edx + # t ^= x14 + xorl 156(%esp),%edx + # r <<<= 7 + rol $7,%esi + # r ^= x9 + xorl 136(%esp),%esi + # v <<<= 7 + rol $7,%edi + # v ^= x3 + xorl 112(%esp),%edi + # x4 = p + movl %eax,116(%esp) + # x14 = t + movl %edx,156(%esp) + # p += x0 + addl 100(%esp),%eax + # x9 = r + movl %esi,136(%esp) + # t += x10 + addl 140(%esp),%edx + # x3 = v + movl %edi,112(%esp) + # p <<<= 9 + rol $9,%eax + # p ^= x8 + xorl 132(%esp),%eax + # t <<<= 9 + rol $9,%edx + # t ^= x2 + xorl 108(%esp),%edx + # s += r + add %esi,%ecx + # s <<<= 9 + rol $9,%ecx + # s ^= x13 + xorl 152(%esp),%ecx + # w += v + add %edi,%ebx + # w <<<= 9 + rol $9,%ebx + # w ^= x7 + xorl 128(%esp),%ebx + # x8 = p + movl %eax,132(%esp) + # x2 = t + movl %edx,108(%esp) + # p += x4 + addl 116(%esp),%eax + # x13 = s + movl %ecx,152(%esp) + # t += x14 + addl 156(%esp),%edx + # x7 = w + movl %ebx,128(%esp) + # p <<<= 13 + rol $13,%eax + # p ^= x12 + xorl 148(%esp),%eax + # t <<<= 13 + rol $13,%edx + # t ^= x6 + xorl 124(%esp),%edx + # r += s + add %ecx,%esi + # r <<<= 13 + rol $13,%esi + # r ^= x1 + xorl 104(%esp),%esi + # v += w + add %ebx,%edi + # v <<<= 13 + rol $13,%edi + # v ^= x11 + xorl 144(%esp),%edi + # x12 = p + movl %eax,148(%esp) + # x6 = t + movl %edx,124(%esp) + # p += x8 + addl 132(%esp),%eax + # x1 = r + movl %esi,104(%esp) + # t += x2 + addl 108(%esp),%edx + # x11 = v + movl %edi,144(%esp) + # p <<<= 18 + rol $18,%eax + # p ^= x0 + xorl 100(%esp),%eax + # t <<<= 18 + rol $18,%edx + # t ^= x10 + xorl 140(%esp),%edx + # s += r + add %esi,%ecx + # s <<<= 18 + rol $18,%ecx + # s ^= x5 + xorl 120(%esp),%ecx + # w += v + add %edi,%ebx + # w <<<= 18 + rol $18,%ebx + # w ^= x15 + xorl 160(%esp),%ebx + # x0 = p + movl %eax,100(%esp) + # x10 = t + movl %edx,140(%esp) + # p += x3 + addl 112(%esp),%eax + # p <<<= 7 + rol $7,%eax + # x5 = s + movl %ecx,120(%esp) + # t += x9 + addl 136(%esp),%edx + # x15 = w + movl %ebx,160(%esp) + # r = x4 + movl 116(%esp),%esi + # r += s + add %ecx,%esi + # v = x14 + movl 156(%esp),%edi + # v += w + add %ebx,%edi + # p ^= x1 + xorl 104(%esp),%eax + # t <<<= 7 + rol $7,%edx + # t ^= x11 + xorl 144(%esp),%edx + # r <<<= 7 + rol $7,%esi + # r ^= x6 + xorl 124(%esp),%esi + # v <<<= 7 + rol $7,%edi + # v ^= x12 + xorl 148(%esp),%edi + # x1 = p + movl %eax,104(%esp) + # x11 = t + movl %edx,144(%esp) + # p += x0 + addl 100(%esp),%eax + # x6 = r + movl %esi,124(%esp) + # t += x10 + addl 140(%esp),%edx + # x12 = v + movl %edi,148(%esp) + # p <<<= 9 + rol $9,%eax + # p ^= x2 + xorl 108(%esp),%eax + # t <<<= 9 + rol $9,%edx + # t ^= x8 + xorl 132(%esp),%edx + # s += r + add %esi,%ecx + # s <<<= 9 + rol $9,%ecx + # s ^= x7 + xorl 128(%esp),%ecx + # w += v + add %edi,%ebx + # w <<<= 9 + rol $9,%ebx + # w ^= x13 + xorl 152(%esp),%ebx + # x2 = p + movl %eax,108(%esp) + # x8 = t + movl %edx,132(%esp) + # p += x1 + addl 104(%esp),%eax + # x7 = s + movl %ecx,128(%esp) + # t += x11 + addl 144(%esp),%edx + # x13 = w + movl %ebx,152(%esp) + # p <<<= 13 + rol $13,%eax + # p ^= x3 + xorl 112(%esp),%eax + # t <<<= 13 + rol $13,%edx + # t ^= x9 + xorl 136(%esp),%edx + # r += s + add %ecx,%esi + # r <<<= 13 + rol $13,%esi + # r ^= x4 + xorl 116(%esp),%esi + # v += w + add %ebx,%edi + # v <<<= 13 + rol $13,%edi + # v ^= x14 + xorl 156(%esp),%edi + # x3 = p + movl %eax,112(%esp) + # x9 = t + movl %edx,136(%esp) + # p += x2 + addl 108(%esp),%eax + # x4 = r + movl %esi,116(%esp) + # t += x8 + addl 132(%esp),%edx + # x14 = v + movl %edi,156(%esp) + # p <<<= 18 + rol $18,%eax + # p ^= x0 + xorl 100(%esp),%eax + # t <<<= 18 + rol $18,%edx + # t ^= x10 + xorl 140(%esp),%edx + # s += r + add %esi,%ecx + # s <<<= 18 + rol $18,%ecx + # s ^= x5 + xorl 120(%esp),%ecx + # w += v + add %edi,%ebx + # w <<<= 18 + rol $18,%ebx + # w ^= x15 + xorl 160(%esp),%ebx + # x0 = p + movl %eax,100(%esp) + # x10 = t + movl %edx,140(%esp) + # p += x12 + addl 148(%esp),%eax + # x5 = s + movl %ecx,120(%esp) + # t += x6 + addl 124(%esp),%edx + # x15 = w + movl %ebx,160(%esp) + # r = x1 + movl 104(%esp),%esi + # r += s + add %ecx,%esi + # v = x11 + movl 144(%esp),%edi + # v += w + add %ebx,%edi + # p <<<= 7 + rol $7,%eax + # p ^= x4 + xorl 116(%esp),%eax + # t <<<= 7 + rol $7,%edx + # t ^= x14 + xorl 156(%esp),%edx + # r <<<= 7 + rol $7,%esi + # r ^= x9 + xorl 136(%esp),%esi + # v <<<= 7 + rol $7,%edi + # v ^= x3 + xorl 112(%esp),%edi + # x4 = p + movl %eax,116(%esp) + # x14 = t + movl %edx,156(%esp) + # p += x0 + addl 100(%esp),%eax + # x9 = r + movl %esi,136(%esp) + # t += x10 + addl 140(%esp),%edx + # x3 = v + movl %edi,112(%esp) + # p <<<= 9 + rol $9,%eax + # p ^= x8 + xorl 132(%esp),%eax + # t <<<= 9 + rol $9,%edx + # t ^= x2 + xorl 108(%esp),%edx + # s += r + add %esi,%ecx + # s <<<= 9 + rol $9,%ecx + # s ^= x13 + xorl 152(%esp),%ecx + # w += v + add %edi,%ebx + # w <<<= 9 + rol $9,%ebx + # w ^= x7 + xorl 128(%esp),%ebx + # x8 = p + movl %eax,132(%esp) + # x2 = t + movl %edx,108(%esp) + # p += x4 + addl 116(%esp),%eax + # x13 = s + movl %ecx,152(%esp) + # t += x14 + addl 156(%esp),%edx + # x7 = w + movl %ebx,128(%esp) + # p <<<= 13 + rol $13,%eax + # p ^= x12 + xorl 148(%esp),%eax + # t <<<= 13 + rol $13,%edx + # t ^= x6 + xorl 124(%esp),%edx + # r += s + add %ecx,%esi + # r <<<= 13 + rol $13,%esi + # r ^= x1 + xorl 104(%esp),%esi + # v += w + add %ebx,%edi + # v <<<= 13 + rol $13,%edi + # v ^= x11 + xorl 144(%esp),%edi + # x12 = p + movl %eax,148(%esp) + # x6 = t + movl %edx,124(%esp) + # p += x8 + addl 132(%esp),%eax + # x1 = r + movl %esi,104(%esp) + # t += x2 + addl 108(%esp),%edx + # x11 = v + movl %edi,144(%esp) + # p <<<= 18 + rol $18,%eax + # p ^= x0 + xorl 100(%esp),%eax + # t <<<= 18 + rol $18,%edx + # t ^= x10 + xorl 140(%esp),%edx + # s += r + add %esi,%ecx + # s <<<= 18 + rol $18,%ecx + # s ^= x5 + xorl 120(%esp),%ecx + # w += v + add %edi,%ebx + # w <<<= 18 + rol $18,%ebx + # w ^= x15 + xorl 160(%esp),%ebx + # x0 = p + movl %eax,100(%esp) + # x10 = t + movl %edx,140(%esp) + # p += x3 + addl 112(%esp),%eax + # p <<<= 7 + rol $7,%eax + # x5 = s + movl %ecx,120(%esp) + # t += x9 + addl 136(%esp),%edx + # x15 = w + movl %ebx,160(%esp) + # r = x4 + movl 116(%esp),%esi + # r += s + add %ecx,%esi + # v = x14 + movl 156(%esp),%edi + # v += w + add %ebx,%edi + # p ^= x1 + xorl 104(%esp),%eax + # t <<<= 7 + rol $7,%edx + # t ^= x11 + xorl 144(%esp),%edx + # r <<<= 7 + rol $7,%esi + # r ^= x6 + xorl 124(%esp),%esi + # v <<<= 7 + rol $7,%edi + # v ^= x12 + xorl 148(%esp),%edi + # x1 = p + movl %eax,104(%esp) + # x11 = t + movl %edx,144(%esp) + # p += x0 + addl 100(%esp),%eax + # x6 = r + movl %esi,124(%esp) + # t += x10 + addl 140(%esp),%edx + # x12 = v + movl %edi,148(%esp) + # p <<<= 9 + rol $9,%eax + # p ^= x2 + xorl 108(%esp),%eax + # t <<<= 9 + rol $9,%edx + # t ^= x8 + xorl 132(%esp),%edx + # s += r + add %esi,%ecx + # s <<<= 9 + rol $9,%ecx + # s ^= x7 + xorl 128(%esp),%ecx + # w += v + add %edi,%ebx + # w <<<= 9 + rol $9,%ebx + # w ^= x13 + xorl 152(%esp),%ebx + # x2 = p + movl %eax,108(%esp) + # x8 = t + movl %edx,132(%esp) + # p += x1 + addl 104(%esp),%eax + # x7 = s + movl %ecx,128(%esp) + # t += x11 + addl 144(%esp),%edx + # x13 = w + movl %ebx,152(%esp) + # p <<<= 13 + rol $13,%eax + # p ^= x3 + xorl 112(%esp),%eax + # t <<<= 13 + rol $13,%edx + # t ^= x9 + xorl 136(%esp),%edx + # r += s + add %ecx,%esi + # r <<<= 13 + rol $13,%esi + # r ^= x4 + xorl 116(%esp),%esi + # v += w + add %ebx,%edi + # v <<<= 13 + rol $13,%edi + # v ^= x14 + xorl 156(%esp),%edi + # x3 = p + movl %eax,112(%esp) + # x9 = t + movl %edx,136(%esp) + # p += x2 + addl 108(%esp),%eax + # x4 = r + movl %esi,116(%esp) + # t += x8 + addl 132(%esp),%edx + # x14 = v + movl %edi,156(%esp) + # p <<<= 18 + rol $18,%eax + # p ^= x0 + xorl 100(%esp),%eax + # t <<<= 18 + rol $18,%edx + # t ^= x10 + xorl 140(%esp),%edx + # s += r + add %esi,%ecx + # s <<<= 18 + rol $18,%ecx + # s ^= x5 + xorl 120(%esp),%ecx + # w += v + add %edi,%ebx + # w <<<= 18 + rol $18,%ebx + # w ^= x15 + xorl 160(%esp),%ebx + # i -= 4 + sub $4,%ebp + # goto mainloop if unsigned > + ja ._mainloop + # x0 = p + movl %eax,100(%esp) + # x5 = s + movl %ecx,120(%esp) + # x10 = t + movl %edx,140(%esp) + # x15 = w + movl %ebx,160(%esp) + # out = out_backup + movl 72(%esp),%edi + # m = m_backup + movl 68(%esp),%esi + # in0 = x0 + movl 100(%esp),%eax + # in1 = x1 + movl 104(%esp),%ecx + # in0 += j0 + addl 164(%esp),%eax + # in1 += j1 + addl 168(%esp),%ecx + # in0 ^= *(uint32 *) (m + 0) + xorl 0(%esi),%eax + # in1 ^= *(uint32 *) (m + 4) + xorl 4(%esi),%ecx + # *(uint32 *) (out + 0) = in0 + movl %eax,0(%edi) + # *(uint32 *) (out + 4) = in1 + movl %ecx,4(%edi) + # in2 = x2 + movl 108(%esp),%eax + # in3 = x3 + movl 112(%esp),%ecx + # in2 += j2 + addl 172(%esp),%eax + # in3 += j3 + addl 176(%esp),%ecx + # in2 ^= *(uint32 *) (m + 8) + xorl 8(%esi),%eax + # in3 ^= *(uint32 *) (m + 12) + xorl 12(%esi),%ecx + # *(uint32 *) (out + 8) = in2 + movl %eax,8(%edi) + # *(uint32 *) (out + 12) = in3 + movl %ecx,12(%edi) + # in4 = x4 + movl 116(%esp),%eax + # in5 = x5 + movl 120(%esp),%ecx + # in4 += j4 + addl 180(%esp),%eax + # in5 += j5 + addl 184(%esp),%ecx + # in4 ^= *(uint32 *) (m + 16) + xorl 16(%esi),%eax + # in5 ^= *(uint32 *) (m + 20) + xorl 20(%esi),%ecx + # *(uint32 *) (out + 16) = in4 + movl %eax,16(%edi) + # *(uint32 *) (out + 20) = in5 + movl %ecx,20(%edi) + # in6 = x6 + movl 124(%esp),%eax + # in7 = x7 + movl 128(%esp),%ecx + # in6 += j6 + addl 188(%esp),%eax + # in7 += j7 + addl 192(%esp),%ecx + # in6 ^= *(uint32 *) (m + 24) + xorl 24(%esi),%eax + # in7 ^= *(uint32 *) (m + 28) + xorl 28(%esi),%ecx + # *(uint32 *) (out + 24) = in6 + movl %eax,24(%edi) + # *(uint32 *) (out + 28) = in7 + movl %ecx,28(%edi) + # in8 = x8 + movl 132(%esp),%eax + # in9 = x9 + movl 136(%esp),%ecx + # in8 += j8 + addl 196(%esp),%eax + # in9 += j9 + addl 200(%esp),%ecx + # in8 ^= *(uint32 *) (m + 32) + xorl 32(%esi),%eax + # in9 ^= *(uint32 *) (m + 36) + xorl 36(%esi),%ecx + # *(uint32 *) (out + 32) = in8 + movl %eax,32(%edi) + # *(uint32 *) (out + 36) = in9 + movl %ecx,36(%edi) + # in10 = x10 + movl 140(%esp),%eax + # in11 = x11 + movl 144(%esp),%ecx + # in10 += j10 + addl 204(%esp),%eax + # in11 += j11 + addl 208(%esp),%ecx + # in10 ^= *(uint32 *) (m + 40) + xorl 40(%esi),%eax + # in11 ^= *(uint32 *) (m + 44) + xorl 44(%esi),%ecx + # *(uint32 *) (out + 40) = in10 + movl %eax,40(%edi) + # *(uint32 *) (out + 44) = in11 + movl %ecx,44(%edi) + # in12 = x12 + movl 148(%esp),%eax + # in13 = x13 + movl 152(%esp),%ecx + # in12 += j12 + addl 212(%esp),%eax + # in13 += j13 + addl 216(%esp),%ecx + # in12 ^= *(uint32 *) (m + 48) + xorl 48(%esi),%eax + # in13 ^= *(uint32 *) (m + 52) + xorl 52(%esi),%ecx + # *(uint32 *) (out + 48) = in12 + movl %eax,48(%edi) + # *(uint32 *) (out + 52) = in13 + movl %ecx,52(%edi) + # in14 = x14 + movl 156(%esp),%eax + # in15 = x15 + movl 160(%esp),%ecx + # in14 += j14 + addl 220(%esp),%eax + # in15 += j15 + addl 224(%esp),%ecx + # in14 ^= *(uint32 *) (m + 56) + xorl 56(%esi),%eax + # in15 ^= *(uint32 *) (m + 60) + xorl 60(%esi),%ecx + # *(uint32 *) (out + 56) = in14 + movl %eax,56(%edi) + # *(uint32 *) (out + 60) = in15 + movl %ecx,60(%edi) + # bytes = bytes_backup + movl 76(%esp),%ebx + # in8 = j8 + movl 196(%esp),%eax + # in9 = j9 + movl 200(%esp),%ecx + # in8 += 1 + add $1,%eax + # in9 += 0 + carry + adc $0,%ecx + # j8 = in8 + movl %eax,196(%esp) + # j9 = in9 + movl %ecx,200(%esp) + # bytes - 64 + cmp $64,%ebx + # goto bytesatleast65 if unsigned> + ja ._bytesatleast65 + # goto bytesatleast64 if unsigned>= + jae ._bytesatleast64 + # m = out + mov %edi,%esi + # out = ctarget + movl 228(%esp),%edi + # i = bytes + mov %ebx,%ecx + # while (i) { *out++ = *m++; --i } + rep movsb +._bytesatleast64: + # x = x_backup + movl 64(%esp),%eax + # in8 = j8 + movl 196(%esp),%ecx + # in9 = j9 + movl 200(%esp),%edx + # *(uint32 *) (x + 32) = in8 + movl %ecx,32(%eax) + # *(uint32 *) (x + 36) = in9 + movl %edx,36(%eax) +._done: + # eax = eax_stack + movl 80(%esp),%eax + # ebx = ebx_stack + movl 84(%esp),%ebx + # esi = esi_stack + movl 88(%esp),%esi + # edi = edi_stack + movl 92(%esp),%edi + # ebp = ebp_stack + movl 96(%esp),%ebp + # leave + add %eax,%esp + ret +._bytesatleast65: + # bytes -= 64 + sub $64,%ebx + # out += 64 + add $64,%edi + # m += 64 + add $64,%esi + # goto bytesatleast1 + jmp ._bytesatleast1 +# enter ECRYPT_keysetup +.text +.p2align 5 +.globl ECRYPT_keysetup +ECRYPT_keysetup: + mov %esp,%eax + and $31,%eax + add $256,%eax + sub %eax,%esp + # eax_stack = eax + movl %eax,64(%esp) + # ebx_stack = ebx + movl %ebx,68(%esp) + # esi_stack = esi + movl %esi,72(%esp) + # edi_stack = edi + movl %edi,76(%esp) + # ebp_stack = ebp + movl %ebp,80(%esp) + # k = arg2 + movl 8(%esp,%eax),%ecx + # kbits = arg3 + movl 12(%esp,%eax),%edx + # x = arg1 + movl 4(%esp,%eax),%eax + # in1 = *(uint32 *) (k + 0) + movl 0(%ecx),%ebx + # in2 = *(uint32 *) (k + 4) + movl 4(%ecx),%esi + # in3 = *(uint32 *) (k + 8) + movl 8(%ecx),%edi + # in4 = *(uint32 *) (k + 12) + movl 12(%ecx),%ebp + # *(uint32 *) (x + 4) = in1 + movl %ebx,4(%eax) + # *(uint32 *) (x + 8) = in2 + movl %esi,8(%eax) + # *(uint32 *) (x + 12) = in3 + movl %edi,12(%eax) + # *(uint32 *) (x + 16) = in4 + movl %ebp,16(%eax) + # kbits - 256 + cmp $256,%edx + # goto kbits128 if unsigned< + jb ._kbits128 +._kbits256: + # in11 = *(uint32 *) (k + 16) + movl 16(%ecx),%edx + # in12 = *(uint32 *) (k + 20) + movl 20(%ecx),%ebx + # in13 = *(uint32 *) (k + 24) + movl 24(%ecx),%esi + # in14 = *(uint32 *) (k + 28) + movl 28(%ecx),%ecx + # *(uint32 *) (x + 44) = in11 + movl %edx,44(%eax) + # *(uint32 *) (x + 48) = in12 + movl %ebx,48(%eax) + # *(uint32 *) (x + 52) = in13 + movl %esi,52(%eax) + # *(uint32 *) (x + 56) = in14 + movl %ecx,56(%eax) + # in0 = 1634760805 + mov $1634760805,%ecx + # in5 = 857760878 + mov $857760878,%edx + # in10 = 2036477234 + mov $2036477234,%ebx + # in15 = 1797285236 + mov $1797285236,%esi + # *(uint32 *) (x + 0) = in0 + movl %ecx,0(%eax) + # *(uint32 *) (x + 20) = in5 + movl %edx,20(%eax) + # *(uint32 *) (x + 40) = in10 + movl %ebx,40(%eax) + # *(uint32 *) (x + 60) = in15 + movl %esi,60(%eax) + # goto keysetupdone + jmp ._keysetupdone +._kbits128: + # in11 = *(uint32 *) (k + 0) + movl 0(%ecx),%edx + # in12 = *(uint32 *) (k + 4) + movl 4(%ecx),%ebx + # in13 = *(uint32 *) (k + 8) + movl 8(%ecx),%esi + # in14 = *(uint32 *) (k + 12) + movl 12(%ecx),%ecx + # *(uint32 *) (x + 44) = in11 + movl %edx,44(%eax) + # *(uint32 *) (x + 48) = in12 + movl %ebx,48(%eax) + # *(uint32 *) (x + 52) = in13 + movl %esi,52(%eax) + # *(uint32 *) (x + 56) = in14 + movl %ecx,56(%eax) + # in0 = 1634760805 + mov $1634760805,%ecx + # in5 = 824206446 + mov $824206446,%edx + # in10 = 2036477238 + mov $2036477238,%ebx + # in15 = 1797285236 + mov $1797285236,%esi + # *(uint32 *) (x + 0) = in0 + movl %ecx,0(%eax) + # *(uint32 *) (x + 20) = in5 + movl %edx,20(%eax) + # *(uint32 *) (x + 40) = in10 + movl %ebx,40(%eax) + # *(uint32 *) (x + 60) = in15 + movl %esi,60(%eax) +._keysetupdone: + # eax = eax_stack + movl 64(%esp),%eax + # ebx = ebx_stack + movl 68(%esp),%ebx + # esi = esi_stack + movl 72(%esp),%esi + # edi = edi_stack + movl 76(%esp),%edi + # ebp = ebp_stack + movl 80(%esp),%ebp + # leave + add %eax,%esp + ret +# enter ECRYPT_ivsetup +.text +.p2align 5 +.globl ECRYPT_ivsetup +ECRYPT_ivsetup: + mov %esp,%eax + and $31,%eax + add $256,%eax + sub %eax,%esp + # eax_stack = eax + movl %eax,64(%esp) + # ebx_stack = ebx + movl %ebx,68(%esp) + # esi_stack = esi + movl %esi,72(%esp) + # edi_stack = edi + movl %edi,76(%esp) + # ebp_stack = ebp + movl %ebp,80(%esp) + # iv = arg2 + movl 8(%esp,%eax),%ecx + # x = arg1 + movl 4(%esp,%eax),%eax + # in6 = *(uint32 *) (iv + 0) + movl 0(%ecx),%edx + # in7 = *(uint32 *) (iv + 4) + movl 4(%ecx),%ecx + # in8 = 0 + mov $0,%ebx + # in9 = 0 + mov $0,%esi + # *(uint32 *) (x + 24) = in6 + movl %edx,24(%eax) + # *(uint32 *) (x + 28) = in7 + movl %ecx,28(%eax) + # *(uint32 *) (x + 32) = in8 + movl %ebx,32(%eax) + # *(uint32 *) (x + 36) = in9 + movl %esi,36(%eax) + # eax = eax_stack + movl 64(%esp),%eax + # ebx = ebx_stack + movl 68(%esp),%ebx + # esi = esi_stack + movl 72(%esp),%esi + # edi = edi_stack + movl 76(%esp),%edi + # ebp = ebp_stack + movl 80(%esp),%ebp + # leave + add %eax,%esp + ret diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S new file mode 100644 index 000000000000..6214a9b09706 --- /dev/null +++ b/arch/x86/crypto/salsa20-x86_64-asm_64.S @@ -0,0 +1,920 @@ +# enter ECRYPT_encrypt_bytes +.text +.p2align 5 +.globl ECRYPT_encrypt_bytes +ECRYPT_encrypt_bytes: + mov %rsp,%r11 + and $31,%r11 + add $256,%r11 + sub %r11,%rsp + # x = arg1 + mov %rdi,%r8 + # m = arg2 + mov %rsi,%rsi + # out = arg3 + mov %rdx,%rdi + # bytes = arg4 + mov %rcx,%rdx + # unsigned>? bytes - 0 + cmp $0,%rdx + # comment:fp stack unchanged by jump + # goto done if !unsigned> + jbe ._done + # comment:fp stack unchanged by fallthrough +# start: +._start: + # r11_stack = r11 + movq %r11,0(%rsp) + # r12_stack = r12 + movq %r12,8(%rsp) + # r13_stack = r13 + movq %r13,16(%rsp) + # r14_stack = r14 + movq %r14,24(%rsp) + # r15_stack = r15 + movq %r15,32(%rsp) + # rbx_stack = rbx + movq %rbx,40(%rsp) + # rbp_stack = rbp + movq %rbp,48(%rsp) + # in0 = *(uint64 *) (x + 0) + movq 0(%r8),%rcx + # in2 = *(uint64 *) (x + 8) + movq 8(%r8),%r9 + # in4 = *(uint64 *) (x + 16) + movq 16(%r8),%rax + # in6 = *(uint64 *) (x + 24) + movq 24(%r8),%r10 + # in8 = *(uint64 *) (x + 32) + movq 32(%r8),%r11 + # in10 = *(uint64 *) (x + 40) + movq 40(%r8),%r12 + # in12 = *(uint64 *) (x + 48) + movq 48(%r8),%r13 + # in14 = *(uint64 *) (x + 56) + movq 56(%r8),%r14 + # j0 = in0 + movq %rcx,56(%rsp) + # j2 = in2 + movq %r9,64(%rsp) + # j4 = in4 + movq %rax,72(%rsp) + # j6 = in6 + movq %r10,80(%rsp) + # j8 = in8 + movq %r11,88(%rsp) + # j10 = in10 + movq %r12,96(%rsp) + # j12 = in12 + movq %r13,104(%rsp) + # j14 = in14 + movq %r14,112(%rsp) + # x_backup = x + movq %r8,120(%rsp) +# bytesatleast1: +._bytesatleast1: + # unsigned<? bytes - 64 + cmp $64,%rdx + # comment:fp stack unchanged by jump + # goto nocopy if !unsigned< + jae ._nocopy + # ctarget = out + movq %rdi,128(%rsp) + # out = &tmp + leaq 192(%rsp),%rdi + # i = bytes + mov %rdx,%rcx + # while (i) { *out++ = *m++; --i } + rep movsb + # out = &tmp + leaq 192(%rsp),%rdi + # m = &tmp + leaq 192(%rsp),%rsi + # comment:fp stack unchanged by fallthrough +# nocopy: +._nocopy: + # out_backup = out + movq %rdi,136(%rsp) + # m_backup = m + movq %rsi,144(%rsp) + # bytes_backup = bytes + movq %rdx,152(%rsp) + # x1 = j0 + movq 56(%rsp),%rdi + # x0 = x1 + mov %rdi,%rdx + # (uint64) x1 >>= 32 + shr $32,%rdi + # x3 = j2 + movq 64(%rsp),%rsi + # x2 = x3 + mov %rsi,%rcx + # (uint64) x3 >>= 32 + shr $32,%rsi + # x5 = j4 + movq 72(%rsp),%r8 + # x4 = x5 + mov %r8,%r9 + # (uint64) x5 >>= 32 + shr $32,%r8 + # x5_stack = x5 + movq %r8,160(%rsp) + # x7 = j6 + movq 80(%rsp),%r8 + # x6 = x7 + mov %r8,%rax + # (uint64) x7 >>= 32 + shr $32,%r8 + # x9 = j8 + movq 88(%rsp),%r10 + # x8 = x9 + mov %r10,%r11 + # (uint64) x9 >>= 32 + shr $32,%r10 + # x11 = j10 + movq 96(%rsp),%r12 + # x10 = x11 + mov %r12,%r13 + # x10_stack = x10 + movq %r13,168(%rsp) + # (uint64) x11 >>= 32 + shr $32,%r12 + # x13 = j12 + movq 104(%rsp),%r13 + # x12 = x13 + mov %r13,%r14 + # (uint64) x13 >>= 32 + shr $32,%r13 + # x15 = j14 + movq 112(%rsp),%r15 + # x14 = x15 + mov %r15,%rbx + # (uint64) x15 >>= 32 + shr $32,%r15 + # x15_stack = x15 + movq %r15,176(%rsp) + # i = 20 + mov $20,%r15 +# mainloop: +._mainloop: + # i_backup = i + movq %r15,184(%rsp) + # x5 = x5_stack + movq 160(%rsp),%r15 + # a = x12 + x0 + lea (%r14,%rdx),%rbp + # (uint32) a <<<= 7 + rol $7,%ebp + # x4 ^= a + xor %rbp,%r9 + # b = x1 + x5 + lea (%rdi,%r15),%rbp + # (uint32) b <<<= 7 + rol $7,%ebp + # x9 ^= b + xor %rbp,%r10 + # a = x0 + x4 + lea (%rdx,%r9),%rbp + # (uint32) a <<<= 9 + rol $9,%ebp + # x8 ^= a + xor %rbp,%r11 + # b = x5 + x9 + lea (%r15,%r10),%rbp + # (uint32) b <<<= 9 + rol $9,%ebp + # x13 ^= b + xor %rbp,%r13 + # a = x4 + x8 + lea (%r9,%r11),%rbp + # (uint32) a <<<= 13 + rol $13,%ebp + # x12 ^= a + xor %rbp,%r14 + # b = x9 + x13 + lea (%r10,%r13),%rbp + # (uint32) b <<<= 13 + rol $13,%ebp + # x1 ^= b + xor %rbp,%rdi + # a = x8 + x12 + lea (%r11,%r14),%rbp + # (uint32) a <<<= 18 + rol $18,%ebp + # x0 ^= a + xor %rbp,%rdx + # b = x13 + x1 + lea (%r13,%rdi),%rbp + # (uint32) b <<<= 18 + rol $18,%ebp + # x5 ^= b + xor %rbp,%r15 + # x10 = x10_stack + movq 168(%rsp),%rbp + # x5_stack = x5 + movq %r15,160(%rsp) + # c = x6 + x10 + lea (%rax,%rbp),%r15 + # (uint32) c <<<= 7 + rol $7,%r15d + # x14 ^= c + xor %r15,%rbx + # c = x10 + x14 + lea (%rbp,%rbx),%r15 + # (uint32) c <<<= 9 + rol $9,%r15d + # x2 ^= c + xor %r15,%rcx + # c = x14 + x2 + lea (%rbx,%rcx),%r15 + # (uint32) c <<<= 13 + rol $13,%r15d + # x6 ^= c + xor %r15,%rax + # c = x2 + x6 + lea (%rcx,%rax),%r15 + # (uint32) c <<<= 18 + rol $18,%r15d + # x10 ^= c + xor %r15,%rbp + # x15 = x15_stack + movq 176(%rsp),%r15 + # x10_stack = x10 + movq %rbp,168(%rsp) + # d = x11 + x15 + lea (%r12,%r15),%rbp + # (uint32) d <<<= 7 + rol $7,%ebp + # x3 ^= d + xor %rbp,%rsi + # d = x15 + x3 + lea (%r15,%rsi),%rbp + # (uint32) d <<<= 9 + rol $9,%ebp + # x7 ^= d + xor %rbp,%r8 + # d = x3 + x7 + lea (%rsi,%r8),%rbp + # (uint32) d <<<= 13 + rol $13,%ebp + # x11 ^= d + xor %rbp,%r12 + # d = x7 + x11 + lea (%r8,%r12),%rbp + # (uint32) d <<<= 18 + rol $18,%ebp + # x15 ^= d + xor %rbp,%r15 + # x15_stack = x15 + movq %r15,176(%rsp) + # x5 = x5_stack + movq 160(%rsp),%r15 + # a = x3 + x0 + lea (%rsi,%rdx),%rbp + # (uint32) a <<<= 7 + rol $7,%ebp + # x1 ^= a + xor %rbp,%rdi + # b = x4 + x5 + lea (%r9,%r15),%rbp + # (uint32) b <<<= 7 + rol $7,%ebp + # x6 ^= b + xor %rbp,%rax + # a = x0 + x1 + lea (%rdx,%rdi),%rbp + # (uint32) a <<<= 9 + rol $9,%ebp + # x2 ^= a + xor %rbp,%rcx + # b = x5 + x6 + lea (%r15,%rax),%rbp + # (uint32) b <<<= 9 + rol $9,%ebp + # x7 ^= b + xor %rbp,%r8 + # a = x1 + x2 + lea (%rdi,%rcx),%rbp + # (uint32) a <<<= 13 + rol $13,%ebp + # x3 ^= a + xor %rbp,%rsi + # b = x6 + x7 + lea (%rax,%r8),%rbp + # (uint32) b <<<= 13 + rol $13,%ebp + # x4 ^= b + xor %rbp,%r9 + # a = x2 + x3 + lea (%rcx,%rsi),%rbp + # (uint32) a <<<= 18 + rol $18,%ebp + # x0 ^= a + xor %rbp,%rdx + # b = x7 + x4 + lea (%r8,%r9),%rbp + # (uint32) b <<<= 18 + rol $18,%ebp + # x5 ^= b + xor %rbp,%r15 + # x10 = x10_stack + movq 168(%rsp),%rbp + # x5_stack = x5 + movq %r15,160(%rsp) + # c = x9 + x10 + lea (%r10,%rbp),%r15 + # (uint32) c <<<= 7 + rol $7,%r15d + # x11 ^= c + xor %r15,%r12 + # c = x10 + x11 + lea (%rbp,%r12),%r15 + # (uint32) c <<<= 9 + rol $9,%r15d + # x8 ^= c + xor %r15,%r11 + # c = x11 + x8 + lea (%r12,%r11),%r15 + # (uint32) c <<<= 13 + rol $13,%r15d + # x9 ^= c + xor %r15,%r10 + # c = x8 + x9 + lea (%r11,%r10),%r15 + # (uint32) c <<<= 18 + rol $18,%r15d + # x10 ^= c + xor %r15,%rbp + # x15 = x15_stack + movq 176(%rsp),%r15 + # x10_stack = x10 + movq %rbp,168(%rsp) + # d = x14 + x15 + lea (%rbx,%r15),%rbp + # (uint32) d <<<= 7 + rol $7,%ebp + # x12 ^= d + xor %rbp,%r14 + # d = x15 + x12 + lea (%r15,%r14),%rbp + # (uint32) d <<<= 9 + rol $9,%ebp + # x13 ^= d + xor %rbp,%r13 + # d = x12 + x13 + lea (%r14,%r13),%rbp + # (uint32) d <<<= 13 + rol $13,%ebp + # x14 ^= d + xor %rbp,%rbx + # d = x13 + x14 + lea (%r13,%rbx),%rbp + # (uint32) d <<<= 18 + rol $18,%ebp + # x15 ^= d + xor %rbp,%r15 + # x15_stack = x15 + movq %r15,176(%rsp) + # x5 = x5_stack + movq 160(%rsp),%r15 + # a = x12 + x0 + lea (%r14,%rdx),%rbp + # (uint32) a <<<= 7 + rol $7,%ebp + # x4 ^= a + xor %rbp,%r9 + # b = x1 + x5 + lea (%rdi,%r15),%rbp + # (uint32) b <<<= 7 + rol $7,%ebp + # x9 ^= b + xor %rbp,%r10 + # a = x0 + x4 + lea (%rdx,%r9),%rbp + # (uint32) a <<<= 9 + rol $9,%ebp + # x8 ^= a + xor %rbp,%r11 + # b = x5 + x9 + lea (%r15,%r10),%rbp + # (uint32) b <<<= 9 + rol $9,%ebp + # x13 ^= b + xor %rbp,%r13 + # a = x4 + x8 + lea (%r9,%r11),%rbp + # (uint32) a <<<= 13 + rol $13,%ebp + # x12 ^= a + xor %rbp,%r14 + # b = x9 + x13 + lea (%r10,%r13),%rbp + # (uint32) b <<<= 13 + rol $13,%ebp + # x1 ^= b + xor %rbp,%rdi + # a = x8 + x12 + lea (%r11,%r14),%rbp + # (uint32) a <<<= 18 + rol $18,%ebp + # x0 ^= a + xor %rbp,%rdx + # b = x13 + x1 + lea (%r13,%rdi),%rbp + # (uint32) b <<<= 18 + rol $18,%ebp + # x5 ^= b + xor %rbp,%r15 + # x10 = x10_stack + movq 168(%rsp),%rbp + # x5_stack = x5 + movq %r15,160(%rsp) + # c = x6 + x10 + lea (%rax,%rbp),%r15 + # (uint32) c <<<= 7 + rol $7,%r15d + # x14 ^= c + xor %r15,%rbx + # c = x10 + x14 + lea (%rbp,%rbx),%r15 + # (uint32) c <<<= 9 + rol $9,%r15d + # x2 ^= c + xor %r15,%rcx + # c = x14 + x2 + lea (%rbx,%rcx),%r15 + # (uint32) c <<<= 13 + rol $13,%r15d + # x6 ^= c + xor %r15,%rax + # c = x2 + x6 + lea (%rcx,%rax),%r15 + # (uint32) c <<<= 18 + rol $18,%r15d + # x10 ^= c + xor %r15,%rbp + # x15 = x15_stack + movq 176(%rsp),%r15 + # x10_stack = x10 + movq %rbp,168(%rsp) + # d = x11 + x15 + lea (%r12,%r15),%rbp + # (uint32) d <<<= 7 + rol $7,%ebp + # x3 ^= d + xor %rbp,%rsi + # d = x15 + x3 + lea (%r15,%rsi),%rbp + # (uint32) d <<<= 9 + rol $9,%ebp + # x7 ^= d + xor %rbp,%r8 + # d = x3 + x7 + lea (%rsi,%r8),%rbp + # (uint32) d <<<= 13 + rol $13,%ebp + # x11 ^= d + xor %rbp,%r12 + # d = x7 + x11 + lea (%r8,%r12),%rbp + # (uint32) d <<<= 18 + rol $18,%ebp + # x15 ^= d + xor %rbp,%r15 + # x15_stack = x15 + movq %r15,176(%rsp) + # x5 = x5_stack + movq 160(%rsp),%r15 + # a = x3 + x0 + lea (%rsi,%rdx),%rbp + # (uint32) a <<<= 7 + rol $7,%ebp + # x1 ^= a + xor %rbp,%rdi + # b = x4 + x5 + lea (%r9,%r15),%rbp + # (uint32) b <<<= 7 + rol $7,%ebp + # x6 ^= b + xor %rbp,%rax + # a = x0 + x1 + lea (%rdx,%rdi),%rbp + # (uint32) a <<<= 9 + rol $9,%ebp + # x2 ^= a + xor %rbp,%rcx + # b = x5 + x6 + lea (%r15,%rax),%rbp + # (uint32) b <<<= 9 + rol $9,%ebp + # x7 ^= b + xor %rbp,%r8 + # a = x1 + x2 + lea (%rdi,%rcx),%rbp + # (uint32) a <<<= 13 + rol $13,%ebp + # x3 ^= a + xor %rbp,%rsi + # b = x6 + x7 + lea (%rax,%r8),%rbp + # (uint32) b <<<= 13 + rol $13,%ebp + # x4 ^= b + xor %rbp,%r9 + # a = x2 + x3 + lea (%rcx,%rsi),%rbp + # (uint32) a <<<= 18 + rol $18,%ebp + # x0 ^= a + xor %rbp,%rdx + # b = x7 + x4 + lea (%r8,%r9),%rbp + # (uint32) b <<<= 18 + rol $18,%ebp + # x5 ^= b + xor %rbp,%r15 + # x10 = x10_stack + movq 168(%rsp),%rbp + # x5_stack = x5 + movq %r15,160(%rsp) + # c = x9 + x10 + lea (%r10,%rbp),%r15 + # (uint32) c <<<= 7 + rol $7,%r15d + # x11 ^= c + xor %r15,%r12 + # c = x10 + x11 + lea (%rbp,%r12),%r15 + # (uint32) c <<<= 9 + rol $9,%r15d + # x8 ^= c + xor %r15,%r11 + # c = x11 + x8 + lea (%r12,%r11),%r15 + # (uint32) c <<<= 13 + rol $13,%r15d + # x9 ^= c + xor %r15,%r10 + # c = x8 + x9 + lea (%r11,%r10),%r15 + # (uint32) c <<<= 18 + rol $18,%r15d + # x10 ^= c + xor %r15,%rbp + # x15 = x15_stack + movq 176(%rsp),%r15 + # x10_stack = x10 + movq %rbp,168(%rsp) + # d = x14 + x15 + lea (%rbx,%r15),%rbp + # (uint32) d <<<= 7 + rol $7,%ebp + # x12 ^= d + xor %rbp,%r14 + # d = x15 + x12 + lea (%r15,%r14),%rbp + # (uint32) d <<<= 9 + rol $9,%ebp + # x13 ^= d + xor %rbp,%r13 + # d = x12 + x13 + lea (%r14,%r13),%rbp + # (uint32) d <<<= 13 + rol $13,%ebp + # x14 ^= d + xor %rbp,%rbx + # d = x13 + x14 + lea (%r13,%rbx),%rbp + # (uint32) d <<<= 18 + rol $18,%ebp + # x15 ^= d + xor %rbp,%r15 + # x15_stack = x15 + movq %r15,176(%rsp) + # i = i_backup + movq 184(%rsp),%r15 + # unsigned>? i -= 4 + sub $4,%r15 + # comment:fp stack unchanged by jump + # goto mainloop if unsigned> + ja ._mainloop + # (uint32) x2 += j2 + addl 64(%rsp),%ecx + # x3 <<= 32 + shl $32,%rsi + # x3 += j2 + addq 64(%rsp),%rsi + # (uint64) x3 >>= 32 + shr $32,%rsi + # x3 <<= 32 + shl $32,%rsi + # x2 += x3 + add %rsi,%rcx + # (uint32) x6 += j6 + addl 80(%rsp),%eax + # x7 <<= 32 + shl $32,%r8 + # x7 += j6 + addq 80(%rsp),%r8 + # (uint64) x7 >>= 32 + shr $32,%r8 + # x7 <<= 32 + shl $32,%r8 + # x6 += x7 + add %r8,%rax + # (uint32) x8 += j8 + addl 88(%rsp),%r11d + # x9 <<= 32 + shl $32,%r10 + # x9 += j8 + addq 88(%rsp),%r10 + # (uint64) x9 >>= 32 + shr $32,%r10 + # x9 <<= 32 + shl $32,%r10 + # x8 += x9 + add %r10,%r11 + # (uint32) x12 += j12 + addl 104(%rsp),%r14d + # x13 <<= 32 + shl $32,%r13 + # x13 += j12 + addq 104(%rsp),%r13 + # (uint64) x13 >>= 32 + shr $32,%r13 + # x13 <<= 32 + shl $32,%r13 + # x12 += x13 + add %r13,%r14 + # (uint32) x0 += j0 + addl 56(%rsp),%edx + # x1 <<= 32 + shl $32,%rdi + # x1 += j0 + addq 56(%rsp),%rdi + # (uint64) x1 >>= 32 + shr $32,%rdi + # x1 <<= 32 + shl $32,%rdi + # x0 += x1 + add %rdi,%rdx + # x5 = x5_stack + movq 160(%rsp),%rdi + # (uint32) x4 += j4 + addl 72(%rsp),%r9d + # x5 <<= 32 + shl $32,%rdi + # x5 += j4 + addq 72(%rsp),%rdi + # (uint64) x5 >>= 32 + shr $32,%rdi + # x5 <<= 32 + shl $32,%rdi + # x4 += x5 + add %rdi,%r9 + # x10 = x10_stack + movq 168(%rsp),%r8 + # (uint32) x10 += j10 + addl 96(%rsp),%r8d + # x11 <<= 32 + shl $32,%r12 + # x11 += j10 + addq 96(%rsp),%r12 + # (uint64) x11 >>= 32 + shr $32,%r12 + # x11 <<= 32 + shl $32,%r12 + # x10 += x11 + add %r12,%r8 + # x15 = x15_stack + movq 176(%rsp),%rdi + # (uint32) x14 += j14 + addl 112(%rsp),%ebx + # x15 <<= 32 + shl $32,%rdi + # x15 += j14 + addq 112(%rsp),%rdi + # (uint64) x15 >>= 32 + shr $32,%rdi + # x15 <<= 32 + shl $32,%rdi + # x14 += x15 + add %rdi,%rbx + # out = out_backup + movq 136(%rsp),%rdi + # m = m_backup + movq 144(%rsp),%rsi + # x0 ^= *(uint64 *) (m + 0) + xorq 0(%rsi),%rdx + # *(uint64 *) (out + 0) = x0 + movq %rdx,0(%rdi) + # x2 ^= *(uint64 *) (m + 8) + xorq 8(%rsi),%rcx + # *(uint64 *) (out + 8) = x2 + movq %rcx,8(%rdi) + # x4 ^= *(uint64 *) (m + 16) + xorq 16(%rsi),%r9 + # *(uint64 *) (out + 16) = x4 + movq %r9,16(%rdi) + # x6 ^= *(uint64 *) (m + 24) + xorq 24(%rsi),%rax + # *(uint64 *) (out + 24) = x6 + movq %rax,24(%rdi) + # x8 ^= *(uint64 *) (m + 32) + xorq 32(%rsi),%r11 + # *(uint64 *) (out + 32) = x8 + movq %r11,32(%rdi) + # x10 ^= *(uint64 *) (m + 40) + xorq 40(%rsi),%r8 + # *(uint64 *) (out + 40) = x10 + movq %r8,40(%rdi) + # x12 ^= *(uint64 *) (m + 48) + xorq 48(%rsi),%r14 + # *(uint64 *) (out + 48) = x12 + movq %r14,48(%rdi) + # x14 ^= *(uint64 *) (m + 56) + xorq 56(%rsi),%rbx + # *(uint64 *) (out + 56) = x14 + movq %rbx,56(%rdi) + # bytes = bytes_backup + movq 152(%rsp),%rdx + # in8 = j8 + movq 88(%rsp),%rcx + # in8 += 1 + add $1,%rcx + # j8 = in8 + movq %rcx,88(%rsp) + # unsigned>? unsigned<? bytes - 64 + cmp $64,%rdx + # comment:fp stack unchanged by jump + # goto bytesatleast65 if unsigned> + ja ._bytesatleast65 + # comment:fp stack unchanged by jump + # goto bytesatleast64 if !unsigned< + jae ._bytesatleast64 + # m = out + mov %rdi,%rsi + # out = ctarget + movq 128(%rsp),%rdi + # i = bytes + mov %rdx,%rcx + # while (i) { *out++ = *m++; --i } + rep movsb + # comment:fp stack unchanged by fallthrough +# bytesatleast64: +._bytesatleast64: + # x = x_backup + movq 120(%rsp),%rdi + # in8 = j8 + movq 88(%rsp),%rsi + # *(uint64 *) (x + 32) = in8 + movq %rsi,32(%rdi) + # r11 = r11_stack + movq 0(%rsp),%r11 + # r12 = r12_stack + movq 8(%rsp),%r12 + # r13 = r13_stack + movq 16(%rsp),%r13 + # r14 = r14_stack + movq 24(%rsp),%r14 + # r15 = r15_stack + movq 32(%rsp),%r15 + # rbx = rbx_stack + movq 40(%rsp),%rbx + # rbp = rbp_stack + movq 48(%rsp),%rbp + # comment:fp stack unchanged by fallthrough +# done: +._done: + # leave + add %r11,%rsp + mov %rdi,%rax + mov %rsi,%rdx + ret +# bytesatleast65: +._bytesatleast65: + # bytes -= 64 + sub $64,%rdx + # out += 64 + add $64,%rdi + # m += 64 + add $64,%rsi + # comment:fp stack unchanged by jump + # goto bytesatleast1 + jmp ._bytesatleast1 +# enter ECRYPT_keysetup +.text +.p2align 5 +.globl ECRYPT_keysetup +ECRYPT_keysetup: + mov %rsp,%r11 + and $31,%r11 + add $256,%r11 + sub %r11,%rsp + # k = arg2 + mov %rsi,%rsi + # kbits = arg3 + mov %rdx,%rdx + # x = arg1 + mov %rdi,%rdi + # in0 = *(uint64 *) (k + 0) + movq 0(%rsi),%r8 + # in2 = *(uint64 *) (k + 8) + movq 8(%rsi),%r9 + # *(uint64 *) (x + 4) = in0 + movq %r8,4(%rdi) + # *(uint64 *) (x + 12) = in2 + movq %r9,12(%rdi) + # unsigned<? kbits - 256 + cmp $256,%rdx + # comment:fp stack unchanged by jump + # goto kbits128 if unsigned< + jb ._kbits128 +# kbits256: +._kbits256: + # in10 = *(uint64 *) (k + 16) + movq 16(%rsi),%rdx + # in12 = *(uint64 *) (k + 24) + movq 24(%rsi),%rsi + # *(uint64 *) (x + 44) = in10 + movq %rdx,44(%rdi) + # *(uint64 *) (x + 52) = in12 + movq %rsi,52(%rdi) + # in0 = 1634760805 + mov $1634760805,%rsi + # in4 = 857760878 + mov $857760878,%rdx + # in10 = 2036477234 + mov $2036477234,%rcx + # in14 = 1797285236 + mov $1797285236,%r8 + # *(uint32 *) (x + 0) = in0 + movl %esi,0(%rdi) + # *(uint32 *) (x + 20) = in4 + movl %edx,20(%rdi) + # *(uint32 *) (x + 40) = in10 + movl %ecx,40(%rdi) + # *(uint32 *) (x + 60) = in14 + movl %r8d,60(%rdi) + # comment:fp stack unchanged by jump + # goto keysetupdone + jmp ._keysetupdone +# kbits128: +._kbits128: + # in10 = *(uint64 *) (k + 0) + movq 0(%rsi),%rdx + # in12 = *(uint64 *) (k + 8) + movq 8(%rsi),%rsi + # *(uint64 *) (x + 44) = in10 + movq %rdx,44(%rdi) + # *(uint64 *) (x + 52) = in12 + movq %rsi,52(%rdi) + # in0 = 1634760805 + mov $1634760805,%rsi + # in4 = 824206446 + mov $824206446,%rdx + # in10 = 2036477238 + mov $2036477238,%rcx + # in14 = 1797285236 + mov $1797285236,%r8 + # *(uint32 *) (x + 0) = in0 + movl %esi,0(%rdi) + # *(uint32 *) (x + 20) = in4 + movl %edx,20(%rdi) + # *(uint32 *) (x + 40) = in10 + movl %ecx,40(%rdi) + # *(uint32 *) (x + 60) = in14 + movl %r8d,60(%rdi) +# keysetupdone: +._keysetupdone: + # leave + add %r11,%rsp + mov %rdi,%rax + mov %rsi,%rdx + ret +# enter ECRYPT_ivsetup +.text +.p2align 5 +.globl ECRYPT_ivsetup +ECRYPT_ivsetup: + mov %rsp,%r11 + and $31,%r11 + add $256,%r11 + sub %r11,%rsp + # iv = arg2 + mov %rsi,%rsi + # x = arg1 + mov %rdi,%rdi + # in6 = *(uint64 *) (iv + 0) + movq 0(%rsi),%rsi + # in8 = 0 + mov $0,%r8 + # *(uint64 *) (x + 24) = in6 + movq %rsi,24(%rdi) + # *(uint64 *) (x + 32) = in8 + movq %r8,32(%rdi) + # leave + add %r11,%rsp + mov %rdi,%rax + mov %rsi,%rdx + ret diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c new file mode 100644 index 000000000000..bccb76d80987 --- /dev/null +++ b/arch/x86/crypto/salsa20_glue.c @@ -0,0 +1,129 @@ +/* + * Glue code for optimized assembly version of Salsa20. + * + * Copyright (c) 2007 Tan Swee Heng <thesweeheng@gmail.com> + * + * The assembly codes are public domain assembly codes written by Daniel. J. + * Bernstein <djb@cr.yp.to>. The codes are modified to include indentation + * and to remove extraneous comments and functions that are not needed. + * - i586 version, renamed as salsa20-i586-asm_32.S + * available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s> + * - x86-64 version, renamed as salsa20-x86_64-asm_64.S + * available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include <crypto/algapi.h> +#include <linux/module.h> +#include <linux/crypto.h> + +#define SALSA20_IV_SIZE 8U +#define SALSA20_MIN_KEY_SIZE 16U +#define SALSA20_MAX_KEY_SIZE 32U + +// use the ECRYPT_* function names +#define salsa20_keysetup ECRYPT_keysetup +#define salsa20_ivsetup ECRYPT_ivsetup +#define salsa20_encrypt_bytes ECRYPT_encrypt_bytes + +struct salsa20_ctx +{ + u32 input[16]; +}; + +asmlinkage void salsa20_keysetup(struct salsa20_ctx *ctx, const u8 *k, + u32 keysize, u32 ivsize); +asmlinkage void salsa20_ivsetup(struct salsa20_ctx *ctx, const u8 *iv); +asmlinkage void salsa20_encrypt_bytes(struct salsa20_ctx *ctx, + const u8 *src, u8 *dst, u32 bytes); + +static int setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keysize) +{ + struct salsa20_ctx *ctx = crypto_tfm_ctx(tfm); + salsa20_keysetup(ctx, key, keysize*8, SALSA20_IV_SIZE*8); + return 0; +} + +static int encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct blkcipher_walk walk; + struct crypto_blkcipher *tfm = desc->tfm; + struct salsa20_ctx *ctx = crypto_blkcipher_ctx(tfm); + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, 64); + + salsa20_ivsetup(ctx, walk.iv); + + if (likely(walk.nbytes == nbytes)) + { + salsa20_encrypt_bytes(ctx, walk.src.virt.addr, + walk.dst.virt.addr, nbytes); + return blkcipher_walk_done(desc, &walk, 0); + } + + while (walk.nbytes >= 64) { + salsa20_encrypt_bytes(ctx, walk.src.virt.addr, + walk.dst.virt.addr, + walk.nbytes - (walk.nbytes % 64)); + err = blkcipher_walk_done(desc, &walk, walk.nbytes % 64); + } + + if (walk.nbytes) { + salsa20_encrypt_bytes(ctx, walk.src.virt.addr, + walk.dst.virt.addr, walk.nbytes); + err = blkcipher_walk_done(desc, &walk, 0); + } + + return err; +} + +static struct crypto_alg alg = { + .cra_name = "salsa20", + .cra_driver_name = "salsa20-asm", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_type = &crypto_blkcipher_type, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct salsa20_ctx), + .cra_alignmask = 3, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(alg.cra_list), + .cra_u = { + .blkcipher = { + .setkey = setkey, + .encrypt = encrypt, + .decrypt = encrypt, + .min_keysize = SALSA20_MIN_KEY_SIZE, + .max_keysize = SALSA20_MAX_KEY_SIZE, + .ivsize = SALSA20_IV_SIZE, + } + } +}; + +static int __init init(void) +{ + return crypto_register_alg(&alg); +} + +static void __exit fini(void) +{ + crypto_unregister_alg(&alg); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)"); +MODULE_ALIAS("salsa20"); +MODULE_ALIAS("salsa20-asm"); diff --git a/arch/x86/crypto/twofish_64.c b/arch/x86/crypto/twofish_64.c deleted file mode 100644 index 182d91d5cfb9..000000000000 --- a/arch/x86/crypto/twofish_64.c +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Glue Code for optimized x86_64 assembler version of TWOFISH - * - * Originally Twofish for GPG - * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998 - * 256-bit key length added March 20, 1999 - * Some modifications to reduce the text size by Werner Koch, April, 1998 - * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com> - * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net> - * - * The original author has disclaimed all copyright interest in this - * code and thus put it in the public domain. The subsequent authors - * have put this under the GNU General Public License. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 - * USA - * - * This code is a "clean room" implementation, written from the paper - * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey, - * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available - * through http://www.counterpane.com/twofish.html - * - * For background information on multiplication in finite fields, used for - * the matrix operations in the key schedule, see the book _Contemporary - * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the - * Third Edition. - */ - -#include <crypto/twofish.h> -#include <linux/crypto.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/types.h> - -asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); -asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); - -static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - twofish_enc_blk(tfm, dst, src); -} - -static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - twofish_dec_blk(tfm, dst, src); -} - -static struct crypto_alg alg = { - .cra_name = "twofish", - .cra_driver_name = "twofish-x86_64", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 3, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(alg.cra_list), - .cra_u = { - .cipher = { - .cia_min_keysize = TF_MIN_KEY_SIZE, - .cia_max_keysize = TF_MAX_KEY_SIZE, - .cia_setkey = twofish_setkey, - .cia_encrypt = twofish_encrypt, - .cia_decrypt = twofish_decrypt - } - } -}; - -static int __init init(void) -{ - return crypto_register_alg(&alg); -} - -static void __exit fini(void) -{ - crypto_unregister_alg(&alg); -} - -module_init(init); -module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION ("Twofish Cipher Algorithm, x86_64 asm optimized"); -MODULE_ALIAS("twofish"); diff --git a/arch/x86/crypto/twofish_32.c b/arch/x86/crypto/twofish_glue.c index e3004dfe9c7a..cefaf8b9aa18 100644 --- a/arch/x86/crypto/twofish_32.c +++ b/arch/x86/crypto/twofish_glue.c @@ -1,5 +1,5 @@ /* - * Glue Code for optimized 586 assembler version of TWOFISH + * Glue Code for assembler optimized version of TWOFISH * * Originally Twofish for GPG * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998 @@ -44,7 +44,6 @@ #include <linux/module.h> #include <linux/types.h> - asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); @@ -60,7 +59,7 @@ static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) static struct crypto_alg alg = { .cra_name = "twofish", - .cra_driver_name = "twofish-i586", + .cra_driver_name = "twofish-asm", .cra_priority = 200, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = TF_BLOCK_SIZE, @@ -93,5 +92,6 @@ module_init(init); module_exit(fini); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION ("Twofish Cipher Algorithm, i586 asm optimized"); +MODULE_DESCRIPTION ("Twofish Cipher Algorithm, asm optimized"); MODULE_ALIAS("twofish"); +MODULE_ALIAS("twofish-asm"); |