11 files changed, 2309 insertions, 1036 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 46bb609e2444..3874c2de5403 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -4,12 +4,16 @@
 
 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
 obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
+obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
+obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
 
-aes-i586-y := aes-i586-asm_32.o aes_32.o
-twofish-i586-y := twofish-i586-asm_32.o twofish_32.o
+aes-i586-y := aes-i586-asm_32.o aes_glue.o
+twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
+salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
 
-aes-x86_64-y := aes-x86_64-asm_64.o aes_64.o
-twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_64.o
+aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
+twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
+salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
diff --git a/arch/x86/crypto/aes-i586-asm_32.S b/arch/x86/crypto/aes-i586-asm_32.S
index f942f0c8f630..1093bede3e0a 100644
--- a/arch/x86/crypto/aes-i586-asm_32.S
+++ b/arch/x86/crypto/aes-i586-asm_32.S
@@ -46,9 +46,9 @@
 #define in_blk 16
 
 /* offsets in crypto_tfm structure */
-#define ekey (crypto_tfm_ctx_offset + 0)
-#define nrnd (crypto_tfm_ctx_offset + 256)
-#define dkey (crypto_tfm_ctx_offset + 260)
+#define klen (crypto_tfm_ctx_offset + 0)
+#define ekey (crypto_tfm_ctx_offset + 4)
+#define dkey (crypto_tfm_ctx_offset + 244)
 
 // register mapping for encrypt and decrypt subroutines
 
@@ -221,8 +221,8 @@
 
 .global  aes_enc_blk
 
-.extern  ft_tab
-.extern  fl_tab
+.extern  crypto_ft_tab
+.extern  crypto_fl_tab
 
 .align 4
 
@@ -236,7 +236,7 @@ aes_enc_blk:
 1:	push    %ebx
 	mov     in_blk+4(%esp),%r2
 	push    %esi
-	mov     nrnd(%ebp),%r3   // number of rounds
+	mov     klen(%ebp),%r3   // key size in bytes
 	push    %edi
 #if ekey != 0
 	lea     ekey(%ebp),%ebp  // key pointer
@@ -255,26 +255,26 @@ aes_enc_blk:
 
 	sub     $8,%esp		// space for register saves on stack
 	add     $16,%ebp	// increment to next round key
-	cmp     $12,%r3
+	cmp     $24,%r3
 	jb      4f		// 10 rounds for 128-bit key
 	lea     32(%ebp),%ebp
 	je      3f		// 12 rounds for 192-bit key
 	lea     32(%ebp),%ebp
 
-2:	fwd_rnd1( -64(%ebp) ,ft_tab)	// 14 rounds for 256-bit key
-	fwd_rnd2( -48(%ebp) ,ft_tab)
-3:	fwd_rnd1( -32(%ebp) ,ft_tab)	// 12 rounds for 192-bit key
-	fwd_rnd2( -16(%ebp) ,ft_tab)
-4:	fwd_rnd1(    (%ebp) ,ft_tab)	// 10 rounds for 128-bit key
-	fwd_rnd2( +16(%ebp) ,ft_tab)
-	fwd_rnd1( +32(%ebp) ,ft_tab)
-	fwd_rnd2( +48(%ebp) ,ft_tab)
-	fwd_rnd1( +64(%ebp) ,ft_tab)
-	fwd_rnd2( +80(%ebp) ,ft_tab)
-	fwd_rnd1( +96(%ebp) ,ft_tab)
-	fwd_rnd2(+112(%ebp) ,ft_tab)
-	fwd_rnd1(+128(%ebp) ,ft_tab)
-	fwd_rnd2(+144(%ebp) ,fl_tab)	// last round uses a different table
+2:	fwd_rnd1( -64(%ebp), crypto_ft_tab)	// 14 rounds for 256-bit key
+	fwd_rnd2( -48(%ebp), crypto_ft_tab)
+3:	fwd_rnd1( -32(%ebp), crypto_ft_tab)	// 12 rounds for 192-bit key
+	fwd_rnd2( -16(%ebp), crypto_ft_tab)
+4:	fwd_rnd1(    (%ebp), crypto_ft_tab)	// 10 rounds for 128-bit key
+	fwd_rnd2( +16(%ebp), crypto_ft_tab)
+	fwd_rnd1( +32(%ebp), crypto_ft_tab)
+	fwd_rnd2( +48(%ebp), crypto_ft_tab)
+	fwd_rnd1( +64(%ebp), crypto_ft_tab)
+	fwd_rnd2( +80(%ebp), crypto_ft_tab)
+	fwd_rnd1( +96(%ebp), crypto_ft_tab)
+	fwd_rnd2(+112(%ebp), crypto_ft_tab)
+	fwd_rnd1(+128(%ebp), crypto_ft_tab)
+	fwd_rnd2(+144(%ebp), crypto_fl_tab)	// last round uses a different table
 
 // move final values to the output array.  CAUTION: the 
 // order of these assigns rely on the register mappings
@@ -297,8 +297,8 @@ aes_enc_blk:
 
 .global  aes_dec_blk
 
-.extern  it_tab
-.extern  il_tab
+.extern  crypto_it_tab
+.extern  crypto_il_tab
 
 .align 4
 
@@ -312,14 +312,11 @@ aes_dec_blk:
 1:	push    %ebx
 	mov     in_blk+4(%esp),%r2
 	push    %esi
-	mov     nrnd(%ebp),%r3   // number of rounds
+	mov     klen(%ebp),%r3   // key size in bytes
 	push    %edi
 #if dkey != 0
 	lea     dkey(%ebp),%ebp  // key pointer
 #endif
-	mov     %r3,%r0
-	shl     $4,%r0
-	add     %r0,%ebp
 	
 // input four columns and xor in first round key
 
@@ -333,27 +330,27 @@ aes_dec_blk:
 	xor     12(%ebp),%r5
 
 	sub     $8,%esp		// space for register saves on stack
-	sub     $16,%ebp	// increment to next round key
-	cmp     $12,%r3
+	add     $16,%ebp	// increment to next round key
+	cmp     $24,%r3
 	jb      4f		// 10 rounds for 128-bit key
-	lea     -32(%ebp),%ebp
+	lea     32(%ebp),%ebp
 	je      3f		// 12 rounds for 192-bit key
-	lea     -32(%ebp),%ebp
-
-2:	inv_rnd1( +64(%ebp), it_tab)	// 14 rounds for 256-bit key
-	inv_rnd2( +48(%ebp), it_tab)
-3:	inv_rnd1( +32(%ebp), it_tab)	// 12 rounds for 192-bit key
-	inv_rnd2( +16(%ebp), it_tab)
-4:	inv_rnd1(    (%ebp), it_tab)	// 10 rounds for 128-bit key
-	inv_rnd2( -16(%ebp), it_tab)
-	inv_rnd1( -32(%ebp), it_tab)
-	inv_rnd2( -48(%ebp), it_tab)
-	inv_rnd1( -64(%ebp), it_tab)
-	inv_rnd2( -80(%ebp), it_tab)
-	inv_rnd1( -96(%ebp), it_tab)
-	inv_rnd2(-112(%ebp), it_tab)
-	inv_rnd1(-128(%ebp), it_tab)
-	inv_rnd2(-144(%ebp), il_tab)	// last round uses a different table
+	lea     32(%ebp),%ebp
+
+2:	inv_rnd1( -64(%ebp), crypto_it_tab)	// 14 rounds for 256-bit key
+	inv_rnd2( -48(%ebp), crypto_it_tab)
+3:	inv_rnd1( -32(%ebp), crypto_it_tab)	// 12 rounds for 192-bit key
+	inv_rnd2( -16(%ebp), crypto_it_tab)
+4:	inv_rnd1(    (%ebp), crypto_it_tab)	// 10 rounds for 128-bit key
+	inv_rnd2( +16(%ebp), crypto_it_tab)
+	inv_rnd1( +32(%ebp), crypto_it_tab)
+	inv_rnd2( +48(%ebp), crypto_it_tab)
+	inv_rnd1( +64(%ebp), crypto_it_tab)
+	inv_rnd2( +80(%ebp), crypto_it_tab)
+	inv_rnd1( +96(%ebp), crypto_it_tab)
+	inv_rnd2(+112(%ebp), crypto_it_tab)
+	inv_rnd1(+128(%ebp), crypto_it_tab)
+	inv_rnd2(+144(%ebp), crypto_il_tab)	// last round uses a different table
 
 // move final values to the output array.  CAUTION: the 
 // order of these assigns rely on the register mappings
diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S
index 26b40de4d0b0..a120f526c3df 100644
--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -8,10 +8,10 @@
  * including this sentence is retained in full.
  */
 
-.extern aes_ft_tab
-.extern aes_it_tab
-.extern aes_fl_tab
-.extern aes_il_tab
+.extern crypto_ft_tab
+.extern crypto_it_tab
+.extern crypto_fl_tab
+.extern crypto_il_tab
 
 .text
 
@@ -56,13 +56,13 @@
 	.align	8;			\
 FUNC:	movq	r1,r2;			\
 	movq	r3,r4;			\
-	leaq	BASE+KEY+52(r8),r9;	\
+	leaq	BASE+KEY+48+4(r8),r9;	\
 	movq	r10,r11;		\
 	movl	(r7),r5 ## E;		\
 	movl	4(r7),r1 ## E;		\
 	movl	8(r7),r6 ## E;		\
 	movl	12(r7),r7 ## E;		\
-	movl	BASE(r8),r10 ## E;	\
+	movl	BASE+0(r8),r10 ## E;	\
 	xorl	-48(r9),r5 ## E;	\
 	xorl	-44(r9),r1 ## E;	\
 	xorl	-40(r9),r6 ## E;	\
@@ -154,37 +154,37 @@ FUNC:	movq	r1,r2;			\
 /* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */
 
 	entry(aes_enc_blk,0,enc128,enc192)
-	encrypt_round(aes_ft_tab,-96)
-	encrypt_round(aes_ft_tab,-80)
-enc192:	encrypt_round(aes_ft_tab,-64)
-	encrypt_round(aes_ft_tab,-48)
-enc128:	encrypt_round(aes_ft_tab,-32)
-	encrypt_round(aes_ft_tab,-16)
-	encrypt_round(aes_ft_tab,  0)
-	encrypt_round(aes_ft_tab, 16)
-	encrypt_round(aes_ft_tab, 32)
-	encrypt_round(aes_ft_tab, 48)
-	encrypt_round(aes_ft_tab, 64)
-	encrypt_round(aes_ft_tab, 80)
-	encrypt_round(aes_ft_tab, 96)
-	encrypt_final(aes_fl_tab,112)
+	encrypt_round(crypto_ft_tab,-96)
+	encrypt_round(crypto_ft_tab,-80)
+enc192:	encrypt_round(crypto_ft_tab,-64)
+	encrypt_round(crypto_ft_tab,-48)
+enc128:	encrypt_round(crypto_ft_tab,-32)
+	encrypt_round(crypto_ft_tab,-16)
+	encrypt_round(crypto_ft_tab,  0)
+	encrypt_round(crypto_ft_tab, 16)
+	encrypt_round(crypto_ft_tab, 32)
+	encrypt_round(crypto_ft_tab, 48)
+	encrypt_round(crypto_ft_tab, 64)
+	encrypt_round(crypto_ft_tab, 80)
+	encrypt_round(crypto_ft_tab, 96)
+	encrypt_final(crypto_fl_tab,112)
 	return
 
 /* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */
 
 	entry(aes_dec_blk,240,dec128,dec192)
-	decrypt_round(aes_it_tab,-96)
-	decrypt_round(aes_it_tab,-80)
-dec192:	decrypt_round(aes_it_tab,-64)
-	decrypt_round(aes_it_tab,-48)
-dec128:	decrypt_round(aes_it_tab,-32)
-	decrypt_round(aes_it_tab,-16)
-	decrypt_round(aes_it_tab,  0)
-	decrypt_round(aes_it_tab, 16)
-	decrypt_round(aes_it_tab, 32)
-	decrypt_round(aes_it_tab, 48)
-	decrypt_round(aes_it_tab, 64)
-	decrypt_round(aes_it_tab, 80)
-	decrypt_round(aes_it_tab, 96)
-	decrypt_final(aes_il_tab,112)
+	decrypt_round(crypto_it_tab,-96)
+	decrypt_round(crypto_it_tab,-80)
+dec192:	decrypt_round(crypto_it_tab,-64)
+	decrypt_round(crypto_it_tab,-48)
+dec128:	decrypt_round(crypto_it_tab,-32)
+	decrypt_round(crypto_it_tab,-16)
+	decrypt_round(crypto_it_tab,  0)
+	decrypt_round(crypto_it_tab, 16)
+	decrypt_round(crypto_it_tab, 32)
+	decrypt_round(crypto_it_tab, 48)
+	decrypt_round(crypto_it_tab, 64)
+	decrypt_round(crypto_it_tab, 80)
+	decrypt_round(crypto_it_tab, 96)
+	decrypt_final(crypto_il_tab,112)
 	return
diff --git a/arch/x86/crypto/aes_32.c b/arch/x86/crypto/aes_32.c
deleted file mode 100644
index 49aad9397f10..000000000000
--- a/arch/x86/crypto/aes_32.c
+++ /dev/null
@@ -1,515 +0,0 @@
-/* 
- * 
- * Glue Code for optimized 586 assembler version of AES
- *
- * Copyright (c) 2002, Dr Brian Gladman <>, Worcester, UK.
- * All rights reserved.
- *
- * LICENSE TERMS
- *
- * The free distribution and use of this software in both source and binary
- * form is allowed (with or without changes) provided that:
- *
- *   1. distributions of this source code include the above copyright
- *      notice, this list of conditions and the following disclaimer;
- *
- *   2. distributions in binary form include the above copyright
- *      notice, this list of conditions and the following disclaimer
- *      in the documentation and/or other associated materials;
- *
- *   3. the copyright holder's name is not used to endorse products
- *      built using this software without specific written permission.
- *
- * ALTERNATIVELY, provided that this notice is retained in full, this product
- * may be distributed under the terms of the GNU General Public License (GPL),
- * in which case the provisions of the GPL apply INSTEAD OF those given above.
- *
- * DISCLAIMER
- *
- * This software is provided 'as is' with no explicit or implied warranties
- * in respect of its properties, including, but not limited to, correctness
- * and/or fitness for purpose.
- *
- * Copyright (c) 2003, Adam J. Richter <adam@yggdrasil.com> (conversion to
- * 2.5 API).
- * Copyright (c) 2003, 2004 Fruhwirth Clemens <clemens@endorphin.org>
- * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
- *
- */
-
-#include <asm/byteorder.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/crypto.h>
-#include <linux/linkage.h>
-
-asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
-asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
-
-#define AES_MIN_KEY_SIZE	16
-#define AES_MAX_KEY_SIZE	32
-#define AES_BLOCK_SIZE		16
-#define AES_KS_LENGTH		4 * AES_BLOCK_SIZE
-#define RC_LENGTH		29
-
-struct aes_ctx {
-	u32 ekey[AES_KS_LENGTH];
-	u32 rounds;
-	u32 dkey[AES_KS_LENGTH];
-};
-
-#define WPOLY 0x011b
-#define bytes2word(b0, b1, b2, b3)  \
-	(((u32)(b3) << 24) | ((u32)(b2) << 16) | ((u32)(b1) << 8) | (b0))
-
-/* define the finite field multiplies required for Rijndael */
-#define f2(x) ((x) ? pow[log[x] + 0x19] : 0)
-#define f3(x) ((x) ? pow[log[x] + 0x01] : 0)
-#define f9(x) ((x) ? pow[log[x] + 0xc7] : 0)
-#define fb(x) ((x) ? pow[log[x] + 0x68] : 0)
-#define fd(x) ((x) ? pow[log[x] + 0xee] : 0)
-#define fe(x) ((x) ? pow[log[x] + 0xdf] : 0)
-#define fi(x) ((x) ?   pow[255 - log[x]]: 0)
-
-static inline u32 upr(u32 x, int n)
-{
-	return (x << 8 * n) | (x >> (32 - 8 * n));
-}
-
-static inline u8 bval(u32 x, int n)
-{
-	return x >> 8 * n;
-}
-
-/* The forward and inverse affine transformations used in the S-box */
-#define fwd_affine(x) \
-	(w = (u32)x, w ^= (w<<1)^(w<<2)^(w<<3)^(w<<4), 0x63^(u8)(w^(w>>8)))
-
-#define inv_affine(x) \
-	(w = (u32)x, w = (w<<1)^(w<<3)^(w<<6), 0x05^(u8)(w^(w>>8)))
-
-static u32 rcon_tab[RC_LENGTH];
-
-u32 ft_tab[4][256];
-u32 fl_tab[4][256];
-static u32 im_tab[4][256];
-u32 il_tab[4][256];
-u32 it_tab[4][256];
-
-static void gen_tabs(void)
-{
-	u32 i, w;
-	u8 pow[512], log[256];
-
-	/*
-	 * log and power tables for GF(2^8) finite field with
-	 * WPOLY as modular polynomial - the simplest primitive
-	 * root is 0x03, used here to generate the tables.
-	 */
-	i = 0; w = 1; 
-	
-	do {
-		pow[i] = (u8)w;
-		pow[i + 255] = (u8)w;
-		log[w] = (u8)i++;
-		w ^=  (w << 1) ^ (w & 0x80 ? WPOLY : 0);
-	} while (w != 1);
-	
-	for(i = 0, w = 1; i < RC_LENGTH; ++i) {
-		rcon_tab[i] = bytes2word(w, 0, 0, 0);
-		w = f2(w);
-	}
-
-	for(i = 0; i < 256; ++i) {
-		u8 b;
-		
-		b = fwd_affine(fi((u8)i));
-		w = bytes2word(f2(b), b, b, f3(b));
-
-		/* tables for a normal encryption round */
-		ft_tab[0][i] = w;
-		ft_tab[1][i] = upr(w, 1);
-		ft_tab[2][i] = upr(w, 2);
-		ft_tab[3][i] = upr(w, 3);
-		w = bytes2word(b, 0, 0, 0);
-		
-		/*
-		 * tables for last encryption round
-		 * (may also be used in the key schedule)
-		 */
-		fl_tab[0][i] = w;
-		fl_tab[1][i] = upr(w, 1);
-		fl_tab[2][i] = upr(w, 2);
-		fl_tab[3][i] = upr(w, 3);
-		
-		b = fi(inv_affine((u8)i));
-		w = bytes2word(fe(b), f9(b), fd(b), fb(b));
-
-		/* tables for the inverse mix column operation  */
-		im_tab[0][b] = w;
-		im_tab[1][b] = upr(w, 1);
-		im_tab[2][b] = upr(w, 2);
-		im_tab[3][b] = upr(w, 3);
-
-		/* tables for a normal decryption round */
-		it_tab[0][i] = w;
-		it_tab[1][i] = upr(w,1);
-		it_tab[2][i] = upr(w,2);
-		it_tab[3][i] = upr(w,3);
-
-		w = bytes2word(b, 0, 0, 0);
-		
-		/* tables for last decryption round */
-		il_tab[0][i] = w;
-		il_tab[1][i] = upr(w,1);
-		il_tab[2][i] = upr(w,2);
-		il_tab[3][i] = upr(w,3);
-    }
-}
-
-#define four_tables(x,tab,vf,rf,c)		\
-(	tab[0][bval(vf(x,0,c),rf(0,c))]	^	\
-	tab[1][bval(vf(x,1,c),rf(1,c))] ^	\
-	tab[2][bval(vf(x,2,c),rf(2,c))] ^	\
-	tab[3][bval(vf(x,3,c),rf(3,c))]		\
-)
-
-#define vf1(x,r,c)  (x)
-#define rf1(r,c)    (r)
-#define rf2(r,c)    ((r-c)&3)
-
-#define inv_mcol(x) four_tables(x,im_tab,vf1,rf1,0)
-#define ls_box(x,c) four_tables(x,fl_tab,vf1,rf2,c)
-
-#define ff(x) inv_mcol(x)
-
-#define ke4(k,i)							\
-{									\
-	k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i];		\
-	k[4*(i)+5] = ss[1] ^= ss[0];					\
-	k[4*(i)+6] = ss[2] ^= ss[1];					\
-	k[4*(i)+7] = ss[3] ^= ss[2];					\
-}
-
-#define kel4(k,i)							\
-{									\
-	k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i];		\
-	k[4*(i)+5] = ss[1] ^= ss[0];					\
-	k[4*(i)+6] = ss[2] ^= ss[1]; k[4*(i)+7] = ss[3] ^= ss[2];	\
-}
-
-#define ke6(k,i)							\
-{									\
-	k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i];		\
-	k[6*(i)+ 7] = ss[1] ^= ss[0];					\
-	k[6*(i)+ 8] = ss[2] ^= ss[1];					\
-	k[6*(i)+ 9] = ss[3] ^= ss[2];					\
-	k[6*(i)+10] = ss[4] ^= ss[3];					\
-	k[6*(i)+11] = ss[5] ^= ss[4];					\
-}
-
-#define kel6(k,i)							\
-{									\
-	k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i];		\
-	k[6*(i)+ 7] = ss[1] ^= ss[0];					\
-	k[6*(i)+ 8] = ss[2] ^= ss[1];					\
-	k[6*(i)+ 9] = ss[3] ^= ss[2];					\
-}
-
-#define ke8(k,i)							\
-{									\
-	k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i];		\
-	k[8*(i)+ 9] = ss[1] ^= ss[0];					\
-	k[8*(i)+10] = ss[2] ^= ss[1];					\
-	k[8*(i)+11] = ss[3] ^= ss[2];					\
-	k[8*(i)+12] = ss[4] ^= ls_box(ss[3],0);				\
-	k[8*(i)+13] = ss[5] ^= ss[4];					\
-	k[8*(i)+14] = ss[6] ^= ss[5];					\
-	k[8*(i)+15] = ss[7] ^= ss[6];					\
-}
-
-#define kel8(k,i)							\
-{									\
-	k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i];		\
-	k[8*(i)+ 9] = ss[1] ^= ss[0];					\
-	k[8*(i)+10] = ss[2] ^= ss[1];					\
-	k[8*(i)+11] = ss[3] ^= ss[2];					\
-}
-
-#define kdf4(k,i)							\
-{									\
-	ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3];				\
-	ss[1] = ss[1] ^ ss[3];						\
-	ss[2] = ss[2] ^ ss[3];						\
-	ss[3] = ss[3];							\
-	ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i];			\
-	ss[i % 4] ^= ss[4];						\
-	ss[4] ^= k[4*(i)];						\
-	k[4*(i)+4] = ff(ss[4]);						\
-	ss[4] ^= k[4*(i)+1];						\
-	k[4*(i)+5] = ff(ss[4]);						\
-	ss[4] ^= k[4*(i)+2];						\
-	k[4*(i)+6] = ff(ss[4]);						\
-	ss[4] ^= k[4*(i)+3];						\
-	k[4*(i)+7] = ff(ss[4]);						\
-}
-
-#define kd4(k,i)							\
-{									\
-	ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i];			\
-	ss[i % 4] ^= ss[4];						\
-	ss[4] = ff(ss[4]);						\
-	k[4*(i)+4] = ss[4] ^= k[4*(i)];					\
-	k[4*(i)+5] = ss[4] ^= k[4*(i)+1];				\
-	k[4*(i)+6] = ss[4] ^= k[4*(i)+2];				\
-	k[4*(i)+7] = ss[4] ^= k[4*(i)+3];				\
-}
-
-#define kdl4(k,i)							\
-{									\
-	ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i];			\
-	ss[i % 4] ^= ss[4];						\
-	k[4*(i)+4] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3];			\
-	k[4*(i)+5] = ss[1] ^ ss[3];					\
-	k[4*(i)+6] = ss[0];						\
-	k[4*(i)+7] = ss[1];						\
-}
-
-#define kdf6(k,i)							\
-{									\
-	ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i];				\
-	k[6*(i)+ 6] = ff(ss[0]);					\
-	ss[1] ^= ss[0];							\
-	k[6*(i)+ 7] = ff(ss[1]);					\
-	ss[2] ^= ss[1];							\
-	k[6*(i)+ 8] = ff(ss[2]);					\
-	ss[3] ^= ss[2];							\
-	k[6*(i)+ 9] = ff(ss[3]);					\
-	ss[4] ^= ss[3];							\
-	k[6*(i)+10] = ff(ss[4]);					\
-	ss[5] ^= ss[4];							\
-	k[6*(i)+11] = ff(ss[5]);					\
-}
-
-#define kd6(k,i)							\
-{									\
-	ss[6] = ls_box(ss[5],3) ^ rcon_tab[i];				\
-	ss[0] ^= ss[6]; ss[6] = ff(ss[6]);				\
-	k[6*(i)+ 6] = ss[6] ^= k[6*(i)];				\
-	ss[1] ^= ss[0];							\
-	k[6*(i)+ 7] = ss[6] ^= k[6*(i)+ 1];				\
-	ss[2] ^= ss[1];							\
-	k[6*(i)+ 8] = ss[6] ^= k[6*(i)+ 2];				\
-	ss[3] ^= ss[2];							\
-	k[6*(i)+ 9] = ss[6] ^= k[6*(i)+ 3];				\
-	ss[4] ^= ss[3];							\
-	k[6*(i)+10] = ss[6] ^= k[6*(i)+ 4];				\
-	ss[5] ^= ss[4];							\
-	k[6*(i)+11] = ss[6] ^= k[6*(i)+ 5];				\
-}
-
-#define kdl6(k,i)							\
-{									\
-	ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i];				\
-	k[6*(i)+ 6] = ss[0];						\
-	ss[1] ^= ss[0];							\
-	k[6*(i)+ 7] = ss[1];						\
-	ss[2] ^= ss[1];							\
-	k[6*(i)+ 8] = ss[2];						\
-	ss[3] ^= ss[2];							\
-	k[6*(i)+ 9] = ss[3];						\
-}
-
-#define kdf8(k,i)							\
-{									\
-	ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i];				\
-	k[8*(i)+ 8] = ff(ss[0]);					\
-	ss[1] ^= ss[0];							\
-	k[8*(i)+ 9] = ff(ss[1]);					\
-	ss[2] ^= ss[1];							\
-	k[8*(i)+10] = ff(ss[2]);					\
-	ss[3] ^= ss[2];							\
-	k[8*(i)+11] = ff(ss[3]);					\
-	ss[4] ^= ls_box(ss[3],0);					\
-	k[8*(i)+12] = ff(ss[4]);					\
-	ss[5] ^= ss[4];							\
-	k[8*(i)+13] = ff(ss[5]);					\
-	ss[6] ^= ss[5];							\
-	k[8*(i)+14] = ff(ss[6]);					\
-	ss[7] ^= ss[6];							\
-	k[8*(i)+15] = ff(ss[7]);					\
-}
-
-#define kd8(k,i)							\
-{									\
-	u32 __g = ls_box(ss[7],3) ^ rcon_tab[i];			\
-	ss[0] ^= __g;							\
-	__g = ff(__g);							\
-	k[8*(i)+ 8] = __g ^= k[8*(i)];					\
-	ss[1] ^= ss[0];							\
-	k[8*(i)+ 9] = __g ^= k[8*(i)+ 1];				\
-	ss[2] ^= ss[1];							\
-	k[8*(i)+10] = __g ^= k[8*(i)+ 2];				\
-	ss[3] ^= ss[2];							\
-	k[8*(i)+11] = __g ^= k[8*(i)+ 3];				\
-	__g = ls_box(ss[3],0);						\
-	ss[4] ^= __g;							\
-	__g = ff(__g);							\
-	k[8*(i)+12] = __g ^= k[8*(i)+ 4];				\
-	ss[5] ^= ss[4];							\
-	k[8*(i)+13] = __g ^= k[8*(i)+ 5];				\
-	ss[6] ^= ss[5];							\
-	k[8*(i)+14] = __g ^= k[8*(i)+ 6];				\
-	ss[7] ^= ss[6];							\
-	k[8*(i)+15] = __g ^= k[8*(i)+ 7];				\
-}
-
-#define kdl8(k,i)							\
-{									\
-	ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i];				\
-	k[8*(i)+ 8] = ss[0];						\
-	ss[1] ^= ss[0];							\
-	k[8*(i)+ 9] = ss[1];						\
-	ss[2] ^= ss[1];							\
-	k[8*(i)+10] = ss[2];						\
-	ss[3] ^= ss[2];							\
-	k[8*(i)+11] = ss[3];						\
-}
-
-static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
-		       unsigned int key_len)
-{
-	int i;
-	u32 ss[8];
-	struct aes_ctx *ctx = crypto_tfm_ctx(tfm);
-	const __le32 *key = (const __le32 *)in_key;
-	u32 *flags = &tfm->crt_flags;
-
-	/* encryption schedule */
-	
-	ctx->ekey[0] = ss[0] = le32_to_cpu(key[0]);
-	ctx->ekey[1] = ss[1] = le32_to_cpu(key[1]);
-	ctx->ekey[2] = ss[2] = le32_to_cpu(key[2]);
-	ctx->ekey[3] = ss[3] = le32_to_cpu(key[3]);
-
-	switch(key_len) {
-	case 16:
-		for (i = 0; i < 9; i++)
-			ke4(ctx->ekey, i);
-		kel4(ctx->ekey, 9);
-		ctx->rounds = 10;
-		break;
-		
-	case 24:
-		ctx->ekey[4] = ss[4] = le32_to_cpu(key[4]);
-		ctx->ekey[5] = ss[5] = le32_to_cpu(key[5]);
-		for (i = 0; i < 7; i++)
-			ke6(ctx->ekey, i);
-		kel6(ctx->ekey, 7); 
-		ctx->rounds = 12;
-		break;
-
-	case 32:
-		ctx->ekey[4] = ss[4] = le32_to_cpu(key[4]);
-		ctx->ekey[5] = ss[5] = le32_to_cpu(key[5]);
-		ctx->ekey[6] = ss[6] = le32_to_cpu(key[6]);
-		ctx->ekey[7] = ss[7] = le32_to_cpu(key[7]);
-		for (i = 0; i < 6; i++)
-			ke8(ctx->ekey, i);
-		kel8(ctx->ekey, 6);
-		ctx->rounds = 14;
-		break;
-
-	default:
-		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
-		return -EINVAL;
-	}
-	
-	/* decryption schedule */
-	
-	ctx->dkey[0] = ss[0] = le32_to_cpu(key[0]);
-	ctx->dkey[1] = ss[1] = le32_to_cpu(key[1]);
-	ctx->dkey[2] = ss[2] = le32_to_cpu(key[2]);
-	ctx->dkey[3] = ss[3] = le32_to_cpu(key[3]);
-
-	switch (key_len) {
-	case 16:
-		kdf4(ctx->dkey, 0);
-		for (i = 1; i < 9; i++)
-			kd4(ctx->dkey, i);
-		kdl4(ctx->dkey, 9);
-		break;
-		
-	case 24:
-		ctx->dkey[4] = ff(ss[4] = le32_to_cpu(key[4]));
-		ctx->dkey[5] = ff(ss[5] = le32_to_cpu(key[5]));
-		kdf6(ctx->dkey, 0);
-		for (i = 1; i < 7; i++)
-			kd6(ctx->dkey, i);
-		kdl6(ctx->dkey, 7);
-		break;
-
-	case 32:
-		ctx->dkey[4] = ff(ss[4] = le32_to_cpu(key[4]));
-		ctx->dkey[5] = ff(ss[5] = le32_to_cpu(key[5]));
-		ctx->dkey[6] = ff(ss[6] = le32_to_cpu(key[6]));
-		ctx->dkey[7] = ff(ss[7] = le32_to_cpu(key[7]));
-		kdf8(ctx->dkey, 0);
-		for (i = 1; i < 6; i++)
-			kd8(ctx->dkey, i);
-		kdl8(ctx->dkey, 6);
-		break;
-	}
-	return 0;
-}
-
-static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
-	aes_enc_blk(tfm, dst, src);
-}
-
-static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
-	aes_dec_blk(tfm, dst, src);
-}
-
-static struct crypto_alg aes_alg = {
-	.cra_name		=	"aes",
-	.cra_driver_name	=	"aes-i586",
-	.cra_priority		=	200,
-	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
-	.cra_blocksize		=	AES_BLOCK_SIZE,
-	.cra_ctxsize		=	sizeof(struct aes_ctx),
-	.cra_module		=	THIS_MODULE,
-	.cra_list		=	LIST_HEAD_INIT(aes_alg.cra_list),
-	.cra_u			=	{
-		.cipher = {
-			.cia_min_keysize	=	AES_MIN_KEY_SIZE,
-			.cia_max_keysize	=	AES_MAX_KEY_SIZE,
-			.cia_setkey	   	= 	aes_set_key,
-			.cia_encrypt	 	=	aes_encrypt,
-			.cia_decrypt	  	=	aes_decrypt
-		}
-	}
-};
-
-static int __init aes_init(void)
-{
-	gen_tabs();
-	return crypto_register_alg(&aes_alg);
-}
-
-static void __exit aes_fini(void)
-{
-	crypto_unregister_alg(&aes_alg);
-}
-
-module_init(aes_init);
-module_exit(aes_fini);
-
-MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, i586 asm optimized");
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_AUTHOR("Fruhwirth Clemens, James Morris, Brian Gladman, Adam Richter");
-MODULE_ALIAS("aes");
diff --git a/arch/x86/crypto/aes_64.c b/arch/x86/crypto/aes_64.c
deleted file mode 100644
index 5cdb13ea5cc2..000000000000
--- a/arch/x86/crypto/aes_64.c
+++ /dev/null
@@ -1,336 +0,0 @@
-/*
- * Cryptographic API.
- *
- * AES Cipher Algorithm.
- *
- * Based on Brian Gladman's code.
- *
- * Linux developers:
- *  Alexander Kjeldaas <astor@fast.no>
- *  Herbert Valerio Riedel <hvr@hvrlab.org>
- *  Kyle McMartin <kyle@debian.org>
- *  Adam J. Richter <adam@yggdrasil.com> (conversion to 2.5 API).
- *  Andreas Steinmetz <ast@domdv.de> (adapted to x86_64 assembler)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * ---------------------------------------------------------------------------
- * Copyright (c) 2002, Dr Brian Gladman <brg@gladman.me.uk>, Worcester, UK.
- * All rights reserved.
- *
- * LICENSE TERMS
- *
- * The free distribution and use of this software in both source and binary
- * form is allowed (with or without changes) provided that:
- *
- *   1. distributions of this source code include the above copyright
- *      notice, this list of conditions and the following disclaimer;
- *
- *   2. distributions in binary form include the above copyright
- *      notice, this list of conditions and the following disclaimer
- *      in the documentation and/or other associated materials;
- *
- *   3. the copyright holder's name is not used to endorse products
- *      built using this software without specific written permission.
- *
- * ALTERNATIVELY, provided that this notice is retained in full, this product
- * may be distributed under the terms of the GNU General Public License (GPL),
- * in which case the provisions of the GPL apply INSTEAD OF those given above.
- *
- * DISCLAIMER
- *
- * This software is provided 'as is' with no explicit or implied warranties
- * in respect of its properties, including, but not limited to, correctness
- * and/or fitness for purpose.
- * ---------------------------------------------------------------------------
- */
-
-/* Some changes from the Gladman version:
-    s/RIJNDAEL(e_key)/E_KEY/g
-    s/RIJNDAEL(d_key)/D_KEY/g
-*/
-
-#include <asm/byteorder.h>
-#include <linux/bitops.h>
-#include <linux/crypto.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/types.h>
-
-#define AES_MIN_KEY_SIZE	16
-#define AES_MAX_KEY_SIZE	32
-
-#define AES_BLOCK_SIZE		16
-
-/*
- * #define byte(x, nr) ((unsigned char)((x) >> (nr*8)))
- */
-static inline u8 byte(const u32 x, const unsigned n)
-{
-	return x >> (n << 3);
-}
-
-struct aes_ctx
-{
-	u32 key_length;
-	u32 buf[120];
-};
-
-#define E_KEY (&ctx->buf[0])
-#define D_KEY (&ctx->buf[60])
-
-static u8 pow_tab[256] __initdata;
-static u8 log_tab[256] __initdata;
-static u8 sbx_tab[256] __initdata;
-static u8 isb_tab[256] __initdata;
-static u32 rco_tab[10];
-u32 aes_ft_tab[4][256];
-u32 aes_it_tab[4][256];
-
-u32 aes_fl_tab[4][256];
-u32 aes_il_tab[4][256];
-
-static inline u8 f_mult(u8 a, u8 b)
-{
-	u8 aa = log_tab[a], cc = aa + log_tab[b];
-
-	return pow_tab[cc + (cc < aa ? 1 : 0)];
-}
-
-#define ff_mult(a, b) (a && b ? f_mult(a, b) : 0)
-
-#define ls_box(x)				\
-	(aes_fl_tab[0][byte(x, 0)] ^		\
-	 aes_fl_tab[1][byte(x, 1)] ^		\
-	 aes_fl_tab[2][byte(x, 2)] ^		\
-	 aes_fl_tab[3][byte(x, 3)])
-
-static void __init gen_tabs(void)
-{
-	u32 i, t;
-	u8 p, q;
-
-	/* log and power tables for GF(2**8) finite field with
-	   0x011b as modular polynomial - the simplest primitive
-	   root is 0x03, used here to generate the tables */
-
-	for (i = 0, p = 1; i < 256; ++i) {
-		pow_tab[i] = (u8)p;
-		log_tab[p] = (u8)i;
-
-		p ^= (p << 1) ^ (p & 0x80 ? 0x01b : 0);
-	}
-
-	log_tab[1] = 0;
-
-	for (i = 0, p = 1; i < 10; ++i) {
-		rco_tab[i] = p;
-
-		p = (p << 1) ^ (p & 0x80 ? 0x01b : 0);
-	}
-
-	for (i = 0; i < 256; ++i) {
-		p = (i ? pow_tab[255 - log_tab[i]] : 0);
-		q = ((p >> 7) | (p << 1)) ^ ((p >> 6) | (p << 2));
-		p ^= 0x63 ^ q ^ ((q >> 6) | (q << 2));
-		sbx_tab[i] = p;
-		isb_tab[p] = (u8)i;
-	}
-
-	for (i = 0; i < 256; ++i) {
-		p = sbx_tab[i];
-
-		t = p;
-		aes_fl_tab[0][i] = t;
-		aes_fl_tab[1][i] = rol32(t, 8);
-		aes_fl_tab[2][i] = rol32(t, 16);
-		aes_fl_tab[3][i] = rol32(t, 24);
-
-		t = ((u32)ff_mult(2, p)) |
-		    ((u32)p << 8) |
-		    ((u32)p << 16) | ((u32)ff_mult(3, p) << 24);
-
-		aes_ft_tab[0][i] = t;
-		aes_ft_tab[1][i] = rol32(t, 8);
-		aes_ft_tab[2][i] = rol32(t, 16);
-		aes_ft_tab[3][i] = rol32(t, 24);
-
-		p = isb_tab[i];
-
-		t = p;
-		aes_il_tab[0][i] = t;
-		aes_il_tab[1][i] = rol32(t, 8);
-		aes_il_tab[2][i] = rol32(t, 16);
-		aes_il_tab[3][i] = rol32(t, 24);
-
-		t = ((u32)ff_mult(14, p)) |
-		    ((u32)ff_mult(9, p) << 8) |
-		    ((u32)ff_mult(13, p) << 16) |
-		    ((u32)ff_mult(11, p) << 24);
-
-		aes_it_tab[0][i] = t;
-		aes_it_tab[1][i] = rol32(t, 8);
-		aes_it_tab[2][i] = rol32(t, 16);
-		aes_it_tab[3][i] = rol32(t, 24);
-	}
-}
-
-#define star_x(x) (((x) & 0x7f7f7f7f) << 1) ^ ((((x) & 0x80808080) >> 7) * 0x1b)
-
-#define imix_col(y, x)			\
-	u    = star_x(x);		\
-	v    = star_x(u);		\
-	w    = star_x(v);		\
-	t    = w ^ (x);			\
-	(y)  = u ^ v ^ w;		\
-	(y) ^= ror32(u ^ t,  8) ^	\
-	       ror32(v ^ t, 16) ^	\
-	       ror32(t, 24)
-
-/* initialise the key schedule from the user supplied key */
-
-#define loop4(i)					\
-{							\
-	t = ror32(t,  8); t = ls_box(t) ^ rco_tab[i];	\
-	t ^= E_KEY[4 * i];     E_KEY[4 * i + 4] = t;	\
-	t ^= E_KEY[4 * i + 1]; E_KEY[4 * i + 5] = t;	\
-	t ^= E_KEY[4 * i + 2]; E_KEY[4 * i + 6] = t;	\
-	t ^= E_KEY[4 * i + 3]; E_KEY[4 * i + 7] = t;	\
-}
-
-#define loop6(i)					\
-{							\
-	t = ror32(t,  8); t = ls_box(t) ^ rco_tab[i];	\
-	t ^= E_KEY[6 * i];     E_KEY[6 * i + 6] = t;	\
-	t ^= E_KEY[6 * i + 1]; E_KEY[6 * i + 7] = t;	\
-	t ^= E_KEY[6 * i + 2]; E_KEY[6 * i + 8] = t;	\
-	t ^= E_KEY[6 * i + 3]; E_KEY[6 * i + 9] = t;	\
-	t ^= E_KEY[6 * i + 4]; E_KEY[6 * i + 10] = t;	\
-	t ^= E_KEY[6 * i + 5]; E_KEY[6 * i + 11] = t;	\
-}
-
-#define loop8(i)					\
-{							\
-	t = ror32(t,  8); ; t = ls_box(t) ^ rco_tab[i];	\
-	t ^= E_KEY[8 * i];     E_KEY[8 * i + 8] = t;	\
-	t ^= E_KEY[8 * i + 1]; E_KEY[8 * i + 9] = t;	\
-	t ^= E_KEY[8 * i + 2]; E_KEY[8 * i + 10] = t;	\
-	t ^= E_KEY[8 * i + 3]; E_KEY[8 * i + 11] = t;	\
-	t  = E_KEY[8 * i + 4] ^ ls_box(t);		\
-	E_KEY[8 * i + 12] = t;				\
-	t ^= E_KEY[8 * i + 5]; E_KEY[8 * i + 13] = t;	\
-	t ^= E_KEY[8 * i + 6]; E_KEY[8 * i + 14] = t;	\
-	t ^= E_KEY[8 * i + 7]; E_KEY[8 * i + 15] = t;	\
-}
-
-static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
-		       unsigned int key_len)
-{
-	struct aes_ctx *ctx = crypto_tfm_ctx(tfm);
-	const __le32 *key = (const __le32 *)in_key;
-	u32 *flags = &tfm->crt_flags;
-	u32 i, j, t, u, v, w;
-
-	if (key_len % 8) {
-		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
-		return -EINVAL;
-	}
-
-	ctx->key_length = key_len;
-
-	D_KEY[key_len + 24] = E_KEY[0] = le32_to_cpu(key[0]);
-	D_KEY[key_len + 25] = E_KEY[1] = le32_to_cpu(key[1]);
-	D_KEY[key_len + 26] = E_KEY[2] = le32_to_cpu(key[2]);
-	D_KEY[key_len + 27] = E_KEY[3] = le32_to_cpu(key[3]);
-
-	switch (key_len) {
-	case 16:
-		t = E_KEY[3];
-		for (i = 0; i < 10; ++i)
-			loop4(i);
-		break;
-
-	case 24:
-		E_KEY[4] = le32_to_cpu(key[4]);
-		t = E_KEY[5] = le32_to_cpu(key[5]);
-		for (i = 0; i < 8; ++i)
-			loop6 (i);
-		break;
-
-	case 32:
-		E_KEY[4] = le32_to_cpu(key[4]);
-		E_KEY[5] = le32_to_cpu(key[5]);
-		E_KEY[6] = le32_to_cpu(key[6]);
-		t = E_KEY[7] = le32_to_cpu(key[7]);
-		for (i = 0; i < 7; ++i)
-			loop8(i);
-		break;
-	}
-
-	D_KEY[0] = E_KEY[key_len + 24];
-	D_KEY[1] = E_KEY[key_len + 25];
-	D_KEY[2] = E_KEY[key_len + 26];
-	D_KEY[3] = E_KEY[key_len + 27];
-
-	for (i = 4; i < key_len + 24; ++i) {
-		j = key_len + 24 - (i & ~3) + (i & 3);
-		imix_col(D_KEY[j], E_KEY[i]);
-	}
-
-	return 0;
-}
-
-asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
-asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
-
-static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
-	aes_enc_blk(tfm, dst, src);
-}
-
-static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
-	aes_dec_blk(tfm, dst, src);
-}
-
-static struct crypto_alg aes_alg = {
-	.cra_name		=	"aes",
-	.cra_driver_name	=	"aes-x86_64",
-	.cra_priority		=	200,
-	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
-	.cra_blocksize		=	AES_BLOCK_SIZE,
-	.cra_ctxsize		=	sizeof(struct aes_ctx),
-	.cra_module		=	THIS_MODULE,
-	.cra_list		=	LIST_HEAD_INIT(aes_alg.cra_list),
-	.cra_u			=	{
-		.cipher = {
-			.cia_min_keysize	=	AES_MIN_KEY_SIZE,
-			.cia_max_keysize	=	AES_MAX_KEY_SIZE,
-			.cia_setkey	   	= 	aes_set_key,
-			.cia_encrypt	 	=	aes_encrypt,
-			.cia_decrypt	  	=	aes_decrypt
-		}
-	}
-};
-
-static int __init aes_init(void)
-{
-	gen_tabs();
-	return crypto_register_alg(&aes_alg);
-}
-
-static void __exit aes_fini(void)
-{
-	crypto_unregister_alg(&aes_alg);
-}
-
-module_init(aes_init);
-module_exit(aes_fini);
-
-MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("aes");
diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c
new file mode 100644
index 000000000000..71f457827116
--- /dev/null
+++ b/arch/x86/crypto/aes_glue.c
@@ -0,0 +1,73 @@
+/*
+ * Glue code for the asm-optimized version of the AES cipher algorithm.
+ *
+ * The encrypt/decrypt block routines are implemented in assembly; this file
+ * wraps them as cipher callbacks and registers them with the crypto API.
+ * Key setup is delegated to crypto_aes_set_key().
+ */
+
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <crypto/aes.h>
+
+/* Implemented in assembly; both take the tfm plus output and input buffers. */
+asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
+asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
+
+/* cia_encrypt callback: forwards to the assembly routine aes_enc_blk(). */
+static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	aes_enc_blk(tfm, dst, src);
+}
+
+/* cia_decrypt callback: forwards to the assembly routine aes_dec_blk(). */
+static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	aes_dec_blk(tfm, dst, src);
+}
+
+/*
+ * Cipher descriptor: algorithm name "aes", driver name "aes-asm",
+ * priority 200.  The per-tfm context is a struct crypto_aes_ctx and key
+ * setup goes through crypto_aes_set_key().
+ */
+static struct crypto_alg aes_alg = {
+	.cra_name		= "aes",
+	.cra_driver_name	= "aes-asm",
+	.cra_priority		= 200,
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(aes_alg.cra_list),
+	.cra_u	= {
+		.cipher	= {
+			.cia_min_keysize	= AES_MIN_KEY_SIZE,
+			.cia_max_keysize	= AES_MAX_KEY_SIZE,
+			.cia_setkey		= crypto_aes_set_key,
+			.cia_encrypt		= aes_encrypt,
+			.cia_decrypt		= aes_decrypt,
+		},
+	},
+};
+
+/* Module init: register the cipher with the crypto API. */
+static int __init aes_init(void)
+{
+	return crypto_register_alg(&aes_alg);
+}
+
+/* Module exit: unregister the cipher. */
+static void __exit aes_fini(void)
+{
+	crypto_unregister_alg(&aes_alg);
+}
+
+module_init(aes_init);
+module_exit(aes_fini);
+
+MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, asm optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("aes");
+MODULE_ALIAS("aes-asm");
diff --git a/arch/x86/crypto/salsa20-i586-asm_32.S b/arch/x86/crypto/salsa20-i586-asm_32.S
new file mode 100644
index 000000000000..72eb306680b2
--- /dev/null
+++ b/arch/x86/crypto/salsa20-i586-asm_32.S
@@ -0,0 +1,1114 @@
+# salsa20_pm.s version 20051229
+# D. J. Bernstein
+# Public domain.
+
+# ECRYPT_encrypt_bytes(x, m, out, bytes): XOR "bytes" bytes of m with keystream from state x into out
+.text
+.p2align 5
+.globl ECRYPT_encrypt_bytes
+ECRYPT_encrypt_bytes:
+	mov	%esp,%eax
+	and	$31,%eax
+	add	$256,%eax
+	sub	%eax,%esp
+	# eax_stack = eax (frame size, 256 + (esp & 31); added back to esp on exit)
+	movl	%eax,80(%esp)
+	# ebx_stack = ebx
+	movl	%ebx,84(%esp)
+	# esi_stack = esi
+	movl	%esi,88(%esp)
+	# edi_stack = edi
+	movl	%edi,92(%esp)
+	# ebp_stack = ebp
+	movl	%ebp,96(%esp)
+	# x = arg1
+	movl	4(%esp,%eax),%edx
+	# m = arg2
+	movl	8(%esp,%eax),%esi
+	# out = arg3
+	movl	12(%esp,%eax),%edi
+	# bytes = arg4
+	movl	16(%esp,%eax),%ebx
+	# bytes -= 0 (only sets the flags)
+	sub	$0,%ebx
+	# goto done if unsigned<= (taken when bytes == 0)
+	jbe	._done
+._start:
+	# in0 = *(uint32 *) (x + 0)
+	movl	0(%edx),%eax
+	# in1 = *(uint32 *) (x + 4)
+	movl	4(%edx),%ecx
+	# in2 = *(uint32 *) (x + 8)
+	movl	8(%edx),%ebp
+	# j0 = in0
+	movl	%eax,164(%esp)
+	# in3 = *(uint32 *) (x + 12)
+	movl	12(%edx),%eax
+	# j1 = in1
+	movl	%ecx,168(%esp)
+	# in4 = *(uint32 *) (x + 16)
+	movl	16(%edx),%ecx
+	# j2 = in2
+	movl	%ebp,172(%esp)
+	# in5 = *(uint32 *) (x + 20)
+	movl	20(%edx),%ebp
+	# j3 = in3
+	movl	%eax,176(%esp)
+	# in6 = *(uint32 *) (x + 24)
+	movl	24(%edx),%eax
+	# j4 = in4
+	movl	%ecx,180(%esp)
+	# in7 = *(uint32 *) (x + 28)
+	movl	28(%edx),%ecx
+	# j5 = in5
+	movl	%ebp,184(%esp)
+	# in8 = *(uint32 *) (x + 32)
+	movl	32(%edx),%ebp
+	# j6 = in6
+	movl	%eax,188(%esp)
+	# in9 = *(uint32 *) (x + 36)
+	movl	36(%edx),%eax
+	# j7 = in7
+	movl	%ecx,192(%esp)
+	# in10 = *(uint32 *) (x + 40)
+	movl	40(%edx),%ecx
+	# j8 = in8
+	movl	%ebp,196(%esp)
+	# in11 = *(uint32 *) (x + 44)
+	movl	44(%edx),%ebp
+	# j9 = in9
+	movl	%eax,200(%esp)
+	# in12 = *(uint32 *) (x + 48)
+	movl	48(%edx),%eax
+	# j10 = in10
+	movl	%ecx,204(%esp)
+	# in13 = *(uint32 *) (x + 52)
+	movl	52(%edx),%ecx
+	# j11 = in11
+	movl	%ebp,208(%esp)
+	# in14 = *(uint32 *) (x + 56)
+	movl	56(%edx),%ebp
+	# j12 = in12
+	movl	%eax,212(%esp)
+	# in15 = *(uint32 *) (x + 60)
+	movl	60(%edx),%eax
+	# j13 = in13
+	movl	%ecx,216(%esp)
+	# j14 = in14
+	movl	%ebp,220(%esp)
+	# j15 = in15
+	movl	%eax,224(%esp)
+	# x_backup = x (edx is reused below)
+	movl	%edx,64(%esp)
+._bytesatleast1:
+	#   bytes - 64
+	cmp	$64,%ebx
+	#   goto nocopy if unsigned>=
+	jae	._nocopy
+	#     ctarget = out (short final block: remember the real destination)
+	movl	%edi,228(%esp)
+	#     out = &tmp (64-byte scratch area at the bottom of the frame)
+	leal	0(%esp),%edi
+	#     i = bytes
+	mov	%ebx,%ecx
+	#     while (i) { *out++ = *m++; --i }
+	rep	movsb
+	#     out = &tmp
+	leal	0(%esp),%edi
+	#     m = &tmp
+	leal	0(%esp),%esi
+._nocopy:
+	#   out_backup = out
+	movl	%edi,72(%esp)
+	#   m_backup = m
+	movl	%esi,68(%esp)
+	#   bytes_backup = bytes
+	movl	%ebx,76(%esp)
+	#   in0 = j0
+	movl	164(%esp),%eax
+	#   in1 = j1
+	movl	168(%esp),%ecx
+	#   in2 = j2
+	movl	172(%esp),%edx
+	#   in3 = j3
+	movl	176(%esp),%ebx
+	#   x0 = in0
+	movl	%eax,100(%esp)
+	#   x1 = in1
+	movl	%ecx,104(%esp)
+	#   x2 = in2
+	movl	%edx,108(%esp)
+	#   x3 = in3
+	movl	%ebx,112(%esp)
+	#   in4 = j4
+	movl	180(%esp),%eax
+	#   in5 = j5
+	movl	184(%esp),%ecx
+	#   in6 = j6
+	movl	188(%esp),%edx
+	#   in7 = j7
+	movl	192(%esp),%ebx
+	#   x4 = in4
+	movl	%eax,116(%esp)
+	#   x5 = in5
+	movl	%ecx,120(%esp)
+	#   x6 = in6
+	movl	%edx,124(%esp)
+	#   x7 = in7
+	movl	%ebx,128(%esp)
+	#   in8 = j8
+	movl	196(%esp),%eax
+	#   in9 = j9
+	movl	200(%esp),%ecx
+	#   in10 = j10
+	movl	204(%esp),%edx
+	#   in11 = j11
+	movl	208(%esp),%ebx
+	#   x8 = in8
+	movl	%eax,132(%esp)
+	#   x9 = in9
+	movl	%ecx,136(%esp)
+	#   x10 = in10
+	movl	%edx,140(%esp)
+	#   x11 = in11
+	movl	%ebx,144(%esp)
+	#   in12 = j12
+	movl	212(%esp),%eax
+	#   in13 = j13
+	movl	216(%esp),%ecx
+	#   in14 = j14
+	movl	220(%esp),%edx
+	#   in15 = j15
+	movl	224(%esp),%ebx
+	#   x12 = in12
+	movl	%eax,148(%esp)
+	#   x13 = in13
+	movl	%ecx,152(%esp)
+	#   x14 = in14
+	movl	%edx,156(%esp)
+	#   x15 = in15
+	movl	%ebx,160(%esp)
+	#   i = 20 (rounds; each pass through mainloop does 4)
+	mov	$20,%ebp
+	# p = x0
+	movl	100(%esp),%eax
+	# s = x5
+	movl	120(%esp),%ecx
+	# t = x10
+	movl	140(%esp),%edx
+	# w = x15
+	movl	160(%esp),%ebx
+._mainloop:
+	# x0 = p
+	movl	%eax,100(%esp)
+	# 				x10 = t
+	movl	%edx,140(%esp)
+	# p += x12
+	addl	148(%esp),%eax
+	# 		x5 = s
+	movl	%ecx,120(%esp)
+	# 				t += x6
+	addl	124(%esp),%edx
+	# 						x15 = w
+	movl	%ebx,160(%esp)
+	# 		r = x1
+	movl	104(%esp),%esi
+	# 		r += s
+	add	%ecx,%esi
+	# 						v = x11
+	movl	144(%esp),%edi
+	# 						v += w
+	add	%ebx,%edi
+	# p <<<= 7
+	rol	$7,%eax
+	# p ^= x4
+	xorl	116(%esp),%eax
+	# 				t <<<= 7
+	rol	$7,%edx
+	# 				t ^= x14
+	xorl	156(%esp),%edx
+	# 		r <<<= 7
+	rol	$7,%esi
+	# 		r ^= x9
+	xorl	136(%esp),%esi
+	# 						v <<<= 7
+	rol	$7,%edi
+	# 						v ^= x3
+	xorl	112(%esp),%edi
+	# x4 = p
+	movl	%eax,116(%esp)
+	# 				x14 = t
+	movl	%edx,156(%esp)
+	# p += x0
+	addl	100(%esp),%eax
+	# 		x9 = r
+	movl	%esi,136(%esp)
+	# 				t += x10
+	addl	140(%esp),%edx
+	# 						x3 = v
+	movl	%edi,112(%esp)
+	# p <<<= 9
+	rol	$9,%eax
+	# p ^= x8
+	xorl	132(%esp),%eax
+	# 				t <<<= 9
+	rol	$9,%edx
+	# 				t ^= x2
+	xorl	108(%esp),%edx
+	# 		s += r
+	add	%esi,%ecx
+	# 		s <<<= 9
+	rol	$9,%ecx
+	# 		s ^= x13
+	xorl	152(%esp),%ecx
+	# 						w += v
+	add	%edi,%ebx
+	# 						w <<<= 9
+	rol	$9,%ebx
+	# 						w ^= x7
+	xorl	128(%esp),%ebx
+	# x8 = p
+	movl	%eax,132(%esp)
+	# 				x2 = t
+	movl	%edx,108(%esp)
+	# p += x4
+	addl	116(%esp),%eax
+	# 		x13 = s
+	movl	%ecx,152(%esp)
+	# 				t += x14
+	addl	156(%esp),%edx
+	# 						x7 = w
+	movl	%ebx,128(%esp)
+	# p <<<= 13
+	rol	$13,%eax
+	# p ^= x12
+	xorl	148(%esp),%eax
+	# 				t <<<= 13
+	rol	$13,%edx
+	# 				t ^= x6
+	xorl	124(%esp),%edx
+	# 		r += s
+	add	%ecx,%esi
+	# 		r <<<= 13
+	rol	$13,%esi
+	# 		r ^= x1
+	xorl	104(%esp),%esi
+	# 						v += w
+	add	%ebx,%edi
+	# 						v <<<= 13
+	rol	$13,%edi
+	# 						v ^= x11
+	xorl	144(%esp),%edi
+	# x12 = p
+	movl	%eax,148(%esp)
+	# 				x6 = t
+	movl	%edx,124(%esp)
+	# p += x8
+	addl	132(%esp),%eax
+	# 		x1 = r
+	movl	%esi,104(%esp)
+	# 				t += x2
+	addl	108(%esp),%edx
+	# 						x11 = v
+	movl	%edi,144(%esp)
+	# p <<<= 18
+	rol	$18,%eax
+	# p ^= x0
+	xorl	100(%esp),%eax
+	# 				t <<<= 18
+	rol	$18,%edx
+	# 				t ^= x10
+	xorl	140(%esp),%edx
+	# 		s += r
+	add	%esi,%ecx
+	# 		s <<<= 18
+	rol	$18,%ecx
+	# 		s ^= x5
+	xorl	120(%esp),%ecx
+	# 						w += v
+	add	%edi,%ebx
+	# 						w <<<= 18
+	rol	$18,%ebx
+	# 						w ^= x15
+	xorl	160(%esp),%ebx
+	# x0 = p
+	movl	%eax,100(%esp)
+	# 				x10 = t
+	movl	%edx,140(%esp)
+	# p += x3
+	addl	112(%esp),%eax
+	# p <<<= 7
+	rol	$7,%eax
+	# 		x5 = s
+	movl	%ecx,120(%esp)
+	# 				t += x9
+	addl	136(%esp),%edx
+	# 						x15 = w
+	movl	%ebx,160(%esp)
+	# 		r = x4
+	movl	116(%esp),%esi
+	# 		r += s
+	add	%ecx,%esi
+	# 						v = x14
+	movl	156(%esp),%edi
+	# 						v += w
+	add	%ebx,%edi
+	# p ^= x1
+	xorl	104(%esp),%eax
+	# 				t <<<= 7
+	rol	$7,%edx
+	# 				t ^= x11
+	xorl	144(%esp),%edx
+	# 		r <<<= 7
+	rol	$7,%esi
+	# 		r ^= x6
+	xorl	124(%esp),%esi
+	# 						v <<<= 7
+	rol	$7,%edi
+	# 						v ^= x12
+	xorl	148(%esp),%edi
+	# x1 = p
+	movl	%eax,104(%esp)
+	# 				x11 = t
+	movl	%edx,144(%esp)
+	# p += x0
+	addl	100(%esp),%eax
+	# 		x6 = r
+	movl	%esi,124(%esp)
+	# 				t += x10
+	addl	140(%esp),%edx
+	# 						x12 = v
+	movl	%edi,148(%esp)
+	# p <<<= 9
+	rol	$9,%eax
+	# p ^= x2
+	xorl	108(%esp),%eax
+	# 				t <<<= 9
+	rol	$9,%edx
+	# 				t ^= x8
+	xorl	132(%esp),%edx
+	# 		s += r
+	add	%esi,%ecx
+	# 		s <<<= 9
+	rol	$9,%ecx
+	# 		s ^= x7
+	xorl	128(%esp),%ecx
+	# 						w += v
+	add	%edi,%ebx
+	# 						w <<<= 9
+	rol	$9,%ebx
+	# 						w ^= x13
+	xorl	152(%esp),%ebx
+	# x2 = p
+	movl	%eax,108(%esp)
+	# 				x8 = t
+	movl	%edx,132(%esp)
+	# p += x1
+	addl	104(%esp),%eax
+	# 		x7 = s
+	movl	%ecx,128(%esp)
+	# 				t += x11
+	addl	144(%esp),%edx
+	# 						x13 = w
+	movl	%ebx,152(%esp)
+	# p <<<= 13
+	rol	$13,%eax
+	# p ^= x3
+	xorl	112(%esp),%eax
+	# 				t <<<= 13
+	rol	$13,%edx
+	# 				t ^= x9
+	xorl	136(%esp),%edx
+	# 		r += s
+	add	%ecx,%esi
+	# 		r <<<= 13
+	rol	$13,%esi
+	# 		r ^= x4
+	xorl	116(%esp),%esi
+	# 						v += w
+	add	%ebx,%edi
+	# 						v <<<= 13
+	rol	$13,%edi
+	# 						v ^= x14
+	xorl	156(%esp),%edi
+	# x3 = p
+	movl	%eax,112(%esp)
+	# 				x9 = t
+	movl	%edx,136(%esp)
+	# p += x2
+	addl	108(%esp),%eax
+	# 		x4 = r
+	movl	%esi,116(%esp)
+	# 				t += x8
+	addl	132(%esp),%edx
+	# 						x14 = v
+	movl	%edi,156(%esp)
+	# p <<<= 18
+	rol	$18,%eax
+	# p ^= x0
+	xorl	100(%esp),%eax
+	# 				t <<<= 18
+	rol	$18,%edx
+	# 				t ^= x10
+	xorl	140(%esp),%edx
+	# 		s += r
+	add	%esi,%ecx
+	# 		s <<<= 18
+	rol	$18,%ecx
+	# 		s ^= x5
+	xorl	120(%esp),%ecx
+	# 						w += v
+	add	%edi,%ebx
+	# 						w <<<= 18
+	rol	$18,%ebx
+	# 						w ^= x15
+	xorl	160(%esp),%ebx
+	# x0 = p
+	movl	%eax,100(%esp)
+	# 				x10 = t
+	movl	%edx,140(%esp)
+	# p += x12
+	addl	148(%esp),%eax
+	# 		x5 = s
+	movl	%ecx,120(%esp)
+	# 				t += x6
+	addl	124(%esp),%edx
+	# 						x15 = w
+	movl	%ebx,160(%esp)
+	# 		r = x1
+	movl	104(%esp),%esi
+	# 		r += s
+	add	%ecx,%esi
+	# 						v = x11
+	movl	144(%esp),%edi
+	# 						v += w
+	add	%ebx,%edi
+	# p <<<= 7
+	rol	$7,%eax
+	# p ^= x4
+	xorl	116(%esp),%eax
+	# 				t <<<= 7
+	rol	$7,%edx
+	# 				t ^= x14
+	xorl	156(%esp),%edx
+	# 		r <<<= 7
+	rol	$7,%esi
+	# 		r ^= x9
+	xorl	136(%esp),%esi
+	# 						v <<<= 7
+	rol	$7,%edi
+	# 						v ^= x3
+	xorl	112(%esp),%edi
+	# x4 = p
+	movl	%eax,116(%esp)
+	# 				x14 = t
+	movl	%edx,156(%esp)
+	# p += x0
+	addl	100(%esp),%eax
+	# 		x9 = r
+	movl	%esi,136(%esp)
+	# 				t += x10
+	addl	140(%esp),%edx
+	# 						x3 = v
+	movl	%edi,112(%esp)
+	# p <<<= 9
+	rol	$9,%eax
+	# p ^= x8
+	xorl	132(%esp),%eax
+	# 				t <<<= 9
+	rol	$9,%edx
+	# 				t ^= x2
+	xorl	108(%esp),%edx
+	# 		s += r
+	add	%esi,%ecx
+	# 		s <<<= 9
+	rol	$9,%ecx
+	# 		s ^= x13
+	xorl	152(%esp),%ecx
+	# 						w += v
+	add	%edi,%ebx
+	# 						w <<<= 9
+	rol	$9,%ebx
+	# 						w ^= x7
+	xorl	128(%esp),%ebx
+	# x8 = p
+	movl	%eax,132(%esp)
+	# 				x2 = t
+	movl	%edx,108(%esp)
+	# p += x4
+	addl	116(%esp),%eax
+	# 		x13 = s
+	movl	%ecx,152(%esp)
+	# 				t += x14
+	addl	156(%esp),%edx
+	# 						x7 = w
+	movl	%ebx,128(%esp)
+	# p <<<= 13
+	rol	$13,%eax
+	# p ^= x12
+	xorl	148(%esp),%eax
+	# 				t <<<= 13
+	rol	$13,%edx
+	# 				t ^= x6
+	xorl	124(%esp),%edx
+	# 		r += s
+	add	%ecx,%esi
+	# 		r <<<= 13
+	rol	$13,%esi
+	# 		r ^= x1
+	xorl	104(%esp),%esi
+	# 						v += w
+	add	%ebx,%edi
+	# 						v <<<= 13
+	rol	$13,%edi
+	# 						v ^= x11
+	xorl	144(%esp),%edi
+	# x12 = p
+	movl	%eax,148(%esp)
+	# 				x6 = t
+	movl	%edx,124(%esp)
+	# p += x8
+	addl	132(%esp),%eax
+	# 		x1 = r
+	movl	%esi,104(%esp)
+	# 				t += x2
+	addl	108(%esp),%edx
+	# 						x11 = v
+	movl	%edi,144(%esp)
+	# p <<<= 18
+	rol	$18,%eax
+	# p ^= x0
+	xorl	100(%esp),%eax
+	# 				t <<<= 18
+	rol	$18,%edx
+	# 				t ^= x10
+	xorl	140(%esp),%edx
+	# 		s += r
+	add	%esi,%ecx
+	# 		s <<<= 18
+	rol	$18,%ecx
+	# 		s ^= x5
+	xorl	120(%esp),%ecx
+	# 						w += v
+	add	%edi,%ebx
+	# 						w <<<= 18
+	rol	$18,%ebx
+	# 						w ^= x15
+	xorl	160(%esp),%ebx
+	# x0 = p
+	movl	%eax,100(%esp)
+	# 				x10 = t
+	movl	%edx,140(%esp)
+	# p += x3
+	addl	112(%esp),%eax
+	# p <<<= 7
+	rol	$7,%eax
+	# 		x5 = s
+	movl	%ecx,120(%esp)
+	# 				t += x9
+	addl	136(%esp),%edx
+	# 						x15 = w
+	movl	%ebx,160(%esp)
+	# 		r = x4
+	movl	116(%esp),%esi
+	# 		r += s
+	add	%ecx,%esi
+	# 						v = x14
+	movl	156(%esp),%edi
+	# 						v += w
+	add	%ebx,%edi
+	# p ^= x1
+	xorl	104(%esp),%eax
+	# 				t <<<= 7
+	rol	$7,%edx
+	# 				t ^= x11
+	xorl	144(%esp),%edx
+	# 		r <<<= 7
+	rol	$7,%esi
+	# 		r ^= x6
+	xorl	124(%esp),%esi
+	# 						v <<<= 7
+	rol	$7,%edi
+	# 						v ^= x12
+	xorl	148(%esp),%edi
+	# x1 = p
+	movl	%eax,104(%esp)
+	# 				x11 = t
+	movl	%edx,144(%esp)
+	# p += x0
+	addl	100(%esp),%eax
+	# 		x6 = r
+	movl	%esi,124(%esp)
+	# 				t += x10
+	addl	140(%esp),%edx
+	# 						x12 = v
+	movl	%edi,148(%esp)
+	# p <<<= 9
+	rol	$9,%eax
+	# p ^= x2
+	xorl	108(%esp),%eax
+	# 				t <<<= 9
+	rol	$9,%edx
+	# 				t ^= x8
+	xorl	132(%esp),%edx
+	# 		s += r
+	add	%esi,%ecx
+	# 		s <<<= 9
+	rol	$9,%ecx
+	# 		s ^= x7
+	xorl	128(%esp),%ecx
+	# 						w += v
+	add	%edi,%ebx
+	# 						w <<<= 9
+	rol	$9,%ebx
+	# 						w ^= x13
+	xorl	152(%esp),%ebx
+	# x2 = p
+	movl	%eax,108(%esp)
+	# 				x8 = t
+	movl	%edx,132(%esp)
+	# p += x1
+	addl	104(%esp),%eax
+	# 		x7 = s
+	movl	%ecx,128(%esp)
+	# 				t += x11
+	addl	144(%esp),%edx
+	# 						x13 = w
+	movl	%ebx,152(%esp)
+	# p <<<= 13
+	rol	$13,%eax
+	# p ^= x3
+	xorl	112(%esp),%eax
+	# 				t <<<= 13
+	rol	$13,%edx
+	# 				t ^= x9
+	xorl	136(%esp),%edx
+	# 		r += s
+	add	%ecx,%esi
+	# 		r <<<= 13
+	rol	$13,%esi
+	# 		r ^= x4
+	xorl	116(%esp),%esi
+	# 						v += w
+	add	%ebx,%edi
+	# 						v <<<= 13
+	rol	$13,%edi
+	# 						v ^= x14
+	xorl	156(%esp),%edi
+	# x3 = p
+	movl	%eax,112(%esp)
+	# 				x9 = t
+	movl	%edx,136(%esp)
+	# p += x2
+	addl	108(%esp),%eax
+	# 		x4 = r
+	movl	%esi,116(%esp)
+	# 				t += x8
+	addl	132(%esp),%edx
+	# 						x14 = v
+	movl	%edi,156(%esp)
+	# p <<<= 18
+	rol	$18,%eax
+	# p ^= x0
+	xorl	100(%esp),%eax
+	# 				t <<<= 18
+	rol	$18,%edx
+	# 				t ^= x10
+	xorl	140(%esp),%edx
+	# 		s += r
+	add	%esi,%ecx
+	# 		s <<<= 18
+	rol	$18,%ecx
+	# 		s ^= x5
+	xorl	120(%esp),%ecx
+	# 						w += v
+	add	%edi,%ebx
+	# 						w <<<= 18
+	rol	$18,%ebx
+	# 						w ^= x15
+	xorl	160(%esp),%ebx
+	# i -= 4
+	sub	$4,%ebp
+	# goto mainloop if unsigned >
+	ja	._mainloop
+	# x0 = p
+	movl	%eax,100(%esp)
+	# x5 = s
+	movl	%ecx,120(%esp)
+	# x10 = t
+	movl	%edx,140(%esp)
+	# x15 = w
+	movl	%ebx,160(%esp)
+	#   out = out_backup
+	movl	72(%esp),%edi
+	#   m = m_backup
+	movl	68(%esp),%esi
+	#   in0 = x0
+	movl	100(%esp),%eax
+	#   in1 = x1
+	movl	104(%esp),%ecx
+	#   in0 += j0
+	addl	164(%esp),%eax
+	#   in1 += j1
+	addl	168(%esp),%ecx
+	#   in0 ^= *(uint32 *) (m + 0)
+	xorl	0(%esi),%eax
+	#   in1 ^= *(uint32 *) (m + 4)
+	xorl	4(%esi),%ecx
+	#   *(uint32 *) (out + 0) = in0
+	movl	%eax,0(%edi)
+	#   *(uint32 *) (out + 4) = in1
+	movl	%ecx,4(%edi)
+	#   in2 = x2
+	movl	108(%esp),%eax
+	#   in3 = x3
+	movl	112(%esp),%ecx
+	#   in2 += j2
+	addl	172(%esp),%eax
+	#   in3 += j3
+	addl	176(%esp),%ecx
+	#   in2 ^= *(uint32 *) (m + 8)
+	xorl	8(%esi),%eax
+	#   in3 ^= *(uint32 *) (m + 12)
+	xorl	12(%esi),%ecx
+	#   *(uint32 *) (out + 8) = in2
+	movl	%eax,8(%edi)
+	#   *(uint32 *) (out + 12) = in3
+	movl	%ecx,12(%edi)
+	#   in4 = x4
+	movl	116(%esp),%eax
+	#   in5 = x5
+	movl	120(%esp),%ecx
+	#   in4 += j4
+	addl	180(%esp),%eax
+	#   in5 += j5
+	addl	184(%esp),%ecx
+	#   in4 ^= *(uint32 *) (m + 16)
+	xorl	16(%esi),%eax
+	#   in5 ^= *(uint32 *) (m + 20)
+	xorl	20(%esi),%ecx
+	#   *(uint32 *) (out + 16) = in4
+	movl	%eax,16(%edi)
+	#   *(uint32 *) (out + 20) = in5
+	movl	%ecx,20(%edi)
+	#   in6 = x6
+	movl	124(%esp),%eax
+	#   in7 = x7
+	movl	128(%esp),%ecx
+	#   in6 += j6
+	addl	188(%esp),%eax
+	#   in7 += j7
+	addl	192(%esp),%ecx
+	#   in6 ^= *(uint32 *) (m + 24)
+	xorl	24(%esi),%eax
+	#   in7 ^= *(uint32 *) (m + 28)
+	xorl	28(%esi),%ecx
+	#   *(uint32 *) (out + 24) = in6
+	movl	%eax,24(%edi)
+	#   *(uint32 *) (out + 28) = in7
+	movl	%ecx,28(%edi)
+	#   in8 = x8
+	movl	132(%esp),%eax
+	#   in9 = x9
+	movl	136(%esp),%ecx
+	#   in8 += j8
+	addl	196(%esp),%eax
+	#   in9 += j9
+	addl	200(%esp),%ecx
+	#   in8 ^= *(uint32 *) (m + 32)
+	xorl	32(%esi),%eax
+	#   in9 ^= *(uint32 *) (m + 36)
+	xorl	36(%esi),%ecx
+	#   *(uint32 *) (out + 32) = in8
+	movl	%eax,32(%edi)
+	#   *(uint32 *) (out + 36) = in9
+	movl	%ecx,36(%edi)
+	#   in10 = x10
+	movl	140(%esp),%eax
+	#   in11 = x11
+	movl	144(%esp),%ecx
+	#   in10 += j10
+	addl	204(%esp),%eax
+	#   in11 += j11
+	addl	208(%esp),%ecx
+	#   in10 ^= *(uint32 *) (m + 40)
+	xorl	40(%esi),%eax
+	#   in11 ^= *(uint32 *) (m + 44)
+	xorl	44(%esi),%ecx
+	#   *(uint32 *) (out + 40) = in10
+	movl	%eax,40(%edi)
+	#   *(uint32 *) (out + 44) = in11
+	movl	%ecx,44(%edi)
+	#   in12 = x12
+	movl	148(%esp),%eax
+	#   in13 = x13
+	movl	152(%esp),%ecx
+	#   in12 += j12
+	addl	212(%esp),%eax
+	#   in13 += j13
+	addl	216(%esp),%ecx
+	#   in12 ^= *(uint32 *) (m + 48)
+	xorl	48(%esi),%eax
+	#   in13 ^= *(uint32 *) (m + 52)
+	xorl	52(%esi),%ecx
+	#   *(uint32 *) (out + 48) = in12
+	movl	%eax,48(%edi)
+	#   *(uint32 *) (out + 52) = in13
+	movl	%ecx,52(%edi)
+	#   in14 = x14
+	movl	156(%esp),%eax
+	#   in15 = x15
+	movl	160(%esp),%ecx
+	#   in14 += j14
+	addl	220(%esp),%eax
+	#   in15 += j15
+	addl	224(%esp),%ecx
+	#   in14 ^= *(uint32 *) (m + 56)
+	xorl	56(%esi),%eax
+	#   in15 ^= *(uint32 *) (m + 60)
+	xorl	60(%esi),%ecx
+	#   *(uint32 *) (out + 56) = in14
+	movl	%eax,56(%edi)
+	#   *(uint32 *) (out + 60) = in15
+	movl	%ecx,60(%edi)
+	#   bytes = bytes_backup
+	movl	76(%esp),%ebx
+	#   in8 = j8
+	movl	196(%esp),%eax
+	#   in9 = j9
+	movl	200(%esp),%ecx
+	#   in8 += 1 (words 8/9 form a 64-bit block counter)
+	add	$1,%eax
+	#   in9 += 0 + carry
+	adc	$0,%ecx
+	#   j8 = in8
+	movl	%eax,196(%esp)
+	#   j9 = in9
+	movl	%ecx,200(%esp)
+	#   bytes - 64
+	cmp	$64,%ebx
+	#   goto bytesatleast65 if unsigned>
+	ja	._bytesatleast65
+	#     goto bytesatleast64 if unsigned>=
+	jae	._bytesatleast64
+	#       m = out (fewer than 64 bytes: copy them from tmp to ctarget)
+	mov	%edi,%esi
+	#       out = ctarget
+	movl	228(%esp),%edi
+	#       i = bytes
+	mov	%ebx,%ecx
+	#       while (i) { *out++ = *m++; --i }
+	rep	movsb
+._bytesatleast64:
+	#     x = x_backup (store the advanced counter back into the state)
+	movl	64(%esp),%eax
+	#     in8 = j8
+	movl	196(%esp),%ecx
+	#     in9 = j9
+	movl	200(%esp),%edx
+	#     *(uint32 *) (x + 32) = in8
+	movl	%ecx,32(%eax)
+	#     *(uint32 *) (x + 36) = in9
+	movl	%edx,36(%eax)
+._done:
+	#     eax = eax_stack
+	movl	80(%esp),%eax
+	#     ebx = ebx_stack
+	movl	84(%esp),%ebx
+	#     esi = esi_stack
+	movl	88(%esp),%esi
+	#     edi = edi_stack
+	movl	92(%esp),%edi
+	#     ebp = ebp_stack
+	movl	96(%esp),%ebp
+	#     leave (release the frame)
+	add	%eax,%esp
+	ret
+._bytesatleast65:
+	#   bytes -= 64
+	sub	$64,%ebx
+	#   out += 64
+	add	$64,%edi
+	#   m += 64
+	add	$64,%esi
+	# goto bytesatleast1
+	jmp	._bytesatleast1
+# ECRYPT_keysetup(x, k, kbits): load key k and the matching constants into state x
+.text
+.p2align 5
+.globl ECRYPT_keysetup
+ECRYPT_keysetup:
+	mov	%esp,%eax
+	and	$31,%eax
+	add	$256,%eax
+	sub	%eax,%esp
+	#   eax_stack = eax (frame size; added back to esp at the end)
+	movl	%eax,64(%esp)
+	#   ebx_stack = ebx
+	movl	%ebx,68(%esp)
+	#   esi_stack = esi
+	movl	%esi,72(%esp)
+	#   edi_stack = edi
+	movl	%edi,76(%esp)
+	#   ebp_stack = ebp
+	movl	%ebp,80(%esp)
+	#   k = arg2
+	movl	8(%esp,%eax),%ecx
+	#   kbits = arg3
+	movl	12(%esp,%eax),%edx
+	#   x = arg1
+	movl	4(%esp,%eax),%eax
+	#   in1 = *(uint32 *) (k + 0)
+	movl	0(%ecx),%ebx
+	#   in2 = *(uint32 *) (k + 4)
+	movl	4(%ecx),%esi
+	#   in3 = *(uint32 *) (k + 8)
+	movl	8(%ecx),%edi
+	#   in4 = *(uint32 *) (k + 12)
+	movl	12(%ecx),%ebp
+	#   *(uint32 *) (x + 4) = in1
+	movl	%ebx,4(%eax)
+	#   *(uint32 *) (x + 8) = in2
+	movl	%esi,8(%eax)
+	#   *(uint32 *) (x + 12) = in3
+	movl	%edi,12(%eax)
+	#   *(uint32 *) (x + 16) = in4
+	movl	%ebp,16(%eax)
+	#   kbits - 256
+	cmp	$256,%edx
+	#   goto kbits128 if unsigned< (every kbits below 256 takes the 128-bit path)
+	jb	._kbits128
+._kbits256:
+	#     in11 = *(uint32 *) (k + 16)
+	movl	16(%ecx),%edx
+	#     in12 = *(uint32 *) (k + 20)
+	movl	20(%ecx),%ebx
+	#     in13 = *(uint32 *) (k + 24)
+	movl	24(%ecx),%esi
+	#     in14 = *(uint32 *) (k + 28)
+	movl	28(%ecx),%ecx
+	#     *(uint32 *) (x + 44) = in11
+	movl	%edx,44(%eax)
+	#     *(uint32 *) (x + 48) = in12
+	movl	%ebx,48(%eax)
+	#     *(uint32 *) (x + 52) = in13
+	movl	%esi,52(%eax)
+	#     *(uint32 *) (x + 56) = in14
+	movl	%ecx,56(%eax)
+	#     in0 = 1634760805 (0x61707865, bytes in memory "expa")
+	mov	$1634760805,%ecx
+	#     in5 = 857760878 (0x3320646e, bytes in memory "nd 3")
+	mov	$857760878,%edx
+	#     in10 = 2036477234 (0x79622d32, bytes in memory "2-by")
+	mov	$2036477234,%ebx
+	#     in15 = 1797285236 (0x6b206574, bytes in memory "te k")
+	mov	$1797285236,%esi
+	#     *(uint32 *) (x + 0) = in0
+	movl	%ecx,0(%eax)
+	#     *(uint32 *) (x + 20) = in5
+	movl	%edx,20(%eax)
+	#     *(uint32 *) (x + 40) = in10
+	movl	%ebx,40(%eax)
+	#     *(uint32 *) (x + 60) = in15
+	movl	%esi,60(%eax)
+	#   goto keysetupdone
+	jmp	._keysetupdone
+._kbits128:
+	#     in11 = *(uint32 *) (k + 0) (the same 16 key bytes also fill words 11..14)
+	movl	0(%ecx),%edx
+	#     in12 = *(uint32 *) (k + 4)
+	movl	4(%ecx),%ebx
+	#     in13 = *(uint32 *) (k + 8)
+	movl	8(%ecx),%esi
+	#     in14 = *(uint32 *) (k + 12)
+	movl	12(%ecx),%ecx
+	#     *(uint32 *) (x + 44) = in11
+	movl	%edx,44(%eax)
+	#     *(uint32 *) (x + 48) = in12
+	movl	%ebx,48(%eax)
+	#     *(uint32 *) (x + 52) = in13
+	movl	%esi,52(%eax)
+	#     *(uint32 *) (x + 56) = in14
+	movl	%ecx,56(%eax)
+	#     in0 = 1634760805 (0x61707865, bytes in memory "expa")
+	mov	$1634760805,%ecx
+	#     in5 = 824206446 (0x3120646e, bytes in memory "nd 1")
+	mov	$824206446,%edx
+	#     in10 = 2036477238 (0x79622d36, bytes in memory "6-by")
+	mov	$2036477238,%ebx
+	#     in15 = 1797285236 (0x6b206574, bytes in memory "te k")
+	mov	$1797285236,%esi
+	#     *(uint32 *) (x + 0) = in0
+	movl	%ecx,0(%eax)
+	#     *(uint32 *) (x + 20) = in5
+	movl	%edx,20(%eax)
+	#     *(uint32 *) (x + 40) = in10
+	movl	%ebx,40(%eax)
+	#     *(uint32 *) (x + 60) = in15
+	movl	%esi,60(%eax)
+._keysetupdone:
+	#   eax = eax_stack
+	movl	64(%esp),%eax
+	#   ebx = ebx_stack
+	movl	68(%esp),%ebx
+	#   esi = esi_stack
+	movl	72(%esp),%esi
+	#   edi = edi_stack
+	movl	76(%esp),%edi
+	#   ebp = ebp_stack
+	movl	80(%esp),%ebp
+	# leave (release the frame)
+	add	%eax,%esp
+	ret
+# ECRYPT_ivsetup(x, iv): store the 8-byte iv in state words 6/7 and zero words 8/9
+.text
+.p2align 5
+.globl ECRYPT_ivsetup
+ECRYPT_ivsetup:
+	mov	%esp,%eax
+	and	$31,%eax
+	add	$256,%eax
+	sub	%eax,%esp
+	#   eax_stack = eax (frame size; added back to esp at the end)
+	movl	%eax,64(%esp)
+	#   ebx_stack = ebx
+	movl	%ebx,68(%esp)
+	#   esi_stack = esi
+	movl	%esi,72(%esp)
+	#   edi_stack = edi
+	movl	%edi,76(%esp)
+	#   ebp_stack = ebp
+	movl	%ebp,80(%esp)
+	#   iv = arg2
+	movl	8(%esp,%eax),%ecx
+	#   x = arg1
+	movl	4(%esp,%eax),%eax
+	#   in6 = *(uint32 *) (iv + 0)
+	movl	0(%ecx),%edx
+	#   in7 = *(uint32 *) (iv + 4)
+	movl	4(%ecx),%ecx
+	#   in8 = 0 (words 8/9: block counter advanced by ECRYPT_encrypt_bytes)
+	mov	$0,%ebx
+	#   in9 = 0
+	mov	$0,%esi
+	#   *(uint32 *) (x + 24) = in6
+	movl	%edx,24(%eax)
+	#   *(uint32 *) (x + 28) = in7
+	movl	%ecx,28(%eax)
+	#   *(uint32 *) (x + 32) = in8
+	movl	%ebx,32(%eax)
+	#   *(uint32 *) (x + 36) = in9
+	movl	%esi,36(%eax)
+	#   eax = eax_stack
+	movl	64(%esp),%eax
+	#   ebx = ebx_stack
+	movl	68(%esp),%ebx
+	#   esi = esi_stack
+	movl	72(%esp),%esi
+	#   edi = edi_stack
+	movl	76(%esp),%edi
+	#   ebp = ebp_stack
+	movl	80(%esp),%ebp
+	# leave (release the frame)
+	add	%eax,%esp
+	ret
diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S
new file mode 100644
index 000000000000..6214a9b09706
--- /dev/null
+++ b/arch/x86/crypto/salsa20-x86_64-asm_64.S
@@ -0,0 +1,920 @@
+# enter ECRYPT_encrypt_bytes
+.text
+.p2align 5
+.globl ECRYPT_encrypt_bytes
+ECRYPT_encrypt_bytes:
+	mov	%rsp,%r11
+	and	$31,%r11
+	add	$256,%r11
+	sub	%r11,%rsp
+	# x = arg1
+	mov	%rdi,%r8
+	# m = arg2
+	mov	%rsi,%rsi
+	# out = arg3
+	mov	%rdx,%rdi
+	# bytes = arg4
+	mov	%rcx,%rdx
+	#               unsigned>? bytes - 0
+	cmp	$0,%rdx
+	# comment:fp stack unchanged by jump
+	# goto done if !unsigned>
+	jbe	._done
+	# comment:fp stack unchanged by fallthrough
+# start:
+._start:
+	# r11_stack = r11
+	movq	%r11,0(%rsp)
+	# r12_stack = r12
+	movq	%r12,8(%rsp)
+	# r13_stack = r13
+	movq	%r13,16(%rsp)
+	# r14_stack = r14
+	movq	%r14,24(%rsp)
+	# r15_stack = r15
+	movq	%r15,32(%rsp)
+	# rbx_stack = rbx
+	movq	%rbx,40(%rsp)
+	# rbp_stack = rbp
+	movq	%rbp,48(%rsp)
+	# in0 = *(uint64 *) (x + 0)
+	movq	0(%r8),%rcx
+	# in2 = *(uint64 *) (x + 8)
+	movq	8(%r8),%r9
+	# in4 = *(uint64 *) (x + 16)
+	movq	16(%r8),%rax
+	# in6 = *(uint64 *) (x + 24)
+	movq	24(%r8),%r10
+	# in8 = *(uint64 *) (x + 32)
+	movq	32(%r8),%r11
+	# in10 = *(uint64 *) (x + 40)
+	movq	40(%r8),%r12
+	# in12 = *(uint64 *) (x + 48)
+	movq	48(%r8),%r13
+	# in14 = *(uint64 *) (x + 56)
+	movq	56(%r8),%r14
+	# j0 = in0
+	movq	%rcx,56(%rsp)
+	# j2 = in2
+	movq	%r9,64(%rsp)
+	# j4 = in4
+	movq	%rax,72(%rsp)
+	# j6 = in6
+	movq	%r10,80(%rsp)
+	# j8 = in8
+	movq	%r11,88(%rsp)
+	# j10 = in10
+	movq	%r12,96(%rsp)
+	# j12 = in12
+	movq	%r13,104(%rsp)
+	# j14 = in14
+	movq	%r14,112(%rsp)
+	# x_backup = x
+	movq	%r8,120(%rsp)
+# bytesatleast1:
+._bytesatleast1:
+	#                   unsigned<? bytes - 64
+	cmp	$64,%rdx
+	# comment:fp stack unchanged by jump
+	#   goto nocopy if !unsigned<
+	jae	._nocopy
+	#     ctarget = out
+	movq	%rdi,128(%rsp)
+	#     out = &tmp
+	leaq	192(%rsp),%rdi
+	#     i = bytes
+	mov	%rdx,%rcx
+	#     while (i) { *out++ = *m++; --i }
+	rep	movsb
+	#     out = &tmp
+	leaq	192(%rsp),%rdi
+	#     m = &tmp
+	leaq	192(%rsp),%rsi
+	# comment:fp stack unchanged by fallthrough
+#   nocopy:
+._nocopy:
+	#   out_backup = out
+	movq	%rdi,136(%rsp)
+	#   m_backup = m
+	movq	%rsi,144(%rsp)
+	#   bytes_backup = bytes
+	movq	%rdx,152(%rsp)
+	#   x1 = j0
+	movq	56(%rsp),%rdi
+	#   x0 = x1
+	mov	%rdi,%rdx
+	#   (uint64) x1 >>= 32
+	shr	$32,%rdi
+	#   		x3 = j2
+	movq	64(%rsp),%rsi
+	#   		x2 = x3
+	mov	%rsi,%rcx
+	#   		(uint64) x3 >>= 32
+	shr	$32,%rsi
+	#   x5 = j4
+	movq	72(%rsp),%r8
+	#   x4 = x5
+	mov	%r8,%r9
+	#   (uint64) x5 >>= 32
+	shr	$32,%r8
+	#   x5_stack = x5
+	movq	%r8,160(%rsp)
+	#   		x7 = j6
+	movq	80(%rsp),%r8
+	#   		x6 = x7
+	mov	%r8,%rax
+	#   		(uint64) x7 >>= 32
+	shr	$32,%r8
+	#   x9 = j8
+	movq	88(%rsp),%r10
+	#   x8 = x9
+	mov	%r10,%r11
+	#   (uint64) x9 >>= 32
+	shr	$32,%r10
+	#   		x11 = j10
+	movq	96(%rsp),%r12
+	#   		x10 = x11
+	mov	%r12,%r13
+	#   		x10_stack = x10
+	movq	%r13,168(%rsp)
+	#   		(uint64) x11 >>= 32
+	shr	$32,%r12
+	#   x13 = j12
+	movq	104(%rsp),%r13
+	#   x12 = x13
+	mov	%r13,%r14
+	#   (uint64) x13 >>= 32
+	shr	$32,%r13
+	#   		x15 = j14
+	movq	112(%rsp),%r15
+	#   		x14 = x15
+	mov	%r15,%rbx
+	#   		(uint64) x15 >>= 32
+	shr	$32,%r15
+	#   		x15_stack = x15
+	movq	%r15,176(%rsp)
+	#   i = 20
+	mov	$20,%r15
+#   mainloop:
+._mainloop:
+	#   i_backup = i
+	movq	%r15,184(%rsp)
+	# 		x5 = x5_stack
+	movq	160(%rsp),%r15
+	# a = x12 + x0
+	lea	(%r14,%rdx),%rbp
+	# (uint32) a <<<= 7
+	rol	$7,%ebp
+	# x4 ^= a
+	xor	%rbp,%r9
+	# 		b = x1 + x5
+	lea	(%rdi,%r15),%rbp
+	# 		(uint32) b <<<= 7
+	rol	$7,%ebp
+	# 		x9 ^= b
+	xor	%rbp,%r10
+	# a = x0 + x4
+	lea	(%rdx,%r9),%rbp
+	# (uint32) a <<<= 9
+	rol	$9,%ebp
+	# x8 ^= a
+	xor	%rbp,%r11
+	# 		b = x5 + x9
+	lea	(%r15,%r10),%rbp
+	# 		(uint32) b <<<= 9
+	rol	$9,%ebp
+	# 		x13 ^= b
+	xor	%rbp,%r13
+	# a = x4 + x8
+	lea	(%r9,%r11),%rbp
+	# (uint32) a <<<= 13
+	rol	$13,%ebp
+	# x12 ^= a
+	xor	%rbp,%r14
+	# 		b = x9 + x13
+	lea	(%r10,%r13),%rbp
+	# 		(uint32) b <<<= 13
+	rol	$13,%ebp
+	# 		x1 ^= b
+	xor	%rbp,%rdi
+	# a = x8 + x12
+	lea	(%r11,%r14),%rbp
+	# (uint32) a <<<= 18
+	rol	$18,%ebp
+	# x0 ^= a
+	xor	%rbp,%rdx
+	# 		b = x13 + x1
+	lea	(%r13,%rdi),%rbp
+	# 		(uint32) b <<<= 18
+	rol	$18,%ebp
+	# 		x5 ^= b
+	xor	%rbp,%r15
+	# 				x10 = x10_stack
+	movq	168(%rsp),%rbp
+	# 		x5_stack = x5
+	movq	%r15,160(%rsp)
+	# 				c = x6 + x10
+	lea	(%rax,%rbp),%r15
+	# 				(uint32) c <<<= 7
+	rol	$7,%r15d
+	# 				x14 ^= c
+	xor	%r15,%rbx
+	# 				c = x10 + x14
+	lea	(%rbp,%rbx),%r15
+	# 				(uint32) c <<<= 9
+	rol	$9,%r15d
+	# 				x2 ^= c
+	xor	%r15,%rcx
+	# 				c = x14 + x2
+	lea	(%rbx,%rcx),%r15
+	# 				(uint32) c <<<= 13
+	rol	$13,%r15d
+	# 				x6 ^= c
+	xor	%r15,%rax
+	# 				c = x2 + x6
+	lea	(%rcx,%rax),%r15
+	# 				(uint32) c <<<= 18
+	rol	$18,%r15d
+	# 				x10 ^= c
+	xor	%r15,%rbp
+	# 						x15 = x15_stack
+	movq	176(%rsp),%r15
+	# 				x10_stack = x10
+	movq	%rbp,168(%rsp)
+	# 						d = x11 + x15
+	lea	(%r12,%r15),%rbp
+	# 						(uint32) d <<<= 7
+	rol	$7,%ebp
+	# 						x3 ^= d
+	xor	%rbp,%rsi
+	# 						d = x15 + x3
+	lea	(%r15,%rsi),%rbp
+	# 						(uint32) d <<<= 9
+	rol	$9,%ebp
+	# 						x7 ^= d
+	xor	%rbp,%r8
+	# 						d = x3 + x7
+	lea	(%rsi,%r8),%rbp
+	# 						(uint32) d <<<= 13
+	rol	$13,%ebp
+	# 						x11 ^= d
+	xor	%rbp,%r12
+	# 						d = x7 + x11
+	lea	(%r8,%r12),%rbp
+	# 						(uint32) d <<<= 18
+	rol	$18,%ebp
+	# 						x15 ^= d
+	xor	%rbp,%r15
+	# 						x15_stack = x15
+	movq	%r15,176(%rsp)
+	# 		x5 = x5_stack
+	movq	160(%rsp),%r15
+	# a = x3 + x0
+	lea	(%rsi,%rdx),%rbp
+	# (uint32) a <<<= 7
+	rol	$7,%ebp
+	# x1 ^= a
+	xor	%rbp,%rdi
+	# 		b = x4 + x5
+	lea	(%r9,%r15),%rbp
+	# 		(uint32) b <<<= 7
+	rol	$7,%ebp
+	# 		x6 ^= b
+	xor	%rbp,%rax
+	# a = x0 + x1
+	lea	(%rdx,%rdi),%rbp
+	# (uint32) a <<<= 9
+	rol	$9,%ebp
+	# x2 ^= a
+	xor	%rbp,%rcx
+	# 		b = x5 + x6
+	lea	(%r15,%rax),%rbp
+	# 		(uint32) b <<<= 9
+	rol	$9,%ebp
+	# 		x7 ^= b
+	xor	%rbp,%r8
+	# a = x1 + x2
+	lea	(%rdi,%rcx),%rbp
+	# (uint32) a <<<= 13
+	rol	$13,%ebp
+	# x3 ^= a
+	xor	%rbp,%rsi
+	# 		b = x6 + x7
+	lea	(%rax,%r8),%rbp
+	# 		(uint32) b <<<= 13
+	rol	$13,%ebp
+	# 		x4 ^= b
+	xor	%rbp,%r9
+	# a = x2 + x3
+	lea	(%rcx,%rsi),%rbp
+	# (uint32) a <<<= 18
+	rol	$18,%ebp
+	# x0 ^= a
+	xor	%rbp,%rdx
+	# 		b = x7 + x4
+	lea	(%r8,%r9),%rbp
+	# 		(uint32) b <<<= 18
+	rol	$18,%ebp
+	# 		x5 ^= b
+	xor	%rbp,%r15
+	# 				x10 = x10_stack
+	movq	168(%rsp),%rbp
+	# 		x5_stack = x5
+	movq	%r15,160(%rsp)
+	# 				c = x9 + x10
+	lea	(%r10,%rbp),%r15
+	# 				(uint32) c <<<= 7
+	rol	$7,%r15d
+	# 				x11 ^= c
+	xor	%r15,%r12
+	# 				c = x10 + x11
+	lea	(%rbp,%r12),%r15
+	# 				(uint32) c <<<= 9
+	rol	$9,%r15d
+	# 				x8 ^= c
+	xor	%r15,%r11
+	# 				c = x11 + x8
+	lea	(%r12,%r11),%r15
+	# 				(uint32) c <<<= 13
+	rol	$13,%r15d
+	# 				x9 ^= c
+	xor	%r15,%r10
+	# 				c = x8 + x9
+	lea	(%r11,%r10),%r15
+	# 				(uint32) c <<<= 18
+	rol	$18,%r15d
+	# 				x10 ^= c
+	xor	%r15,%rbp
+	# 						x15 = x15_stack
+	movq	176(%rsp),%r15
+	# 				x10_stack = x10
+	movq	%rbp,168(%rsp)
+	# 						d = x14 + x15
+	lea	(%rbx,%r15),%rbp
+	# 						(uint32) d <<<= 7
+	rol	$7,%ebp
+	# 						x12 ^= d
+	xor	%rbp,%r14
+	# 						d = x15 + x12
+	lea	(%r15,%r14),%rbp
+	# 						(uint32) d <<<= 9
+	rol	$9,%ebp
+	# 						x13 ^= d
+	xor	%rbp,%r13
+	# 						d = x12 + x13
+	lea	(%r14,%r13),%rbp
+	# 						(uint32) d <<<= 13
+	rol	$13,%ebp
+	# 						x14 ^= d
+	xor	%rbp,%rbx
+	# 						d = x13 + x14
+	lea	(%r13,%rbx),%rbp
+	# 						(uint32) d <<<= 18
+	rol	$18,%ebp
+	# 						x15 ^= d
+	xor	%rbp,%r15
+	# 						x15_stack = x15
+	movq	%r15,176(%rsp)
+	# 		x5 = x5_stack
+	movq	160(%rsp),%r15
+	# a = x12 + x0
+	lea	(%r14,%rdx),%rbp
+	# (uint32) a <<<= 7
+	rol	$7,%ebp
+	# x4 ^= a
+	xor	%rbp,%r9
+	# 		b = x1 + x5
+	lea	(%rdi,%r15),%rbp
+	# 		(uint32) b <<<= 7
+	rol	$7,%ebp
+	# 		x9 ^= b
+	xor	%rbp,%r10
+	# a = x0 + x4
+	lea	(%rdx,%r9),%rbp
+	# (uint32) a <<<= 9
+	rol	$9,%ebp
+	# x8 ^= a
+	xor	%rbp,%r11
+	# 		b = x5 + x9
+	lea	(%r15,%r10),%rbp
+	# 		(uint32) b <<<= 9
+	rol	$9,%ebp
+	# 		x13 ^= b
+	xor	%rbp,%r13
+	# a = x4 + x8
+	lea	(%r9,%r11),%rbp
+	# (uint32) a <<<= 13
+	rol	$13,%ebp
+	# x12 ^= a
+	xor	%rbp,%r14
+	# 		b = x9 + x13
+	lea	(%r10,%r13),%rbp
+	# 		(uint32) b <<<= 13
+	rol	$13,%ebp
+	# 		x1 ^= b
+	xor	%rbp,%rdi
+	# a = x8 + x12
+	lea	(%r11,%r14),%rbp
+	# (uint32) a <<<= 18
+	rol	$18,%ebp
+	# x0 ^= a
+	xor	%rbp,%rdx
+	# 		b = x13 + x1
+	lea	(%r13,%rdi),%rbp
+	# 		(uint32) b <<<= 18
+	rol	$18,%ebp
+	# 		x5 ^= b
+	xor	%rbp,%r15
+	# 				x10 = x10_stack
+	movq	168(%rsp),%rbp
+	# 		x5_stack = x5
+	movq	%r15,160(%rsp)
+	# 				c = x6 + x10
+	lea	(%rax,%rbp),%r15
+	# 				(uint32) c <<<= 7
+	rol	$7,%r15d
+	# 				x14 ^= c
+	xor	%r15,%rbx
+	# 				c = x10 + x14
+	lea	(%rbp,%rbx),%r15
+	# 				(uint32) c <<<= 9
+	rol	$9,%r15d
+	# 				x2 ^= c
+	xor	%r15,%rcx
+	# 				c = x14 + x2
+	lea	(%rbx,%rcx),%r15
+	# 				(uint32) c <<<= 13
+	rol	$13,%r15d
+	# 				x6 ^= c
+	xor	%r15,%rax
+	# 				c = x2 + x6
+	lea	(%rcx,%rax),%r15
+	# 				(uint32) c <<<= 18
+	rol	$18,%r15d
+	# 				x10 ^= c
+	xor	%r15,%rbp
+	# 						x15 = x15_stack
+	movq	176(%rsp),%r15
+	# 				x10_stack = x10
+	movq	%rbp,168(%rsp)
+	# 						d = x11 + x15
+	lea	(%r12,%r15),%rbp
+	# 						(uint32) d <<<= 7
+	rol	$7,%ebp
+	# 						x3 ^= d
+	xor	%rbp,%rsi
+	# 						d = x15 + x3
+	lea	(%r15,%rsi),%rbp
+	# 						(uint32) d <<<= 9
+	rol	$9,%ebp
+	# 						x7 ^= d
+	xor	%rbp,%r8
+	# 						d = x3 + x7
+	lea	(%rsi,%r8),%rbp
+	# 						(uint32) d <<<= 13
+	rol	$13,%ebp
+	# 						x11 ^= d
+	xor	%rbp,%r12
+	# 						d = x7 + x11
+	lea	(%r8,%r12),%rbp
+	# 						(uint32) d <<<= 18
+	rol	$18,%ebp
+	# 						x15 ^= d
+	xor	%rbp,%r15
+	# 						x15_stack = x15
+	movq	%r15,176(%rsp)
+	# 		x5 = x5_stack
+	movq	160(%rsp),%r15
+	# a = x3 + x0
+	lea	(%rsi,%rdx),%rbp
+	# (uint32) a <<<= 7
+	rol	$7,%ebp
+	# x1 ^= a
+	xor	%rbp,%rdi
+	# 		b = x4 + x5
+	lea	(%r9,%r15),%rbp
+	# 		(uint32) b <<<= 7
+	rol	$7,%ebp
+	# 		x6 ^= b
+	xor	%rbp,%rax
+	# a = x0 + x1
+	lea	(%rdx,%rdi),%rbp
+	# (uint32) a <<<= 9
+	rol	$9,%ebp
+	# x2 ^= a
+	xor	%rbp,%rcx
+	# 		b = x5 + x6
+	lea	(%r15,%rax),%rbp
+	# 		(uint32) b <<<= 9
+	rol	$9,%ebp
+	# 		x7 ^= b
+	xor	%rbp,%r8
+	# a = x1 + x2
+	lea	(%rdi,%rcx),%rbp
+	# (uint32) a <<<= 13
+	rol	$13,%ebp
+	# x3 ^= a
+	xor	%rbp,%rsi
+	# 		b = x6 + x7
+	lea	(%rax,%r8),%rbp
+	# 		(uint32) b <<<= 13
+	rol	$13,%ebp
+	# 		x4 ^= b
+	xor	%rbp,%r9
+	# a = x2 + x3
+	lea	(%rcx,%rsi),%rbp
+	# (uint32) a <<<= 18
+	rol	$18,%ebp
+	# x0 ^= a
+	xor	%rbp,%rdx
+	# 		b = x7 + x4
+	lea	(%r8,%r9),%rbp
+	# 		(uint32) b <<<= 18
+	rol	$18,%ebp
+	# 		x5 ^= b
+	xor	%rbp,%r15
+	# 				x10 = x10_stack
+	movq	168(%rsp),%rbp
+	# 		x5_stack = x5
+	movq	%r15,160(%rsp)
+	# 				c = x9 + x10
+	lea	(%r10,%rbp),%r15
+	# 				(uint32) c <<<= 7
+	rol	$7,%r15d
+	# 				x11 ^= c
+	xor	%r15,%r12
+	# 				c = x10 + x11
+	lea	(%rbp,%r12),%r15
+	# 				(uint32) c <<<= 9
+	rol	$9,%r15d
+	# 				x8 ^= c
+	xor	%r15,%r11
+	# 				c = x11 + x8
+	lea	(%r12,%r11),%r15
+	# 				(uint32) c <<<= 13
+	rol	$13,%r15d
+	# 				x9 ^= c
+	xor	%r15,%r10
+	# 				c = x8 + x9
+	lea	(%r11,%r10),%r15
+	# 				(uint32) c <<<= 18
+	rol	$18,%r15d
+	# 				x10 ^= c
+	xor	%r15,%rbp
+	# 						x15 = x15_stack
+	movq	176(%rsp),%r15
+	# 				x10_stack = x10
+	movq	%rbp,168(%rsp)
+	# 						d = x14 + x15
+	lea	(%rbx,%r15),%rbp
+	# 						(uint32) d <<<= 7
+	rol	$7,%ebp
+	# 						x12 ^= d
+	xor	%rbp,%r14
+	# 						d = x15 + x12
+	lea	(%r15,%r14),%rbp
+	# 						(uint32) d <<<= 9
+	rol	$9,%ebp
+	# 						x13 ^= d
+	xor	%rbp,%r13
+	# 						d = x12 + x13
+	lea	(%r14,%r13),%rbp
+	# 						(uint32) d <<<= 13
+	rol	$13,%ebp
+	# 						x14 ^= d
+	xor	%rbp,%rbx
+	# 						d = x13 + x14
+	lea	(%r13,%rbx),%rbp
+	# 						(uint32) d <<<= 18
+	rol	$18,%ebp
+	# 						x15 ^= d
+	xor	%rbp,%r15
+	# 						x15_stack = x15
+	movq	%r15,176(%rsp)
+	#   i = i_backup
+	movq	184(%rsp),%r15
+	#                  unsigned>? i -= 4
+	sub	$4,%r15
+	# comment:fp stack unchanged by jump
+	# goto mainloop if unsigned>
+	ja	._mainloop
+	#   (uint32) x2 += j2
+	addl	64(%rsp),%ecx
+	#   x3 <<= 32
+	shl	$32,%rsi
+	#   x3 += j2
+	addq	64(%rsp),%rsi
+	#   (uint64) x3 >>= 32
+	shr	$32,%rsi
+	#   x3 <<= 32
+	shl	$32,%rsi
+	#   x2 += x3
+	add	%rsi,%rcx
+	#   (uint32) x6 += j6
+	addl	80(%rsp),%eax
+	#   x7 <<= 32
+	shl	$32,%r8
+	#   x7 += j6
+	addq	80(%rsp),%r8
+	#   (uint64) x7 >>= 32
+	shr	$32,%r8
+	#   x7 <<= 32
+	shl	$32,%r8
+	#   x6 += x7
+	add	%r8,%rax
+	#   (uint32) x8 += j8
+	addl	88(%rsp),%r11d
+	#   x9 <<= 32
+	shl	$32,%r10
+	#   x9 += j8
+	addq	88(%rsp),%r10
+	#   (uint64) x9 >>= 32
+	shr	$32,%r10
+	#   x9 <<= 32
+	shl	$32,%r10
+	#   x8 += x9
+	add	%r10,%r11
+	#   (uint32) x12 += j12
+	addl	104(%rsp),%r14d
+	#   x13 <<= 32
+	shl	$32,%r13
+	#   x13 += j12
+	addq	104(%rsp),%r13
+	#   (uint64) x13 >>= 32
+	shr	$32,%r13
+	#   x13 <<= 32
+	shl	$32,%r13
+	#   x12 += x13
+	add	%r13,%r14
+	#   (uint32) x0 += j0
+	addl	56(%rsp),%edx
+	#   x1 <<= 32
+	shl	$32,%rdi
+	#   x1 += j0
+	addq	56(%rsp),%rdi
+	#   (uint64) x1 >>= 32
+	shr	$32,%rdi
+	#   x1 <<= 32
+	shl	$32,%rdi
+	#   x0 += x1
+	add	%rdi,%rdx
+	#   x5 = x5_stack
+	movq	160(%rsp),%rdi
+	#   (uint32) x4 += j4
+	addl	72(%rsp),%r9d
+	#   x5 <<= 32
+	shl	$32,%rdi
+	#   x5 += j4
+	addq	72(%rsp),%rdi
+	#   (uint64) x5 >>= 32
+	shr	$32,%rdi
+	#   x5 <<= 32
+	shl	$32,%rdi
+	#   x4 += x5
+	add	%rdi,%r9
+	#   x10 = x10_stack
+	movq	168(%rsp),%r8
+	#   (uint32) x10 += j10
+	addl	96(%rsp),%r8d
+	#   x11 <<= 32
+	shl	$32,%r12
+	#   x11 += j10
+	addq	96(%rsp),%r12
+	#   (uint64) x11 >>= 32
+	shr	$32,%r12
+	#   x11 <<= 32
+	shl	$32,%r12
+	#   x10 += x11
+	add	%r12,%r8
+	#   x15 = x15_stack
+	movq	176(%rsp),%rdi
+	#   (uint32) x14 += j14
+	addl	112(%rsp),%ebx
+	#   x15 <<= 32
+	shl	$32,%rdi
+	#   x15 += j14
+	addq	112(%rsp),%rdi
+	#   (uint64) x15 >>= 32
+	shr	$32,%rdi
+	#   x15 <<= 32
+	shl	$32,%rdi
+	#   x14 += x15
+	add	%rdi,%rbx
+	#   out = out_backup
+	movq	136(%rsp),%rdi
+	#   m = m_backup
+	movq	144(%rsp),%rsi
+	#   x0 ^= *(uint64 *) (m + 0)
+	xorq	0(%rsi),%rdx
+	#   *(uint64 *) (out + 0) = x0
+	movq	%rdx,0(%rdi)
+	#   x2 ^= *(uint64 *) (m + 8)
+	xorq	8(%rsi),%rcx
+	#   *(uint64 *) (out + 8) = x2
+	movq	%rcx,8(%rdi)
+	#   x4 ^= *(uint64 *) (m + 16)
+	xorq	16(%rsi),%r9
+	#   *(uint64 *) (out + 16) = x4
+	movq	%r9,16(%rdi)
+	#   x6 ^= *(uint64 *) (m + 24)
+	xorq	24(%rsi),%rax
+	#   *(uint64 *) (out + 24) = x6
+	movq	%rax,24(%rdi)
+	#   x8 ^= *(uint64 *) (m + 32)
+	xorq	32(%rsi),%r11
+	#   *(uint64 *) (out + 32) = x8
+	movq	%r11,32(%rdi)
+	#   x10 ^= *(uint64 *) (m + 40)
+	xorq	40(%rsi),%r8
+	#   *(uint64 *) (out + 40) = x10
+	movq	%r8,40(%rdi)
+	#   x12 ^= *(uint64 *) (m + 48)
+	xorq	48(%rsi),%r14
+	#   *(uint64 *) (out + 48) = x12
+	movq	%r14,48(%rdi)
+	#   x14 ^= *(uint64 *) (m + 56)
+	xorq	56(%rsi),%rbx
+	#   *(uint64 *) (out + 56) = x14
+	movq	%rbx,56(%rdi)
+	#   bytes = bytes_backup
+	movq	152(%rsp),%rdx
+	#   in8 = j8
+	movq	88(%rsp),%rcx
+	#   in8 += 1
+	add	$1,%rcx
+	#   j8 = in8
+	movq	%rcx,88(%rsp)
+	#                          unsigned>? unsigned<? bytes - 64
+	cmp	$64,%rdx
+	# comment:fp stack unchanged by jump
+	#   goto bytesatleast65 if unsigned>
+	ja	._bytesatleast65
+	# comment:fp stack unchanged by jump
+	#     goto bytesatleast64 if !unsigned<
+	jae	._bytesatleast64
+	#       m = out
+	mov	%rdi,%rsi
+	#       out = ctarget
+	movq	128(%rsp),%rdi
+	#       i = bytes
+	mov	%rdx,%rcx
+	#       while (i) { *out++ = *m++; --i }
+	rep	movsb
+	# comment:fp stack unchanged by fallthrough
+#     bytesatleast64:
+._bytesatleast64:
+	#     x = x_backup
+	movq	120(%rsp),%rdi
+	#     in8 = j8
+	movq	88(%rsp),%rsi
+	#     *(uint64 *) (x + 32) = in8
+	movq	%rsi,32(%rdi)
+	#     r11 = r11_stack
+	movq	0(%rsp),%r11
+	#     r12 = r12_stack
+	movq	8(%rsp),%r12
+	#     r13 = r13_stack
+	movq	16(%rsp),%r13
+	#     r14 = r14_stack
+	movq	24(%rsp),%r14
+	#     r15 = r15_stack
+	movq	32(%rsp),%r15
+	#     rbx = rbx_stack
+	movq	40(%rsp),%rbx
+	#     rbp = rbp_stack
+	movq	48(%rsp),%rbp
+	# comment:fp stack unchanged by fallthrough
+#     done:
+._done:
+	#     leave
+	add	%r11,%rsp
+	mov	%rdi,%rax
+	mov	%rsi,%rdx
+	ret
+#   bytesatleast65:
+._bytesatleast65:
+	#   bytes -= 64
+	sub	$64,%rdx
+	#   out += 64
+	add	$64,%rdi
+	#   m += 64
+	add	$64,%rsi
+	# comment:fp stack unchanged by jump
+	# goto bytesatleast1
+	jmp	._bytesatleast1
+# ECRYPT_keysetup(x = %rdi, k = %rsi, kbits = %rdx): store key words and constants in x
+.text
+.p2align 5
+.globl ECRYPT_keysetup
+ECRYPT_keysetup:
+	mov	%rsp,%r11
+	and	$31,%r11
+	add	$256,%r11
+	sub	%r11,%rsp
+	#   k = arg2 (self-move: the argument is already in %rsi)
+	mov	%rsi,%rsi
+	#   kbits = arg3 (self-move: already in %rdx)
+	mov	%rdx,%rdx
+	#   x = arg1 (self-move: already in %rdi)
+	mov	%rdi,%rdi
+	#   in0 = *(uint64 *) (k + 0)
+	movq	0(%rsi),%r8
+	#   in2 = *(uint64 *) (k + 8)
+	movq	8(%rsi),%r9
+	#   *(uint64 *) (x + 4) = in0   (key bytes 0..7)
+	movq	%r8,4(%rdi)
+	#   *(uint64 *) (x + 12) = in2  (key bytes 8..15)
+	movq	%r9,12(%rdi)
+	#                    unsigned<? kbits - 256
+	cmp	$256,%rdx
+	# comment:fp stack unchanged by jump
+	#   goto kbits128 if unsigned< (every kbits value below 256 takes the 128-bit path)
+	jb	._kbits128
+#   kbits256:
+._kbits256:
+	#     in10 = *(uint64 *) (k + 16)
+	movq	16(%rsi),%rdx
+	#     in12 = *(uint64 *) (k + 24)
+	movq	24(%rsi),%rsi
+	#     *(uint64 *) (x + 44) = in10  (key bytes 16..23)
+	movq	%rdx,44(%rdi)
+	#     *(uint64 *) (x + 52) = in12  (key bytes 24..31)
+	movq	%rsi,52(%rdi)
+	#     in0 = 1634760805  (0x61707865, bytes "expa" in memory order)
+	mov	$1634760805,%rsi
+	#     in4 = 857760878   (0x3320646e, bytes "nd 3")
+	mov	$857760878,%rdx
+	#     in10 = 2036477234 (0x79622d32, bytes "2-by")
+	mov	$2036477234,%rcx
+	#     in14 = 1797285236 (0x6b206574, bytes "te k")
+	mov	$1797285236,%r8
+	#     *(uint32 *) (x + 0) = in0
+	movl	%esi,0(%rdi)
+	#     *(uint32 *) (x + 20) = in4
+	movl	%edx,20(%rdi)
+	#     *(uint32 *) (x + 40) = in10
+	movl	%ecx,40(%rdi)
+	#     *(uint32 *) (x + 60) = in14
+	movl	%r8d,60(%rdi)
+	# comment:fp stack unchanged by jump
+	#   goto keysetupdone
+	jmp	._keysetupdone
+#   kbits128:
+._kbits128:
+	#     in10 = *(uint64 *) (k + 0)  (128-bit key: key bytes 0..15 are stored a second time)
+	movq	0(%rsi),%rdx
+	#     in12 = *(uint64 *) (k + 8)
+	movq	8(%rsi),%rsi
+	#     *(uint64 *) (x + 44) = in10
+	movq	%rdx,44(%rdi)
+	#     *(uint64 *) (x + 52) = in12
+	movq	%rsi,52(%rdi)
+	#     in0 = 1634760805  (0x61707865, bytes "expa" in memory order)
+	mov	$1634760805,%rsi
+	#     in4 = 824206446   (0x3120646e, bytes "nd 1")
+	mov	$824206446,%rdx
+	#     in10 = 2036477238 (0x79622d36, bytes "6-by")
+	mov	$2036477238,%rcx
+	#     in14 = 1797285236 (0x6b206574, bytes "te k")
+	mov	$1797285236,%r8
+	#     *(uint32 *) (x + 0) = in0
+	movl	%esi,0(%rdi)
+	#     *(uint32 *) (x + 20) = in4
+	movl	%edx,20(%rdi)
+	#     *(uint32 *) (x + 40) = in10
+	movl	%ecx,40(%rdi)
+	#     *(uint32 *) (x + 60) = in14
+	movl	%r8d,60(%rdi)
+#   keysetupdone:
+._keysetupdone:
+	# leave: release the stack area reserved on entry (%r11 still holds its size)
+	add	%r11,%rsp
+	mov	%rdi,%rax
+	mov	%rsi,%rdx
+	ret
+# ECRYPT_ivsetup(x = %rdi, iv = %rsi): store the 8-byte IV at x+24 and zero the 8 bytes at x+32
+.text
+.p2align 5
+.globl ECRYPT_ivsetup
+ECRYPT_ivsetup:
+	mov	%rsp,%r11
+	and	$31,%r11
+	add	$256,%r11
+	sub	%r11,%rsp
+	#   iv = arg2 (self-move: the argument is already in %rsi)
+	mov	%rsi,%rsi
+	#   x = arg1 (self-move: already in %rdi)
+	mov	%rdi,%rdi
+	#   in6 = *(uint64 *) (iv + 0)  (%rsi is reused: pointer in, IV value out)
+	movq	0(%rsi),%rsi
+	#   in8 = 0  (x+32 is the field ECRYPT_encrypt_bytes writes back after adding 1 per 64 bytes)
+	mov	$0,%r8
+	#   *(uint64 *) (x + 24) = in6
+	movq	%rsi,24(%rdi)
+	#   *(uint64 *) (x + 32) = in8
+	movq	%r8,32(%rdi)
+	# leave: release the stack area reserved on entry (%r11 still holds its size)
+	add	%r11,%rsp
+	mov	%rdi,%rax
+	mov	%rsi,%rdx
+	ret
diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c
new file mode 100644
index 000000000000..bccb76d80987
--- /dev/null
+++ b/arch/x86/crypto/salsa20_glue.c
@@ -0,0 +1,129 @@
+/*
+ * Glue code for the optimized assembly version of Salsa20.
+ *
+ * Copyright (c) 2007 Tan Swee Heng <thesweeheng@gmail.com>
+ *
+ * The assembly code is public domain code written by Daniel J. Bernstein
+ * <djb@cr.yp.to>. It has been modified to add indentation and to remove
+ * extraneous comments and functions that are not needed.
+ * - i586 version, renamed as salsa20-i586-asm_32.S
+ *   available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s>
+ * - x86-64 version, renamed as salsa20-x86_64-asm_64.S
+ *   available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/algapi.h>
+#include <linux/module.h>
+#include <linux/crypto.h>
+
+#define SALSA20_IV_SIZE        8U	/* bytes */
+#define SALSA20_MIN_KEY_SIZE  16U	/* bytes */
+#define SALSA20_MAX_KEY_SIZE  32U	/* bytes */
+
+// map the salsa20_* names used in this file onto the ECRYPT_* assembly symbols
+#define salsa20_keysetup        ECRYPT_keysetup
+#define salsa20_ivsetup         ECRYPT_ivsetup
+#define salsa20_encrypt_bytes   ECRYPT_encrypt_bytes
+
+struct salsa20_ctx
+{
+	u32 input[16];	/* 64-byte state filled in by the assembly routines */
+};
+
+asmlinkage void salsa20_keysetup(struct salsa20_ctx *ctx, const u8 *k,
+				 u32 keysize, u32 ivsize);
+asmlinkage void salsa20_ivsetup(struct salsa20_ctx *ctx, const u8 *iv);
+asmlinkage void salsa20_encrypt_bytes(struct salsa20_ctx *ctx,
+				      const u8 *src, u8 *dst, u32 bytes);
+
+static int setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keysize)
+{
+	/* Sizes are in bytes here; the assembly is handed bit counts. */
+	salsa20_keysetup(crypto_tfm_ctx(tfm), key,
+			 keysize * 8, SALSA20_IV_SIZE * 8);
+	return 0;
+}
+
+/*
+ * XOR src with the Salsa20 keystream into dst; also serves as decrypt.
+ * blkcipher_walk_done() is only called for chunks the walk actually handed
+ * out, so a zero-length request returns without touching unset walk state.
+ */
+static int encrypt(struct blkcipher_desc *desc,
+		   struct scatterlist *dst, struct scatterlist *src,
+		   unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+	struct crypto_blkcipher *tfm = desc->tfm;
+	struct salsa20_ctx *ctx = crypto_blkcipher_ctx(tfm);
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, 64);
+
+	salsa20_ivsetup(ctx, walk.iv);
+
+	/* Whole 64-byte blocks; the remainder is handed back to the walk. */
+	while (walk.nbytes >= 64) {
+		salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
+				      walk.dst.virt.addr,
+				      walk.nbytes - (walk.nbytes % 64));
+		err = blkcipher_walk_done(desc, &walk, walk.nbytes % 64);
+	}
+
+	/* Final chunk of fewer than 64 bytes, if the walk still holds one. */
+	if (walk.nbytes) {
+		salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
+				      walk.dst.virt.addr, walk.nbytes);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+
+	return err;
+}
+
+static struct crypto_alg alg = {
+	.cra_name		= "salsa20",
+	.cra_driver_name	= "salsa20-asm",
+	.cra_priority		= 200,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_blocksize		= 1,	/* byte granularity */
+	.cra_ctxsize		= sizeof(struct salsa20_ctx),
+	.cra_alignmask		= 3,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(alg.cra_list),
+	.cra_u			= {
+		.blkcipher = {
+			.min_keysize	= SALSA20_MIN_KEY_SIZE,
+			.max_keysize	= SALSA20_MAX_KEY_SIZE,
+			.ivsize		= SALSA20_IV_SIZE,
+			.setkey		= setkey,
+			.encrypt	= encrypt,
+			.decrypt	= encrypt,	/* same routine both ways */
+		},
+	},
+};
+
+static int __init init(void)
+{
+	return crypto_register_alg(&alg);	/* hand back the registration result */
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_alg(&alg);	/* drop the algorithm registered by init() */
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)");
+MODULE_ALIAS("salsa20");	/* same string as alg.cra_name */
+MODULE_ALIAS("salsa20-asm");	/* same string as alg.cra_driver_name */
diff --git a/arch/x86/crypto/twofish_64.c b/arch/x86/crypto/twofish_64.c
deleted file mode 100644
index 182d91d5cfb9..000000000000
--- a/arch/x86/crypto/twofish_64.c
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Glue Code for optimized x86_64 assembler version of TWOFISH
- *
- * Originally Twofish for GPG
- * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
- * 256-bit key length added March 20, 1999
- * Some modifications to reduce the text size by Werner Koch, April, 1998
- * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
- * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
- *
- * The original author has disclaimed all copyright interest in this
- * code and thus put it in the public domain. The subsequent authors
- * have put this under the GNU General Public License.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
- * USA
- *
- * This code is a "clean room" implementation, written from the paper
- * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
- * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
- * through http://www.counterpane.com/twofish.html
- *
- * For background information on multiplication in finite fields, used for
- * the matrix operations in the key schedule, see the book _Contemporary
- * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
- * Third Edition.
- */
-
-#include <crypto/twofish.h>
-#include <linux/crypto.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/types.h>
-
-asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
-asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
-
-static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
-	twofish_enc_blk(tfm, dst, src);
-}
-
-static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
-	twofish_dec_blk(tfm, dst, src);
-}
-
-static struct crypto_alg alg = {
-	.cra_name		=	"twofish",
-	.cra_driver_name	=	"twofish-x86_64",
-	.cra_priority		=	200,
-	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
-	.cra_blocksize		=	TF_BLOCK_SIZE,
-	.cra_ctxsize		=	sizeof(struct twofish_ctx),
-	.cra_alignmask		=	3,
-	.cra_module		=	THIS_MODULE,
-	.cra_list		=	LIST_HEAD_INIT(alg.cra_list),
-	.cra_u			=	{
-		.cipher = {
-			.cia_min_keysize	=	TF_MIN_KEY_SIZE,
-			.cia_max_keysize	=	TF_MAX_KEY_SIZE,
-			.cia_setkey		=	twofish_setkey,
-			.cia_encrypt		=	twofish_encrypt,
-			.cia_decrypt		=	twofish_decrypt
-		}
-	}
-};
-
-static int __init init(void)
-{
-	return crypto_register_alg(&alg);
-}
-
-static void __exit fini(void)
-{
-	crypto_unregister_alg(&alg);
-}
-
-module_init(init);
-module_exit(fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION ("Twofish Cipher Algorithm, x86_64 asm optimized");
-MODULE_ALIAS("twofish");
diff --git a/arch/x86/crypto/twofish_32.c b/arch/x86/crypto/twofish_glue.c
index e3004dfe9c7a..cefaf8b9aa18 100644
--- a/arch/x86/crypto/twofish_32.c
+++ b/arch/x86/crypto/twofish_glue.c
@@ -1,5 +1,5 @@
 /*
- *  Glue Code for optimized 586 assembler version of TWOFISH
+ * Glue Code for assembler optimized version of TWOFISH
  *
  * Originally Twofish for GPG
  * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
@@ -44,7 +44,6 @@
 #include <linux/module.h>
 #include <linux/types.h>
 
-
 asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
 asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
 
@@ -60,7 +59,7 @@ static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 
 static struct crypto_alg alg = {
 	.cra_name		=	"twofish",
-	.cra_driver_name	=	"twofish-i586",
+	.cra_driver_name	=	"twofish-asm",
 	.cra_priority		=	200,
 	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
 	.cra_blocksize		=	TF_BLOCK_SIZE,
@@ -93,5 +92,6 @@ module_init(init);
 module_exit(fini);
 
 MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION ("Twofish Cipher Algorithm, i586 asm optimized");
+MODULE_DESCRIPTION ("Twofish Cipher Algorithm, asm optimized");
 MODULE_ALIAS("twofish");
+MODULE_ALIAS("twofish-asm");