Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto updates from Herbert Xu: "Here is the crypto update for 4.8: API: - first part of skcipher low-level conversions - add KPP (Key-agreement Protocol Primitives) interface. Algorithms: - fix IPsec/cryptd reordering issues that affect aesni - RSA no longer does explicit leading zero removal - add SHA3 - add DH - add ECDH - improve DRBG performance by not doing CTR by hand Drivers: - add x86 AVX2 multibuffer SHA256/512 - add POWER8 optimised crc32c - add xts support to vmx - add DH support to qat - add RSA support to caam - add Layerscape support to caam - add SEC1 AEAD support to talitos - improve performance by chaining requests in marvell/cesa - add support for Araneus Alea I USB RNG - add support for Broadcom BCM5301 RNG - add support for Amlogic Meson RNG - add support for Broadcom NSP SoC RNG" * 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (180 commits) crypto: vmx - Fix aes_p8_xts_decrypt build failure crypto: vmx - Ignore generated files crypto: vmx - Adding support for XTS crypto: vmx - Adding asm subroutines for XTS crypto: skcipher - add comment for skcipher_alg->base crypto: testmgr - Print akcipher algorithm name crypto: marvell - Fix wrong flag used for GFP in mv_cesa_dma_add_iv_op crypto: nx - off by one bug in nx_of_update_msc() crypto: rsa-pkcs1pad - fix rsa-pkcs1pad request struct crypto: scatterwalk - Inline start/map/done crypto: scatterwalk - Remove unnecessary BUG in scatterwalk_start crypto: scatterwalk - Remove unnecessary advance in scatterwalk_pagedone crypto: scatterwalk - Fix test in scatterwalk_done crypto: api - Optimise away crypto_yield when hard preemption is on crypto: scatterwalk - add no-copy support to copychunks crypto: scatterwalk - Remove scatterwalk_bytes_sglen crypto: omap - Stop using crypto scatterwalk_bytes_sglen crypto: skcipher - Remove top-level givcipher interface crypto: user - Remove crypto_lookup_skcipher call crypto: cts - Convert to skcipher ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2016-07-26 13:40:17 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2016-07-26 13:40:17 -0700
commit: bbce2ad2d711c12d93145a7bbdf086e73f414bcd (patch)
tree: 35432a39f68f4c5df44ed38037cbf05adadb923e /arch
parent: 0f776dc377f6c87f4e4d4a5f63602f33fb93b31e (diff)
parent: 0f95e2ffc58f5d32a90eb1051d17aeebc21cf91d (diff)
47 files changed, 7582 insertions, 291 deletions
diff --git a/arch/arm/boot/dts/bcm-nsp.dtsi b/arch/arm/boot/dts/bcm-nsp.dtsi
index def9e783b5c6..1ed829e699d4 100644
--- a/arch/arm/boot/dts/bcm-nsp.dtsi
+++ b/arch/arm/boot/dts/bcm-nsp.dtsi
@@ -206,6 +206,11 @@
 			brcm,nand-has-wp;
 		};
 
+		rng: rng@33000 {
+			compatible = "brcm,bcm-nsp-rng";
+			reg = <0x33000 0x14>;
+		};
+
 		ccbtimer0: timer@34000 {
 			compatible = "arm,sp804";
 			reg = <0x34000 0x1000>;
diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c
index 03a39fe29246..1568cb5cd870 100644
--- a/arch/arm/crypto/ghash-ce-glue.c
+++ b/arch/arm/crypto/ghash-ce-glue.c
@@ -154,30 +154,23 @@ static int ghash_async_init(struct ahash_request *req)
 	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
 	struct ahash_request *cryptd_req = ahash_request_ctx(req);
 	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+	struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+	struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
 
-	if (!may_use_simd()) {
-		memcpy(cryptd_req, req, sizeof(*req));
-		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
-		return crypto_ahash_init(cryptd_req);
-	} else {
-		struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
-		struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
-
-		desc->tfm = child;
-		desc->flags = req->base.flags;
-		return crypto_shash_init(desc);
-	}
+	desc->tfm = child;
+	desc->flags = req->base.flags;
+	return crypto_shash_init(desc);
 }
 
 static int ghash_async_update(struct ahash_request *req)
 {
 	struct ahash_request *cryptd_req = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
 
-	if (!may_use_simd()) {
-		struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-		struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
-		struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
-
+	if (!may_use_simd() ||
+	    (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
 		memcpy(cryptd_req, req, sizeof(*req));
 		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
 		return crypto_ahash_update(cryptd_req);
@@ -190,12 +183,12 @@ static int ghash_async_update(struct ahash_request *req)
 static int ghash_async_final(struct ahash_request *req)
 {
 	struct ahash_request *cryptd_req = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
 
-	if (!may_use_simd()) {
-		struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-		struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
-		struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
-
+	if (!may_use_simd() ||
+	    (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
 		memcpy(cryptd_req, req, sizeof(*req));
 		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
 		return crypto_ahash_final(cryptd_req);
@@ -212,7 +205,8 @@ static int ghash_async_digest(struct ahash_request *req)
 	struct ahash_request *cryptd_req = ahash_request_ctx(req);
 	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
 
-	if (!may_use_simd()) {
+	if (!may_use_simd() ||
+	    (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
 		memcpy(cryptd_req, req, sizeof(*req));
 		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
 		return crypto_ahash_digest(cryptd_req);
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a-rdb.dts b/arch/arm64/boot/dts/freescale/fsl-ls1043a-rdb.dts
index f895fc02ab06..40846319be69 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1043a-rdb.dts
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a-rdb.dts
@@ -49,6 +49,10 @@
 
 / {
 	model = "LS1043A RDB Board";
+
+	aliases {
+		crypto = &crypto;
+	};
 };
 
 &i2c0 {
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi
index de0323b48b1e..6bd46c133010 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi
@@ -159,6 +159,49 @@
 			big-endian;
 		};
 
+		crypto: crypto@1700000 {
+			compatible = "fsl,sec-v5.4", "fsl,sec-v5.0",
+				     "fsl,sec-v4.0";
+			fsl,sec-era = <3>;
+			#address-cells = <1>;
+			#size-cells = <1>;
+			ranges = <0x0 0x00 0x1700000 0x100000>;
+			reg = <0x00 0x1700000 0x0 0x100000>;
+			interrupts = <0 75 0x4>;
+
+			sec_jr0: jr@10000 {
+				compatible = "fsl,sec-v5.4-job-ring",
+					     "fsl,sec-v5.0-job-ring",
+					     "fsl,sec-v4.0-job-ring";
+				reg	   = <0x10000 0x10000>;
+				interrupts = <0 71 0x4>;
+			};
+
+			sec_jr1: jr@20000 {
+				compatible = "fsl,sec-v5.4-job-ring",
+					     "fsl,sec-v5.0-job-ring",
+					     "fsl,sec-v4.0-job-ring";
+				reg	   = <0x20000 0x10000>;
+				interrupts = <0 72 0x4>;
+			};
+
+			sec_jr2: jr@30000 {
+				compatible = "fsl,sec-v5.4-job-ring",
+					     "fsl,sec-v5.0-job-ring",
+					     "fsl,sec-v4.0-job-ring";
+				reg	   = <0x30000 0x10000>;
+				interrupts = <0 73 0x4>;
+			};
+
+			sec_jr3: jr@40000 {
+				compatible = "fsl,sec-v5.4-job-ring",
+					     "fsl,sec-v5.0-job-ring",
+					     "fsl,sec-v4.0-job-ring";
+				reg	   = <0x40000 0x10000>;
+				interrupts = <0 74 0x4>;
+			};
+		};
+
 		dcfg: dcfg@1ee0000 {
 			compatible = "fsl,ls1043a-dcfg", "syscon";
 			reg = <0x0 0x1ee0000 0x0 0x10000>;
diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
index 44be1e03ed65..9b6e408cfa51 100644
--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -174,13 +174,15 @@ extern void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size);
 #define iounmap				__iounmap
 
 /*
- * io{read,write}{16,32}be() macros
+ * io{read,write}{16,32,64}be() macros
  */
 #define ioread16be(p)		({ __u16 __v = be16_to_cpu((__force __be16)__raw_readw(p)); __iormb(); __v; })
 #define ioread32be(p)		({ __u32 __v = be32_to_cpu((__force __be32)__raw_readl(p)); __iormb(); __v; })
+#define ioread64be(p)		({ __u64 __v = be64_to_cpu((__force __be64)__raw_readq(p)); __iormb(); __v; })
 
 #define iowrite16be(v,p)	({ __iowmb(); __raw_writew((__force __u16)cpu_to_be16(v), p); })
 #define iowrite32be(v,p)	({ __iowmb(); __raw_writel((__force __u32)cpu_to_be32(v), p); })
+#define iowrite64be(v,p)	({ __iowmb(); __raw_writeq((__force __u64)cpu_to_be64(v), p); })
 
 /*
  * Convert a physical pointer to a virtual kernel pointer for /dev/mem
diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index 9c221b69c181..7998c177f0a2 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -9,9 +9,11 @@ obj-$(CONFIG_CRYPTO_MD5_PPC) += md5-ppc.o
 obj-$(CONFIG_CRYPTO_SHA1_PPC) += sha1-powerpc.o
 obj-$(CONFIG_CRYPTO_SHA1_PPC_SPE) += sha1-ppc-spe.o
 obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o
+obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o
 
 aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
 md5-ppc-y := md5-asm.o md5-glue.o
 sha1-powerpc-y := sha1-powerpc-asm.o sha1.o
 sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o
 sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o
+crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o
diff --git a/arch/powerpc/crypto/aes-spe-regs.h b/arch/powerpc/crypto/aes-spe-regs.h
index 30d217b399c3..2cc3a2caadae 100644
--- a/arch/powerpc/crypto/aes-spe-regs.h
+++ b/arch/powerpc/crypto/aes-spe-regs.h
@@ -18,7 +18,7 @@
 #define rLN r7	/* length of data to be processed			*/
 #define rIP r8	/* potiner to IV (CBC/CTR/XTS modes)			*/
 #define rKT r9	/* pointer to tweak key (XTS mode)			*/
-#define rT0 r11	/* pointers to en-/decrpytion tables			*/
+#define rT0 r11	/* pointers to en-/decryption tables			*/
 #define rT1 r10
 #define rD0 r9	/* data 						*/
 #define rD1 r14
diff --git a/arch/powerpc/crypto/crc32c-vpmsum_asm.S b/arch/powerpc/crypto/crc32c-vpmsum_asm.S
new file mode 100644
index 000000000000..dc640b212299
--- /dev/null
+++ b/arch/powerpc/crypto/crc32c-vpmsum_asm.S
@@ -0,0 +1,1553 @@
+/*
+ * Calculate the checksum of data that is 16 byte aligned and a multiple of
+ * 16 bytes.
+ *
+ * The first step is to reduce it to 1024 bits. We do this in 8 parallel
+ * chunks in order to mask the latency of the vpmsum instructions. If we
+ * have more than 32 kB of data to checksum we repeat this step multiple
+ * times, passing in the previous 1024 bits.
+ *
+ * The next step is to reduce the 1024 bits to 64 bits. This step adds
+ * 32 bits of 0s to the end - this matches what a CRC does. We just
+ * calculate constants that land the data in this 32 bits.
+ *
+ * We then use fixed point Barrett reduction to compute a mod n over GF(2)
+ * for n = CRC using POWER8 instructions. We use x = 32.
+ *
+ * http://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/ppc_asm.h>
+#include <asm/ppc-opcode.h>
+
+	.section	.rodata
+.balign 16
+
+.byteswap_constant:
+	/* byte reverse permute constant */
+	.octa 0x0F0E0D0C0B0A09080706050403020100
+
+#define MAX_SIZE	32768
+.constants:
+
+	/* Reduce 262144 bits to 1024 bits */
+	/* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
+	.octa 0x00000000b6ca9e20000000009c37c408
+
+	/* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
+	.octa 0x00000000350249a800000001b51df26c
+
+	/* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
+	.octa 0x00000001862dac54000000000724b9d0
+
+	/* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
+	.octa 0x00000001d87fb48c00000001c00532fe
+
+	/* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
+	.octa 0x00000001f39b699e00000000f05a9362
+
+	/* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
+	.octa 0x0000000101da11b400000001e1007970
+
+	/* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
+	.octa 0x00000001cab571e000000000a57366ee
+
+	/* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
+	.octa 0x00000000c7020cfe0000000192011284
+
+	/* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
+	.octa 0x00000000cdaed1ae0000000162716d9a
+
+	/* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
+	.octa 0x00000001e804effc00000000cd97ecde
+
+	/* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
+	.octa 0x0000000077c3ea3a0000000058812bc0
+
+	/* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
+	.octa 0x0000000068df31b40000000088b8c12e
+
+	/* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
+	.octa 0x00000000b059b6c200000001230b234c
+
+	/* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
+	.octa 0x0000000145fb8ed800000001120b416e
+
+	/* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
+	.octa 0x00000000cbc0916800000001974aecb0
+
+	/* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
+	.octa 0x000000005ceeedc2000000008ee3f226
+
+	/* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
+	.octa 0x0000000047d74e8600000001089aba9a
+
+	/* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
+	.octa 0x00000001407e9e220000000065113872
+
+	/* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
+	.octa 0x00000001da967bda000000005c07ec10
+
+	/* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
+	.octa 0x000000006c8983680000000187590924
+
+	/* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
+	.octa 0x00000000f2d14c9800000000e35da7c6
+
+	/* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
+	.octa 0x00000001993c6ad4000000000415855a
+
+	/* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
+	.octa 0x000000014683d1ac0000000073617758
+
+	/* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
+	.octa 0x00000001a7c93e6c0000000176021d28
+
+	/* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
+	.octa 0x000000010211e90a00000001c358fd0a
+
+	/* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
+	.octa 0x000000001119403e00000001ff7a2c18
+
+	/* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
+	.octa 0x000000001c3261aa00000000f2d9f7e4
+
+	/* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
+	.octa 0x000000014e37a634000000016cf1f9c8
+
+	/* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
+	.octa 0x0000000073786c0c000000010af9279a
+
+	/* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
+	.octa 0x000000011dc037f80000000004f101e8
+
+	/* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
+	.octa 0x0000000031433dfc0000000070bcf184
+
+	/* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
+	.octa 0x000000009cde8348000000000a8de642
+
+	/* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
+	.octa 0x0000000038d3c2a60000000062ea130c
+
+	/* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
+	.octa 0x000000011b25f26000000001eb31cbb2
+
+	/* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
+	.octa 0x000000001629e6f00000000170783448
+
+	/* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
+	.octa 0x0000000160838b4c00000001a684b4c6
+
+	/* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
+	.octa 0x000000007a44011c00000000253ca5b4
+
+	/* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
+	.octa 0x00000000226f417a0000000057b4b1e2
+
+	/* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
+	.octa 0x0000000045eb2eb400000000b6bd084c
+
+	/* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
+	.octa 0x000000014459d70c0000000123c2d592
+
+	/* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
+	.octa 0x00000001d406ed8200000000159dafce
+
+	/* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
+	.octa 0x0000000160c8e1a80000000127e1a64e
+
+	/* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
+	.octa 0x0000000027ba80980000000056860754
+
+	/* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
+	.octa 0x000000006d92d01800000001e661aae8
+
+	/* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
+	.octa 0x000000012ed7e3f200000000f82c6166
+
+	/* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
+	.octa 0x000000002dc8778800000000c4f9c7ae
+
+	/* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
+	.octa 0x0000000018240bb80000000074203d20
+
+	/* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
+	.octa 0x000000001ad381580000000198173052
+
+	/* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
+	.octa 0x00000001396b78f200000001ce8aba54
+
+	/* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
+	.octa 0x000000011a68133400000001850d5d94
+
+	/* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
+	.octa 0x000000012104732e00000001d609239c
+
+	/* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
+	.octa 0x00000000a140d90c000000001595f048
+
+	/* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
+	.octa 0x00000001b7215eda0000000042ccee08
+
+	/* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
+	.octa 0x00000001aaf1df3c000000010a389d74
+
+	/* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
+	.octa 0x0000000029d15b8a000000012a840da6
+
+	/* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
+	.octa 0x00000000f1a96922000000001d181c0c
+
+	/* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
+	.octa 0x00000001ac80d03c0000000068b7d1f6
+
+	/* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
+	.octa 0x000000000f11d56a000000005b0f14fc
+
+	/* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
+	.octa 0x00000001f1c022a20000000179e9e730
+
+	/* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
+	.octa 0x0000000173d00ae200000001ce1368d6
+
+	/* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
+	.octa 0x00000001d4ffe4ac0000000112c3a84c
+
+	/* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
+	.octa 0x000000016edc5ae400000000de940fee
+
+	/* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
+	.octa 0x00000001f1a0214000000000fe896b7e
+
+	/* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
+	.octa 0x00000000ca0b28a000000001f797431c
+
+	/* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
+	.octa 0x00000001928e30a20000000053e989ba
+
+	/* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
+	.octa 0x0000000097b1b002000000003920cd16
+
+	/* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
+	.octa 0x00000000b15bf90600000001e6f579b8
+
+	/* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
+	.octa 0x00000000411c5d52000000007493cb0a
+
+	/* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
+	.octa 0x00000001c36f330000000001bdd376d8
+
+	/* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
+	.octa 0x00000001119227e0000000016badfee6
+
+	/* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
+	.octa 0x00000000114d47020000000071de5c58
+
+	/* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
+	.octa 0x00000000458b5b9800000000453f317c
+
+	/* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
+	.octa 0x000000012e31fb8e0000000121675cce
+
+	/* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
+	.octa 0x000000005cf619d800000001f409ee92
+
+	/* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
+	.octa 0x0000000063f4d8b200000000f36b9c88
+
+	/* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
+	.octa 0x000000004138dc8a0000000036b398f4
+
+	/* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
+	.octa 0x00000001d29ee8e000000001748f9adc
+
+	/* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
+	.octa 0x000000006a08ace800000001be94ec00
+
+	/* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
+	.octa 0x0000000127d4201000000000b74370d6
+
+	/* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
+	.octa 0x0000000019d76b6200000001174d0b98
+
+	/* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
+	.octa 0x00000001b1471f6e00000000befc06a4
+
+	/* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
+	.octa 0x00000001f64c19cc00000001ae125288
+
+	/* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
+	.octa 0x00000000003c0ea00000000095c19b34
+
+	/* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
+	.octa 0x000000014d73abf600000001a78496f2
+
+	/* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
+	.octa 0x00000001620eb84400000001ac5390a0
+
+	/* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
+	.octa 0x0000000147655048000000002a80ed6e
+
+	/* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
+	.octa 0x0000000067b5077e00000001fa9b0128
+
+	/* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
+	.octa 0x0000000010ffe20600000001ea94929e
+
+	/* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
+	.octa 0x000000000fee8f1e0000000125f4305c
+
+	/* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
+	.octa 0x00000001da26fbae00000001471e2002
+
+	/* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
+	.octa 0x00000001b3a8bd880000000132d2253a
+
+	/* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
+	.octa 0x00000000e8f3898e00000000f26b3592
+
+	/* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
+	.octa 0x00000000b0d0d28c00000000bc8b67b0
+
+	/* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
+	.octa 0x0000000030f2a798000000013a826ef2
+
+	/* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
+	.octa 0x000000000fba10020000000081482c84
+
+	/* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
+	.octa 0x00000000bdb9bd7200000000e77307c2
+
+	/* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
+	.octa 0x0000000075d3bf5a00000000d4a07ec8
+
+	/* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
+	.octa 0x00000000ef1f98a00000000017102100
+
+	/* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
+	.octa 0x00000000689c760200000000db406486
+
+	/* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
+	.octa 0x000000016d5fa5fe0000000192db7f88
+
+	/* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
+	.octa 0x00000001d0d2b9ca000000018bf67b1e
+
+	/* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
+	.octa 0x0000000041e7b470000000007c09163e
+
+	/* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
+	.octa 0x00000001cbb6495e000000000adac060
+
+	/* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
+	.octa 0x000000010052a0b000000000bd8316ae
+
+	/* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
+	.octa 0x00000001d8effb5c000000019f09ab54
+
+	/* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
+	.octa 0x00000001d969853c0000000125155542
+
+	/* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
+	.octa 0x00000000523ccce2000000018fdb5882
+
+	/* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
+	.octa 0x000000001e2436bc00000000e794b3f4
+
+	/* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
+	.octa 0x00000000ddd1c3a2000000016f9bb022
+
+	/* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
+	.octa 0x0000000019fcfe3800000000290c9978
+
+	/* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
+	.octa 0x00000001ce95db640000000083c0f350
+
+	/* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
+	.octa 0x00000000af5828060000000173ea6628
+
+	/* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
+	.octa 0x00000001006388f600000001c8b4e00a
+
+	/* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
+	.octa 0x0000000179eca00a00000000de95d6aa
+
+	/* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
+	.octa 0x0000000122410a6a000000010b7f7248
+
+	/* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
+	.octa 0x000000004288e87c00000001326e3a06
+
+	/* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
+	.octa 0x000000016c5490da00000000bb62c2e6
+
+	/* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
+	.octa 0x00000000d1c71f6e0000000156a4b2c2
+
+	/* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
+	.octa 0x00000001b4ce08a6000000011dfe763a
+
+	/* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
+	.octa 0x00000001466ba60c000000007bcca8e2
+
+	/* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
+	.octa 0x00000001f6c488a40000000186118faa
+
+	/* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
+	.octa 0x000000013bfb06820000000111a65a88
+
+	/* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
+	.octa 0x00000000690e9e54000000003565e1c4
+
+	/* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
+	.octa 0x00000000281346b6000000012ed02a82
+
+	/* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
+	.octa 0x000000015646402400000000c486ecfc
+
+	/* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
+	.octa 0x000000016063a8dc0000000001b951b2
+
+	/* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
+	.octa 0x0000000116a663620000000048143916
+
+	/* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
+	.octa 0x000000017e8aa4d200000001dc2ae124
+
+	/* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
+	.octa 0x00000001728eb10c00000001416c58d6
+
+	/* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
+	.octa 0x00000001b08fd7fa00000000a479744a
+
+	/* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
+	.octa 0x00000001092a16e80000000096ca3a26
+
+	/* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
+	.octa 0x00000000a505637c00000000ff223d4e
+
+	/* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
+	.octa 0x00000000d94869b2000000010e84da42
+
+	/* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
+	.octa 0x00000001c8b203ae00000001b61ba3d0
+
+	/* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
+	.octa 0x000000005704aea000000000680f2de8
+
+	/* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
+	.octa 0x000000012e295fa2000000008772a9a8
+
+	/* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
+	.octa 0x000000011d0908bc0000000155f295bc
+
+	/* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
+	.octa 0x0000000193ed97ea00000000595f9282
+
+	/* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
+	.octa 0x000000013a0f1c520000000164b1c25a
+
+	/* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
+	.octa 0x000000010c2c40c000000000fbd67c50
+
+	/* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
+	.octa 0x00000000ff6fac3e0000000096076268
+
+	/* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
+	.octa 0x000000017b3609c000000001d288e4cc
+
+	/* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
+	.octa 0x0000000088c8c92200000001eaac1bdc
+
+	/* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
+	.octa 0x00000001751baae600000001f1ea39e2
+
+	/* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
+	.octa 0x000000010795297200000001eb6506fc
+
+	/* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
+	.octa 0x0000000162b00abe000000010f806ffe
+
+	/* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
+	.octa 0x000000000d7b404c000000010408481e
+
+	/* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
+	.octa 0x00000000763b13d40000000188260534
+
+	/* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
+	.octa 0x00000000f6dc22d80000000058fc73e0
+
+	/* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
+	.octa 0x000000007daae06000000000391c59b8
+
+	/* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
+	.octa 0x000000013359ab7c000000018b638400
+
+	/* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
+	.octa 0x000000008add438a000000011738f5c4
+
+	/* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
+	.octa 0x00000001edbefdea000000008cf7c6da
+
+	/* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
+	.octa 0x000000004104e0f800000001ef97fb16
+
+	/* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
+	.octa 0x00000000b48a82220000000102130e20
+
+	/* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
+	.octa 0x00000001bcb4684400000000db968898
+
+	/* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
+	.octa 0x000000013293ce0a00000000b5047b5e
+
+	/* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
+	.octa 0x00000001710d0844000000010b90fdb2
+
+	/* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
+	.octa 0x0000000117907f6e000000004834a32e
+
+	/* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
+	.octa 0x0000000087ddf93e0000000059c8f2b0
+
+	/* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
+	.octa 0x000000005970e9b00000000122cec508
+
+	/* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
+	.octa 0x0000000185b2b7d0000000000a330cda
+
+	/* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
+	.octa 0x00000001dcee0efc000000014a47148c
+
+	/* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
+	.octa 0x0000000030da27220000000042c61cb8
+
+	/* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
+	.octa 0x000000012f925a180000000012fe6960
+
+	/* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
+	.octa 0x00000000dd2e357c00000000dbda2c20
+
+	/* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
+	.octa 0x00000000071c80de000000011122410c
+
+	/* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
+	.octa 0x000000011513140a00000000977b2070
+
+	/* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
+	.octa 0x00000001df876e8e000000014050438e
+
+	/* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
+	.octa 0x000000015f81d6ce0000000147c840e8
+
+	/* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
+	.octa 0x000000019dd94dbe00000001cc7c88ce
+
+	/* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
+	.octa 0x00000001373d206e00000001476b35a4
+
+	/* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
+	.octa 0x00000000668ccade000000013d52d508
+
+	/* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
+	.octa 0x00000001b192d268000000008e4be32e
+
+	/* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
+	.octa 0x00000000e30f3a7800000000024120fe
+
+	/* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
+	.octa 0x000000010ef1f7bc00000000ddecddb4
+
+	/* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
+	.octa 0x00000001f5ac738000000000d4d403bc
+
+	/* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
+	.octa 0x000000011822ea7000000001734b89aa
+
+	/* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
+	.octa 0x00000000c3a33848000000010e7a58d6
+
+	/* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
+	.octa 0x00000001bd151c2400000001f9f04e9c
+
+	/* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
+	.octa 0x0000000056002d7600000000b692225e
+
+	/* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
+	.octa 0x000000014657c4f4000000019b8d3f3e
+
+	/* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
+	.octa 0x0000000113742d7c00000001a874f11e
+
+	/* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
+	.octa 0x000000019c5920ba000000010d5a4254
+
+	/* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
+	.octa 0x000000005216d2d600000000bbb2f5d6
+
+	/* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
+	.octa 0x0000000136f5ad8a0000000179cc0e36
+
+	/* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
+	.octa 0x000000018b07beb600000001dca1da4a
+
+	/* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
+	.octa 0x00000000db1e93b000000000feb1a192
+
+	/* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
+	.octa 0x000000000b96fa3a00000000d1eeedd6
+
+	/* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
+	.octa 0x00000001d9968af0000000008fad9bb4
+
+	/* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
+	.octa 0x000000000e4a77a200000001884938e4
+
+	/* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
+	.octa 0x00000000508c2ac800000001bc2e9bc0
+
+	/* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
+	.octa 0x0000000021572a8000000001f9658a68
+
+	/* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
+	.octa 0x00000001b859daf2000000001b9224fc
+
+	/* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
+	.octa 0x000000016f7884740000000055b2fb84
+
+	/* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
+	.octa 0x00000001b438810e000000018b090348
+
+	/* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
+	.octa 0x0000000095ddc6f2000000011ccbd5ea
+
+	/* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
+	.octa 0x00000001d977c20c0000000007ae47f8
+
+	/* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
+	.octa 0x00000000ebedb99a0000000172acbec0
+
+	/* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
+	.octa 0x00000001df9e9e9200000001c6e3ff20
+
+	/* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
+	.octa 0x00000001a4a3f95200000000e1b38744
+
+	/* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
+	.octa 0x00000000e2f5122000000000791585b2
+
+	/* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
+	.octa 0x000000004aa01f3e00000000ac53b894
+
+	/* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
+	.octa 0x00000000b3e90a5800000001ed5f2cf4
+
+	/* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
+	.octa 0x000000000c9ca2aa00000001df48b2e0
+
+	/* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
+	.octa 0x000000015168231600000000049c1c62
+
+	/* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
+	.octa 0x0000000036fce78c000000017c460c12
+
+	/* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
+	.octa 0x000000009037dc10000000015be4da7e
+
+	/* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
+	.octa 0x00000000d3298582000000010f38f668
+
+	/* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
+	.octa 0x00000001b42e8ad60000000039f40a00
+
+	/* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
+	.octa 0x00000000142a983800000000bd4c10c4
+
+	/* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
+	.octa 0x0000000109c7f1900000000042db1d98
+
+	/* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
+	.octa 0x0000000056ff931000000001c905bae6
+
+	/* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
+	.octa 0x00000001594513aa00000000069d40ea
+
+	/* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
+	.octa 0x00000001e3b5b1e8000000008e4fbad0
+
+	/* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
+	.octa 0x000000011dd5fc080000000047bedd46
+
+	/* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
+	.octa 0x00000001675f0cc20000000026396bf8
+
+	/* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
+	.octa 0x00000000d1c8dd4400000000379beb92
+
+	/* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
+	.octa 0x0000000115ebd3d8000000000abae54a
+
+	/* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
+	.octa 0x00000001ecbd0dac0000000007e6a128
+
+	/* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
+	.octa 0x00000000cdf67af2000000000ade29d2
+
+	/* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
+	.octa 0x000000004c01ff4c00000000f974c45c
+
+	/* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
+	.octa 0x00000000f2d8657e00000000e77ac60a
+
+	/* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
+	.octa 0x000000006bae74c40000000145895816
+
+	/* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
+	.octa 0x0000000152af8aa00000000038e362be
+
+	/* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
+	.octa 0x0000000004663802000000007f991a64
+
+	/* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
+	.octa 0x00000001ab2f5afc00000000fa366d3a
+
+	/* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
+	.octa 0x0000000074a4ebd400000001a2bb34f0
+
+	/* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
+	.octa 0x00000001d7ab3a4c0000000028a9981e
+
+	/* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
+	.octa 0x00000001a8da60c600000001dbc672be
+
+	/* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
+	.octa 0x000000013cf6382000000000b04d77f6
+
+	/* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
+	.octa 0x00000000bec12e1e0000000124400d96
+
+	/* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
+	.octa 0x00000001c6368010000000014ca4b414
+
+	/* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
+	.octa 0x00000001e6e78758000000012fe2c938
+
+	/* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
+	.octa 0x000000008d7f2b3c00000001faed01e6
+
+	/* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
+	.octa 0x000000016b4a156e000000007e80ecfe
+
+	/* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
+	.octa 0x00000001c63cfeb60000000098daee94
+
+	/* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
+	.octa 0x000000015f902670000000010a04edea
+
+	/* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
+	.octa 0x00000001cd5de11e00000001c00b4524
+
+	/* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
+	.octa 0x000000001acaec540000000170296550
+
+	/* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
+	.octa 0x000000002bd0ca780000000181afaa48
+
+	/* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
+	.octa 0x0000000032d63d5c0000000185a31ffa
+
+	/* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
+	.octa 0x000000001c6d4e4c000000002469f608
+
+	/* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
+	.octa 0x0000000106a60b92000000006980102a
+
+	/* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
+	.octa 0x00000000d3855e120000000111ea9ca8
+
+	/* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
+	.octa 0x00000000e312563600000001bd1d29ce
+
+	/* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
+	.octa 0x000000009e8f7ea400000001b34b9580
+
+	/* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
+	.octa 0x00000001c82e562c000000003076054e
+
+	/* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
+	.octa 0x00000000ca9f09ce000000012a608ea4
+
+	/* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
+	.octa 0x00000000c63764e600000000784d05fe
+
+	/* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
+	.octa 0x0000000168d2e49e000000016ef0d82a
+
+	/* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
+	.octa 0x00000000e986c1480000000075bda454
+
+	/* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
+	.octa 0x00000000cfb65894000000003dc0a1c4
+
+	/* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
+	.octa 0x0000000111cadee400000000e9a5d8be
+
+	/* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
+	.octa 0x0000000171fb63ce00000001609bc4b4
+
+.short_constants:
+
+	/* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
+	/* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */
+	.octa 0x7fec2963e5bf80485cf015c388e56f72
+
+	/* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */
+	.octa 0x38e888d4844752a9963a18920246e2e6
+
+	/* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */
+	.octa 0x42316c00730206ad419a441956993a31
+
+	/* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */
+	.octa 0x543d5c543e65ddf9924752ba2b830011
+
+	/* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */
+	.octa 0x78e87aaf56767c9255bd7f9518e4a304
+
+	/* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */
+	.octa 0x8f68fcec1903da7f6d76739fe0553f1e
+
+	/* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */
+	.octa 0x3f4840246791d588c133722b1fe0b5c3
+
+	/* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */
+	.octa 0x34c96751b04de25a64b67ee0e55ef1f3
+
+	/* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */
+	.octa 0x156c8e180b4a395b069db049b8fdb1e7
+
+	/* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */
+	.octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e
+
+	/* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */
+	.octa 0x041d37768cd75659817cdc5119b29a35
+
+	/* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */
+	.octa 0x3a0777818cfaa9651ce9d94b36c41f1c
+
+	/* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */
+	.octa 0x0e148e8252377a554f256efcb82be955
+
+	/* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */
+	.octa 0x9c25531d19e65ddeec1631edb2dea967
+
+	/* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */
+	.octa 0x790606ff9957c0a65d27e147510ac59a
+
+	/* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */
+	.octa 0x82f63b786ea2d55ca66805eb18b8ea18
+
+
+.barrett_constants:
+	/* 33 bit reflected Barrett constant m - (4^32)/n */
+	.octa 0x000000000000000000000000dea713f1	/* x^64 div p(x)` */
+	/* 33 bit reflected Barrett constant n */
+	.octa 0x00000000000000000000000105ec76f1
+
+	.text
+
+#if defined(__BIG_ENDIAN__)
+#define BYTESWAP_DATA
+#else
+#undef BYTESWAP_DATA
+#endif
+
+#define off16		r25
+#define off32		r26
+#define off48		r27
+#define off64		r28
+#define off80		r29
+#define off96		r30
+#define off112		r31
+
+#define const1		v24
+#define const2		v25
+
+#define byteswap	v26
+#define	mask_32bit	v27
+#define	mask_64bit	v28
+#define zeroes		v29
+
+#ifdef BYTESWAP_DATA
+#define VPERM(A, B, C, D) vperm	A, B, C, D
+#else
+#define VPERM(A, B, C, D)
+#endif
+
+/* unsigned int __crc32c_vpmsum(unsigned int crc, void *p, unsigned long len) */
+FUNC_START(__crc32c_vpmsum)
+	std	r31,-8(r1)
+	std	r30,-16(r1)
+	std	r29,-24(r1)
+	std	r28,-32(r1)
+	std	r27,-40(r1)
+	std	r26,-48(r1)
+	std	r25,-56(r1)
+
+	li	off16,16
+	li	off32,32
+	li	off48,48
+	li	off64,64
+	li	off80,80
+	li	off96,96
+	li	off112,112
+	li	r0,0
+
+	/* Enough room for saving 10 non volatile VMX registers */
+	subi	r6,r1,56+10*16
+	subi	r7,r1,56+2*16
+
+	stvx	v20,0,r6
+	stvx	v21,off16,r6
+	stvx	v22,off32,r6
+	stvx	v23,off48,r6
+	stvx	v24,off64,r6
+	stvx	v25,off80,r6
+	stvx	v26,off96,r6
+	stvx	v27,off112,r6
+	stvx	v28,0,r7
+	stvx	v29,off16,r7
+
+	mr	r10,r3
+
+	vxor	zeroes,zeroes,zeroes
+	vspltisw v0,-1
+
+	vsldoi	mask_32bit,zeroes,v0,4
+	vsldoi	mask_64bit,zeroes,v0,8
+
+	/* Get the initial value into v8 */
+	vxor	v8,v8,v8
+	MTVRD(v8, R3)
+	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
+
+#ifdef BYTESWAP_DATA
+	addis	r3,r2,.byteswap_constant@toc@ha
+	addi	r3,r3,.byteswap_constant@toc@l
+
+	lvx	byteswap,0,r3
+	addi	r3,r3,16
+#endif
+
+	cmpdi	r5,256
+	blt	.Lshort
+
+	rldicr	r6,r5,0,56
+
+	/* Checksum in blocks of MAX_SIZE */
+1:	lis	r7,MAX_SIZE@h
+	ori	r7,r7,MAX_SIZE@l
+	mr	r9,r7
+	cmpd	r6,r7
+	bgt	2f
+	mr	r7,r6
+2:	subf	r6,r7,r6
+
+	/* our main loop does 128 bytes at a time */
+	srdi	r7,r7,7
+
+	/*
+	 * Work out the offset into the constants table to start at. Each
+	 * constant is 16 bytes, and it is used against 128 bytes of input
+	 * data - 128 / 16 = 8
+	 */
+	sldi	r8,r7,4
+	srdi	r9,r9,3
+	subf	r8,r8,r9
+
+	/* We reduce our final 128 bytes in a separate step */
+	addi	r7,r7,-1
+	mtctr	r7
+
+	addis	r3,r2,.constants@toc@ha
+	addi	r3,r3,.constants@toc@l
+
+	/* Find the start of our constants */
+	add	r3,r3,r8
+
+	/* zero v0-v7 which will contain our checksums */
+	vxor	v0,v0,v0
+	vxor	v1,v1,v1
+	vxor	v2,v2,v2
+	vxor	v3,v3,v3
+	vxor	v4,v4,v4
+	vxor	v5,v5,v5
+	vxor	v6,v6,v6
+	vxor	v7,v7,v7
+
+	lvx	const1,0,r3
+
+	/*
+	 * If we are looping back to consume more data we use the values
+	 * already in v16-v23.
+	 */
+	cmpdi	r0,1
+	beq	2f
+
+	/* First warm up pass */
+	lvx	v16,0,r4
+	lvx	v17,off16,r4
+	VPERM(v16,v16,v16,byteswap)
+	VPERM(v17,v17,v17,byteswap)
+	lvx	v18,off32,r4
+	lvx	v19,off48,r4
+	VPERM(v18,v18,v18,byteswap)
+	VPERM(v19,v19,v19,byteswap)
+	lvx	v20,off64,r4
+	lvx	v21,off80,r4
+	VPERM(v20,v20,v20,byteswap)
+	VPERM(v21,v21,v21,byteswap)
+	lvx	v22,off96,r4
+	lvx	v23,off112,r4
+	VPERM(v22,v22,v22,byteswap)
+	VPERM(v23,v23,v23,byteswap)
+	addi	r4,r4,8*16
+
+	/* xor in initial value */
+	vxor	v16,v16,v8
+
+2:	bdz	.Lfirst_warm_up_done
+
+	addi	r3,r3,16
+	lvx	const2,0,r3
+
+	/* Second warm up pass */
+	VPMSUMD(v8,v16,const1)
+	lvx	v16,0,r4
+	VPERM(v16,v16,v16,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v9,v17,const1)
+	lvx	v17,off16,r4
+	VPERM(v17,v17,v17,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v10,v18,const1)
+	lvx	v18,off32,r4
+	VPERM(v18,v18,v18,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v11,v19,const1)
+	lvx	v19,off48,r4
+	VPERM(v19,v19,v19,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v12,v20,const1)
+	lvx	v20,off64,r4
+	VPERM(v20,v20,v20,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v13,v21,const1)
+	lvx	v21,off80,r4
+	VPERM(v21,v21,v21,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v14,v22,const1)
+	lvx	v22,off96,r4
+	VPERM(v22,v22,v22,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v15,v23,const1)
+	lvx	v23,off112,r4
+	VPERM(v23,v23,v23,byteswap)
+
+	addi	r4,r4,8*16
+
+	bdz	.Lfirst_cool_down
+
+	/*
+	 * main loop. We modulo schedule it such that it takes three iterations
+	 * to complete - first iteration load, second iteration vpmsum, third
+	 * iteration xor.
+	 */
+	.balign	16
+4:	lvx	const1,0,r3
+	addi	r3,r3,16
+	ori	r2,r2,0
+
+	vxor	v0,v0,v8
+	VPMSUMD(v8,v16,const2)
+	lvx	v16,0,r4
+	VPERM(v16,v16,v16,byteswap)
+	ori	r2,r2,0
+
+	vxor	v1,v1,v9
+	VPMSUMD(v9,v17,const2)
+	lvx	v17,off16,r4
+	VPERM(v17,v17,v17,byteswap)
+	ori	r2,r2,0
+
+	vxor	v2,v2,v10
+	VPMSUMD(v10,v18,const2)
+	lvx	v18,off32,r4
+	VPERM(v18,v18,v18,byteswap)
+	ori	r2,r2,0
+
+	vxor	v3,v3,v11
+	VPMSUMD(v11,v19,const2)
+	lvx	v19,off48,r4
+	VPERM(v19,v19,v19,byteswap)
+	lvx	const2,0,r3
+	ori	r2,r2,0
+
+	vxor	v4,v4,v12
+	VPMSUMD(v12,v20,const1)
+	lvx	v20,off64,r4
+	VPERM(v20,v20,v20,byteswap)
+	ori	r2,r2,0
+
+	vxor	v5,v5,v13
+	VPMSUMD(v13,v21,const1)
+	lvx	v21,off80,r4
+	VPERM(v21,v21,v21,byteswap)
+	ori	r2,r2,0
+
+	vxor	v6,v6,v14
+	VPMSUMD(v14,v22,const1)
+	lvx	v22,off96,r4
+	VPERM(v22,v22,v22,byteswap)
+	ori	r2,r2,0
+
+	vxor	v7,v7,v15
+	VPMSUMD(v15,v23,const1)
+	lvx	v23,off112,r4
+	VPERM(v23,v23,v23,byteswap)
+
+	addi	r4,r4,8*16
+
+	bdnz	4b
+
+.Lfirst_cool_down:
+	/* First cool down pass */
+	lvx	const1,0,r3
+	addi	r3,r3,16
+
+	vxor	v0,v0,v8
+	VPMSUMD(v8,v16,const1)
+	ori	r2,r2,0
+
+	vxor	v1,v1,v9
+	VPMSUMD(v9,v17,const1)
+	ori	r2,r2,0
+
+	vxor	v2,v2,v10
+	VPMSUMD(v10,v18,const1)
+	ori	r2,r2,0
+
+	vxor	v3,v3,v11
+	VPMSUMD(v11,v19,const1)
+	ori	r2,r2,0
+
+	vxor	v4,v4,v12
+	VPMSUMD(v12,v20,const1)
+	ori	r2,r2,0
+
+	vxor	v5,v5,v13
+	VPMSUMD(v13,v21,const1)
+	ori	r2,r2,0
+
+	vxor	v6,v6,v14
+	VPMSUMD(v14,v22,const1)
+	ori	r2,r2,0
+
+	vxor	v7,v7,v15
+	VPMSUMD(v15,v23,const1)
+	ori	r2,r2,0
+
+.Lsecond_cool_down:
+	/* Second cool down pass */
+	vxor	v0,v0,v8
+	vxor	v1,v1,v9
+	vxor	v2,v2,v10
+	vxor	v3,v3,v11
+	vxor	v4,v4,v12
+	vxor	v5,v5,v13
+	vxor	v6,v6,v14
+	vxor	v7,v7,v15
+
+	/*
+	 * vpmsumd produces a 96 bit result in the least significant bits
+	 * of the register. Since we are bit reflected we have to shift it
+	 * left 32 bits so it occupies the least significant bits in the
+	 * bit reflected domain.
+	 */
+	vsldoi	v0,v0,zeroes,4
+	vsldoi	v1,v1,zeroes,4
+	vsldoi	v2,v2,zeroes,4
+	vsldoi	v3,v3,zeroes,4
+	vsldoi	v4,v4,zeroes,4
+	vsldoi	v5,v5,zeroes,4
+	vsldoi	v6,v6,zeroes,4
+	vsldoi	v7,v7,zeroes,4
+
+	/* xor with last 1024 bits */
+	lvx	v8,0,r4
+	lvx	v9,off16,r4
+	VPERM(v8,v8,v8,byteswap)
+	VPERM(v9,v9,v9,byteswap)
+	lvx	v10,off32,r4
+	lvx	v11,off48,r4
+	VPERM(v10,v10,v10,byteswap)
+	VPERM(v11,v11,v11,byteswap)
+	lvx	v12,off64,r4
+	lvx	v13,off80,r4
+	VPERM(v12,v12,v12,byteswap)
+	VPERM(v13,v13,v13,byteswap)
+	lvx	v14,off96,r4
+	lvx	v15,off112,r4
+	VPERM(v14,v14,v14,byteswap)
+	VPERM(v15,v15,v15,byteswap)
+
+	addi	r4,r4,8*16
+
+	vxor	v16,v0,v8
+	vxor	v17,v1,v9
+	vxor	v18,v2,v10
+	vxor	v19,v3,v11
+	vxor	v20,v4,v12
+	vxor	v21,v5,v13
+	vxor	v22,v6,v14
+	vxor	v23,v7,v15
+
+	li	r0,1
+	cmpdi	r6,0
+	addi	r6,r6,128
+	bne	1b
+
+	/* Work out how many bytes we have left */
+	andi.	r5,r5,127
+
+	/* Calculate where in the constant table we need to start */
+	subfic	r6,r5,128
+	add	r3,r3,r6
+
+	/* How many 16 byte chunks are in the tail */
+	srdi	r7,r5,4
+	mtctr	r7
+
+	/*
+	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
+	 * 32 bits to include the trailing 32 bits of zeros
+	 */
+	lvx	v0,0,r3
+	lvx	v1,off16,r3
+	lvx	v2,off32,r3
+	lvx	v3,off48,r3
+	lvx	v4,off64,r3
+	lvx	v5,off80,r3
+	lvx	v6,off96,r3
+	lvx	v7,off112,r3
+	addi	r3,r3,8*16
+
+	VPMSUMW(v0,v16,v0)
+	VPMSUMW(v1,v17,v1)
+	VPMSUMW(v2,v18,v2)
+	VPMSUMW(v3,v19,v3)
+	VPMSUMW(v4,v20,v4)
+	VPMSUMW(v5,v21,v5)
+	VPMSUMW(v6,v22,v6)
+	VPMSUMW(v7,v23,v7)
+
+	/* Now reduce the tail (0 - 112 bytes) */
+	cmpdi	r7,0
+	beq	1f
+
+	lvx	v16,0,r4
+	lvx	v17,0,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off16,r4
+	lvx	v17,off16,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off32,r4
+	lvx	v17,off32,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off48,r4
+	lvx	v17,off48,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off64,r4
+	lvx	v17,off64,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off80,r4
+	lvx	v17,off80,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off96,r4
+	lvx	v17,off96,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+
+	/* Now xor all the parallel chunks together */
+1:	vxor	v0,v0,v1
+	vxor	v2,v2,v3
+	vxor	v4,v4,v5
+	vxor	v6,v6,v7
+
+	vxor	v0,v0,v2
+	vxor	v4,v4,v6
+
+	vxor	v0,v0,v4
+
+.Lbarrett_reduction:
+	/* Barrett constants: m at offset 0, n at offset 16 of the table */
+	addis	r3,r2,.barrett_constants@toc@ha
+	addi	r3,r3,.barrett_constants@toc@l
+
+	lvx	const1,0,r3		/* m */
+	lvx	const2,off16,r3		/* n */
+
+	vsldoi	v1,v0,v0,8		/* v1 = v0 with its 64 bit halves swapped */
+	vxor	v0,v0,v1		/* xor two 64 bit results together */
+
+	/* shift left one bit */
+	vspltisb v1,1
+	vsl	v0,v0,v1
+
+	vand	v0,v0,mask_64bit	/* keep only the bottom 64 bits */
+
+	/*
+	 * The reflected version of Barrett reduction. Instead of bit
+	 * reflecting our data (which is expensive to do), we bit reflect our
+	 * constants and our algorithm, which means the intermediate data in
+	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
+	 * the algorithm because we don't carry in mod 2 arithmetic.
+	 */
+	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
+	VPMSUMD(v1,v1,const1)		/* ma */
+	vand	v1,v1,mask_32bit	/* bottom 32 bits of ma */
+	VPMSUMD(v1,v1,const2)		/* qn */
+	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */
+
+	/*
+	 * Since we are bit reflected, the result (ie the low 32 bits) is in
+	 * the high 32 bits. We just need to shift it left 4 bytes:
+	 * V0 [ 0 1 X 3 ]	(before, X is the result word)
+	 * V0 [ 0 X 2 3 ]	(after)
+	 */
+	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of v0 */
+
+	/* Get it into r3 */
+	MFVRD(R3, v0)
+
+.Lout:				/* common exit: undo the prologue's saves */
+	subi	r6,r1,56+10*16	/* same two save-area bases as on entry */
+	subi	r7,r1,56+2*16
+
+	lvx	v20,0,r6	/* vectors first: offNN index regs are r25-r31 */
+	lvx	v21,off16,r6
+	lvx	v22,off32,r6
+	lvx	v23,off48,r6
+	lvx	v24,off64,r6
+	lvx	v25,off80,r6
+	lvx	v26,off96,r6
+	lvx	v27,off112,r6
+	lvx	v28,0,r7
+	lvx	v29,off16,r7
+
+	ld	r31,-8(r1)	/* offsets no longer needed, reload r25-r31 */
+	ld	r30,-16(r1)
+	ld	r29,-24(r1)
+	ld	r28,-32(r1)
+	ld	r27,-40(r1)
+	ld	r26,-48(r1)
+	ld	r25,-56(r1)
+
+	blr			/* crc is returned in r3 */
+
+.Lfirst_warm_up_done:	/* entered via bdz: CTR ran out after warm up pass one */
+	lvx	const1,0,r3	/* constant for the data already in v16-v23 */
+	addi	r3,r3,16
+
+	VPMSUMD(v8,v16,const1)	/* products go to v8-v15 ... */
+	VPMSUMD(v9,v17,const1)
+	VPMSUMD(v10,v18,const1)
+	VPMSUMD(v11,v19,const1)
+	VPMSUMD(v12,v20,const1)
+	VPMSUMD(v13,v21,const1)
+	VPMSUMD(v14,v22,const1)
+	VPMSUMD(v15,v23,const1)
+
+	b	.Lsecond_cool_down	/* ... where they are xored into v0-v7 */
+
+.Lshort:
+	cmpdi	r5,0
+	beq	.Lzero
+
+	addis	r3,r2,.short_constants@toc@ha
+	addi	r3,r3,.short_constants@toc@l
+
+	/* Calculate where in the constant table we need to start */
+	subfic	r6,r5,256
+	add	r3,r3,r6
+
+	/* How many 16 byte chunks? */
+	srdi	r7,r5,4
+	mtctr	r7
+
+	vxor	v19,v19,v19
+	vxor	v20,v20,v20
+
+	lvx	v0,0,r4
+	lvx	v16,0,r3
+	VPERM(v0,v0,v16,byteswap)
+	vxor	v0,v0,v8	/* xor in initial value */
+	VPMSUMW(v0,v0,v16)
+	bdz	.Lv0
+
+	lvx	v1,off16,r4
+	lvx	v17,off16,r3
+	VPERM(v1,v1,v17,byteswap)
+	VPMSUMW(v1,v1,v17)
+	bdz	.Lv1
+
+	lvx	v2,off32,r4
+	lvx	v16,off32,r3
+	VPERM(v2,v2,v16,byteswap)
+	VPMSUMW(v2,v2,v16)
+	bdz	.Lv2
+
+	lvx	v3,off48,r4
+	lvx	v17,off48,r3
+	VPERM(v3,v3,v17,byteswap)
+	VPMSUMW(v3,v3,v17)
+	bdz	.Lv3
+
+	lvx	v4,off64,r4
+	lvx	v16,off64,r3
+	VPERM(v4,v4,v16,byteswap)
+	VPMSUMW(v4,v4,v16)
+	bdz	.Lv4
+
+	lvx	v5,off80,r4
+	lvx	v17,off80,r3
+	VPERM(v5,v5,v17,byteswap)
+	VPMSUMW(v5,v5,v17)
+	bdz	.Lv5
+
+	lvx	v6,off96,r4
+	lvx	v16,off96,r3
+	VPERM(v6,v6,v16,byteswap)
+	VPMSUMW(v6,v6,v16)
+	bdz	.Lv6
+
+	lvx	v7,off112,r4
+	lvx	v17,off112,r3
+	VPERM(v7,v7,v17,byteswap)
+	VPMSUMW(v7,v7,v17)
+	bdz	.Lv7
+
+	addi	r3,r3,128
+	addi	r4,r4,128
+
+	lvx	v8,0,r4
+	lvx	v16,0,r3
+	VPERM(v8,v8,v16,byteswap)
+	VPMSUMW(v8,v8,v16)
+	bdz	.Lv8
+
+	lvx	v9,off16,r4
+	lvx	v17,off16,r3
+	VPERM(v9,v9,v17,byteswap)
+	VPMSUMW(v9,v9,v17)
+	bdz	.Lv9
+
+	lvx	v10,off32,r4
+	lvx	v16,off32,r3
+	VPERM(v10,v10,v16,byteswap)
+	VPMSUMW(v10,v10,v16)
+	bdz	.Lv10
+
+	lvx	v11,off48,r4
+	lvx	v17,off48,r3
+	VPERM(v11,v11,v17,byteswap)
+	VPMSUMW(v11,v11,v17)
+	bdz	.Lv11
+
+	lvx	v12,off64,r4
+	lvx	v16,off64,r3
+	VPERM(v12,v12,v16,byteswap)
+	VPMSUMW(v12,v12,v16)
+	bdz	.Lv12
+
+	lvx	v13,off80,r4
+	lvx	v17,off80,r3
+	VPERM(v13,v13,v17,byteswap)
+	VPMSUMW(v13,v13,v17)
+	bdz	.Lv13
+
+	lvx	v14,off96,r4
+	lvx	v16,off96,r3
+	VPERM(v14,v14,v16,byteswap)
+	VPMSUMW(v14,v14,v16)
+	bdz	.Lv14
+
+	lvx	v15,off112,r4
+	lvx	v17,off112,r3
+	VPERM(v15,v15,v17,byteswap)
+	VPMSUMW(v15,v15,v17)
+
+.Lv15:	vxor	v19,v19,v15
+.Lv14:	vxor	v20,v20,v14
+.Lv13:	vxor	v19,v19,v13
+.Lv12:	vxor	v20,v20,v12
+.Lv11:	vxor	v19,v19,v11
+.Lv10:	vxor	v20,v20,v10
+.Lv9:	vxor	v19,v19,v9
+.Lv8:	vxor	v20,v20,v8
+.Lv7:	vxor	v19,v19,v7
+.Lv6:	vxor	v20,v20,v6
+.Lv5:	vxor	v19,v19,v5
+.Lv4:	vxor	v20,v20,v4
+.Lv3:	vxor	v19,v19,v3
+.Lv2:	vxor	v20,v20,v2
+.Lv1:	vxor	v19,v19,v1
+.Lv0:	vxor	v20,v20,v0
+
+	vxor	v0,v19,v20
+
+	b	.Lbarrett_reduction
+
+.Lzero:
+	mr	r3,r10		/* len == 0: return the caller's crc unchanged */
+	b	.Lout
+
+FUNC_END(__crc32c_vpmsum)
diff --git a/arch/powerpc/crypto/crc32c-vpmsum_glue.c b/arch/powerpc/crypto/crc32c-vpmsum_glue.c
new file mode 100644
index 000000000000..bfe3d37a24ef
--- /dev/null
+++ b/arch/powerpc/crypto/crc32c-vpmsum_glue.c
@@ -0,0 +1,167 @@
+#include <linux/crc32.h>
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <asm/switch_to.h>
+
+#define CHKSUM_BLOCK_SIZE	1
+#define CHKSUM_DIGEST_SIZE	4
+
+#define VMX_ALIGN		16
+#define VMX_ALIGN_MASK		(VMX_ALIGN-1)
+
+#define VECTOR_BREAKPOINT	512
+
+u32 __crc32c_vpmsum(u32 crc, unsigned char const *p, size_t len);
+
+/* CRC32c of p[0..len): scalar head and tail, VMX for the aligned middle. */
+static u32 crc32c_vpmsum(u32 crc, unsigned char const *p, size_t len)
+{
+	unsigned int prealign, tail;
+
+	if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || in_interrupt())
+		return __crc32c_le(crc, p, len);
+
+	if ((unsigned long)p & VMX_ALIGN_MASK) {
+		prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
+		crc = __crc32c_le(crc, p, prealign);
+		len -= prealign;
+		p += prealign;
+	}
+	if (len & ~VMX_ALIGN_MASK) {
+		preempt_disable();	/* keep VMX state on this CPU */
+		pagefault_disable();
+		enable_kernel_altivec();
+		crc = __crc32c_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
+		disable_kernel_altivec();
+		pagefault_enable();
+		preempt_enable();
+	}
+	tail = len & VMX_ALIGN_MASK;
+	if (tail) {
+		p += len & ~VMX_ALIGN_MASK;
+		crc = __crc32c_le(crc, p, tail);
+	}
+	return crc;
+}
+
+/* tfm constructor: every new transform starts out with a zero seed. */
+static int crc32c_vpmsum_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *seed = crypto_tfm_ctx(tfm);
+
+	*seed = 0;
+	return 0;
+}
+
+/*
+ * Install a caller-chosen 32-bit little-endian seed as the starting CRC.
+ * No inversion is applied here: a user whose algorithm starts from ~0 must
+ * XOR with ~0 before passing the seed in. Any other key length is rejected.
+ */
+static int crc32c_vpmsum_setkey(struct crypto_shash *hash, const u8 *key,
+			       unsigned int keylen)
+{
+	u32 *seed = crypto_shash_ctx(hash);
+
+	if (keylen == sizeof(u32)) {
+		*seed = le32_to_cpup((__le32 *)key);
+		return 0;
+	}
+	crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+	return -EINVAL;
+}
+
+/* Start a request: copy the tfm seed into the per-request running crc. */
+static int crc32c_vpmsum_init(struct shash_desc *desc)
+{
+	const u32 *seed = crypto_shash_ctx(desc->tfm);
+	u32 *running = shash_desc_ctx(desc);
+
+	*running = *seed;
+	return 0;
+}
+
+/* Fold len more bytes of data into the request's running crc. */
+static int crc32c_vpmsum_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int len)
+{
+	u32 *running = shash_desc_ctx(desc);
+
+	*running = crc32c_vpmsum(*running, data, len);
+	return 0;
+}
+
+/* Fold in the last data and store the inverted crc to out, little-endian. */
+static int __crc32c_vpmsum_finup(u32 *crcp, const u8 *data, unsigned int len,
+				u8 *out)
+{
+	*(__le32 *)out = ~cpu_to_le32(crc32c_vpmsum(*crcp, data, len));
+	return 0;
+}
+
+static int crc32c_vpmsum_finup(struct shash_desc *desc, const u8 *data,
+			      unsigned int len, u8 *out)
+{
+	return __crc32c_vpmsum_finup(shash_desc_ctx(desc), data, len, out);
+}
+
+/* Emit the inverted running crc (little-endian) without adding data. */
+static int crc32c_vpmsum_final(struct shash_desc *desc, u8 *out)
+{
+	*(__le32 *)out = ~cpu_to_le32p(shash_desc_ctx(desc));
+	return 0;
+}
+
+/* One-shot digest: start from the tfm seed, not the per-request state. */
+static int crc32c_vpmsum_digest(struct shash_desc *desc, const u8 *data,
+			       unsigned int len, u8 *out)
+{
+	u32 *seed = crypto_shash_ctx(desc->tfm);
+
+	return __crc32c_vpmsum_finup(seed, data, len, out);
+}
+
+static struct shash_alg alg = {	/* the shash registered by mod_init */
+	.setkey		= crc32c_vpmsum_setkey,
+	.init		= crc32c_vpmsum_init,
+	.update		= crc32c_vpmsum_update,
+	.final		= crc32c_vpmsum_final,
+	.finup		= crc32c_vpmsum_finup,
+	.digest		= crc32c_vpmsum_digest,
+	.descsize	= sizeof(u32),		/* per-request running crc */
+	.digestsize	= CHKSUM_DIGEST_SIZE,
+	.base		= {
+		.cra_name		= "crc32c",
+		.cra_driver_name	= "crc32c-vpmsum",
+		.cra_priority		= 200,
+		.cra_blocksize		= CHKSUM_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(u32),	/* per-tfm seed */
+		.cra_module		= THIS_MODULE,
+		.cra_init		= crc32c_vpmsum_cra_init, /* zeroes the seed */
+	}
+};
+
+/* Register the shash only on CPUs that report CPU_FTR_ARCH_207S. */
+static int __init crc32c_vpmsum_mod_init(void)
+{
+	if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		return crypto_register_shash(&alg);
+	return -ENODEV;
+}
+
+static void __exit crc32c_vpmsum_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(crc32c_vpmsum_mod_init);
+module_exit(crc32c_vpmsum_mod_fini);
+
+MODULE_AUTHOR("Anton Blanchard <anton@samba.org>");
+MODULE_DESCRIPTION("CRC32C using vector polynomial multiply-sum instructions");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("crc32c");
+MODULE_ALIAS_CRYPTO("crc32c-vpmsum");
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index 1d035c1cc889..49cd8760aa7c 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -174,6 +174,8 @@
 #define PPC_INST_MFSPR_DSCR_USER_MASK	0xfc1fffff
 #define PPC_INST_MTSPR_DSCR_USER	0x7c0303a6
 #define PPC_INST_MTSPR_DSCR_USER_MASK	0xfc1fffff
+#define PPC_INST_MFVSRD			0x7c000066
+#define PPC_INST_MTVSRD			0x7c000166
 #define PPC_INST_SLBFEE			0x7c0007a7
 
 #define PPC_INST_STRING			0x7c00042a
@@ -188,6 +190,8 @@
 #define PPC_INST_WAIT			0x7c00007c
 #define PPC_INST_TLBIVAX		0x7c000624
 #define PPC_INST_TLBSRX_DOT		0x7c0006a5
+#define PPC_INST_VPMSUMW		0x10000488
+#define PPC_INST_VPMSUMD		0x100004c8
 #define PPC_INST_XXLOR			0xf0000510
 #define PPC_INST_XXSWAPD		0xf0000250
 #define PPC_INST_XVCPSGNDP		0xf0000780
@@ -359,6 +363,14 @@
 					       VSX_XX1((s), a, b))
 #define LXVD2X(s, a, b)		stringify_in_c(.long PPC_INST_LXVD2X | \
 					       VSX_XX1((s), a, b))
+#define MFVRD(a, t)		stringify_in_c(.long PPC_INST_MFVSRD | \
+					       VSX_XX1((t)+32, a, R0))
+#define MTVRD(t, a)		stringify_in_c(.long PPC_INST_MTVSRD | \
+					       VSX_XX1((t)+32, a, R0))
+#define VPMSUMW(t, a, b)	stringify_in_c(.long PPC_INST_VPMSUMW | \
+					       VSX_XX3((t), a, b))
+#define VPMSUMD(t, a, b)	stringify_in_c(.long PPC_INST_VPMSUMD | \
+					       VSX_XX3((t), a, b))
 #define XXLOR(t, a, b)		stringify_in_c(.long PPC_INST_XXLOR | \
 					       VSX_XX3((t), a, b))
 #define XXSWAPD(t, a)		stringify_in_c(.long PPC_INST_XXSWAPD | \
diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index 2b31632376a5..051af612a7e1 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -286,6 +286,9 @@ n:
 
 #endif
 
+#define FUNC_START(name)	_GLOBAL(name)
+#define FUNC_END(name)
+
 /* 
  * LOAD_REG_IMMEDIATE(rn, expr)
  *   Loads the value of the constant expression 'expr' into register 'rn'
diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c
index 12e48d56f771..3963f0b68d52 100644
--- a/arch/powerpc/kernel/iomap.c
+++ b/arch/powerpc/kernel/iomap.c
@@ -38,6 +38,18 @@ EXPORT_SYMBOL(ioread16);
 EXPORT_SYMBOL(ioread16be);
 EXPORT_SYMBOL(ioread32);
 EXPORT_SYMBOL(ioread32be);
+#ifdef __powerpc64__
+u64 ioread64(void __iomem *addr)
+{
+	return readq(addr);
+}
+u64 ioread64be(void __iomem *addr)
+{
+	return readq_be(addr);
+}
+EXPORT_SYMBOL(ioread64);
+EXPORT_SYMBOL(ioread64be);
+#endif /* __powerpc64__ */
 
 void iowrite8(u8 val, void __iomem *addr)
 {
@@ -64,6 +76,18 @@ EXPORT_SYMBOL(iowrite16);
 EXPORT_SYMBOL(iowrite16be);
 EXPORT_SYMBOL(iowrite32);
 EXPORT_SYMBOL(iowrite32be);
+#ifdef __powerpc64__
+void iowrite64(u64 val, void __iomem *addr)
+{
+	writeq(val, addr);
+}
+void iowrite64be(u64 val, void __iomem *addr)
+{
+	writeq_be(val, addr);
+}
+EXPORT_SYMBOL(iowrite64);
+EXPORT_SYMBOL(iowrite64be);
+#endif /* __powerpc64__ */
 
 /*
  * These are the "repeat read/write" functions. Note the
diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c
index 7554a8bb2adc..2ea18b050309 100644
--- a/arch/s390/crypto/aes_s390.c
+++ b/arch/s390/crypto/aes_s390.c
@@ -22,6 +22,7 @@
 
 #include <crypto/aes.h>
 #include <crypto/algapi.h>
+#include <crypto/internal/skcipher.h>
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/cpufeature.h>
@@ -44,7 +45,7 @@ struct s390_aes_ctx {
 	long dec;
 	int key_len;
 	union {
-		struct crypto_blkcipher *blk;
+		struct crypto_skcipher *blk;
 		struct crypto_cipher *cip;
 	} fallback;
 };
@@ -63,7 +64,7 @@ struct s390_xts_ctx {
 	long enc;
 	long dec;
 	int key_len;
-	struct crypto_blkcipher *fallback;
+	struct crypto_skcipher *fallback;
 };
 
 /*
@@ -237,16 +238,16 @@ static int setkey_fallback_blk(struct crypto_tfm *tfm, const u8 *key,
 	struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);
 	unsigned int ret;
 
-	sctx->fallback.blk->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK;
-	sctx->fallback.blk->base.crt_flags |= (tfm->crt_flags &
-			CRYPTO_TFM_REQ_MASK);
+	crypto_skcipher_clear_flags(sctx->fallback.blk, CRYPTO_TFM_REQ_MASK);
+	crypto_skcipher_set_flags(sctx->fallback.blk, tfm->crt_flags &
+						      CRYPTO_TFM_REQ_MASK);
+
+	ret = crypto_skcipher_setkey(sctx->fallback.blk, key, len);
+
+	tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
+	tfm->crt_flags |= crypto_skcipher_get_flags(sctx->fallback.blk) &
+			  CRYPTO_TFM_RES_MASK;
 
-	ret = crypto_blkcipher_setkey(sctx->fallback.blk, key, len);
-	if (ret) {
-		tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
-		tfm->crt_flags |= (sctx->fallback.blk->base.crt_flags &
-				CRYPTO_TFM_RES_MASK);
-	}
 	return ret;
 }
 
@@ -255,15 +256,17 @@ static int fallback_blk_dec(struct blkcipher_desc *desc,
 		unsigned int nbytes)
 {
 	unsigned int ret;
-	struct crypto_blkcipher *tfm;
-	struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm);
+	struct crypto_blkcipher *tfm = desc->tfm;
+	struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(tfm);
+	SKCIPHER_REQUEST_ON_STACK(req, sctx->fallback.blk);
 
-	tfm = desc->tfm;
-	desc->tfm = sctx->fallback.blk;
+	skcipher_request_set_tfm(req, sctx->fallback.blk);
+	skcipher_request_set_callback(req, desc->flags, NULL, NULL);
+	skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
 
-	ret = crypto_blkcipher_decrypt_iv(desc, dst, src, nbytes);
+	ret = crypto_skcipher_decrypt(req);
 
-	desc->tfm = tfm;
+	skcipher_request_zero(req);
 	return ret;
 }
 
@@ -272,15 +275,18 @@ static int fallback_blk_enc(struct blkcipher_desc *desc,
 		unsigned int nbytes)
 {
 	unsigned int ret;
-	struct crypto_blkcipher *tfm;
-	struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm);
+	struct crypto_blkcipher *tfm = desc->tfm;
+	struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(tfm);
+	SKCIPHER_REQUEST_ON_STACK(req, sctx->fallback.blk);
 
-	tfm = desc->tfm;
-	desc->tfm = sctx->fallback.blk;
+	skcipher_request_set_tfm(req, sctx->fallback.blk);
+	skcipher_request_set_callback(req, desc->flags, NULL, NULL);
+	skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
 
-	ret = crypto_blkcipher_encrypt_iv(desc, dst, src, nbytes);
+	ret = crypto_skcipher_encrypt(req);
 
-	desc->tfm = tfm;
+	/* wipe the on-stack request, exactly as the decrypt fallback does */
+	skcipher_request_zero(req);
 	return ret;
 }
 
 
@@ -370,8 +373,9 @@ static int fallback_init_blk(struct crypto_tfm *tfm)
 	const char *name = tfm->__crt_alg->cra_name;
 	struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);
 
-	sctx->fallback.blk = crypto_alloc_blkcipher(name, 0,
-			CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK);
+	sctx->fallback.blk = crypto_alloc_skcipher(name, 0,
+						   CRYPTO_ALG_ASYNC |
+						   CRYPTO_ALG_NEED_FALLBACK);
 
 	if (IS_ERR(sctx->fallback.blk)) {
 		pr_err("Allocating AES fallback algorithm %s failed\n",
@@ -386,8 +390,7 @@ static void fallback_exit_blk(struct crypto_tfm *tfm)
 {
 	struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);
 
-	crypto_free_blkcipher(sctx->fallback.blk);
-	sctx->fallback.blk = NULL;
+	crypto_free_skcipher(sctx->fallback.blk);
 }
 
 static struct crypto_alg ecb_aes_alg = {
@@ -536,16 +539,16 @@ static int xts_fallback_setkey(struct crypto_tfm *tfm, const u8 *key,
 	struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm);
 	unsigned int ret;
 
-	xts_ctx->fallback->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK;
-	xts_ctx->fallback->base.crt_flags |= (tfm->crt_flags &
-			CRYPTO_TFM_REQ_MASK);
+	crypto_skcipher_clear_flags(xts_ctx->fallback, CRYPTO_TFM_REQ_MASK);
+	crypto_skcipher_set_flags(xts_ctx->fallback, tfm->crt_flags &
+						     CRYPTO_TFM_REQ_MASK);
+
+	ret = crypto_skcipher_setkey(xts_ctx->fallback, key, len);
+
+	tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
+	tfm->crt_flags |= crypto_skcipher_get_flags(xts_ctx->fallback) &
+			  CRYPTO_TFM_RES_MASK;
 
-	ret = crypto_blkcipher_setkey(xts_ctx->fallback, key, len);
-	if (ret) {
-		tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
-		tfm->crt_flags |= (xts_ctx->fallback->base.crt_flags &
-				CRYPTO_TFM_RES_MASK);
-	}
 	return ret;
 }
 
@@ -553,16 +556,18 @@ static int xts_fallback_decrypt(struct blkcipher_desc *desc,
 		struct scatterlist *dst, struct scatterlist *src,
 		unsigned int nbytes)
 {
-	struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm);
-	struct crypto_blkcipher *tfm;
+	struct crypto_blkcipher *tfm = desc->tfm;
+	struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(tfm);
+	SKCIPHER_REQUEST_ON_STACK(req, xts_ctx->fallback);
 	unsigned int ret;
 
-	tfm = desc->tfm;
-	desc->tfm = xts_ctx->fallback;
+	skcipher_request_set_tfm(req, xts_ctx->fallback);
+	skcipher_request_set_callback(req, desc->flags, NULL, NULL);
+	skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
 
-	ret = crypto_blkcipher_decrypt_iv(desc, dst, src, nbytes);
+	ret = crypto_skcipher_decrypt(req);
 
-	desc->tfm = tfm;
+	skcipher_request_zero(req);
 	return ret;
 }
 
@@ -570,16 +575,18 @@ static int xts_fallback_encrypt(struct blkcipher_desc *desc,
 		struct scatterlist *dst, struct scatterlist *src,
 		unsigned int nbytes)
 {
-	struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm);
-	struct crypto_blkcipher *tfm;
+	struct crypto_blkcipher *tfm = desc->tfm;
+	struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(tfm);
+	SKCIPHER_REQUEST_ON_STACK(req, xts_ctx->fallback);
 	unsigned int ret;
 
-	tfm = desc->tfm;
-	desc->tfm = xts_ctx->fallback;
+	skcipher_request_set_tfm(req, xts_ctx->fallback);
+	skcipher_request_set_callback(req, desc->flags, NULL, NULL);
+	skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
 
-	ret = crypto_blkcipher_encrypt_iv(desc, dst, src, nbytes);
+	ret = crypto_skcipher_encrypt(req);
 
-	desc->tfm = tfm;
+	skcipher_request_zero(req);
 	return ret;
 }
 
@@ -700,8 +707,9 @@ static int xts_fallback_init(struct crypto_tfm *tfm)
 	const char *name = tfm->__crt_alg->cra_name;
 	struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm);
 
-	xts_ctx->fallback = crypto_alloc_blkcipher(name, 0,
-			CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK);
+	xts_ctx->fallback = crypto_alloc_skcipher(name, 0,
+						  CRYPTO_ALG_ASYNC |
+						  CRYPTO_ALG_NEED_FALLBACK);
 
 	if (IS_ERR(xts_ctx->fallback)) {
 		pr_err("Allocating XTS fallback algorithm %s failed\n",
@@ -715,8 +723,7 @@ static void xts_fallback_exit(struct crypto_tfm *tfm)
 {
 	struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm);
 
-	crypto_free_blkcipher(xts_ctx->fallback);
-	xts_ctx->fallback = NULL;
+	crypto_free_skcipher(xts_ctx->fallback);
 }
 
 static struct crypto_alg xts_aes_alg = {
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index b9b912a44d61..34b3fa2889d1 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -49,7 +49,9 @@ endif
 ifeq ($(avx2_supported),yes)
 	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
 	obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
-	obj-$(CONFIG_CRYPTO_SHA1_MB) += sha-mb/
+	obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb/
+	obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb/
+	obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb/
 endif
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 5b7fa1471007..0ab5ee1c26af 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -59,17 +59,6 @@ struct aesni_rfc4106_gcm_ctx {
 	u8 nonce[4];
 };
 
-struct aesni_gcm_set_hash_subkey_result {
-	int err;
-	struct completion completion;
-};
-
-struct aesni_hash_subkey_req_data {
-	u8 iv[16];
-	struct aesni_gcm_set_hash_subkey_result result;
-	struct scatterlist sg;
-};
-
 struct aesni_lrw_ctx {
 	struct lrw_table_ctx lrw_table;
 	u8 raw_aes_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1];
@@ -809,71 +798,28 @@ static void rfc4106_exit(struct crypto_aead *aead)
 	cryptd_free_aead(*ctx);
 }
 
-static void
-rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err)
-{
-	struct aesni_gcm_set_hash_subkey_result *result = req->data;
-
-	if (err == -EINPROGRESS)
-		return;
-	result->err = err;
-	complete(&result->completion);
-}
-
 static int
 rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
 {
-	struct crypto_ablkcipher *ctr_tfm;
-	struct ablkcipher_request *req;
-	int ret = -EINVAL;
-	struct aesni_hash_subkey_req_data *req_data;
+	struct crypto_cipher *tfm;
+	int ret;
 
-	ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0);
-	if (IS_ERR(ctr_tfm))
-		return PTR_ERR(ctr_tfm);
+	tfm = crypto_alloc_cipher("aes", 0, 0);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
 
-	ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
+	ret = crypto_cipher_setkey(tfm, key, key_len);
 	if (ret)
-		goto out_free_ablkcipher;
-
-	ret = -ENOMEM;
-	req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
-	if (!req)
-		goto out_free_ablkcipher;
-
-	req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
-	if (!req_data)
-		goto out_free_request;
-
-	memset(req_data->iv, 0, sizeof(req_data->iv));
+		goto out_free_cipher;
 
 	/* Clear the data in the hash sub key container to zero.*/
 	/* We want to cipher all zeros to create the hash sub key. */
 	memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
 
-	init_completion(&req_data->result.completion);
-	sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE);
-	ablkcipher_request_set_tfm(req, ctr_tfm);
-	ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
-					CRYPTO_TFM_REQ_MAY_BACKLOG,
-					rfc4106_set_hash_subkey_done,
-					&req_data->result);
-
-	ablkcipher_request_set_crypt(req, &req_data->sg,
-		&req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv);
-
-	ret = crypto_ablkcipher_encrypt(req);
-	if (ret == -EINPROGRESS || ret == -EBUSY) {
-		ret = wait_for_completion_interruptible
-			(&req_data->result.completion);
-		if (!ret)
-			ret = req_data->result.err;
-	}
-	kfree(req_data);
-out_free_request:
-	ablkcipher_request_free(req);
-out_free_ablkcipher:
-	crypto_free_ablkcipher(ctr_tfm);
+	crypto_cipher_encrypt_one(tfm, hash_subkey, hash_subkey);
+
+out_free_cipher:
+	crypto_free_cipher(tfm);
 	return ret;
 }
 
@@ -1098,9 +1044,12 @@ static int rfc4106_encrypt(struct aead_request *req)
 	struct cryptd_aead **ctx = crypto_aead_ctx(tfm);
 	struct cryptd_aead *cryptd_tfm = *ctx;
 
-	aead_request_set_tfm(req, irq_fpu_usable() ?
-				  cryptd_aead_child(cryptd_tfm) :
-				  &cryptd_tfm->base);
+	tfm = &cryptd_tfm->base;
+	if (irq_fpu_usable() && (!in_atomic() ||
+				 !cryptd_aead_queued(cryptd_tfm)))
+		tfm = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, tfm);
 
 	return crypto_aead_encrypt(req);
 }
@@ -1111,9 +1060,12 @@ static int rfc4106_decrypt(struct aead_request *req)
 	struct cryptd_aead **ctx = crypto_aead_ctx(tfm);
 	struct cryptd_aead *cryptd_tfm = *ctx;
 
-	aead_request_set_tfm(req, irq_fpu_usable() ?
-				  cryptd_aead_child(cryptd_tfm) :
-				  &cryptd_tfm->base);
+	tfm = &cryptd_tfm->base;
+	if (irq_fpu_usable() && (!in_atomic() ||
+				 !cryptd_aead_queued(cryptd_tfm)))
+		tfm = cryptd_aead_child(cryptd_tfm);
+
+	aead_request_set_tfm(req, tfm);
 
 	return crypto_aead_decrypt(req);
 }
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 2d5c2e0bd939..f910d1d449f0 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -70,7 +70,7 @@ static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst,
 	struct blkcipher_walk walk;
 	int err;
 
-	if (!may_use_simd())
+	if (nbytes <= CHACHA20_BLOCK_SIZE || !may_use_simd())
 		return crypto_chacha20_crypt(desc, dst, src, nbytes);
 
 	state = (u32 *)roundup((uintptr_t)state_buf, CHACHA20_STATE_ALIGN);
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index a69321a77783..0420bab19efb 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -168,30 +168,23 @@ static int ghash_async_init(struct ahash_request *req)
 	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
 	struct ahash_request *cryptd_req = ahash_request_ctx(req);
 	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+	struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+	struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
 
-	if (!irq_fpu_usable()) {
-		memcpy(cryptd_req, req, sizeof(*req));
-		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
-		return crypto_ahash_init(cryptd_req);
-	} else {
-		struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
-		struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
-
-		desc->tfm = child;
-		desc->flags = req->base.flags;
-		return crypto_shash_init(desc);
-	}
+	desc->tfm = child;
+	desc->flags = req->base.flags;
+	return crypto_shash_init(desc);
 }
 
 static int ghash_async_update(struct ahash_request *req)
 {
 	struct ahash_request *cryptd_req = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
 
-	if (!irq_fpu_usable()) {
-		struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-		struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
-		struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
-
+	if (!irq_fpu_usable() ||
+	    (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
 		memcpy(cryptd_req, req, sizeof(*req));
 		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
 		return crypto_ahash_update(cryptd_req);
@@ -204,12 +197,12 @@ static int ghash_async_update(struct ahash_request *req)
 static int ghash_async_final(struct ahash_request *req)
 {
 	struct ahash_request *cryptd_req = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
 
-	if (!irq_fpu_usable()) {
-		struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-		struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
-		struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
-
+	if (!irq_fpu_usable() ||
+	    (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
 		memcpy(cryptd_req, req, sizeof(*req));
 		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
 		return crypto_ahash_final(cryptd_req);
@@ -249,7 +242,8 @@ static int ghash_async_digest(struct ahash_request *req)
 	struct ahash_request *cryptd_req = ahash_request_ctx(req);
 	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
 
-	if (!irq_fpu_usable()) {
+	if (!irq_fpu_usable() ||
+	    (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
 		memcpy(cryptd_req, req, sizeof(*req));
 		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
 		return crypto_ahash_digest(cryptd_req);
diff --git a/arch/x86/crypto/sha-mb/Makefile b/arch/x86/crypto/sha1-mb/Makefile
index 2f8756375df5..2f8756375df5 100644
--- a/arch/x86/crypto/sha-mb/Makefile
+++ b/arch/x86/crypto/sha1-mb/Makefile
diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha1-mb/sha1_mb.c
index 9c5af331a956..9e5b67127a09 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha1-mb/sha1_mb.c
@@ -67,7 +67,7 @@
 #include <asm/byteorder.h>
 #include <linux/hardirq.h>
 #include <asm/fpu/api.h>
-#include "sha_mb_ctx.h"
+#include "sha1_mb_ctx.h"
 
 #define FLUSH_INTERVAL 1000 /* in usec */
 
@@ -77,30 +77,34 @@ struct sha1_mb_ctx {
 	struct mcryptd_ahash *mcryptd_tfm;
 };
 
-static inline struct mcryptd_hash_request_ctx *cast_hash_to_mcryptd_ctx(struct sha1_hash_ctx *hash_ctx)
+static inline struct mcryptd_hash_request_ctx
+		*cast_hash_to_mcryptd_ctx(struct sha1_hash_ctx *hash_ctx)
 {
-	struct shash_desc *desc;
+	struct ahash_request *areq;
 
-	desc = container_of((void *) hash_ctx, struct shash_desc, __ctx);
-	return container_of(desc, struct mcryptd_hash_request_ctx, desc);
+	areq = container_of((void *) hash_ctx, struct ahash_request, __ctx);
+	return container_of(areq, struct mcryptd_hash_request_ctx, areq);
 }
 
-static inline struct ahash_request *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx)
+static inline struct ahash_request
+		*cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx)
 {
 	return container_of((void *) ctx, struct ahash_request, __ctx);
 }
 
 static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx,
-				struct shash_desc *desc)
+				struct ahash_request *areq)
 {
 	rctx->flag = HASH_UPDATE;
 }
 
 static asmlinkage void (*sha1_job_mgr_init)(struct sha1_mb_mgr *state);
-static asmlinkage struct job_sha1* (*sha1_job_mgr_submit)(struct sha1_mb_mgr *state,
-							  struct job_sha1 *job);
-static asmlinkage struct job_sha1* (*sha1_job_mgr_flush)(struct sha1_mb_mgr *state);
-static asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job)(struct sha1_mb_mgr *state);
+static asmlinkage struct job_sha1* (*sha1_job_mgr_submit)
+			(struct sha1_mb_mgr *state, struct job_sha1 *job);
+static asmlinkage struct job_sha1* (*sha1_job_mgr_flush)
+						(struct sha1_mb_mgr *state);
+static asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job)
+						(struct sha1_mb_mgr *state);
 
 static inline void sha1_init_digest(uint32_t *digest)
 {
@@ -131,7 +135,8 @@ static inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2],
 	return i >> SHA1_LOG2_BLOCK_SIZE;
 }
 
-static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, struct sha1_hash_ctx *ctx)
+static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr,
+						struct sha1_hash_ctx *ctx)
 {
 	while (ctx) {
 		if (ctx->status & HASH_CTX_STS_COMPLETE) {
@@ -177,8 +182,8 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, str
 
 				ctx->job.buffer = (uint8_t *) buffer;
 				ctx->job.len = len;
-				ctx = (struct sha1_hash_ctx *) sha1_job_mgr_submit(&mgr->mgr,
-										  &ctx->job);
+				ctx = (struct sha1_hash_ctx *)sha1_job_mgr_submit(&mgr->mgr,
+										&ctx->job);
 				continue;
 			}
 		}
@@ -191,13 +196,15 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, str
 		if (ctx->status & HASH_CTX_STS_LAST) {
 
 			uint8_t *buf = ctx->partial_block_buffer;
-			uint32_t n_extra_blocks = sha1_pad(buf, ctx->total_length);
+			uint32_t n_extra_blocks =
+					sha1_pad(buf, ctx->total_length);
 
 			ctx->status = (HASH_CTX_STS_PROCESSING |
 				       HASH_CTX_STS_COMPLETE);
 			ctx->job.buffer = buf;
 			ctx->job.len = (uint32_t) n_extra_blocks;
-			ctx = (struct sha1_hash_ctx *) sha1_job_mgr_submit(&mgr->mgr, &ctx->job);
+			ctx = (struct sha1_hash_ctx *)
+				sha1_job_mgr_submit(&mgr->mgr, &ctx->job);
 			continue;
 		}
 
@@ -208,14 +215,17 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, str
 	return NULL;
 }
 
-static struct sha1_hash_ctx *sha1_ctx_mgr_get_comp_ctx(struct sha1_ctx_mgr *mgr)
+static struct sha1_hash_ctx
+			*sha1_ctx_mgr_get_comp_ctx(struct sha1_ctx_mgr *mgr)
 {
 	/*
 	 * If get_comp_job returns NULL, there are no jobs complete.
-	 * If get_comp_job returns a job, verify that it is safe to return to the user.
+	 * If get_comp_job returns a job, verify that it is safe to return to
+	 * the user.
 	 * If it is not ready, resubmit the job to finish processing.
 	 * If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
-	 * Otherwise, all jobs currently being managed by the hash_ctx_mgr still need processing.
+	 * Otherwise, all jobs currently being managed by the hash_ctx_mgr
+	 * still need processing.
 	 */
 	struct sha1_hash_ctx *ctx;
 
@@ -235,7 +245,10 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr,
 					  int flags)
 {
 	if (flags & (~HASH_ENTIRE)) {
-		/* User should not pass anything other than FIRST, UPDATE, or LAST */
+		/*
+		 * User should not pass anything other than FIRST, UPDATE, or
+		 * LAST
+		 */
 		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
 		return ctx;
 	}
@@ -264,14 +277,20 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr,
 		ctx->partial_block_buffer_length = 0;
 	}
 
-	/* If we made it here, there were no errors during this call to submit */
+	/*
+	 * If we made it here, there were no errors during this call to
+	 * submit
+	 */
 	ctx->error = HASH_CTX_ERROR_NONE;
 
 	/* Store buffer ptr info from user */
 	ctx->incoming_buffer = buffer;
 	ctx->incoming_buffer_length = len;
 
-	/* Store the user's request flags and mark this ctx as currently being processed. */
+	/*
+	 * Store the user's request flags and mark this ctx as currently
+	 * being processed.
+	 */
 	ctx->status = (flags & HASH_LAST) ?
 			(HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
 			HASH_CTX_STS_PROCESSING;
@@ -285,9 +304,13 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr,
 	 * Or if the user's buffer contains less than a whole block,
 	 * append as much as possible to the extra block.
 	 */
-	if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
-		/* Compute how many bytes to copy from user buffer into extra block */
-		uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+	if (ctx->partial_block_buffer_length || len < SHA1_BLOCK_SIZE) {
+		/*
+		 * Compute how many bytes to copy from user buffer into
+		 * extra block
+		 */
+		uint32_t copy_len = SHA1_BLOCK_SIZE -
+					ctx->partial_block_buffer_length;
 		if (len < copy_len)
 			copy_len = len;
 
@@ -297,20 +320,28 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr,
 				buffer, copy_len);
 
 			ctx->partial_block_buffer_length += copy_len;
-			ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+			ctx->incoming_buffer = (const void *)
+					((const char *)buffer + copy_len);
 			ctx->incoming_buffer_length = len - copy_len;
 		}
 
-		/* The extra block should never contain more than 1 block here */
+		/*
+		 * The extra block should never contain more than 1 block
+		 * here
+		 */
 		assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
 
-		/* If the extra block buffer contains exactly 1 block, it can be hashed. */
+		/*
+		 * If the extra block buffer contains exactly 1 block, it can
+		 * be hashed.
+		 */
 		if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
 			ctx->partial_block_buffer_length = 0;
 
 			ctx->job.buffer = ctx->partial_block_buffer;
 			ctx->job.len = 1;
-			ctx = (struct sha1_hash_ctx *) sha1_job_mgr_submit(&mgr->mgr, &ctx->job);
+			ctx = (struct sha1_hash_ctx *)
+				sha1_job_mgr_submit(&mgr->mgr, &ctx->job);
 		}
 	}
 
@@ -329,23 +360,24 @@ static struct sha1_hash_ctx *sha1_ctx_mgr_flush(struct sha1_ctx_mgr *mgr)
 			return NULL;
 
 		/*
-		 * If flush returned a job, resubmit the job to finish processing.
+		 * If flush returned a job, resubmit the job to finish
+		 * processing.
 		 */
 		ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
 
 		/*
-		 * If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
-		 * Otherwise, all jobs currently being managed by the sha1_ctx_mgr
-		 * still need processing. Loop.
+		 * If sha1_ctx_mgr_resubmit returned a job, it is ready to be
+		 * returned. Otherwise, all jobs currently being managed by the
+		 * sha1_ctx_mgr still need processing. Loop.
 		 */
 		if (ctx)
 			return ctx;
 	}
 }
 
-static int sha1_mb_init(struct shash_desc *desc)
+static int sha1_mb_init(struct ahash_request *areq)
 {
-	struct sha1_hash_ctx *sctx = shash_desc_ctx(desc);
+	struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
 
 	hash_ctx_init(sctx);
 	sctx->job.result_digest[0] = SHA1_H0;
@@ -363,7 +395,7 @@ static int sha1_mb_init(struct shash_desc *desc)
 static int sha1_mb_set_results(struct mcryptd_hash_request_ctx *rctx)
 {
 	int	i;
-	struct	sha1_hash_ctx *sctx = shash_desc_ctx(&rctx->desc);
+	struct	sha1_hash_ctx *sctx = ahash_request_ctx(&rctx->areq);
 	__be32	*dst = (__be32 *) rctx->out;
 
 	for (i = 0; i < 5; ++i)
@@ -394,9 +426,11 @@ static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
 				flag |= HASH_LAST;
 
 		}
-		sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(&rctx->desc);
+		sha_ctx = (struct sha1_hash_ctx *)
+						ahash_request_ctx(&rctx->areq);
 		kernel_fpu_begin();
-		sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, nbytes, flag);
+		sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx,
+						rctx->walk.data, nbytes, flag);
 		if (!sha_ctx) {
 			if (flush)
 				sha_ctx = sha1_ctx_mgr_flush(cstate->mgr);
@@ -485,11 +519,10 @@ static void sha1_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
 	mcryptd_arm_flusher(cstate, delay);
 }
 
-static int sha1_mb_update(struct shash_desc *desc, const u8 *data,
-			  unsigned int len)
+static int sha1_mb_update(struct ahash_request *areq)
 {
 	struct mcryptd_hash_request_ctx *rctx =
-			container_of(desc, struct mcryptd_hash_request_ctx, desc);
+		container_of(areq, struct mcryptd_hash_request_ctx, areq);
 	struct mcryptd_alg_cstate *cstate =
 				this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
 
@@ -505,7 +538,7 @@ static int sha1_mb_update(struct shash_desc *desc, const u8 *data,
 	}
 
 	/* need to init context */
-	req_ctx_init(rctx, desc);
+	req_ctx_init(rctx, areq);
 
 	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
 
@@ -518,10 +551,11 @@ static int sha1_mb_update(struct shash_desc *desc, const u8 *data,
 		rctx->flag |= HASH_DONE;
 
 	/* submit */
-	sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(desc);
+	sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
 	sha1_mb_add_list(rctx, cstate);
 	kernel_fpu_begin();
-	sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, nbytes, HASH_UPDATE);
+	sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+							nbytes, HASH_UPDATE);
 	kernel_fpu_end();
 
 	/* check if anything is returned */
@@ -544,11 +578,10 @@ done:
 	return ret;
 }
 
-static int sha1_mb_finup(struct shash_desc *desc, const u8 *data,
-			     unsigned int len, u8 *out)
+static int sha1_mb_finup(struct ahash_request *areq)
 {
 	struct mcryptd_hash_request_ctx *rctx =
-			container_of(desc, struct mcryptd_hash_request_ctx, desc);
+		container_of(areq, struct mcryptd_hash_request_ctx, areq);
 	struct mcryptd_alg_cstate *cstate =
 				this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
 
@@ -563,7 +596,7 @@ static int sha1_mb_finup(struct shash_desc *desc, const u8 *data,
 	}
 
 	/* need to init context */
-	req_ctx_init(rctx, desc);
+	req_ctx_init(rctx, areq);
 
 	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
 
@@ -576,15 +609,15 @@ static int sha1_mb_finup(struct shash_desc *desc, const u8 *data,
 		rctx->flag |= HASH_DONE;
 		flag = HASH_LAST;
 	}
-	rctx->out = out;
 
 	/* submit */
 	rctx->flag |= HASH_FINAL;
-	sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(desc);
+	sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
 	sha1_mb_add_list(rctx, cstate);
 
 	kernel_fpu_begin();
-	sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, nbytes, flag);
+	sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+								nbytes, flag);
 	kernel_fpu_end();
 
 	/* check if anything is returned */
@@ -605,10 +638,10 @@ done:
 	return ret;
 }
 
-static int sha1_mb_final(struct shash_desc *desc, u8 *out)
+static int sha1_mb_final(struct ahash_request *areq)
 {
 	struct mcryptd_hash_request_ctx *rctx =
-			container_of(desc, struct mcryptd_hash_request_ctx, desc);
+		container_of(areq, struct mcryptd_hash_request_ctx, areq);
 	struct mcryptd_alg_cstate *cstate =
 				this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
 
@@ -623,16 +656,16 @@ static int sha1_mb_final(struct shash_desc *desc, u8 *out)
 	}
 
 	/* need to init context */
-	req_ctx_init(rctx, desc);
+	req_ctx_init(rctx, areq);
 
-	rctx->out = out;
 	rctx->flag |= HASH_DONE | HASH_FINAL;
 
-	sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(desc);
+	sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
 	/* flag HASH_FINAL and 0 data size */
 	sha1_mb_add_list(rctx, cstate);
 	kernel_fpu_begin();
-	sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0, HASH_LAST);
+	sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0,
+								HASH_LAST);
 	kernel_fpu_end();
 
 	/* check if anything is returned */
@@ -654,48 +687,98 @@ done:
 	return ret;
 }
 
-static int sha1_mb_export(struct shash_desc *desc, void *out)
+static int sha1_mb_export(struct ahash_request *areq, void *out)
 {
-	struct sha1_hash_ctx *sctx = shash_desc_ctx(desc);
+	struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
 
 	memcpy(out, sctx, sizeof(*sctx));
 
 	return 0;
 }
 
-static int sha1_mb_import(struct shash_desc *desc, const void *in)
+static int sha1_mb_import(struct ahash_request *areq, const void *in)
 {
-	struct sha1_hash_ctx *sctx = shash_desc_ctx(desc);
+	struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
 
 	memcpy(sctx, in, sizeof(*sctx));
 
 	return 0;
 }
 
+static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm)
+{
+	struct mcryptd_ahash *mcryptd_tfm;
+	struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct mcryptd_hash_ctx *mctx;
 
-static struct shash_alg sha1_mb_shash_alg = {
-	.digestsize	=	SHA1_DIGEST_SIZE,
+	mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb",
+						CRYPTO_ALG_INTERNAL,
+						CRYPTO_ALG_INTERNAL);
+	if (IS_ERR(mcryptd_tfm))
+		return PTR_ERR(mcryptd_tfm);
+	mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
+	mctx->alg_state = &sha1_mb_alg_state;
+	ctx->mcryptd_tfm = mcryptd_tfm;
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				sizeof(struct ahash_request) +
+				crypto_ahash_reqsize(&mcryptd_tfm->base));
+
+	return 0;
+}
+
+static void sha1_mb_async_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static int sha1_mb_areq_init_tfm(struct crypto_tfm *tfm)
+{
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				sizeof(struct ahash_request) +
+				sizeof(struct sha1_hash_ctx));
+
+	return 0;
+}
+
+static void sha1_mb_areq_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static struct ahash_alg sha1_mb_areq_alg = {
 	.init		=	sha1_mb_init,
 	.update		=	sha1_mb_update,
 	.final		=	sha1_mb_final,
 	.finup		=	sha1_mb_finup,
 	.export		=	sha1_mb_export,
 	.import		=	sha1_mb_import,
-	.descsize	=	sizeof(struct sha1_hash_ctx),
-	.statesize	=	sizeof(struct sha1_hash_ctx),
-	.base		=	{
-		.cra_name	 = "__sha1-mb",
-		.cra_driver_name = "__intel_sha1-mb",
-		.cra_priority	 = 100,
-		/*
-		 * use ASYNC flag as some buffers in multi-buffer
-		 * algo may not have completed before hashing thread sleep
-		 */
-		.cra_flags	 = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC |
-				   CRYPTO_ALG_INTERNAL,
-		.cra_blocksize	 = SHA1_BLOCK_SIZE,
-		.cra_module	 = THIS_MODULE,
-		.cra_list	 = LIST_HEAD_INIT(sha1_mb_shash_alg.base.cra_list),
+	.halg		=	{
+		.digestsize	=	SHA1_DIGEST_SIZE,
+		.statesize	=	sizeof(struct sha1_hash_ctx),
+		.base		=	{
+			.cra_name	 = "__sha1-mb",
+			.cra_driver_name = "__intel_sha1-mb",
+			.cra_priority	 = 100,
+			/*
+			 * use ASYNC flag as some buffers in multi-buffer
+			 * algo may not have completed before hashing thread
+			 * sleep
+			 */
+			.cra_flags	= CRYPTO_ALG_TYPE_AHASH |
+						CRYPTO_ALG_ASYNC |
+						CRYPTO_ALG_INTERNAL,
+			.cra_blocksize	= SHA1_BLOCK_SIZE,
+			.cra_module	= THIS_MODULE,
+			.cra_list	= LIST_HEAD_INIT
+					(sha1_mb_areq_alg.halg.base.cra_list),
+			.cra_init	= sha1_mb_areq_init_tfm,
+			.cra_exit	= sha1_mb_areq_exit_tfm,
+			.cra_ctxsize	= sizeof(struct sha1_hash_ctx),
+		}
 	}
 };
 
@@ -780,46 +863,20 @@ static int sha1_mb_async_import(struct ahash_request *req, const void *in)
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
 	struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
 	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-	struct crypto_shash *child = mcryptd_ahash_child(mcryptd_tfm);
+	struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm);
 	struct mcryptd_hash_request_ctx *rctx;
-	struct shash_desc *desc;
+	struct ahash_request *areq;
 
 	memcpy(mcryptd_req, req, sizeof(*req));
 	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
 	rctx = ahash_request_ctx(mcryptd_req);
-	desc = &rctx->desc;
-	desc->tfm = child;
-	desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	return crypto_ahash_import(mcryptd_req, in);
-}
-
-static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm)
-{
-	struct mcryptd_ahash *mcryptd_tfm;
-	struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
-	struct mcryptd_hash_ctx *mctx;
+	areq = &rctx->areq;
 
-	mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb",
-					  CRYPTO_ALG_INTERNAL,
-					  CRYPTO_ALG_INTERNAL);
-	if (IS_ERR(mcryptd_tfm))
-		return PTR_ERR(mcryptd_tfm);
-	mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
-	mctx->alg_state = &sha1_mb_alg_state;
-	ctx->mcryptd_tfm = mcryptd_tfm;
-	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
-				 sizeof(struct ahash_request) +
-				 crypto_ahash_reqsize(&mcryptd_tfm->base));
+	ahash_request_set_tfm(areq, child);
+	ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP,
+					rctx->complete, req);
 
-	return 0;
-}
-
-static void sha1_mb_async_exit_tfm(struct crypto_tfm *tfm)
-{
-	struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
-
-	mcryptd_free_ahash(ctx->mcryptd_tfm);
+	return crypto_ahash_import(mcryptd_req, in);
 }
 
 static struct ahash_alg sha1_mb_async_alg = {
@@ -866,7 +923,8 @@ static unsigned long sha1_mb_flusher(struct mcryptd_alg_cstate *cstate)
 		if (time_before(cur_time, rctx->tag.expire))
 			break;
 		kernel_fpu_begin();
-		sha_ctx = (struct sha1_hash_ctx *) sha1_ctx_mgr_flush(cstate->mgr);
+		sha_ctx = (struct sha1_hash_ctx *)
+					sha1_ctx_mgr_flush(cstate->mgr);
 		kernel_fpu_end();
 		if (!sha_ctx) {
 			pr_err("sha1_mb error: nothing got flushed for non-empty list\n");
@@ -927,7 +985,7 @@ static int __init sha1_mb_mod_init(void)
 	}
 	sha1_mb_alg_state.flusher = &sha1_mb_flusher;
 
-	err = crypto_register_shash(&sha1_mb_shash_alg);
+	err = crypto_register_ahash(&sha1_mb_areq_alg);
 	if (err)
 		goto err2;
 	err = crypto_register_ahash(&sha1_mb_async_alg);
@@ -937,7 +995,7 @@ static int __init sha1_mb_mod_init(void)
 
 	return 0;
 err1:
-	crypto_unregister_shash(&sha1_mb_shash_alg);
+	crypto_unregister_ahash(&sha1_mb_areq_alg);
 err2:
 	for_each_possible_cpu(cpu) {
 		cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
@@ -953,7 +1011,7 @@ static void __exit sha1_mb_mod_fini(void)
 	struct mcryptd_alg_cstate *cpu_state;
 
 	crypto_unregister_ahash(&sha1_mb_async_alg);
-	crypto_unregister_shash(&sha1_mb_shash_alg);
+	crypto_unregister_ahash(&sha1_mb_areq_alg);
 	for_each_possible_cpu(cpu) {
 		cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
 		kfree(cpu_state->mgr);
diff --git a/arch/x86/crypto/sha-mb/sha_mb_ctx.h b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h
index e36069d0c1bd..98a35bcc6f4a 100644
--- a/arch/x86/crypto/sha-mb/sha_mb_ctx.h
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h
@@ -54,7 +54,7 @@
 #ifndef _SHA_MB_CTX_INTERNAL_H
 #define _SHA_MB_CTX_INTERNAL_H
 
-#include "sha_mb_mgr.h"
+#include "sha1_mb_mgr.h"
 
 #define HASH_UPDATE          0x00
 #define HASH_FIRST           0x01
diff --git a/arch/x86/crypto/sha-mb/sha_mb_mgr.h b/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h
index 08ad1a9acfd7..08ad1a9acfd7 100644
--- a/arch/x86/crypto/sha-mb/sha_mb_mgr.h
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_datastruct.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S
index 86688c6e7a25..86688c6e7a25 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_datastruct.S
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S
index 96df6a39d7e2..96df6a39d7e2 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c
index 822acb5b464c..d2add0d35f43 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c
@@ -51,7 +51,7 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "sha_mb_mgr.h"
+#include "sha1_mb_mgr.h"
 
 void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state)
 {
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S
index 63a0d9c8e31f..63a0d9c8e31f 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S
diff --git a/arch/x86/crypto/sha-mb/sha1_x8_avx2.S b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S
index c9dae1cd2919..c9dae1cd2919 100644
--- a/arch/x86/crypto/sha-mb/sha1_x8_avx2.S
+++ b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 1024e378a358..fc61739150e7 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -374,3 +374,9 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated");
 
 MODULE_ALIAS_CRYPTO("sha1");
+MODULE_ALIAS_CRYPTO("sha1-ssse3");
+MODULE_ALIAS_CRYPTO("sha1-avx");
+MODULE_ALIAS_CRYPTO("sha1-avx2");
+#ifdef CONFIG_AS_SHA1_NI
+MODULE_ALIAS_CRYPTO("sha1-ni");
+#endif
diff --git a/arch/x86/crypto/sha256-mb/Makefile b/arch/x86/crypto/sha256-mb/Makefile
new file mode 100644
index 000000000000..41089e7c400c
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/Makefile
@@ -0,0 +1,11 @@
+#
+# Arch-specific CryptoAPI modules.
+#
+
+avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
+                                $(comma)4)$(comma)%ymm2,yes,no)
+ifeq ($(avx2_supported),yes)
+	obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb.o
+	sha256-mb-y := sha256_mb.o sha256_mb_mgr_flush_avx2.o \
+	     sha256_mb_mgr_init_avx2.o sha256_mb_mgr_submit_avx2.o sha256_x8_avx2.o
+endif
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb.c b/arch/x86/crypto/sha256-mb/sha256_mb.c
new file mode 100644
index 000000000000..89fa85e8b10c
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb.c
@@ -0,0 +1,1030 @@
+/*
+ * Multi buffer SHA256 algorithm Glue Code
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *	Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/sha.h>
+#include <crypto/mcryptd.h>
+#include <crypto/crypto_wq.h>
+#include <asm/byteorder.h>
+#include <linux/hardirq.h>
+#include <asm/fpu/api.h>
+#include "sha256_mb_ctx.h"
+
+#define FLUSH_INTERVAL 1000 /* in usec */
+
+static struct mcryptd_alg_state sha256_mb_alg_state;
+
+struct sha256_mb_ctx {
+	struct mcryptd_ahash *mcryptd_tfm;
+};
+
+static inline struct mcryptd_hash_request_ctx
+		*cast_hash_to_mcryptd_ctx(struct sha256_hash_ctx *hash_ctx)
+{
+	struct ahash_request *areq;
+
+	areq = container_of((void *) hash_ctx, struct ahash_request, __ctx);
+	return container_of(areq, struct mcryptd_hash_request_ctx, areq);
+}
+
+static inline struct ahash_request
+		*cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx)
+{
+	return container_of((void *) ctx, struct ahash_request, __ctx);
+}
+
+static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx,
+				struct ahash_request *areq)
+{
+	rctx->flag = HASH_UPDATE;
+}
+
+static asmlinkage void (*sha256_job_mgr_init)(struct sha256_mb_mgr *state);
+static asmlinkage struct job_sha256* (*sha256_job_mgr_submit)
+			(struct sha256_mb_mgr *state, struct job_sha256 *job);
+static asmlinkage struct job_sha256* (*sha256_job_mgr_flush)
+			(struct sha256_mb_mgr *state);
+static asmlinkage struct job_sha256* (*sha256_job_mgr_get_comp_job)
+			(struct sha256_mb_mgr *state);
+
+inline void sha256_init_digest(uint32_t *digest)
+{
+	static const uint32_t initial_digest[SHA256_DIGEST_LENGTH] = {
+				SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
+				SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7};
+	memcpy(digest, initial_digest, sizeof(initial_digest));
+}
+
+inline uint32_t sha256_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2],
+			 uint32_t total_len)
+{
+	uint32_t i = total_len & (SHA256_BLOCK_SIZE - 1);
+
+	memset(&padblock[i], 0, SHA256_BLOCK_SIZE);
+	padblock[i] = 0x80;
+	i += ((SHA256_BLOCK_SIZE - 1) &
+	      (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1)))
+	     + 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+	*((uint64_t *) &padblock[i - 16]) = 0;
+#endif
+	/* Widen before shifting: a 32-bit shift drops bits from 512 MiB up */
+	*((uint64_t *) &padblock[i - 8]) =
+				cpu_to_be64((uint64_t) total_len << 3);
+
+	/* Number of extra blocks to hash */
+	return i >> SHA256_LOG2_BLOCK_SIZE;
+}
+
+static struct sha256_hash_ctx
+		*sha256_ctx_mgr_resubmit(struct sha256_ctx_mgr *mgr,
+					struct sha256_hash_ctx *ctx)
+{
+	while (ctx) {
+		if (ctx->status & HASH_CTX_STS_COMPLETE) {
+			/* Clear PROCESSING bit */
+			ctx->status = HASH_CTX_STS_COMPLETE;
+			return ctx;
+		}
+
+		/*
+		 * If the extra blocks are empty, begin hashing what remains
+		 * in the user's buffer.
+		 */
+		if (ctx->partial_block_buffer_length == 0 &&
+		    ctx->incoming_buffer_length) {
+
+			const void *buffer = ctx->incoming_buffer;
+			uint32_t len = ctx->incoming_buffer_length;
+			uint32_t copy_len;
+
+			/*
+			 * Only entire blocks can be hashed.
+			 * Copy remainder to extra blocks buffer.
+			 */
+			copy_len = len & (SHA256_BLOCK_SIZE-1);
+
+			if (copy_len) {
+				len -= copy_len;
+				memcpy(ctx->partial_block_buffer,
+				       ((const char *) buffer + len),
+				       copy_len);
+				ctx->partial_block_buffer_length = copy_len;
+			}
+
+			ctx->incoming_buffer_length = 0;
+
+			/* len should be a multiple of the block size now */
+			assert((len % SHA256_BLOCK_SIZE) == 0);
+
+			/* Set len to the number of blocks to be hashed */
+			len >>= SHA256_LOG2_BLOCK_SIZE;
+
+			if (len) {
+
+				ctx->job.buffer = (uint8_t *) buffer;
+				ctx->job.len = len;
+				ctx = (struct sha256_hash_ctx *)
+				sha256_job_mgr_submit(&mgr->mgr, &ctx->job);
+				continue;
+			}
+		}
+
+		/*
+		 * If the extra blocks are not empty, then we are
+		 * either on the last block(s) or we need more
+		 * user input before continuing.
+		 */
+		if (ctx->status & HASH_CTX_STS_LAST) {
+
+			uint8_t *buf = ctx->partial_block_buffer;
+			uint32_t n_extra_blocks =
+				sha256_pad(buf, ctx->total_length);
+
+			ctx->status = (HASH_CTX_STS_PROCESSING |
+				       HASH_CTX_STS_COMPLETE);
+			ctx->job.buffer = buf;
+			ctx->job.len = (uint32_t) n_extra_blocks;
+			ctx = (struct sha256_hash_ctx *)
+				sha256_job_mgr_submit(&mgr->mgr, &ctx->job);
+			continue;
+		}
+
+		ctx->status = HASH_CTX_STS_IDLE;
+		return ctx;
+	}
+
+	return NULL;
+}
+
+static struct sha256_hash_ctx
+		*sha256_ctx_mgr_get_comp_ctx(struct sha256_ctx_mgr *mgr)
+{
+	/*
+	 * If get_comp_job returns NULL, there are no jobs complete.
+	 * If get_comp_job returns a job, verify that it is safe to return to
+	 * the user. If it is not ready, resubmit the job to finish processing.
+	 * If sha256_ctx_mgr_resubmit returned a job, it is ready to be
+	 * returned. Otherwise, all jobs currently being managed by the
+	 * hash_ctx_mgr still need processing.
+	 */
+	struct sha256_hash_ctx *ctx;
+
+	ctx = (struct sha256_hash_ctx *) sha256_job_mgr_get_comp_job(&mgr->mgr);
+	return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static void sha256_ctx_mgr_init(struct sha256_ctx_mgr *mgr)
+{
+	sha256_job_mgr_init(&mgr->mgr);
+}
+
+static struct sha256_hash_ctx *sha256_ctx_mgr_submit(struct sha256_ctx_mgr *mgr,
+					  struct sha256_hash_ctx *ctx,
+					  const void *buffer,
+					  uint32_t len,
+					  int flags)
+{
+	if (flags & (~HASH_ENTIRE)) {
+		/* User should not pass anything other than FIRST, UPDATE
+		 * or LAST
+		 */
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+
+	if (ctx->status & HASH_CTX_STS_PROCESSING) {
+		/* Cannot submit to a currently processing job. */
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+
+	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+		/* Cannot update a finished job. */
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+	if (flags & HASH_FIRST) {
+		/* Init digest */
+		sha256_init_digest(ctx->job.result_digest);
+
+		/* Reset byte counter */
+		ctx->total_length = 0;
+
+		/* Clear extra blocks */
+		ctx->partial_block_buffer_length = 0;
+	}
+
+	/* If we made it here, there was no error during this call to submit */
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	/* Store buffer ptr info from user */
+	ctx->incoming_buffer = buffer;
+	ctx->incoming_buffer_length = len;
+
+	/*
+	 * Store the user's request flags and mark this ctx as currently
+	 * being processed.
+	 */
+	ctx->status = (flags & HASH_LAST) ?
+			(HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+			HASH_CTX_STS_PROCESSING;
+
+	/* Advance byte counter */
+	ctx->total_length += len;
+
+	/*
+	 * If there is anything currently buffered in the extra blocks,
+	 * append to it until it contains a whole block.
+	 * Or if the user's buffer contains less than a whole block,
+	 * append as much as possible to the extra block.
+	 */
+	if (ctx->partial_block_buffer_length || len < SHA256_BLOCK_SIZE) {
+		/*
+		 * Compute how many bytes to copy from user buffer into
+		 * extra block
+		 */
+		uint32_t copy_len = SHA256_BLOCK_SIZE -
+					ctx->partial_block_buffer_length;
+		if (len < copy_len)
+			copy_len = len;
+
+		if (copy_len) {
+			/* Copy and update relevant pointers and counters */
+			memcpy(
+		&ctx->partial_block_buffer[ctx->partial_block_buffer_length],
+				buffer, copy_len);
+
+			ctx->partial_block_buffer_length += copy_len;
+			ctx->incoming_buffer = (const void *)
+					((const char *)buffer + copy_len);
+			ctx->incoming_buffer_length = len - copy_len;
+		}
+
+		/* The extra block should never contain more than 1 block */
+		assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+		/*
+		 * If the extra block buffer contains exactly 1 block,
+		 * it can be hashed.
+		 */
+		if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+			ctx->partial_block_buffer_length = 0;
+
+			ctx->job.buffer = ctx->partial_block_buffer;
+			ctx->job.len = 1;
+			ctx = (struct sha256_hash_ctx *)
+				sha256_job_mgr_submit(&mgr->mgr, &ctx->job);
+		}
+	}
+
+	return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static struct sha256_hash_ctx *sha256_ctx_mgr_flush(struct sha256_ctx_mgr *mgr)
+{
+	struct sha256_hash_ctx *ctx;
+
+	while (1) {
+		ctx = (struct sha256_hash_ctx *)
+					sha256_job_mgr_flush(&mgr->mgr);
+
+		/* If flush returned 0, there are no more jobs in flight. */
+		if (!ctx)
+			return NULL;
+
+		/*
+		 * If flush returned a job, resubmit the job to finish
+		 * processing.
+		 */
+		ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+		/*
+		 * If sha256_ctx_mgr_resubmit returned a job, it is ready to
+		 * be returned. Otherwise, all jobs currently being managed by
+		 * the sha256_ctx_mgr still need processing. Loop.
+		 */
+		if (ctx)
+			return ctx;
+	}
+}
+
+static int sha256_mb_init(struct ahash_request *areq)
+{
+	struct sha256_hash_ctx *sctx = ahash_request_ctx(areq);
+
+	hash_ctx_init(sctx);
+	sctx->job.result_digest[0] = SHA256_H0;
+	sctx->job.result_digest[1] = SHA256_H1;
+	sctx->job.result_digest[2] = SHA256_H2;
+	sctx->job.result_digest[3] = SHA256_H3;
+	sctx->job.result_digest[4] = SHA256_H4;
+	sctx->job.result_digest[5] = SHA256_H5;
+	sctx->job.result_digest[6] = SHA256_H6;
+	sctx->job.result_digest[7] = SHA256_H7;
+	sctx->total_length = 0;
+	sctx->partial_block_buffer_length = 0;
+	sctx->status = HASH_CTX_STS_IDLE;
+
+	return 0;
+}
+
+static int sha256_mb_set_results(struct mcryptd_hash_request_ctx *rctx)
+{
+	int	i;
+	struct	sha256_hash_ctx *sctx = ahash_request_ctx(&rctx->areq);
+	__be32	*dst = (__be32 *) rctx->out;
+
+	for (i = 0; i < 8; ++i)
+		dst[i] = cpu_to_be32(sctx->job.result_digest[i]);
+
+	return 0;
+}
+
+static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
+			struct mcryptd_alg_cstate *cstate, bool flush)
+{
+	int	flag = HASH_UPDATE;
+	int	nbytes, err = 0;
+	struct mcryptd_hash_request_ctx *rctx = *ret_rctx;
+	struct sha256_hash_ctx *sha_ctx;
+
+	/* more work ? */
+	while (!(rctx->flag & HASH_DONE)) {
+		nbytes = crypto_ahash_walk_done(&rctx->walk, 0);
+		if (nbytes < 0) {
+			err = nbytes;
+			goto out;
+		}
+		/* check if the walk is done */
+		if (crypto_ahash_walk_last(&rctx->walk)) {
+			rctx->flag |= HASH_DONE;
+			if (rctx->flag & HASH_FINAL)
+				flag |= HASH_LAST;
+
+		}
+		sha_ctx = (struct sha256_hash_ctx *)
+						ahash_request_ctx(&rctx->areq);
+		kernel_fpu_begin();
+		sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx,
+						rctx->walk.data, nbytes, flag);
+		if (!sha_ctx) {
+			if (flush)
+				sha_ctx = sha256_ctx_mgr_flush(cstate->mgr);
+		}
+		kernel_fpu_end();
+		if (sha_ctx)
+			rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		else {
+			rctx = NULL;
+			goto out;
+		}
+	}
+
+	/* copy the results */
+	if (rctx->flag & HASH_FINAL)
+		sha256_mb_set_results(rctx);
+
+out:
+	*ret_rctx = rctx;
+	return err;
+}
+
+static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
+			    struct mcryptd_alg_cstate *cstate,
+			    int err)
+{
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha256_hash_ctx *sha_ctx;
+	struct mcryptd_hash_request_ctx *req_ctx;
+	int ret;
+
+	/* remove from work list */
+	spin_lock(&cstate->work_lock);
+	list_del(&rctx->waiter);
+	spin_unlock(&cstate->work_lock);
+
+	if (irqs_disabled())
+		rctx->complete(&req->base, err);
+	else {
+		local_bh_disable();
+		rctx->complete(&req->base, err);
+		local_bh_enable();
+	}
+
+	/* check to see if there are other jobs that are done */
+	sha_ctx = sha256_ctx_mgr_get_comp_ctx(cstate->mgr);
+	while (sha_ctx) {
+		req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		ret = sha_finish_walk(&req_ctx, cstate, false);
+		if (req_ctx) {
+			spin_lock(&cstate->work_lock);
+			list_del(&req_ctx->waiter);
+			spin_unlock(&cstate->work_lock);
+			/* rctx was completed above; use this job's callback */
+			req = cast_mcryptd_ctx_to_req(req_ctx);
+			if (irqs_disabled())
+				req_ctx->complete(&req->base, ret);
+			else {
+				local_bh_disable();
+				req_ctx->complete(&req->base, ret);
+				local_bh_enable();
+			}
+		}
+		sha_ctx = sha256_ctx_mgr_get_comp_ctx(cstate->mgr);
+	}
+
+	return 0;
+}
+
+static void sha256_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
+			     struct mcryptd_alg_cstate *cstate)
+{
+	unsigned long next_flush;
+	unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL);
+
+	/* initialize tag */
+	rctx->tag.arrival = jiffies;    /* tag the arrival time */
+	rctx->tag.seq_num = cstate->next_seq_num++;
+	next_flush = rctx->tag.arrival + delay;
+	rctx->tag.expire = next_flush;
+
+	spin_lock(&cstate->work_lock);
+	list_add_tail(&rctx->waiter, &cstate->work_list);
+	spin_unlock(&cstate->work_lock);
+
+	mcryptd_arm_flusher(cstate, delay);
+}
+
+static int sha256_mb_update(struct ahash_request *areq)
+{
+	struct mcryptd_hash_request_ctx *rctx =
+		container_of(areq, struct mcryptd_hash_request_ctx, areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha256_mb_alg_state.alg_cstate);
+
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha256_hash_ctx *sha_ctx;
+	int ret = 0, nbytes;
+
+	/* sanity check */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+	if (nbytes < 0) {
+		ret = nbytes;
+		goto done;
+	}
+
+	if (crypto_ahash_walk_last(&rctx->walk))
+		rctx->flag |= HASH_DONE;
+
+	/* submit */
+	sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq);
+	sha256_mb_add_list(rctx, cstate);
+	kernel_fpu_begin();
+	sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+							nbytes, HASH_UPDATE);
+	kernel_fpu_end();
+
+	/* check if anything is returned */
+	if (!sha_ctx)
+		return -EINPROGRESS;
+
+	if (sha_ctx->error) {
+		ret = sha_ctx->error;
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		goto done;
+	}
+
+	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+	ret = sha_finish_walk(&rctx, cstate, false);
+
+	if (!rctx)
+		return -EINPROGRESS;
+done:
+	sha_complete_job(rctx, cstate, ret);
+	return ret;
+}
+
+static int sha256_mb_finup(struct ahash_request *areq)
+{
+	struct mcryptd_hash_request_ctx *rctx =
+		container_of(areq, struct mcryptd_hash_request_ctx, areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha256_mb_alg_state.alg_cstate);
+
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha256_hash_ctx *sha_ctx;
+	int ret = 0, flag = HASH_UPDATE, nbytes;
+
+	/* sanity check */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+	if (nbytes < 0) {
+		ret = nbytes;
+		goto done;
+	}
+
+	if (crypto_ahash_walk_last(&rctx->walk)) {
+		rctx->flag |= HASH_DONE;
+		flag = HASH_LAST;
+	}
+
+	/* submit */
+	rctx->flag |= HASH_FINAL;
+	sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq);
+	sha256_mb_add_list(rctx, cstate);
+
+	kernel_fpu_begin();
+	sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+								nbytes, flag);
+	kernel_fpu_end();
+
+	/* check if anything is returned */
+	if (!sha_ctx)
+		return -EINPROGRESS;
+
+	if (sha_ctx->error) {
+		ret = sha_ctx->error;
+		goto done;
+	}
+
+	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+	ret = sha_finish_walk(&rctx, cstate, false);
+	if (!rctx)
+		return -EINPROGRESS;
+done:
+	sha_complete_job(rctx, cstate, ret);
+	return ret;
+}
+
+static int sha256_mb_final(struct ahash_request *areq)
+{
+	struct mcryptd_hash_request_ctx *rctx =
+			container_of(areq, struct mcryptd_hash_request_ctx,
+			areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha256_mb_alg_state.alg_cstate);
+
+	struct sha256_hash_ctx *sha_ctx;
+	int ret = 0;
+	u8 data;
+
+	/* sanity check */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+	rctx->flag |= HASH_DONE | HASH_FINAL;
+
+	sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq);
+	/* flag HASH_FINAL and 0 data size */
+	sha256_mb_add_list(rctx, cstate);
+	kernel_fpu_begin();
+	sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0,
+								HASH_LAST);
+	kernel_fpu_end();
+
+	/* check if anything is returned */
+	if (!sha_ctx)
+		return -EINPROGRESS;
+
+	if (sha_ctx->error) {
+		ret = sha_ctx->error;
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		goto done;
+	}
+
+	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+	ret = sha_finish_walk(&rctx, cstate, false);
+	if (!rctx)
+		return -EINPROGRESS;
+done:
+	sha_complete_job(rctx, cstate, ret);
+	return ret;
+}
+
+static int sha256_mb_export(struct ahash_request *areq, void *out)
+{
+	struct sha256_hash_ctx *sctx = ahash_request_ctx(areq);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha256_mb_import(struct ahash_request *areq, const void *in)
+{
+	struct sha256_hash_ctx *sctx = ahash_request_ctx(areq);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha256_mb_async_init_tfm(struct crypto_tfm *tfm)
+{
+	struct mcryptd_ahash *mcryptd_tfm;
+	struct sha256_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct mcryptd_hash_ctx *mctx;
+
+	mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha256-mb",
+						CRYPTO_ALG_INTERNAL,
+						CRYPTO_ALG_INTERNAL);
+	if (IS_ERR(mcryptd_tfm))
+		return PTR_ERR(mcryptd_tfm);
+	mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
+	mctx->alg_state = &sha256_mb_alg_state;
+	ctx->mcryptd_tfm = mcryptd_tfm;
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				sizeof(struct ahash_request) +
+				crypto_ahash_reqsize(&mcryptd_tfm->base));
+
+	return 0;
+}
+
+static void sha256_mb_async_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct sha256_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static int sha256_mb_areq_init_tfm(struct crypto_tfm *tfm)
+{
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				sizeof(struct ahash_request) +
+				sizeof(struct sha256_hash_ctx));
+
+	return 0;
+}
+
+static void sha256_mb_areq_exit_tfm(struct crypto_tfm *tfm)
+{
+	/*
+	 * Nothing to free: sha256_mb_areq_init_tfm() only sets the reqsize.
+	 */
+}
+
+static struct ahash_alg sha256_mb_areq_alg = {
+	.init		=	sha256_mb_init,
+	.update		=	sha256_mb_update,
+	.final		=	sha256_mb_final,
+	.finup		=	sha256_mb_finup,
+	.export		=	sha256_mb_export,
+	.import		=	sha256_mb_import,
+	.halg		=	{
+	.digestsize	=	SHA256_DIGEST_SIZE,
+	.statesize	=	sizeof(struct sha256_hash_ctx),
+		.base		=	{
+			.cra_name	 = "__sha256-mb",
+			.cra_driver_name = "__intel_sha256-mb",
+			.cra_priority	 = 100,
+			/*
+			 * use ASYNC flag as some buffers in multi-buffer
+			 * algo may not have completed before hashing thread
+			 * sleep
+			 */
+			.cra_flags	= CRYPTO_ALG_TYPE_AHASH |
+						CRYPTO_ALG_ASYNC |
+						CRYPTO_ALG_INTERNAL,
+			.cra_blocksize	= SHA256_BLOCK_SIZE,
+			.cra_module	= THIS_MODULE,
+			.cra_list	= LIST_HEAD_INIT
+					(sha256_mb_areq_alg.halg.base.cra_list),
+			.cra_init	= sha256_mb_areq_init_tfm,
+			.cra_exit	= sha256_mb_areq_exit_tfm,
+			.cra_ctxsize	= sizeof(struct sha256_hash_ctx),
+		}
+	}
+};
+
+static int sha256_mb_async_init(struct ahash_request *req)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_init(mcryptd_req);
+}
+
+static int sha256_mb_async_update(struct ahash_request *req)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_update(mcryptd_req);
+}
+
+static int sha256_mb_async_finup(struct ahash_request *req)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_finup(mcryptd_req);
+}
+
+static int sha256_mb_async_final(struct ahash_request *req)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_final(mcryptd_req);
+}
+
+static int sha256_mb_async_digest(struct ahash_request *req)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_digest(mcryptd_req);
+}
+
+static int sha256_mb_async_export(struct ahash_request *req, void *out)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	return crypto_ahash_export(mcryptd_req, out);
+}
+
+static int sha256_mb_async_import(struct ahash_request *req, const void *in)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+	struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm);
+	struct mcryptd_hash_request_ctx *rctx;
+	struct ahash_request *areq;
+
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	rctx = ahash_request_ctx(mcryptd_req);
+	areq = &rctx->areq;
+
+	ahash_request_set_tfm(areq, child);
+	ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP,
+					rctx->complete, req);
+
+	return crypto_ahash_import(mcryptd_req, in);
+}
+
+static struct ahash_alg sha256_mb_async_alg = {
+	.init           = sha256_mb_async_init,
+	.update         = sha256_mb_async_update,
+	.final          = sha256_mb_async_final,
+	.finup          = sha256_mb_async_finup,
+	.export         = sha256_mb_async_export,
+	.import         = sha256_mb_async_import,
+	.digest         = sha256_mb_async_digest,
+	.halg = {
+		.digestsize     = SHA256_DIGEST_SIZE,
+		.statesize      = sizeof(struct sha256_hash_ctx),
+		.base = {
+			.cra_name               = "sha256",
+			.cra_driver_name        = "sha256_mb",
+			.cra_priority           = 200,
+			.cra_flags              = CRYPTO_ALG_TYPE_AHASH |
+							CRYPTO_ALG_ASYNC,
+			.cra_blocksize          = SHA256_BLOCK_SIZE,
+			.cra_type               = &crypto_ahash_type,
+			.cra_module             = THIS_MODULE,
+			.cra_list               = LIST_HEAD_INIT
+				(sha256_mb_async_alg.halg.base.cra_list),
+			.cra_init               = sha256_mb_async_init_tfm,
+			.cra_exit               = sha256_mb_async_exit_tfm,
+			.cra_ctxsize		= sizeof(struct sha256_mb_ctx),
+			.cra_alignmask		= 0,
+		},
+	},
+};
+
+static unsigned long sha256_mb_flusher(struct mcryptd_alg_cstate *cstate)
+{
+	struct mcryptd_hash_request_ctx *rctx;
+	unsigned long cur_time;
+	unsigned long next_flush = 0;
+	struct sha256_hash_ctx *sha_ctx;
+
+
+	cur_time = jiffies;
+
+	while (!list_empty(&cstate->work_list)) {
+		rctx = list_entry(cstate->work_list.next,
+				struct mcryptd_hash_request_ctx, waiter);
+		if (time_before(cur_time, rctx->tag.expire))
+			break;
+		kernel_fpu_begin();
+		sha_ctx = (struct sha256_hash_ctx *)
+					sha256_ctx_mgr_flush(cstate->mgr);
+		kernel_fpu_end();
+		if (!sha_ctx) {
+			pr_err("sha256_mb error: nothing got"
+					" flushed for non-empty list\n");
+			break;
+		}
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		sha_finish_walk(&rctx, cstate, true);
+		sha_complete_job(rctx, cstate, 0);
+	}
+
+	if (!list_empty(&cstate->work_list)) {
+		rctx = list_entry(cstate->work_list.next,
+				struct mcryptd_hash_request_ctx, waiter);
+		/* get the hash context and then flush time */
+		next_flush = rctx->tag.expire;
+		mcryptd_arm_flusher(cstate, get_delay(next_flush));
+	}
+	return next_flush;
+}
+
+static int __init sha256_mb_mod_init(void)
+{
+	/* 0, -ENODEV without AVX2 and BMI2, or the failing step's errno */
+	int cpu;
+	int err = -ENOMEM;	/* reported if a per-cpu manager alloc fails */
+	struct mcryptd_alg_cstate *cpu_state;
+
+	/* check for dependent cpu features */
+	if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+	    !boot_cpu_has(X86_FEATURE_BMI2))
+		return -ENODEV;
+
+	/* initialize multibuffer structures */
+	sha256_mb_alg_state.alg_cstate = alloc_percpu
+						(struct mcryptd_alg_cstate);
+
+	sha256_job_mgr_init = sha256_mb_mgr_init_avx2;
+	sha256_job_mgr_submit = sha256_mb_mgr_submit_avx2;
+	sha256_job_mgr_flush = sha256_mb_mgr_flush_avx2;
+	sha256_job_mgr_get_comp_job = sha256_mb_mgr_get_comp_job_avx2;
+
+	if (!sha256_mb_alg_state.alg_cstate)
+		return -ENOMEM;
+	for_each_possible_cpu(cpu) {
+		cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu);
+		cpu_state->next_flush = 0;
+		cpu_state->next_seq_num = 0;
+		cpu_state->flusher_engaged = false;
+		INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher);
+		cpu_state->cpu = cpu;
+		cpu_state->alg_state = &sha256_mb_alg_state;
+		cpu_state->mgr = kzalloc(sizeof(struct sha256_ctx_mgr),
+					GFP_KERNEL);
+		if (!cpu_state->mgr)
+			goto err2;
+		sha256_ctx_mgr_init(cpu_state->mgr);
+		INIT_LIST_HEAD(&cpu_state->work_list);
+		spin_lock_init(&cpu_state->work_lock);
+	}
+	sha256_mb_alg_state.flusher = &sha256_mb_flusher;
+
+	err = crypto_register_ahash(&sha256_mb_areq_alg);
+	if (err)
+		goto err2;
+	err = crypto_register_ahash(&sha256_mb_async_alg);
+	if (err)
+		goto err1;
+
+
+	return 0;
+err1:
+	crypto_unregister_ahash(&sha256_mb_areq_alg);
+err2:
+	for_each_possible_cpu(cpu) {
+		cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu);
+		kfree(cpu_state->mgr);
+	}
+	free_percpu(sha256_mb_alg_state.alg_cstate);
+	return err;
+}
+
+static void __exit sha256_mb_mod_fini(void)
+{
+	int cpu;
+	struct mcryptd_alg_cstate *cpu_state;
+
+	crypto_unregister_ahash(&sha256_mb_async_alg);
+	crypto_unregister_ahash(&sha256_mb_areq_alg);
+	for_each_possible_cpu(cpu) {
+		cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu);
+		kfree(cpu_state->mgr);
+	}
+	free_percpu(sha256_mb_alg_state.alg_cstate);
+}
+
+module_init(sha256_mb_mod_init);
+module_exit(sha256_mb_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, multi buffer accelerated");
+
+MODULE_ALIAS_CRYPTO("sha256");
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h
new file mode 100644
index 000000000000..edd252b73206
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h
@@ -0,0 +1,136 @@
+/*
+ * Header file for multi buffer SHA256 context
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *	Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _SHA_MB_CTX_INTERNAL_H
+#define _SHA_MB_CTX_INTERNAL_H
+
+#include "sha256_mb_mgr.h"
+
+#define HASH_UPDATE          0x00	/* HASH_*: request flags; users live outside this header */
+#define HASH_FIRST           0x01
+#define HASH_LAST            0x02
+#define HASH_ENTIRE          0x03	/* == HASH_FIRST | HASH_LAST */
+#define HASH_DONE	     0x04
+#define HASH_FINAL	     0x08
+/* Values for sha256_hash_ctx.status; PROCESSING is tested as a bit mask. */
+#define HASH_CTX_STS_IDLE       0x00
+#define HASH_CTX_STS_PROCESSING 0x01	/* see hash_ctx_processing() */
+#define HASH_CTX_STS_LAST       0x02
+#define HASH_CTX_STS_COMPLETE   0x04	/* state installed by hash_ctx_init() */
+
+enum hash_ctx_error {	/* codes kept in sha256_hash_ctx.error */
+	HASH_CTX_ERROR_NONE               =  0,	/* set by hash_ctx_init() */
+	HASH_CTX_ERROR_INVALID_FLAGS      = -1,
+	HASH_CTX_ERROR_ALREADY_PROCESSING = -2,
+	HASH_CTX_ERROR_ALREADY_COMPLETED  = -3,
+	/* The next code exists only in HASH_CTX_DEBUG builds. */
+#ifdef HASH_CTX_DEBUG
+	HASH_CTX_ERROR_DEBUG_DIGEST_MISMATCH = -4,
+#endif
+};
+
+/* Field accessors for struct sha256_hash_ctx; each takes a pointer to it. */
+#define hash_ctx_user_data(ctx)  ((ctx)->user_data)
+#define hash_ctx_digest(ctx)     ((ctx)->job.result_digest)
+#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING)	/* bit test */
+#define hash_ctx_complete(ctx)   ((ctx)->status == HASH_CTX_STS_COMPLETE)	/* exact match */
+#define hash_ctx_status(ctx)     ((ctx)->status)
+#define hash_ctx_error(ctx)      ((ctx)->error)
+#define hash_ctx_init(ctx) /* clear the error, mark the context complete */ \
+	do { \
+		(ctx)->error = HASH_CTX_ERROR_NONE; \
+		(ctx)->status = HASH_CTX_STS_COMPLETE; \
+	} while (0)
+
+
+/* Hash Constants and Typedefs */
+#define SHA256_DIGEST_LENGTH        8	/* NOTE(review): presumably 32-bit words - confirm */
+#define SHA256_LOG2_BLOCK_SIZE        6	/* 1 << 6 == 64 */
+/* NOTE(review): presumably the byte size of the length field in the padding. */
+#define SHA256_PADLENGTHFIELD_SIZE    8
+/* Debug-only assert: logs the failed expression via printk and carries on. */
+#ifdef SHA_MB_DEBUG
+#define assert(expr) \
+do { \
+	if (unlikely(!(expr))) { \
+		printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
+		#expr, __FILE__, __func__, __LINE__); \
+	} \
+} while (0)
+#else
+#define assert(expr) do {} while (0)
+#endif
+/* Context-manager wrapper; its only member is the multi-buffer manager state. */
+struct sha256_ctx_mgr {
+	struct sha256_mb_mgr mgr;
+};
+
+/* typedef struct sha256_ctx_mgr sha256_ctx_mgr; */
+/* Per-request hash context. SHA256_BLOCK_SIZE is not defined in this header. */
+struct sha256_hash_ctx {
+	/* Must be at struct offset 0 */
+	struct job_sha256       job;
+	/* status flag: HASH_CTX_STS_* values */
+	int status;
+	/* error flag: enum hash_ctx_error values */
+	int error;
+
+	uint32_t	total_length;
+	const void	*incoming_buffer;
+	uint32_t	incoming_buffer_length;
+	uint8_t		partial_block_buffer[SHA256_BLOCK_SIZE * 2];	/* two blocks */
+	uint32_t	partial_block_buffer_length;
+	void		*user_data;
+};
+
+#endif
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h b/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h
new file mode 100644
index 000000000000..b01ae408c56d
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h
@@ -0,0 +1,108 @@
+/*
+ * Header file for multi buffer SHA256 algorithm manager
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *	Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __SHA_MB_MGR_H
+#define __SHA_MB_MGR_H
+
+#include <linux/types.h>
+
+#define NUM_SHA256_DIGEST_WORDS 8	/* length of job_sha256.result_digest[] */
+/* Job states; values 0-2 are duplicated as #defines in sha256_mb_mgr_datastruct.S. */
+enum job_sts {	STS_UNKNOWN = 0,
+		STS_BEING_PROCESSED = 1,
+		STS_COMPLETED = 2,	/* stored by the flush/get_comp_job asm */
+		STS_INTERNAL_ERROR = 3,
+		STS_ERROR = 4
+};
+/* One hash job; field offsets are mirrored by JOB_SHA256 in the datastruct .S. */
+struct job_sha256 {
+	u8	*buffer;
+	u32	len;
+	u32	result_digest[NUM_SHA256_DIGEST_WORDS] __aligned(32);	/* 8 words, written by asm */
+	enum	job_sts status;
+	void	*user_data;
+};
+
+/* SHA256 out-of-order scheduler */
+
+/* typedef uint32_t sha8_digest_array[8][8]; */
+/* Per-lane arguments; the asm indexes digest as [word][lane] (transposed). */
+struct sha256_args_x8 {
+	uint32_t	digest[8][8];
+	uint8_t		*data_ptr[8];	/* one data pointer per lane */
+};
+/* Lane bookkeeping: job_in_lane is NULL while the lane is empty. */
+struct sha256_lane_data {
+	struct job_sha256 *job_in_lane;
+};
+/* Scheduler state; layout is mirrored by MB_MGR in sha256_mb_mgr_datastruct.S. */
+struct sha256_mb_mgr {
+	struct sha256_args_x8 args;
+
+	uint32_t lens[8];	/* asm decodes: low 4 bits = lane, rest = count; 0xFFFFFFFF when idle */
+
+	/* each nibble is the index (0...7) of an unused lane */
+	uint64_t unused_lanes;
+	/* (unused_lanes) nibble 8 starts as 0xF; bit 32+3 set means all lanes are empty */
+	struct sha256_lane_data ldata[8];
+};
+
+/* AVX2 entry points; flush and get_comp_job return NULL when no job is handed back. */
+#define SHA256_MB_MGR_NUM_LANES_AVX2 8
+/* init marks all 8 lanes unused (C); the other three are implemented in .S files. */
+void sha256_mb_mgr_init_avx2(struct sha256_mb_mgr *state);
+struct job_sha256 *sha256_mb_mgr_submit_avx2(struct sha256_mb_mgr *state,
+					 struct job_sha256 *job);
+struct job_sha256 *sha256_mb_mgr_flush_avx2(struct sha256_mb_mgr *state);
+struct job_sha256 *sha256_mb_mgr_get_comp_job_avx2(struct sha256_mb_mgr *state);
+
+#endif
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S
new file mode 100644
index 000000000000..5c377bac21d0
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S
@@ -0,0 +1,304 @@
+/*
+ * Header file for multi buffer SHA256 algorithm data structure
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ *     Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+# Macros for defining data structures
+
+# Usage example
+
+#START_FIELDS	# JOB_AES
+###	name		size	align
+#FIELD	_plaintext,	8,	8	# pointer to plaintext
+#FIELD	_ciphertext,	8,	8	# pointer to ciphertext
+#FIELD	_IV,		16,	8	# IV
+#FIELD	_keys,		8,	8	# pointer to keys
+#FIELD	_len,		4,	4	# length in bytes
+#FIELD	_status,	4,	4	# status enumeration
+#FIELD	_user_data,	8,	8	# pointer to user data
+#UNION  _union,         size1,  align1, \
+#	                size2,  align2, \
+#	                size3,  align3, \
+#	                ...
+#END_FIELDS
+#%assign _JOB_AES_size	_FIELD_OFFSET
+#%assign _JOB_AES_align	_STRUCT_ALIGN
+
+#########################################################################
+
+# Alternate "struc-like" syntax:
+#	STRUCT job_aes2
+#	RES_Q	.plaintext,	1
+#	RES_Q	.ciphertext, 	1
+#	RES_DQ	.IV,		1
+#	RES_B	.nested,	_JOB_AES_SIZE, _JOB_AES_ALIGN
+#	RES_U	.union,		size1, align1, \
+#				size2, align2, \
+#				...
+#	ENDSTRUCT
+#	# Following only needed if nesting
+#	%assign job_aes2_size	_FIELD_OFFSET
+#	%assign job_aes2_align	_STRUCT_ALIGN
+#
+# RES_* macros take a name, a count and an optional alignment.
+# The count is in terms of the base size of the macro, and the
+# default alignment is the base size.
+# The macros are:
+# Macro    Base size
+# RES_B	    1
+# RES_W	    2
+# RES_D     4
+# RES_Q     8
+# RES_DQ   16
+# RES_Y    32
+# RES_Z    64
+#
+# RES_U defines a union. Its arguments are a name and two or more
+# pairs of "size, alignment"
+#
+# The two assigns are only needed if this structure is being nested
+# within another. Even if the assigns are not done, one can still use
+# STRUCT_NAME_size as the size of the structure.
+#
+# Note that for nesting, you still need to assign to STRUCT_NAME_size.
+#
+# The differences between this and using "struc" directly are that each
+# type is implicitly aligned to its natural length (although this can be
+# over-ridden with an explicit third parameter), and that the structure
+# is padded at the end to its overall alignment.
+#
+
+#########################################################################
+
+#ifndef _DATASTRUCT_ASM_
+#define _DATASTRUCT_ASM_
+/* SZ8 and ROUNDS expand without parentheses; mind precedence when combining them. */
+#define SZ8			8*SHA256_DIGEST_WORD_SIZE	/* 8 * 4 = 32 */
+#define ROUNDS			64*SZ8	/* 64 * 32 = 2048 */
+#define PTR_SZ                  8
+#define SHA256_DIGEST_WORD_SIZE 4
+#define MAX_SHA256_LANES        8
+#define SHA256_DIGEST_WORDS 8
+#define SHA256_DIGEST_ROW_SIZE  (MAX_SHA256_LANES * SHA256_DIGEST_WORD_SIZE)	/* 32 */
+#define SHA256_DIGEST_SIZE      (SHA256_DIGEST_ROW_SIZE * SHA256_DIGEST_WORDS)	/* 256 */
+#define SHA256_BLK_SZ           64
+
+# START_FIELDS: begin a layout; resets the running offset and the struct alignment
+.macro START_FIELDS
+ _FIELD_OFFSET = 0
+ _STRUCT_ALIGN = 0
+.endm
+# (the mask arithmetic in FIELD and END_FIELDS assumes power-of-two alignments)
+# FIELD name size align: name = offset rounded up to align, then advance by size
+.macro FIELD name size align
+ _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1))
+ \name	= _FIELD_OFFSET
+ _FIELD_OFFSET = _FIELD_OFFSET + (\size)
+.if (\align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = \align
+.endif
+.endm
+
+# END_FIELDS: pad the running offset up to the largest field alignment seen
+.macro END_FIELDS
+ _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
+.endm
+
+########################################################################
+# NOTE(review): STRUCT/ENDSTRUCT rely on .struc/.endstruc and %%tmp; no invocation is visible in this file
+.macro STRUCT p1
+START_FIELDS
+.struc \p1
+.endm
+# ENDSTRUCT: run END_FIELDS and reserve whatever padding it added
+.macro ENDSTRUCT
+ tmp = _FIELD_OFFSET
+ END_FIELDS
+ tmp = (_FIELD_OFFSET - %%tmp)
+.if (tmp > 0)
+	.lcomm	tmp
+.endif
+.endstruc
+.endm
+
+## RES_int name size align: shared body of the RES_* macros (align, reserve, advance offset)
+.macro RES_int p1 p2 p3
+ name = \p1
+ size = \p2
+ align = .\p3
+
+ _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1))
+.align align
+.lcomm name size
+ _FIELD_OFFSET = _FIELD_OFFSET + (size)
+.if (align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = align
+.endif
+.endm
+
+# macro RES_B name, size [, align]: count in 1-byte units
+.macro RES_B _name, _size, _align=1
+RES_int _name _size _align
+.endm
+# NOTE(review): each wrapper forwards the literal words _name/_size/_align (no backslash) - confirm these are unused
+# macro RES_W name, size [, align]: count in 2-byte units
+.macro RES_W _name, _size, _align=2
+RES_int _name 2*(_size) _align
+.endm
+
+# macro RES_D name, size [, align]: count in 4-byte units
+.macro RES_D _name, _size, _align=4
+RES_int _name 4*(_size) _align
+.endm
+
+# macro RES_Q name, size [, align]: count in 8-byte units
+.macro RES_Q _name, _size, _align=8
+RES_int _name 8*(_size) _align
+.endm
+
+# macro RES_DQ name, size [, align]: count in 16-byte units
+.macro RES_DQ _name, _size, _align=16
+RES_int _name 16*(_size) _align
+.endm
+
+# macro RES_Y name, size [, align]: count in 32-byte units
+.macro RES_Y _name, _size, _align=32
+RES_int _name 32*(_size) _align
+.endm
+
+# macro RES_Z name, size [, align]: count in 64-byte units
+.macro RES_Z _name, _size, _align=64
+RES_int _name 64*(_size) _align
+.endm
+
+#endif
+
+
+########################################################################
+#### Define SHA256 Out Of Order Data Structures
+########################################################################
+
+START_FIELDS    # LANE_DATA (mirrors struct sha256_lane_data)
+###     name            size    align
+FIELD   _job_in_lane,   8,      8       # pointer to job object
+END_FIELDS
+
+ _LANE_DATA_size = _FIELD_OFFSET	# 8 bytes per lane
+ _LANE_DATA_align = _STRUCT_ALIGN	# 8
+
+########################################################################
+
+START_FIELDS    # SHA256_ARGS_X4 (mirrors struct sha256_args_x8)
+###     name            size    align
+FIELD   _digest,        4*8*8,  4       # transposed digest, 256 bytes at offset 0
+FIELD   _data_ptr,      8*8,    8       # array of pointers to data, 64 bytes at offset 256
+END_FIELDS
+
+ _SHA256_ARGS_X4_size  =  _FIELD_OFFSET	# 320
+ _SHA256_ARGS_X4_align = _STRUCT_ALIGN	# 8
+ _SHA256_ARGS_X8_size  =	_FIELD_OFFSET	# same values under the X8 name
+ _SHA256_ARGS_X8_align =	_STRUCT_ALIGN
+
+#######################################################################
+
+START_FIELDS    # MB_MGR (mirrors struct sha256_mb_mgr)
+###     name            size    align
+FIELD   _args,          _SHA256_ARGS_X4_size, _SHA256_ARGS_X4_align	# offset 0
+FIELD   _lens,          4*8,    8	# offset 320, eight 32-bit entries
+FIELD   _unused_lanes,  8,      8	# offset 352
+FIELD   _ldata,         _LANE_DATA_size*8, _LANE_DATA_align	# offset 360, eight lane slots
+END_FIELDS
+
+ _MB_MGR_size  =  _FIELD_OFFSET	# 424
+ _MB_MGR_align =  _STRUCT_ALIGN
+# Offsets of the args members relative to the MB_MGR base (_args is 0)
+_args_digest   =     _args + _digest
+_args_data_ptr =     _args + _data_ptr
+
+#######################################################################
+
+START_FIELDS    #STACK_FRAME
+###     name            size    align
+FIELD   _data,		16*SZ8,   1       # 512 bytes at offset 0; usage is outside this file
+FIELD   _digest,         8*SZ8,   1       # 256 bytes at offset 512; reassigns _digest (0 in SHA256_ARGS)
+FIELD   _ytmp,           4*SZ8,   1       # 128 bytes at offset 768
+FIELD   _rsp,            8,       1       # 8 bytes at offset 896; NOTE(review): presumably a saved rsp
+END_FIELDS
+# NOTE(review): _args_digest was assigned before _digest changed here; confirm it keeps the old value
+ _STACK_FRAME_size  =  _FIELD_OFFSET	# 904, no tail padding since every align is 1
+ _STACK_FRAME_align =  _STRUCT_ALIGN
+
+#######################################################################
+
+########################################################################
+#### Define constants
+########################################################################
+
+#define STS_UNKNOWN             0	/* same values as enum job_sts in sha256_mb_mgr.h */
+#define STS_BEING_PROCESSED     1
+#define STS_COMPLETED           2	/* written to _status when a job is returned */
+
+########################################################################
+#### Define JOB_SHA256 structure
+########################################################################
+
+START_FIELDS    # JOB_SHA256
+# Mirrors struct job_sha256; C declares len as u32, yet offsets agree (digest is 32-aligned)
+###     name                            size    align
+FIELD   _buffer,                        8,      8       # pointer to buffer, offset 0
+FIELD   _len,                           8,      8       # length in bytes, offset 8
+FIELD   _result_digest,                 8*4,    32      # Digest (output), offset 32
+FIELD   _status,                        4,      4	# offset 64
+FIELD   _user_data,                     8,      8	# offset 72
+END_FIELDS
+
+ _JOB_SHA256_size = _FIELD_OFFSET	# 96 after padding to the 32-byte alignment
+ _JOB_SHA256_align = _STRUCT_ALIGN	# 32
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
new file mode 100644
index 000000000000..b691da981cd9
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
@@ -0,0 +1,304 @@
+/*
+ * Flush routine for SHA256 multibuffer
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha256_mb_mgr_datastruct.S"
+
+.extern sha256_x8_avx2
+
+#LINUX register definitions
+#define arg1	%rdi
+#define arg2	%rsi
+
+# Common register definitions
+#define state	arg1
+#define job	arg2
+#define len2	arg2	/* same register as job */
+
+# idx must be a register not clobbered by sha256_x8_avx2
+#define idx		%r8
+#define DWORD_idx	%r8d
+# The rbx aliases below share one register; both entry points push/pop rbx
+#define unused_lanes	%rbx
+#define lane_data	%rbx
+#define tmp2		%rbx
+#define tmp2_w		%ebx
+# The rax aliases below share one register; job_rax is also the return value
+#define job_rax		%rax
+#define tmp1		%rax
+#define size_offset	%rax
+#define tmp		%rax
+#define start_offset	%rax
+# NOTE(review): the next three expand to a doubled percent sign and are unused in this file
+#define tmp3		%arg1
+
+#define extra_blocks	%arg2
+#define p		%arg2
+# Helpers for the unrolled .rep loop below: LABEL emits prefix+n as a label
+.macro LABEL prefix n
+\prefix\n\():
+.endm
+# JNE_SKIP i jumps to skip_i; both helpers are invoked under .altmacro with %I
+.macro JNE_SKIP i
+jne     skip_\i
+.endm
+# SET_OFFSET is not invoked anywhere in this file
+.altmacro
+.macro SET_OFFSET _offset
+offset = \_offset
+.endm
+.noaltmacro
+
+# JOB_SHA256* sha256_mb_mgr_flush_avx2(MB_MGR *state)
+# arg 1 : rdi : state
+ENTRY(sha256_mb_mgr_flush_avx2)
+	FRAME_BEGIN
+        push    %rbx
+	# Returns a finished JOB_SHA256 pointer in rax, or NULL when every lane is empty
+	# If bit (32+3) is set, then all lanes are empty
+	mov	_unused_lanes(state), unused_lanes
+	bt	$32+3, unused_lanes
+	jc	return_null
+
+	# find a lane with a non-null job (lane 0 is the default; the highest occupied lane wins)
+	xor	idx, idx
+	offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	one(%rip), idx
+	offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	two(%rip), idx
+	offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	three(%rip), idx
+	offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	four(%rip), idx
+	offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	five(%rip), idx
+	offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	six(%rip), idx
+	offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+	cmovne	seven(%rip), idx
+
+	# give every empty lane the data pointer of lane idx and a length of 0xFFFFFFFF
+copy_lane_data:
+	offset =  (_args + _data_ptr)
+	mov	offset(state,idx,8), tmp
+
+	I = 0
+.rep 8
+	offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
+	cmpq	$0, offset(state)
+.altmacro
+	JNE_SKIP %I
+	offset =  (_args + _data_ptr + 8*I)
+	mov	tmp, offset(state)
+	offset =  (_lens + 4*I)
+	movl	$0xFFFFFFFF, offset(state)
+LABEL skip_ %I
+	I = (I+1)
+.noaltmacro
+.endr
+
+	# Find min length (each lens entry carries its lane index in the low 4 bits)
+	vmovdqa _lens+0*16(state), %xmm0
+	vmovdqa _lens+1*16(state), %xmm1
+
+	vpminud %xmm1, %xmm0, %xmm2		# xmm2 has {D,C,B,A}
+	vpalignr $8, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,D,C}
+	vpminud %xmm3, %xmm2, %xmm2		# xmm2 has {x,x,E,F}
+	vpalignr $4, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,x,E}
+	vpminud %xmm3, %xmm2, %xmm2		# xmm2 has min val in low dword
+
+	vmovd	%xmm2, DWORD_idx		# packed minimum
+	mov	idx, len2
+	and	$0xF, idx			# low nibble: lane index
+	shr	$4, len2			# remaining bits: length; zero means nothing left to hash
+	jz	len_is_0
+
+	vpand	clear_low_nibble(%rip), %xmm2, %xmm2	# drop the lane nibble
+	vpshufd	$0, %xmm2, %xmm2			# broadcast the minimum to all four dwords
+
+	vpsubd	%xmm2, %xmm0, %xmm0		# subtract the minimum from every lane length
+	vpsubd	%xmm2, %xmm1, %xmm1
+
+	vmovdqa	%xmm0, _lens+0*16(state)
+	vmovdqa	%xmm1, _lens+1*16(state)
+
+	# "state" and "args" are the same address, arg1
+	# len is arg2
+	call	sha256_x8_avx2
+	# state and idx are intact
+
+len_is_0:
+	# process completed job "idx"
+	imul	$_LANE_DATA_size, idx, lane_data
+	lea	_ldata(state, lane_data), lane_data
+
+	mov	_job_in_lane(lane_data), job_rax
+	movq	$0, _job_in_lane(lane_data)	# lane is empty again
+	movl	$STS_COMPLETED, _status(job_rax)
+	mov	_unused_lanes(state), unused_lanes
+	shl	$4, unused_lanes		# push idx onto the nibble list of unused lanes
+	or	idx, unused_lanes
+
+	mov	unused_lanes, _unused_lanes(state)
+	movl	$0xFFFFFFFF, _lens(state,idx,4)
+	# gather the 8 digest words of lane idx (rows are 32 bytes apart)
+	vmovd	_args_digest(state , idx, 4) , %xmm0
+	vpinsrd	$1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
+	vpinsrd	$2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
+	vpinsrd	$3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
+	vmovd	_args_digest+4*32(state, idx, 4), %xmm1
+	vpinsrd	$1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
+	vpinsrd	$2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
+	vpinsrd	$3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1
+
+	vmovdqu	%xmm0, _result_digest(job_rax)	# words 0-3
+	offset =  (_result_digest + 1*16)
+	vmovdqu	%xmm1, offset(job_rax)		# words 4-7
+
+return:
+	pop     %rbx
+	FRAME_END
+	ret
+
+return_null:
+	xor	job_rax, job_rax
+	jmp	return
+ENDPROC(sha256_mb_mgr_flush_avx2)
+
+##############################################################################
+# JOB_SHA256* sha256_mb_mgr_get_comp_job_avx2(MB_MGR *state)
+.align 16
+ENTRY(sha256_mb_mgr_get_comp_job_avx2)
+	push	%rbx
+	# arg 1 : rdi : state; returns a finished JOB_SHA256 in rax, or NULL if no lane is done
+	## if bit 32+3 is set, then all lanes are empty
+	mov	_unused_lanes(state), unused_lanes
+	bt	$(32+3), unused_lanes
+	jc	.return_null
+
+	# Find min length (each lens entry carries its lane index in the low 4 bits)
+	vmovdqa	_lens(state), %xmm0
+	vmovdqa	_lens+1*16(state), %xmm1
+
+	vpminud	%xmm1, %xmm0, %xmm2		# xmm2 has {D,C,B,A}
+	vpalignr $8, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,D,C}
+	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has {x,x,E,F}
+	vpalignr $4, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,x,E}
+	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has min val in low dword
+
+	vmovd	%xmm2, DWORD_idx
+	test	$~0xF, idx			# any length bits left means no lane has finished
+	jnz	.return_null
+
+	# process completed job "idx"
+	imul	$_LANE_DATA_size, idx, lane_data
+	lea	_ldata(state, lane_data), lane_data
+
+	mov	_job_in_lane(lane_data), job_rax
+	movq	$0,  _job_in_lane(lane_data)	# lane is empty again
+	movl	$STS_COMPLETED, _status(job_rax)
+	mov	_unused_lanes(state), unused_lanes
+	shl	$4, unused_lanes		# push idx onto the nibble list of unused lanes
+	or	idx, unused_lanes
+	mov	unused_lanes, _unused_lanes(state)
+
+	movl	$0xFFFFFFFF, _lens(state,  idx, 4)
+	# gather all 8 digest words of lane idx (rows are 32 bytes apart)
+	vmovd	_args_digest(state, idx, 4), %xmm0
+	vpinsrd	$1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
+	vpinsrd	$2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
+	vpinsrd	$3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
+	vmovd	_args_digest+4*32(state, idx, 4), %xmm1
+	vpinsrd	$1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
+	vpinsrd	$2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
+	vpinsrd	$3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1
+
+	vmovdqu	%xmm0, _result_digest(job_rax)		# words 0-3
+	vmovdqu	%xmm1, _result_digest+1*16(job_rax)	# words 4-7, the full second half
+
+	pop	%rbx
+
+	ret
+
+.return_null:
+	xor	job_rax, job_rax
+	pop	%rbx
+	ret
+ENDPROC(sha256_mb_mgr_get_comp_job_avx2)
+
+.data
+# Constants: nibble mask for the packed minimum, and quad lane indices used as cmovne operands
+.align 16
+clear_low_nibble:
+.octa	0x000000000000000000000000FFFFFFF0
+one:
+.quad	1
+two:
+.quad	2
+three:
+.quad	3
+four:
+.quad	4
+five:
+.quad	5
+six:
+.quad	6
+seven:
+.quad  7
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c
new file mode 100644
index 000000000000..b0c498371e67
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c
@@ -0,0 +1,65 @@
+/*
+ * Initialization code for multi buffer SHA256 algorithm for AVX2
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "sha256_mb_mgr.h"
+/* Reset the scheduler: all 8 lanes unused, idle lengths, no job attached. */
+void sha256_mb_mgr_init_avx2(struct sha256_mb_mgr *state)
+{
+	unsigned int lane = 8;
+
+	while (lane-- > 0) {
+		state->ldata[lane].job_in_lane = NULL;
+		state->lens[lane] = 0xFFFFFFFF;
+	}
+	state->unused_lanes = 0xF76543210ULL;	/* nibbles 0..7 free, 0xF on top */
+}
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S
new file mode 100644
index 000000000000..7ea670e25acc
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S
@@ -0,0 +1,215 @@
+/*
+ * Buffer submit code for multi buffer SHA256 algorithm
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha256_mb_mgr_datastruct.S"
+
+.extern sha256_x8_avx2
+
+# LINUX register definitions
+arg1		= %rdi
+arg2		= %rsi
+size_offset	= %rcx
+tmp2		= %rcx
+extra_blocks	= %rdx
+
+# Common definitions
+#define state	arg1
+#define job	%rsi
+#define len2	arg2
+#define p2	arg2
+
+# idx must be a register not clobbered by sha256_x8_avx2
+idx		= %r8
+DWORD_idx	= %r8d
+last_len	= %r8
+
+p		= %r11
+start_offset	= %r11
+
+unused_lanes	= %rbx
+BYTE_unused_lanes = %bl
+
+job_rax		= %rax
+len		= %rax
+DWORD_len	= %eax
+
+lane		= %r12
+tmp3		= %r12
+
+tmp		= %r9
+DWORD_tmp	= %r9d
+
+lane_data	= %r10
+
+# JOB* sha256_mb_mgr_submit_avx2(MB_MGR *state, JOB_SHA256 *job)
+# arg 1 : rdi : state
+# arg 2 : rsi : job
+ENTRY(sha256_mb_mgr_submit_avx2)
+	FRAME_BEGIN
+	push	%rbx
+	push	%r12
+
+	mov	_unused_lanes(state), unused_lanes
+	mov	unused_lanes, lane
+	and	$0xF, lane			# lane = low nibble = next free lane
+	shr	$4, unused_lanes		# pop it off the free-lane list
+	imul	$_LANE_DATA_size, lane, lane_data
+	movl	$STS_BEING_PROCESSED, _status(job)
+	lea	_ldata(state, lane_data), lane_data
+	mov	unused_lanes, _unused_lanes(state)
+	movl	_len(job),  DWORD_len
+
+	mov	job, _job_in_lane(lane_data)
+	shl	$4, len
+	or	lane, len			# lens entry = (len << 4) | lane
+
+	movl	DWORD_len,  _lens(state , lane, 4)
+
+	# Scatter the digest words from result_digest into this lane of the transposed args digest
+	vmovdqu	_result_digest(job), %xmm0
+	vmovdqu	_result_digest+1*16(job), %xmm1
+	vmovd	%xmm0, _args_digest(state, lane, 4)
+	vpextrd	$1, %xmm0, _args_digest+1*32(state , lane, 4)
+	vpextrd	$2, %xmm0, _args_digest+2*32(state , lane, 4)
+	vpextrd	$3, %xmm0, _args_digest+3*32(state , lane, 4)
+	vmovd	%xmm1, _args_digest+4*32(state , lane, 4)
+
+	vpextrd	$1, %xmm1, _args_digest+5*32(state , lane, 4)
+	vpextrd	$2, %xmm1, _args_digest+6*32(state , lane, 4)
+	vpextrd	$3, %xmm1, _args_digest+7*32(state , lane, 4)
+
+	mov	_buffer(job), p
+	mov	p, _args_data_ptr(state, lane, 8)
+
+	cmp	$0xF, unused_lanes		# only the 0xF terminator left = all lanes busy
+	jne	return_null			# free lanes remain: queue only, return NULL
+
+start_loop:
+	# Find min length
+	vmovdqa	_lens(state), %xmm0
+	vmovdqa	_lens+1*16(state), %xmm1
+
+	vpminud	%xmm1, %xmm0, %xmm2		# xmm2 has {D,C,B,A}
+	vpalignr $8, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,D,C}
+	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has {x,x,E,F}
+	vpalignr $4, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,x,E}
+	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has min val in low dword
+
+	vmovd	%xmm2, DWORD_idx
+	mov	idx, len2
+	and	$0xF, idx			# idx = lane holding the minimum
+	shr	$4, len2			# len2 = minimum length, passed on as arg2
+	jz	len_is_0
+
+	vpand	clear_low_nibble(%rip), %xmm2, %xmm2	# keep (min len << 4), drop the lane nibble
+	vpshufd	$0, %xmm2, %xmm2		# broadcast it to all four dwords
+
+	vpsubd	%xmm2, %xmm0, %xmm0		# subtract the min length from every lane
+	vpsubd	%xmm2, %xmm1, %xmm1
+
+	vmovdqa	%xmm0, _lens + 0*16(state)
+	vmovdqa	%xmm1, _lens + 1*16(state)
+
+	# "state" and "args" are the same address, arg1
+	# len is arg2
+	call	sha256_x8_avx2
+
+	# state and idx are intact
+
+len_is_0:
+	# process completed job "idx"
+	imul	$_LANE_DATA_size, idx, lane_data
+	lea	_ldata(state, lane_data), lane_data
+
+	mov	_job_in_lane(lane_data), job_rax	# return value = the finished job
+	mov	_unused_lanes(state), unused_lanes
+	movq	$0, _job_in_lane(lane_data)
+	movl	$STS_COMPLETED, _status(job_rax)
+	shl	$4, unused_lanes
+	or	idx, unused_lanes		# push idx back on the free-lane list
+	mov	unused_lanes, _unused_lanes(state)
+
+	movl	$0xFFFFFFFF, _lens(state,idx,4)	# mark the lane idle again
+
+	vmovd	_args_digest(state, idx, 4), %xmm0	# gather the digest words of lane idx
+	vpinsrd	$1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0
+	vpinsrd	$2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0
+	vpinsrd	$3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0
+	vmovd	_args_digest+4*32(state, idx, 4), %xmm1
+
+	vpinsrd	$1, _args_digest+5*32(state , idx, 4), %xmm1, %xmm1
+	vpinsrd	$2, _args_digest+6*32(state , idx, 4), %xmm1, %xmm1
+	vpinsrd	$3, _args_digest+7*32(state , idx, 4), %xmm1, %xmm1
+
+	vmovdqu	%xmm0, _result_digest(job_rax)	# copy the digest out to the job
+	vmovdqu	%xmm1, _result_digest+1*16(job_rax)
+
+return:
+	pop     %r12
+        pop     %rbx
+        FRAME_END
+	ret
+
+return_null:
+	xor	job_rax, job_rax		# no job completed: return NULL
+	jmp	return
+
+ENDPROC(sha256_mb_mgr_submit_avx2)
+
+.data
+
+.align 16
+clear_low_nibble:
+	.octa	0x000000000000000000000000FFFFFFF0
diff --git a/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S b/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S
new file mode 100644
index 000000000000..aa21aea4c722
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S
@@ -0,0 +1,593 @@
+/*
+ * Multi-buffer SHA256 algorithm hash compute routine
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *	Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include "sha256_mb_mgr_datastruct.S"
+
+## code to compute eight SHA256 digests in parallel using AVX2 (256-bit ymm registers)
+## outer calling routine takes care of save and restore of XMM registers
+## Logic designed/laid out by JDG
+
+## Function clobbers: rax, rcx, rdx,   rbx, rsi, rdi, r9-r15; %ymm0-15
+## Linux clobbers:    rax rbx rcx rdx rsi            r9 r10 r11
+## Linux preserves:                       rdi rbp r8            r12 r13 r14 r15 (pushed/popped)
+##
+## clobbers %ymm0-15
+
+arg1 = %rdi
+arg2 = %rsi
+reg3 = %rcx
+reg4 = %rdx
+
+# Common definitions
+STATE = arg1
+INP_SIZE = arg2
+
+IDX = %rax
+ROUND = %rbx
+TBL = reg3
+
+inp0 = %r9
+inp1 = %r10
+inp2 = %r11
+inp3 = %r12
+inp4 = %r13
+inp5 = %r14
+inp6 = %r15
+inp7 = reg4
+
+a = %ymm0
+b = %ymm1
+c = %ymm2
+d = %ymm3
+e = %ymm4
+f = %ymm5
+g = %ymm6
+h = %ymm7
+
+T1 = %ymm8
+
+a0 = %ymm12
+a1 = %ymm13
+a2 = %ymm14
+TMP = %ymm15
+TMP0 = %ymm6
+TMP1 = %ymm7
+
+TT0 = %ymm8
+TT1 = %ymm9
+TT2 = %ymm10
+TT3 = %ymm11
+TT4 = %ymm12
+TT5 = %ymm13
+TT6 = %ymm14
+TT7 = %ymm15
+
+# Define stack usage
+
+# The prologue realigns %rsp down to 32 bytes itself (and $~0x1F), so no
+# particular alignment of the incoming stack or of FRAMESZ is relied upon.
+
+#define FRAMESZ	0x388
+
+#define VMOVPS	vmovups
+
+# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+# transpose the 8x8 matrix of 32-bit words in {r0...r7}; clobbers temps {t0...t1}
+# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+# r0 = {a7 a6 a5 a4   a3 a2 a1 a0}
+# r1 = {b7 b6 b5 b4   b3 b2 b1 b0}
+# r2 = {c7 c6 c5 c4   c3 c2 c1 c0}
+# r3 = {d7 d6 d5 d4   d3 d2 d1 d0}
+# r4 = {e7 e6 e5 e4   e3 e2 e1 e0}
+# r5 = {f7 f6 f5 f4   f3 f2 f1 f0}
+# r6 = {g7 g6 g5 g4   g3 g2 g1 g0}
+# r7 = {h7 h6 h5 h4   h3 h2 h1 h0}
+#
+# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+# r0 = {h0 g0 f0 e0   d0 c0 b0 a0}
+# r1 = {h1 g1 f1 e1   d1 c1 b1 a1}
+# r2 = {h2 g2 f2 e2   d2 c2 b2 a2}
+# r3 = {h3 g3 f3 e3   d3 c3 b3 a3}
+# r4 = {h4 g4 f4 e4   d4 c4 b4 a4}
+# r5 = {h5 g5 f5 e5   d5 c5 b5 a5}
+# r6 = {h6 g6 f6 e6   d6 c6 b6 a6}
+# r7 = {h7 g7 f7 e7   d7 c7 b7 a7}
+#
+
+.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
+	# process top half (r0..r3) {a...d}
+	vshufps	$0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4   b1 b0 a1 a0}
+	vshufps	$0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6   b3 b2 a3 a2}
+	vshufps	$0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4   d1 d0 c1 c0}
+	vshufps	$0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6   d3 d2 c3 c2}
+	vshufps	$0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5   d1 c1 b1 a1}
+	vshufps	$0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6   d2 c2 b2 a2}
+	vshufps	$0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7   d3 c3 b3 a3}
+	vshufps	$0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4   d0 c0 b0 a0}
+
+	# use r2 in place of t0
+	# process bottom half (r4..r7) {e...h}
+	vshufps	$0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4   f1 f0 e1 e0}
+	vshufps	$0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6   f3 f2 e3 e2}
+	vshufps	$0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4   h1 h0 g1 g0}
+	vshufps	$0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6   h3 h2 g3 g2}
+	vshufps	$0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5   h1 g1 f1 e1}
+	vshufps	$0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6   h2 g2 f2 e2}
+	vshufps	$0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7   h3 g3 f3 e3}
+	vshufps	$0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4   h0 g0 f0 e0}
+
+	vperm2f128	$0x13, \r1, \r5, \r6  # h6...a6
+	vperm2f128	$0x02, \r1, \r5, \r2  # h2...a2
+	vperm2f128	$0x13, \r3, \r7, \r5  # h5...a5
+	vperm2f128	$0x02, \r3, \r7, \r1  # h1...a1
+	vperm2f128	$0x13, \r0, \r4, \r7  # h7...a7
+	vperm2f128	$0x02, \r0, \r4, \r3  # h3...a3
+	vperm2f128	$0x13, \t0, \t1, \r4  # h4...a4
+	vperm2f128	$0x02, \t0, \t1, \r0  # h0...a0
+
+.endm
+
+.macro ROTATE_ARGS
+TMP_ = h		# rename the working variables for the next round:
+h = g			# no data is moved, only the symbol-to-register
+g = f			# bindings rotate (new a = old h, new b = old a, ...)
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+.macro _PRORD reg imm tmp
+	vpslld	$(32-\imm),\reg,\tmp	# tmp = reg << (32 - imm)
+	vpsrld	$\imm,\reg, \reg	# reg = reg >> imm
+	vpor	\tmp,\reg, \reg		# reg = reg rotated right by imm, per 32-bit element
+.endm
+
+# _PRORD_nd reg, imm, tmp, src: reg = src rotated right by imm; result goes to reg, not src
+.macro _PRORD_nd reg imm tmp src
+	vpslld	$(32-\imm), \src, \tmp
+	vpsrld	$\imm, \src, \reg
+	vpor	\tmp, \reg, \reg
+.endm
+
+# PRORD dst/src, amt: rotate right in place, using TMP as scratch
+.macro PRORD reg imm
+	_PRORD	\reg,\imm,TMP
+.endm
+
+# PRORD_nd dst, src, amt: dst = src rotated right by amt (the parameter named tmp is the source)
+.macro PRORD_nd reg tmp imm
+	_PRORD_nd	\reg, \imm, TMP, \tmp
+.endm
+
+# one SHA256 round on 8 lanes: _T1 = W[i]; working variables a...h are passed implicitly as symbols
+.macro ROUND_00_15 _T1 i
+	PRORD_nd	a0,e,5	# sig1: a0 = ror(e, 5)
+
+	vpxor	g, f, a2	# ch: a2 = f^g
+	vpand	e,a2, a2	# ch: a2 = (f^g)&e
+	vpxor	g, a2, a2	# a2 = ch
+
+	PRORD_nd	a1,e,25	# sig1: a1 = ror(e, 25)
+
+	vmovdqu	\_T1,(SZ8*(\i & 0xf))(%rsp)	# keep W[i] in the 16-entry ring on the stack
+	vpaddd	(TBL,ROUND,1), \_T1, \_T1	# T1 = W + K
+	vpxor	e,a0, a0	# sig1: a0 = e ^ ror(e, 5)
+	PRORD	a0, 6		# sig1: a0 = ror(e, 6) ^ ror(e, 11)
+	vpaddd	a2, h, h	# h = h + ch
+	PRORD_nd	a2,a,11	# sig0: a2 = ror(a, 11)
+	vpaddd	\_T1,h, h 	# h = h + ch + W + K
+	vpxor	a1, a0, a0	# a0 = sigma1
+	PRORD_nd	a1,a,22	# sig0: a1 = ror(a, 22)
+	vpxor	c, a, \_T1	# maj: T1 = a^c
+	add	$SZ8, ROUND	# advance the K table offset to the next round
+	vpand	b, \_T1, \_T1	# maj: T1 = (a^c)&b
+	vpaddd	a0, h, h	# h = h + ch + W + K + sigma1
+	vpaddd	h, d, d		# d = d + (h + ch + W + K + sigma1)
+	vpxor	a, a2, a2	# sig0: a2 = a ^ ror(a, 11)
+	PRORD	a2,2		# sig0: a2 = ror(a, 2) ^ ror(a, 13)
+	vpxor	a1, a2, a2	# a2 = sig0
+	vpand	c, a, a1	# maj: a1 = a&c
+	vpor	\_T1, a1, a1 	# a1 = maj
+	vpaddd	a1, h, h	# h = h + ch + W + K + sigma1 + maj
+	vpaddd	a2, h, h	# h = h + ch + W + K + sigma1 + maj + sigma0
+	ROTATE_ARGS
+.endm
+
+# compute W[i] from the 16-entry W ring on the stack, then run ROUND_00_15; a...h are implicit
+.macro ROUND_16_XX _T1 i
+	vmovdqu	(SZ8*((\i-15)&0xf))(%rsp), \_T1	# T1 = W[i-15]
+	vmovdqu	(SZ8*((\i-2)&0xf))(%rsp), a1	# a1 = W[i-2]
+	vmovdqu	\_T1, a0			# a0 = W[i-15] (copy)
+	PRORD	\_T1,11				# T1 = ror(W[i-15], 11)
+	vmovdqu	a1, a2				# a2 = W[i-2] (copy)
+	PRORD	a1,2				# a1 = ror(W[i-2], 2)
+	vpxor	a0, \_T1, \_T1			# T1 = W[i-15] ^ ror(W[i-15], 11)
+	PRORD	\_T1, 7				# T1 = ror(W[i-15], 7) ^ ror(W[i-15], 18)
+	vpxor	a2, a1, a1			# a1 = W[i-2] ^ ror(W[i-2], 2)
+	PRORD	a1, 17				# a1 = ror(W[i-2], 17) ^ ror(W[i-2], 19)
+	vpsrld	$3, a0, a0			# a0 = W[i-15] >> 3 (logical shift)
+	vpxor	a0, \_T1, \_T1			# T1 = s0(W[i-15])
+	vpsrld	$10, a2, a2			# a2 = W[i-2] >> 10 (logical shift)
+	vpxor	a2, a1, a1			# a1 = s1(W[i-2])
+	vpaddd	(SZ8*((\i-16)&0xf))(%rsp), \_T1, \_T1	# T1 = s0 + W[i-16]
+	vpaddd	(SZ8*((\i-7)&0xf))(%rsp), a1, a1	# a1 = s1 + W[i-7]
+	vpaddd	a1, \_T1, \_T1			# T1 = W[i]
+
+	ROUND_00_15 \_T1,\i
+.endm
+
+# SHA256_ARGS:
+#   UINT256 digest[8];  // transposed digests, one 32-byte row (8 lanes) per digest word
+#   UINT8  *data_ptr[8];
+
+# void sha256_x8_avx2(SHA256_ARGS *args, UINT64 blocks);
+# arg 1 : STATE : pointer to SHA256_ARGS (transposed digests and per-lane input pointers)
+# arg 2 : INP_SIZE  : size of input in blocks
+	# general registers preserved in outer calling routine
+	# outer calling routine saves all the XMM registers
+	# save rsp, allocate 32-byte aligned for local variables
+ENTRY(sha256_x8_avx2)
+
+	# save callee-saved r12-r15; NOTE(review): rbx (ROUND) is clobbered below but not saved here
+	push    %r12
+	push    %r13
+	push    %r14
+	push    %r15
+
+	mov	%rsp, IDX		# remember the incoming stack pointer
+	sub	$FRAMESZ, %rsp
+	and	$~0x1F, %rsp		# align the local frame to 32 bytes
+	mov	IDX, _rsp(%rsp)		# stash it for the postamble
+
+	# Load the pre-transposed incoming digest.
+	vmovdqu	0*SHA256_DIGEST_ROW_SIZE(STATE),a
+	vmovdqu	1*SHA256_DIGEST_ROW_SIZE(STATE),b
+	vmovdqu	2*SHA256_DIGEST_ROW_SIZE(STATE),c
+	vmovdqu	3*SHA256_DIGEST_ROW_SIZE(STATE),d
+	vmovdqu	4*SHA256_DIGEST_ROW_SIZE(STATE),e
+	vmovdqu	5*SHA256_DIGEST_ROW_SIZE(STATE),f
+	vmovdqu	6*SHA256_DIGEST_ROW_SIZE(STATE),g
+	vmovdqu	7*SHA256_DIGEST_ROW_SIZE(STATE),h
+
+	lea	K256_8(%rip),TBL
+
+	# load the address of each of the 8 message lanes
+	# getting ready to transpose input onto stack
+	mov	_args_data_ptr+0*PTR_SZ(STATE),inp0
+	mov	_args_data_ptr+1*PTR_SZ(STATE),inp1
+	mov	_args_data_ptr+2*PTR_SZ(STATE),inp2
+	mov	_args_data_ptr+3*PTR_SZ(STATE),inp3
+	mov	_args_data_ptr+4*PTR_SZ(STATE),inp4
+	mov	_args_data_ptr+5*PTR_SZ(STATE),inp5
+	mov	_args_data_ptr+6*PTR_SZ(STATE),inp6
+	mov	_args_data_ptr+7*PTR_SZ(STATE),inp7
+
+	xor	IDX, IDX		# IDX = byte offset into every input buffer
+lloop:
+	xor	ROUND, ROUND		# ROUND = byte offset into the K256_8 table
+
+	# save old digest
+	vmovdqu	a, _digest(%rsp)
+	vmovdqu	b, _digest+1*SZ8(%rsp)
+	vmovdqu	c, _digest+2*SZ8(%rsp)
+	vmovdqu	d, _digest+3*SZ8(%rsp)
+	vmovdqu	e, _digest+4*SZ8(%rsp)
+	vmovdqu	f, _digest+5*SZ8(%rsp)
+	vmovdqu	g, _digest+6*SZ8(%rsp)
+	vmovdqu	h, _digest+7*SZ8(%rsp)
+	i = 0
+.rep 2
+	VMOVPS	i*32(inp0, IDX), TT0
+	VMOVPS	i*32(inp1, IDX), TT1
+	VMOVPS	i*32(inp2, IDX), TT2
+	VMOVPS	i*32(inp3, IDX), TT3
+	VMOVPS	i*32(inp4, IDX), TT4
+	VMOVPS	i*32(inp5, IDX), TT5
+	VMOVPS	i*32(inp6, IDX), TT6
+	VMOVPS	i*32(inp7, IDX), TT7
+	vmovdqu	g, _ytmp(%rsp)		# g and h share ymm6/ymm7 with TMP0/TMP1:
+	vmovdqu	h, _ytmp+1*SZ8(%rsp)	# spill them around TRANSPOSE8
+	TRANSPOSE8	TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7,   TMP0, TMP1
+	vmovdqu	PSHUFFLE_BYTE_FLIP_MASK(%rip), TMP1	# mask reverses the bytes of each 32-bit word
+	vmovdqu	_ytmp(%rsp), g		# restore g; ymm7 still holds the mask
+	vpshufb	TMP1, TT0, TT0
+	vpshufb	TMP1, TT1, TT1
+	vpshufb	TMP1, TT2, TT2
+	vpshufb	TMP1, TT3, TT3
+	vpshufb	TMP1, TT4, TT4
+	vpshufb	TMP1, TT5, TT5
+	vpshufb	TMP1, TT6, TT6
+	vpshufb	TMP1, TT7, TT7
+	vmovdqu	_ytmp+1*SZ8(%rsp), h	# mask no longer needed: restore h
+	vmovdqu	TT4, _ytmp(%rsp)	# TT4..TT7 overlap a0, a1, a2 and TMP,
+	vmovdqu	TT5, _ytmp+1*SZ8(%rsp)	# which the round macros clobber, so
+	vmovdqu	TT6, _ytmp+2*SZ8(%rsp)	# park them on the stack until needed
+	vmovdqu	TT7, _ytmp+3*SZ8(%rsp)
+	ROUND_00_15	TT0,(i*8+0)
+	vmovdqu	_ytmp(%rsp), TT0
+	ROUND_00_15	TT1,(i*8+1)
+	vmovdqu	_ytmp+1*SZ8(%rsp), TT1
+	ROUND_00_15	TT2,(i*8+2)
+	vmovdqu	_ytmp+2*SZ8(%rsp), TT2
+	ROUND_00_15	TT3,(i*8+3)
+	vmovdqu	_ytmp+3*SZ8(%rsp), TT3
+	ROUND_00_15	TT0,(i*8+4)
+	ROUND_00_15	TT1,(i*8+5)
+	ROUND_00_15	TT2,(i*8+6)
+	ROUND_00_15	TT3,(i*8+7)
+	i = (i+1)
+.endr
+	add	$64, IDX		# one 64-byte block consumed per lane
+	i = (i*8)			# i = 16: index of the first scheduled round
+
+	jmp	Lrounds_16_xx
+.align 16
+Lrounds_16_xx:
+.rep 16
+	ROUND_16_XX	T1, i
+	i = (i+1)
+.endr
+
+	cmp	$ROUNDS,ROUND
+	jb	Lrounds_16_xx
+
+	# add old digest
+	vpaddd	_digest+0*SZ8(%rsp), a, a
+	vpaddd	_digest+1*SZ8(%rsp), b, b
+	vpaddd	_digest+2*SZ8(%rsp), c, c
+	vpaddd	_digest+3*SZ8(%rsp), d, d
+	vpaddd	_digest+4*SZ8(%rsp), e, e
+	vpaddd	_digest+5*SZ8(%rsp), f, f
+	vpaddd	_digest+6*SZ8(%rsp), g, g
+	vpaddd	_digest+7*SZ8(%rsp), h, h
+
+	sub	$1, INP_SIZE  # unit is blocks
+	jne	lloop
+
+	# write back to memory (state object) the transposed digest
+	vmovdqu	a, 0*SHA256_DIGEST_ROW_SIZE(STATE)
+	vmovdqu	b, 1*SHA256_DIGEST_ROW_SIZE(STATE)
+	vmovdqu	c, 2*SHA256_DIGEST_ROW_SIZE(STATE)
+	vmovdqu	d, 3*SHA256_DIGEST_ROW_SIZE(STATE)
+	vmovdqu	e, 4*SHA256_DIGEST_ROW_SIZE(STATE)
+	vmovdqu	f, 5*SHA256_DIGEST_ROW_SIZE(STATE)
+	vmovdqu	g, 6*SHA256_DIGEST_ROW_SIZE(STATE)
+	vmovdqu	h, 7*SHA256_DIGEST_ROW_SIZE(STATE)
+
+	# update input pointers
+	add	IDX, inp0
+	mov	inp0, _args_data_ptr+0*8(STATE)
+	add	IDX, inp1
+	mov	inp1, _args_data_ptr+1*8(STATE)
+	add	IDX, inp2
+	mov	inp2, _args_data_ptr+2*8(STATE)
+	add	IDX, inp3
+	mov	inp3, _args_data_ptr+3*8(STATE)
+	add	IDX, inp4
+	mov	inp4, _args_data_ptr+4*8(STATE)
+	add	IDX, inp5
+	mov	inp5, _args_data_ptr+5*8(STATE)
+	add	IDX, inp6
+	mov	inp6, _args_data_ptr+6*8(STATE)
+	add	IDX, inp7
+	mov	inp7, _args_data_ptr+7*8(STATE)
+
+	# Postamble
+	mov	_rsp(%rsp), %rsp	# back to the stack pointer saved in the prologue
+
+	# restore callee-saved clobbered registers
+	pop     %r15
+	pop     %r14
+	pop     %r13
+	pop     %r12
+
+	ret
+ENDPROC(sha256_x8_avx2)
+.data
+.align 64
+K256_8:
+	.octa	0x428a2f98428a2f98428a2f98428a2f98
+	.octa	0x428a2f98428a2f98428a2f98428a2f98
+	.octa	0x71374491713744917137449171374491
+	.octa	0x71374491713744917137449171374491
+	.octa	0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
+	.octa	0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
+	.octa	0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
+	.octa	0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
+	.octa	0x3956c25b3956c25b3956c25b3956c25b
+	.octa	0x3956c25b3956c25b3956c25b3956c25b
+	.octa	0x59f111f159f111f159f111f159f111f1
+	.octa	0x59f111f159f111f159f111f159f111f1
+	.octa	0x923f82a4923f82a4923f82a4923f82a4
+	.octa	0x923f82a4923f82a4923f82a4923f82a4
+	.octa	0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
+	.octa	0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
+	.octa	0xd807aa98d807aa98d807aa98d807aa98
+	.octa	0xd807aa98d807aa98d807aa98d807aa98
+	.octa	0x12835b0112835b0112835b0112835b01
+	.octa	0x12835b0112835b0112835b0112835b01
+	.octa	0x243185be243185be243185be243185be
+	.octa	0x243185be243185be243185be243185be
+	.octa	0x550c7dc3550c7dc3550c7dc3550c7dc3
+	.octa	0x550c7dc3550c7dc3550c7dc3550c7dc3
+	.octa	0x72be5d7472be5d7472be5d7472be5d74
+	.octa	0x72be5d7472be5d7472be5d7472be5d74
+	.octa	0x80deb1fe80deb1fe80deb1fe80deb1fe
+	.octa	0x80deb1fe80deb1fe80deb1fe80deb1fe
+	.octa	0x9bdc06a79bdc06a79bdc06a79bdc06a7
+	.octa	0x9bdc06a79bdc06a79bdc06a79bdc06a7
+	.octa	0xc19bf174c19bf174c19bf174c19bf174
+	.octa	0xc19bf174c19bf174c19bf174c19bf174
+	.octa	0xe49b69c1e49b69c1e49b69c1e49b69c1
+	.octa	0xe49b69c1e49b69c1e49b69c1e49b69c1
+	.octa	0xefbe4786efbe4786efbe4786efbe4786
+	.octa	0xefbe4786efbe4786efbe4786efbe4786
+	.octa	0x0fc19dc60fc19dc60fc19dc60fc19dc6
+	.octa	0x0fc19dc60fc19dc60fc19dc60fc19dc6
+	.octa	0x240ca1cc240ca1cc240ca1cc240ca1cc
+	.octa	0x240ca1cc240ca1cc240ca1cc240ca1cc
+	.octa	0x2de92c6f2de92c6f2de92c6f2de92c6f
+	.octa	0x2de92c6f2de92c6f2de92c6f2de92c6f
+	.octa	0x4a7484aa4a7484aa4a7484aa4a7484aa
+	.octa	0x4a7484aa4a7484aa4a7484aa4a7484aa
+	.octa	0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc
+	.octa	0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc
+	.octa	0x76f988da76f988da76f988da76f988da
+	.octa	0x76f988da76f988da76f988da76f988da
+	.octa	0x983e5152983e5152983e5152983e5152
+	.octa	0x983e5152983e5152983e5152983e5152
+	.octa	0xa831c66da831c66da831c66da831c66d
+	.octa	0xa831c66da831c66da831c66da831c66d
+	.octa	0xb00327c8b00327c8b00327c8b00327c8
+	.octa	0xb00327c8b00327c8b00327c8b00327c8
+	.octa	0xbf597fc7bf597fc7bf597fc7bf597fc7
+	.octa	0xbf597fc7bf597fc7bf597fc7bf597fc7
+	.octa	0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3
+	.octa	0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3
+	.octa	0xd5a79147d5a79147d5a79147d5a79147
+	.octa	0xd5a79147d5a79147d5a79147d5a79147
+	.octa	0x06ca635106ca635106ca635106ca6351
+	.octa	0x06ca635106ca635106ca635106ca6351
+	.octa	0x14292967142929671429296714292967
+	.octa	0x14292967142929671429296714292967
+	.octa	0x27b70a8527b70a8527b70a8527b70a85
+	.octa	0x27b70a8527b70a8527b70a8527b70a85
+	.octa	0x2e1b21382e1b21382e1b21382e1b2138
+	.octa	0x2e1b21382e1b21382e1b21382e1b2138
+	.octa	0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc
+	.octa	0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc
+	.octa	0x53380d1353380d1353380d1353380d13
+	.octa	0x53380d1353380d1353380d1353380d13
+	.octa	0x650a7354650a7354650a7354650a7354
+	.octa	0x650a7354650a7354650a7354650a7354
+	.octa	0x766a0abb766a0abb766a0abb766a0abb
+	.octa	0x766a0abb766a0abb766a0abb766a0abb
+	.octa	0x81c2c92e81c2c92e81c2c92e81c2c92e
+	.octa	0x81c2c92e81c2c92e81c2c92e81c2c92e
+	.octa	0x92722c8592722c8592722c8592722c85
+	.octa	0x92722c8592722c8592722c8592722c85
+	.octa	0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1
+	.octa	0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1
+	.octa	0xa81a664ba81a664ba81a664ba81a664b
+	.octa	0xa81a664ba81a664ba81a664ba81a664b
+	.octa	0xc24b8b70c24b8b70c24b8b70c24b8b70
+	.octa	0xc24b8b70c24b8b70c24b8b70c24b8b70
+	.octa	0xc76c51a3c76c51a3c76c51a3c76c51a3
+	.octa	0xc76c51a3c76c51a3c76c51a3c76c51a3
+	.octa	0xd192e819d192e819d192e819d192e819
+	.octa	0xd192e819d192e819d192e819d192e819
+	.octa	0xd6990624d6990624d6990624d6990624
+	.octa	0xd6990624d6990624d6990624d6990624
+	.octa	0xf40e3585f40e3585f40e3585f40e3585
+	.octa	0xf40e3585f40e3585f40e3585f40e3585
+	.octa	0x106aa070106aa070106aa070106aa070
+	.octa	0x106aa070106aa070106aa070106aa070
+	.octa	0x19a4c11619a4c11619a4c11619a4c116
+	.octa	0x19a4c11619a4c11619a4c11619a4c116
+	.octa	0x1e376c081e376c081e376c081e376c08
+	.octa	0x1e376c081e376c081e376c081e376c08
+	.octa	0x2748774c2748774c2748774c2748774c
+	.octa	0x2748774c2748774c2748774c2748774c
+	.octa	0x34b0bcb534b0bcb534b0bcb534b0bcb5
+	.octa	0x34b0bcb534b0bcb534b0bcb534b0bcb5
+	.octa	0x391c0cb3391c0cb3391c0cb3391c0cb3
+	.octa	0x391c0cb3391c0cb3391c0cb3391c0cb3
+	.octa	0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a
+	.octa	0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a
+	.octa	0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f
+	.octa	0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f
+	.octa	0x682e6ff3682e6ff3682e6ff3682e6ff3
+	.octa	0x682e6ff3682e6ff3682e6ff3682e6ff3
+	.octa	0x748f82ee748f82ee748f82ee748f82ee
+	.octa	0x748f82ee748f82ee748f82ee748f82ee
+	.octa	0x78a5636f78a5636f78a5636f78a5636f
+	.octa	0x78a5636f78a5636f78a5636f78a5636f
+	.octa	0x84c8781484c8781484c8781484c87814
+	.octa	0x84c8781484c8781484c8781484c87814
+	.octa	0x8cc702088cc702088cc702088cc70208
+	.octa	0x8cc702088cc702088cc702088cc70208
+	.octa	0x90befffa90befffa90befffa90befffa
+	.octa	0x90befffa90befffa90befffa90befffa
+	.octa	0xa4506ceba4506ceba4506ceba4506ceb
+	.octa	0xa4506ceba4506ceba4506ceba4506ceb
+	.octa	0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
+	.octa	0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
+	.octa	0xc67178f2c67178f2c67178f2c67178f2
+	.octa	0xc67178f2c67178f2c67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK:
+.octa 0x0c0d0e0f08090a0b0405060700010203
+.octa 0x0c0d0e0f08090a0b0405060700010203
+
+.align 64
+.global K256
+K256:
+	.int	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.int	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.int	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.int	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.int	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.int	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.int	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.int	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.int	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.int	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.int	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.int	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.int	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.int	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.int	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.int	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index 3ae0f43ebd37..9e79baf03a4b 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -427,4 +427,14 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated");
 
 MODULE_ALIAS_CRYPTO("sha256");
+MODULE_ALIAS_CRYPTO("sha256-ssse3");
+MODULE_ALIAS_CRYPTO("sha256-avx");
+MODULE_ALIAS_CRYPTO("sha256-avx2");
 MODULE_ALIAS_CRYPTO("sha224");
+MODULE_ALIAS_CRYPTO("sha224-ssse3");
+MODULE_ALIAS_CRYPTO("sha224-avx");
+MODULE_ALIAS_CRYPTO("sha224-avx2");
+#ifdef CONFIG_AS_SHA256_NI
+MODULE_ALIAS_CRYPTO("sha256-ni");
+MODULE_ALIAS_CRYPTO("sha224-ni");
+#endif
diff --git a/arch/x86/crypto/sha512-mb/Makefile b/arch/x86/crypto/sha512-mb/Makefile
new file mode 100644
index 000000000000..0a57e2103980
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/Makefile
@@ -0,0 +1,11 @@
+#
+# Arch-specific CryptoAPI modules.
+#
+
+avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
+                                $(comma)4)$(comma)%ymm2,yes,no)
+ifeq ($(avx2_supported),yes)
+	obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb.o
+	sha512-mb-y := sha512_mb.o sha512_mb_mgr_flush_avx2.o \
+	     sha512_mb_mgr_init_avx2.o sha512_mb_mgr_submit_avx2.o sha512_x4_avx2.o
+endif
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c
new file mode 100644
index 000000000000..f4cf5b78fd36
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb.c
@@ -0,0 +1,1046 @@
+/*
+ * Multi buffer SHA512 algorithm Glue Code
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ *	Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/sha.h>
+#include <crypto/mcryptd.h>
+#include <crypto/crypto_wq.h>
+#include <asm/byteorder.h>
+#include <linux/hardirq.h>
+#include <asm/fpu/api.h>
+#include "sha512_mb_ctx.h"
+
+#define FLUSH_INTERVAL 1000 /* in usec */
+
+static struct mcryptd_alg_state sha512_mb_alg_state;
+
+struct sha512_mb_ctx {	/* tfm context of the outer "sha512" ahash */
+	struct mcryptd_ahash *mcryptd_tfm;	/* from mcryptd_alloc_ahash() */
+};
+
+static inline struct mcryptd_hash_request_ctx
+		*cast_hash_to_mcryptd_ctx(struct sha512_hash_ctx *hash_ctx)
+{
+	struct ahash_request *areq;
+	/* hash_ctx is the __ctx of an areq embedded in the mcryptd ctx */
+	areq = container_of((void *) hash_ctx, struct ahash_request, __ctx);
+	return container_of(areq, struct mcryptd_hash_request_ctx, areq);
+}
+
+static inline struct ahash_request
+		*cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx)
+{	/* ctx is stored in the __ctx area of the request returned here */
+	return container_of((void *) ctx, struct ahash_request, __ctx);
+}
+
+static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx,
+				struct ahash_request *areq)
+{
+	rctx->flag = HASH_UPDATE;	/* reset flags; areq is unused here */
+}
+
+/* Job-manager hooks; sha512_mb_mod_init() points them at the AVX2 routines */
+static asmlinkage void (*sha512_job_mgr_init)(struct sha512_mb_mgr *state);
+static asmlinkage struct job_sha512* (*sha512_job_mgr_submit)
+			(struct sha512_mb_mgr *state, struct job_sha512 *job);
+static asmlinkage struct job_sha512* (*sha512_job_mgr_flush)
+						(struct sha512_mb_mgr *state);
+static asmlinkage struct job_sha512* (*sha512_job_mgr_get_comp_job)
+						(struct sha512_mb_mgr *state);
+
+inline void sha512_init_digest(uint64_t *digest)
+{
+	/* copy the SHA512_H0..SHA512_H7 starting values into digest */
+	static const uint64_t initial_digest[SHA512_DIGEST_LENGTH] = {
+		SHA512_H0, SHA512_H1, SHA512_H2, SHA512_H3,
+		SHA512_H4, SHA512_H5, SHA512_H6, SHA512_H7 };
+	memcpy(digest, initial_digest, sizeof(initial_digest));
+}
+
+inline uint32_t sha512_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2],
+			 uint32_t total_len)
+{
+	/* Pad the partial block in padblock; total_len is the message bytes */
+	uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1);
+
+	memset(&padblock[i], 0, SHA512_BLOCK_SIZE);
+	padblock[i] = 0x80;
+	/* advance i to the end of the padded data (a block boundary) */
+	i += ((SHA512_BLOCK_SIZE - 1) &
+	      (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1)))
+	     + 1 + SHA512_PADLENGTHFIELD_SIZE;
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+	*((uint64_t *) &padblock[i - 16]) = 0;
+#endif
+	/* widen before shifting so the bit count keeps its top three bits */
+	*((uint64_t *) &padblock[i - 8]) =
+				cpu_to_be64((uint64_t) total_len << 3);
+	/* Number of extra blocks to hash */
+	return i >> SHA512_LOG2_BLOCK_SIZE;
+}
+
+static struct sha512_hash_ctx *sha512_ctx_mgr_resubmit
+		(struct sha512_ctx_mgr *mgr, struct sha512_hash_ctx *ctx)
+{
+	/*
+	 * Push ctx forward until it is COMPLETE or IDLE and can be returned;
+	 * after each submit, carry on with whatever ctx the manager returns.
+	 */
+	while (ctx) {
+		if (ctx->status & HASH_CTX_STS_COMPLETE) {
+			/* Clear PROCESSING bit */
+			ctx->status = HASH_CTX_STS_COMPLETE;
+			return ctx;
+		}
+
+		/*
+		 * If the extra blocks are empty, begin hashing what remains
+		 * in the user's buffer.
+		 */
+		if (ctx->partial_block_buffer_length == 0 &&
+		    ctx->incoming_buffer_length) {
+			const void *buffer = ctx->incoming_buffer;
+			uint32_t len = ctx->incoming_buffer_length;
+			uint32_t copy_len;
+
+			/*
+			 * Only entire blocks can be hashed.
+			 * Copy remainder to extra blocks buffer.
+			 */
+			copy_len = len & (SHA512_BLOCK_SIZE-1);
+
+			if (copy_len) {
+				len -= copy_len;
+				memcpy(ctx->partial_block_buffer,
+				       ((const char *) buffer + len),
+				       copy_len);
+				ctx->partial_block_buffer_length = copy_len;
+			}
+
+			ctx->incoming_buffer_length = 0;
+			/* len should be a multiple of the block size now */
+			assert((len % SHA512_BLOCK_SIZE) == 0);
+
+			/* Set len to the number of blocks to be hashed */
+			len >>= SHA512_LOG2_BLOCK_SIZE;
+
+			if (len) {
+				ctx->job.buffer = (uint8_t *) buffer;
+				ctx->job.len = len;
+				ctx = (struct sha512_hash_ctx *)
+					sha512_job_mgr_submit(&mgr->mgr,
+					&ctx->job);
+				continue;
+			}
+		}
+
+		/*
+		 * If the extra blocks are not empty, then we are
+		 * either on the last block(s) or we need more
+		 * user input before continuing.
+		 */
+		if (ctx->status & HASH_CTX_STS_LAST) {
+			uint8_t *buf = ctx->partial_block_buffer;
+			uint32_t n_extra_blocks =
+					sha512_pad(buf, ctx->total_length);
+
+			ctx->status = (HASH_CTX_STS_PROCESSING |
+				       HASH_CTX_STS_COMPLETE);
+			ctx->job.buffer = buf;
+			ctx->job.len = (uint32_t) n_extra_blocks;
+			ctx = (struct sha512_hash_ctx *)
+				sha512_job_mgr_submit(&mgr->mgr, &ctx->job);
+			continue;
+		}
+		/* nothing left to hash for now: mark the ctx idle, return it */
+		if (ctx)
+			ctx->status = HASH_CTX_STS_IDLE;
+		return ctx;
+	}
+	/* the job manager returned no job */
+	return NULL;
+}
+
+static struct sha512_hash_ctx
+		*sha512_ctx_mgr_get_comp_ctx(struct sha512_ctx_mgr *mgr)
+{
+	/*
+	 * Ask the job manager for a completed job.
+	 * NULL means no job is complete; sha512_ctx_mgr_resubmit() passes
+	 * that NULL straight through.
+	 * A completed job is not necessarily a finished hash, so its ctx
+	 * goes to sha512_ctx_mgr_resubmit(), which submits remaining work.
+	 * If that returns a ctx, it is ready to be returned to the caller.
+	 * If it returns NULL, every job the manager still holds needs more
+	 * processing.
+	 */
+	struct sha512_hash_ctx *ctx;
+
+	ctx = (struct sha512_hash_ctx *)
+				sha512_job_mgr_get_comp_job(&mgr->mgr);
+	return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static void sha512_ctx_mgr_init(struct sha512_ctx_mgr *mgr)
+{
+	sha512_job_mgr_init(&mgr->mgr);	/* hook set in sha512_mb_mod_init() */
+}
+
+static struct sha512_hash_ctx
+			*sha512_ctx_mgr_submit(struct sha512_ctx_mgr *mgr,
+					  struct sha512_hash_ctx *ctx,
+					  const void *buffer,
+					  uint32_t len,
+					  int flags)
+{
+	/* Feed len bytes at buffer to ctx; returns a ctx to hand back or NULL */
+	if (flags & (~HASH_ENTIRE)) {
+		/*
+		 * User should not pass anything other than FIRST, UPDATE, or
+		 * LAST
+		 */
+		ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+		return ctx;
+	}
+
+	if (ctx->status & HASH_CTX_STS_PROCESSING) {
+		/* Cannot submit to a currently processing job. */
+		ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+		return ctx;
+	}
+
+	if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+		/* Cannot update a finished job. */
+		ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+		return ctx;
+	}
+
+	if (flags & HASH_FIRST) {
+		/* Init digest */
+		sha512_init_digest(ctx->job.result_digest);
+
+		/* Reset byte counter */
+		ctx->total_length = 0;
+
+		/* Clear extra blocks */
+		ctx->partial_block_buffer_length = 0;
+	}
+
+	/*
+	 * If we made it here, there were no errors during this call to
+	 * submit
+	 */
+	ctx->error = HASH_CTX_ERROR_NONE;
+
+	/* Store buffer ptr info from user */
+	ctx->incoming_buffer = buffer;
+	ctx->incoming_buffer_length = len;
+
+	/*
+	 * Store the user's request flags and mark this ctx as currently being
+	 * processed.
+	 */
+	ctx->status = (flags & HASH_LAST) ?
+			(HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+			HASH_CTX_STS_PROCESSING;
+
+	/* Advance byte counter */
+	ctx->total_length += len;
+
+	/*
+	 * If there is anything currently buffered in the extra blocks,
+	 * append to it until it contains a whole block.
+	 * Or if the user's buffer contains less than a whole block,
+	 * append as much as possible to the extra block.
+	 */
+	if (ctx->partial_block_buffer_length || len < SHA512_BLOCK_SIZE) {
+		/* Compute how many bytes to copy from user buffer into extra
+		 * block
+		 */
+		uint32_t copy_len = SHA512_BLOCK_SIZE -
+					ctx->partial_block_buffer_length;
+		if (len < copy_len)
+			copy_len = len;
+
+		if (copy_len) {
+			/* Copy and update relevant pointers and counters */
+			memcpy
+		(&ctx->partial_block_buffer[ctx->partial_block_buffer_length],
+				buffer, copy_len);
+
+			ctx->partial_block_buffer_length += copy_len;
+			ctx->incoming_buffer = (const void *)
+					((const char *)buffer + copy_len);
+			ctx->incoming_buffer_length = len - copy_len;
+		}
+
+		/* The extra block should never contain more than 1 block
+		 * here
+		 */
+		assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+		/* If the extra block buffer contains exactly 1 block, it can
+		 * be hashed.
+		 */
+		if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+			ctx->partial_block_buffer_length = 0;
+
+			ctx->job.buffer = ctx->partial_block_buffer;
+			ctx->job.len = 1;
+			ctx = (struct sha512_hash_ctx *)
+				sha512_job_mgr_submit(&mgr->mgr, &ctx->job);
+		}
+	}
+
+	return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct sha512_ctx_mgr *mgr)
+{
+	struct sha512_hash_ctx *flushed, *ready;
+
+	/*
+	 * Force jobs out of the manager one at a time until one of them
+	 * is ready to be handed back, or until nothing is left in flight.
+	 */
+	do {
+		flushed = (struct sha512_hash_ctx *)
+					sha512_job_mgr_flush(&mgr->mgr);
+
+		/* A NULL flush result means no job is in flight any more. */
+		if (!flushed)
+			return NULL;
+
+		/*
+		 * The flushed job may still have data or padding pending;
+		 * resubmit it. A NULL result here means every remaining job
+		 * is back inside the manager, so flush again.
+		 */
+		ready = sha512_ctx_mgr_resubmit(mgr, flushed);
+	} while (!ready);
+
+	/* ready is a ctx that can be returned to the caller */
+	return ready;
+}
+
+static int sha512_mb_init(struct ahash_request *areq)
+{
+	struct sha512_hash_ctx *sctx = ahash_request_ctx(areq);
+	/* ahash .init: load the starting digest and clear all counters */
+	hash_ctx_init(sctx);
+	sctx->job.result_digest[0] = SHA512_H0;
+	sctx->job.result_digest[1] = SHA512_H1;
+	sctx->job.result_digest[2] = SHA512_H2;
+	sctx->job.result_digest[3] = SHA512_H3;
+	sctx->job.result_digest[4] = SHA512_H4;
+	sctx->job.result_digest[5] = SHA512_H5;
+	sctx->job.result_digest[6] = SHA512_H6;
+	sctx->job.result_digest[7] = SHA512_H7;
+	sctx->total_length = 0;
+	sctx->partial_block_buffer_length = 0;
+	sctx->status = HASH_CTX_STS_IDLE;	/* nothing submitted yet */
+
+	return 0;
+}
+
+static int sha512_mb_set_results(struct mcryptd_hash_request_ctx *rctx)
+{
+	struct sha512_hash_ctx *hash = ahash_request_ctx(&rctx->areq);
+	__be64 *out = (__be64 *) rctx->out;
+	int word;
+
+	/* write the eight digest words to rctx->out, big-endian */
+	for (word = 0; word != 8; word++)
+		out[word] = cpu_to_be64(hash->job.result_digest[word]);
+	return 0;
+}
+
+static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
+			struct mcryptd_alg_cstate *cstate, bool flush)
+{
+	int	flag = HASH_UPDATE;
+	int	nbytes, err = 0;
+	struct mcryptd_hash_request_ctx *rctx = *ret_rctx;
+	struct sha512_hash_ctx *sha_ctx;
+
+	/* keep submitting walk chunks until the request in hand is done */
+	while (!(rctx->flag & HASH_DONE)) {
+		nbytes = crypto_ahash_walk_done(&rctx->walk, 0);
+		if (nbytes < 0) {
+			err = nbytes;
+			goto out;
+		}
+		/* last chunk: mark done; a final request also gets HASH_LAST */
+		if (crypto_ahash_walk_last(&rctx->walk)) {
+			rctx->flag |= HASH_DONE;
+			if (rctx->flag & HASH_FINAL)
+				flag |= HASH_LAST;
+		}
+		sha_ctx = (struct sha512_hash_ctx *)
+						ahash_request_ctx(&rctx->areq);
+		kernel_fpu_begin();
+		sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx,
+						rctx->walk.data, nbytes, flag);
+		/* no ctx came back; with flush set, force one out */
+		if (!sha_ctx) {
+			if (flush)
+				sha_ctx = sha512_ctx_mgr_flush(cstate->mgr);
+		}
+		kernel_fpu_end();
+		if (sha_ctx)
+			rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		else {
+			rctx = NULL;
+			goto out;
+		}
+	}
+
+	/* a finished final request: write the digest to rctx->out */
+	if (rctx->flag & HASH_FINAL)
+		sha512_mb_set_results(rctx);
+	/* *ret_rctx: the ctx we ended on, NULL if none; err: walk error or 0 */
+out:
+	*ret_rctx = rctx;
+	return err;
+}
+
+static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
+			    struct mcryptd_alg_cstate *cstate,
+			    int err)
+{
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha512_hash_ctx *sha_ctx;
+	struct mcryptd_hash_request_ctx *req_ctx;
+	int ret;
+
+	/* complete rctx itself: take it off the work list, then notify */
+	spin_lock(&cstate->work_lock);
+	list_del(&rctx->waiter);
+	spin_unlock(&cstate->work_lock);
+	/* the callback runs with bottom halves off unless IRQs are disabled */
+	if (irqs_disabled())
+		rctx->complete(&req->base, err);
+	else {
+		local_bh_disable();
+		rctx->complete(&req->base, err);
+		local_bh_enable();
+	}
+
+	/* then complete every other job the manager reports as finished */
+	sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate->mgr);
+	while (sha_ctx) {
+		req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		ret = sha_finish_walk(&req_ctx, cstate, false);
+		if (req_ctx) {
+			spin_lock(&cstate->work_lock);
+			list_del(&req_ctx->waiter);
+			spin_unlock(&cstate->work_lock);
+			/* must use this job's own callback, not rctx's */
+			req = cast_mcryptd_ctx_to_req(req_ctx);
+			if (irqs_disabled())
+				req_ctx->complete(&req->base, ret);
+			else {
+				local_bh_disable();
+				req_ctx->complete(&req->base, ret);
+				local_bh_enable();
+			}
+		}
+		sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate->mgr);
+	}
+	/* always 0; per-job status is delivered through the callbacks */
+	return 0;
+}
+
+static void sha512_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
+			     struct mcryptd_alg_cstate *cstate)
+{
+	unsigned long next_flush;
+	unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL);
+
+	/* stamp the request: arrival time, sequence number, flush deadline */
+	rctx->tag.arrival = jiffies;    /* tag the arrival time */
+	rctx->tag.seq_num = cstate->next_seq_num++;
+	next_flush = rctx->tag.arrival + delay;
+	rctx->tag.expire = next_flush;
+	/* append the request to the tail of the cstate work list */
+	spin_lock(&cstate->work_lock);
+	list_add_tail(&rctx->waiter, &cstate->work_list);
+	spin_unlock(&cstate->work_lock);
+
+	mcryptd_arm_flusher(cstate, delay);
+}
+
+static int sha512_mb_update(struct ahash_request *areq)
+{
+	/* ahash .update: hash the request's data via the job manager */
+	struct mcryptd_hash_request_ctx *rctx =
+			container_of(areq, struct mcryptd_hash_request_ctx,
+									areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha512_mb_alg_state.alg_cstate);
+
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha512_hash_ctx *sha_ctx;
+	int ret = 0, nbytes;
+
+	/* sanity check; NOTE(review): "done" completes an unqueued rctx */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+	if (nbytes < 0) {
+		ret = nbytes;
+		goto done;
+	}
+
+	if (crypto_ahash_walk_last(&rctx->walk))
+		rctx->flag |= HASH_DONE;
+
+	/* queue on the work list, then submit the first chunk */
+	sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq);
+	sha512_mb_add_list(rctx, cstate);
+	kernel_fpu_begin();
+	sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+							nbytes, HASH_UPDATE);
+	kernel_fpu_end();
+
+	/* no ctx returned: report the request as still in progress */
+	if (!sha_ctx)
+		return -EINPROGRESS;
+
+	if (sha_ctx->error) {
+		ret = sha_ctx->error;
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		goto done;
+	}
+
+	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+	ret = sha_finish_walk(&rctx, cstate, false);
+
+	if (!rctx)
+		return -EINPROGRESS;
+done:
+	sha_complete_job(rctx, cstate, ret);
+	return ret;
+}
+
+static int sha512_mb_finup(struct ahash_request *areq)
+{
+	/* ahash .finup: hash the request's data, then finish the digest */
+	struct mcryptd_hash_request_ctx *rctx =
+			container_of(areq, struct mcryptd_hash_request_ctx,
+									areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha512_mb_alg_state.alg_cstate);
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha512_hash_ctx *sha_ctx;
+	int ret = 0, flag = HASH_UPDATE, nbytes;
+
+	/* sanity check; NOTE(review): "done" completes an unqueued rctx */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+	if (nbytes < 0) {
+		ret = nbytes;
+		goto done;
+	}
+
+	/* a single-chunk walk is submitted as the last piece right away */
+	if (crypto_ahash_walk_last(&rctx->walk)) {
+		rctx->flag |= HASH_DONE;
+		flag = HASH_LAST;
+	}
+
+	/* queue on the work list, then submit the first chunk */
+	rctx->flag |= HASH_FINAL;
+	sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq);
+	sha512_mb_add_list(rctx, cstate);
+	kernel_fpu_begin();
+	sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+								nbytes, flag);
+	kernel_fpu_end();
+
+	/* no ctx returned: report the request as still in progress */
+	if (!sha_ctx)
+		return -EINPROGRESS;
+
+	if (sha_ctx->error) {
+		ret = sha_ctx->error;
+		/* complete the returned ctx, as update and final do */
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		goto done;
+	}
+	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+	ret = sha_finish_walk(&rctx, cstate, false);
+	if (!rctx)
+		return -EINPROGRESS;
+done:
+	sha_complete_job(rctx, cstate, ret);
+	return ret;
+}
+
+static int sha512_mb_final(struct ahash_request *areq)
+{
+	/* ahash .final: finish the digest without adding any data */
+	struct mcryptd_hash_request_ctx *rctx =
+			container_of(areq, struct mcryptd_hash_request_ctx,
+									areq);
+	struct mcryptd_alg_cstate *cstate =
+				this_cpu_ptr(sha512_mb_alg_state.alg_cstate);
+	struct sha512_hash_ctx *sha_ctx;
+	int ret = 0;
+	u8 data;
+
+	/* sanity check; NOTE(review): "done" completes an unqueued rctx */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, areq);
+
+	rctx->flag |= HASH_DONE | HASH_FINAL;
+
+	sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq);
+	/* submit HASH_LAST with length 0; &data only supplies a pointer */
+	sha512_mb_add_list(rctx, cstate);
+	kernel_fpu_begin();
+	sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0,
+								HASH_LAST);
+	kernel_fpu_end();
+
+	/* no ctx returned: report the request as still in progress */
+	if (!sha_ctx)
+		return -EINPROGRESS;
+
+	if (sha_ctx->error) {
+		ret = sha_ctx->error;
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		goto done;
+	}
+
+	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+	ret = sha_finish_walk(&rctx, cstate, false);
+	if (!rctx)
+		return -EINPROGRESS;
+done:
+	sha_complete_job(rctx, cstate, ret);
+	return ret;
+}
+
+static int sha512_mb_export(struct ahash_request *areq, void *out)
+{
+	const size_t state_len = sizeof(struct sha512_hash_ctx);
+
+	/* the exported state is a raw copy of the per-request hash ctx */
+	memcpy(out, ahash_request_ctx(areq), state_len);
+	return 0;
+}
+
+static int sha512_mb_import(struct ahash_request *areq, const void *in)
+{
+	const size_t state_len = sizeof(struct sha512_hash_ctx);
+
+	/* overwrite the per-request hash ctx with the saved raw copy */
+	memcpy(ahash_request_ctx(areq), in, state_len);
+	return 0;
+}
+
+static int sha512_mb_async_init_tfm(struct crypto_tfm *tfm)
+{
+	struct mcryptd_ahash *mcryptd_tfm;
+	struct sha512_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct mcryptd_hash_ctx *mctx;
+	/* allocate an mcryptd ahash for the internal "__intel_sha512-mb" */
+	mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha512-mb",
+						CRYPTO_ALG_INTERNAL,
+						CRYPTO_ALG_INTERNAL);
+	if (IS_ERR(mcryptd_tfm))
+		return PTR_ERR(mcryptd_tfm);
+	mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
+	mctx->alg_state = &sha512_mb_alg_state;
+	ctx->mcryptd_tfm = mcryptd_tfm;
+	/* request size: one ahash_request plus the mcryptd request size */
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				sizeof(struct ahash_request) +
+				crypto_ahash_reqsize(&mcryptd_tfm->base));
+	return 0;
+}
+
+static void sha512_mb_async_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct sha512_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+	/* release the mcryptd ahash set by sha512_mb_async_init_tfm() */
+	mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static int sha512_mb_areq_init_tfm(struct crypto_tfm *tfm)
+{
+	/* reserve an ahash_request plus a sha512_hash_ctx per request */
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				sizeof(struct ahash_request) +
+				sizeof(struct sha512_hash_ctx));
+	return 0;
+}
+
+static void sha512_mb_areq_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct sha512_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+	/* NOTE(review): sha512_mb_areq_init_tfm() never sets mcryptd_tfm */
+	mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static struct ahash_alg sha512_mb_areq_alg = {
+	.init		=	sha512_mb_init,
+	.update		=	sha512_mb_update,
+	.final		=	sha512_mb_final,
+	.finup		=	sha512_mb_finup,
+	.export		=	sha512_mb_export,
+	.import		=	sha512_mb_import,
+	.halg		=	{
+	.digestsize	=	SHA512_DIGEST_SIZE,
+	.statesize	=	sizeof(struct sha512_hash_ctx),
+	.base		=	{
+			.cra_name	 = "__sha512-mb",
+			.cra_driver_name = "__intel_sha512-mb",
+			.cra_priority	 = 100,
+			/*
+			 * Flagged ASYNC because a buffer handed to the
+			 * multi-buffer algorithm may still be unfinished
+			 * when the hashing thread goes to sleep.
+			 */
+			.cra_flags	= CRYPTO_ALG_TYPE_AHASH |
+						CRYPTO_ALG_ASYNC |
+						CRYPTO_ALG_INTERNAL,
+			.cra_blocksize	= SHA512_BLOCK_SIZE,
+			.cra_module	= THIS_MODULE,
+			.cra_list	= LIST_HEAD_INIT
+					(sha512_mb_areq_alg.halg.base.cra_list),
+			.cra_init	= sha512_mb_areq_init_tfm,
+			.cra_exit	= sha512_mb_areq_exit_tfm,
+			.cra_ctxsize	= sizeof(struct sha512_hash_ctx),
+		}
+	}
+};
+
+static int sha512_mb_async_init(struct ahash_request *req)
+{
+	struct ahash_request *inner_req = ahash_request_ctx(req);
+	struct crypto_ahash *outer_tfm = crypto_ahash_reqtfm(req);
+	struct sha512_mb_ctx *mb_ctx = crypto_ahash_ctx(outer_tfm);
+
+	/* init runs on a copy of req that is retargeted at the mcryptd tfm */
+	memcpy(inner_req, req, sizeof(struct ahash_request));
+	ahash_request_set_tfm(inner_req, &mb_ctx->mcryptd_tfm->base);
+	return crypto_ahash_init(inner_req);
+}
+
+static int sha512_mb_async_update(struct ahash_request *req)
+{
+	struct ahash_request *inner_req = ahash_request_ctx(req);
+	struct crypto_ahash *outer_tfm = crypto_ahash_reqtfm(req);
+	struct sha512_mb_ctx *mb_ctx = crypto_ahash_ctx(outer_tfm);
+
+	/* update runs on a copy of req retargeted at the mcryptd tfm */
+	memcpy(inner_req, req, sizeof(struct ahash_request));
+	ahash_request_set_tfm(inner_req, &mb_ctx->mcryptd_tfm->base);
+
+	return crypto_ahash_update(inner_req);
+}
+
+static int sha512_mb_async_finup(struct ahash_request *req)
+{
+	struct ahash_request *inner_req = ahash_request_ctx(req);
+	struct crypto_ahash *outer_tfm = crypto_ahash_reqtfm(req);
+	struct sha512_mb_ctx *mb_ctx = crypto_ahash_ctx(outer_tfm);
+
+	/* finup runs on a copy of req retargeted at the mcryptd tfm */
+	memcpy(inner_req, req, sizeof(struct ahash_request));
+	ahash_request_set_tfm(inner_req, &mb_ctx->mcryptd_tfm->base);
+
+	return crypto_ahash_finup(inner_req);
+}
+
+static int sha512_mb_async_final(struct ahash_request *req)
+{
+	struct ahash_request *inner_req = ahash_request_ctx(req);
+	struct crypto_ahash *outer_tfm = crypto_ahash_reqtfm(req);
+	struct sha512_mb_ctx *mb_ctx = crypto_ahash_ctx(outer_tfm);
+
+	/* final runs on a copy of req retargeted at the mcryptd tfm */
+	memcpy(inner_req, req, sizeof(struct ahash_request));
+	ahash_request_set_tfm(inner_req, &mb_ctx->mcryptd_tfm->base);
+
+	return crypto_ahash_final(inner_req);
+}
+
+static int sha512_mb_async_digest(struct ahash_request *req)
+{
+	struct ahash_request *inner_req = ahash_request_ctx(req);
+	struct crypto_ahash *outer_tfm = crypto_ahash_reqtfm(req);
+	struct sha512_mb_ctx *mb_ctx = crypto_ahash_ctx(outer_tfm);
+
+	/* digest runs on a copy of req retargeted at the mcryptd tfm */
+	memcpy(inner_req, req, sizeof(struct ahash_request));
+	ahash_request_set_tfm(inner_req, &mb_ctx->mcryptd_tfm->base);
+	return crypto_ahash_digest(inner_req);
+}
+
+static int sha512_mb_async_export(struct ahash_request *req, void *out)
+{
+	struct ahash_request *inner_req = ahash_request_ctx(req);
+	struct crypto_ahash *outer_tfm = crypto_ahash_reqtfm(req);
+	struct sha512_mb_ctx *mb_ctx = crypto_ahash_ctx(outer_tfm);
+
+	/* export runs on a copy of req retargeted at the mcryptd tfm */
+	memcpy(inner_req, req, sizeof(struct ahash_request));
+	ahash_request_set_tfm(inner_req, &mb_ctx->mcryptd_tfm->base);
+	return crypto_ahash_export(inner_req, out);
+}
+
+static int sha512_mb_async_import(struct ahash_request *req, const void *in)
+{
+	struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+	struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm);
+	struct mcryptd_hash_request_ctx *rctx;
+	struct ahash_request *areq;
+	/* import through a copy of req retargeted at the mcryptd tfm */
+	memcpy(mcryptd_req, req, sizeof(*req));
+	ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+	rctx = ahash_request_ctx(mcryptd_req);
+	/* also set up the request nested in the mcryptd request context */
+	areq = &rctx->areq;
+	/* NOTE(review): rctx->complete is read below but never set here */
+	ahash_request_set_tfm(areq, child);
+	ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP,
+					rctx->complete, req);
+
+	return crypto_ahash_import(mcryptd_req, in);
+}
+
+static struct ahash_alg sha512_mb_async_alg = {
+	.init           = sha512_mb_async_init,
+	.update         = sha512_mb_async_update,
+	.final          = sha512_mb_async_final,
+	.finup          = sha512_mb_async_finup,
+	.digest         = sha512_mb_async_digest,
+	.export		= sha512_mb_async_export,
+	.import		= sha512_mb_async_import,
+	.halg = {
+		.digestsize     = SHA512_DIGEST_SIZE,
+		.statesize      = sizeof(struct sha512_hash_ctx),
+		.base = {
+			.cra_name               = "sha512",
+			.cra_driver_name        = "sha512_mb",
+			.cra_priority           = 200,	/* inner alg uses 100 */
+			.cra_flags              = CRYPTO_ALG_TYPE_AHASH |
+							CRYPTO_ALG_ASYNC,
+			.cra_blocksize          = SHA512_BLOCK_SIZE,
+			.cra_type               = &crypto_ahash_type,
+			.cra_module             = THIS_MODULE,
+			.cra_list               = LIST_HEAD_INIT
+				(sha512_mb_async_alg.halg.base.cra_list),
+			.cra_init               = sha512_mb_async_init_tfm,
+			.cra_exit               = sha512_mb_async_exit_tfm,
+			.cra_ctxsize		= sizeof(struct sha512_mb_ctx),
+			.cra_alignmask		= 0,
+		},
+	},
+};
+
+static unsigned long sha512_mb_flusher(struct mcryptd_alg_cstate *cstate)
+{
+	struct mcryptd_hash_request_ctx *rctx;
+	unsigned long cur_time;
+	unsigned long next_flush = 0;
+	struct sha512_hash_ctx *sha_ctx;
+
+	/* Flush every expired job; returns the next expiry to wait for, or 0 */
+	cur_time = jiffies;
+	/* look at the list head; stop at the first request not yet expired */
+	while (!list_empty(&cstate->work_list)) {
+		rctx = list_entry(cstate->work_list.next,
+				struct mcryptd_hash_request_ctx, waiter);
+		if (time_before(cur_time, rctx->tag.expire))
+			break;
+		kernel_fpu_begin();
+		sha_ctx = (struct sha512_hash_ctx *)
+					sha512_ctx_mgr_flush(cstate->mgr);
+		kernel_fpu_end();
+		if (!sha_ctx) {
+			pr_err("sha512_mb error: nothing got flushed for"
+							" non-empty list\n");
+			break;
+		}
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		sha_finish_walk(&rctx, cstate, true);
+		sha_complete_job(rctx, cstate, 0);
+	}
+
+	if (!list_empty(&cstate->work_list)) {
+		rctx = list_entry(cstate->work_list.next,
+				struct mcryptd_hash_request_ctx, waiter);
+		/* get the hash context and then flush time */
+		next_flush = rctx->tag.expire;
+		mcryptd_arm_flusher(cstate, get_delay(next_flush));
+	}
+	return next_flush;
+}
+
+static int __init sha512_mb_mod_init(void)
+{
+	/* Set up per-CPU job managers, then register both ahash algorithms */
+	int cpu;
+	int err = -ENOMEM;
+	struct mcryptd_alg_cstate *cpu_state;
+
+	/* check for dependent cpu features */
+	if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+	    !boot_cpu_has(X86_FEATURE_BMI2))
+		return -ENODEV;
+
+	/* initialize multibuffer structures */
+	sha512_mb_alg_state.alg_cstate =
+				alloc_percpu(struct mcryptd_alg_cstate);
+
+	sha512_job_mgr_init = sha512_mb_mgr_init_avx2;
+	sha512_job_mgr_submit = sha512_mb_mgr_submit_avx2;
+	sha512_job_mgr_flush = sha512_mb_mgr_flush_avx2;
+	sha512_job_mgr_get_comp_job = sha512_mb_mgr_get_comp_job_avx2;
+
+	if (!sha512_mb_alg_state.alg_cstate)
+		return -ENOMEM;
+	for_each_possible_cpu(cpu) {
+		cpu_state = per_cpu_ptr(sha512_mb_alg_state.alg_cstate, cpu);
+		cpu_state->next_flush = 0;
+		cpu_state->next_seq_num = 0;
+		cpu_state->flusher_engaged = false;
+		INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher);
+		cpu_state->cpu = cpu;
+		cpu_state->alg_state = &sha512_mb_alg_state;
+		cpu_state->mgr = kzalloc(sizeof(struct sha512_ctx_mgr),
+								GFP_KERNEL);
+		if (!cpu_state->mgr)
+			goto err2;	/* err is still -ENOMEM */
+		sha512_ctx_mgr_init(cpu_state->mgr);
+		INIT_LIST_HEAD(&cpu_state->work_list);
+		spin_lock_init(&cpu_state->work_lock);
+	}
+	sha512_mb_alg_state.flusher = &sha512_mb_flusher;
+
+	err = crypto_register_ahash(&sha512_mb_areq_alg);
+	if (err)
+		goto err2;
+	err = crypto_register_ahash(&sha512_mb_async_alg);
+	if (err)
+		goto err1;
+
+
+	return 0;
+err1:
+	crypto_unregister_ahash(&sha512_mb_areq_alg);
+err2:
+	for_each_possible_cpu(cpu) {
+		cpu_state = per_cpu_ptr(sha512_mb_alg_state.alg_cstate, cpu);
+		kfree(cpu_state->mgr);
+	}
+	free_percpu(sha512_mb_alg_state.alg_cstate);
+	return err;	/* the real failure reason, not a blanket -ENODEV */
+}
+
+static void __exit sha512_mb_mod_fini(void)
+{
+	int cpu;
+	struct mcryptd_alg_cstate *cpu_state;
+	/* unregister (reverse of init order), then free the per-CPU state */
+	crypto_unregister_ahash(&sha512_mb_async_alg);
+	crypto_unregister_ahash(&sha512_mb_areq_alg);
+	for_each_possible_cpu(cpu) {
+		cpu_state = per_cpu_ptr(sha512_mb_alg_state.alg_cstate, cpu);
+		kfree(cpu_state->mgr);
+	}
+	free_percpu(sha512_mb_alg_state.alg_cstate);
+}
+
+module_init(sha512_mb_mod_init);
+module_exit(sha512_mb_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, multi buffer accelerated");
+
+MODULE_ALIAS("sha512");	/* NOTE(review): not MODULE_ALIAS_CRYPTO - confirm */
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h
new file mode 100644
index 000000000000..9d4b2c8208d5
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h
@@ -0,0 +1,130 @@
+/*
+ * Header file for multi buffer SHA512 context
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _SHA_MB_CTX_INTERNAL_H
+#define _SHA_MB_CTX_INTERNAL_H
+
+#include "sha512_mb_mgr.h"
+
+#define HASH_UPDATE          0x00	/* HASH_*: flag values; users are outside this header */
+#define HASH_FIRST           0x01
+#define HASH_LAST            0x02
+#define HASH_ENTIRE          0x03	/* == HASH_FIRST | HASH_LAST */
+#define HASH_DONE            0x04
+#define HASH_FINAL           0x08
+/* Values of a context's ->status field, tested by the hash_ctx_*() macros */
+#define HASH_CTX_STS_IDLE       0x00
+#define HASH_CTX_STS_PROCESSING 0x01
+#define HASH_CTX_STS_LAST       0x02
+#define HASH_CTX_STS_COMPLETE   0x04
+
+enum hash_ctx_error {	/* values of a context's ->error field */
+	HASH_CTX_ERROR_NONE               =  0,	/* stored by hash_ctx_init() */
+	HASH_CTX_ERROR_INVALID_FLAGS      = -1,
+	HASH_CTX_ERROR_ALREADY_PROCESSING = -2,
+	HASH_CTX_ERROR_ALREADY_COMPLETED  = -3,
+};
+
+#define hash_ctx_user_data(ctx)  ((ctx)->user_data)
+#define hash_ctx_digest(ctx)     ((ctx)->job.result_digest)
+#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING) /* bit test */
+#define hash_ctx_complete(ctx)   ((ctx)->status == HASH_CTX_STS_COMPLETE) /* exact match */
+#define hash_ctx_status(ctx)     ((ctx)->status)
+#define hash_ctx_error(ctx)      ((ctx)->error)
+#define hash_ctx_init(ctx) \
+	do { /* new context: no error, status COMPLETE rather than IDLE */ \
+		(ctx)->error = HASH_CTX_ERROR_NONE; \
+		(ctx)->status = HASH_CTX_STS_COMPLETE; \
+	} while (0)
+
+/* Hash Constants and Typedefs */
+#define SHA512_DIGEST_LENGTH          8	/* presumably 64-bit words - confirm at users */
+#define SHA512_LOG2_BLOCK_SIZE        7	/* 1 << 7 == 128 */
+/* presumably the byte size of the length field in the final padding - confirm */
+#define SHA512_PADLENGTHFIELD_SIZE    16
+
+#ifdef SHA_MB_DEBUG	/* debug build: a failed assert() only logs, it does not stop */
+#define assert(expr) \
+do { \
+	if (unlikely(!(expr))) { \
+		printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
+		#expr, __FILE__, __func__, __LINE__); \
+	} \
+} while (0)
+#else	/* !SHA_MB_DEBUG: assert() expands to nothing and never evaluates expr */
+#define assert(expr) do {} while (0)
+#endif	/* SHA_MB_DEBUG */
+
+struct sha512_ctx_mgr {	/* thin wrapper around the lane manager state */
+	struct sha512_mb_mgr mgr;	/* declared in sha512_mb_mgr.h */
+};
+
+/* Hash context: a job_sha512 plus status/error and input bookkeeping. */
+
+struct sha512_hash_ctx {
+	/* Must be at struct offset 0 (presumably so a job pointer maps to its ctx) */
+	struct job_sha512       job;
+	/* status flag (HASH_CTX_STS_* values) */
+	int status;
+	/* error flag (enum hash_ctx_error values) */
+	int error;
+
+	uint32_t        total_length;
+	const void      *incoming_buffer;
+	uint32_t        incoming_buffer_length;
+	uint8_t         partial_block_buffer[SHA512_BLOCK_SIZE * 2];
+	uint32_t        partial_block_buffer_length;
+	void            *user_data;
+};
+
+#endif
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr.h b/arch/x86/crypto/sha512-mb/sha512_mb_mgr.h
new file mode 100644
index 000000000000..178f17eef382
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr.h
@@ -0,0 +1,104 @@
+/*
+ * Header file for multi buffer SHA512 algorithm manager
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __SHA_MB_MGR_H
+#define __SHA_MB_MGR_H
+
+#include <linux/types.h>
+
+#define NUM_SHA512_DIGEST_WORDS 8	/* 64-bit words per digest */
+
+enum job_sts {STS_UNKNOWN = 0,	/* 0..2 are repeated as #defines for the asm */
+	STS_BEING_PROCESSED = 1,
+	STS_COMPLETED =       2,	/* stored by the flush/get_comp_job asm */
+	STS_INTERNAL_ERROR = 3,
+	STS_ERROR = 4
+};
+
+struct job_sha512 {	/* layout mirrored by JOB_SHA512 in sha512_mb_mgr_datastruct.S */
+	u8  *buffer;
+	u64  len;
+	u64  result_digest[NUM_SHA512_DIGEST_WORDS] __aligned(32);	/* filled by asm */
+	enum job_sts status;	/* asm stores STS_COMPLETED when the job is returned */
+	void   *user_data;
+};
+
+struct sha512_args_x4 {	/* per-lane arguments; mirrored by SHA512_ARGS_X4 in the asm */
+	uint64_t        digest[8][4];	/* transposed: digest[word][lane] */
+	uint8_t         *data_ptr[4];	/* one data pointer per lane */
+};
+
+struct sha512_lane_data {
+	struct job_sha512 *job_in_lane;	/* NULL while the lane is free */
+};
+
+struct sha512_mb_mgr {	/* layout mirrored by MB_MGR in sha512_mb_mgr_datastruct.S */
+	struct sha512_args_x4 args;
+	/* per lane: low bits = lane index; asm sets bits 32-63 to ones on free */
+	uint64_t lens[4];
+
+	/* each byte is index (0...3) of unused lanes */
+	/* byte 4 is set to FF as a flag */
+	uint64_t unused_lanes;
+	struct sha512_lane_data ldata[4];
+};
+
+#define SHA512_MB_MGR_NUM_LANES_AVX2 4
+/* flush and get_comp_job return a finished job, or NULL if there is none */
+void sha512_mb_mgr_init_avx2(struct sha512_mb_mgr *state);
+struct job_sha512 *sha512_mb_mgr_submit_avx2(struct sha512_mb_mgr *state,
+						struct job_sha512 *job);
+struct job_sha512 *sha512_mb_mgr_flush_avx2(struct sha512_mb_mgr *state);
+struct job_sha512 *sha512_mb_mgr_get_comp_job_avx2(struct sha512_mb_mgr *state);
+
+#endif
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_datastruct.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_datastruct.S
new file mode 100644
index 000000000000..cf2636d4c9ba
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_datastruct.S
@@ -0,0 +1,281 @@
+/*
+ * Header file for multi buffer SHA512 algorithm data structure
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      Megha Dey <megha.dey@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2016 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+# Macros for defining data structures
+
+# Usage example
+
+#START_FIELDS   # JOB_AES
+###     name            size    align
+#FIELD  _plaintext,     8,      8       # pointer to plaintext
+#FIELD  _ciphertext,    8,      8       # pointer to ciphertext
+#FIELD  _IV,            16,     8       # IV
+#FIELD  _keys,          8,      8       # pointer to keys
+#FIELD  _len,           4,      4       # length in bytes
+#FIELD  _status,        4,      4       # status enumeration
+#FIELD  _user_data,     8,      8       # pointer to user data
+#UNION  _union,         size1,  align1, \
+#                       size2,  align2, \
+#                       size3,  align3, \
+#                       ...
+#END_FIELDS
+#%assign _JOB_AES_size  _FIELD_OFFSET
+#%assign _JOB_AES_align _STRUCT_ALIGN
+
+#########################################################################
+
+# Alternate "struc-like" syntax:
+#       STRUCT job_aes2
+#       RES_Q   .plaintext,     1
+#       RES_Q   .ciphertext,    1
+#       RES_DQ  .IV,            1
+#       RES_B   .nested,        _JOB_AES_SIZE, _JOB_AES_ALIGN
+#       RES_U   .union,         size1, align1, \
+#                               size2, align2, \
+#                               ...
+#       ENDSTRUCT
+#       # Following only needed if nesting
+#       %assign job_aes2_size   _FIELD_OFFSET
+#       %assign job_aes2_align  _STRUCT_ALIGN
+#
+# RES_* macros take a name, a count and an optional alignment.
+# The count is in terms of the base size of the macro, and the
+# default alignment is the base size.
+# The macros are:
+# Macro    Base size
+# RES_B     1
+# RES_W     2
+# RES_D     4
+# RES_Q     8
+# RES_DQ   16
+# RES_Y    32
+# RES_Z    64
+#
+# RES_U defines a union. Its arguments are a name and two or more
+# pairs of "size, alignment"
+#
+# The two assigns are only needed if this structure is being nested
+# within another. Even if the assigns are not done, one can still use
+# STRUCT_NAME_size as the size of the structure.
+#
+# Note that for nesting, you still need to assign to STRUCT_NAME_size.
+#
+# The differences between this and using "struc" directly are that each
+# type is implicitly aligned to its natural length (although this can be
+# over-ridden with an explicit third parameter), and that the structure
+# is padded at the end to its overall alignment.
+#
+
+#########################################################################
+
+#ifndef _DATASTRUCT_ASM_
+#define _DATASTRUCT_ASM_
+
+#define PTR_SZ                  8
+#define SHA512_DIGEST_WORD_SIZE 8	/* bytes per digest word */
+#define SHA512_MB_MGR_NUM_LANES_AVX2 4	/* same value as in sha512_mb_mgr.h */
+#define NUM_SHA512_DIGEST_WORDS 8	/* same value as in sha512_mb_mgr.h */
+#define SZ4                     4*SHA512_DIGEST_WORD_SIZE	/* 32; unparenthesized */
+#define ROUNDS                  80*SZ4
+#define SHA512_DIGEST_ROW_SIZE  (SHA512_MB_MGR_NUM_LANES_AVX2 * 8)	/* 32 */
+
+# START_FIELDS: begin a new layout, resetting the running offset and alignment
+.macro START_FIELDS
+ _FIELD_OFFSET = 0
+ _STRUCT_ALIGN = 0
+.endm
+
+# FIELD name size align: name = next offset rounded up to align (a power of 2)
+.macro FIELD name size align
+ _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1))
+ \name  = _FIELD_OFFSET
+ _FIELD_OFFSET = _FIELD_OFFSET + (\size)
+.if (\align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = \align
+.endif
+.endm
+
+# END_FIELDS: round the total size up to the largest field alignment seen
+.macro END_FIELDS
+ _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
+.endm
+
+.macro STRUCT p1	/* START_FIELDS followed by .struc on the argument */
+START_FIELDS
+.struc \p1
+.endm
+
+.macro ENDSTRUCT	/* NOTE(review): not invoked in this file; verify before use */
+ tmp = _FIELD_OFFSET
+ END_FIELDS
+ tmp = (_FIELD_OFFSET - ##tmp)
+.if (tmp > 0)	/* NOTE(review): no matching .endif before .endm */
+        .lcomm  tmp
+.endm
+
+## RES_int name size align (NOTE(review): the align line puts a dot before p3 - confirm)
+.macro RES_int p1 p2 p3
+ name = \p1
+ size = \p2
+ align = .\p3
+
+ _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1))
+.align align
+.lcomm name size
+ _FIELD_OFFSET = _FIELD_OFFSET + (size)
+.if (align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = align
+.endif
+.endm
+
+# macro RES_B name, size [, align]: 1-byte units. NOTE(review): bodies use args without a backslash - verify
+.macro RES_B _name, _size, _align=1
+RES_int _name _size _align
+.endm
+
+# macro RES_W name, size [, align]: size counted in 2-byte units
+.macro RES_W _name, _size, _align=2
+RES_int _name 2*(_size) _align
+.endm
+
+# macro RES_D name, size [, align]: size counted in 4-byte units
+.macro RES_D _name, _size, _align=4
+RES_int _name 4*(_size) _align
+.endm
+
+# macro RES_Q name, size [, align]: size counted in 8-byte units
+.macro RES_Q _name, _size, _align=8
+RES_int _name 8*(_size) _align
+.endm
+
+# macro RES_DQ name, size [, align]: size counted in 16-byte units
+.macro RES_DQ _name, _size, _align=16
+RES_int _name 16*(_size) _align
+.endm
+
+# macro RES_Y name, size [, align]: size counted in 32-byte units
+.macro RES_Y _name, _size, _align=32
+RES_int _name 32*(_size) _align
+.endm
+
+# macro RES_Z name, size [, align]: size counted in 64-byte units
+.macro RES_Z _name, _size, _align=64
+RES_int _name 64*(_size) _align
+.endm
+
+#endif
+
+###################################################################
+### Define SHA512 Out Of Order Data Structures
+###################################################################
+
+START_FIELDS    # LANE_DATA, matches C struct sha512_lane_data
+###     name            size    align
+FIELD   _job_in_lane,   8,      8       # pointer to job object
+END_FIELDS
+
+ _LANE_DATA_size = _FIELD_OFFSET
+ _LANE_DATA_align = _STRUCT_ALIGN
+
+####################################################################
+
+START_FIELDS    # SHA512_ARGS_X4, matches C struct sha512_args_x4
+###     name            size    align
+FIELD   _digest,        8*8*4,  4      # transposed digest: 8 rows of 4 lanes x 8 bytes
+FIELD   _data_ptr,      8*4,    8       # array of pointers to data
+END_FIELDS
+
+ _SHA512_ARGS_X4_size  =  _FIELD_OFFSET
+ _SHA512_ARGS_X4_align =  _STRUCT_ALIGN
+
+#####################################################################
+
+START_FIELDS    # MB_MGR, matches C struct sha512_mb_mgr
+###     name            size    align
+FIELD   _args,          _SHA512_ARGS_X4_size, _SHA512_ARGS_X4_align
+FIELD   _lens,          8*4,    8       # one 64-bit entry per lane
+FIELD   _unused_lanes,  8,      8
+FIELD   _ldata,         _LANE_DATA_size*4, _LANE_DATA_align
+END_FIELDS
+
+ _MB_MGR_size  =  _FIELD_OFFSET
+ _MB_MGR_align =  _STRUCT_ALIGN
+# offsets of the nested SHA512_ARGS_X4 members from the start of MB_MGR
+_args_digest = _args + _digest
+_args_data_ptr = _args + _data_ptr
+
+#######################################################################
+
+#######################################################################
+#### Define constants
+#######################################################################
+
+#define STS_UNKNOWN             0	/* same values as enum job_sts in sha512_mb_mgr.h */
+#define STS_BEING_PROCESSED     1
+#define STS_COMPLETED           2
+
+#######################################################################
+#### Define JOB_SHA512 structure
+#######################################################################
+
+START_FIELDS    # JOB_SHA512, matches C struct job_sha512
+###     name                            size    align
+FIELD   _buffer,                        8,      8       # pointer to buffer
+FIELD   _len,                           8,      8       # length in bytes
+FIELD   _result_digest,                 8*8,    32      # Digest (output)
+FIELD   _status,                        4,      4       # STS_* value
+FIELD   _user_data,                     8,      8       # pointer to user data
+END_FIELDS
+
+ _JOB_SHA512_size = _FIELD_OFFSET
+ _JOB_SHA512_align = _STRUCT_ALIGN
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S
new file mode 100644
index 000000000000..3ddba19a0db6
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S
@@ -0,0 +1,291 @@
+/*
+ * Flush routine for SHA512 multibuffer
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ *     Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha512_mb_mgr_datastruct.S"
+
+.extern sha512_x4_avx2
+
+# LINUX register definitions (first two integer arguments)
+#define arg1    %rdi
+#define arg2    %rsi
+
+# idx needs to be other than arg1, arg2, rbx, r12
+#define idx     %rdx
+
+# Common definitions. Several names below share one register (rbx, rax, arg1, arg2, r8).
+#define state   arg1
+#define job     arg2
+#define len2    arg2
+
+#define unused_lanes    %rbx
+#define lane_data       %rbx
+#define tmp2            %rbx
+
+#define job_rax         %rax
+#define tmp1            %rax
+#define size_offset     %rax
+#define tmp             %rax
+#define start_offset    %rax
+
+#define tmp3            arg1
+
+#define extra_blocks    arg2
+#define p               arg2
+
+#define tmp4            %r8
+#define lens0           %r8
+
+#define lens1           %r9
+#define lens2           %r10
+#define lens3           %r11
+
+.macro LABEL prefix n
+\prefix\n\():	/* defines the label formed by prefix followed by n */
+.endm
+
+.macro JNE_SKIP i
+jne     skip_\i	/* target is the skip_ label with the same number */
+.endm
+
+.altmacro
+.macro SET_OFFSET _offset
+offset = \_offset
+.endm
+.noaltmacro
+
+# JOB* sha512_mb_mgr_flush_avx2(MB_MGR *state)
+# arg 1 : rdi : state
+ENTRY(sha512_mb_mgr_flush_avx2)
+	FRAME_BEGIN
+	push	%rbx
+
+	# If bit (32+7) is set, then all lanes are empty
+	mov     _unused_lanes(state), unused_lanes
+        bt      $32+7, unused_lanes
+        jc      return_null
+
+        # find a lane with a non-null job (default 0; the highest busy lane wins)
+	xor     idx, idx
+        offset = (_ldata + 1*_LANE_DATA_size + _job_in_lane)
+        cmpq    $0, offset(state)
+        cmovne  one(%rip), idx
+        offset = (_ldata + 2*_LANE_DATA_size + _job_in_lane)
+        cmpq    $0, offset(state)
+        cmovne  two(%rip), idx
+        offset = (_ldata + 3*_LANE_DATA_size + _job_in_lane)
+        cmpq    $0, offset(state)
+        cmovne  three(%rip), idx
+
+        # copy lane idx data pointer to empty lanes and set their upper lens half to all ones
+copy_lane_data:
+	offset =  (_args + _data_ptr)
+        mov     offset(state,idx,8), tmp
+
+        I = 0
+.rep 4
+	offset =  (_ldata + I * _LANE_DATA_size + _job_in_lane)
+        cmpq    $0, offset(state)
+.altmacro
+        JNE_SKIP %I
+        offset =  (_args + _data_ptr + 8*I)
+        mov     tmp, offset(state)
+        offset =  (_lens + 8*I +4)
+        movl    $0xFFFFFFFF, offset(state)
+LABEL skip_ %I
+        I = (I+1)
+.noaltmacro
+.endr
+
+        # Find min length (unsigned compare over the four 64-bit lens entries)
+        mov     _lens + 0*8(state),lens0
+        mov     lens0,idx
+        mov     _lens + 1*8(state),lens1
+        cmp     idx,lens1
+        cmovb   lens1,idx
+        mov     _lens + 2*8(state),lens2
+        cmp     idx,lens2
+        cmovb   lens2,idx
+        mov     _lens + 3*8(state),lens3
+        cmp     idx,lens3
+        cmovb   lens3,idx
+        mov     idx,len2
+        and     $0xF,idx	# idx = lane number kept in the low bits
+        and     $~0xFF,len2
+	jz      len_is_0
+
+        sub     len2, lens0
+        sub     len2, lens1
+        sub     len2, lens2
+        sub     len2, lens3
+        shr     $32,len2
+        mov     lens0, _lens + 0*8(state)
+        mov     lens1, _lens + 1*8(state)
+        mov     lens2, _lens + 2*8(state)
+        mov     lens3, _lens + 3*8(state)
+
+        # "state" and "args" are the same address, arg1
+        # len is arg2
+        call    sha512_x4_avx2
+        # state and idx are intact
+
+len_is_0:
+        # process completed job "idx"
+	imul    $_LANE_DATA_size, idx, lane_data
+        lea     _ldata(state, lane_data), lane_data
+
+        mov     _job_in_lane(lane_data), job_rax
+        movq    $0,  _job_in_lane(lane_data)
+        movl    $STS_COMPLETED, _status(job_rax)
+        mov     _unused_lanes(state), unused_lanes
+        shl     $8, unused_lanes	# add idx as the new lowest byte of unused_lanes
+        or      idx, unused_lanes
+        mov     unused_lanes, _unused_lanes(state)
+
+	movl    $0xFFFFFFFF, _lens+4(state,  idx, 8)
+
+	vmovq _args_digest+0*32(state, idx, 8), %xmm0	# gather the 8 digest words of lane idx
+        vpinsrq $1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0
+	vmovq _args_digest+2*32(state, idx, 8), %xmm1
+        vpinsrq $1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1
+	vmovq _args_digest+4*32(state, idx, 8), %xmm2
+        vpinsrq $1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2
+	vmovq _args_digest+6*32(state, idx, 8), %xmm3
+	vpinsrq $1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3
+
+	vmovdqu %xmm0, _result_digest(job_rax)
+	vmovdqu %xmm1, _result_digest+1*16(job_rax)
+	vmovdqu %xmm2, _result_digest+2*16(job_rax)
+	vmovdqu %xmm3, _result_digest+3*16(job_rax)
+
+return:
+	pop	%rbx
+	FRAME_END
+        ret
+
+return_null:
+        xor     job_rax, job_rax
+        jmp     return
+ENDPROC(sha512_mb_mgr_flush_avx2)
+.align 16
+
+ENTRY(sha512_mb_mgr_get_comp_job_avx2)	/* JOB* (MB_MGR *state): finished job or NULL */
+        push    %rbx
+
+	mov     _unused_lanes(state), unused_lanes
+        bt      $(32+7), unused_lanes	# bit set: all lanes are empty
+        jc      .return_null
+
+        # Find min length (unsigned compare over the four 64-bit lens entries)
+        mov     _lens(state),lens0
+        mov     lens0,idx
+        mov     _lens+1*8(state),lens1
+        cmp     idx,lens1
+        cmovb   lens1,idx
+        mov     _lens+2*8(state),lens2
+        cmp     idx,lens2
+        cmovb   lens2,idx
+        mov     _lens+3*8(state),lens3
+        cmp     idx,lens3
+        cmovb   lens3,idx
+        test    $~0xF,idx	# any bit above the lane number set: no lane is finished
+        jnz     .return_null
+        and     $0xF,idx
+
+        #process completed job "idx"
+	imul    $_LANE_DATA_size, idx, lane_data
+        lea     _ldata(state, lane_data), lane_data
+
+        mov     _job_in_lane(lane_data), job_rax
+        movq    $0,  _job_in_lane(lane_data)
+        movl    $STS_COMPLETED, _status(job_rax)
+        mov     _unused_lanes(state), unused_lanes
+        shl     $8, unused_lanes	# add idx as the new lowest byte of unused_lanes
+        or      idx, unused_lanes
+        mov     unused_lanes, _unused_lanes(state)
+
+        movl    $0xFFFFFFFF, _lens+4(state,  idx, 8)
+
+	vmovq   _args_digest(state, idx, 8), %xmm0	# gather the 8 digest words of lane idx
+        vpinsrq $1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0
+	vmovq    _args_digest+2*32(state, idx, 8), %xmm1
+        vpinsrq $1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1
+	vmovq    _args_digest+4*32(state, idx, 8), %xmm2
+        vpinsrq $1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2
+        vmovq    _args_digest+6*32(state, idx, 8), %xmm3
+        vpinsrq $1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3
+
+	vmovdqu %xmm0, _result_digest+0*16(job_rax)
+	vmovdqu %xmm1, _result_digest+1*16(job_rax)
+	vmovdqu %xmm2, _result_digest+2*16(job_rax)
+	vmovdqu %xmm3, _result_digest+3*16(job_rax)
+
+	pop     %rbx
+
+        ret
+
+.return_null:
+        xor     job_rax, job_rax
+	pop     %rbx
+        ret
+ENDPROC(sha512_mb_mgr_get_comp_job_avx2)
+.section .rodata	/* only read, as cmovne memory operands (cmov takes no immediate) */
+
+.align 16
+one:
+.quad  1
+two:
+.quad  2
+three:
+.quad  3
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c
new file mode 100644
index 000000000000..36870b26067a
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c
@@ -0,0 +1,67 @@
+/*
+ * Initialization code for multi buffer SHA512 algorithm for AVX2
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ *     Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "sha512_mb_mgr.h"
+
+/* Put @state into the "all four lanes free" state. */
+void sha512_mb_mgr_init_avx2(struct sha512_mb_mgr *state)
+{
+	unsigned int j;
+
+	/* free lane: bits 32-63 of lens[] all ones, so min-lens scans skip it */
+	for (j = 0; j < 4; j++) {
+		state->lens[j] = 0xFFFFFFFF00000000ULL | j;
+		state->ldata[j].job_in_lane = NULL;
+	}
+	state->unused_lanes = 0xFF03020100;
+}
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S
new file mode 100644
index 000000000000..815f07bdd1f8
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S
@@ -0,0 +1,222 @@
+/*
+ * Buffer submit code for multi buffer SHA512 algorithm
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ *     Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha512_mb_mgr_datastruct.S"
+
+.extern sha512_x4_avx2
+
+# Register aliases. Several names deliberately map to the same register;
+# two aliases of one register must never be live at the same time.
+
+# first and second function argument
+#define arg1    %rdi
+#define arg2    %rsi
+
+#define idx             %rdx
+#define last_len        %rdx
+
+#define size_offset     %rcx
+#define tmp2            %rcx
+
+# Common definitions
+# job, len2 and p2 all live in arg2: once len2 is computed the job pointer
+# in that register is gone
+#define state   arg1
+#define job     arg2
+#define len2    arg2
+#define p2      arg2
+
+#define p               %r11
+#define start_offset    %r11
+
+#define unused_lanes    %rbx
+
+# job_rax doubles as the return value register
+#define job_rax         %rax
+#define len             %rax
+
+#define lane            %r12
+#define tmp3            %r12
+#define lens3           %r12
+
+#define extra_blocks    %r8
+#define lens0           %r8
+
+#define tmp             %r9
+#define lens1           %r9
+
+#define lane_data       %r10
+#define lens2           %r10
+
+# low 32 bits of len
+#define DWORD_len %eax
+
+# JOB* sha512_mb_mgr_submit_avx2(MB_MGR *state, JOB *job)
+# arg 1 : rdi : state
+# arg 2 : rsi : job
+#
+# Puts job into the next free lane. While free lanes remain afterwards the
+# routine returns NULL without hashing. Once the last free lane has been
+# taken it hashes all lanes for the smallest pending length (skipped when
+# that length is zero), frees the lane that owned that length and returns
+# its job with status STS_COMPLETED and the digest copied back into it.
+ENTRY(sha512_mb_mgr_submit_avx2)
+	FRAME_BEGIN
+	push	%rbx
+	push	%r12
+
+	# Pop the next free lane index from the low byte of unused_lanes,
+	# locate the data of that lane and mark the job as being processed
+        mov     _unused_lanes(state), unused_lanes
+        movzb     %bl,lane
+        shr     $8, unused_lanes
+        imul    $_LANE_DATA_size, lane,lane_data
+        movl    $STS_BEING_PROCESSED, _status(job)
+	lea     _ldata(state, lane_data), lane_data
+        mov     unused_lanes, _unused_lanes(state)
+        movl    _len(job),  DWORD_len
+
+	# Record the job in its lane; the length goes into the upper 32 bits
+	# of the 8-byte lens entry of that lane
+	mov     job, _job_in_lane(lane_data)
+        movl    DWORD_len,_lens+4(state , lane, 8)
+
+	# Load digest words from result_digest
+	vmovdqu	_result_digest+0*16(job), %xmm0
+	vmovdqu _result_digest+1*16(job), %xmm1
+	vmovdqu	_result_digest+2*16(job), %xmm2
+        vmovdqu	_result_digest+3*16(job), %xmm3
+
+	# Scatter the eight digest words into the column of this lane in
+	# args_digest: rows are 32 bytes apart, each lane owns 8 bytes per row
+	vmovq    %xmm0, _args_digest(state, lane, 8)
+	vpextrq  $1, %xmm0, _args_digest+1*32(state , lane, 8)
+	vmovq    %xmm1, _args_digest+2*32(state , lane, 8)
+	vpextrq  $1, %xmm1, _args_digest+3*32(state , lane, 8)
+	vmovq    %xmm2, _args_digest+4*32(state , lane, 8)
+	vpextrq  $1, %xmm2, _args_digest+5*32(state , lane, 8)
+	vmovq    %xmm3, _args_digest+6*32(state , lane, 8)
+	vpextrq  $1, %xmm3, _args_digest+7*32(state , lane, 8)
+
+	# Hand the input buffer pointer of the job to the lane
+	mov     _buffer(job), p
+	mov     p, _args_data_ptr(state, lane, 8)
+
+	# Only the 0xFF terminator left means every lane is now occupied;
+	# otherwise return NULL and wait for more jobs
+	cmp     $0xFF, unused_lanes
+	jne     return_null
+
+start_loop:
+
+	# Find min length
+	# (unsigned minimum over the four 64-bit lens entries, kept in idx)
+	mov     _lens+0*8(state),lens0
+	mov     lens0,idx
+	mov     _lens+1*8(state),lens1
+	cmp     idx,lens1
+	cmovb   lens1, idx
+	mov     _lens+2*8(state),lens2
+	cmp     idx,lens2
+	cmovb   lens2,idx
+	mov     _lens+3*8(state),lens3
+	cmp     idx,lens3
+	cmovb   lens3,idx
+	# Split the minimum: idx keeps the low nibble (lane index), len2 keeps
+	# everything above the low byte (the length field). Skip hashing when
+	# that length is zero.
+	mov     idx,len2
+	and     $0xF,idx
+	and     $~0xFF,len2
+	jz      len_is_0
+
+	# Subtract the common length from every lane and store the results,
+	# then shift len2 down so it is a plain count for the call
+	sub     len2,lens0
+	sub     len2,lens1
+	sub     len2,lens2
+	sub     len2,lens3
+	shr     $32,len2
+	mov     lens0, _lens + 0*8(state)
+	mov     lens1, _lens + 1*8(state)
+	mov     lens2, _lens + 2*8(state)
+	mov     lens3, _lens + 3*8(state)
+
+	# "state" and "args" are the same address, arg1
+	# len is arg2
+	call    sha512_x4_avx2
+	# state and idx are intact
+
+len_is_0:
+
+	# process completed job "idx"
+	imul    $_LANE_DATA_size, idx, lane_data
+	lea     _ldata(state, lane_data), lane_data
+
+	# Detach the job from its lane, mark it completed and push the lane
+	# index back onto the low byte of unused_lanes
+	mov     _job_in_lane(lane_data), job_rax
+	mov     _unused_lanes(state), unused_lanes
+	movq    $0, _job_in_lane(lane_data)
+	movl    $STS_COMPLETED, _status(job_rax)
+	shl     $8, unused_lanes
+	or      idx, unused_lanes
+	mov     unused_lanes, _unused_lanes(state)
+
+	# Mark the lane idle: maximum length in the upper 32 bits keeps it
+	# from being picked by the min search. Then gather the digest column
+	# of the lane back into four xmm registers.
+	movl	$0xFFFFFFFF,_lens+4(state,idx,8)
+	vmovq    _args_digest+0*32(state , idx, 8), %xmm0
+	vpinsrq  $1, _args_digest+1*32(state , idx, 8), %xmm0, %xmm0
+	vmovq    _args_digest+2*32(state , idx, 8), %xmm1
+	vpinsrq  $1, _args_digest+3*32(state , idx, 8), %xmm1, %xmm1
+	vmovq    _args_digest+4*32(state , idx, 8), %xmm2
+	vpinsrq  $1, _args_digest+5*32(state , idx, 8), %xmm2, %xmm2
+	vmovq    _args_digest+6*32(state , idx, 8), %xmm3
+	vpinsrq  $1, _args_digest+7*32(state , idx, 8), %xmm3, %xmm3
+
+	# Copy the digest into the completed job, which is also the return
+	# value (job_rax is rax)
+	vmovdqu  %xmm0, _result_digest + 0*16(job_rax)
+	vmovdqu  %xmm1, _result_digest + 1*16(job_rax)
+	vmovdqu  %xmm2, _result_digest + 2*16(job_rax)
+	vmovdqu  %xmm3, _result_digest + 3*16(job_rax)
+
+return:
+	pop	%r12
+	pop	%rbx
+	FRAME_END
+	ret
+
+	# No job completed: return NULL
+return_null:
+	xor     job_rax, job_rax
+	jmp     return
+ENDPROC(sha512_mb_mgr_submit_avx2)
+# NOTE(review): H0..H7 below are not referenced anywhere in this file and
+# are not declared global, so they look like unused leftovers; confirm
+# before removing them.
+.data
+
+.align 16
+H0:     .int  0x6a09e667
+H1:     .int  0xbb67ae85
+H2:     .int  0x3c6ef372
+H3:     .int  0xa54ff53a
+H4:     .int  0x510e527f
+H5:     .int  0x9b05688c
+H6:     .int  0x1f83d9ab
+H7:     .int  0x5be0cd19
diff --git a/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S b/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S
new file mode 100644
index 000000000000..31ab1eff6413
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S
@@ -0,0 +1,529 @@
+/*
+ * Multi-buffer SHA512 algorithm hash compute routine
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ *     Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+# code to compute quad SHA512 using AVX2
+# use YMMs to tackle the larger digest size
+# outer calling routine takes care of save and restore of XMM registers
+# Logic designed/laid out by JDG
+
+# Register usage of sha512_x4_avx2, as implemented below:
+#   modified:    rax rbx rsi r8 r9 r10 r11 and ymm0-15
+#                (rbx is not restored here; the caller has to save it)
+#   saved on entry and restored on exit: r12 r13 r14 r15
+#   not written: rcx rdx rdi rbp
+# Stack must be aligned to 32 bytes before call
+
+#include <linux/linkage.h>
+#include "sha512_mb_mgr_datastruct.S"
+
+arg1 = %rdi
+arg2 = %rsi
+
+# Common definitions
+STATE = arg1
+INP_SIZE = arg2
+
+# IDX is the byte offset into every input buffer, ROUND the byte offset
+# into the constant table addressed by TBL
+IDX = %rax
+ROUND = %rbx
+TBL = %r8
+
+# data pointer of each of the four lanes
+inp0 = %r9
+inp1 = %r10
+inp2 = %r11
+inp3 = %r12
+
+# working variables a..h; ROTATE_ARGS renames them after every round
+a = %ymm0
+b = %ymm1
+c = %ymm2
+d = %ymm3
+e = %ymm4
+f = %ymm5
+g = %ymm6
+h = %ymm7
+
+# scratch registers of the round macros
+a0 = %ymm8
+a1 = %ymm9
+a2 = %ymm10
+
+# temporaries for loading and transposing the message; TT4 and TT5 share
+# their registers with a2 and a1
+TT0 = %ymm14
+TT1 = %ymm13
+TT2 = %ymm12
+TT3 = %ymm11
+TT4 = %ymm10
+TT5 = %ymm9
+
+# T1 shares its register with TT0. TMP is the scratch register of PRORQ and
+# PRORQ_nd and also holds the byte-flip mask while the message is loaded.
+T1 = %ymm14
+TMP = %ymm15
+
+# Define stack usage
+# offsets 0 .. SZ4*16: sixteen message schedule slots of SZ4 bytes each
+# offset _digest:      copy of the digest rows taken at the start of a block
+STACK_SPACE1 = SZ4*16 + NUM_SHA512_DIGEST_WORDS*SZ4 + 24
+
+#define VMOVPD	vmovupd
+_digest = SZ4*16
+
+# TRANSPOSE r0, r1, r2, r3, t0, t1
+# 4x4 transpose of the 64-bit words held in {r0..r3}, using temps {t0, t1}
+# Input looks like: {r0 r1 r2 r3}, highest word on the left
+# r0 = {a3 a2 a1 a0}
+# r1 = {b3 b2 b1 b0}
+# r2 = {c3 c2 c1 c0}
+# r3 = {d3 d2 d1 d0}
+#
+# output looks like: {t0 r1 r0 r3}
+# t0 = {d0 c0 b0 a0}
+# r1 = {d1 c1 b1 a1}
+# r0 = {d2 c2 b2 a2}
+# r3 = {d3 c3 b3 a3}
+# r2 and t1 are left holding intermediate values
+
+.macro TRANSPOSE r0 r1 r2 r3 t0 t1
+	vshufps  $0x44, \r1, \r0, \t0 # t0 = {b2 a2   b0 a0}
+        vshufps  $0xEE, \r1, \r0, \r0 # r0 = {b3 a3   b1 a1}
+        vshufps  $0x44, \r3, \r2, \t1 # t1 = {d2 c2   d0 c0}
+        vshufps  $0xEE, \r3, \r2, \r2 # r2 = {d3 c3   d1 c1}
+
+	vperm2f128      $0x20, \r2, \r0, \r1  # r1 = {d1 c1 b1 a1}
+        vperm2f128      $0x31, \r2, \r0, \r3  # r3 = {d3 c3 b3 a3}
+        vperm2f128      $0x31, \t1, \t0, \r0  # r0 = {d2 c2 b2 a2}
+        vperm2f128      $0x20, \t1, \t0, \t0  # t0 = {d0 c0 b0 a0}
+.endm
+
+# Rotate the assembler symbols a..h by one position: the new a is the old h,
+# the new b is the old a, and so on. Only symbol values change; no
+# instruction is emitted, so the next round can reuse the same names.
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+# _PRORQ reg, imm, tmp
+# packed rotate right of every 64-bit element of reg by imm bits, in place
+# does a rotate by doing two shifts and an or; tmp is overwritten
+.macro _PRORQ reg imm tmp
+	vpsllq	$(64-\imm),\reg,\tmp
+	vpsrlq	$\imm,\reg, \reg
+	vpor	\tmp,\reg, \reg
+.endm
+
+# non-destructive
+# _PRORQ_nd reg, imm, tmp, src
+# reg = src rotated right by imm bits; src is left unchanged, tmp is
+# overwritten
+.macro _PRORQ_nd reg imm tmp src
+	vpsllq	$(64-\imm), \src, \tmp
+	vpsrlq	$\imm, \src, \reg
+	vpor	\tmp, \reg, \reg
+.endm
+
+# PRORQ dst/src, amt
+# in-place rotate using TMP as the scratch register
+.macro PRORQ reg imm
+	_PRORQ	\reg, \imm, TMP
+.endm
+
+# PRORQ_nd dst, src, amt
+# the second parameter is named tmp but is the source operand; TMP is the
+# scratch register
+.macro PRORQ_nd reg tmp imm
+	_PRORQ_nd	\reg, \imm, TMP, \tmp
+.endm
+
+#; One round for all four lanes.
+#; _T1 holds the message word W of this round on entry and is overwritten;
+#; i is the round index and selects the schedule slot (i & 0xf) that W is
+#; stored to. The working variables are the assembler symbols a...h, which
+#; are renamed by ROTATE_ARGS at the end. Uses a0, a1, a2 and TMP as scratch
+#; and advances ROUND by SZ4.
+.macro ROUND_00_15 _T1 i
+	PRORQ_nd a0, e, (18-14)	# sig1: a0 = (e ror 4)
+
+	vpxor   g, f, a2        # ch: a2 = f^g
+        vpand   e,a2, a2                # ch: a2 = (f^g)&e
+        vpxor   g, a2, a2               # a2 = ch
+
+        PRORQ_nd        a1,e,41         # sig1: a1 = (e ror 41)
+
+        offset = SZ4*(\i & 0xf)
+        vmovdqu \_T1,offset(%rsp)
+        vpaddq  (TBL,ROUND,1), \_T1, \_T1       # T1 = W + K
+        vpxor   e,a0, a0        # sig1: a0 = e ^ (e ror 4)
+        PRORQ   a0, 14           # sig1: a0 = (e ror 14) ^ (e ror 18)
+        vpaddq  a2, h, h        # h = h + ch
+        PRORQ_nd        a2,a,6  # sig0: a2 = (a ror 6)
+        vpaddq  \_T1,h, h       # h = h + ch + W + K
+        vpxor   a1, a0, a0      # a0 = sigma1
+	vmovdqu a,\_T1
+        PRORQ_nd        a1,a,39 # sig0: a1 = (a ror 39)
+        vpxor   c, \_T1, \_T1      # maj: T1 = a^c
+        add     $SZ4, ROUND     # ROUND advances to the next table row
+        vpand   b, \_T1, \_T1   # maj: T1 = (a^c)&b
+        vpaddq  a0, h, h        # h = h + ch + W + K + sigma1
+        vpaddq  h, d, d         # d = d + h
+        vpxor   a, a2, a2       # sig0: a2 = a ^ (a ror 6)
+        PRORQ   a2,28            # sig0: a2 = (a ror 28) ^ (a ror 34)
+        vpxor   a1, a2, a2      # a2 = sig0
+        vpand   c, a, a1        # maj: a1 = a&c
+        vpor    \_T1, a1, a1    # a1 = maj
+        vpaddq  a1, h, h        # h = h + ch + W + K + sigma1 + maj
+        vpaddq  a2, h, h        # h = h + ch + W + K + sigma1 + maj + sigma0
+        ROTATE_ARGS
+.endm
+
+
+#; Message schedule step followed by one round, for all four lanes.
+#; Builds the new message word in _T1 from the sixteen schedule slots kept
+#; on the stack (slot index is the word index & 0xf) and then runs
+#; ROUND_00_15, which also stores the new word into slot (i & 0xf).
+#; Uses a0, a1, a2 and TMP as scratch.
+.macro ROUND_16_XX _T1 i
+	vmovdqu SZ4*((\i-15)&0xf)(%rsp), \_T1	# T1 = W[i-15]
+        vmovdqu SZ4*((\i-2)&0xf)(%rsp), a1	# a1 = W[i-2]
+        vmovdqu \_T1, a0			# a0 = W[i-15]
+        PRORQ   \_T1,7				# T1 = x ror 7, x = W[i-15]
+        vmovdqu a1, a2				# a2 = W[i-2]
+        PRORQ   a1,42				# a1 = y ror 42, y = W[i-2]
+        vpxor   a0, \_T1, \_T1			# T1 = (x ror 7) ^ x
+        PRORQ   \_T1, 1				# T1 = (x ror 8) ^ (x ror 1)
+        vpxor   a2, a1, a1			# a1 = (y ror 42) ^ y
+        PRORQ   a1, 19				# a1 = (y ror 61) ^ (y ror 19)
+        vpsrlq  $7, a0, a0			# a0 = x >> 7
+        vpxor   a0, \_T1, \_T1			# T1 = (x ror 1) ^ (x ror 8) ^ (x >> 7)
+        vpsrlq  $6, a2, a2			# a2 = y >> 6
+        vpxor   a2, a1, a1			# a1 = (y ror 19) ^ (y ror 61) ^ (y >> 6)
+        vpaddq  SZ4*((\i-16)&0xf)(%rsp), \_T1, \_T1	# T1 += W[i-16]
+        vpaddq  SZ4*((\i-7)&0xf)(%rsp), a1, a1	# a1 += W[i-7]
+        vpaddq  a1, \_T1, \_T1			# T1 = new message word W[i]
+
+        ROUND_00_15 \_T1,\i
+.endm
+
+
+# void sha512_x4_avx2(void *STATE, const int INP_SIZE)
+# arg 1 : STATE    : pointer to the args structure, which holds the
+#                    transposed digest rows and, at _data_ptr, the data
+#                    pointer of each of the four lanes
+# arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+#
+# Hashes INP_SIZE blocks of 128 bytes from each of the four lanes. On return
+# the digest rows in STATE are updated and every data pointer has been
+# advanced by the number of bytes consumed.
+ENTRY(sha512_x4_avx2)
+	# general registers preserved in outer calling routine
+	# outer calling routine saves all the XMM registers
+	# save callee-saved clobbered registers to comply with C function ABI
+	# NOTE(review): ROUND (rbx) is modified below but not saved here, and
+	# r13-r15 are saved although nothing below writes them
+	push    %r12
+	push    %r13
+	push    %r14
+	push    %r15
+
+	# room for the message schedule slots and the saved digest
+	sub     $STACK_SPACE1, %rsp
+
+        # Load the pre-transposed incoming digest.
+        vmovdqu 0*SHA512_DIGEST_ROW_SIZE(STATE),a
+        vmovdqu 1*SHA512_DIGEST_ROW_SIZE(STATE),b
+        vmovdqu 2*SHA512_DIGEST_ROW_SIZE(STATE),c
+        vmovdqu 3*SHA512_DIGEST_ROW_SIZE(STATE),d
+        vmovdqu 4*SHA512_DIGEST_ROW_SIZE(STATE),e
+        vmovdqu 5*SHA512_DIGEST_ROW_SIZE(STATE),f
+        vmovdqu 6*SHA512_DIGEST_ROW_SIZE(STATE),g
+        vmovdqu 7*SHA512_DIGEST_ROW_SIZE(STATE),h
+
+        lea     K512_4(%rip),TBL
+
+        # load the address of each of the 4 message lanes
+        # getting ready to transpose input onto stack
+        mov     _data_ptr+0*PTR_SZ(STATE),inp0
+        mov     _data_ptr+1*PTR_SZ(STATE),inp1
+        mov     _data_ptr+2*PTR_SZ(STATE),inp2
+        mov     _data_ptr+3*PTR_SZ(STATE),inp3
+
+        xor     IDX, IDX
+	# one pass of lloop per 128-byte block; ROUND restarts at the first
+	# table row for every block
+lloop:
+        xor     ROUND, ROUND
+
+	# save old digest
+        vmovdqu a, _digest(%rsp)
+        vmovdqu b, _digest+1*SZ4(%rsp)
+        vmovdqu c, _digest+2*SZ4(%rsp)
+        vmovdqu d, _digest+3*SZ4(%rsp)
+        vmovdqu e, _digest+4*SZ4(%rsp)
+        vmovdqu f, _digest+5*SZ4(%rsp)
+        vmovdqu g, _digest+6*SZ4(%rsp)
+        vmovdqu h, _digest+7*SZ4(%rsp)
+        i = 0
+	# Rounds 0-15. Each of the four passes loads 32 bytes from every lane,
+	# transposes them so that TT0..TT3 each hold one message word of all
+	# four lanes, reverses the byte order of every 64-bit word and runs
+	# four rounds. TMP is reloaded on every pass because the rounds use
+	# it as rotate scratch.
+.rep 4
+	vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), TMP
+        VMOVPD  i*32(inp0, IDX), TT2
+        VMOVPD  i*32(inp1, IDX), TT1
+        VMOVPD  i*32(inp2, IDX), TT4
+        VMOVPD  i*32(inp3, IDX), TT3
+	TRANSPOSE	TT2, TT1, TT4, TT3, TT0, TT5
+	vpshufb	TMP, TT0, TT0
+	vpshufb	TMP, TT1, TT1
+	vpshufb	TMP, TT2, TT2
+	vpshufb	TMP, TT3, TT3
+	ROUND_00_15	TT0,(i*4+0)
+	ROUND_00_15	TT1,(i*4+1)
+	ROUND_00_15	TT2,(i*4+2)
+	ROUND_00_15	TT3,(i*4+3)
+	i = (i+1)
+.endr
+	# the whole 128-byte block has been consumed
+        add     $128, IDX
+
+	# i was 4 after the loop above, so it is 16 from here on
+        i = (i*4)
+
+        jmp     Lrounds_16_xx
+.align 16
+	# Remaining rounds, sixteen per pass. ROUND grows by SZ4 per round and
+	# the loop ends once it reaches 0xa00, the size in bytes of the
+	# K512_4 table.
+Lrounds_16_xx:
+.rep 16
+        ROUND_16_XX     T1, i
+        i = (i+1)
+.endr
+        cmp     $0xa00,ROUND
+        jb      Lrounds_16_xx
+
+	# add old digest
+        vpaddq  _digest(%rsp), a, a
+        vpaddq  _digest+1*SZ4(%rsp), b, b
+        vpaddq  _digest+2*SZ4(%rsp), c, c
+        vpaddq  _digest+3*SZ4(%rsp), d, d
+        vpaddq  _digest+4*SZ4(%rsp), e, e
+        vpaddq  _digest+5*SZ4(%rsp), f, f
+        vpaddq  _digest+6*SZ4(%rsp), g, g
+        vpaddq  _digest+7*SZ4(%rsp), h, h
+
+        sub     $1, INP_SIZE  # unit is blocks
+        jne     lloop
+
+        # write back to memory (state object) the transposed digest
+        vmovdqu a, 0*SHA512_DIGEST_ROW_SIZE(STATE)
+        vmovdqu b, 1*SHA512_DIGEST_ROW_SIZE(STATE)
+        vmovdqu c, 2*SHA512_DIGEST_ROW_SIZE(STATE)
+        vmovdqu d, 3*SHA512_DIGEST_ROW_SIZE(STATE)
+        vmovdqu e, 4*SHA512_DIGEST_ROW_SIZE(STATE)
+        vmovdqu f, 5*SHA512_DIGEST_ROW_SIZE(STATE)
+        vmovdqu g, 6*SHA512_DIGEST_ROW_SIZE(STATE)
+        vmovdqu h, 7*SHA512_DIGEST_ROW_SIZE(STATE)
+
+	# update input data pointers
+	# (IDX is the total number of bytes consumed from every lane)
+	add     IDX, inp0
+        mov     inp0, _data_ptr+0*PTR_SZ(STATE)
+        add     IDX, inp1
+        mov     inp1, _data_ptr+1*PTR_SZ(STATE)
+        add     IDX, inp2
+        mov     inp2, _data_ptr+2*PTR_SZ(STATE)
+        add     IDX, inp3
+        mov     inp3, _data_ptr+3*PTR_SZ(STATE)
+
+	#;;;;;;;;;;;;;;;
+	#; Postamble
+	add $STACK_SPACE1, %rsp
+	# restore callee-saved clobbered registers
+
+	pop     %r15
+	pop     %r14
+	pop     %r13
+	pop     %r12
+
+	# outer calling routine restores XMM and other GP registers
+	ret
+ENDPROC(sha512_x4_avx2)
+
+.data
+.align 64
+# K512_4: SHA-512 round constants laid out for the 4-lane code. Each of the
+# 80 rows is 32 bytes and holds one 64-bit constant repeated four times, so
+# a single vpaddq adds it to all four lanes. Rows are addressed through
+# TBL + ROUND in ROUND_00_15.
+K512_4:
+	.octa 0x428a2f98d728ae22428a2f98d728ae22,\
+		0x428a2f98d728ae22428a2f98d728ae22
+	.octa 0x7137449123ef65cd7137449123ef65cd,\
+		0x7137449123ef65cd7137449123ef65cd
+	.octa 0xb5c0fbcfec4d3b2fb5c0fbcfec4d3b2f,\
+		0xb5c0fbcfec4d3b2fb5c0fbcfec4d3b2f
+	.octa 0xe9b5dba58189dbbce9b5dba58189dbbc,\
+		0xe9b5dba58189dbbce9b5dba58189dbbc
+	.octa 0x3956c25bf348b5383956c25bf348b538,\
+		0x3956c25bf348b5383956c25bf348b538
+	.octa 0x59f111f1b605d01959f111f1b605d019,\
+		0x59f111f1b605d01959f111f1b605d019
+	.octa 0x923f82a4af194f9b923f82a4af194f9b,\
+		0x923f82a4af194f9b923f82a4af194f9b
+	.octa 0xab1c5ed5da6d8118ab1c5ed5da6d8118,\
+		0xab1c5ed5da6d8118ab1c5ed5da6d8118
+	.octa 0xd807aa98a3030242d807aa98a3030242,\
+		0xd807aa98a3030242d807aa98a3030242
+	.octa 0x12835b0145706fbe12835b0145706fbe,\
+		0x12835b0145706fbe12835b0145706fbe
+	.octa 0x243185be4ee4b28c243185be4ee4b28c,\
+		0x243185be4ee4b28c243185be4ee4b28c
+	.octa 0x550c7dc3d5ffb4e2550c7dc3d5ffb4e2,\
+		0x550c7dc3d5ffb4e2550c7dc3d5ffb4e2
+	.octa 0x72be5d74f27b896f72be5d74f27b896f,\
+		0x72be5d74f27b896f72be5d74f27b896f
+	.octa 0x80deb1fe3b1696b180deb1fe3b1696b1,\
+		0x80deb1fe3b1696b180deb1fe3b1696b1
+	.octa 0x9bdc06a725c712359bdc06a725c71235,\
+		0x9bdc06a725c712359bdc06a725c71235
+	.octa 0xc19bf174cf692694c19bf174cf692694,\
+		0xc19bf174cf692694c19bf174cf692694
+	.octa 0xe49b69c19ef14ad2e49b69c19ef14ad2,\
+		0xe49b69c19ef14ad2e49b69c19ef14ad2
+	.octa 0xefbe4786384f25e3efbe4786384f25e3,\
+		0xefbe4786384f25e3efbe4786384f25e3
+	.octa 0x0fc19dc68b8cd5b50fc19dc68b8cd5b5,\
+		0x0fc19dc68b8cd5b50fc19dc68b8cd5b5
+	.octa 0x240ca1cc77ac9c65240ca1cc77ac9c65,\
+		0x240ca1cc77ac9c65240ca1cc77ac9c65
+	.octa 0x2de92c6f592b02752de92c6f592b0275,\
+		0x2de92c6f592b02752de92c6f592b0275
+	.octa 0x4a7484aa6ea6e4834a7484aa6ea6e483,\
+		0x4a7484aa6ea6e4834a7484aa6ea6e483
+	.octa 0x5cb0a9dcbd41fbd45cb0a9dcbd41fbd4,\
+		0x5cb0a9dcbd41fbd45cb0a9dcbd41fbd4
+	.octa 0x76f988da831153b576f988da831153b5,\
+		0x76f988da831153b576f988da831153b5
+	.octa 0x983e5152ee66dfab983e5152ee66dfab,\
+		0x983e5152ee66dfab983e5152ee66dfab
+	.octa 0xa831c66d2db43210a831c66d2db43210,\
+		0xa831c66d2db43210a831c66d2db43210
+	.octa 0xb00327c898fb213fb00327c898fb213f,\
+		0xb00327c898fb213fb00327c898fb213f
+	.octa 0xbf597fc7beef0ee4bf597fc7beef0ee4,\
+		0xbf597fc7beef0ee4bf597fc7beef0ee4
+	.octa 0xc6e00bf33da88fc2c6e00bf33da88fc2,\
+		0xc6e00bf33da88fc2c6e00bf33da88fc2
+	.octa 0xd5a79147930aa725d5a79147930aa725,\
+		0xd5a79147930aa725d5a79147930aa725
+	.octa 0x06ca6351e003826f06ca6351e003826f,\
+		0x06ca6351e003826f06ca6351e003826f
+	.octa 0x142929670a0e6e70142929670a0e6e70,\
+		0x142929670a0e6e70142929670a0e6e70
+	.octa 0x27b70a8546d22ffc27b70a8546d22ffc,\
+		0x27b70a8546d22ffc27b70a8546d22ffc
+	.octa 0x2e1b21385c26c9262e1b21385c26c926,\
+		0x2e1b21385c26c9262e1b21385c26c926
+	.octa 0x4d2c6dfc5ac42aed4d2c6dfc5ac42aed,\
+		0x4d2c6dfc5ac42aed4d2c6dfc5ac42aed
+	.octa 0x53380d139d95b3df53380d139d95b3df,\
+		0x53380d139d95b3df53380d139d95b3df
+	.octa 0x650a73548baf63de650a73548baf63de,\
+		0x650a73548baf63de650a73548baf63de
+	.octa 0x766a0abb3c77b2a8766a0abb3c77b2a8,\
+		0x766a0abb3c77b2a8766a0abb3c77b2a8
+	.octa 0x81c2c92e47edaee681c2c92e47edaee6,\
+		0x81c2c92e47edaee681c2c92e47edaee6
+	.octa 0x92722c851482353b92722c851482353b,\
+		0x92722c851482353b92722c851482353b
+	.octa 0xa2bfe8a14cf10364a2bfe8a14cf10364,\
+		0xa2bfe8a14cf10364a2bfe8a14cf10364
+	.octa 0xa81a664bbc423001a81a664bbc423001,\
+		0xa81a664bbc423001a81a664bbc423001
+	.octa 0xc24b8b70d0f89791c24b8b70d0f89791,\
+		0xc24b8b70d0f89791c24b8b70d0f89791
+	.octa 0xc76c51a30654be30c76c51a30654be30,\
+		0xc76c51a30654be30c76c51a30654be30
+	.octa 0xd192e819d6ef5218d192e819d6ef5218,\
+		0xd192e819d6ef5218d192e819d6ef5218
+	.octa 0xd69906245565a910d69906245565a910,\
+		0xd69906245565a910d69906245565a910
+	.octa 0xf40e35855771202af40e35855771202a,\
+		0xf40e35855771202af40e35855771202a
+	.octa 0x106aa07032bbd1b8106aa07032bbd1b8,\
+		0x106aa07032bbd1b8106aa07032bbd1b8
+	.octa 0x19a4c116b8d2d0c819a4c116b8d2d0c8,\
+		0x19a4c116b8d2d0c819a4c116b8d2d0c8
+	.octa 0x1e376c085141ab531e376c085141ab53,\
+		0x1e376c085141ab531e376c085141ab53
+	.octa 0x2748774cdf8eeb992748774cdf8eeb99,\
+		0x2748774cdf8eeb992748774cdf8eeb99
+	.octa 0x34b0bcb5e19b48a834b0bcb5e19b48a8,\
+		0x34b0bcb5e19b48a834b0bcb5e19b48a8
+	.octa 0x391c0cb3c5c95a63391c0cb3c5c95a63,\
+		0x391c0cb3c5c95a63391c0cb3c5c95a63
+	.octa 0x4ed8aa4ae3418acb4ed8aa4ae3418acb,\
+		0x4ed8aa4ae3418acb4ed8aa4ae3418acb
+	.octa 0x5b9cca4f7763e3735b9cca4f7763e373,\
+		0x5b9cca4f7763e3735b9cca4f7763e373
+	.octa 0x682e6ff3d6b2b8a3682e6ff3d6b2b8a3,\
+		0x682e6ff3d6b2b8a3682e6ff3d6b2b8a3
+	.octa 0x748f82ee5defb2fc748f82ee5defb2fc,\
+		0x748f82ee5defb2fc748f82ee5defb2fc
+	.octa 0x78a5636f43172f6078a5636f43172f60,\
+		0x78a5636f43172f6078a5636f43172f60
+	.octa 0x84c87814a1f0ab7284c87814a1f0ab72,\
+		0x84c87814a1f0ab7284c87814a1f0ab72
+	.octa 0x8cc702081a6439ec8cc702081a6439ec,\
+		0x8cc702081a6439ec8cc702081a6439ec
+	.octa 0x90befffa23631e2890befffa23631e28,\
+		0x90befffa23631e2890befffa23631e28
+	.octa 0xa4506cebde82bde9a4506cebde82bde9,\
+		0xa4506cebde82bde9a4506cebde82bde9
+	.octa 0xbef9a3f7b2c67915bef9a3f7b2c67915,\
+		0xbef9a3f7b2c67915bef9a3f7b2c67915
+	.octa 0xc67178f2e372532bc67178f2e372532b,\
+		0xc67178f2e372532bc67178f2e372532b
+	.octa 0xca273eceea26619cca273eceea26619c,\
+		0xca273eceea26619cca273eceea26619c
+	.octa 0xd186b8c721c0c207d186b8c721c0c207,\
+		0xd186b8c721c0c207d186b8c721c0c207
+	.octa 0xeada7dd6cde0eb1eeada7dd6cde0eb1e,\
+		0xeada7dd6cde0eb1eeada7dd6cde0eb1e
+	.octa 0xf57d4f7fee6ed178f57d4f7fee6ed178,\
+		0xf57d4f7fee6ed178f57d4f7fee6ed178
+	.octa 0x06f067aa72176fba06f067aa72176fba,\
+		0x06f067aa72176fba06f067aa72176fba
+	.octa 0x0a637dc5a2c898a60a637dc5a2c898a6,\
+		0x0a637dc5a2c898a60a637dc5a2c898a6
+	.octa 0x113f9804bef90dae113f9804bef90dae,\
+		0x113f9804bef90dae113f9804bef90dae
+	.octa 0x1b710b35131c471b1b710b35131c471b,\
+		0x1b710b35131c471b1b710b35131c471b
+	.octa 0x28db77f523047d8428db77f523047d84,\
+		0x28db77f523047d8428db77f523047d84
+	.octa 0x32caab7b40c7249332caab7b40c72493,\
+		0x32caab7b40c7249332caab7b40c72493
+	.octa 0x3c9ebe0a15c9bebc3c9ebe0a15c9bebc,\
+		0x3c9ebe0a15c9bebc3c9ebe0a15c9bebc
+	.octa 0x431d67c49c100d4c431d67c49c100d4c,\
+		0x431d67c49c100d4c431d67c49c100d4c
+	.octa 0x4cc5d4becb3e42b64cc5d4becb3e42b6,\
+		0x4cc5d4becb3e42b64cc5d4becb3e42b6
+	.octa 0x597f299cfc657e2a597f299cfc657e2a,\
+		0x597f299cfc657e2a597f299cfc657e2a
+	.octa 0x5fcb6fab3ad6faec5fcb6fab3ad6faec,\
+		0x5fcb6fab3ad6faec5fcb6fab3ad6faec
+	.octa 0x6c44198c4a4758176c44198c4a475817,\
+		0x6c44198c4a4758176c44198c4a475817
+
+# vpshufb control mask that reverses the byte order within every 64-bit
+# word; applied to the message words right after they are loaded
+PSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607
+                         .octa 0x18191a1b1c1d1e1f1011121314151617
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 0b17c83d027d..2b0e2a6825f3 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -346,4 +346,10 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated");
 
 MODULE_ALIAS_CRYPTO("sha512");
+MODULE_ALIAS_CRYPTO("sha512-ssse3");
+MODULE_ALIAS_CRYPTO("sha512-avx");
+MODULE_ALIAS_CRYPTO("sha512-avx2");
 MODULE_ALIAS_CRYPTO("sha384");
+MODULE_ALIAS_CRYPTO("sha384-ssse3");
+MODULE_ALIAS_CRYPTO("sha384-avx");
+MODULE_ALIAS_CRYPTO("sha384-avx2");
author	Linus Torvalds <torvalds@linux-foundation.org>	2016-07-26 13:40:17 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-07-26 13:40:17 -0700
commit	bbce2ad2d711c12d93145a7bbdf086e73f414bcd (patch)
tree	35432a39f68f4c5df44ed38037cbf05adadb923e /arch
parent	0f776dc377f6c87f4e4d4a5f63602f33fb93b31e (diff)
parent	0f95e2ffc58f5d32a90eb1051d17aeebc21cf91d (diff)