74 files changed, 2276 insertions, 3137 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b094816a7e0f..b32ebf92b0ce 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -65,6 +65,7 @@ config X86
 	select HAVE_KERNEL_LZMA
 	select HAVE_KERNEL_XZ
 	select HAVE_KERNEL_LZO
+	select HAVE_KERNEL_LZ4
 	select HAVE_HW_BREAKPOINT
 	select HAVE_MIXED_BREAKPOINTS_REGS
 	select PERF_EVENTS
@@ -102,6 +103,7 @@ config X86
 	select HAVE_ARCH_SECCOMP_FILTER
 	select BUILDTIME_EXTABLE_SORT
 	select GENERIC_CMOS_UPDATE
+	select HAVE_ARCH_SOFT_DIRTY
 	select CLOCKSOURCE_WATCHDOG
 	select GENERIC_CLOCKEVENTS
 	select ARCH_CLOCKSOURCE_DATA if X86_64
@@ -121,6 +123,7 @@ config X86
 	select OLD_SIGACTION if X86_32
 	select COMPAT_OLD_SIGACTION if IA32_EMULATION
 	select RTC_LIB
+	select HAVE_DEBUG_STACKOVERFLOW
 
 config INSTRUCTION_DECODER
 	def_bool y
@@ -2258,11 +2261,11 @@ source "drivers/pcmcia/Kconfig"
 source "drivers/pci/hotplug/Kconfig"
 
 config RAPIDIO
-	bool "RapidIO support"
+	tristate "RapidIO support"
 	depends on PCI
 	default n
 	help
-	  If you say Y here, the kernel will include drivers and
+	  If enabled this option will include drivers and the core
 	  infrastructure code to support RapidIO interconnect devices.
 
 source "drivers/rapidio/Kconfig"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index c963881de0d0..78d91afb8e50 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -59,16 +59,6 @@ config EARLY_PRINTK_DBGP
 	  with klogd/syslogd or the X server. You should normally N here,
 	  unless you want to debug such a crash. You need usb debug device.
 
-config DEBUG_STACKOVERFLOW
-	bool "Check for stack overflows"
-	depends on DEBUG_KERNEL
-	---help---
-	  Say Y here if you want to check the overflows of kernel, IRQ
-	  and exception stacks. This option will cause messages of the
-	  stacks in detail when free stack space drops below a certain
-	  limit.
-	  If in doubt, say "N".
-
 config X86_PTDUMP
 	bool "Export kernel pagetable layout to userspace via debugfs"
 	depends on DEBUG_KERNEL
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 5ef205c5f37b..dcd90df10ab4 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,7 +4,8 @@
 # create a compressed vmlinux image from the original vmlinux
 #
 
-targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo
+targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
+	vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4
 
 KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
@@ -63,12 +64,15 @@ $(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE
 	$(call if_changed,xzkern)
 $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
 	$(call if_changed,lzo)
+$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE
+	$(call if_changed,lz4)
 
 suffix-$(CONFIG_KERNEL_GZIP)	:= gz
 suffix-$(CONFIG_KERNEL_BZIP2)	:= bz2
 suffix-$(CONFIG_KERNEL_LZMA)	:= lzma
 suffix-$(CONFIG_KERNEL_XZ)	:= xz
 suffix-$(CONFIG_KERNEL_LZO) 	:= lzo
+suffix-$(CONFIG_KERNEL_LZ4) 	:= lz4
 
 quiet_cmd_mkpiggy = MKPIGGY $@
       cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 7cb56c6ca351..0319c88290a5 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -145,6 +145,10 @@ static int lines, cols;
 #include "../../../../lib/decompress_unlzo.c"
 #endif
 
+#ifdef CONFIG_KERNEL_LZ4
+#include "../../../../lib/decompress_unlz4.c"
+#endif
+
 static void scroll(void)
 {
 	int i;
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index a3a0ed80f17c..7d6ba9db1be9 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -3,8 +3,6 @@
 #
 
 avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
-avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
-					$(comma)4)$(comma)%ymm2,yes,no)
 
 obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o
 obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
@@ -29,6 +27,7 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
 obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
 obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
 obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
+obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
 
 # These modules require assembler to support AVX.
 ifeq ($(avx_supported),yes)
@@ -42,10 +41,8 @@ endif
 
 # These modules require assembler to support AVX2.
 ifeq ($(avx2_supported),yes)
-	obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o
 	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
 	obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
-	obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o
 endif
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
@@ -73,10 +70,8 @@ ifeq ($(avx_supported),yes)
 endif
 
 ifeq ($(avx2_supported),yes)
-	blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o
 	camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
 	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
-	twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o
 endif
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
@@ -87,3 +82,4 @@ crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
 crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
 sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
 sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
+crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
diff --git a/arch/x86/crypto/blowfish-avx2-asm_64.S b/arch/x86/crypto/blowfish-avx2-asm_64.S
deleted file mode 100644
index 784452e0d05d..000000000000
--- a/arch/x86/crypto/blowfish-avx2-asm_64.S
+++ /dev/null
@@ -1,449 +0,0 @@
-/*
- * x86_64/AVX2 assembler optimized version of Blowfish
- *
- * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- */
-
-#include <linux/linkage.h>
-
-.file "blowfish-avx2-asm_64.S"
-
-.data
-.align 32
-
-.Lprefetch_mask:
-.long 0*64
-.long 1*64
-.long 2*64
-.long 3*64
-.long 4*64
-.long 5*64
-.long 6*64
-.long 7*64
-
-.Lbswap32_mask:
-.long 0x00010203
-.long 0x04050607
-.long 0x08090a0b
-.long 0x0c0d0e0f
-
-.Lbswap128_mask:
-	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-.Lbswap_iv_mask:
-	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
-
-.text
-/* structure of crypto context */
-#define p	0
-#define s0	((16 + 2) * 4)
-#define s1	((16 + 2 + (1 * 256)) * 4)
-#define s2	((16 + 2 + (2 * 256)) * 4)
-#define s3	((16 + 2 + (3 * 256)) * 4)
-
-/* register macros */
-#define CTX	%rdi
-#define RIO	 %rdx
-
-#define RS0	%rax
-#define RS1	%r8
-#define RS2	%r9
-#define RS3	%r10
-
-#define RLOOP	%r11
-#define RLOOPd	%r11d
-
-#define RXr0	%ymm8
-#define RXr1	%ymm9
-#define RXr2	%ymm10
-#define RXr3	%ymm11
-#define RXl0	%ymm12
-#define RXl1	%ymm13
-#define RXl2	%ymm14
-#define RXl3	%ymm15
-
-/* temp regs */
-#define RT0	%ymm0
-#define RT0x	%xmm0
-#define RT1	%ymm1
-#define RT1x	%xmm1
-#define RIDX0	%ymm2
-#define RIDX1	%ymm3
-#define RIDX1x	%xmm3
-#define RIDX2	%ymm4
-#define RIDX3	%ymm5
-
-/* vpgatherdd mask and '-1' */
-#define RNOT	%ymm6
-
-/* byte mask, (-1 >> 24) */
-#define RBYTE	%ymm7
-
-/***********************************************************************
- * 32-way AVX2 blowfish
- ***********************************************************************/
-#define F(xl, xr) \
-	vpsrld $24, xl, RIDX0; \
-	vpsrld $16, xl, RIDX1; \
-	vpsrld $8, xl, RIDX2; \
-	vpand RBYTE, RIDX1, RIDX1; \
-	vpand RBYTE, RIDX2, RIDX2; \
-	vpand RBYTE, xl, RIDX3; \
-	\
-	vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpcmpeqd RIDX0, RIDX0, RIDX0; \
-	\
-	vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \
-	vpcmpeqd RIDX1, RIDX1, RIDX1; \
-	vpaddd RT0, RT1, RT0; \
-	\
-	vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \
-	vpxor RT0, RT1, RT0; \
-	\
-	vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpaddd RT0, RT1, RT0; \
-	\
-	vpxor RT0, xr, xr;
-
-#define add_roundkey(xl, nmem) \
-	vpbroadcastd nmem, RT0; \
-	vpxor RT0, xl ## 0, xl ## 0; \
-	vpxor RT0, xl ## 1, xl ## 1; \
-	vpxor RT0, xl ## 2, xl ## 2; \
-	vpxor RT0, xl ## 3, xl ## 3;
-
-#define round_enc() \
-	add_roundkey(RXr, p(CTX,RLOOP,4)); \
-	F(RXl0, RXr0); \
-	F(RXl1, RXr1); \
-	F(RXl2, RXr2); \
-	F(RXl3, RXr3); \
-	\
-	add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
-	F(RXr0, RXl0); \
-	F(RXr1, RXl1); \
-	F(RXr2, RXl2); \
-	F(RXr3, RXl3);
-
-#define round_dec() \
-	add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \
-	F(RXl0, RXr0); \
-	F(RXl1, RXr1); \
-	F(RXl2, RXr2); \
-	F(RXl3, RXr3); \
-	\
-	add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
-	F(RXr0, RXl0); \
-	F(RXr1, RXl1); \
-	F(RXr2, RXl2); \
-	F(RXr3, RXl3);
-
-#define init_round_constants() \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	leaq s0(CTX), RS0; \
-	leaq s1(CTX), RS1; \
-	leaq s2(CTX), RS2; \
-	leaq s3(CTX), RS3; \
-	vpsrld $24, RNOT, RBYTE;
-
-#define transpose_2x2(x0, x1, t0) \
-	vpunpckldq x0, x1, t0; \
-	vpunpckhdq x0, x1, x1; \
-	\
-	vpunpcklqdq t0, x1, x0; \
-	vpunpckhqdq t0, x1, x1;
-
-#define read_block(xl, xr) \
-	vbroadcasti128 .Lbswap32_mask, RT1; \
-	\
-	vpshufb RT1, xl ## 0, xl ## 0; \
-	vpshufb RT1, xr ## 0, xr ## 0; \
-	vpshufb RT1, xl ## 1, xl ## 1; \
-	vpshufb RT1, xr ## 1, xr ## 1; \
-	vpshufb RT1, xl ## 2, xl ## 2; \
-	vpshufb RT1, xr ## 2, xr ## 2; \
-	vpshufb RT1, xl ## 3, xl ## 3; \
-	vpshufb RT1, xr ## 3, xr ## 3; \
-	\
-	transpose_2x2(xl ## 0, xr ## 0, RT0); \
-	transpose_2x2(xl ## 1, xr ## 1, RT0); \
-	transpose_2x2(xl ## 2, xr ## 2, RT0); \
-	transpose_2x2(xl ## 3, xr ## 3, RT0);
-
-#define write_block(xl, xr) \
-	vbroadcasti128 .Lbswap32_mask, RT1; \
-	\
-	transpose_2x2(xl ## 0, xr ## 0, RT0); \
-	transpose_2x2(xl ## 1, xr ## 1, RT0); \
-	transpose_2x2(xl ## 2, xr ## 2, RT0); \
-	transpose_2x2(xl ## 3, xr ## 3, RT0); \
-	\
-	vpshufb RT1, xl ## 0, xl ## 0; \
-	vpshufb RT1, xr ## 0, xr ## 0; \
-	vpshufb RT1, xl ## 1, xl ## 1; \
-	vpshufb RT1, xr ## 1, xr ## 1; \
-	vpshufb RT1, xl ## 2, xl ## 2; \
-	vpshufb RT1, xr ## 2, xr ## 2; \
-	vpshufb RT1, xl ## 3, xl ## 3; \
-	vpshufb RT1, xr ## 3, xr ## 3;
-
-.align 8
-__blowfish_enc_blk32:
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	RXl0..4, RXr0..4: plaintext
-	 * output:
-	 *	RXl0..4, RXr0..4: ciphertext (RXl <=> RXr swapped)
-	 */
-	init_round_constants();
-
-	read_block(RXl, RXr);
-
-	movl $1, RLOOPd;
-	add_roundkey(RXl, p+4*(0)(CTX));
-
-.align 4
-.L__enc_loop:
-	round_enc();
-
-	leal 2(RLOOPd), RLOOPd;
-	cmpl $17, RLOOPd;
-	jne .L__enc_loop;
-
-	add_roundkey(RXr, p+4*(17)(CTX));
-
-	write_block(RXl, RXr);
-
-	ret;
-ENDPROC(__blowfish_enc_blk32)
-
-.align 8
-__blowfish_dec_blk32:
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	RXl0..4, RXr0..4: ciphertext
-	 * output:
-	 *	RXl0..4, RXr0..4: plaintext (RXl <=> RXr swapped)
-	 */
-	init_round_constants();
-
-	read_block(RXl, RXr);
-
-	movl $14, RLOOPd;
-	add_roundkey(RXl, p+4*(17)(CTX));
-
-.align 4
-.L__dec_loop:
-	round_dec();
-
-	addl $-2, RLOOPd;
-	jns .L__dec_loop;
-
-	add_roundkey(RXr, p+4*(0)(CTX));
-
-	write_block(RXl, RXr);
-
-	ret;
-ENDPROC(__blowfish_dec_blk32)
-
-ENTRY(blowfish_ecb_enc_32way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-
-	vmovdqu 0*32(%rdx), RXl0;
-	vmovdqu 1*32(%rdx), RXr0;
-	vmovdqu 2*32(%rdx), RXl1;
-	vmovdqu 3*32(%rdx), RXr1;
-	vmovdqu 4*32(%rdx), RXl2;
-	vmovdqu 5*32(%rdx), RXr2;
-	vmovdqu 6*32(%rdx), RXl3;
-	vmovdqu 7*32(%rdx), RXr3;
-
-	call __blowfish_enc_blk32;
-
-	vmovdqu RXr0, 0*32(%rsi);
-	vmovdqu RXl0, 1*32(%rsi);
-	vmovdqu RXr1, 2*32(%rsi);
-	vmovdqu RXl1, 3*32(%rsi);
-	vmovdqu RXr2, 4*32(%rsi);
-	vmovdqu RXl2, 5*32(%rsi);
-	vmovdqu RXr3, 6*32(%rsi);
-	vmovdqu RXl3, 7*32(%rsi);
-
-	vzeroupper;
-
-	ret;
-ENDPROC(blowfish_ecb_enc_32way)
-
-ENTRY(blowfish_ecb_dec_32way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-
-	vmovdqu 0*32(%rdx), RXl0;
-	vmovdqu 1*32(%rdx), RXr0;
-	vmovdqu 2*32(%rdx), RXl1;
-	vmovdqu 3*32(%rdx), RXr1;
-	vmovdqu 4*32(%rdx), RXl2;
-	vmovdqu 5*32(%rdx), RXr2;
-	vmovdqu 6*32(%rdx), RXl3;
-	vmovdqu 7*32(%rdx), RXr3;
-
-	call __blowfish_dec_blk32;
-
-	vmovdqu RXr0, 0*32(%rsi);
-	vmovdqu RXl0, 1*32(%rsi);
-	vmovdqu RXr1, 2*32(%rsi);
-	vmovdqu RXl1, 3*32(%rsi);
-	vmovdqu RXr2, 4*32(%rsi);
-	vmovdqu RXl2, 5*32(%rsi);
-	vmovdqu RXr3, 6*32(%rsi);
-	vmovdqu RXl3, 7*32(%rsi);
-
-	vzeroupper;
-
-	ret;
-ENDPROC(blowfish_ecb_dec_32way)
-
-ENTRY(blowfish_cbc_dec_32way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-
-	vmovdqu 0*32(%rdx), RXl0;
-	vmovdqu 1*32(%rdx), RXr0;
-	vmovdqu 2*32(%rdx), RXl1;
-	vmovdqu 3*32(%rdx), RXr1;
-	vmovdqu 4*32(%rdx), RXl2;
-	vmovdqu 5*32(%rdx), RXr2;
-	vmovdqu 6*32(%rdx), RXl3;
-	vmovdqu 7*32(%rdx), RXr3;
-
-	call __blowfish_dec_blk32;
-
-	/* xor with src */
-	vmovq (%rdx), RT0x;
-	vpshufd $0x4f, RT0x, RT0x;
-	vinserti128 $1, 8(%rdx), RT0, RT0;
-	vpxor RT0, RXr0, RXr0;
-	vpxor 0*32+24(%rdx), RXl0, RXl0;
-	vpxor 1*32+24(%rdx), RXr1, RXr1;
-	vpxor 2*32+24(%rdx), RXl1, RXl1;
-	vpxor 3*32+24(%rdx), RXr2, RXr2;
-	vpxor 4*32+24(%rdx), RXl2, RXl2;
-	vpxor 5*32+24(%rdx), RXr3, RXr3;
-	vpxor 6*32+24(%rdx), RXl3, RXl3;
-
-	vmovdqu RXr0, (0*32)(%rsi);
-	vmovdqu RXl0, (1*32)(%rsi);
-	vmovdqu RXr1, (2*32)(%rsi);
-	vmovdqu RXl1, (3*32)(%rsi);
-	vmovdqu RXr2, (4*32)(%rsi);
-	vmovdqu RXl2, (5*32)(%rsi);
-	vmovdqu RXr3, (6*32)(%rsi);
-	vmovdqu RXl3, (7*32)(%rsi);
-
-	vzeroupper;
-
-	ret;
-ENDPROC(blowfish_cbc_dec_32way)
-
-ENTRY(blowfish_ctr_32way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 *	%rcx: iv (big endian, 64bit)
-	 */
-
-	vzeroupper;
-
-	vpcmpeqd RT0, RT0, RT0;
-	vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */
-
-	vpcmpeqd RT1x, RT1x, RT1x;
-	vpaddq RT1x, RT1x, RT1x; /* a: -2, b: -2 */
-	vpxor RIDX0, RIDX0, RIDX0;
-	vinserti128 $1, RT1x, RIDX0, RIDX0; /* a: 0, b: 0, c: -2, d: -2 */
-
-	vpaddq RIDX0, RT0, RT0; /* a: -1, b: 0, c: -3, d: -2 */
-
-	vpcmpeqd RT1, RT1, RT1;
-	vpaddq RT1, RT1, RT1; /* a: -2, b: -2, c: -2, d: -2 */
-	vpaddq RT1, RT1, RIDX2; /* a: -4, b: -4, c: -4, d: -4 */
-
-	vbroadcasti128 .Lbswap_iv_mask, RIDX0;
-	vbroadcasti128 .Lbswap128_mask, RIDX1;
-
-	/* load IV and byteswap */
-	vmovq (%rcx), RT1x;
-	vinserti128 $1, RT1x, RT1, RT1; /* a: BE, b: 0, c: BE, d: 0 */
-	vpshufb RIDX0, RT1, RT1; /* a: LE, b: LE, c: LE, d: LE */
-
-	/* construct IVs */
-	vpsubq RT0, RT1, RT1;		/* a: le1, b: le0, c: le3, d: le2 */
-	vpshufb RIDX1, RT1, RXl0;	/* a: be0, b: be1, c: be2, d: be3 */
-	vpsubq RIDX2, RT1, RT1;		/* le5, le4, le7, le6 */
-	vpshufb RIDX1, RT1, RXr0;	/* be4, be5, be6, be7 */
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXl1;
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXr1;
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXl2;
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXr2;
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXl3;
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXr3;
-
-	/* store last IV */
-	vpsubq RIDX2, RT1, RT1; /* a: le33, b: le32, ... */
-	vpshufb RIDX1x, RT1x, RT1x; /* a: be32, ... */
-	vmovq RT1x, (%rcx);
-
-	call __blowfish_enc_blk32;
-
-	/* dst = src ^ iv */
-	vpxor 0*32(%rdx), RXr0, RXr0;
-	vpxor 1*32(%rdx), RXl0, RXl0;
-	vpxor 2*32(%rdx), RXr1, RXr1;
-	vpxor 3*32(%rdx), RXl1, RXl1;
-	vpxor 4*32(%rdx), RXr2, RXr2;
-	vpxor 5*32(%rdx), RXl2, RXl2;
-	vpxor 6*32(%rdx), RXr3, RXr3;
-	vpxor 7*32(%rdx), RXl3, RXl3;
-	vmovdqu RXr0, (0*32)(%rsi);
-	vmovdqu RXl0, (1*32)(%rsi);
-	vmovdqu RXr1, (2*32)(%rsi);
-	vmovdqu RXl1, (3*32)(%rsi);
-	vmovdqu RXr2, (4*32)(%rsi);
-	vmovdqu RXl2, (5*32)(%rsi);
-	vmovdqu RXr3, (6*32)(%rsi);
-	vmovdqu RXl3, (7*32)(%rsi);
-
-	vzeroupper;
-
-	ret;
-ENDPROC(blowfish_ctr_32way)
diff --git a/arch/x86/crypto/blowfish_avx2_glue.c b/arch/x86/crypto/blowfish_avx2_glue.c
deleted file mode 100644
index 4417e9aea78d..000000000000
--- a/arch/x86/crypto/blowfish_avx2_glue.c
+++ /dev/null
@@ -1,585 +0,0 @@
-/*
- * Glue Code for x86_64/AVX2 assembler optimized version of Blowfish
- *
- * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
- *
- * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
- *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
- * CTR part based on code (crypto/ctr.c) by:
- *   (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/crypto.h>
-#include <linux/err.h>
-#include <crypto/algapi.h>
-#include <crypto/blowfish.h>
-#include <crypto/cryptd.h>
-#include <crypto/ctr.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
-#include <asm/crypto/blowfish.h>
-#include <asm/crypto/ablk_helper.h>
-#include <crypto/scatterwalk.h>
-
-#define BF_AVX2_PARALLEL_BLOCKS 32
-
-/* 32-way AVX2 parallel cipher functions */
-asmlinkage void blowfish_ecb_enc_32way(struct bf_ctx *ctx, u8 *dst,
-				       const u8 *src);
-asmlinkage void blowfish_ecb_dec_32way(struct bf_ctx *ctx, u8 *dst,
-				       const u8 *src);
-asmlinkage void blowfish_cbc_dec_32way(struct bf_ctx *ctx, u8 *dst,
-				       const u8 *src);
-asmlinkage void blowfish_ctr_32way(struct bf_ctx *ctx, u8 *dst, const u8 *src,
-				   __be64 *iv);
-
-static inline bool bf_fpu_begin(bool fpu_enabled, unsigned int nbytes)
-{
-	if (fpu_enabled)
-		return true;
-
-	/* FPU is only used when chunk to be processed is large enough, so
-	 * do not enable FPU until it is necessary.
-	 */
-	if (nbytes < BF_BLOCK_SIZE * BF_AVX2_PARALLEL_BLOCKS)
-		return false;
-
-	kernel_fpu_begin();
-	return true;
-}
-
-static inline void bf_fpu_end(bool fpu_enabled)
-{
-	if (fpu_enabled)
-		kernel_fpu_end();
-}
-
-static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
-		     bool enc)
-{
-	bool fpu_enabled = false;
-	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	const unsigned int bsize = BF_BLOCK_SIZE;
-	unsigned int nbytes;
-	int err;
-
-	err = blkcipher_walk_virt(desc, walk);
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	while ((nbytes = walk->nbytes)) {
-		u8 *wsrc = walk->src.virt.addr;
-		u8 *wdst = walk->dst.virt.addr;
-
-		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
-
-		/* Process multi-block AVX2 batch */
-		if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
-			do {
-				if (enc)
-					blowfish_ecb_enc_32way(ctx, wdst, wsrc);
-				else
-					blowfish_ecb_dec_32way(ctx, wdst, wsrc);
-
-				wsrc += bsize * BF_AVX2_PARALLEL_BLOCKS;
-				wdst += bsize * BF_AVX2_PARALLEL_BLOCKS;
-				nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS;
-			} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
-
-			if (nbytes < bsize)
-				goto done;
-		}
-
-		/* Process multi-block batch */
-		if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
-			do {
-				if (enc)
-					blowfish_enc_blk_4way(ctx, wdst, wsrc);
-				else
-					blowfish_dec_blk_4way(ctx, wdst, wsrc);
-
-				wsrc += bsize * BF_PARALLEL_BLOCKS;
-				wdst += bsize * BF_PARALLEL_BLOCKS;
-				nbytes -= bsize * BF_PARALLEL_BLOCKS;
-			} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
-
-			if (nbytes < bsize)
-				goto done;
-		}
-
-		/* Handle leftovers */
-		do {
-			if (enc)
-				blowfish_enc_blk(ctx, wdst, wsrc);
-			else
-				blowfish_dec_blk(ctx, wdst, wsrc);
-
-			wsrc += bsize;
-			wdst += bsize;
-			nbytes -= bsize;
-		} while (nbytes >= bsize);
-
-done:
-		err = blkcipher_walk_done(desc, walk, nbytes);
-	}
-
-	bf_fpu_end(fpu_enabled);
-	return err;
-}
-
-static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	return ecb_crypt(desc, &walk, true);
-}
-
-static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	return ecb_crypt(desc, &walk, false);
-}
-
-static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
-				  struct blkcipher_walk *walk)
-{
-	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	unsigned int bsize = BF_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u64 *src = (u64 *)walk->src.virt.addr;
-	u64 *dst = (u64 *)walk->dst.virt.addr;
-	u64 *iv = (u64 *)walk->iv;
-
-	do {
-		*dst = *src ^ *iv;
-		blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
-		iv = dst;
-
-		src += 1;
-		dst += 1;
-		nbytes -= bsize;
-	} while (nbytes >= bsize);
-
-	*(u64 *)walk->iv = *iv;
-	return nbytes;
-}
-
-static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt(desc, &walk);
-
-	while ((nbytes = walk.nbytes)) {
-		nbytes = __cbc_encrypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
-
-	return err;
-}
-
-static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
-				  struct blkcipher_walk *walk)
-{
-	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	const unsigned int bsize = BF_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u64 *src = (u64 *)walk->src.virt.addr;
-	u64 *dst = (u64 *)walk->dst.virt.addr;
-	u64 last_iv;
-	int i;
-
-	/* Start of the last block. */
-	src += nbytes / bsize - 1;
-	dst += nbytes / bsize - 1;
-
-	last_iv = *src;
-
-	/* Process multi-block AVX2 batch */
-	if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
-		do {
-			nbytes -= bsize * (BF_AVX2_PARALLEL_BLOCKS - 1);
-			src -= BF_AVX2_PARALLEL_BLOCKS - 1;
-			dst -= BF_AVX2_PARALLEL_BLOCKS - 1;
-
-			blowfish_cbc_dec_32way(ctx, (u8 *)dst, (u8 *)src);
-
-			nbytes -= bsize;
-			if (nbytes < bsize)
-				goto done;
-
-			*dst ^= *(src - 1);
-			src -= 1;
-			dst -= 1;
-		} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Process multi-block batch */
-	if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
-		u64 ivs[BF_PARALLEL_BLOCKS - 1];
-
-		do {
-			nbytes -= bsize * (BF_PARALLEL_BLOCKS - 1);
-			src -= BF_PARALLEL_BLOCKS - 1;
-			dst -= BF_PARALLEL_BLOCKS - 1;
-
-			for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++)
-				ivs[i] = src[i];
-
-			blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src);
-
-			for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++)
-				dst[i + 1] ^= ivs[i];
-
-			nbytes -= bsize;
-			if (nbytes < bsize)
-				goto done;
-
-			*dst ^= *(src - 1);
-			src -= 1;
-			dst -= 1;
-		} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Handle leftovers */
-	for (;;) {
-		blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src);
-
-		nbytes -= bsize;
-		if (nbytes < bsize)
-			break;
-
-		*dst ^= *(src - 1);
-		src -= 1;
-		dst -= 1;
-	}
-
-done:
-	*dst ^= *(u64 *)walk->iv;
-	*(u64 *)walk->iv = last_iv;
-
-	return nbytes;
-}
-
-static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	bool fpu_enabled = false;
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt(desc, &walk);
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	while ((nbytes = walk.nbytes)) {
-		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
-		nbytes = __cbc_decrypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
-
-	bf_fpu_end(fpu_enabled);
-	return err;
-}
-
-static void ctr_crypt_final(struct blkcipher_desc *desc,
-			    struct blkcipher_walk *walk)
-{
-	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	u8 *ctrblk = walk->iv;
-	u8 keystream[BF_BLOCK_SIZE];
-	u8 *src = walk->src.virt.addr;
-	u8 *dst = walk->dst.virt.addr;
-	unsigned int nbytes = walk->nbytes;
-
-	blowfish_enc_blk(ctx, keystream, ctrblk);
-	crypto_xor(keystream, src, nbytes);
-	memcpy(dst, keystream, nbytes);
-
-	crypto_inc(ctrblk, BF_BLOCK_SIZE);
-}
-
-static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
-				struct blkcipher_walk *walk)
-{
-	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	unsigned int bsize = BF_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u64 *src = (u64 *)walk->src.virt.addr;
-	u64 *dst = (u64 *)walk->dst.virt.addr;
-	int i;
-
-	/* Process multi-block AVX2 batch */
-	if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
-		do {
-			blowfish_ctr_32way(ctx, (u8 *)dst, (u8 *)src,
-					   (__be64 *)walk->iv);
-
-			src += BF_AVX2_PARALLEL_BLOCKS;
-			dst += BF_AVX2_PARALLEL_BLOCKS;
-			nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS;
-		} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Process four block batch */
-	if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
-		__be64 ctrblocks[BF_PARALLEL_BLOCKS];
-		u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
-
-		do {
-			/* create ctrblks for parallel encrypt */
-			for (i = 0; i < BF_PARALLEL_BLOCKS; i++) {
-				if (dst != src)
-					dst[i] = src[i];
-
-				ctrblocks[i] = cpu_to_be64(ctrblk++);
-			}
-
-			blowfish_enc_blk_xor_4way(ctx, (u8 *)dst,
-						  (u8 *)ctrblocks);
-
-			src += BF_PARALLEL_BLOCKS;
-			dst += BF_PARALLEL_BLOCKS;
-			nbytes -= bsize * BF_PARALLEL_BLOCKS;
-		} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
-
-		*(__be64 *)walk->iv = cpu_to_be64(ctrblk);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Handle leftovers */
-	do {
-		u64 ctrblk;
-
-		if (dst != src)
-			*dst = *src;
-
-		ctrblk = *(u64 *)walk->iv;
-		be64_add_cpu((__be64 *)walk->iv, 1);
-
-		blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
-
-		src += 1;
-		dst += 1;
-	} while ((nbytes -= bsize) >= bsize);
-
-done:
-	return nbytes;
-}
-
-static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		     struct scatterlist *src, unsigned int nbytes)
-{
-	bool fpu_enabled = false;
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE);
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) {
-		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
-		nbytes = __ctr_crypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
-
-	bf_fpu_end(fpu_enabled);
-
-	if (walk.nbytes) {
-		ctr_crypt_final(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, 0);
-	}
-
-	return err;
-}
-
-static struct crypto_alg bf_algs[6] = { {
-	.cra_name		= "__ecb-blowfish-avx2",
-	.cra_driver_name	= "__driver-ecb-blowfish-avx2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= BF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct bf_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= BF_MIN_KEY_SIZE,
-			.max_keysize	= BF_MAX_KEY_SIZE,
-			.setkey		= blowfish_setkey,
-			.encrypt	= ecb_encrypt,
-			.decrypt	= ecb_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "__cbc-blowfish-avx2",
-	.cra_driver_name	= "__driver-cbc-blowfish-avx2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= BF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct bf_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= BF_MIN_KEY_SIZE,
-			.max_keysize	= BF_MAX_KEY_SIZE,
-			.setkey		= blowfish_setkey,
-			.encrypt	= cbc_encrypt,
-			.decrypt	= cbc_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "__ctr-blowfish-avx2",
-	.cra_driver_name	= "__driver-ctr-blowfish-avx2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct bf_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= BF_MIN_KEY_SIZE,
-			.max_keysize	= BF_MAX_KEY_SIZE,
-			.ivsize		= BF_BLOCK_SIZE,
-			.setkey		= blowfish_setkey,
-			.encrypt	= ctr_crypt,
-			.decrypt	= ctr_crypt,
-		},
-	},
-}, {
-	.cra_name		= "ecb(blowfish)",
-	.cra_driver_name	= "ecb-blowfish-avx2",
-	.cra_priority		= 400,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= BF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_helper_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_init		= ablk_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= BF_MIN_KEY_SIZE,
-			.max_keysize	= BF_MAX_KEY_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "cbc(blowfish)",
-	.cra_driver_name	= "cbc-blowfish-avx2",
-	.cra_priority		= 400,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= BF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_helper_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_init		= ablk_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= BF_MIN_KEY_SIZE,
-			.max_keysize	= BF_MAX_KEY_SIZE,
-			.ivsize		= BF_BLOCK_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= __ablk_encrypt,
-			.decrypt	= ablk_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "ctr(blowfish)",
-	.cra_driver_name	= "ctr-blowfish-avx2",
-	.cra_priority		= 400,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct async_helper_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_init		= ablk_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= BF_MIN_KEY_SIZE,
-			.max_keysize	= BF_MAX_KEY_SIZE,
-			.ivsize		= BF_BLOCK_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_encrypt,
-			.geniv		= "chainiv",
-		},
-	},
-} };
-
-
-static int __init init(void)
-{
-	u64 xcr0;
-
-	if (!cpu_has_avx2 || !cpu_has_osxsave) {
-		pr_info("AVX2 instructions are not detected.\n");
-		return -ENODEV;
-	}
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX detected but unusable.\n");
-		return -ENODEV;
-	}
-
-	return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs));
-}
-
-static void __exit fini(void)
-{
-	crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs));
-}
-
-module_init(init);
-module_exit(fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Blowfish Cipher Algorithm, AVX2 optimized");
-MODULE_ALIAS("blowfish");
-MODULE_ALIAS("blowfish-asm");
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index 3548d76dbaa9..50ec333b70e6 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -1,7 +1,7 @@
 /*
  * Glue Code for assembler optimized version of Blowfish
  *
- * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
  *
  * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
  *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
@@ -32,24 +32,40 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <crypto/algapi.h>
-#include <asm/crypto/blowfish.h>
 
 /* regular block cipher functions */
 asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
 				   bool xor);
-EXPORT_SYMBOL_GPL(__blowfish_enc_blk);
-
 asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
-EXPORT_SYMBOL_GPL(blowfish_dec_blk);
 
 /* 4-way parallel cipher functions */
 asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
 					const u8 *src, bool xor);
-EXPORT_SYMBOL_GPL(__blowfish_enc_blk_4way);
-
 asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
 				      const u8 *src);
-EXPORT_SYMBOL_GPL(blowfish_dec_blk_4way);
+
+static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
+{
+	__blowfish_enc_blk(ctx, dst, src, false);
+}
+
+static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	__blowfish_enc_blk(ctx, dst, src, true);
+}
+
+static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
+					 const u8 *src)
+{
+	__blowfish_enc_blk_4way(ctx, dst, src, false);
+}
+
+static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
+				      const u8 *src)
+{
+	__blowfish_enc_blk_4way(ctx, dst, src, true);
+}
 
 static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index 91a1878fcc3e..0e0b8863a34b 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -51,16 +51,6 @@
 #define ymm14_x xmm14
 #define ymm15_x xmm15
 
-/*
- * AES-NI instructions do not support ymmX registers, so we need splitting and
- * merging.
- */
-#define vaesenclast256(zero, yreg, tmp) \
-	vextracti128 $1, yreg, tmp##_x; \
-	vaesenclast zero##_x, yreg##_x, yreg##_x; \
-	vaesenclast zero##_x, tmp##_x, tmp##_x; \
-	vinserti128 $1, tmp##_x, yreg, yreg;
-
 /**********************************************************************
   32-way camellia
  **********************************************************************/
@@ -79,46 +69,70 @@
 	 * S-function with AES subbytes \
 	 */ \
 	vbroadcasti128 .Linv_shift_row, t4; \
-	vpbroadcastb .L0f0f0f0f, t7; \
-	vbroadcasti128 .Lpre_tf_lo_s1, t0; \
-	vbroadcasti128 .Lpre_tf_hi_s1, t1; \
+	vpbroadcastd .L0f0f0f0f, t7; \
+	vbroadcasti128 .Lpre_tf_lo_s1, t5; \
+	vbroadcasti128 .Lpre_tf_hi_s1, t6; \
+	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
+	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
 	\
 	/* AES inverse shift rows */ \
 	vpshufb t4, x0, x0; \
 	vpshufb t4, x7, x7; \
-	vpshufb t4, x1, x1; \
-	vpshufb t4, x4, x4; \
-	vpshufb t4, x2, x2; \
-	vpshufb t4, x5, x5; \
 	vpshufb t4, x3, x3; \
 	vpshufb t4, x6, x6; \
+	vpshufb t4, x2, x2; \
+	vpshufb t4, x5, x5; \
+	vpshufb t4, x1, x1; \
+	vpshufb t4, x4, x4; \
 	\
 	/* prefilter sboxes 1, 2 and 3 */ \
-	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
-	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
-	filter_8bit(x0, t0, t1, t7, t6); \
-	filter_8bit(x7, t0, t1, t7, t6); \
-	filter_8bit(x1, t0, t1, t7, t6); \
-	filter_8bit(x4, t0, t1, t7, t6); \
-	filter_8bit(x2, t0, t1, t7, t6); \
-	filter_8bit(x5, t0, t1, t7, t6); \
-	\
 	/* prefilter sbox 4 */ \
+	filter_8bit(x0, t5, t6, t7, t4); \
+	filter_8bit(x7, t5, t6, t7, t4); \
+	vextracti128 $1, x0, t0##_x; \
+	vextracti128 $1, x7, t1##_x; \
+	filter_8bit(x3, t2, t3, t7, t4); \
+	filter_8bit(x6, t2, t3, t7, t4); \
+	vextracti128 $1, x3, t3##_x; \
+	vextracti128 $1, x6, t2##_x; \
+	filter_8bit(x2, t5, t6, t7, t4); \
+	filter_8bit(x5, t5, t6, t7, t4); \
+	filter_8bit(x1, t5, t6, t7, t4); \
+	filter_8bit(x4, t5, t6, t7, t4); \
+	\
 	vpxor t4##_x, t4##_x, t4##_x; \
-	filter_8bit(x3, t2, t3, t7, t6); \
-	filter_8bit(x6, t2, t3, t7, t6); \
 	\
 	/* AES subbytes + AES shift rows */ \
+	vextracti128 $1, x2, t6##_x; \
+	vextracti128 $1, x5, t5##_x; \
+	vaesenclast t4##_x, x0##_x, x0##_x; \
+	vaesenclast t4##_x, t0##_x, t0##_x; \
+	vinserti128 $1, t0##_x, x0, x0; \
+	vaesenclast t4##_x, x7##_x, x7##_x; \
+	vaesenclast t4##_x, t1##_x, t1##_x; \
+	vinserti128 $1, t1##_x, x7, x7; \
+	vaesenclast t4##_x, x3##_x, x3##_x; \
+	vaesenclast t4##_x, t3##_x, t3##_x; \
+	vinserti128 $1, t3##_x, x3, x3; \
+	vaesenclast t4##_x, x6##_x, x6##_x; \
+	vaesenclast t4##_x, t2##_x, t2##_x; \
+	vinserti128 $1, t2##_x, x6, x6; \
+	vextracti128 $1, x1, t3##_x; \
+	vextracti128 $1, x4, t2##_x; \
 	vbroadcasti128 .Lpost_tf_lo_s1, t0; \
 	vbroadcasti128 .Lpost_tf_hi_s1, t1; \
-	vaesenclast256(t4, x0, t5); \
-	vaesenclast256(t4, x7, t5); \
-	vaesenclast256(t4, x1, t5); \
-	vaesenclast256(t4, x4, t5); \
-	vaesenclast256(t4, x2, t5); \
-	vaesenclast256(t4, x5, t5); \
-	vaesenclast256(t4, x3, t5); \
-	vaesenclast256(t4, x6, t5); \
+	vaesenclast t4##_x, x2##_x, x2##_x; \
+	vaesenclast t4##_x, t6##_x, t6##_x; \
+	vinserti128 $1, t6##_x, x2, x2; \
+	vaesenclast t4##_x, x5##_x, x5##_x; \
+	vaesenclast t4##_x, t5##_x, t5##_x; \
+	vinserti128 $1, t5##_x, x5, x5; \
+	vaesenclast t4##_x, x1##_x, x1##_x; \
+	vaesenclast t4##_x, t3##_x, t3##_x; \
+	vinserti128 $1, t3##_x, x1, x1; \
+	vaesenclast t4##_x, x4##_x, x4##_x; \
+	vaesenclast t4##_x, t2##_x, t2##_x; \
+	vinserti128 $1, t2##_x, x4, x4; \
 	\
 	/* postfilter sboxes 1 and 4 */ \
 	vbroadcasti128 .Lpost_tf_lo_s3, t2; \
@@ -139,22 +153,12 @@
 	/* postfilter sbox 2 */ \
 	filter_8bit(x1, t4, t5, t7, t2); \
 	filter_8bit(x4, t4, t5, t7, t2); \
+	vpxor t7, t7, t7; \
 	\
 	vpsrldq $1, t0, t1; \
 	vpsrldq $2, t0, t2; \
+	vpshufb t7, t1, t1; \
 	vpsrldq $3, t0, t3; \
-	vpsrldq $4, t0, t4; \
-	vpsrldq $5, t0, t5; \
-	vpsrldq $6, t0, t6; \
-	vpsrldq $7, t0, t7; \
-	vpbroadcastb t0##_x, t0; \
-	vpbroadcastb t1##_x, t1; \
-	vpbroadcastb t2##_x, t2; \
-	vpbroadcastb t3##_x, t3; \
-	vpbroadcastb t4##_x, t4; \
-	vpbroadcastb t6##_x, t6; \
-	vpbroadcastb t5##_x, t5; \
-	vpbroadcastb t7##_x, t7; \
 	\
 	/* P-function */ \
 	vpxor x5, x0, x0; \
@@ -162,11 +166,21 @@
 	vpxor x7, x2, x2; \
 	vpxor x4, x3, x3; \
 	\
+	vpshufb t7, t2, t2; \
+	vpsrldq $4, t0, t4; \
+	vpshufb t7, t3, t3; \
+	vpsrldq $5, t0, t5; \
+	vpshufb t7, t4, t4; \
+	\
 	vpxor x2, x4, x4; \
 	vpxor x3, x5, x5; \
 	vpxor x0, x6, x6; \
 	vpxor x1, x7, x7; \
 	\
+	vpsrldq $6, t0, t6; \
+	vpshufb t7, t5, t5; \
+	vpshufb t7, t6, t6; \
+	\
 	vpxor x7, x0, x0; \
 	vpxor x4, x1, x1; \
 	vpxor x5, x2, x2; \
@@ -179,12 +193,16 @@
 	\
 	/* Add key material and result to CD (x becomes new CD) */ \
 	\
-	vpxor t7, x0, x0; \
-	vpxor 4 * 32(mem_cd), x0, x0; \
-	\
 	vpxor t6, x1, x1; \
 	vpxor 5 * 32(mem_cd), x1, x1; \
 	\
+	vpsrldq $7, t0, t6; \
+	vpshufb t7, t0, t0; \
+	vpshufb t7, t6, t7; \
+	\
+	vpxor t7, x0, x0; \
+	vpxor 4 * 32(mem_cd), x0, x0; \
+	\
 	vpxor t5, x2, x2; \
 	vpxor 6 * 32(mem_cd), x2, x2; \
 	\
@@ -204,7 +222,7 @@
 	vpxor 3 * 32(mem_cd), x7, x7;
 
 /*
- * Size optimization... with inlined roundsm16 binary would be over 5 times
+ * Size optimization... with inlined roundsm32 binary would be over 5 times
  * larger and would only marginally faster.
  */
 .align 8
@@ -324,13 +342,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	 */ \
 	vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
 	vpxor tt0, tt0, tt0; \
-	vpbroadcastb t0##_x, t3; \
+	vpshufb tt0, t0, t3; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t2; \
+	vpshufb tt0, t0, t2; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t1; \
+	vpshufb tt0, t0, t1; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t0; \
+	vpshufb tt0, t0, t0; \
 	\
 	vpand l0, t0, t0; \
 	vpand l1, t1, t1; \
@@ -340,6 +358,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
 	\
 	vpxor l4, t0, l4; \
+	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
 	vmovdqu l4, 4 * 32(l); \
 	vpxor l5, t1, l5; \
 	vmovdqu l5, 5 * 32(l); \
@@ -354,14 +373,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	 * rl ^= t2; \
 	 */ \
 	\
-	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
-	vpbroadcastb t0##_x, t3; \
+	vpshufb tt0, t0, t3; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t2; \
+	vpshufb tt0, t0, t2; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t1; \
+	vpshufb tt0, t0, t1; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t0; \
+	vpshufb tt0, t0, t0; \
 	\
 	vpor 4 * 32(r), t0, t0; \
 	vpor 5 * 32(r), t1, t1; \
@@ -373,6 +391,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	vpxor 2 * 32(r), t2, t2; \
 	vpxor 3 * 32(r), t3, t3; \
 	vmovdqu t0, 0 * 32(r); \
+	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
 	vmovdqu t1, 1 * 32(r); \
 	vmovdqu t2, 2 * 32(r); \
 	vmovdqu t3, 3 * 32(r); \
@@ -382,14 +401,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	 * t2 &= rl; \
 	 * rr ^= rol32(t2, 1); \
 	 */ \
-	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
-	vpbroadcastb t0##_x, t3; \
+	vpshufb tt0, t0, t3; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t2; \
+	vpshufb tt0, t0, t2; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t1; \
+	vpshufb tt0, t0, t1; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t0; \
+	vpshufb tt0, t0, t0; \
 	\
 	vpand 0 * 32(r), t0, t0; \
 	vpand 1 * 32(r), t1, t1; \
@@ -403,6 +421,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	vpxor 6 * 32(r), t2, t2; \
 	vpxor 7 * 32(r), t3, t3; \
 	vmovdqu t0, 4 * 32(r); \
+	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
 	vmovdqu t1, 5 * 32(r); \
 	vmovdqu t2, 6 * 32(r); \
 	vmovdqu t3, 7 * 32(r); \
@@ -413,14 +432,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	 * ll ^= t0; \
 	 */ \
 	\
-	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
-	vpbroadcastb t0##_x, t3; \
+	vpshufb tt0, t0, t3; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t2; \
+	vpshufb tt0, t0, t2; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t1; \
+	vpshufb tt0, t0, t1; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t0; \
+	vpshufb tt0, t0, t0; \
 	\
 	vpor l4, t0, t0; \
 	vpor l5, t1, t1; \
diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S
new file mode 100644
index 000000000000..35e97569d05f
--- /dev/null
+++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S
@@ -0,0 +1,643 @@
+########################################################################
+# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+#
+# Copyright (c) 2013, Intel Corporation
+#
+# Authors:
+#     Erdinc Ozturk <erdinc.ozturk@intel.com>
+#     Vinodh Gopal <vinodh.gopal@intel.com>
+#     James Guilford <james.guilford@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the
+#   distribution.
+#
+# * Neither the name of the Intel Corporation nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+#
+# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+#       Function API:
+#       UINT16 crc_t10dif_pcl(
+#               UINT16 init_crc, //initial CRC value, 16 bits
+#               const unsigned char *buf, //buffer pointer to calculate CRC on
+#               UINT64 len //buffer length in bytes (64-bit data)
+#       );
+#
+#       Reference paper titled "Fast CRC Computation for Generic
+#	Polynomials Using PCLMULQDQ Instruction"
+#       URL: http://www.intel.com/content/dam/www/public/us/en/documents
+#  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+#
+#
+
+#include <linux/linkage.h>
+
+.text
+
+#define        arg1 %rdi
+#define        arg2 %rsi
+#define        arg3 %rdx
+
+#define        arg1_low32 %edi
+
+ENTRY(crc_t10dif_pcl)
+.align 16
+
+	# adjust the 16-bit initial_crc value, scale it to 32 bits
+	shl	$16, arg1_low32
+
+	# Allocate Stack Space
+	mov     %rsp, %rcx
+	sub	$16*2, %rsp
+	# align stack to 16 byte boundary
+	and     $~(0x10 - 1), %rsp
+
+	# check if smaller than 256
+	cmp	$256, arg3
+
+	# for sizes less than 128, we can't fold 64B at a time...
+	jl	_less_than_128
+
+
+	# load the initial crc value
+	movd	arg1_low32, %xmm10	# initial crc
+
+	# crc value does not need to be byte-reflected, but it needs
+	# to be moved to the high part of the register.
+	# because data will be byte-reflected and will align with
+	# initial crc at correct place.
+	pslldq	$12, %xmm10
+
+	movdqa  SHUF_MASK(%rip), %xmm11
+	# receive the initial 64B data, xor the initial crc value
+	movdqu	16*0(arg2), %xmm0
+	movdqu	16*1(arg2), %xmm1
+	movdqu	16*2(arg2), %xmm2
+	movdqu	16*3(arg2), %xmm3
+	movdqu	16*4(arg2), %xmm4
+	movdqu	16*5(arg2), %xmm5
+	movdqu	16*6(arg2), %xmm6
+	movdqu	16*7(arg2), %xmm7
+
+	pshufb	%xmm11, %xmm0
+	# XOR the initial_crc value
+	pxor	%xmm10, %xmm0
+	pshufb	%xmm11, %xmm1
+	pshufb	%xmm11, %xmm2
+	pshufb	%xmm11, %xmm3
+	pshufb	%xmm11, %xmm4
+	pshufb	%xmm11, %xmm5
+	pshufb	%xmm11, %xmm6
+	pshufb	%xmm11, %xmm7
+
+	movdqa	rk3(%rip), %xmm10	#xmm10 has rk3 and rk4
+					#imm value of pclmulqdq instruction
+					#will determine which constant to use
+
+	#################################################################
+	# we subtract 256 instead of 128 to save one instruction from the loop
+	sub	$256, arg3
+
+	# at this section of the code, there is 64*x+y (0<=y<64) bytes of
+	# buffer. The _fold_64_B_loop will fold 64B at a time
+	# until we have 64+y Bytes of buffer
+
+
+	# fold 64B at a time. This section of the code folds 4 xmm
+	# registers in parallel
+_fold_64_B_loop:
+
+	# update the buffer pointer
+	add	$128, arg2		#    buf += 64#
+
+	movdqu	16*0(arg2), %xmm9
+	movdqu	16*1(arg2), %xmm12
+	pshufb	%xmm11, %xmm9
+	pshufb	%xmm11, %xmm12
+	movdqa	%xmm0, %xmm8
+	movdqa	%xmm1, %xmm13
+	pclmulqdq	$0x0 , %xmm10, %xmm0
+	pclmulqdq	$0x11, %xmm10, %xmm8
+	pclmulqdq	$0x0 , %xmm10, %xmm1
+	pclmulqdq	$0x11, %xmm10, %xmm13
+	pxor	%xmm9 , %xmm0
+	xorps	%xmm8 , %xmm0
+	pxor	%xmm12, %xmm1
+	xorps	%xmm13, %xmm1
+
+	movdqu	16*2(arg2), %xmm9
+	movdqu	16*3(arg2), %xmm12
+	pshufb	%xmm11, %xmm9
+	pshufb	%xmm11, %xmm12
+	movdqa	%xmm2, %xmm8
+	movdqa	%xmm3, %xmm13
+	pclmulqdq	$0x0, %xmm10, %xmm2
+	pclmulqdq	$0x11, %xmm10, %xmm8
+	pclmulqdq	$0x0, %xmm10, %xmm3
+	pclmulqdq	$0x11, %xmm10, %xmm13
+	pxor	%xmm9 , %xmm2
+	xorps	%xmm8 , %xmm2
+	pxor	%xmm12, %xmm3
+	xorps	%xmm13, %xmm3
+
+	movdqu	16*4(arg2), %xmm9
+	movdqu	16*5(arg2), %xmm12
+	pshufb	%xmm11, %xmm9
+	pshufb	%xmm11, %xmm12
+	movdqa	%xmm4, %xmm8
+	movdqa	%xmm5, %xmm13
+	pclmulqdq	$0x0,  %xmm10, %xmm4
+	pclmulqdq	$0x11, %xmm10, %xmm8
+	pclmulqdq	$0x0,  %xmm10, %xmm5
+	pclmulqdq	$0x11, %xmm10, %xmm13
+	pxor	%xmm9 ,  %xmm4
+	xorps	%xmm8 ,  %xmm4
+	pxor	%xmm12,  %xmm5
+	xorps	%xmm13,  %xmm5
+
+	movdqu	16*6(arg2), %xmm9
+	movdqu	16*7(arg2), %xmm12
+	pshufb	%xmm11, %xmm9
+	pshufb	%xmm11, %xmm12
+	movdqa	%xmm6 , %xmm8
+	movdqa	%xmm7 , %xmm13
+	pclmulqdq	$0x0 , %xmm10, %xmm6
+	pclmulqdq	$0x11, %xmm10, %xmm8
+	pclmulqdq	$0x0 , %xmm10, %xmm7
+	pclmulqdq	$0x11, %xmm10, %xmm13
+	pxor	%xmm9 , %xmm6
+	xorps	%xmm8 , %xmm6
+	pxor	%xmm12, %xmm7
+	xorps	%xmm13, %xmm7
+
+	sub	$128, arg3
+
+	# check if there is another 64B in the buffer to be able to fold
+	jge	_fold_64_B_loop
+	##################################################################
+
+
+	add	$128, arg2
+	# at this point, the buffer pointer is pointing at the last y Bytes
+	# of the buffer the 64B of folded data is in 4 of the xmm
+	# registers: xmm0, xmm1, xmm2, xmm3
+
+
+	# fold the 8 xmm registers to 1 xmm register with different constants
+
+	movdqa	rk9(%rip), %xmm10
+	movdqa	%xmm0, %xmm8
+	pclmulqdq	$0x11, %xmm10, %xmm0
+	pclmulqdq	$0x0 , %xmm10, %xmm8
+	pxor	%xmm8, %xmm7
+	xorps	%xmm0, %xmm7
+
+	movdqa	rk11(%rip), %xmm10
+	movdqa	%xmm1, %xmm8
+	pclmulqdq	 $0x11, %xmm10, %xmm1
+	pclmulqdq	 $0x0 , %xmm10, %xmm8
+	pxor	%xmm8, %xmm7
+	xorps	%xmm1, %xmm7
+
+	movdqa	rk13(%rip), %xmm10
+	movdqa	%xmm2, %xmm8
+	pclmulqdq	 $0x11, %xmm10, %xmm2
+	pclmulqdq	 $0x0 , %xmm10, %xmm8
+	pxor	%xmm8, %xmm7
+	pxor	%xmm2, %xmm7
+
+	movdqa	rk15(%rip), %xmm10
+	movdqa	%xmm3, %xmm8
+	pclmulqdq	$0x11, %xmm10, %xmm3
+	pclmulqdq	$0x0 , %xmm10, %xmm8
+	pxor	%xmm8, %xmm7
+	xorps	%xmm3, %xmm7
+
+	movdqa	rk17(%rip), %xmm10
+	movdqa	%xmm4, %xmm8
+	pclmulqdq	$0x11, %xmm10, %xmm4
+	pclmulqdq	$0x0 , %xmm10, %xmm8
+	pxor	%xmm8, %xmm7
+	pxor	%xmm4, %xmm7
+
+	movdqa	rk19(%rip), %xmm10
+	movdqa	%xmm5, %xmm8
+	pclmulqdq	$0x11, %xmm10, %xmm5
+	pclmulqdq	$0x0 , %xmm10, %xmm8
+	pxor	%xmm8, %xmm7
+	xorps	%xmm5, %xmm7
+
+	movdqa	rk1(%rip), %xmm10	#xmm10 has rk1 and rk2
+					#imm value of pclmulqdq instruction
+					#will determine which constant to use
+	movdqa	%xmm6, %xmm8
+	pclmulqdq	$0x11, %xmm10, %xmm6
+	pclmulqdq	$0x0 , %xmm10, %xmm8
+	pxor	%xmm8, %xmm7
+	pxor	%xmm6, %xmm7
+
+
+	# instead of 64, we add 48 to the loop counter to save 1 instruction
+	# from the loop instead of a cmp instruction, we use the negative
+	# flag with the jl instruction
+	add	$128-16, arg3
+	jl	_final_reduction_for_128
+
+	# now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7
+	# and the rest is in memory. We can fold 16 bytes at a time if y>=16
+	# continue folding 16B at a time
+
+_16B_reduction_loop:
+	movdqa	%xmm7, %xmm8
+	pclmulqdq	$0x11, %xmm10, %xmm7
+	pclmulqdq	$0x0 , %xmm10, %xmm8
+	pxor	%xmm8, %xmm7
+	movdqu	(arg2), %xmm0
+	pshufb	%xmm11, %xmm0
+	pxor	%xmm0 , %xmm7
+	add	$16, arg2
+	sub	$16, arg3
+	# instead of a cmp instruction, we utilize the flags with the
+	# jge instruction equivalent of: cmp arg3, 16-16
+	# check if there is any more 16B in the buffer to be able to fold
+	jge	_16B_reduction_loop
+
+	#now we have 16+z bytes left to reduce, where 0<= z < 16.
+	#first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+	# check if any more data to fold. If not, compute the CRC of
+	# the final 128 bits
+	add	$16, arg3
+	je	_128_done
+
+	# here we are getting data that is less than 16 bytes.
+	# since we know that there was data before the pointer, we can
+	# offset the input pointer before the actual point, to receive
+	# exactly 16 bytes. after that the registers need to be adjusted.
+_get_last_two_xmms:
+	movdqa	%xmm7, %xmm2
+
+	movdqu	-16(arg2, arg3), %xmm1
+	pshufb	%xmm11, %xmm1
+
+	# get rid of the extra data that was loaded before
+	# load the shift constant
+	lea	pshufb_shf_table+16(%rip), %rax
+	sub	arg3, %rax
+	movdqu	(%rax), %xmm0
+
+	# shift xmm2 to the left by arg3 bytes
+	pshufb	%xmm0, %xmm2
+
+	# shift xmm7 to the right by 16-arg3 bytes
+	pxor	mask1(%rip), %xmm0
+	pshufb	%xmm0, %xmm7
+	pblendvb	%xmm2, %xmm1	#xmm0 is implicit
+
+	# fold 16 Bytes
+	movdqa	%xmm1, %xmm2
+	movdqa	%xmm7, %xmm8
+	pclmulqdq	$0x11, %xmm10, %xmm7
+	pclmulqdq	$0x0 , %xmm10, %xmm8
+	pxor	%xmm8, %xmm7
+	pxor	%xmm2, %xmm7
+
+_128_done:
+	# compute crc of a 128-bit value
+	movdqa	rk5(%rip), %xmm10	# rk5 and rk6 in xmm10
+	movdqa	%xmm7, %xmm0
+
+	#64b fold
+	pclmulqdq	$0x1, %xmm10, %xmm7
+	pslldq	$8   ,  %xmm0
+	pxor	%xmm0,  %xmm7
+
+	#32b fold
+	movdqa	%xmm7, %xmm0
+
+	pand	mask2(%rip), %xmm0
+
+	psrldq	$12, %xmm7
+	pclmulqdq	$0x10, %xmm10, %xmm7
+	pxor	%xmm0, %xmm7
+
+	#barrett reduction
+_barrett:
+	movdqa	rk7(%rip), %xmm10	# rk7 and rk8 in xmm10
+	movdqa	%xmm7, %xmm0
+	pclmulqdq	$0x01, %xmm10, %xmm7
+	pslldq	$4, %xmm7
+	pclmulqdq	$0x11, %xmm10, %xmm7
+
+	pslldq	$4, %xmm7
+	pxor	%xmm0, %xmm7
+	pextrd	$1, %xmm7, %eax
+
+_cleanup:
+	# scale the result back to 16 bits
+	shr	$16, %eax
+	mov     %rcx, %rsp
+	ret
+
+########################################################################
+
+.align 16
+_less_than_128:
+
+	# check if there is enough buffer to be able to fold 16B at a time
+	cmp	$32, arg3
+	jl	_less_than_32
+	movdqa  SHUF_MASK(%rip), %xmm11
+
+	# now if there is, load the constants
+	movdqa	rk1(%rip), %xmm10	# rk1 and rk2 in xmm10
+
+	movd	arg1_low32, %xmm0	# get the initial crc value
+	pslldq	$12, %xmm0	# align it to its correct place
+	movdqu	(arg2), %xmm7	# load the plaintext
+	pshufb	%xmm11, %xmm7	# byte-reflect the plaintext
+	pxor	%xmm0, %xmm7
+
+
+	# update the buffer pointer
+	add	$16, arg2
+
+	# update the counter. subtract 32 instead of 16 to save one
+	# instruction from the loop
+	sub	$32, arg3
+
+	jmp	_16B_reduction_loop
+
+
+.align 16
+_less_than_32:
+	# mov initial crc to the return value. this is necessary for
+	# zero-length buffers.
+	mov	arg1_low32, %eax
+	test	arg3, arg3
+	je	_cleanup
+
+	movdqa  SHUF_MASK(%rip), %xmm11
+
+	movd	arg1_low32, %xmm0	# get the initial crc value
+	pslldq	$12, %xmm0	# align it to its correct place
+
+	cmp	$16, arg3
+	je	_exact_16_left
+	jl	_less_than_16_left
+
+	movdqu	(arg2), %xmm7	# load the plaintext
+	pshufb	%xmm11, %xmm7	# byte-reflect the plaintext
+	pxor	%xmm0 , %xmm7	# xor the initial crc value
+	add	$16, arg2
+	sub	$16, arg3
+	movdqa	rk1(%rip), %xmm10	# rk1 and rk2 in xmm10
+	jmp	_get_last_two_xmms
+
+
+.align 16
+_less_than_16_left:
+	# use stack space to load data less than 16 bytes, zero-out
+	# the 16B in memory first.
+
+	pxor	%xmm1, %xmm1
+	mov	%rsp, %r11
+	movdqa	%xmm1, (%r11)
+
+	cmp	$4, arg3
+	jl	_only_less_than_4
+
+	# backup the counter value
+	mov	arg3, %r9
+	cmp	$8, arg3
+	jl	_less_than_8_left
+
+	# load 8 Bytes
+	mov	(arg2), %rax
+	mov	%rax, (%r11)
+	add	$8, %r11
+	sub	$8, arg3
+	add	$8, arg2
+_less_than_8_left:
+
+	cmp	$4, arg3
+	jl	_less_than_4_left
+
+	# load 4 Bytes
+	mov	(arg2), %eax
+	mov	%eax, (%r11)
+	add	$4, %r11
+	sub	$4, arg3
+	add	$4, arg2
+_less_than_4_left:
+
+	cmp	$2, arg3
+	jl	_less_than_2_left
+
+	# load 2 Bytes
+	mov	(arg2), %ax
+	mov	%ax, (%r11)
+	add	$2, %r11
+	sub	$2, arg3
+	add	$2, arg2
+_less_than_2_left:
+	cmp     $1, arg3
+        jl      _zero_left
+
+	# load 1 Byte
+	mov	(arg2), %al
+	mov	%al, (%r11)
+_zero_left:
+	movdqa	(%rsp), %xmm7
+	pshufb	%xmm11, %xmm7
+	pxor	%xmm0 , %xmm7	# xor the initial crc value
+
+	# shl r9, 4
+	lea	pshufb_shf_table+16(%rip), %rax
+	sub	%r9, %rax
+	movdqu	(%rax), %xmm0
+	pxor	mask1(%rip), %xmm0
+
+	pshufb	%xmm0, %xmm7
+	jmp	_128_done
+
+.align 16
+_exact_16_left:
+	movdqu	(arg2), %xmm7
+	pshufb	%xmm11, %xmm7
+	pxor	%xmm0 , %xmm7   # xor the initial crc value
+
+	jmp	_128_done
+
+_only_less_than_4:
+	cmp	$3, arg3
+	jl	_only_less_than_3
+
+	# load 3 Bytes
+	mov	(arg2), %al
+	mov	%al, (%r11)
+
+	mov	1(arg2), %al
+	mov	%al, 1(%r11)
+
+	mov	2(arg2), %al
+	mov	%al, 2(%r11)
+
+	movdqa	 (%rsp), %xmm7
+	pshufb	 %xmm11, %xmm7
+	pxor	 %xmm0 , %xmm7  # xor the initial crc value
+
+	psrldq	$5, %xmm7
+
+	jmp	_barrett
+_only_less_than_3:
+	cmp	$2, arg3
+	jl	_only_less_than_2
+
+	# load 2 Bytes
+	mov	(arg2), %al
+	mov	%al, (%r11)
+
+	mov	1(arg2), %al
+	mov	%al, 1(%r11)
+
+	movdqa	(%rsp), %xmm7
+	pshufb	%xmm11, %xmm7
+	pxor	%xmm0 , %xmm7   # xor the initial crc value
+
+	psrldq	$6, %xmm7
+
+	jmp	_barrett
+_only_less_than_2:
+
+	# load 1 Byte
+	mov	(arg2), %al
+	mov	%al, (%r11)
+
+	movdqa	(%rsp), %xmm7
+	pshufb	%xmm11, %xmm7
+	pxor	%xmm0 , %xmm7   # xor the initial crc value
+
+	psrldq	$7, %xmm7
+
+	jmp	_barrett
+
+ENDPROC(crc_t10dif_pcl)
+
+.data
+
+# precomputed constants
+# these constants are precomputed from the poly:
+# 0x8bb70000 (0x8bb7 scaled to 32 bits)
+.align 16
+# Q = 0x18BB70000
+# rk1 = 2^(32*3) mod Q << 32
+# rk2 = 2^(32*5) mod Q << 32
+# rk3 = 2^(32*15) mod Q << 32
+# rk4 = 2^(32*17) mod Q << 32
+# rk5 = 2^(32*3) mod Q << 32
+# rk6 = 2^(32*2) mod Q << 32
+# rk7 = floor(2^64/Q)
+# rk8 = Q
+rk1:
+.quad 0x2d56000000000000
+rk2:
+.quad 0x06df000000000000
+rk3:
+.quad 0x9d9d000000000000
+rk4:
+.quad 0x7cf5000000000000
+rk5:
+.quad 0x2d56000000000000
+rk6:
+.quad 0x1368000000000000
+rk7:
+.quad 0x00000001f65a57f8
+rk8:
+.quad 0x000000018bb70000
+
+rk9:
+.quad 0xceae000000000000
+rk10:
+.quad 0xbfd6000000000000
+rk11:
+.quad 0x1e16000000000000
+rk12:
+.quad 0x713c000000000000
+rk13:
+.quad 0xf7f9000000000000
+rk14:
+.quad 0x80a6000000000000
+rk15:
+.quad 0x044c000000000000
+rk16:
+.quad 0xe658000000000000
+rk17:
+.quad 0xad18000000000000
+rk18:
+.quad 0xa497000000000000
+rk19:
+.quad 0x6ee3000000000000
+rk20:
+.quad 0xe7b5000000000000
+
+
+
+mask1:
+.octa 0x80808080808080808080808080808080
+mask2:
+.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
+
+SHUF_MASK:
+.octa 0x000102030405060708090A0B0C0D0E0F
+
+pshufb_shf_table:
+# use these values for shift constants for the pshufb instruction
+# different alignments result in values as shown:
+#	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
+#	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
+#	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
+#	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
+#	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
+#	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
+#	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
+#	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
+#	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
+#	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
+#	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
+#	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
+#	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
+#	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
+#	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15
+.octa 0x8f8e8d8c8b8a89888786858483828100
+.octa 0x000e0d0c0b0a09080706050403020100
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
new file mode 100644
index 000000000000..7845d7fd54c0
--- /dev/null
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -0,0 +1,151 @@
+/*
+ * Cryptographic API.
+ *
+ * T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions
+ *
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/crc-t10dif.h>
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <asm/i387.h>
+#include <asm/cpufeature.h>
+#include <asm/cpu_device_id.h>
+
+asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf,
+				size_t len);
+
+struct chksum_desc_ctx {
+	__u16 crc;
+};
+
+/*
+ * Steps through buffer one byte at at time, calculates reflected
+ * crc using table.
+ */
+
+static int chksum_init(struct shash_desc *desc)
+{
+	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	ctx->crc = 0;
+
+	return 0;
+}
+
+static int chksum_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int length)
+{
+	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	if (irq_fpu_usable()) {
+		kernel_fpu_begin();
+		ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);
+		kernel_fpu_end();
+	} else
+		ctx->crc = crc_t10dif_generic(ctx->crc, data, length);
+	return 0;
+}
+
+static int chksum_final(struct shash_desc *desc, u8 *out)
+{
+	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	*(__u16 *)out = ctx->crc;
+	return 0;
+}
+
+static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len,
+			u8 *out)
+{
+	if (irq_fpu_usable()) {
+		kernel_fpu_begin();
+		*(__u16 *)out = crc_t10dif_pcl(*crcp, data, len);
+		kernel_fpu_end();
+	} else
+		*(__u16 *)out = crc_t10dif_generic(*crcp, data, len);
+	return 0;
+}
+
+static int chksum_finup(struct shash_desc *desc, const u8 *data,
+			unsigned int len, u8 *out)
+{
+	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	return __chksum_finup(&ctx->crc, data, len, out);
+}
+
+static int chksum_digest(struct shash_desc *desc, const u8 *data,
+			 unsigned int length, u8 *out)
+{
+	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	return __chksum_finup(&ctx->crc, data, length, out);
+}
+
+static struct shash_alg alg = {
+	.digestsize		=	CRC_T10DIF_DIGEST_SIZE,
+	.init		=	chksum_init,
+	.update		=	chksum_update,
+	.final		=	chksum_final,
+	.finup		=	chksum_finup,
+	.digest		=	chksum_digest,
+	.descsize		=	sizeof(struct chksum_desc_ctx),
+	.base			=	{
+		.cra_name		=	"crct10dif",
+		.cra_driver_name	=	"crct10dif-pclmul",
+		.cra_priority		=	200,
+		.cra_blocksize		=	CRC_T10DIF_BLOCK_SIZE,
+		.cra_module		=	THIS_MODULE,
+	}
+};
+
+static const struct x86_cpu_id crct10dif_cpu_id[] = {
+	X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id);
+
+static int __init crct10dif_intel_mod_init(void)
+{
+	if (!x86_match_cpu(crct10dif_cpu_id))
+		return -ENODEV;
+
+	return crypto_register_shash(&alg);
+}
+
+static void __exit crct10dif_intel_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(crct10dif_intel_mod_init);
+module_exit(crct10dif_intel_mod_fini);
+
+MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>");
+MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ.");
+MODULE_LICENSE("GPL");
+
+MODULE_ALIAS("crct10dif");
+MODULE_ALIAS("crct10dif-pclmul");
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index 597d4da69656..50226c4b86ed 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -187,7 +187,36 @@ static int sha256_ssse3_import(struct shash_desc *desc, const void *in)
 	return 0;
 }
 
-static struct shash_alg alg = {
+static int sha224_ssse3_init(struct shash_desc *desc)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	sctx->state[0] = SHA224_H0;
+	sctx->state[1] = SHA224_H1;
+	sctx->state[2] = SHA224_H2;
+	sctx->state[3] = SHA224_H3;
+	sctx->state[4] = SHA224_H4;
+	sctx->state[5] = SHA224_H5;
+	sctx->state[6] = SHA224_H6;
+	sctx->state[7] = SHA224_H7;
+	sctx->count = 0;
+
+	return 0;
+}
+
+static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash)
+{
+	u8 D[SHA256_DIGEST_SIZE];
+
+	sha256_ssse3_final(desc, D);
+
+	memcpy(hash, D, SHA224_DIGEST_SIZE);
+	memset(D, 0, SHA256_DIGEST_SIZE);
+
+	return 0;
+}
+
+static struct shash_alg algs[] = { {
 	.digestsize	=	SHA256_DIGEST_SIZE,
 	.init		=	sha256_ssse3_init,
 	.update		=	sha256_ssse3_update,
@@ -204,7 +233,24 @@ static struct shash_alg alg = {
 		.cra_blocksize	=	SHA256_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
-};
+}, {
+	.digestsize	=	SHA224_DIGEST_SIZE,
+	.init		=	sha224_ssse3_init,
+	.update		=	sha256_ssse3_update,
+	.final		=	sha224_ssse3_final,
+	.export		=	sha256_ssse3_export,
+	.import		=	sha256_ssse3_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha224",
+		.cra_driver_name =	"sha224-ssse3",
+		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA224_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+} };
 
 #ifdef CONFIG_AS_AVX
 static bool __init avx_usable(void)
@@ -227,7 +273,7 @@ static bool __init avx_usable(void)
 
 static int __init sha256_ssse3_mod_init(void)
 {
-	/* test for SSE3 first */
+	/* test for SSSE3 first */
 	if (cpu_has_ssse3)
 		sha256_transform_asm = sha256_transform_ssse3;
 
@@ -254,7 +300,7 @@ static int __init sha256_ssse3_mod_init(void)
 		else
 #endif
 			pr_info("Using SSSE3 optimized SHA-256 implementation\n");
-		return crypto_register_shash(&alg);
+		return crypto_register_shashes(algs, ARRAY_SIZE(algs));
 	}
 	pr_info("Neither AVX nor SSSE3 is available/usable.\n");
 
@@ -263,7 +309,7 @@ static int __init sha256_ssse3_mod_init(void)
 
 static void __exit sha256_ssse3_mod_fini(void)
 {
-	crypto_unregister_shash(&alg);
+	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
 }
 
 module_init(sha256_ssse3_mod_init);
@@ -273,3 +319,4 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated");
 
 MODULE_ALIAS("sha256");
+MODULE_ALIAS("sha384");
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 6cbd8df348d2..f30cd10293f0 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -194,7 +194,37 @@ static int sha512_ssse3_import(struct shash_desc *desc, const void *in)
 	return 0;
 }
 
-static struct shash_alg alg = {
+static int sha384_ssse3_init(struct shash_desc *desc)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	sctx->state[0] = SHA384_H0;
+	sctx->state[1] = SHA384_H1;
+	sctx->state[2] = SHA384_H2;
+	sctx->state[3] = SHA384_H3;
+	sctx->state[4] = SHA384_H4;
+	sctx->state[5] = SHA384_H5;
+	sctx->state[6] = SHA384_H6;
+	sctx->state[7] = SHA384_H7;
+
+	sctx->count[0] = sctx->count[1] = 0;
+
+	return 0;
+}
+
+static int sha384_ssse3_final(struct shash_desc *desc, u8 *hash)
+{
+	u8 D[SHA512_DIGEST_SIZE];
+
+	sha512_ssse3_final(desc, D);
+
+	memcpy(hash, D, SHA384_DIGEST_SIZE);
+	memset(D, 0, SHA512_DIGEST_SIZE);
+
+	return 0;
+}
+
+static struct shash_alg algs[] = { {
 	.digestsize	=	SHA512_DIGEST_SIZE,
 	.init		=	sha512_ssse3_init,
 	.update		=	sha512_ssse3_update,
@@ -211,7 +241,24 @@ static struct shash_alg alg = {
 		.cra_blocksize	=	SHA512_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
-};
+},  {
+	.digestsize	=	SHA384_DIGEST_SIZE,
+	.init		=	sha384_ssse3_init,
+	.update		=	sha512_ssse3_update,
+	.final		=	sha384_ssse3_final,
+	.export		=	sha512_ssse3_export,
+	.import		=	sha512_ssse3_import,
+	.descsize	=	sizeof(struct sha512_state),
+	.statesize	=	sizeof(struct sha512_state),
+	.base		=	{
+		.cra_name	=	"sha384",
+		.cra_driver_name =	"sha384-ssse3",
+		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA384_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+} };
 
 #ifdef CONFIG_AS_AVX
 static bool __init avx_usable(void)
@@ -234,7 +281,7 @@ static bool __init avx_usable(void)
 
 static int __init sha512_ssse3_mod_init(void)
 {
-	/* test for SSE3 first */
+	/* test for SSSE3 first */
 	if (cpu_has_ssse3)
 		sha512_transform_asm = sha512_transform_ssse3;
 
@@ -261,7 +308,7 @@ static int __init sha512_ssse3_mod_init(void)
 		else
 #endif
 			pr_info("Using SSSE3 optimized SHA-512 implementation\n");
-		return crypto_register_shash(&alg);
+		return crypto_register_shashes(algs, ARRAY_SIZE(algs));
 	}
 	pr_info("Neither AVX nor SSSE3 is available/usable.\n");
 
@@ -270,7 +317,7 @@ static int __init sha512_ssse3_mod_init(void)
 
 static void __exit sha512_ssse3_mod_fini(void)
 {
-	crypto_unregister_shash(&alg);
+	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
 }
 
 module_init(sha512_ssse3_mod_init);
@@ -280,3 +327,4 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated");
 
 MODULE_ALIAS("sha512");
+MODULE_ALIAS("sha384");
diff --git a/arch/x86/crypto/twofish-avx2-asm_64.S b/arch/x86/crypto/twofish-avx2-asm_64.S
deleted file mode 100644
index e1a83b9cd389..000000000000
--- a/arch/x86/crypto/twofish-avx2-asm_64.S
+++ /dev/null
@@ -1,600 +0,0 @@
-/*
- * x86_64/AVX2 assembler optimized version of Twofish
- *
- * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- */
-
-#include <linux/linkage.h>
-#include "glue_helper-asm-avx2.S"
-
-.file "twofish-avx2-asm_64.S"
-
-.data
-.align 16
-
-.Lvpshufb_mask0:
-.long 0x80808000
-.long 0x80808004
-.long 0x80808008
-.long 0x8080800c
-
-.Lbswap128_mask:
-	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-.Lxts_gf128mul_and_shl1_mask_0:
-	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
-.Lxts_gf128mul_and_shl1_mask_1:
-	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
-
-.text
-
-/* structure of crypto context */
-#define s0	0
-#define s1	1024
-#define s2	2048
-#define s3	3072
-#define w	4096
-#define	k	4128
-
-/* register macros */
-#define CTX	%rdi
-
-#define RS0	CTX
-#define RS1	%r8
-#define RS2	%r9
-#define RS3	%r10
-#define RK	%r11
-#define RW	%rax
-#define RROUND  %r12
-#define RROUNDd %r12d
-
-#define RA0	%ymm8
-#define RB0	%ymm9
-#define RC0	%ymm10
-#define RD0	%ymm11
-#define RA1	%ymm12
-#define RB1	%ymm13
-#define RC1	%ymm14
-#define RD1	%ymm15
-
-/* temp regs */
-#define RX0	%ymm0
-#define RY0	%ymm1
-#define RX1	%ymm2
-#define RY1	%ymm3
-#define RT0	%ymm4
-#define RIDX	%ymm5
-
-#define RX0x	%xmm0
-#define RY0x	%xmm1
-#define RX1x	%xmm2
-#define RY1x	%xmm3
-#define RT0x	%xmm4
-
-/* vpgatherdd mask and '-1' */
-#define RNOT	%ymm6
-
-/* byte mask, (-1 >> 24) */
-#define RBYTE	%ymm7
-
-/**********************************************************************
-  16-way AVX2 twofish
- **********************************************************************/
-#define init_round_constants() \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpsrld $24, RNOT, RBYTE; \
-	leaq k(CTX), RK; \
-	leaq w(CTX), RW; \
-	leaq s1(CTX), RS1; \
-	leaq s2(CTX), RS2; \
-	leaq s3(CTX), RS3; \
-
-#define g16(ab, rs0, rs1, rs2, rs3, xy) \
-	vpand RBYTE, ab ## 0, RIDX; \
-	vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-		\
-		vpand RBYTE, ab ## 1, RIDX; \
-		vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \
-		vpcmpeqd RNOT, RNOT, RNOT; \
-	\
-	vpsrld $8, ab ## 0, RIDX; \
-	vpand RBYTE, RIDX, RIDX; \
-	vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpxor RT0, xy ## 0, xy ## 0; \
-		\
-		vpsrld $8, ab ## 1, RIDX; \
-		vpand RBYTE, RIDX, RIDX; \
-		vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
-		vpcmpeqd RNOT, RNOT, RNOT; \
-		vpxor RT0, xy ## 1, xy ## 1; \
-	\
-	vpsrld $16, ab ## 0, RIDX; \
-	vpand RBYTE, RIDX, RIDX; \
-	vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpxor RT0, xy ## 0, xy ## 0; \
-		\
-		vpsrld $16, ab ## 1, RIDX; \
-		vpand RBYTE, RIDX, RIDX; \
-		vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
-		vpcmpeqd RNOT, RNOT, RNOT; \
-		vpxor RT0, xy ## 1, xy ## 1; \
-	\
-	vpsrld $24, ab ## 0, RIDX; \
-	vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpxor RT0, xy ## 0, xy ## 0; \
-		\
-		vpsrld $24, ab ## 1, RIDX; \
-		vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
-		vpcmpeqd RNOT, RNOT, RNOT; \
-		vpxor RT0, xy ## 1, xy ## 1;
-
-#define g1_16(a, x) \
-	g16(a, RS0, RS1, RS2, RS3, x);
-
-#define g2_16(b, y) \
-	g16(b, RS1, RS2, RS3, RS0, y);
-
-#define encrypt_round_end16(a, b, c, d, nk) \
-	vpaddd RY0, RX0, RX0; \
-	vpaddd RX0, RY0, RY0; \
-	vpbroadcastd nk(RK,RROUND,8), RT0; \
-	vpaddd RT0, RX0, RX0; \
-	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
-	vpaddd RT0, RY0, RY0; \
-	\
-	vpxor RY0, d ## 0, d ## 0; \
-	\
-	vpxor RX0, c ## 0, c ## 0; \
-	vpsrld $1, c ## 0, RT0; \
-	vpslld $31, c ## 0, c ## 0; \
-	vpor RT0, c ## 0, c ## 0; \
-	\
-		vpaddd RY1, RX1, RX1; \
-		vpaddd RX1, RY1, RY1; \
-		vpbroadcastd nk(RK,RROUND,8), RT0; \
-		vpaddd RT0, RX1, RX1; \
-		vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
-		vpaddd RT0, RY1, RY1; \
-		\
-		vpxor RY1, d ## 1, d ## 1; \
-		\
-		vpxor RX1, c ## 1, c ## 1; \
-		vpsrld $1, c ## 1, RT0; \
-		vpslld $31, c ## 1, c ## 1; \
-		vpor RT0, c ## 1, c ## 1; \
-
-#define encrypt_round16(a, b, c, d, nk) \
-	g2_16(b, RY); \
-	\
-	vpslld $1, b ## 0, RT0; \
-	vpsrld $31, b ## 0, b ## 0; \
-	vpor RT0, b ## 0, b ## 0; \
-	\
-		vpslld $1, b ## 1, RT0; \
-		vpsrld $31, b ## 1, b ## 1; \
-		vpor RT0, b ## 1, b ## 1; \
-	\
-	g1_16(a, RX); \
-	\
-	encrypt_round_end16(a, b, c, d, nk);
-
-#define encrypt_round_first16(a, b, c, d, nk) \
-	vpslld $1, d ## 0, RT0; \
-	vpsrld $31, d ## 0, d ## 0; \
-	vpor RT0, d ## 0, d ## 0; \
-	\
-		vpslld $1, d ## 1, RT0; \
-		vpsrld $31, d ## 1, d ## 1; \
-		vpor RT0, d ## 1, d ## 1; \
-	\
-	encrypt_round16(a, b, c, d, nk);
-
-#define encrypt_round_last16(a, b, c, d, nk) \
-	g2_16(b, RY); \
-	\
-	g1_16(a, RX); \
-	\
-	encrypt_round_end16(a, b, c, d, nk);
-
-#define decrypt_round_end16(a, b, c, d, nk) \
-	vpaddd RY0, RX0, RX0; \
-	vpaddd RX0, RY0, RY0; \
-	vpbroadcastd nk(RK,RROUND,8), RT0; \
-	vpaddd RT0, RX0, RX0; \
-	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
-	vpaddd RT0, RY0, RY0; \
-	\
-	vpxor RX0, c ## 0, c ## 0; \
-	\
-	vpxor RY0, d ## 0, d ## 0; \
-	vpsrld $1, d ## 0, RT0; \
-	vpslld $31, d ## 0, d ## 0; \
-	vpor RT0, d ## 0, d ## 0; \
-	\
-		vpaddd RY1, RX1, RX1; \
-		vpaddd RX1, RY1, RY1; \
-		vpbroadcastd nk(RK,RROUND,8), RT0; \
-		vpaddd RT0, RX1, RX1; \
-		vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
-		vpaddd RT0, RY1, RY1; \
-		\
-		vpxor RX1, c ## 1, c ## 1; \
-		\
-		vpxor RY1, d ## 1, d ## 1; \
-		vpsrld $1, d ## 1, RT0; \
-		vpslld $31, d ## 1, d ## 1; \
-		vpor RT0, d ## 1, d ## 1;
-
-#define decrypt_round16(a, b, c, d, nk) \
-	g1_16(a, RX); \
-	\
-	vpslld $1, a ## 0, RT0; \
-	vpsrld $31, a ## 0, a ## 0; \
-	vpor RT0, a ## 0, a ## 0; \
-	\
-		vpslld $1, a ## 1, RT0; \
-		vpsrld $31, a ## 1, a ## 1; \
-		vpor RT0, a ## 1, a ## 1; \
-	\
-	g2_16(b, RY); \
-	\
-	decrypt_round_end16(a, b, c, d, nk);
-
-#define decrypt_round_first16(a, b, c, d, nk) \
-	vpslld $1, c ## 0, RT0; \
-	vpsrld $31, c ## 0, c ## 0; \
-	vpor RT0, c ## 0, c ## 0; \
-	\
-		vpslld $1, c ## 1, RT0; \
-		vpsrld $31, c ## 1, c ## 1; \
-		vpor RT0, c ## 1, c ## 1; \
-	\
-	decrypt_round16(a, b, c, d, nk)
-
-#define decrypt_round_last16(a, b, c, d, nk) \
-	g1_16(a, RX); \
-	\
-	g2_16(b, RY); \
-	\
-	decrypt_round_end16(a, b, c, d, nk);
-
-#define encrypt_cycle16() \
-	encrypt_round16(RA, RB, RC, RD, 0); \
-	encrypt_round16(RC, RD, RA, RB, 8);
-
-#define encrypt_cycle_first16() \
-	encrypt_round_first16(RA, RB, RC, RD, 0); \
-	encrypt_round16(RC, RD, RA, RB, 8);
-
-#define encrypt_cycle_last16() \
-	encrypt_round16(RA, RB, RC, RD, 0); \
-	encrypt_round_last16(RC, RD, RA, RB, 8);
-
-#define decrypt_cycle16(n) \
-	decrypt_round16(RC, RD, RA, RB, 8); \
-	decrypt_round16(RA, RB, RC, RD, 0);
-
-#define decrypt_cycle_first16(n) \
-	decrypt_round_first16(RC, RD, RA, RB, 8); \
-	decrypt_round16(RA, RB, RC, RD, 0);
-
-#define decrypt_cycle_last16(n) \
-	decrypt_round16(RC, RD, RA, RB, 8); \
-	decrypt_round_last16(RA, RB, RC, RD, 0);
-
-#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
-	vpunpckhdq x1, x0, t2; \
-	vpunpckldq x1, x0, x0; \
-	\
-	vpunpckldq x3, x2, t1; \
-	vpunpckhdq x3, x2, x2; \
-	\
-	vpunpckhqdq t1,	x0, x1; \
-	vpunpcklqdq t1,	x0, x0; \
-	\
-	vpunpckhqdq x2, t2, x3; \
-	vpunpcklqdq x2,	t2, x2;
-
-#define read_blocks8(offs,a,b,c,d) \
-	transpose_4x4(a, b, c, d, RX0, RY0);
-
-#define write_blocks8(offs,a,b,c,d) \
-	transpose_4x4(a, b, c, d, RX0, RY0);
-
-#define inpack_enc8(a,b,c,d) \
-	vpbroadcastd 4*0(RW), RT0; \
-	vpxor RT0, a, a; \
-	\
-	vpbroadcastd 4*1(RW), RT0; \
-	vpxor RT0, b, b; \
-	\
-	vpbroadcastd 4*2(RW), RT0; \
-	vpxor RT0, c, c; \
-	\
-	vpbroadcastd 4*3(RW), RT0; \
-	vpxor RT0, d, d;
-
-#define outunpack_enc8(a,b,c,d) \
-	vpbroadcastd 4*4(RW), RX0; \
-	vpbroadcastd 4*5(RW), RY0; \
-	vpxor RX0, c, RX0; \
-	vpxor RY0, d, RY0; \
-	\
-	vpbroadcastd 4*6(RW), RT0; \
-	vpxor RT0, a, c; \
-	vpbroadcastd 4*7(RW), RT0; \
-	vpxor RT0, b, d; \
-	\
-	vmovdqa RX0, a; \
-	vmovdqa RY0, b;
-
-#define inpack_dec8(a,b,c,d) \
-	vpbroadcastd 4*4(RW), RX0; \
-	vpbroadcastd 4*5(RW), RY0; \
-	vpxor RX0, a, RX0; \
-	vpxor RY0, b, RY0; \
-	\
-	vpbroadcastd 4*6(RW), RT0; \
-	vpxor RT0, c, a; \
-	vpbroadcastd 4*7(RW), RT0; \
-	vpxor RT0, d, b; \
-	\
-	vmovdqa RX0, c; \
-	vmovdqa RY0, d;
-
-#define outunpack_dec8(a,b,c,d) \
-	vpbroadcastd 4*0(RW), RT0; \
-	vpxor RT0, a, a; \
-	\
-	vpbroadcastd 4*1(RW), RT0; \
-	vpxor RT0, b, b; \
-	\
-	vpbroadcastd 4*2(RW), RT0; \
-	vpxor RT0, c, c; \
-	\
-	vpbroadcastd 4*3(RW), RT0; \
-	vpxor RT0, d, d;
-
-#define read_blocks16(a,b,c,d) \
-	read_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
-	read_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
-
-#define write_blocks16(a,b,c,d) \
-	write_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
-	write_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
-
-#define xor_blocks16(a,b,c,d) \
-	xor_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
-	xor_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
-
-#define inpack_enc16(a,b,c,d) \
-	inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
-	inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
-
-#define outunpack_enc16(a,b,c,d) \
-	outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
-	outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
-
-#define inpack_dec16(a,b,c,d) \
-	inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
-	inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
-
-#define outunpack_dec16(a,b,c,d) \
-	outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
-	outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
-
-.align 8
-__twofish_enc_blk16:
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
-	 * output:
-	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
-	 */
-	init_round_constants();
-
-	read_blocks16(RA, RB, RC, RD);
-	inpack_enc16(RA, RB, RC, RD);
-
-	xorl RROUNDd, RROUNDd;
-	encrypt_cycle_first16();
-	movl $2, RROUNDd;
-
-.align 4
-.L__enc_loop:
-	encrypt_cycle16();
-
-	addl $2, RROUNDd;
-	cmpl $14, RROUNDd;
-	jne .L__enc_loop;
-
-	encrypt_cycle_last16();
-
-	outunpack_enc16(RA, RB, RC, RD);
-	write_blocks16(RA, RB, RC, RD);
-
-	ret;
-ENDPROC(__twofish_enc_blk16)
-
-.align 8
-__twofish_dec_blk16:
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
-	 * output:
-	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
-	 */
-	init_round_constants();
-
-	read_blocks16(RA, RB, RC, RD);
-	inpack_dec16(RA, RB, RC, RD);
-
-	movl $14, RROUNDd;
-	decrypt_cycle_first16();
-	movl $12, RROUNDd;
-
-.align 4
-.L__dec_loop:
-	decrypt_cycle16();
-
-	addl $-2, RROUNDd;
-	jnz .L__dec_loop;
-
-	decrypt_cycle_last16();
-
-	outunpack_dec16(RA, RB, RC, RD);
-	write_blocks16(RA, RB, RC, RD);
-
-	ret;
-ENDPROC(__twofish_dec_blk16)
-
-ENTRY(twofish_ecb_enc_16way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-	pushq %r12;
-
-	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	call __twofish_enc_blk16;
-
-	store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	popq %r12;
-	vzeroupper;
-
-	ret;
-ENDPROC(twofish_ecb_enc_16way)
-
-ENTRY(twofish_ecb_dec_16way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-	pushq %r12;
-
-	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	call __twofish_dec_blk16;
-
-	store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	popq %r12;
-	vzeroupper;
-
-	ret;
-ENDPROC(twofish_ecb_dec_16way)
-
-ENTRY(twofish_cbc_dec_16way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-	pushq %r12;
-
-	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	call __twofish_dec_blk16;
-
-	store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1,
-			RX0);
-
-	popq %r12;
-	vzeroupper;
-
-	ret;
-ENDPROC(twofish_cbc_dec_16way)
-
-ENTRY(twofish_ctr_16way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst (16 blocks)
-	 *	%rdx: src (16 blocks)
-	 *	%rcx: iv (little endian, 128bit)
-	 */
-
-	vzeroupper;
-	pushq %r12;
-
-	load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
-		       RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
-		       RBYTE);
-
-	call __twofish_enc_blk16;
-
-	store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	popq %r12;
-	vzeroupper;
-
-	ret;
-ENDPROC(twofish_ctr_16way)
-
-.align 8
-twofish_xts_crypt_16way:
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst (16 blocks)
-	 *	%rdx: src (16 blocks)
-	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
-	 *	%r8: pointer to __twofish_enc_blk16 or __twofish_dec_blk16
-	 */
-
-	vzeroupper;
-	pushq %r12;
-
-	load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
-		       RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
-		       .Lxts_gf128mul_and_shl1_mask_0,
-		       .Lxts_gf128mul_and_shl1_mask_1);
-
-	call *%r8;
-
-	store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	popq %r12;
-	vzeroupper;
-
-	ret;
-ENDPROC(twofish_xts_crypt_16way)
-
-ENTRY(twofish_xts_enc_16way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst (16 blocks)
-	 *	%rdx: src (16 blocks)
-	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
-	 */
-	leaq __twofish_enc_blk16, %r8;
-	jmp twofish_xts_crypt_16way;
-ENDPROC(twofish_xts_enc_16way)
-
-ENTRY(twofish_xts_dec_16way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst (16 blocks)
-	 *	%rdx: src (16 blocks)
-	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
-	 */
-	leaq __twofish_dec_blk16, %r8;
-	jmp twofish_xts_crypt_16way;
-ENDPROC(twofish_xts_dec_16way)
diff --git a/arch/x86/crypto/twofish_avx2_glue.c b/arch/x86/crypto/twofish_avx2_glue.c
deleted file mode 100644
index ce33b5be64ee..000000000000
--- a/arch/x86/crypto/twofish_avx2_glue.c
+++ /dev/null
@@ -1,584 +0,0 @@
-/*
- * Glue Code for x86_64/AVX2 assembler optimized version of Twofish
- *
- * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/crypto.h>
-#include <linux/err.h>
-#include <crypto/algapi.h>
-#include <crypto/ctr.h>
-#include <crypto/twofish.h>
-#include <crypto/lrw.h>
-#include <crypto/xts.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
-#include <asm/crypto/twofish.h>
-#include <asm/crypto/ablk_helper.h>
-#include <asm/crypto/glue_helper.h>
-#include <crypto/scatterwalk.h>
-
-#define TF_AVX2_PARALLEL_BLOCKS 16
-
-/* 16-way AVX2 parallel cipher functions */
-asmlinkage void twofish_ecb_enc_16way(struct twofish_ctx *ctx, u8 *dst,
-				      const u8 *src);
-asmlinkage void twofish_ecb_dec_16way(struct twofish_ctx *ctx, u8 *dst,
-				      const u8 *src);
-asmlinkage void twofish_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src);
-
-asmlinkage void twofish_ctr_16way(void *ctx, u128 *dst, const u128 *src,
-				  le128 *iv);
-
-asmlinkage void twofish_xts_enc_16way(struct twofish_ctx *ctx, u8 *dst,
-				      const u8 *src, le128 *iv);
-asmlinkage void twofish_xts_dec_16way(struct twofish_ctx *ctx, u8 *dst,
-				      const u8 *src, le128 *iv);
-
-static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
-					const u8 *src)
-{
-	__twofish_enc_blk_3way(ctx, dst, src, false);
-}
-
-static const struct common_glue_ctx twofish_enc = {
-	.num_funcs = 4,
-	.fpu_blocks_limit = 8,
-
-	.funcs = { {
-		.num_blocks = 16,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_16way) }
-	}, {
-		.num_blocks = 8,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
-	}, {
-		.num_blocks = 3,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
-	}, {
-		.num_blocks = 1,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
-	} }
-};
-
-static const struct common_glue_ctx twofish_ctr = {
-	.num_funcs = 4,
-	.fpu_blocks_limit = 8,
-
-	.funcs = { {
-		.num_blocks = 16,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_16way) }
-	},  {
-		.num_blocks = 8,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
-	}, {
-		.num_blocks = 3,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
-	}, {
-		.num_blocks = 1,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) }
-	} }
-};
-
-static const struct common_glue_ctx twofish_enc_xts = {
-	.num_funcs = 3,
-	.fpu_blocks_limit = 8,
-
-	.funcs = { {
-		.num_blocks = 16,
-		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_16way) }
-	}, {
-		.num_blocks = 8,
-		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) }
-	}, {
-		.num_blocks = 1,
-		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) }
-	} }
-};
-
-static const struct common_glue_ctx twofish_dec = {
-	.num_funcs = 4,
-	.fpu_blocks_limit = 8,
-
-	.funcs = { {
-		.num_blocks = 16,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_16way) }
-	}, {
-		.num_blocks = 8,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
-	}, {
-		.num_blocks = 3,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
-	}, {
-		.num_blocks = 1,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
-	} }
-};
-
-static const struct common_glue_ctx twofish_dec_cbc = {
-	.num_funcs = 4,
-	.fpu_blocks_limit = 8,
-
-	.funcs = { {
-		.num_blocks = 16,
-		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_16way) }
-	}, {
-		.num_blocks = 8,
-		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
-	}, {
-		.num_blocks = 3,
-		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
-	}, {
-		.num_blocks = 1,
-		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
-	} }
-};
-
-static const struct common_glue_ctx twofish_dec_xts = {
-	.num_funcs = 3,
-	.fpu_blocks_limit = 8,
-
-	.funcs = { {
-		.num_blocks = 16,
-		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_16way) }
-	}, {
-		.num_blocks = 8,
-		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) }
-	}, {
-		.num_blocks = 1,
-		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) }
-	} }
-};
-
-static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes);
-}
-
-static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes);
-}
-
-static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc,
-				       dst, src, nbytes);
-}
-
-static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src,
-				       nbytes);
-}
-
-static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		     struct scatterlist *src, unsigned int nbytes)
-{
-	return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes);
-}
-
-static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes)
-{
-	/* since reusing AVX functions, starts using FPU at 8 parallel blocks */
-	return glue_fpu_begin(TF_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes);
-}
-
-static inline void twofish_fpu_end(bool fpu_enabled)
-{
-	glue_fpu_end(fpu_enabled);
-}
-
-struct crypt_priv {
-	struct twofish_ctx *ctx;
-	bool fpu_enabled;
-};
-
-static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
-{
-	const unsigned int bsize = TF_BLOCK_SIZE;
-	struct crypt_priv *ctx = priv;
-	int i;
-
-	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
-
-	while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) {
-		twofish_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
-		srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS;
-		nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS;
-	}
-
-	while (nbytes >= 8 * bsize) {
-		twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
-		srcdst += bsize * 8;
-		nbytes -= bsize * 8;
-	}
-
-	while (nbytes >= 3 * bsize) {
-		twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
-		srcdst += bsize * 3;
-		nbytes -= bsize * 3;
-	}
-
-	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
-		twofish_enc_blk(ctx->ctx, srcdst, srcdst);
-}
-
-static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
-{
-	const unsigned int bsize = TF_BLOCK_SIZE;
-	struct crypt_priv *ctx = priv;
-	int i;
-
-	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
-
-	while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) {
-		twofish_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
-		srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS;
-		nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS;
-	}
-
-	while (nbytes >= 8 * bsize) {
-		twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
-		srcdst += bsize * 8;
-		nbytes -= bsize * 8;
-	}
-
-	while (nbytes >= 3 * bsize) {
-		twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
-		srcdst += bsize * 3;
-		nbytes -= bsize * 3;
-	}
-
-	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
-		twofish_dec_blk(ctx->ctx, srcdst, srcdst);
-}
-
-static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[TF_AVX2_PARALLEL_BLOCKS];
-	struct crypt_priv crypt_ctx = {
-		.ctx = &ctx->twofish_ctx,
-		.fpu_enabled = false,
-	};
-	struct lrw_crypt_req req = {
-		.tbuf = buf,
-		.tbuflen = sizeof(buf),
-
-		.table_ctx = &ctx->lrw_table,
-		.crypt_ctx = &crypt_ctx,
-		.crypt_fn = encrypt_callback,
-	};
-	int ret;
-
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-	ret = lrw_crypt(desc, dst, src, nbytes, &req);
-	twofish_fpu_end(crypt_ctx.fpu_enabled);
-
-	return ret;
-}
-
-static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[TF_AVX2_PARALLEL_BLOCKS];
-	struct crypt_priv crypt_ctx = {
-		.ctx = &ctx->twofish_ctx,
-		.fpu_enabled = false,
-	};
-	struct lrw_crypt_req req = {
-		.tbuf = buf,
-		.tbuflen = sizeof(buf),
-
-		.table_ctx = &ctx->lrw_table,
-		.crypt_ctx = &crypt_ctx,
-		.crypt_fn = decrypt_callback,
-	};
-	int ret;
-
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-	ret = lrw_crypt(desc, dst, src, nbytes, &req);
-	twofish_fpu_end(crypt_ctx.fpu_enabled);
-
-	return ret;
-}
-
-static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-
-	return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes,
-				     XTS_TWEAK_CAST(twofish_enc_blk),
-				     &ctx->tweak_ctx, &ctx->crypt_ctx);
-}
-
-static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-
-	return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes,
-				     XTS_TWEAK_CAST(twofish_enc_blk),
-				     &ctx->tweak_ctx, &ctx->crypt_ctx);
-}
-
-static struct crypto_alg tf_algs[10] = { {
-	.cra_name		= "__ecb-twofish-avx2",
-	.cra_driver_name	= "__driver-ecb-twofish-avx2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= TF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct twofish_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE,
-			.max_keysize	= TF_MAX_KEY_SIZE,
-			.setkey		= twofish_setkey,
-			.encrypt	= ecb_encrypt,
-			.decrypt	= ecb_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "__cbc-twofish-avx2",
-	.cra_driver_name	= "__driver-cbc-twofish-avx2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= TF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct twofish_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE,
-			.max_keysize	= TF_MAX_KEY_SIZE,
-			.setkey		= twofish_setkey,
-			.encrypt	= cbc_encrypt,
-			.decrypt	= cbc_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "__ctr-twofish-avx2",
-	.cra_driver_name	= "__driver-ctr-twofish-avx2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct twofish_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE,
-			.max_keysize	= TF_MAX_KEY_SIZE,
-			.ivsize		= TF_BLOCK_SIZE,
-			.setkey		= twofish_setkey,
-			.encrypt	= ctr_crypt,
-			.decrypt	= ctr_crypt,
-		},
-	},
-}, {
-	.cra_name		= "__lrw-twofish-avx2",
-	.cra_driver_name	= "__driver-lrw-twofish-avx2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= TF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct twofish_lrw_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_exit		= lrw_twofish_exit_tfm,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE +
-					  TF_BLOCK_SIZE,
-			.max_keysize	= TF_MAX_KEY_SIZE +
-					  TF_BLOCK_SIZE,
-			.ivsize		= TF_BLOCK_SIZE,
-			.setkey		= lrw_twofish_setkey,
-			.encrypt	= lrw_encrypt,
-			.decrypt	= lrw_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "__xts-twofish-avx2",
-	.cra_driver_name	= "__driver-xts-twofish-avx2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= TF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct twofish_xts_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE * 2,
-			.max_keysize	= TF_MAX_KEY_SIZE * 2,
-			.ivsize		= TF_BLOCK_SIZE,
-			.setkey		= xts_twofish_setkey,
-			.encrypt	= xts_encrypt,
-			.decrypt	= xts_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "ecb(twofish)",
-	.cra_driver_name	= "ecb-twofish-avx2",
-	.cra_priority		= 500,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= TF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_helper_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_init		= ablk_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE,
-			.max_keysize	= TF_MAX_KEY_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "cbc(twofish)",
-	.cra_driver_name	= "cbc-twofish-avx2",
-	.cra_priority		= 500,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= TF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_helper_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_init		= ablk_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE,
-			.max_keysize	= TF_MAX_KEY_SIZE,
-			.ivsize		= TF_BLOCK_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= __ablk_encrypt,
-			.decrypt	= ablk_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "ctr(twofish)",
-	.cra_driver_name	= "ctr-twofish-avx2",
-	.cra_priority		= 500,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct async_helper_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_init		= ablk_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE,
-			.max_keysize	= TF_MAX_KEY_SIZE,
-			.ivsize		= TF_BLOCK_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_encrypt,
-			.geniv		= "chainiv",
-		},
-	},
-}, {
-	.cra_name		= "lrw(twofish)",
-	.cra_driver_name	= "lrw-twofish-avx2",
-	.cra_priority		= 500,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= TF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_helper_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_init		= ablk_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE +
-					  TF_BLOCK_SIZE,
-			.max_keysize	= TF_MAX_KEY_SIZE +
-					  TF_BLOCK_SIZE,
-			.ivsize		= TF_BLOCK_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "xts(twofish)",
-	.cra_driver_name	= "xts-twofish-avx2",
-	.cra_priority		= 500,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= TF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_helper_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_init		= ablk_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE * 2,
-			.max_keysize	= TF_MAX_KEY_SIZE * 2,
-			.ivsize		= TF_BLOCK_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_decrypt,
-		},
-	},
-} };
-
-static int __init init(void)
-{
-	u64 xcr0;
-
-	if (!cpu_has_avx2 || !cpu_has_osxsave) {
-		pr_info("AVX2 instructions are not detected.\n");
-		return -ENODEV;
-	}
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX2 detected but unusable.\n");
-		return -ENODEV;
-	}
-
-	return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs));
-}
-
-static void __exit fini(void)
-{
-	crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs));
-}
-
-module_init(init);
-module_exit(fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX2 optimized");
-MODULE_ALIAS("twofish");
-MODULE_ALIAS("twofish-asm");
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index 2047a562f6b3..a62ba541884e 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -50,26 +50,18 @@
 /* 8-way parallel cipher functions */
 asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
-EXPORT_SYMBOL_GPL(twofish_ecb_enc_8way);
-
 asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
-EXPORT_SYMBOL_GPL(twofish_ecb_dec_8way);
 
 asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
-EXPORT_SYMBOL_GPL(twofish_cbc_dec_8way);
-
 asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
 				 const u8 *src, le128 *iv);
-EXPORT_SYMBOL_GPL(twofish_ctr_8way);
 
 asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src, le128 *iv);
-EXPORT_SYMBOL_GPL(twofish_xts_enc_8way);
 asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src, le128 *iv);
-EXPORT_SYMBOL_GPL(twofish_xts_dec_8way);
 
 static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 					const u8 *src)
@@ -77,19 +69,17 @@ static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 	__twofish_enc_blk_3way(ctx, dst, src, false);
 }
 
-void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+static void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
 				  GLUE_FUNC_CAST(twofish_enc_blk));
 }
-EXPORT_SYMBOL_GPL(twofish_xts_enc);
 
-void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+static void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
 				  GLUE_FUNC_CAST(twofish_dec_blk));
 }
-EXPORT_SYMBOL_GPL(twofish_xts_dec);
 
 
 static const struct common_glue_ctx twofish_enc = {
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 52ff81cce008..bae3aba95b15 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -308,8 +308,6 @@ static int load_aout_binary(struct linux_binprm *bprm)
 		(current->mm->start_data = N_DATADDR(ex));
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
-	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
-	current->mm->cached_hole_size = 0;
 
 	retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
 	if (retval < 0) {
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index b31bf97775fc..2dfac58f3b11 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -111,7 +111,7 @@ static inline void acpi_disable_pci(void)
 }
 
 /* Low-level suspend routine. */
-extern int acpi_suspend_lowlevel(void);
+extern int (*acpi_suspend_lowlevel)(void);
 
 /* Physical address to resume after wakeup */
 #define acpi_wakeup_address ((unsigned long)(real_mode_header->wakeup_start))
diff --git a/arch/x86/include/asm/crypto/blowfish.h b/arch/x86/include/asm/crypto/blowfish.h
deleted file mode 100644
index f097b2face10..000000000000
--- a/arch/x86/include/asm/crypto/blowfish.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef ASM_X86_BLOWFISH_H
-#define ASM_X86_BLOWFISH_H
-
-#include <linux/crypto.h>
-#include <crypto/blowfish.h>
-
-#define BF_PARALLEL_BLOCKS 4
-
-/* regular block cipher functions */
-asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
-				   bool xor);
-asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
-
-/* 4-way parallel cipher functions */
-asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
-					const u8 *src, bool xor);
-asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
-				      const u8 *src);
-
-static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
-{
-	__blowfish_enc_blk(ctx, dst, src, false);
-}
-
-static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
-					const u8 *src)
-{
-	__blowfish_enc_blk(ctx, dst, src, true);
-}
-
-static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
-					 const u8 *src)
-{
-	__blowfish_enc_blk_4way(ctx, dst, src, false);
-}
-
-static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
-				      const u8 *src)
-{
-	__blowfish_enc_blk_4way(ctx, dst, src, true);
-}
-
-#endif
diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h
index e655c6029b45..878c51ceebb5 100644
--- a/arch/x86/include/asm/crypto/twofish.h
+++ b/arch/x86/include/asm/crypto/twofish.h
@@ -28,20 +28,6 @@ asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
 
-/* 8-way parallel cipher functions */
-asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
-				     const u8 *src);
-asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
-				     const u8 *src);
-asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
-				     const u8 *src);
-asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
-				 const u8 *src, le128 *iv);
-asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst,
-				     const u8 *src, le128 *iv);
-asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst,
-				     const u8 *src, le128 *iv);
-
 /* helpers from twofish_x86_64-3way module */
 extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src);
 extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src,
@@ -57,8 +43,4 @@ extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm);
 extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
 			      unsigned int keylen);
 
-/* helpers from twofish-avx module */
-extern void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv);
-extern void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv);
-
 #endif /* ASM_X86_TWOFISH_H */
diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h
index 75ce3f47d204..77a99ac06d00 100644
--- a/arch/x86/include/asm/emergency-restart.h
+++ b/arch/x86/include/asm/emergency-restart.h
@@ -1,18 +1,6 @@
 #ifndef _ASM_X86_EMERGENCY_RESTART_H
 #define _ASM_X86_EMERGENCY_RESTART_H
 
-enum reboot_type {
-	BOOT_TRIPLE = 't',
-	BOOT_KBD = 'k',
-	BOOT_BIOS = 'b',
-	BOOT_ACPI = 'a',
-	BOOT_EFI = 'e',
-	BOOT_CF9 = 'p',
-	BOOT_CF9_COND = 'q',
-};
-
-extern enum reboot_type reboot_type;
-
 extern void machine_emergency_restart(void);
 
 #endif /* _ASM_X86_EMERGENCY_RESTART_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index d8e8eefbe24c..34f69cb9350a 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -345,4 +345,11 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
 
 #define IO_SPACE_LIMIT 0xffff
 
+#ifdef CONFIG_MTRR
+extern int __must_check arch_phys_wc_add(unsigned long base,
+					 unsigned long size);
+extern void arch_phys_wc_del(int handle);
+#define arch_phys_wc_add arch_phys_wc_add
+#endif
+
 #endif /* _ASM_X86_IO_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f87f7fcefa0a..c0efd16bdfa1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -286,6 +286,7 @@ struct kvm_mmu {
 	u64 *pae_root;
 	u64 *lm_root;
 	u64 rsvd_bits_mask[2][4];
+	u64 bad_mt_xwr;
 
 	/*
 	 * Bitmap: bit set = last pte in walk
@@ -323,6 +324,7 @@ struct kvm_pmu {
 	u64 global_ovf_ctrl;
 	u64 counter_bitmask[2];
 	u64 global_ctrl_mask;
+	u64 reserved_bits;
 	u8 version;
 	struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
 	struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
@@ -511,6 +513,9 @@ struct kvm_vcpu_arch {
 	 * instruction.
 	 */
 	bool write_fault_to_shadow_pgtable;
+
+	/* set at EPT violation at this point */
+	unsigned long exit_qualification;
 };
 
 struct kvm_lpage_info {
@@ -802,8 +807,8 @@ extern u32  kvm_min_guest_tsc_khz;
 extern u32  kvm_max_guest_tsc_khz;
 
 enum emulation_result {
-	EMULATE_DONE,       /* no further processing */
-	EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
+	EMULATE_DONE,         /* no further processing */
+	EMULATE_USER_EXIT,    /* kvm_run ready for userspace exit */
 	EMULATE_FAIL,         /* can't emulate this instruction */
 };
 
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h
index d354fb781c57..a55c7efcc4ed 100644
--- a/arch/x86/include/asm/mc146818rtc.h
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -95,8 +95,8 @@ static inline unsigned char current_lock_cmos_reg(void)
 unsigned char rtc_cmos_read(unsigned char addr);
 void rtc_cmos_write(unsigned char val, unsigned char addr);
 
-extern int mach_set_rtc_mmss(unsigned long nowtime);
-extern unsigned long mach_get_cmos_time(void);
+extern int mach_set_rtc_mmss(const struct timespec *now);
+extern void mach_get_cmos_time(struct timespec *now);
 
 #define RTC_IRQ 8
 
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 6b52980c29c1..29e3093bbd21 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -214,6 +214,13 @@ void mce_log_therm_throt_event(__u64 status);
 /* Interrupt Handler for core thermal thresholds */
 extern int (*platform_thermal_notify)(__u64 msr_val);
 
+/* Interrupt Handler for package thermal thresholds */
+extern int (*platform_thermal_package_notify)(__u64 msr_val);
+
+/* Callback support of rate control, return true, if
+ * callback has rate control */
+extern bool (*platform_thermal_package_rate_control)(void);
+
 #ifdef CONFIG_X86_THERMAL_VECTOR
 extern void mcheck_intel_therm_init(void);
 #else
diff --git a/arch/x86/include/asm/mrst-vrtc.h b/arch/x86/include/asm/mrst-vrtc.h
index 73668abdbedf..1e69a75412a4 100644
--- a/arch/x86/include/asm/mrst-vrtc.h
+++ b/arch/x86/include/asm/mrst-vrtc.h
@@ -3,7 +3,7 @@
 
 extern unsigned char vrtc_cmos_read(unsigned char reg);
 extern void vrtc_cmos_write(unsigned char val, unsigned char reg);
-extern unsigned long vrtc_get_time(void);
-extern int vrtc_set_mmss(unsigned long nowtime);
+extern void vrtc_get_time(struct timespec *now);
+extern int vrtc_set_mmss(const struct timespec *now);
 
 #endif
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index e235582f9930..f768f6298419 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -26,7 +26,10 @@
 #include <uapi/asm/mtrr.h>
 
 
-/*  The following functions are for use by other drivers  */
+/*
+ * The following functions are for use by other drivers that cannot use
+ * arch_phys_wc_add and arch_phys_wc_del.
+ */
 # ifdef CONFIG_MTRR
 extern u8 mtrr_type_lookup(u64 addr, u64 end);
 extern void mtrr_save_fixed_ranges(void *);
@@ -45,6 +48,7 @@ extern void mtrr_aps_init(void);
 extern void mtrr_bp_restore(void);
 extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
 extern int amd_special_default_mtrr(void);
+extern int phys_wc_to_mtrr_index(int handle);
 #  else
 static inline u8 mtrr_type_lookup(u64 addr, u64 end)
 {
@@ -80,6 +84,10 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
 static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
 {
 }
+static inline int phys_wc_to_mtrr_index(int handle)
+{
+	return -1;
+}
 
 #define mtrr_ap_init() do {} while (0)
 #define mtrr_bp_init() do {} while (0)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5b0818bc8963..7dc305a46058 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -207,7 +207,7 @@ static inline pte_t pte_mkexec(pte_t pte)
 
 static inline pte_t pte_mkdirty(pte_t pte)
 {
-	return pte_set_flags(pte, _PAGE_DIRTY);
+	return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
 }
 
 static inline pte_t pte_mkyoung(pte_t pte)
@@ -271,7 +271,7 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd)
 
 static inline pmd_t pmd_mkdirty(pmd_t pmd)
 {
-	return pmd_set_flags(pmd, _PAGE_DIRTY);
+	return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
 }
 
 static inline pmd_t pmd_mkhuge(pmd_t pmd)
@@ -294,6 +294,26 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 	return pmd_clear_flags(pmd, _PAGE_PRESENT);
 }
 
+static inline int pte_soft_dirty(pte_t pte)
+{
+	return pte_flags(pte) & _PAGE_SOFT_DIRTY;
+}
+
+static inline int pmd_soft_dirty(pmd_t pmd)
+{
+	return pmd_flags(pmd) & _PAGE_SOFT_DIRTY;
+}
+
+static inline pte_t pte_mksoft_dirty(pte_t pte)
+{
+	return pte_set_flags(pte, _PAGE_SOFT_DIRTY);
+}
+
+static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
+{
+	return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
+}
+
 /*
  * Mask out unsupported bits in a present pgprot.  Non-present pgprots
  * can use those bits for other purposes, so leave them be.
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index e6423002c10b..c98ac63aae48 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -55,6 +55,18 @@
 #define _PAGE_HIDDEN	(_AT(pteval_t, 0))
 #endif
 
+/*
+ * The same hidden bit is used by kmemcheck, but since kmemcheck
+ * works on kernel pages while soft-dirty engine on user space,
+ * they do not conflict with each other.
+ */
+
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define _PAGE_SOFT_DIRTY	(_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
+#else
+#define _PAGE_SOFT_DIRTY	(_AT(pteval_t, 0))
+#endif
+
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 #define _PAGE_NX	(_AT(pteval_t, 1) << _PAGE_BIT_NX)
 #else
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 109a9dd5d454..be8269b00e2a 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -93,7 +93,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
 
 struct pvclock_vsyscall_time_info {
 	struct pvclock_vcpu_time_info pvti;
-	u32 migrate_count;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a1df6e84691f..27811190cbd7 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -89,7 +89,6 @@ struct thread_info {
 #define TIF_FORK		18	/* ret_from_fork */
 #define TIF_NOHZ		19	/* in adaptive nohz mode */
 #define TIF_MEMDIE		20	/* is terminating due to OOM killer */
-#define TIF_DEBUG		21	/* uses debug registers */
 #define TIF_IO_BITMAP		22	/* uses I/O bitmap */
 #define TIF_FORCED_TF		24	/* true if TF in eflags artificially */
 #define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
@@ -113,7 +112,6 @@ struct thread_info {
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
 #define _TIF_NOHZ		(1 << TIF_NOHZ)
-#define _TIF_DEBUG		(1 << TIF_DEBUG)
 #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
 #define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
@@ -154,7 +152,7 @@ struct thread_info {
 	(_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP)
 
 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
-#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
+#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
 
 #define PREEMPT_ACTIVE		0x10000000
 
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index f3e01a2cbaa1..966502d4682e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -387,6 +387,7 @@ enum vmcs_field {
 #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR		0
 #define VMX_EPT_EXTENT_CONTEXT			1
 #define VMX_EPT_EXTENT_GLOBAL			2
+#define VMX_EPT_EXTENT_SHIFT			24
 
 #define VMX_EPT_EXECUTE_ONLY_BIT		(1ull)
 #define VMX_EPT_PAGE_WALK_4_BIT			(1ull << 6)
@@ -394,6 +395,7 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT				(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT			(1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT			(1ull << 17)
+#define VMX_EPT_INVEPT_BIT			(1ull << 20)
 #define VMX_EPT_AD_BIT				    (1ull << 21)
 #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index d8d99222b36a..828a1565ba57 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -142,6 +142,8 @@ struct x86_cpuinit_ops {
 	void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node);
 };
 
+struct timespec;
+
 /**
  * struct x86_platform_ops - platform specific runtime functions
  * @calibrate_tsc:		calibrate TSC
@@ -156,8 +158,8 @@ struct x86_cpuinit_ops {
  */
 struct x86_platform_ops {
 	unsigned long (*calibrate_tsc)(void);
-	unsigned long (*get_wallclock)(void);
-	int (*set_wallclock)(unsigned long nowtime);
+	void (*get_wallclock)(struct timespec *ts);
+	int (*set_wallclock)(const struct timespec *ts);
 	void (*iommu_shutdown)(void);
 	bool (*is_untracked_pat_range)(u64 start, u64 end);
 	void (*nmi_init)(void);
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index d651082c7cf7..0e79420376eb 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,7 @@
 #define EXIT_REASON_EOI_INDUCED         45
 #define EXIT_REASON_EPT_VIOLATION       48
 #define EXIT_REASON_EPT_MISCONFIG       49
+#define EXIT_REASON_INVEPT              50
 #define EXIT_REASON_PREEMPTION_TIMER    52
 #define EXIT_REASON_WBINVD              54
 #define EXIT_REASON_XSETBV              55
@@ -106,12 +107,13 @@
 	{ EXIT_REASON_APIC_ACCESS,           "APIC_ACCESS" }, \
 	{ EXIT_REASON_EPT_VIOLATION,         "EPT_VIOLATION" }, \
 	{ EXIT_REASON_EPT_MISCONFIG,         "EPT_MISCONFIG" }, \
+	{ EXIT_REASON_INVEPT,                "INVEPT" }, \
+	{ EXIT_REASON_PREEMPTION_TIMER,      "PREEMPTION_TIMER" }, \
 	{ EXIT_REASON_WBINVD,                "WBINVD" }, \
 	{ EXIT_REASON_APIC_WRITE,            "APIC_WRITE" }, \
 	{ EXIT_REASON_EOI_INDUCED,           "EOI_INDUCED" }, \
 	{ EXIT_REASON_INVALID_STATE,         "INVALID_STATE" }, \
 	{ EXIT_REASON_INVD,                  "INVD" }, \
-	{ EXIT_REASON_INVPCID,               "INVPCID" }, \
-	{ EXIT_REASON_PREEMPTION_TIMER,      "PREEMPTION_TIMER" }
+	{ EXIT_REASON_INVPCID,               "INVPCID" }
 
 #endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 230c8ea878e5..d81a972dd506 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -44,6 +44,7 @@
 #include <asm/mpspec.h>
 #include <asm/smp.h>
 
+#include "sleep.h" /* To include x86_acpi_suspend_lowlevel */
 static int __initdata acpi_force = 0;
 u32 acpi_rsdt_forced;
 int acpi_disabled;
@@ -559,6 +560,12 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
 int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
 			   int trigger, int polarity) = acpi_register_gsi_pic;
 
+#ifdef CONFIG_ACPI_SLEEP
+int (*acpi_suspend_lowlevel)(void) = x86_acpi_suspend_lowlevel;
+#else
+int (*acpi_suspend_lowlevel)(void);
+#endif
+
 /*
  * success: return IRQ number (>=0)
  * failure: return < 0
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index b44577bc9744..2a34aaf3c8f1 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -26,12 +26,12 @@ static char temp_stack[4096];
 #endif
 
 /**
- * acpi_suspend_lowlevel - save kernel state
+ * x86_acpi_suspend_lowlevel - save kernel state
  *
  * Create an identity mapped page table and copy the wakeup routine to
  * low memory.
  */
-int acpi_suspend_lowlevel(void)
+int x86_acpi_suspend_lowlevel(void)
 {
 	struct wakeup_header *header =
 		(struct wakeup_header *) __va(real_mode_header->wakeup_header);
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index 67f59f8c6956..c9c2c982d5e4 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -15,3 +15,5 @@ extern unsigned long acpi_copy_wakeup_routine(unsigned long);
 extern void wakeup_long64(void);
 
 extern void do_suspend_lowlevel(void);
+
+extern int x86_acpi_suspend_lowlevel(void);
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 39cc7f7acab3..63092afb142e 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -25,6 +25,7 @@
 #include <linux/kdebug.h>
 #include <linux/delay.h>
 #include <linux/crash_dump.h>
+#include <linux/reboot.h>
 
 #include <asm/uv/uv_mmrs.h>
 #include <asm/uv/uv_hub.h>
@@ -36,7 +37,6 @@
 #include <asm/ipi.h>
 #include <asm/smp.h>
 #include <asm/x86_init.h>
-#include <asm/emergency-restart.h>
 #include <asm/nmi.h>
 
 /* BMC sets a bit this MMR non-zero before sending an NMI */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 5013a48d1aff..c587a8757227 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -90,7 +90,7 @@ static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
 static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
-	int mbytes = num_physpages >> (20-PAGE_SHIFT);
+	int mbytes = get_num_physpages() >> (20-PAGE_SHIFT);
 
 	if (c->x86_model < 6) {
 		/* Based on AMD doc 20734R - June 2000 */
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 98f2083832eb..41e8e00a6637 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -55,12 +55,24 @@ struct thermal_state {
 	struct _thermal_state package_power_limit;
 	struct _thermal_state core_thresh0;
 	struct _thermal_state core_thresh1;
+	struct _thermal_state pkg_thresh0;
+	struct _thermal_state pkg_thresh1;
 };
 
 /* Callback to handle core threshold interrupts */
 int (*platform_thermal_notify)(__u64 msr_val);
 EXPORT_SYMBOL(platform_thermal_notify);
 
+/* Callback to handle core package threshold_interrupts */
+int (*platform_thermal_package_notify)(__u64 msr_val);
+EXPORT_SYMBOL_GPL(platform_thermal_package_notify);
+
+/* Callback support of rate control, return true, if
+ * callback has rate control */
+bool (*platform_thermal_package_rate_control)(void);
+EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control);
+
+
 static DEFINE_PER_CPU(struct thermal_state, thermal_state);
 
 static atomic_t therm_throt_en	= ATOMIC_INIT(0);
@@ -195,19 +207,25 @@ static int therm_throt_process(bool new_event, int event, int level)
 	return 0;
 }
 
-static int thresh_event_valid(int event)
+static int thresh_event_valid(int level, int event)
 {
 	struct _thermal_state *state;
 	unsigned int this_cpu = smp_processor_id();
 	struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
 	u64 now = get_jiffies_64();
 
-	state = (event == 0) ? &pstate->core_thresh0 : &pstate->core_thresh1;
+	if (level == PACKAGE_LEVEL)
+		state = (event == 0) ? &pstate->pkg_thresh0 :
+						&pstate->pkg_thresh1;
+	else
+		state = (event == 0) ? &pstate->core_thresh0 :
+						&pstate->core_thresh1;
 
 	if (time_before64(now, state->next_check))
 		return 0;
 
 	state->next_check = now + CHECK_INTERVAL;
+
 	return 1;
 }
 
@@ -322,6 +340,39 @@ device_initcall(thermal_throttle_init_device);
 
 #endif /* CONFIG_SYSFS */
 
+static void notify_package_thresholds(__u64 msr_val)
+{
+	bool notify_thres_0 = false;
+	bool notify_thres_1 = false;
+
+	if (!platform_thermal_package_notify)
+		return;
+
+	/* lower threshold check */
+	if (msr_val & THERM_LOG_THRESHOLD0)
+		notify_thres_0 = true;
+	/* higher threshold check */
+	if (msr_val & THERM_LOG_THRESHOLD1)
+		notify_thres_1 = true;
+
+	if (!notify_thres_0 && !notify_thres_1)
+		return;
+
+	if (platform_thermal_package_rate_control &&
+		platform_thermal_package_rate_control()) {
+		/* Rate control is implemented in callback */
+		platform_thermal_package_notify(msr_val);
+		return;
+	}
+
+	/* lower threshold reached */
+	if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0))
+		platform_thermal_package_notify(msr_val);
+	/* higher threshold reached */
+	if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1))
+		platform_thermal_package_notify(msr_val);
+}
+
 static void notify_thresholds(__u64 msr_val)
 {
 	/* check whether the interrupt handler is defined;
@@ -331,10 +382,12 @@ static void notify_thresholds(__u64 msr_val)
 		return;
 
 	/* lower threshold reached */
-	if ((msr_val & THERM_LOG_THRESHOLD0) &&	thresh_event_valid(0))
+	if ((msr_val & THERM_LOG_THRESHOLD0) &&
+			thresh_event_valid(CORE_LEVEL, 0))
 		platform_thermal_notify(msr_val);
 	/* higher threshold reached */
-	if ((msr_val & THERM_LOG_THRESHOLD1) && thresh_event_valid(1))
+	if ((msr_val & THERM_LOG_THRESHOLD1) &&
+			thresh_event_valid(CORE_LEVEL, 1))
 		platform_thermal_notify(msr_val);
 }
 
@@ -360,6 +413,8 @@ static void intel_thermal_interrupt(void)
 
 	if (this_cpu_has(X86_FEATURE_PTS)) {
 		rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
+		/* check violations of package thermal thresholds */
+		notify_package_thresholds(msr_val);
 		therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
 					THERMAL_THROTTLING_EVENT,
 					PACKAGE_LEVEL);
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index ca22b73aaa25..f961de9964c7 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -51,9 +51,13 @@
 #include <asm/e820.h>
 #include <asm/mtrr.h>
 #include <asm/msr.h>
+#include <asm/pat.h>
 
 #include "mtrr.h"
 
+/* arch_phys_wc_add returns an MTRR register index plus this offset. */
+#define MTRR_TO_PHYS_WC_OFFSET 1000
+
 u32 num_var_ranges;
 
 unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
@@ -525,6 +529,73 @@ int mtrr_del(int reg, unsigned long base, unsigned long size)
 }
 EXPORT_SYMBOL(mtrr_del);
 
+/**
+ * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable
+ * @base: Physical base address
+ * @size: Size of region
+ *
+ * If PAT is available, this does nothing.  If PAT is unavailable, it
+ * attempts to add a WC MTRR covering size bytes starting at base and
+ * logs an error if this fails.
+ *
+ * Drivers must store the return value to pass to mtrr_del_wc_if_needed,
+ * but drivers should not try to interpret that return value.
+ */
+int arch_phys_wc_add(unsigned long base, unsigned long size)
+{
+	int ret;
+
+	if (pat_enabled)
+		return 0;  /* Success!  (We don't need to do anything.) */
+
+	ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true);
+	if (ret < 0) {
+		pr_warn("Failed to add WC MTRR for [%p-%p]; performance may suffer.",
+			(void *)base, (void *)(base + size - 1));
+		return ret;
+	}
+	return ret + MTRR_TO_PHYS_WC_OFFSET;
+}
+EXPORT_SYMBOL(arch_phys_wc_add);
+
+/*
+ * arch_phys_wc_del - undoes arch_phys_wc_add
+ * @handle: Return value from arch_phys_wc_add
+ *
+ * This cleans up after mtrr_add_wc_if_needed.
+ *
+ * The API guarantees that mtrr_del_wc_if_needed(error code) and
+ * mtrr_del_wc_if_needed(0) do nothing.
+ */
+void arch_phys_wc_del(int handle)
+{
+	if (handle >= 1) {
+		WARN_ON(handle < MTRR_TO_PHYS_WC_OFFSET);
+		mtrr_del(handle - MTRR_TO_PHYS_WC_OFFSET, 0, 0);
+	}
+}
+EXPORT_SYMBOL(arch_phys_wc_del);
+
+/*
+ * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value
+ * @handle: Return value from arch_phys_wc_add
+ *
+ * This will turn the return value from arch_phys_wc_add into an mtrr
+ * index suitable for debugging.
+ *
+ * Note: There is no legitimate use for this function, except possibly
+ * in printk line.  Alas there is an illegitimate use in some ancient
+ * drm ioctls.
+ */
+int phys_wc_to_mtrr_index(int handle)
+{
+	if (handle < MTRR_TO_PHYS_WC_OFFSET)
+		return -1;
+	else
+		return handle - MTRR_TO_PHYS_WC_OFFSET;
+}
+EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index);
+
 /*
  * HACK ALERT!
  * These should be called implicitly, but we can't yet until all the initcall
diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c
index 0db655ef3918..639d1289b1ba 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_iommu.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_iommu.c
@@ -491,10 +491,8 @@ static struct perf_amd_iommu __perf_iommu = {
 static __init int amd_iommu_pc_init(void)
 {
 	/* Make sure the IOMMU PC resource is available */
-	if (!amd_iommu_pc_supported()) {
-		pr_err("perf: amd_iommu PMU not installed. No support!\n");
+	if (!amd_iommu_pc_supported())
 		return -ENODEV;
-	}
 
 	_init_perf_amd_iommu(&__perf_iommu, "amd_iommu");
 
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
index 7b3fe56b1c21..31f0f335ed22 100644
--- a/arch/x86/kernel/cpu/powerflags.c
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -11,10 +11,10 @@ const char *const x86_power_flags[32] = {
 	"fid",  /* frequency id control */
 	"vid",  /* voltage id control */
 	"ttp",  /* thermal trip */
-	"tm",
-	"stc",
-	"100mhzsteps",
-	"hwpstate",
+	"tm",	/* hardware thermal control */
+	"stc",	/* software thermal control */
+	"100mhzsteps", /* 100 MHz multiplier control */
+	"hwpstate", /* hardware P-state control */
 	"",	/* tsc invariant mapped to constant_tsc */
 	"cpb",  /* core performance boost */
 	"eff_freq_ro", /* Readonly aperf/mperf */
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index b1581527a236..4934890e4db2 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -364,9 +364,7 @@ static void dt_add_ioapic_domain(unsigned int ioapic_num,
 		 * and assigned so we can keep the 1:1 mapping which the ioapic
 		 * is having.
 		 */
-		ret = irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY);
-		if (ret)
-			pr_err("Error mapping legacy IRQs: %d\n", ret);
+		irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY);
 
 		if (num > NR_IRQS_LEGACY) {
 			ret = irq_create_strict_mappings(id, NR_IRQS_LEGACY,
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 02f07634d265..f66ff162dce8 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -393,6 +393,9 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
 		unregister_hw_breakpoint(t->ptrace_bps[i]);
 		t->ptrace_bps[i] = NULL;
 	}
+
+	t->debugreg6 = 0;
+	t->ptrace_dr7 = 0;
 }
 
 void hw_breakpoint_restore(void)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 3dd37ebd591b..1f354f4b602b 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -48,10 +48,9 @@ static struct pvclock_wall_clock wall_clock;
  * have elapsed since the hypervisor wrote the data. So we try to account for
  * that with system time
  */
-static unsigned long kvm_get_wallclock(void)
+static void kvm_get_wallclock(struct timespec *now)
 {
 	struct pvclock_vcpu_time_info *vcpu_time;
-	struct timespec ts;
 	int low, high;
 	int cpu;
 
@@ -64,14 +63,12 @@ static unsigned long kvm_get_wallclock(void)
 	cpu = smp_processor_id();
 
 	vcpu_time = &hv_clock[cpu].pvti;
-	pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
+	pvclock_read_wallclock(&wall_clock, vcpu_time, now);
 
 	preempt_enable();
-
-	return ts.tv_sec;
 }
 
-static int kvm_set_wallclock(unsigned long now)
+static int kvm_set_wallclock(const struct timespec *now)
 {
 	return -1;
 }
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 0920212e6159..ba77ebc2c353 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -111,7 +111,7 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2
 	 */
 	list_for_each_entry_rcu(a, &desc->head, list) {
 		u64 before, delta, whole_msecs;
-		int decimal_msecs, thishandled;
+		int remainder_ns, decimal_msecs, thishandled;
 
 		before = local_clock();
 		thishandled = a->handler(type, regs);
@@ -123,8 +123,9 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2
 			continue;
 
 		nmi_longest_ns = delta;
-		whole_msecs = do_div(delta, (1000 * 1000));
-		decimal_msecs = do_div(delta, 1000) % 1000;
+		whole_msecs = delta;
+		remainder_ns = do_div(whole_msecs, (1000 * 1000));
+		decimal_msecs = remainder_ns / 1000;
 		printk_ratelimited(KERN_INFO
 			"INFO: NMI handler (%ps) took too long to run: "
 			"%lld.%03d msecs\n", a->handler, whole_msecs,
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 29a8120e6fe8..7461f50d5bb1 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -601,30 +601,48 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[])
 	return dr7;
 }
 
-static int
-ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
-			 struct task_struct *tsk, int disabled)
+static int ptrace_fill_bp_fields(struct perf_event_attr *attr,
+					int len, int type, bool disabled)
+{
+	int err, bp_len, bp_type;
+
+	err = arch_bp_generic_fields(len, type, &bp_len, &bp_type);
+	if (!err) {
+		attr->bp_len = bp_len;
+		attr->bp_type = bp_type;
+		attr->disabled = disabled;
+	}
+
+	return err;
+}
+
+static struct perf_event *
+ptrace_register_breakpoint(struct task_struct *tsk, int len, int type,
+				unsigned long addr, bool disabled)
 {
-	int err;
-	int gen_len, gen_type;
 	struct perf_event_attr attr;
+	int err;
 
-	/*
-	 * We should have at least an inactive breakpoint at this
-	 * slot. It means the user is writing dr7 without having
-	 * written the address register first
-	 */
-	if (!bp)
-		return -EINVAL;
+	ptrace_breakpoint_init(&attr);
+	attr.bp_addr = addr;
 
-	err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
+	err = ptrace_fill_bp_fields(&attr, len, type, disabled);
 	if (err)
-		return err;
+		return ERR_PTR(err);
+
+	return register_user_hw_breakpoint(&attr, ptrace_triggered,
+						 NULL, tsk);
+}
 
-	attr = bp->attr;
-	attr.bp_len = gen_len;
-	attr.bp_type = gen_type;
-	attr.disabled = disabled;
+static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
+					int disabled)
+{
+	struct perf_event_attr attr = bp->attr;
+	int err;
+
+	err = ptrace_fill_bp_fields(&attr, len, type, disabled);
+	if (err)
+		return err;
 
 	return modify_user_hw_breakpoint(bp, &attr);
 }
@@ -634,67 +652,50 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
  */
 static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
 {
-	struct thread_struct *thread = &(tsk->thread);
+	struct thread_struct *thread = &tsk->thread;
 	unsigned long old_dr7;
-	int i, orig_ret = 0, rc = 0;
-	int enabled, second_pass = 0;
-	unsigned len, type;
-	struct perf_event *bp;
-
-	if (ptrace_get_breakpoints(tsk) < 0)
-		return -ESRCH;
+	bool second_pass = false;
+	int i, rc, ret = 0;
 
 	data &= ~DR_CONTROL_RESERVED;
 	old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
+
 restore:
-	/*
-	 * Loop through all the hardware breakpoints, making the
-	 * appropriate changes to each.
-	 */
+	rc = 0;
 	for (i = 0; i < HBP_NUM; i++) {
-		enabled = decode_dr7(data, i, &len, &type);
-		bp = thread->ptrace_bps[i];
-
-		if (!enabled) {
-			if (bp) {
-				/*
-				 * Don't unregister the breakpoints right-away,
-				 * unless all register_user_hw_breakpoint()
-				 * requests have succeeded. This prevents
-				 * any window of opportunity for debug
-				 * register grabbing by other users.
-				 */
-				if (!second_pass)
-					continue;
-
-				rc = ptrace_modify_breakpoint(bp, len, type,
-							      tsk, 1);
-				if (rc)
-					break;
+		unsigned len, type;
+		bool disabled = !decode_dr7(data, i, &len, &type);
+		struct perf_event *bp = thread->ptrace_bps[i];
+
+		if (!bp) {
+			if (disabled)
+				continue;
+
+			bp = ptrace_register_breakpoint(tsk,
+					len, type, 0, disabled);
+			if (IS_ERR(bp)) {
+				rc = PTR_ERR(bp);
+				break;
 			}
+
+			thread->ptrace_bps[i] = bp;
 			continue;
 		}
 
-		rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0);
+		rc = ptrace_modify_breakpoint(bp, len, type, disabled);
 		if (rc)
 			break;
 	}
-	/*
-	 * Make a second pass to free the remaining unused breakpoints
-	 * or to restore the original breakpoints if an error occurred.
-	 */
-	if (!second_pass) {
-		second_pass = 1;
-		if (rc < 0) {
-			orig_ret = rc;
-			data = old_dr7;
-		}
+
+	/* Restore if the first pass failed, second_pass shouldn't fail. */
+	if (rc && !WARN_ON(second_pass)) {
+		ret = rc;
+		data = old_dr7;
+		second_pass = true;
 		goto restore;
 	}
 
-	ptrace_put_breakpoints(tsk);
-
-	return ((orig_ret < 0) ? orig_ret : rc);
+	return ret;
 }
 
 /*
@@ -702,25 +703,17 @@ restore:
  */
 static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
 {
-	struct thread_struct *thread = &(tsk->thread);
+	struct thread_struct *thread = &tsk->thread;
 	unsigned long val = 0;
 
 	if (n < HBP_NUM) {
-		struct perf_event *bp;
+		struct perf_event *bp = thread->ptrace_bps[n];
 
-		if (ptrace_get_breakpoints(tsk) < 0)
-			return -ESRCH;
-
-		bp = thread->ptrace_bps[n];
-		if (!bp)
-			val = 0;
-		else
+		if (bp)
 			val = bp->hw.info.address;
-
-		ptrace_put_breakpoints(tsk);
 	} else if (n == 6) {
 		val = thread->debugreg6;
-	 } else if (n == 7) {
+	} else if (n == 7) {
 		val = thread->ptrace_dr7;
 	}
 	return val;
@@ -729,29 +722,14 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
 static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 				      unsigned long addr)
 {
-	struct perf_event *bp;
 	struct thread_struct *t = &tsk->thread;
-	struct perf_event_attr attr;
+	struct perf_event *bp = t->ptrace_bps[nr];
 	int err = 0;
 
-	if (ptrace_get_breakpoints(tsk) < 0)
-		return -ESRCH;
-
-	if (!t->ptrace_bps[nr]) {
-		ptrace_breakpoint_init(&attr);
-		/*
-		 * Put stub len and type to register (reserve) an inactive but
-		 * correct bp
-		 */
-		attr.bp_addr = addr;
-		attr.bp_len = HW_BREAKPOINT_LEN_1;
-		attr.bp_type = HW_BREAKPOINT_W;
-		attr.disabled = 1;
-
-		bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
-						 NULL, tsk);
-
+	if (!bp) {
 		/*
+		 * Put stub len and type to create an inactive but correct bp.
+		 *
 		 * CHECKME: the previous code returned -EIO if the addr wasn't
 		 * a valid task virtual addr. The new one will return -EINVAL in
 		 *  this case.
@@ -760,22 +738,20 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 		 * writing for the user. And anyway this is the previous
 		 * behaviour.
 		 */
-		if (IS_ERR(bp)) {
+		bp = ptrace_register_breakpoint(tsk,
+				X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE,
+				addr, true);
+		if (IS_ERR(bp))
 			err = PTR_ERR(bp);
-			goto put;
-		}
-
-		t->ptrace_bps[nr] = bp;
+		else
+			t->ptrace_bps[nr] = bp;
 	} else {
-		bp = t->ptrace_bps[nr];
+		struct perf_event_attr attr = bp->attr;
 
-		attr = bp->attr;
 		attr.bp_addr = addr;
 		err = modify_user_hw_breakpoint(bp, &attr);
 	}
 
-put:
-	ptrace_put_breakpoints(tsk);
 	return err;
 }
 
@@ -785,30 +761,20 @@ put:
 static int ptrace_set_debugreg(struct task_struct *tsk, int n,
 			       unsigned long val)
 {
-	struct thread_struct *thread = &(tsk->thread);
-	int rc = 0;
-
+	struct thread_struct *thread = &tsk->thread;
 	/* There are no DR4 or DR5 registers */
-	if (n == 4 || n == 5)
-		return -EIO;
+	int rc = -EIO;
 
-	if (n == 6) {
-		thread->debugreg6 = val;
-		goto ret_path;
-	}
 	if (n < HBP_NUM) {
 		rc = ptrace_set_breakpoint_addr(tsk, n, val);
-		if (rc)
-			return rc;
-	}
-	/* All that's left is DR7 */
-	if (n == 7) {
+	} else if (n == 6) {
+		thread->debugreg6 = val;
+		rc = 0;
+	} else if (n == 7) {
 		rc = ptrace_write_dr7(tsk, val);
 		if (!rc)
 			thread->ptrace_dr7 = val;
 	}
-
-ret_path:
 	return rc;
 }
 
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 2cb9470ea85b..a16bae3f83b3 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -128,46 +128,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
 	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
 }
 
-static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
-
-static struct pvclock_vsyscall_time_info *
-pvclock_get_vsyscall_user_time_info(int cpu)
-{
-	if (!pvclock_vdso_info) {
-		BUG();
-		return NULL;
-	}
-
-	return &pvclock_vdso_info[cpu];
-}
-
-struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
-{
-	return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
-}
-
 #ifdef CONFIG_X86_64
-static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
-			        void *v)
-{
-	struct task_migration_notifier *mn = v;
-	struct pvclock_vsyscall_time_info *pvti;
-
-	pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
-
-	/* this is NULL when pvclock vsyscall is not initialized */
-	if (unlikely(pvti == NULL))
-		return NOTIFY_DONE;
-
-	pvti->migrate_count++;
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block pvclock_migrate = {
-	.notifier_call = pvclock_task_migrate,
-};
-
 /*
  * Initialize the generic pvclock vsyscall state.  This will allocate
  * a/some page(s) for the per-vcpu pvclock information, set up a
@@ -181,17 +142,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
 
 	WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
 
-	pvclock_vdso_info = i;
-
 	for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
 		__set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
 			     __pa(i) + (idx*PAGE_SIZE),
 			     PAGE_KERNEL_VVAR);
 	}
 
-
-	register_task_migration_notifier(&pvclock_migrate);
-
 	return 0;
 }
 #endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 76fa1e9a2b39..563ed91e6faa 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -36,22 +36,6 @@ void (*pm_power_off)(void);
 EXPORT_SYMBOL(pm_power_off);
 
 static const struct desc_ptr no_idt = {};
-static int reboot_mode;
-enum reboot_type reboot_type = BOOT_ACPI;
-int reboot_force;
-
-/*
- * This variable is used privately to keep track of whether or not
- * reboot_type is still set to its default value (i.e., reboot= hasn't
- * been set on the command line).  This is needed so that we can
- * suppress DMI scanning for reboot quirks.  Without it, it's
- * impossible to override a faulty reboot quirk without recompiling.
- */
-static int reboot_default = 1;
-
-#ifdef CONFIG_SMP
-static int reboot_cpu = -1;
-#endif
 
 /*
  * This is set if we need to go through the 'emergency' path.
@@ -64,79 +48,6 @@ static int reboot_emergency;
 bool port_cf9_safe = false;
 
 /*
- * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
- * warm   Don't set the cold reboot flag
- * cold   Set the cold reboot flag
- * bios   Reboot by jumping through the BIOS
- * smp    Reboot by executing reset on BSP or other CPU
- * triple Force a triple fault (init)
- * kbd    Use the keyboard controller. cold reset (default)
- * acpi   Use the RESET_REG in the FADT
- * efi    Use efi reset_system runtime service
- * pci    Use the so-called "PCI reset register", CF9
- * force  Avoid anything that could hang.
- */
-static int __init reboot_setup(char *str)
-{
-	for (;;) {
-		/*
-		 * Having anything passed on the command line via
-		 * reboot= will cause us to disable DMI checking
-		 * below.
-		 */
-		reboot_default = 0;
-
-		switch (*str) {
-		case 'w':
-			reboot_mode = 0x1234;
-			break;
-
-		case 'c':
-			reboot_mode = 0;
-			break;
-
-#ifdef CONFIG_SMP
-		case 's':
-			if (isdigit(*(str+1))) {
-				reboot_cpu = (int) (*(str+1) - '0');
-				if (isdigit(*(str+2)))
-					reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
-			}
-			/*
-			 * We will leave sorting out the final value
-			 * when we are ready to reboot, since we might not
-			 * have detected BSP APIC ID or smp_num_cpu
-			 */
-			break;
-#endif /* CONFIG_SMP */
-
-		case 'b':
-		case 'a':
-		case 'k':
-		case 't':
-		case 'e':
-		case 'p':
-			reboot_type = *str;
-			break;
-
-		case 'f':
-			reboot_force = 1;
-			break;
-		}
-
-		str = strchr(str, ',');
-		if (str)
-			str++;
-		else
-			break;
-	}
-	return 1;
-}
-
-__setup("reboot=", reboot_setup);
-
-
-/*
  * Reboot options and system auto-detection code provided by
  * Dell Inc. so their systems "just work". :-)
  */
@@ -536,6 +447,7 @@ static void native_machine_emergency_restart(void)
 	int i;
 	int attempt = 0;
 	int orig_reboot_type = reboot_type;
+	unsigned short mode;
 
 	if (reboot_emergency)
 		emergency_vmx_disable_all();
@@ -543,7 +455,8 @@ static void native_machine_emergency_restart(void)
 	tboot_shutdown(TB_SHUTDOWN_REBOOT);
 
 	/* Tell the BIOS if we want cold or warm reboot */
-	*((unsigned short *)__va(0x472)) = reboot_mode;
+	mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0;
+	*((unsigned short *)__va(0x472)) = mode;
 
 	for (;;) {
 		/* Could also try the reset bit in the Hammer NB */
@@ -585,7 +498,7 @@ static void native_machine_emergency_restart(void)
 
 		case BOOT_EFI:
 			if (efi_enabled(EFI_RUNTIME_SERVICES))
-				efi.reset_system(reboot_mode ?
+				efi.reset_system(reboot_mode == REBOOT_WARM ?
 						 EFI_RESET_WARM :
 						 EFI_RESET_COLD,
 						 EFI_SUCCESS, 0, NULL);
@@ -614,26 +527,10 @@ void native_machine_shutdown(void)
 {
 	/* Stop the cpus and apics */
 #ifdef CONFIG_SMP
-
-	/* The boot cpu is always logical cpu 0 */
-	int reboot_cpu_id = 0;
-
-	/* See if there has been given a command line override */
-	if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) &&
-		cpu_online(reboot_cpu))
-		reboot_cpu_id = reboot_cpu;
-
-	/* Make certain the cpu I'm about to reboot on is online */
-	if (!cpu_online(reboot_cpu_id))
-		reboot_cpu_id = smp_processor_id();
-
-	/* Make certain I only run on the appropriate processor */
-	set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id));
-
 	/*
-	 * O.K Now that I'm on the appropriate processor, stop all of the
-	 * others. Also disable the local irq to not receive the per-cpu
-	 * timer interrupt which may trigger scheduler's load balance.
+	 * Stop all of the others. Also disable the local irq to
+	 * not receive the per-cpu timer interrupt which may trigger
+	 * scheduler's load balance.
 	 */
 	local_irq_disable();
 	stop_other_cpus();
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 198eb201ed3b..0aa29394ed6f 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -38,8 +38,9 @@ EXPORT_SYMBOL(rtc_lock);
  * jump to the next second precisely 500 ms later. Check the Motorola
  * MC146818A or Dallas DS12887 data sheet for details.
  */
-int mach_set_rtc_mmss(unsigned long nowtime)
+int mach_set_rtc_mmss(const struct timespec *now)
 {
+	unsigned long nowtime = now->tv_sec;
 	struct rtc_time tm;
 	int retval = 0;
 
@@ -58,7 +59,7 @@ int mach_set_rtc_mmss(unsigned long nowtime)
 	return retval;
 }
 
-unsigned long mach_get_cmos_time(void)
+void mach_get_cmos_time(struct timespec *now)
 {
 	unsigned int status, year, mon, day, hour, min, sec, century = 0;
 	unsigned long flags;
@@ -107,7 +108,8 @@ unsigned long mach_get_cmos_time(void)
 	} else
 		year += CMOS_YEARS_OFFS;
 
-	return mktime(year, mon, day, hour, min, sec);
+	now->tv_sec = mktime(year, mon, day, hour, min, sec);
+	now->tv_nsec = 0;
 }
 
 /* Routines for accessing the CMOS RAM/RTC. */
@@ -135,18 +137,13 @@ EXPORT_SYMBOL(rtc_cmos_write);
 
 int update_persistent_clock(struct timespec now)
 {
-	return x86_platform.set_wallclock(now.tv_sec);
+	return x86_platform.set_wallclock(&now);
 }
 
 /* not static: needed by APM */
 void read_persistent_clock(struct timespec *ts)
 {
-	unsigned long retval;
-
-	retval = x86_platform.get_wallclock();
-
-	ts->tv_sec = retval;
-	ts->tv_nsec = 0;
+	x86_platform.get_wallclock(ts);
 }
 
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 56f7fcfe7fa2..e68709da8251 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1040,8 +1040,6 @@ void __init setup_arch(char **cmdline_p)
 	/* max_low_pfn get updated here */
 	find_low_pfn_range();
 #else
-	num_physpages = max_pfn;
-
 	check_x2apic();
 
 	/* How many end-of-memory variables you have, grandma! */
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index f4fe0b8879e0..cdaa347dfcad 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -265,23 +265,30 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
 	 */
 }
 
-void smp_trace_reschedule_interrupt(struct pt_regs *regs)
+static inline void smp_entering_irq(void)
 {
 	ack_APIC_irq();
+	irq_enter();
+}
+
+void smp_trace_reschedule_interrupt(struct pt_regs *regs)
+{
+	/*
+	 * Need to call irq_enter() before calling the trace point.
+	 * __smp_reschedule_interrupt() calls irq_enter/exit() too (in
+	 * scheduler_ipi(). This is OK, since those functions are allowed
+	 * to nest.
+	 */
+	smp_entering_irq();
 	trace_reschedule_entry(RESCHEDULE_VECTOR);
 	__smp_reschedule_interrupt();
 	trace_reschedule_exit(RESCHEDULE_VECTOR);
+	exiting_irq();
 	/*
 	 * KVM uses this interrupt to force a cpu out of guest mode
 	 */
 }
 
-static inline void call_function_entering_irq(void)
-{
-	ack_APIC_irq();
-	irq_enter();
-}
-
 static inline void __smp_call_function_interrupt(void)
 {
 	generic_smp_call_function_interrupt();
@@ -290,14 +297,14 @@ static inline void __smp_call_function_interrupt(void)
 
 void smp_call_function_interrupt(struct pt_regs *regs)
 {
-	call_function_entering_irq();
+	smp_entering_irq();
 	__smp_call_function_interrupt();
 	exiting_irq();
 }
 
 void smp_trace_call_function_interrupt(struct pt_regs *regs)
 {
-	call_function_entering_irq();
+	smp_entering_irq();
 	trace_call_function_entry(CALL_FUNCTION_VECTOR);
 	__smp_call_function_interrupt();
 	trace_call_function_exit(CALL_FUNCTION_VECTOR);
@@ -312,14 +319,14 @@ static inline void __smp_call_function_single_interrupt(void)
 
 void smp_call_function_single_interrupt(struct pt_regs *regs)
 {
-	call_function_entering_irq();
+	smp_entering_irq();
 	__smp_call_function_single_interrupt();
 	exiting_irq();
 }
 
 void smp_trace_call_function_single_interrupt(struct pt_regs *regs)
 {
-	call_function_entering_irq();
+	smp_entering_irq();
 	trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
 	__smp_call_function_single_interrupt();
 	trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index afc11245827c..c98f05442325 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -79,16 +79,6 @@ static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
 	*((u32 *) (apic->regs + reg_off)) = val;
 }
 
-static inline int apic_test_and_set_vector(int vec, void *bitmap)
-{
-	return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline int apic_test_and_clear_vector(int vec, void *bitmap)
-{
-	return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
 static inline int apic_test_vector(int vec, void *bitmap)
 {
 	return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -331,10 +321,10 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
 }
 EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
 
-static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
+static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
 {
 	apic->irr_pending = true;
-	return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
+	apic_set_vector(vec, apic->regs + APIC_IRR);
 }
 
 static inline int apic_search_irr(struct kvm_lapic *apic)
@@ -681,28 +671,21 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		if (unlikely(!apic_enabled(apic)))
 			break;
 
+		result = 1;
+
 		if (dest_map)
 			__set_bit(vcpu->vcpu_id, dest_map);
 
-		if (kvm_x86_ops->deliver_posted_interrupt) {
-			result = 1;
+		if (kvm_x86_ops->deliver_posted_interrupt)
 			kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
-		} else {
-			result = !apic_test_and_set_irr(vector, apic);
-
-			if (!result) {
-				if (trig_mode)
-					apic_debug("level trig mode repeatedly "
-						"for vector %d", vector);
-				goto out;
-			}
+		else {
+			apic_set_irr(vector, apic);
 
 			kvm_make_request(KVM_REQ_EVENT, vcpu);
 			kvm_vcpu_kick(vcpu);
 		}
-out:
 		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
-				trig_mode, vector, !result);
+					  trig_mode, vector, false);
 		break;
 
 	case APIC_DM_REMRD:
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0d094da49541..9651c9937588 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -132,8 +132,8 @@ module_param(dbg, bool, 0644);
 	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
 					    * PT32_LEVEL_BITS))) - 1))
 
-#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
-			| PT64_NX_MASK)
+#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
+			| shadow_x_mask | shadow_nx_mask)
 
 #define ACC_EXEC_MASK    1
 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
@@ -331,11 +331,6 @@ static int is_large_pte(u64 pte)
 	return pte & PT_PAGE_SIZE_MASK;
 }
 
-static int is_dirty_gpte(unsigned long pte)
-{
-	return pte & PT_DIRTY_MASK;
-}
-
 static int is_rmap_spte(u64 pte)
 {
 	return is_shadow_present_pte(pte);
@@ -2052,12 +2047,18 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
 	return __shadow_walk_next(iterator, *iterator->sptep);
 }
 
-static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
+static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed)
 {
 	u64 spte;
 
+	BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK ||
+			VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
+
 	spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
-	       shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+	       shadow_user_mask | shadow_x_mask;
+
+	if (accessed)
+		spte |= shadow_accessed_mask;
 
 	mmu_spte_set(sptep, spte);
 }
@@ -2574,14 +2575,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 	mmu_free_roots(vcpu);
 }
 
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
-	int bit7;
-
-	bit7 = (gpte >> 7) & 1;
-	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
-}
-
 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 				     bool no_dirty_log)
 {
@@ -2594,26 +2587,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 	return gfn_to_pfn_memslot_atomic(slot, gfn);
 }
 
-static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
-				  struct kvm_mmu_page *sp, u64 *spte,
-				  u64 gpte)
-{
-	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
-		goto no_present;
-
-	if (!is_present_gpte(gpte))
-		goto no_present;
-
-	if (!(gpte & PT_ACCESSED_MASK))
-		goto no_present;
-
-	return false;
-
-no_present:
-	drop_spte(vcpu->kvm, spte);
-	return true;
-}
-
 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
 				    struct kvm_mmu_page *sp,
 				    u64 *start, u64 *end)
@@ -2710,7 +2683,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 					      iterator.level - 1,
 					      1, ACC_ALL, iterator.sptep);
 
-			link_shadow_page(iterator.sptep, sp);
+			link_shadow_page(iterator.sptep, sp, true);
 		}
 	}
 	return emulate;
@@ -2811,6 +2784,13 @@ exit:
 static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
 {
 	/*
+	 * Do not fix the mmio spte with invalid generation number which
+	 * need to be updated by slow page fault path.
+	 */
+	if (unlikely(error_code & PFERR_RSVD_MASK))
+		return false;
+
+	/*
 	 * #PF can be fast only if the shadow page table is present and it
 	 * is caused by write-protect, that means we just need change the
 	 * W bit of the spte which can be done out of mmu-lock.
@@ -3202,6 +3182,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 	mmu_sync_roots(vcpu);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
 
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
 				  u32 access, struct x86_exception *exception)
@@ -3471,6 +3452,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 	++vcpu->stat.tlb_flush;
 	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
 
 static void paging_new_cr3(struct kvm_vcpu *vcpu)
 {
@@ -3494,18 +3476,6 @@ static void paging_free(struct kvm_vcpu *vcpu)
 	nonpaging_free(vcpu);
 }
 
-static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
-{
-	unsigned mask;
-
-	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
-
-	mask = (unsigned)~ACC_WRITE_MASK;
-	/* Allow write access to dirty gptes */
-	mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
-	*access &= mask;
-}
-
 static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
 			   unsigned access, int *nr_present)
 {
@@ -3523,16 +3493,6 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
 	return false;
 }
 
-static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
-{
-	unsigned access;
-
-	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
-	access &= ~(gpte >> PT64_NX_SHIFT);
-
-	return access;
-}
-
 static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
 {
 	unsigned index;
@@ -3542,6 +3502,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
 	return mmu->last_pte_bitmap & (1 << index);
 }
 
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
@@ -3556,6 +3521,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
 	int maxphyaddr = cpuid_maxphyaddr(vcpu);
 	u64 exb_bit_rsvd = 0;
 
+	context->bad_mt_xwr = 0;
+
 	if (!context->nx)
 		exb_bit_rsvd = rsvd_bits(63, 63);
 	switch (context->root_level) {
@@ -3611,7 +3578,40 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
 	}
 }
 
-static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
+		struct kvm_mmu *context, bool execonly)
+{
+	int maxphyaddr = cpuid_maxphyaddr(vcpu);
+	int pte;
+
+	context->rsvd_bits_mask[0][3] =
+		rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
+	context->rsvd_bits_mask[0][2] =
+		rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+	context->rsvd_bits_mask[0][1] =
+		rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+	context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
+
+	/* large page */
+	context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
+	context->rsvd_bits_mask[1][2] =
+		rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
+	context->rsvd_bits_mask[1][1] =
+		rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
+	context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+
+	for (pte = 0; pte < 64; pte++) {
+		int rwx_bits = pte & 7;
+		int mt = pte >> 3;
+		if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
+				rwx_bits == 0x2 || rwx_bits == 0x6 ||
+				(rwx_bits == 0x4 && !execonly))
+			context->bad_mt_xwr |= (1ull << pte);
+	}
+}
+
+static void update_permission_bitmask(struct kvm_vcpu *vcpu,
+		struct kvm_mmu *mmu, bool ept)
 {
 	unsigned bit, byte, pfec;
 	u8 map;
@@ -3629,12 +3629,16 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu
 			w = bit & ACC_WRITE_MASK;
 			u = bit & ACC_USER_MASK;
 
-			/* Not really needed: !nx will cause pte.nx to fault */
-			x |= !mmu->nx;
-			/* Allow supervisor writes if !cr0.wp */
-			w |= !is_write_protection(vcpu) && !uf;
-			/* Disallow supervisor fetches of user code if cr4.smep */
-			x &= !(smep && u && !uf);
+			if (!ept) {
+				/* Not really needed: !nx will cause pte.nx to fault */
+				x |= !mmu->nx;
+				/* Allow supervisor writes if !cr0.wp */
+				w |= !is_write_protection(vcpu) && !uf;
+				/* Disallow supervisor fetches of user code if cr4.smep */
+				x &= !(smep && u && !uf);
+			} else
+				/* Not really needed: no U/S accesses on ept  */
+				u = 1;
 
 			fault = (ff && !x) || (uf && !u) || (wf && !w);
 			map |= fault << bit;
@@ -3669,7 +3673,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 	context->root_level = level;
 
 	reset_rsvds_bits_mask(vcpu, context);
-	update_permission_bitmask(vcpu, context);
+	update_permission_bitmask(vcpu, context, false);
 	update_last_pte_bitmap(vcpu, context);
 
 	ASSERT(is_pae(vcpu));
@@ -3699,7 +3703,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
 	context->root_level = PT32_ROOT_LEVEL;
 
 	reset_rsvds_bits_mask(vcpu, context);
-	update_permission_bitmask(vcpu, context);
+	update_permission_bitmask(vcpu, context, false);
 	update_last_pte_bitmap(vcpu, context);
 
 	context->new_cr3 = paging_new_cr3;
@@ -3761,7 +3765,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 		context->gva_to_gpa = paging32_gva_to_gpa;
 	}
 
-	update_permission_bitmask(vcpu, context);
+	update_permission_bitmask(vcpu, context, false);
 	update_last_pte_bitmap(vcpu, context);
 
 	return 0;
@@ -3793,6 +3797,33 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
+int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+		bool execonly)
+{
+	ASSERT(vcpu);
+	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+
+	context->nx = true;
+	context->new_cr3 = paging_new_cr3;
+	context->page_fault = ept_page_fault;
+	context->gva_to_gpa = ept_gva_to_gpa;
+	context->sync_page = ept_sync_page;
+	context->invlpg = ept_invlpg;
+	context->update_pte = ept_update_pte;
+	context->free = paging_free;
+	context->root_level = context->shadow_root_level;
+	context->root_hpa = INVALID_PAGE;
+	context->direct_map = false;
+
+	update_permission_bitmask(vcpu, context, true);
+	reset_rsvds_bits_mask_ept(vcpu, context, execonly);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
+
 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
 	int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
@@ -3840,7 +3871,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 		g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
 	}
 
-	update_permission_bitmask(vcpu, g_context);
+	update_permission_bitmask(vcpu, g_context, false);
 	update_last_pte_bitmap(vcpu, g_context);
 
 	return 0;
@@ -3916,8 +3947,8 @@ static bool need_remote_flush(u64 old, u64 new)
 		return true;
 	if ((old ^ new) & PT64_BASE_ADDR_MASK)
 		return true;
-	old ^= PT64_NX_MASK;
-	new ^= PT64_NX_MASK;
+	old ^= shadow_nx_mask;
+	new ^= shadow_nx_mask;
 	return (old & ~new & PT64_PERM_MASK) != 0;
 }
 
@@ -4175,7 +4206,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 	switch (er) {
 	case EMULATE_DONE:
 		return 1;
-	case EMULATE_DO_MMIO:
+	case EMULATE_USER_EXIT:
 		++vcpu->stat.mmio_exits;
 		/* fall through */
 	case EMULATE_FAIL:
@@ -4383,11 +4414,8 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
 	/*
 	 * The very rare case: if the generation-number is round,
 	 * zap all shadow pages.
-	 *
-	 * The max value is MMIO_MAX_GEN - 1 since it is not called
-	 * when mark memslot invalid.
 	 */
-	if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) {
+	if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) {
 		printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
 		kvm_mmu_invalidate_zap_all_pages(kvm);
 	}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 5b59c573aba7..77e044a0f5f7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -71,6 +71,8 @@ enum {
 
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+		bool execonly);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7769699d48a8..043330159179 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -23,6 +23,13 @@
  * so the code in this file is compiled twice, once per pte size.
  */
 
+/*
+ * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro
+ * uses for EPT without A/D paging type.
+ */
+extern u64 __pure __using_nonexistent_pte_bit(void)
+	       __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT");
+
 #if PTTYPE == 64
 	#define pt_element_t u64
 	#define guest_walker guest_walker64
@@ -32,6 +39,10 @@
 	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
 	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
 	#define PT_LEVEL_BITS PT64_LEVEL_BITS
+	#define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
+	#define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
+	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
 	#ifdef CONFIG_X86_64
 	#define PT_MAX_FULL_LEVELS 4
 	#define CMPXCHG cmpxchg
@@ -49,7 +60,26 @@
 	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
 	#define PT_LEVEL_BITS PT32_LEVEL_BITS
 	#define PT_MAX_FULL_LEVELS 2
+	#define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
+	#define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
+	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
 	#define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+	#define pt_element_t u64
+	#define guest_walker guest_walkerEPT
+	#define FNAME(name) ept_##name
+	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+	#define PT_LEVEL_BITS PT64_LEVEL_BITS
+	#define PT_GUEST_ACCESSED_MASK 0
+	#define PT_GUEST_DIRTY_MASK 0
+	#define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
+	#define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()
+	#define CMPXCHG cmpxchg64
+	#define PT_MAX_FULL_LEVELS 4
 #else
 	#error Invalid PTTYPE value
 #endif
@@ -80,6 +110,40 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
 	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
 }
 
+static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
+{
+	unsigned mask;
+
+	/* dirty bit is not supported, so no need to track it */
+	if (!PT_GUEST_DIRTY_MASK)
+		return;
+
+	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
+
+	mask = (unsigned)~ACC_WRITE_MASK;
+	/* Allow write access to dirty gptes */
+	mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
+		PT_WRITABLE_MASK;
+	*access &= mask;
+}
+
+static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+	int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f;
+
+	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) |
+		((mmu->bad_mt_xwr & (1ull << low6)) != 0);
+}
+
+static inline int FNAME(is_present_gpte)(unsigned long pte)
+{
+#if PTTYPE != PTTYPE_EPT
+	return is_present_gpte(pte);
+#else
+	return pte & 7;
+#endif
+}
+
 static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 			       pt_element_t __user *ptep_user, unsigned index,
 			       pt_element_t orig_pte, pt_element_t new_pte)
@@ -103,6 +167,42 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 	return (ret != orig_pte);
 }
 
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu_page *sp, u64 *spte,
+				  u64 gpte)
+{
+	if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+		goto no_present;
+
+	if (!FNAME(is_present_gpte)(gpte))
+		goto no_present;
+
+	/* if accessed bit is not supported prefetch non accessed gpte */
+	if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK))
+		goto no_present;
+
+	return false;
+
+no_present:
+	drop_spte(vcpu->kvm, spte);
+	return true;
+}
+
+static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+	unsigned access;
+#if PTTYPE == PTTYPE_EPT
+	access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
+		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
+		ACC_USER_MASK;
+#else
+	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+	access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
+
+	return access;
+}
+
 static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 					     struct kvm_mmu *mmu,
 					     struct guest_walker *walker,
@@ -114,18 +214,23 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 	gfn_t table_gfn;
 	int ret;
 
+	/* dirty/accessed bits are not supported, so no need to update them */
+	if (!PT_GUEST_DIRTY_MASK)
+		return 0;
+
 	for (level = walker->max_level; level >= walker->level; --level) {
 		pte = orig_pte = walker->ptes[level - 1];
 		table_gfn = walker->table_gfn[level - 1];
 		ptep_user = walker->ptep_user[level - 1];
 		index = offset_in_page(ptep_user) / sizeof(pt_element_t);
-		if (!(pte & PT_ACCESSED_MASK)) {
+		if (!(pte & PT_GUEST_ACCESSED_MASK)) {
 			trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
-			pte |= PT_ACCESSED_MASK;
+			pte |= PT_GUEST_ACCESSED_MASK;
 		}
-		if (level == walker->level && write_fault && !is_dirty_gpte(pte)) {
+		if (level == walker->level && write_fault &&
+				!(pte & PT_GUEST_DIRTY_MASK)) {
 			trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
-			pte |= PT_DIRTY_MASK;
+			pte |= PT_GUEST_DIRTY_MASK;
 		}
 		if (pte == orig_pte)
 			continue;
@@ -170,7 +275,7 @@ retry_walk:
 	if (walker->level == PT32E_ROOT_LEVEL) {
 		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
 		trace_kvm_mmu_paging_element(pte, walker->level);
-		if (!is_present_gpte(pte))
+		if (!FNAME(is_present_gpte)(pte))
 			goto error;
 		--walker->level;
 	}
@@ -179,7 +284,7 @@ retry_walk:
 	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
 	       (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
 
-	accessed_dirty = PT_ACCESSED_MASK;
+	accessed_dirty = PT_GUEST_ACCESSED_MASK;
 	pt_access = pte_access = ACC_ALL;
 	++walker->level;
 
@@ -215,17 +320,17 @@ retry_walk:
 
 		trace_kvm_mmu_paging_element(pte, walker->level);
 
-		if (unlikely(!is_present_gpte(pte)))
+		if (unlikely(!FNAME(is_present_gpte)(pte)))
 			goto error;
 
-		if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
-					      walker->level))) {
+		if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte,
+					             walker->level))) {
 			errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
 			goto error;
 		}
 
 		accessed_dirty &= pte;
-		pte_access = pt_access & gpte_access(vcpu, pte);
+		pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
 
 		walker->ptes[walker->level - 1] = pte;
 	} while (!is_last_gpte(mmu, walker->level, pte));
@@ -248,13 +353,15 @@ retry_walk:
 	walker->gfn = real_gpa >> PAGE_SHIFT;
 
 	if (!write_fault)
-		protect_clean_gpte(&pte_access, pte);
+		FNAME(protect_clean_gpte)(&pte_access, pte);
 	else
 		/*
-		 * On a write fault, fold the dirty bit into accessed_dirty by
-		 * shifting it one place right.
+		 * On a write fault, fold the dirty bit into accessed_dirty.
+		 * For modes without A/D bits support accessed_dirty will be
+		 * always clear.
 		 */
-		accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
+		accessed_dirty &= pte >>
+			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
 
 	if (unlikely(!accessed_dirty)) {
 		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
@@ -279,6 +386,25 @@ error:
 	walker->fault.vector = PF_VECTOR;
 	walker->fault.error_code_valid = true;
 	walker->fault.error_code = errcode;
+
+#if PTTYPE == PTTYPE_EPT
+	/*
+	 * Use PFERR_RSVD_MASK in error_code to to tell if EPT
+	 * misconfiguration requires to be injected. The detection is
+	 * done by is_rsvd_bits_set() above.
+	 *
+	 * We set up the value of exit_qualification to inject:
+	 * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation
+	 * [5:3] - Calculated by the page walk of the guest EPT page tables
+	 * [7:8] - Derived from [7:8] of real exit_qualification
+	 *
+	 * The other bits are set to 0.
+	 */
+	if (!(errcode & PFERR_RSVD_MASK)) {
+		vcpu->arch.exit_qualification &= 0x187;
+		vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3;
+	}
+#endif
 	walker->fault.address = addr;
 	walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
 
@@ -293,6 +419,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 					access);
 }
 
+#if PTTYPE != PTTYPE_EPT
 static int FNAME(walk_addr_nested)(struct guest_walker *walker,
 				   struct kvm_vcpu *vcpu, gva_t addr,
 				   u32 access)
@@ -300,6 +427,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
 	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
 					addr, access);
 }
+#endif
 
 static bool
 FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -309,14 +437,14 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	gfn_t gfn;
 	pfn_t pfn;
 
-	if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
+	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
 		return false;
 
 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
 
 	gfn = gpte_to_gfn(gpte);
-	pte_access = sp->role.access & gpte_access(vcpu, gpte);
-	protect_clean_gpte(&pte_access, gpte);
+	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+	FNAME(protect_clean_gpte)(&pte_access, gpte);
 	pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
 			no_dirty_log && (pte_access & ACC_WRITE_MASK));
 	if (is_error_pfn(pfn))
@@ -446,7 +574,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			goto out_gpte_changed;
 
 		if (sp)
-			link_shadow_page(it.sptep, sp);
+			link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
 	}
 
 	for (;
@@ -466,7 +594,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 
 		sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
 				      true, direct_access, it.sptep);
-		link_shadow_page(it.sptep, sp);
+		link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
 	}
 
 	clear_sp_write_flooding_count(it.sptep);
@@ -727,6 +855,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
 	return gpa;
 }
 
+#if PTTYPE != PTTYPE_EPT
 static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
 				      u32 access,
 				      struct x86_exception *exception)
@@ -745,6 +874,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
 
 	return gpa;
 }
+#endif
 
 /*
  * Using the cached information from sp->gfns is safe because:
@@ -785,15 +915,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 					  sizeof(pt_element_t)))
 			return -EINVAL;
 
-		if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) {
+		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
 			vcpu->kvm->tlbs_dirty++;
 			continue;
 		}
 
 		gfn = gpte_to_gfn(gpte);
 		pte_access = sp->role.access;
-		pte_access &= gpte_access(vcpu, gpte);
-		protect_clean_gpte(&pte_access, gpte);
+		pte_access &= FNAME(gpte_access)(vcpu, gpte);
+		FNAME(protect_clean_gpte)(&pte_access, gpte);
 
 		if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access,
 		      &nr_present))
@@ -830,3 +960,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 #undef gpte_to_gfn
 #undef gpte_to_gfn_lvl
 #undef CMPXCHG
+#undef PT_GUEST_ACCESSED_MASK
+#undef PT_GUEST_DIRTY_MASK
+#undef PT_GUEST_DIRTY_SHIFT
+#undef PT_GUEST_ACCESSED_SHIFT
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index c53e797e7369..5c4f63151b4d 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -160,7 +160,7 @@ static void stop_counter(struct kvm_pmc *pmc)
 
 static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
 		unsigned config, bool exclude_user, bool exclude_kernel,
-		bool intr)
+		bool intr, bool in_tx, bool in_tx_cp)
 {
 	struct perf_event *event;
 	struct perf_event_attr attr = {
@@ -173,6 +173,10 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
 		.exclude_kernel = exclude_kernel,
 		.config = config,
 	};
+	if (in_tx)
+		attr.config |= HSW_IN_TX;
+	if (in_tx_cp)
+		attr.config |= HSW_IN_TX_CHECKPOINTED;
 
 	attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
 
@@ -226,7 +230,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 
 	if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
 				ARCH_PERFMON_EVENTSEL_INV |
-				ARCH_PERFMON_EVENTSEL_CMASK))) {
+				ARCH_PERFMON_EVENTSEL_CMASK |
+				HSW_IN_TX |
+				HSW_IN_TX_CHECKPOINTED))) {
 		config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
 				unit_mask);
 		if (config != PERF_COUNT_HW_MAX)
@@ -239,7 +245,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 	reprogram_counter(pmc, type, config,
 			!(eventsel & ARCH_PERFMON_EVENTSEL_USR),
 			!(eventsel & ARCH_PERFMON_EVENTSEL_OS),
-			eventsel & ARCH_PERFMON_EVENTSEL_INT);
+			eventsel & ARCH_PERFMON_EVENTSEL_INT,
+			(eventsel & HSW_IN_TX),
+			(eventsel & HSW_IN_TX_CHECKPOINTED));
 }
 
 static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
@@ -256,7 +264,7 @@ static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
 			arch_events[fixed_pmc_events[idx]].event_type,
 			!(en & 0x2), /* exclude user */
 			!(en & 0x1), /* exclude kernel */
-			pmi);
+			pmi, false, false);
 }
 
 static inline u8 fixed_en_pmi(u64 ctrl, int idx)
@@ -408,7 +416,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		} else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
 			if (data == pmc->eventsel)
 				return 0;
-			if (!(data & 0xffffffff00200000ull)) {
+			if (!(data & pmu->reserved_bits)) {
 				reprogram_gp_counter(pmc, data);
 				return 0;
 			}
@@ -450,6 +458,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
 	pmu->counter_bitmask[KVM_PMC_GP] = 0;
 	pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
 	pmu->version = 0;
+	pmu->reserved_bits = 0xffffffff00200000ull;
 
 	entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
 	if (!entry)
@@ -478,6 +487,12 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
 	pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
 		(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
 	pmu->global_ctrl_mask = ~pmu->global_ctrl;
+
+	entry = kvm_find_cpuid_entry(vcpu, 7, 0);
+	if (entry &&
+	    (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
+	    (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
+		pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
 }
 
 void kvm_pmu_init(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 064d0be67ecc..57b4e129891a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -373,6 +373,7 @@ struct nested_vmx {
 	 * we must keep them pinned while L2 runs.
 	 */
 	struct page *apic_access_page;
+	u64 msr_ia32_feature_control;
 };
 
 #define POSTED_INTR_ON  0
@@ -711,10 +712,10 @@ static void nested_release_page_clean(struct page *page)
 	kvm_release_page_clean(page);
 }
 
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg);
@@ -1039,12 +1040,16 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
 		(vmcs12->secondary_vm_exec_control & bit);
 }
 
-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
-	struct kvm_vcpu *vcpu)
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
 {
 	return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
+{
+	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2155,6 +2160,7 @@ static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
 static u32 nested_vmx_misc_low, nested_vmx_misc_high;
+static u32 nested_vmx_ept_caps;
 static __init void nested_vmx_setup_ctls_msrs(void)
 {
 	/*
@@ -2190,14 +2196,17 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 	 * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
 	 * 17 must be 1.
 	 */
+	rdmsr(MSR_IA32_VMX_EXIT_CTLS,
+		nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
 	nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
 	/* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
+	nested_vmx_exit_ctls_high &=
 #ifdef CONFIG_X86_64
-	nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
-#else
-	nested_vmx_exit_ctls_high = 0;
+		VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
-	nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+	nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+				      VM_EXIT_LOAD_IA32_EFER);
 
 	/* entry controls */
 	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2205,8 +2214,12 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 	/* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */
 	nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
 	nested_vmx_entry_ctls_high &=
-		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
-	nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+#ifdef CONFIG_X86_64
+		VM_ENTRY_IA32E_MODE |
+#endif
+		VM_ENTRY_LOAD_IA32_PAT;
+	nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
+				       VM_ENTRY_LOAD_IA32_EFER);
 
 	/* cpu-based controls */
 	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2241,6 +2254,22 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 		SECONDARY_EXEC_WBINVD_EXITING;
 
+	if (enable_ept) {
+		/* nested EPT: emulate EPT also to L1 */
+		nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
+		nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
+			 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
+		nested_vmx_ept_caps &= vmx_capability.ept;
+		/*
+		 * Since invept is completely emulated we support both global
+		 * and context invalidation independent of what host cpu
+		 * supports
+		 */
+		nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
+			VMX_EPT_EXTENT_CONTEXT_BIT;
+	} else
+		nested_vmx_ept_caps = 0;
+
 	/* miscellaneous data */
 	rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
 	nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
@@ -2282,8 +2311,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 
 	switch (msr_index) {
 	case MSR_IA32_FEATURE_CONTROL:
-		*pdata = 0;
-		break;
+		if (nested_vmx_allowed(vcpu)) {
+			*pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control;
+			break;
+		}
+		return 0;
 	case MSR_IA32_VMX_BASIC:
 		/*
 		 * This MSR reports some information about VMX support. We
@@ -2346,8 +2378,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 					nested_vmx_secondary_ctls_high);
 		break;
 	case MSR_IA32_VMX_EPT_VPID_CAP:
-		/* Currently, no nested ept or nested vpid */
-		*pdata = 0;
+		/* Currently, no nested vpid support */
+		*pdata = nested_vmx_ept_caps;
 		break;
 	default:
 		return 0;
@@ -2356,14 +2388,24 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 	return 1;
 }
 
-static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
+	u32 msr_index = msr_info->index;
+	u64 data = msr_info->data;
+	bool host_initialized = msr_info->host_initiated;
+
 	if (!nested_vmx_allowed(vcpu))
 		return 0;
 
-	if (msr_index == MSR_IA32_FEATURE_CONTROL)
-		/* TODO: the right thing. */
+	if (msr_index == MSR_IA32_FEATURE_CONTROL) {
+		if (!host_initialized &&
+				to_vmx(vcpu)->nested.msr_ia32_feature_control
+				& FEATURE_CONTROL_LOCKED)
+			return 0;
+		to_vmx(vcpu)->nested.msr_ia32_feature_control = data;
 		return 1;
+	}
+
 	/*
 	 * No need to treat VMX capability MSRs specially: If we don't handle
 	 * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
@@ -2494,7 +2536,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		/* Otherwise falls through */
 	default:
-		if (vmx_set_vmx_msr(vcpu, msr_index, data))
+		if (vmx_set_vmx_msr(vcpu, msr_info))
 			break;
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
@@ -5302,9 +5344,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 
 	/* It is a write fault? */
 	error_code = exit_qualification & (1U << 1);
+	/* It is a fetch fault? */
+	error_code |= (exit_qualification & (1U << 2)) << 2;
 	/* ept page table is present? */
 	error_code |= (exit_qualification >> 3) & 0x1;
 
+	vcpu->arch.exit_qualification = exit_qualification;
+
 	return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }
 
@@ -5438,7 +5484,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 
 		err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
 
-		if (err == EMULATE_DO_MMIO) {
+		if (err == EMULATE_USER_EXIT) {
 			ret = 0;
 			goto out;
 		}
@@ -5567,8 +5613,47 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
 		free_loaded_vmcs(&vmx->vmcs01);
 }
 
+/*
+ * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
+ * set the success or error code of an emulated VMX instruction, as specified
+ * by Vol 2B, VMX Instruction Reference, "Conventions".
+ */
+static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
+{
+	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
+			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
+}
+
+static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
+{
+	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+			    X86_EFLAGS_SF | X86_EFLAGS_OF))
+			| X86_EFLAGS_CF);
+}
+
 static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
-				 u32 vm_instruction_error);
+					u32 vm_instruction_error)
+{
+	if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
+		/*
+		 * failValid writes the error number to the current VMCS, which
+		 * can't be done there isn't a current VMCS.
+		 */
+		nested_vmx_failInvalid(vcpu);
+		return;
+	}
+	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+			    X86_EFLAGS_SF | X86_EFLAGS_OF))
+			| X86_EFLAGS_ZF);
+	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
+	/*
+	 * We don't need to force a shadow sync because
+	 * VM_INSTRUCTION_ERROR is not shadowed
+	 */
+}
 
 /*
  * Emulate the VMXON instruction.
@@ -5583,6 +5668,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 	struct kvm_segment cs;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs *shadow_vmcs;
+	const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
+		| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
 
 	/* The Intel VMX Instruction Reference lists a bunch of bits that
 	 * are prerequisite to running VMXON, most notably cr4.VMXE must be
@@ -5611,6 +5698,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 		skip_emulated_instruction(vcpu);
 		return 1;
 	}
+
+	if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
+			!= VMXON_NEEDED_FEATURES) {
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+
 	if (enable_shadow_vmcs) {
 		shadow_vmcs = alloc_vmcs();
 		if (!shadow_vmcs)
@@ -5628,6 +5722,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 	vmx->nested.vmxon = true;
 
 	skip_emulated_instruction(vcpu);
+	nested_vmx_succeed(vcpu);
 	return 1;
 }
 
@@ -5712,6 +5807,7 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
 		return 1;
 	free_nested(to_vmx(vcpu));
 	skip_emulated_instruction(vcpu);
+	nested_vmx_succeed(vcpu);
 	return 1;
 }
 
@@ -5768,48 +5864,6 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-/*
- * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
- * set the success or error code of an emulated VMX instruction, as specified
- * by Vol 2B, VMX Instruction Reference, "Conventions".
- */
-static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
-{
-	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
-			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
-			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
-}
-
-static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
-{
-	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
-			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
-			    X86_EFLAGS_SF | X86_EFLAGS_OF))
-			| X86_EFLAGS_CF);
-}
-
-static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
-					u32 vm_instruction_error)
-{
-	if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
-		/*
-		 * failValid writes the error number to the current VMCS, which
-		 * can't be done there isn't a current VMCS.
-		 */
-		nested_vmx_failInvalid(vcpu);
-		return;
-	}
-	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
-			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
-			    X86_EFLAGS_SF | X86_EFLAGS_OF))
-			| X86_EFLAGS_ZF);
-	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
-	/*
-	 * We don't need to force a shadow sync because
-	 * VM_INSTRUCTION_ERROR is not shadowed
-	 */
-}
-
 /* Emulate the VMCLEAR instruction */
 static int handle_vmclear(struct kvm_vcpu *vcpu)
 {
@@ -5972,8 +6026,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
 	unsigned long field;
 	u64 field_value;
 	struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
-	unsigned long *fields = (unsigned long *)shadow_read_write_fields;
-	int num_fields = max_shadow_read_write_fields;
+	const unsigned long *fields = shadow_read_write_fields;
+	const int num_fields = max_shadow_read_write_fields;
 
 	vmcs_load(shadow_vmcs);
 
@@ -6002,12 +6056,11 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
 
 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
 {
-	unsigned long *fields[] = {
-		(unsigned long *)shadow_read_write_fields,
-		(unsigned long *)shadow_read_only_fields
+	const unsigned long *fields[] = {
+		shadow_read_write_fields,
+		shadow_read_only_fields
 	};
-	int num_lists =  ARRAY_SIZE(fields);
-	int max_fields[] = {
+	const int max_fields[] = {
 		max_shadow_read_write_fields,
 		max_shadow_read_only_fields
 	};
@@ -6018,7 +6071,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
 
 	vmcs_load(shadow_vmcs);
 
-	for (q = 0; q < num_lists; q++) {
+	for (q = 0; q < ARRAY_SIZE(fields); q++) {
 		for (i = 0; i < max_fields[q]; i++) {
 			field = fields[q][i];
 			vmcs12_read_any(&vmx->vcpu, field, &field_value);
@@ -6248,6 +6301,74 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+	u32 vmx_instruction_info, types;
+	unsigned long type;
+	gva_t gva;
+	struct x86_exception e;
+	struct {
+		u64 eptp, gpa;
+	} operand;
+	u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK;
+
+	if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
+	    !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+	types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
+
+	if (!(types & (1UL << type))) {
+		nested_vmx_failValid(vcpu,
+				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+		return 1;
+	}
+
+	/* According to the Intel VMX instruction reference, the memory
+	 * operand is read even if it isn't needed (e.g., for type==global)
+	 */
+	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+			vmx_instruction_info, &gva))
+		return 1;
+	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+				sizeof(operand), &e)) {
+		kvm_inject_page_fault(vcpu, &e);
+		return 1;
+	}
+
+	switch (type) {
+	case VMX_EPT_EXTENT_CONTEXT:
+		if ((operand.eptp & eptp_mask) !=
+				(nested_ept_get_cr3(vcpu) & eptp_mask))
+			break;
+	case VMX_EPT_EXTENT_GLOBAL:
+		kvm_mmu_sync_roots(vcpu);
+		kvm_mmu_flush_tlb(vcpu);
+		nested_vmx_succeed(vcpu);
+		break;
+	default:
+		BUG_ON(1);
+		break;
+	}
+
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -6292,6 +6413,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
 	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_invalid_op,
 	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
+	[EXIT_REASON_INVEPT]                  = handle_invept,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -6518,6 +6640,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
 	case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
 	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+	case EXIT_REASON_INVEPT:
 		/*
 		 * VMX instructions trap unconditionally. This allows L1 to
 		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
@@ -6550,7 +6673,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 		return nested_cpu_has2(vmcs12,
 			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
 	case EXIT_REASON_EPT_VIOLATION:
+		/*
+		 * L0 always deals with the EPT violation. If nested EPT is
+		 * used, and the nested mmu code discovers that the address is
+		 * missing in the guest EPT table (EPT12), the EPT violation
+		 * will be injected with nested_ept_inject_page_fault()
+		 */
+		return 0;
 	case EXIT_REASON_EPT_MISCONFIG:
+		/*
+		 * L2 never uses directly L1's EPT, but rather L0's own EPT
+		 * table (shadow on EPT) or a merged EPT table that L0 built
+		 * (EPT on EPT). So any problems with the structure of the
+		 * table is L0's fault.
+		 */
 		return 0;
 	case EXIT_REASON_PREEMPTION_TIMER:
 		return vmcs12->pin_based_vm_exec_control &
@@ -6638,7 +6774,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 
 	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
 	    !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
-	                                get_vmcs12(vcpu), vcpu)))) {
+					get_vmcs12(vcpu))))) {
 		if (vmx_interrupt_allowed(vcpu)) {
 			vmx->soft_vnmi_blocked = 0;
 		} else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -7326,6 +7462,48 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 		entry->ecx |= bit(X86_FEATURE_VMX);
 }
 
+static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
+		struct x86_exception *fault)
+{
+	struct vmcs12 *vmcs12;
+	nested_vmx_vmexit(vcpu);
+	vmcs12 = get_vmcs12(vcpu);
+
+	if (fault->error_code & PFERR_RSVD_MASK)
+		vmcs12->vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
+	else
+		vmcs12->vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
+	vmcs12->exit_qualification = vcpu->arch.exit_qualification;
+	vmcs12->guest_physical_address = fault->address;
+}
+
+/* Callbacks for nested_ept_init_mmu_context: */
+
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
+{
+	/* return the page table to be shadowed - in our case, EPT12 */
+	return get_vmcs12(vcpu)->ept_pointer;
+}
+
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+	int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
+			nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
+
+	vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
+	vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
+	vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
+
+	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
+
+	return r;
+}
+
+static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+}
+
 /*
  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -7388,7 +7566,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		vmcs12->guest_interruptibility_info);
 	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
 	kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
-	vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
+	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
 	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
 		vmcs12->guest_pending_dbg_exceptions);
 	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
@@ -7508,15 +7686,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 
-	/* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
-	vmcs_write32(VM_EXIT_CONTROLS,
-		vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
-	vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
+	/* L2->L1 exit controls are emulated - the hardware exit is to L0 so
+	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
+	 * bits are further modified by vmx_set_efer() below.
+	 */
+	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
+	 * emulated by vmx_set_efer(), below.
+	 */
+	vmcs_write32(VM_ENTRY_CONTROLS,
+		(vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
+			~VM_ENTRY_IA32E_MODE) |
 		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
 
-	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
+	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) {
 		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
-	else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		vcpu->arch.pat = vmcs12->guest_ia32_pat;
+	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
 		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
 
 
@@ -7538,6 +7725,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		vmx_flush_tlb(vcpu);
 	}
 
+	if (nested_cpu_has_ept(vmcs12)) {
+		kvm_mmu_unload(vcpu);
+		nested_ept_init_mmu_context(vcpu);
+	}
+
 	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
 		vcpu->arch.efer = vmcs12->guest_ia32_efer;
 	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
@@ -7565,6 +7757,16 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	kvm_set_cr3(vcpu, vmcs12->guest_cr3);
 	kvm_mmu_reset_context(vcpu);
 
+	/*
+	 * L1 may access the L2's PDPTR, so save them to construct vmcs12
+	 */
+	if (enable_ept) {
+		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+	}
+
 	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
 	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
 }
@@ -7887,6 +8089,22 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmcs12->guest_pending_dbg_exceptions =
 		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 
+	/*
+	 * In some cases (usually, nested EPT), L2 is allowed to change its
+	 * own CR3 without exiting. If it has changed it, we must keep it.
+	 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
+	 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
+	 *
+	 * Additionally, restore L2's PDPTR to vmcs12.
+	 */
+	if (enable_ept) {
+		vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
+		vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+		vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+		vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+		vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+	}
+
 	vmcs12->vm_entry_controls =
 		(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
 		(vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
@@ -7948,6 +8166,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 				   struct vmcs12 *vmcs12)
 {
+	struct kvm_segment seg;
+
 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
 		vcpu->arch.efer = vmcs12->host_ia32_efer;
 	else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
@@ -7982,7 +8202,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
 	kvm_set_cr4(vcpu, vmcs12->host_cr4);
 
-	/* shadow page tables on either EPT or shadow page tables */
+	if (nested_cpu_has_ept(vmcs12))
+		nested_ept_uninit_mmu_context(vcpu);
+
 	kvm_set_cr3(vcpu, vmcs12->host_cr3);
 	kvm_mmu_reset_context(vcpu);
 
@@ -8001,23 +8223,61 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
 	vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
 	vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
-	vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base);
-	vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base);
-	vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
-	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
-	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
-	vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
-	vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
-	vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
-	vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
-	vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);
-
-	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
+
+	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
 		vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
+		vcpu->arch.pat = vmcs12->host_ia32_pat;
+	}
 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
 		vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
 			vmcs12->host_ia32_perf_global_ctrl);
 
+	/* Set L1 segment info according to Intel SDM
+	    27.5.2 Loading Host Segment and Descriptor-Table Registers */
+	seg = (struct kvm_segment) {
+		.base = 0,
+		.limit = 0xFFFFFFFF,
+		.selector = vmcs12->host_cs_selector,
+		.type = 11,
+		.present = 1,
+		.s = 1,
+		.g = 1
+	};
+	if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
+		seg.l = 1;
+	else
+		seg.db = 1;
+	vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
+	seg = (struct kvm_segment) {
+		.base = 0,
+		.limit = 0xFFFFFFFF,
+		.type = 3,
+		.present = 1,
+		.s = 1,
+		.db = 1,
+		.g = 1
+	};
+	seg.selector = vmcs12->host_ds_selector;
+	vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
+	seg.selector = vmcs12->host_es_selector;
+	vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
+	seg.selector = vmcs12->host_ss_selector;
+	vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
+	seg.selector = vmcs12->host_fs_selector;
+	seg.base = vmcs12->host_fs_base;
+	vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
+	seg.selector = vmcs12->host_gs_selector;
+	seg.base = vmcs12->host_gs_base;
+	vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
+	seg = (struct kvm_segment) {
+		.base = vmcs12->host_tr_base,
+		.limit = 0x67,
+		.selector = vmcs12->host_tr_selector,
+		.type = 11,
+		.present = 1
+	};
+	vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
+
 	kvm_set_dr(vcpu, 7, 0x400);
 	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d21bce505315..668f19aee6ca 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -682,17 +682,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 		 */
 	}
 
-	/*
-	 * Does the new cr3 value map to physical memory? (Note, we
-	 * catch an invalid cr3 even in real-mode, because it would
-	 * cause trouble later on when we turn on paging anyway.)
-	 *
-	 * A real CPU would silently accept an invalid cr3 and would
-	 * attempt to use it - with largely undefined (and often hard
-	 * to debug) behavior on the guest side.
-	 */
-	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
-		return 1;
 	vcpu->arch.cr3 = cr3;
 	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
 	vcpu->arch.mmu.new_cr3(vcpu);
@@ -850,7 +839,8 @@ static u32 msrs_to_save[] = {
 #ifdef CONFIG_X86_64
 	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+	MSR_IA32_FEATURE_CONTROL
 };
 
 static unsigned num_msrs_to_save;
@@ -4955,6 +4945,97 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
 static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
 static int complete_emulated_pio(struct kvm_vcpu *vcpu);
 
+static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
+				unsigned long *db)
+{
+	u32 dr6 = 0;
+	int i;
+	u32 enable, rwlen;
+
+	enable = dr7;
+	rwlen = dr7 >> 16;
+	for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
+		if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
+			dr6 |= (1 << i);
+	return dr6;
+}
+
+static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r)
+{
+	struct kvm_run *kvm_run = vcpu->run;
+
+	/*
+	 * Use the "raw" value to see if TF was passed to the processor.
+	 * Note that the new value of the flags has not been saved yet.
+	 *
+	 * This is correct even for TF set by the guest, because "the
+	 * processor will not generate this exception after the instruction
+	 * that sets the TF flag".
+	 */
+	unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
+
+	if (unlikely(rflags & X86_EFLAGS_TF)) {
+		if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+			kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1;
+			kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
+			kvm_run->debug.arch.exception = DB_VECTOR;
+			kvm_run->exit_reason = KVM_EXIT_DEBUG;
+			*r = EMULATE_USER_EXIT;
+		} else {
+			vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
+			/*
+			 * "Certain debug exceptions may clear bit 0-3.  The
+			 * remaining contents of the DR6 register are never
+			 * cleared by the processor".
+			 */
+			vcpu->arch.dr6 &= ~15;
+			vcpu->arch.dr6 |= DR6_BS;
+			kvm_queue_exception(vcpu, DB_VECTOR);
+		}
+	}
+}
+
+static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
+{
+	struct kvm_run *kvm_run = vcpu->run;
+	unsigned long eip = vcpu->arch.emulate_ctxt.eip;
+	u32 dr6 = 0;
+
+	if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
+	    (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
+		dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+					   vcpu->arch.guest_debug_dr7,
+					   vcpu->arch.eff_db);
+
+		if (dr6 != 0) {
+			kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
+			kvm_run->debug.arch.pc = kvm_rip_read(vcpu) +
+				get_segment_base(vcpu, VCPU_SREG_CS);
+
+			kvm_run->debug.arch.exception = DB_VECTOR;
+			kvm_run->exit_reason = KVM_EXIT_DEBUG;
+			*r = EMULATE_USER_EXIT;
+			return true;
+		}
+	}
+
+	if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) {
+		dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+					   vcpu->arch.dr7,
+					   vcpu->arch.db);
+
+		if (dr6 != 0) {
+			vcpu->arch.dr6 &= ~15;
+			vcpu->arch.dr6 |= dr6;
+			kvm_queue_exception(vcpu, DB_VECTOR);
+			*r = EMULATE_DONE;
+			return true;
+		}
+	}
+
+	return false;
+}
+
 int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 			    unsigned long cr2,
 			    int emulation_type,
@@ -4975,6 +5056,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 
 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
 		init_emulate_ctxt(vcpu);
+
+		/*
+		 * We will reenter on the same instruction since
+		 * we do not set complete_userspace_io.  This does not
+		 * handle watchpoints yet, those would be handled in
+		 * the emulate_ops.
+		 */
+		if (kvm_vcpu_check_breakpoint(vcpu, &r))
+			return r;
+
 		ctxt->interruptibility = 0;
 		ctxt->have_exception = false;
 		ctxt->perm_ok = false;
@@ -5037,11 +5128,11 @@ restart:
 			writeback = false;
 			vcpu->arch.complete_userspace_io = complete_emulated_pio;
 		}
-		r = EMULATE_DO_MMIO;
+		r = EMULATE_USER_EXIT;
 	} else if (vcpu->mmio_needed) {
 		if (!vcpu->mmio_is_write)
 			writeback = false;
-		r = EMULATE_DO_MMIO;
+		r = EMULATE_USER_EXIT;
 		vcpu->arch.complete_userspace_io = complete_emulated_mmio;
 	} else if (r == EMULATION_RESTART)
 		goto restart;
@@ -5050,10 +5141,12 @@ restart:
 
 	if (writeback) {
 		toggle_interruptibility(vcpu, ctxt->interruptibility);
-		kvm_set_rflags(vcpu, ctxt->eflags);
 		kvm_make_request(KVM_REQ_EVENT, vcpu);
 		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
 		kvm_rip_write(vcpu, ctxt->eip);
+		if (r == EMULATE_DONE)
+			kvm_vcpu_check_singlestep(vcpu, &r);
+		kvm_set_rflags(vcpu, ctxt->eflags);
 	} else
 		vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
 
@@ -5347,7 +5440,7 @@ static struct notifier_block pvclock_gtod_notifier = {
 int kvm_arch_init(void *opaque)
 {
 	int r;
-	struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
+	struct kvm_x86_ops *ops = opaque;
 
 	if (kvm_x86_ops) {
 		printk(KERN_ERR "kvm: already loaded the other module\n");
@@ -7019,6 +7112,15 @@ out_free:
 	return -ENOMEM;
 }
 
+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+	/*
+	 * memslots->generation has been incremented.
+	 * mmio generation may have reached its maximum value.
+	 */
+	kvm_mmu_invalidate_mmio_sptes(kvm);
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
 				struct kvm_userspace_memory_region *mem,
@@ -7079,11 +7181,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	 */
 	if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
 		kvm_mmu_slot_remove_write_access(kvm, mem->slot);
-	/*
-	 * If memory slot is created, or moved, we need to clear all
-	 * mmio sptes.
-	 */
-	kvm_mmu_invalidate_mmio_sptes(kvm);
 }
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index d482bcaf61c1..6a22c19da663 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -882,9 +882,9 @@ int lguest_setup_irq(unsigned int irq)
  * It would be far better for everyone if the Guest had its own clock, but
  * until then the Host gives us the time on every interrupt.
  */
-static unsigned long lguest_get_wallclock(void)
+static void lguest_get_wallclock(struct timespec *now)
 {
-	return lguest_data.time.tv_sec;
+	*now = lguest_data.time;
 }
 
 /*
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 252b8f5489ba..4500142bc4aa 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -1,6 +1,7 @@
 #include <linux/highmem.h>
 #include <linux/module.h>
 #include <linux/swap.h> /* for totalram_pages */
+#include <linux/bootmem.h>
 
 void *kmap(struct page *page)
 {
@@ -121,6 +122,11 @@ void __init set_highmem_pages_init(void)
 	struct zone *zone;
 	int nid;
 
+	/*
+	 * Explicitly reset zone->managed_pages because set_highmem_pages_init()
+	 * is invoked before free_all_bootmem()
+	 */
+	reset_all_zones_managed_pages();
 	for_each_zone(zone) {
 		unsigned long zone_start_pfn, zone_end_pfn;
 
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 1f34e9219775..2ec29ac78ae6 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -494,7 +494,6 @@ int devmem_is_allowed(unsigned long pagenr)
 
 void free_init_pages(char *what, unsigned long begin, unsigned long end)
 {
-	unsigned long addr;
 	unsigned long begin_aligned, end_aligned;
 
 	/* Make sure boundaries are page aligned */
@@ -509,8 +508,6 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	if (begin >= end)
 		return;
 
-	addr = begin;
-
 	/*
 	 * If debugging page accesses then do not free this memory but
 	 * mark them not present - any buggy init-section access will
@@ -529,18 +526,13 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
 	set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
 
-	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
-
-	for (; addr < end; addr += PAGE_SIZE) {
-		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
-		free_reserved_page(virt_to_page(addr));
-	}
+	free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what);
 #endif
 }
 
 void free_initmem(void)
 {
-	free_init_pages("unused kernel memory",
+	free_init_pages("unused kernel",
 			(unsigned long)(&__init_begin),
 			(unsigned long)(&__init_end));
 }
@@ -566,7 +558,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
 	 *   - relocate_initrd()
 	 * So here We can do PAGE_ALIGN() safely to get partial page to be freed
 	 */
-	free_init_pages("initrd memory", start, PAGE_ALIGN(end));
+	free_init_pages("initrd", start, PAGE_ALIGN(end));
 }
 #endif
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 3ac7e319918d..4287f1ffba7e 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -660,10 +660,8 @@ void __init initmem_init(void)
 		highstart_pfn = max_low_pfn;
 	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
 		pages_to_mb(highend_pfn - highstart_pfn));
-	num_physpages = highend_pfn;
 	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
 #else
-	num_physpages = max_low_pfn;
 	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
 #endif
 
@@ -671,7 +669,7 @@ void __init initmem_init(void)
 	sparse_memory_present_with_active_regions(0);
 
 #ifdef CONFIG_FLATMEM
-	max_mapnr = num_physpages;
+	max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? highend_pfn : max_low_pfn;
 #endif
 	__vmalloc_start_set = true;
 
@@ -739,9 +737,6 @@ static void __init test_wp_bit(void)
 
 void __init mem_init(void)
 {
-	int codesize, reservedpages, datasize, initsize;
-	int tmp;
-
 	pci_iommu_alloc();
 
 #ifdef CONFIG_FLATMEM
@@ -759,32 +754,11 @@ void __init mem_init(void)
 	set_highmem_pages_init();
 
 	/* this will put all low memory onto the freelists */
-	totalram_pages += free_all_bootmem();
-
-	reservedpages = 0;
-	for (tmp = 0; tmp < max_low_pfn; tmp++)
-		/*
-		 * Only count reserved RAM pages:
-		 */
-		if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
-			reservedpages++;
+	free_all_bootmem();
 
 	after_bootmem = 1;
 
-	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
-	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
-	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
-
-	printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
-			"%dk reserved, %dk data, %dk init, %ldk highmem)\n",
-		nr_free_pages() << (PAGE_SHIFT-10),
-		num_physpages << (PAGE_SHIFT-10),
-		codesize >> 10,
-		reservedpages << (PAGE_SHIFT-10),
-		datasize >> 10,
-		initsize >> 10,
-		totalhigh_pages << (PAGE_SHIFT-10));
-
+	mem_init_print_info(NULL);
 	printk(KERN_INFO "virtual kernel memory layout:\n"
 		"    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
 #ifdef CONFIG_HIGHMEM
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index b3940b6b4d7e..104d56a9245f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -712,36 +712,22 @@ EXPORT_SYMBOL_GPL(arch_add_memory);
 
 static void __meminit free_pagetable(struct page *page, int order)
 {
-	struct zone *zone;
-	bool bootmem = false;
 	unsigned long magic;
 	unsigned int nr_pages = 1 << order;
 
 	/* bootmem page has reserved flag */
 	if (PageReserved(page)) {
 		__ClearPageReserved(page);
-		bootmem = true;
 
 		magic = (unsigned long)page->lru.next;
 		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
 			while (nr_pages--)
 				put_page_bootmem(page++);
 		} else
-			__free_pages_bootmem(page, order);
+			while (nr_pages--)
+				free_reserved_page(page++);
 	} else
 		free_pages((unsigned long)page_address(page), order);
-
-	/*
-	 * SECTION_INFO pages and MIX_SECTION_INFO pages
-	 * are all allocated by bootmem.
-	 */
-	if (bootmem) {
-		zone = page_zone(page);
-		zone_span_writelock(zone);
-		zone->present_pages += nr_pages;
-		zone_span_writeunlock(zone);
-		totalram_pages += nr_pages;
-	}
 }
 
 static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
@@ -1058,9 +1044,6 @@ static void __init register_page_bootmem_info(void)
 
 void __init mem_init(void)
 {
-	long codesize, reservedpages, datasize, initsize;
-	unsigned long absent_pages;
-
 	pci_iommu_alloc();
 
 	/* clear_bss() already clear the empty_zero_page */
@@ -1068,29 +1051,14 @@ void __init mem_init(void)
 	register_page_bootmem_info();
 
 	/* this will put all memory onto the freelists */
-	totalram_pages = free_all_bootmem();
-
-	absent_pages = absent_pages_in_range(0, max_pfn);
-	reservedpages = max_pfn - totalram_pages - absent_pages;
+	free_all_bootmem();
 	after_bootmem = 1;
 
-	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
-	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
-	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
-
 	/* Register memory areas for /proc/kcore */
 	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
 			 VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);
 
-	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
-			 "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
-		nr_free_pages() << (PAGE_SHIFT-10),
-		max_pfn << (PAGE_SHIFT-10),
-		codesize >> 10,
-		absent_pages << (PAGE_SHIFT-10),
-		reservedpages << (PAGE_SHIFT-10),
-		datasize >> 10,
-		initsize >> 10);
+	mem_init_print_info(NULL);
 }
 
 #ifdef CONFIG_DEBUG_RODATA
@@ -1166,11 +1134,10 @@ void mark_rodata_ro(void)
 	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
 #endif
 
-	free_init_pages("unused kernel memory",
+	free_init_pages("unused kernel",
 			(unsigned long) __va(__pa_symbol(text_end)),
 			(unsigned long) __va(__pa_symbol(rodata_start)));
-
-	free_init_pages("unused kernel memory",
+	free_init_pages("unused kernel",
 			(unsigned long) __va(__pa_symbol(rodata_end)),
 			(unsigned long) __va(__pa_symbol(_sdata)));
 }
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 845df6835f9f..62c29a5bfe26 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -115,10 +115,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	if (mmap_is_legacy()) {
 		mm->mmap_base = mmap_legacy_base();
 		mm->get_unmapped_area = arch_get_unmapped_area;
-		mm->unmap_area = arch_unmap_area;
 	} else {
 		mm->mmap_base = mmap_base();
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 73a6d7395bd3..0342d27ca798 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -83,10 +83,8 @@ void __init initmem_init(void)
 		highstart_pfn = max_low_pfn;
 	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
 	       pages_to_mb(highend_pfn - highstart_pfn));
-	num_physpages = highend_pfn;
 	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
 #else
-	num_physpages = max_low_pfn;
 	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
 #endif
 	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 17fda6a8b3c2..dfa537a03be1 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -240,7 +240,6 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
 {
 	pud_t *pud;
-	unsigned long addr;
 	int i;
 
 	if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
@@ -248,8 +247,7 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
 
 	pud = pud_offset(pgd, 0);
 
- 	for (addr = i = 0; i < PREALLOCATED_PMDS;
-	     i++, pud++, addr += PUD_SIZE) {
+	for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
 		pmd_t *pmd = pmds[i];
 
 		if (i >= KERNEL_PGD_BOUNDARY)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index f66b54086ce5..79c216aa0e2b 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -12,6 +12,7 @@
 #include <linux/netdevice.h>
 #include <linux/filter.h>
 #include <linux/if_vlan.h>
+#include <linux/random.h>
 
 /*
  * Conventions :
@@ -144,6 +145,39 @@ static int pkt_type_offset(void)
 	return -1;
 }
 
+struct bpf_binary_header {
+	unsigned int	pages;
+	/* Note : for security reasons, bpf code will follow a randomly
+	 * sized amount of int3 instructions
+	 */
+	u8		image[];
+};
+
+static struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen,
+						  u8 **image_ptr)
+{
+	unsigned int sz, hole;
+	struct bpf_binary_header *header;
+
+	/* Most of BPF filters are really small,
+	 * but if some of them fill a page, allow at least
+	 * 128 extra bytes to insert a random section of int3
+	 */
+	sz = round_up(proglen + sizeof(*header) + 128, PAGE_SIZE);
+	header = module_alloc(sz);
+	if (!header)
+		return NULL;
+
+	memset(header, 0xcc, sz); /* fill whole space with int3 instructions */
+
+	header->pages = sz / PAGE_SIZE;
+	hole = sz - (proglen + sizeof(*header));
+
+	/* insert a random number of int3 instructions before BPF code */
+	*image_ptr = &header->image[prandom_u32() % hole];
+	return header;
+}
+
 void bpf_jit_compile(struct sk_filter *fp)
 {
 	u8 temp[64];
@@ -153,6 +187,7 @@ void bpf_jit_compile(struct sk_filter *fp)
 	int t_offset, f_offset;
 	u8 t_op, f_op, seen = 0, pass;
 	u8 *image = NULL;
+	struct bpf_binary_header *header = NULL;
 	u8 *func;
 	int pc_ret0 = -1; /* bpf index of first RET #0 instruction (if any) */
 	unsigned int cleanup_addr; /* epilogue code offset */
@@ -693,7 +728,7 @@ cond_branch:			f_offset = addrs[i + filter[i].jf] - addrs[i];
 				if (unlikely(proglen + ilen > oldproglen)) {
 					pr_err("bpb_jit_compile fatal error\n");
 					kfree(addrs);
-					module_free(NULL, image);
+					module_free(NULL, header);
 					return;
 				}
 				memcpy(image + proglen, temp, ilen);
@@ -717,10 +752,8 @@ cond_branch:			f_offset = addrs[i + filter[i].jf] - addrs[i];
 			break;
 		}
 		if (proglen == oldproglen) {
-			image = module_alloc(max_t(unsigned int,
-						   proglen,
-						   sizeof(struct work_struct)));
-			if (!image)
+			header = bpf_alloc_binary(proglen, &image);
+			if (!header)
 				goto out;
 		}
 		oldproglen = proglen;
@@ -730,7 +763,8 @@ cond_branch:			f_offset = addrs[i + filter[i].jf] - addrs[i];
 		bpf_jit_dump(flen, proglen, pass, image);
 
 	if (image) {
-		bpf_flush_icache(image, image + proglen);
+		bpf_flush_icache(header, image + proglen);
+		set_memory_ro((unsigned long)header, header->pages);
 		fp->bpf_func = (void *)image;
 	}
 out:
@@ -738,20 +772,13 @@ out:
 	return;
 }
 
-static void jit_free_defer(struct work_struct *arg)
-{
-	module_free(NULL, arg);
-}
-
-/* run from softirq, we must use a work_struct to call
- * module_free() from process context
- */
 void bpf_jit_free(struct sk_filter *fp)
 {
 	if (fp->bpf_func != sk_run_filter) {
-		struct work_struct *work = (struct work_struct *)fp->bpf_func;
+		unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
+		struct bpf_binary_header *header = (void *)addr;
 
-		INIT_WORK(work, jit_free_defer);
-		schedule_work(work);
+		set_memory_rw(addr, header->pages);
+		module_free(NULL, header);
 	}
 }
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 3e724256dbee..d641897a1f4e 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -324,14 +324,11 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
 	res->start = start;
 	res->end = end;
 	info->res_offset[info->res_num] = addr.translation_offset;
+	info->res_num++;
 
-	if (!pci_use_crs) {
+	if (!pci_use_crs)
 		dev_printk(KERN_DEBUG, &info->bridge->dev,
 			   "host bridge window %pR (ignored)\n", res);
-		return AE_OK;
-	}
-
-	info->res_num++;
 
 	return AE_OK;
 }
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index b410b71bdcf7..c8d5577044bb 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -274,8 +274,9 @@ static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
 	return status;
 }
 
-int efi_set_rtc_mmss(unsigned long nowtime)
+int efi_set_rtc_mmss(const struct timespec *now)
 {
+	unsigned long nowtime = now->tv_sec;
 	efi_status_t 	status;
 	efi_time_t 	eft;
 	efi_time_cap_t 	cap;
@@ -310,7 +311,7 @@ int efi_set_rtc_mmss(unsigned long nowtime)
 	return 0;
 }
 
-unsigned long efi_get_time(void)
+void efi_get_time(struct timespec *now)
 {
 	efi_status_t status;
 	efi_time_t eft;
@@ -320,8 +321,9 @@ unsigned long efi_get_time(void)
 	if (status != EFI_SUCCESS)
 		pr_err("Oops: efitime: can't read time!\n");
 
-	return mktime(eft.year, eft.month, eft.day, eft.hour,
-		      eft.minute, eft.second);
+	now->tv_sec = mktime(eft.year, eft.month, eft.day, eft.hour,
+			     eft.minute, eft.second);
+	now->tv_nsec = 0;
 }
 
 /*
diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c
index d62b0a3b5c14..5e355b134ba4 100644
--- a/arch/x86/platform/mrst/vrtc.c
+++ b/arch/x86/platform/mrst/vrtc.c
@@ -56,7 +56,7 @@ void vrtc_cmos_write(unsigned char val, unsigned char reg)
 }
 EXPORT_SYMBOL_GPL(vrtc_cmos_write);
 
-unsigned long vrtc_get_time(void)
+void vrtc_get_time(struct timespec *now)
 {
 	u8 sec, min, hour, mday, mon;
 	unsigned long flags;
@@ -82,17 +82,18 @@ unsigned long vrtc_get_time(void)
 	printk(KERN_INFO "vRTC: sec: %d min: %d hour: %d day: %d "
 		"mon: %d year: %d\n", sec, min, hour, mday, mon, year);
 
-	return mktime(year, mon, mday, hour, min, sec);
+	now->tv_sec = mktime(year, mon, mday, hour, min, sec);
+	now->tv_nsec = 0;
 }
 
-int vrtc_set_mmss(unsigned long nowtime)
+int vrtc_set_mmss(const struct timespec *now)
 {
 	unsigned long flags;
 	struct rtc_time tm;
 	int year;
 	int retval = 0;
 
-	rtc_time_to_tm(nowtime, &tm);
+	rtc_time_to_tm(now->tv_sec, &tm);
 	if (!rtc_valid_tm(&tm) && tm.tm_year >= 72) {
 		/*
 		 * tm.year is the number of years since 1900, and the
@@ -110,7 +111,7 @@ int vrtc_set_mmss(unsigned long nowtime)
 	} else {
 		printk(KERN_ERR
 		       "%s: Invalid vRTC value: write of %lx to vRTC failed\n",
-			__FUNCTION__, nowtime);
+			__FUNCTION__, now->tv_sec);
 		retval = -EINVAL;
 	}
 	return retval;
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index c74436e687bf..72074d528400 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -85,15 +85,18 @@ static notrace cycle_t vread_pvclock(int *mode)
 	cycle_t ret;
 	u64 last;
 	u32 version;
-	u32 migrate_count;
 	u8 flags;
 	unsigned cpu, cpu1;
 
 
 	/*
-	 * When looping to get a consistent (time-info, tsc) pair, we
-	 * also need to deal with the possibility we can switch vcpus,
-	 * so make sure we always re-fetch time-info for the current vcpu.
+	 * Note: hypervisor must guarantee that:
+	 * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
+	 * 2. that per-CPU pvclock time info is updated if the
+	 *    underlying CPU changes.
+	 * 3. that version is increased whenever underlying CPU
+	 *    changes.
+	 *
 	 */
 	do {
 		cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -104,8 +107,6 @@ static notrace cycle_t vread_pvclock(int *mode)
 
 		pvti = get_pvti(cpu);
 
-		migrate_count = pvti->migrate_count;
-
 		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
 
 		/*
@@ -117,8 +118,7 @@ static notrace cycle_t vread_pvclock(int *mode)
 		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
 	} while (unlikely(cpu != cpu1 ||
 			  (pvti->pvti.version & 1) ||
-			  pvti->pvti.version != version ||
-			  pvti->migrate_count != migrate_count));
+			  pvti->pvti.version != version));
 
 	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
 		*mode = VCLOCK_NONE;
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index a690868be837..ee365895b06b 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -15,6 +15,7 @@
 #include <linux/math64.h>
 #include <linux/gfp.h>
 #include <linux/slab.h>
+#include <linux/pvclock_gtod.h>
 
 #include <asm/pvclock.h>
 #include <asm/xen/hypervisor.h>
@@ -179,34 +180,56 @@ static void xen_read_wallclock(struct timespec *ts)
 	put_cpu_var(xen_vcpu);
 }
 
-static unsigned long xen_get_wallclock(void)
+static void xen_get_wallclock(struct timespec *now)
 {
-	struct timespec ts;
+	xen_read_wallclock(now);
+}
 
-	xen_read_wallclock(&ts);
-	return ts.tv_sec;
+static int xen_set_wallclock(const struct timespec *now)
+{
+	return -1;
 }
 
-static int xen_set_wallclock(unsigned long now)
+static int xen_pvclock_gtod_notify(struct notifier_block *nb,
+				   unsigned long was_set, void *priv)
 {
+	/* Protected by the calling core code serialization */
+	static struct timespec next_sync;
+
 	struct xen_platform_op op;
-	int rc;
+	struct timespec now;
 
-	/* do nothing for domU */
-	if (!xen_initial_domain())
-		return -1;
+	now = __current_kernel_time();
+
+	/*
+	 * We only take the expensive HV call when the clock was set
+	 * or when the 11 minutes RTC synchronization time elapsed.
+	 */
+	if (!was_set && timespec_compare(&now, &next_sync) < 0)
+		return NOTIFY_OK;
 
 	op.cmd = XENPF_settime;
-	op.u.settime.secs = now;
-	op.u.settime.nsecs = 0;
+	op.u.settime.secs = now.tv_sec;
+	op.u.settime.nsecs = now.tv_nsec;
 	op.u.settime.system_time = xen_clocksource_read();
 
-	rc = HYPERVISOR_dom0_op(&op);
-	WARN(rc != 0, "XENPF_settime failed: now=%ld\n", now);
+	(void)HYPERVISOR_dom0_op(&op);
 
-	return rc;
+	/*
+	 * Move the next drift compensation time 11 minutes
+	 * ahead. That's emulating the sync_cmos_clock() update for
+	 * the hardware RTC.
+	 */
+	next_sync = now;
+	next_sync.tv_sec += 11 * 60;
+
+	return NOTIFY_OK;
 }
 
+static struct notifier_block xen_pvclock_gtod_notifier = {
+	.notifier_call = xen_pvclock_gtod_notify,
+};
+
 static struct clocksource xen_clocksource __read_mostly = {
 	.name = "xen",
 	.rating = 400,
@@ -482,6 +505,9 @@ static void __init xen_time_init(void)
 	xen_setup_runstate_info(cpu);
 	xen_setup_timer(cpu);
 	xen_setup_cpu_clockevents();
+
+	if (xen_initial_domain())
+		pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
 }
 
 void __init xen_init_time_ops(void)
@@ -494,7 +520,9 @@ void __init xen_init_time_ops(void)
 
 	x86_platform.calibrate_tsc = xen_tsc_khz;
 	x86_platform.get_wallclock = xen_get_wallclock;
-	x86_platform.set_wallclock = xen_set_wallclock;
+	/* Dom0 uses the native method to set the hardware RTC. */
+	if (!xen_initial_domain())
+		x86_platform.set_wallclock = xen_set_wallclock;
 }
 
 #ifdef CONFIG_XEN_PVHVM