110 files changed, 6169 insertions, 585 deletions
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index bbbe02197afb..c33e3ad2c8fd 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -82,11 +82,6 @@ config GENERIC_HWEIGHT
 	bool
 	default y
 
-config GENERIC_GPIO
-	bool
-	help
-	  Generic GPIO API support
-
 config PPC
 	bool
 	default y
diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 5416e28a7538..863d877e0b5f 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -262,8 +262,31 @@ config PPC_EARLY_DEBUG_OPAL_HVSI
 	  Select this to enable early debugging for the PowerNV platform
 	  using an "hvsi" console
 
+config PPC_EARLY_DEBUG_MEMCONS
+	bool "In memory console"
+	help
+	  Select this to enable early debugging using an in memory console.
+	  This console provides input and output buffers stored within the
+	  kernel BSS and should be safe to select on any system. A debugger
+	  can then be used to read kernel output or send input to the console.
 endchoice
 
+config PPC_MEMCONS_OUTPUT_SIZE
+	int "In memory console output buffer size"
+	depends on PPC_EARLY_DEBUG_MEMCONS
+	default 4096
+	help
+	  Selects the size of the output buffer (in bytes) of the in memory
+	  console.
+
+config PPC_MEMCONS_INPUT_SIZE
+	int "In memory console input buffer size"
+	depends on PPC_EARLY_DEBUG_MEMCONS
+	default 128
+	help
+	  Selects the size of the input buffer (in bytes) of the in memory
+	  console.
+
 config PPC_EARLY_DEBUG_OPAL
 	def_bool y
 	depends on PPC_EARLY_DEBUG_OPAL_RAW || PPC_EARLY_DEBUG_OPAL_HVSI
diff --git a/arch/powerpc/boot/dts/mpc5121.dtsi b/arch/powerpc/boot/dts/mpc5121.dtsi
index 2e82d0e71dd3..bd14c00e5146 100644
--- a/arch/powerpc/boot/dts/mpc5121.dtsi
+++ b/arch/powerpc/boot/dts/mpc5121.dtsi
@@ -152,6 +152,8 @@
 			compatible = "fsl,mpc5121-sdhc";
 			reg = <0x1500 0x100>;
 			interrupts = <8 0x8>;
+			dmas = <&dma0 30>;
+			dma-names = "rx-tx";
 		};
 
 		i2c@1700 {
diff --git a/arch/powerpc/include/asm/context_tracking.h b/arch/powerpc/include/asm/context_tracking.h
new file mode 100644
index 000000000000..b6f5a33b8ee2
--- /dev/null
+++ b/arch/powerpc/include/asm/context_tracking.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_POWERPC_CONTEXT_TRACKING_H
+#define _ASM_POWERPC_CONTEXT_TRACKING_H
+
+#ifdef CONFIG_CONTEXT_TRACKING
+#define SCHEDULE_USER bl	.schedule_user
+#else
+#define SCHEDULE_USER bl	.schedule
+#endif
+
+#endif
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index fcc54ad159ba..26807e5aff51 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -224,8 +224,10 @@ extern const char *powerpc_base_platform;
 /* We only set the TM feature if the kernel was compiled with TM supprt */
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 #define CPU_FTR_TM_COMP		CPU_FTR_TM
+#define PPC_FEATURE2_HTM_COMP	PPC_FEATURE2_HTM
 #else
 #define CPU_FTR_TM_COMP		0
+#define PPC_FEATURE2_HTM_COMP	0
 #endif
 
 /* We need to mark all pages as being coherent if we're SMP or we have a
diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h
index 0df54646f968..681bc0314b6b 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -52,6 +52,7 @@
 #define FW_FEATURE_BEST_ENERGY	ASM_CONST(0x0000000080000000)
 #define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0000000100000000)
 #define FW_FEATURE_PRRN		ASM_CONST(0x0000000200000000)
+#define FW_FEATURE_OPALv3	ASM_CONST(0x0000000400000000)
 
 #ifndef __ASSEMBLY__
 
@@ -69,7 +70,8 @@ enum {
 		FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY |
 		FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN,
 	FW_FEATURE_PSERIES_ALWAYS = 0,
-	FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_OPALv2,
+	FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_OPALv2 |
+		FW_FEATURE_OPALv3,
 	FW_FEATURE_POWERNV_ALWAYS = 0,
 	FW_FEATURE_PS3_POSSIBLE = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1,
 	FW_FEATURE_PS3_ALWAYS = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1,
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 4bc2c3dad6ad..cf4df8e2139a 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -270,6 +270,9 @@
 #define H_SET_MODE		0x31C
 #define MAX_HCALL_OPCODE	H_SET_MODE
 
+/* Platform specific hcalls, used by KVM */
+#define H_RTAS			0xf000
+
 #ifndef __ASSEMBLY__
 
 /**
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index e45c4947a772..ba713f166fa5 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -95,15 +95,14 @@ static inline bool arch_irqs_disabled(void)
 #define __hard_irq_disable()	__mtmsrd(local_paca->kernel_msr, 1)
 #endif
 
-static inline void hard_irq_disable(void)
-{
-	__hard_irq_disable();
-	get_paca()->soft_enabled = 0;
-	get_paca()->irq_happened |= PACA_IRQ_HARD_DIS;
-}
-
-/* include/linux/interrupt.h needs hard_irq_disable to be a macro */
-#define hard_irq_disable	hard_irq_disable
+#define hard_irq_disable()	do {			\
+	u8 _was_enabled = get_paca()->soft_enabled;	\
+	__hard_irq_disable();				\
+	get_paca()->soft_enabled = 0;			\
+	get_paca()->irq_happened |= PACA_IRQ_HARD_DIS;	\
+	if (_was_enabled)				\
+		trace_hardirqs_off();			\
+} while(0)
 
 static inline bool lazy_irq_pending(void)
 {
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 5a56e1c5f851..349ed85c7d61 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -142,6 +142,8 @@ extern int kvmppc_mmu_hv_init(void);
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
+extern void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
+					  unsigned int vec);
 extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
 extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
 			   bool upper, u32 val);
@@ -156,7 +158,8 @@ void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep,
 			unsigned long pte_index);
 extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr,
 			unsigned long *nb_ret);
-extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr);
+extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr,
+			unsigned long gpa, bool dirty);
 extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 			long pte_index, unsigned long pteh, unsigned long ptel);
 extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
@@ -458,6 +461,8 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
 #define OSI_SC_MAGIC_R4			0x77810F9B
 
 #define INS_DCBZ			0x7c0007ec
+/* TO = 31 for unconditional trap */
+#define INS_TW				0x7fe00008
 
 /* LPIDs we support with this build -- runtime limit may be lower */
 #define KVMPPC_NR_LPIDS			(LPID_RSVD + 1)
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 38bec1dc9928..9c1ff330c805 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -268,4 +268,17 @@ static inline int is_vrma_hpte(unsigned long hpte_v)
 		(HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)));
 }
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+/*
+ * Note modification of an HPTE; set the HPTE modified bit
+ * if anyone is interested.
+ */
+static inline void note_hpte_modification(struct kvm *kvm,
+					  struct revmap_entry *rev)
+{
+	if (atomic_read(&kvm->arch.hpte_mod_interest))
+		rev->guest_rpte |= HPTE_GR_MODIFIED;
+}
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index cdc3d2717cc6..9039d3c97eec 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -20,6 +20,11 @@
 #ifndef __ASM_KVM_BOOK3S_ASM_H__
 #define __ASM_KVM_BOOK3S_ASM_H__
 
+/* XICS ICP register offsets */
+#define XICS_XIRR		4
+#define XICS_MFRR		0xc
+#define XICS_IPI		2	/* interrupt source # for IPIs */
+
 #ifdef __ASSEMBLY__
 
 #ifdef CONFIG_KVM_BOOK3S_HANDLER
@@ -81,10 +86,11 @@ struct kvmppc_host_state {
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	u8 hwthread_req;
 	u8 hwthread_state;
-
+	u8 host_ipi;
 	struct kvm_vcpu *kvm_vcpu;
 	struct kvmppc_vcore *kvm_vcore;
 	unsigned long xics_phys;
+	u32 saved_xirr;
 	u64 dabr;
 	u64 host_mmcr[3];
 	u32 host_pmc[8];
diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h
index b7cd3356a532..d3c1eb34c986 100644
--- a/arch/powerpc/include/asm/kvm_booke.h
+++ b/arch/powerpc/include/asm/kvm_booke.h
@@ -26,6 +26,8 @@
 /* LPIDs we support with this build -- runtime limit may be lower */
 #define KVMPPC_NR_LPIDS                        64
 
+#define KVMPPC_INST_EHPRIV	0x7c00021c
+
 static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
 {
 	vcpu->arch.gpr[num] = val;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index d1bb86074721..af326cde7cb6 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -44,6 +44,10 @@
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #endif
 
+/* These values are internal and can be increased later */
+#define KVM_NR_IRQCHIPS          1
+#define KVM_IRQCHIP_NUM_PINS     256
+
 #if !defined(CONFIG_KVM_440)
 #include <linux/mmu_notifier.h>
 
@@ -188,6 +192,10 @@ struct kvmppc_linear_info {
 	int		 type;
 };
 
+/* XICS components, defined in book3s_xics.c */
+struct kvmppc_xics;
+struct kvmppc_icp;
+
 /*
  * The reverse mapping array has one entry for each HPTE,
  * which stores the guest's view of the second word of the HPTE
@@ -255,6 +263,13 @@ struct kvm_arch {
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 #ifdef CONFIG_PPC_BOOK3S_64
 	struct list_head spapr_tce_tables;
+	struct list_head rtas_tokens;
+#endif
+#ifdef CONFIG_KVM_MPIC
+	struct openpic *mpic;
+#endif
+#ifdef CONFIG_KVM_XICS
+	struct kvmppc_xics *xics;
 #endif
 };
 
@@ -301,11 +316,13 @@ struct kvmppc_vcore {
  * that a guest can register.
  */
 struct kvmppc_vpa {
+	unsigned long gpa;	/* Current guest phys addr */
 	void *pinned_addr;	/* Address in kernel linear mapping */
 	void *pinned_end;	/* End of region */
 	unsigned long next_gpa;	/* Guest phys addr for update */
 	unsigned long len;	/* Number of bytes required */
 	u8 update_pending;	/* 1 => update pinned_addr from next_gpa */
+	bool dirty;		/* true => area has been modified by kernel */
 };
 
 struct kvmppc_pte {
@@ -359,6 +376,11 @@ struct kvmppc_slb {
 #define KVMPPC_BOOKE_MAX_IAC	4
 #define KVMPPC_BOOKE_MAX_DAC	2
 
+/* KVMPPC_EPR_USER takes precedence over KVMPPC_EPR_KERNEL */
+#define KVMPPC_EPR_NONE		0 /* EPR not supported */
+#define KVMPPC_EPR_USER		1 /* exit to userspace to fill EPR */
+#define KVMPPC_EPR_KERNEL	2 /* in-kernel irqchip */
+
 struct kvmppc_booke_debug_reg {
 	u32 dbcr0;
 	u32 dbcr1;
@@ -370,6 +392,12 @@ struct kvmppc_booke_debug_reg {
 	u64 dac[KVMPPC_BOOKE_MAX_DAC];
 };
 
+#define KVMPPC_IRQ_DEFAULT	0
+#define KVMPPC_IRQ_MPIC		1
+#define KVMPPC_IRQ_XICS		2
+
+struct openpic;
+
 struct kvm_vcpu_arch {
 	ulong host_stack;
 	u32 host_pid;
@@ -502,8 +530,11 @@ struct kvm_vcpu_arch {
 	spinlock_t wdt_lock;
 	struct timer_list wdt_timer;
 	u32 tlbcfg[4];
+	u32 tlbps[4];
 	u32 mmucfg;
+	u32 eptcfg;
 	u32 epr;
+	u32 crit_save;
 	struct kvmppc_booke_debug_reg dbg_reg;
 #endif
 	gpa_t paddr_accessed;
@@ -521,7 +552,7 @@ struct kvm_vcpu_arch {
 	u8 sane;
 	u8 cpu_type;
 	u8 hcall_needed;
-	u8 epr_enabled;
+	u8 epr_flags; /* KVMPPC_EPR_xxx */
 	u8 epr_needed;
 
 	u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
@@ -548,6 +579,13 @@ struct kvm_vcpu_arch {
 	unsigned long magic_page_pa; /* phys addr to map the magic page to */
 	unsigned long magic_page_ea; /* effect. addr to map the magic page to */
 
+	int irq_type;		/* one of KVM_IRQ_* */
+	int irq_cpu_id;
+	struct openpic *mpic;	/* KVM_IRQ_MPIC */
+#ifdef CONFIG_KVM_XICS
+	struct kvmppc_icp *icp; /* XICS presentation controller */
+#endif
+
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	struct kvm_vcpu_arch_shared shregs;
 
@@ -588,5 +626,6 @@ struct kvm_vcpu_arch {
 #define KVM_MMIO_REG_FQPR	0x0060
 
 #define __KVM_HAVE_ARCH_WQP
+#define __KVM_HAVE_CREATE_DEVICE
 
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 44a657adf416..a5287fe03d77 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -44,7 +44,7 @@ enum emulation_result {
 	EMULATE_DO_DCR,       /* kvm_run filled with DCR request */
 	EMULATE_FAIL,         /* can't emulate this instruction */
 	EMULATE_AGAIN,        /* something went wrong. go again */
-	EMULATE_DO_PAPR,      /* kvm_run filled with PAPR request */
+	EMULATE_EXIT_USER,    /* emulation requires exit to user-space */
 };
 
 extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
@@ -104,8 +104,7 @@ extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                        struct kvm_interrupt *irq);
-extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
-                                         struct kvm_interrupt *irq);
+extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu);
 
 extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
@@ -131,6 +130,7 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm,
 extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
 			struct kvm_memory_slot *memslot, unsigned long porder);
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
+
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 				struct kvm_create_spapr_tce *args);
 extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
@@ -152,7 +152,7 @@ extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem);
 extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem,
-				struct kvm_memory_slot old);
+				const struct kvm_memory_slot *old);
 extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm,
 				      struct kvm_ppc_smmu_info *info);
 extern void kvmppc_core_flush_memslot(struct kvm *kvm,
@@ -165,6 +165,18 @@ extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu);
 
 extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *);
 
+int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq);
+
+extern int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp);
+extern int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu);
+extern void kvmppc_rtas_tokens_free(struct kvm *kvm);
+extern int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server,
+				u32 priority);
+extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server,
+				u32 *priority);
+extern int kvmppc_xics_int_on(struct kvm *kvm, u32 irq);
+extern int kvmppc_xics_int_off(struct kvm *kvm, u32 irq);
+
 /*
  * Cuts out inst bits with ordering according to spec.
  * That means the leftmost bit is zero. All given bits are included.
@@ -246,12 +258,29 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *);
 
 void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
 
+struct openpic;
+
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {
 	paca[cpu].kvm_hstate.xics_phys = addr;
 }
 
+static inline u32 kvmppc_get_xics_latch(void)
+{
+	u32 xirr = get_paca()->kvm_hstate.saved_xirr;
+
+	get_paca()->kvm_hstate.saved_xirr = 0;
+
+	return xirr;
+}
+
+static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
+{
+	paca[cpu].kvm_hstate.host_ipi = host_ipi;
+}
+
+extern void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu);
 extern void kvm_linear_init(void);
 
 #else
@@ -260,6 +289,46 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 
 static inline void kvm_linear_init(void)
 {}
+
+static inline u32 kvmppc_get_xics_latch(void)
+{
+	return 0;
+}
+
+static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
+{}
+
+static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+	kvm_vcpu_kick(vcpu);
+}
+#endif
+
+#ifdef CONFIG_KVM_XICS
+static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.irq_type == KVMPPC_IRQ_XICS;
+}
+extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
+extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
+extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
+extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
+extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
+extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
+extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
+			struct kvm_vcpu *vcpu, u32 cpu);
+#else
+static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
+	{ return 0; }
+static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
+static inline int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu,
+					 unsigned long server)
+	{ return -EINVAL; }
+static inline int kvm_vm_ioctl_xics_irq(struct kvm *kvm,
+					struct kvm_irq_level *args)
+	{ return -ENOTTY; }
+static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
+	{ return 0; }
 #endif
 
 static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
@@ -271,6 +340,32 @@ static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
 #endif
 }
 
+#ifdef CONFIG_KVM_MPIC
+
+void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu);
+int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
+			     u32 cpu);
+void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu);
+
+#else
+
+static inline void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline int kvmppc_mpic_connect_vcpu(struct kvm_device *dev,
+		struct kvm_vcpu *vcpu, u32 cpu)
+{
+	return -EINVAL;
+}
+
+static inline void kvmppc_mpic_disconnect_vcpu(struct openpic *opp,
+		struct kvm_vcpu *vcpu)
+{
+}
+
+#endif /* CONFIG_KVM_MPIC */
+
 int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 			      struct kvm_config_tlb *cfg);
 int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
@@ -283,8 +378,15 @@ void kvmppc_init_lpid(unsigned long nr_lpids);
 
 static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
 {
-	/* Clear i-cache for new pages */
 	struct page *page;
+	/*
+	 * We can only access pages that the kernel maps
+	 * as memory. Bail out for unmapped ones.
+	 */
+	if (!pfn_valid(pfn))
+		return;
+
+	/* Clear i-cache for new pages */
 	page = pfn_to_page(pfn);
 	if (!test_bit(PG_arch_1, &page->flags)) {
 		flush_dcache_icache_page(page);
@@ -324,4 +426,6 @@ static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu *vcpu, int ra, int rb)
 	return ea;
 }
 
+extern void xics_wake_cpu(int cpu);
+
 #endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 3f3f691be2e7..92386fc4e82a 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -29,6 +29,7 @@ struct rtc_time;
 struct file;
 struct pci_controller;
 struct kimage;
+struct pci_host_bridge;
 
 struct machdep_calls {
 	char		*name;
@@ -108,6 +109,8 @@ struct machdep_calls {
 	void		(*pcibios_fixup)(void);
 	int		(*pci_probe_mode)(struct pci_bus *);
 	void		(*pci_irq_fixup)(struct pci_dev *dev);
+	int		(*pcibios_root_bridge_prepare)(struct pci_host_bridge
+				*bridge);
 
 	/* To setup PHBs when using automatic OF platform driver for PCI */
 	int		(*pci_setup_phb)(struct pci_controller *host);
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index b6c8b58b1d76..cbb9305ab15a 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -243,7 +243,8 @@ enum OpalMCE_TlbErrorType {
 
 enum OpalThreadStatus {
 	OPAL_THREAD_INACTIVE = 0x0,
-	OPAL_THREAD_STARTED = 0x1
+	OPAL_THREAD_STARTED = 0x1,
+	OPAL_THREAD_UNAVAILABLE = 0x2 /* opal-v3 */
 };
 
 enum OpalPciBusCompare {
@@ -563,6 +564,8 @@ extern void opal_nvram_init(void);
 
 extern int opal_machine_check(struct pt_regs *regs);
 
+extern void opal_shutdown(void);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __OPAL_H */
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index ffbc5fd549ac..8b11b5bd9938 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -39,11 +39,6 @@ struct pci_controller {
 	resource_size_t io_base_phys;
 	resource_size_t pci_io_size;
 
-	/* Some machines (PReP) have a non 1:1 mapping of
-	 * the PCI memory space in the CPU bus space
-	 */
-	resource_size_t pci_mem_offset;
-
 	/* Some machines have a special region to forward the ISA
 	 * "memory" cycles such as VGA memory regions. Left to 0
 	 * if unsupported
@@ -86,6 +81,7 @@ struct pci_controller {
 	 */
 	struct resource	io_resource;
 	struct resource mem_resources[3];
+	resource_size_t mem_offset[3];
 	int global_number;		/* PCI domain number */
 
 	resource_size_t dma_window_base_cur;
@@ -163,6 +159,8 @@ struct pci_dn {
 
 	int	pci_ext_config_space;	/* for pci devices */
 
+	int	force_32bit_msi:1;
+
 	struct	pci_dev *pcidev;	/* back-pointer to the pci device */
 #ifdef CONFIG_EEH
 	struct eeh_dev *edev;		/* eeh device */
diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h
index 91acb12bac92..b66ae722a8e9 100644
--- a/arch/powerpc/include/asm/pgalloc-64.h
+++ b/arch/powerpc/include/asm/pgalloc-64.h
@@ -186,7 +186,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 
 static inline pgtable_t pmd_pgtable(pmd_t pmd)
 {
-	return (pgtable_t)(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE);
+	return (pgtable_t)(pmd_val(pmd) & ~PMD_MASKED_BITS);
 }
 
 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index 0c34e4803499..eccfc161e58e 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -115,6 +115,10 @@
 #define PPC_INST_MFSPR_DSCR_MASK	0xfc1fffff
 #define PPC_INST_MTSPR_DSCR		0x7c1103a6
 #define PPC_INST_MTSPR_DSCR_MASK	0xfc1fffff
+#define PPC_INST_MFSPR_DSCR_USER	0x7c0302a6
+#define PPC_INST_MFSPR_DSCR_USER_MASK	0xfc1fffff
+#define PPC_INST_MTSPR_DSCR_USER	0x7c0303a6
+#define PPC_INST_MTSPR_DSCR_USER_MASK	0xfc1fffff
 #define PPC_INST_SLBFEE			0x7c0007a7
 
 #define PPC_INST_STRING			0x7c00042a
diff --git a/arch/powerpc/include/asm/pte-hash64-64k.h b/arch/powerpc/include/asm/pte-hash64-64k.h
index 3e13e23e4fdf..d836d945068d 100644
--- a/arch/powerpc/include/asm/pte-hash64-64k.h
+++ b/arch/powerpc/include/asm/pte-hash64-64k.h
@@ -47,7 +47,7 @@
  * generic accessors and iterators here
  */
 #define __real_pte(e,p) 	((real_pte_t) { \
-			(e), ((e) & _PAGE_COMBO) ? \
+			(e), (pte_val(e) & _PAGE_COMBO) ? \
 				(pte_val(*((p) + PTRS_PER_PTE))) : 0 })
 #define __rpte_to_hidx(r,index)	((pte_val((r).pte) & _PAGE_COMBO) ? \
         (((r).hidx >> ((index)<<2)) & 0xf) : ((pte_val((r).pte) >> 12) & 0xf))
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 3d17427e4fd7..a6136515c7f2 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -300,6 +300,7 @@
 #define     LPCR_PECE1	0x00002000	/* decrementer can cause exit */
 #define     LPCR_PECE2	0x00001000	/* machine check etc can cause exit */
 #define   LPCR_MER	0x00000800	/* Mediated External Exception */
+#define   LPCR_MER_SH	11
 #define   LPCR_LPES    0x0000000c
 #define   LPCR_LPES0   0x00000008      /* LPAR Env selector 0 */
 #define   LPCR_LPES1   0x00000004      /* LPAR Env selector 1 */
diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index a8bc2bb4adc9..34fd70488d83 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -264,6 +264,8 @@ extern void rtas_progress(char *s, unsigned short hex);
 extern void rtas_initialize(void);
 extern int rtas_suspend_cpu(struct rtas_suspend_me_data *data);
 extern int rtas_suspend_last_cpu(struct rtas_suspend_me_data *data);
+extern int rtas_online_cpus_mask(cpumask_var_t cpus);
+extern int rtas_offline_cpus_mask(cpumask_var_t cpus);
 extern int rtas_ibm_suspend_me(struct rtas_args *);
 
 struct rtc_time;
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index 8ceea14d6fe4..ba7b1973866e 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -97,7 +97,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_PERFMON_CTXSW	6	/* perfmon needs ctxsw calls */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SINGLESTEP		8	/* singlestepping active */
-#define TIF_MEMDIE		9	/* is terminating due to OOM killer */
+#define TIF_NOHZ		9	/* in adaptive nohz mode */
 #define TIF_SECCOMP		10	/* secure computing */
 #define TIF_RESTOREALL		11	/* Restore all regs (implies NOERROR) */
 #define TIF_NOERROR		12	/* Force successful syscall return */
@@ -106,6 +106,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_SYSCALL_TRACEPOINT	15	/* syscall tracepoint instrumentation */
 #define TIF_EMULATE_STACK_STORE	16	/* Is an instruction emulation
 						for stack store? */
+#define TIF_MEMDIE		17	/* is terminating due to OOM killer */
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
@@ -124,8 +125,10 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_UPROBE		(1<<TIF_UPROBE)
 #define _TIF_SYSCALL_TRACEPOINT	(1<<TIF_SYSCALL_TRACEPOINT)
 #define _TIF_EMULATE_STACK_STORE	(1<<TIF_EMULATE_STACK_STORE)
+#define _TIF_NOHZ		(1<<TIF_NOHZ)
 #define _TIF_SYSCALL_T_OR_A	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
-				 _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT)
+				 _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
+				 _TIF_NOHZ)
 
 #define _TIF_USER_WORK_MASK	(_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
 				 _TIF_NOTIFY_RESUME | _TIF_UPROBE)
diff --git a/arch/powerpc/include/asm/udbg.h b/arch/powerpc/include/asm/udbg.h
index 5a7510e9d09d..dc590919f8eb 100644
--- a/arch/powerpc/include/asm/udbg.h
+++ b/arch/powerpc/include/asm/udbg.h
@@ -52,6 +52,7 @@ extern void __init udbg_init_40x_realmode(void);
 extern void __init udbg_init_cpm(void);
 extern void __init udbg_init_usbgecko(void);
 extern void __init udbg_init_wsp(void);
+extern void __init udbg_init_memcons(void);
 extern void __init udbg_init_ehv_bc(void);
 extern void __init udbg_init_ps3gelic(void);
 extern void __init udbg_init_debug_opal_raw(void);
diff --git a/arch/powerpc/include/uapi/asm/cputable.h b/arch/powerpc/include/uapi/asm/cputable.h
index ed9dd8156962..5b7657959faa 100644
--- a/arch/powerpc/include/uapi/asm/cputable.h
+++ b/arch/powerpc/include/uapi/asm/cputable.h
@@ -1,6 +1,7 @@
 #ifndef _UAPI__ASM_POWERPC_CPUTABLE_H
 #define _UAPI__ASM_POWERPC_CPUTABLE_H
 
+/* in AT_HWCAP */
 #define PPC_FEATURE_32			0x80000000
 #define PPC_FEATURE_64			0x40000000
 #define PPC_FEATURE_601_INSTR		0x20000000
@@ -33,4 +34,12 @@
 #define PPC_FEATURE_TRUE_LE		0x00000002
 #define PPC_FEATURE_PPC_LE		0x00000001
 
+/* in AT_HWCAP2 */
+#define PPC_FEATURE2_ARCH_2_07		0x80000000
+#define PPC_FEATURE2_HTM		0x40000000
+#define PPC_FEATURE2_DSCR		0x20000000
+#define PPC_FEATURE2_EBB		0x10000000
+#define PPC_FEATURE2_ISEL		0x08000000
+#define PPC_FEATURE2_TAR		0x04000000
+
 #endif /* _UAPI__ASM_POWERPC_CPUTABLE_H */
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 16064d00adb9..0fb1a6e9ff90 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -25,6 +25,8 @@
 /* Select powerpc specific features in <linux/kvm.h> */
 #define __KVM_HAVE_SPAPR_TCE
 #define __KVM_HAVE_PPC_SMT
+#define __KVM_HAVE_IRQCHIP
+#define __KVM_HAVE_IRQ_LINE
 
 struct kvm_regs {
 	__u64 pc;
@@ -272,8 +274,31 @@ struct kvm_debug_exit_arch {
 
 /* for KVM_SET_GUEST_DEBUG */
 struct kvm_guest_debug_arch {
+	struct {
+		/* H/W breakpoint/watchpoint address */
+		__u64 addr;
+		/*
+		 * Type denotes h/w breakpoint, read watchpoint, write
+		 * watchpoint or watchpoint (both read and write).
+		 */
+#define KVMPPC_DEBUG_NONE		0x0
+#define KVMPPC_DEBUG_BREAKPOINT		(1UL << 1)
+#define KVMPPC_DEBUG_WATCH_WRITE	(1UL << 2)
+#define KVMPPC_DEBUG_WATCH_READ		(1UL << 3)
+		__u32 type;
+		__u32 reserved;
+	} bp[16];
 };
 
+/* Debug related defines */
+/*
+ * kvm_guest_debug->control is a 32 bit field. The lower 16 bits are generic
+ * and upper 16 bits are architecture specific. Architecture specific defines
+ * that ioctl is for setting hardware breakpoint or software breakpoint.
+ */
+#define KVM_GUESTDBG_USE_SW_BP		0x00010000
+#define KVM_GUESTDBG_USE_HW_BP		0x00020000
+
 /* definition of registers in kvm_run */
 struct kvm_sync_regs {
 };
@@ -299,6 +324,12 @@ struct kvm_allocate_rma {
 	__u64 rma_size;
 };
 
+/* for KVM_CAP_PPC_RTAS */
+struct kvm_rtas_token_args {
+	char name[120];
+	__u64 token;	/* Use a token of 0 to undefine a mapping */
+};
+
 struct kvm_book3e_206_tlb_entry {
 	__u32 mas8;
 	__u32 mas1;
@@ -359,6 +390,26 @@ struct kvm_get_htab_header {
 	__u16	n_invalid;
 };
 
+/* Per-vcpu XICS interrupt controller state */
+#define KVM_REG_PPC_ICP_STATE	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c)
+
+#define  KVM_REG_PPC_ICP_CPPR_SHIFT	56	/* current proc priority */
+#define  KVM_REG_PPC_ICP_CPPR_MASK	0xff
+#define  KVM_REG_PPC_ICP_XISR_SHIFT	32	/* interrupt status field */
+#define  KVM_REG_PPC_ICP_XISR_MASK	0xffffff
+#define  KVM_REG_PPC_ICP_MFRR_SHIFT	24	/* pending IPI priority */
+#define  KVM_REG_PPC_ICP_MFRR_MASK	0xff
+#define  KVM_REG_PPC_ICP_PPRI_SHIFT	16	/* pending irq priority */
+#define  KVM_REG_PPC_ICP_PPRI_MASK	0xff
+
+/* Device control API: PPC-specific devices */
+#define KVM_DEV_MPIC_GRP_MISC		1
+#define   KVM_DEV_MPIC_BASE_ADDR	0	/* 64-bit */
+
+#define KVM_DEV_MPIC_GRP_REGISTER	2	/* 32-bit */
+#define KVM_DEV_MPIC_GRP_IRQ_ACTIVE	3	/* 32-bit */
+
+/* One-Reg API: PPC-specific registers */
 #define KVM_REG_PPC_HIOR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1)
 #define KVM_REG_PPC_IAC1	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x2)
 #define KVM_REG_PPC_IAC2	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3)
@@ -417,4 +468,47 @@ struct kvm_get_htab_header {
 #define KVM_REG_PPC_EPCR	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85)
 #define KVM_REG_PPC_EPR		(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x86)
 
+/* Timer Status Register OR/CLEAR interface */
+#define KVM_REG_PPC_OR_TSR	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x87)
+#define KVM_REG_PPC_CLEAR_TSR	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x88)
+#define KVM_REG_PPC_TCR		(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x89)
+#define KVM_REG_PPC_TSR		(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8a)
+
+/* Debugging: Special instruction for software breakpoint */
+#define KVM_REG_PPC_DEBUG_INST	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8b)
+
+/* MMU registers */
+#define KVM_REG_PPC_MAS0	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8c)
+#define KVM_REG_PPC_MAS1	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8d)
+#define KVM_REG_PPC_MAS2	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8e)
+#define KVM_REG_PPC_MAS7_3	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8f)
+#define KVM_REG_PPC_MAS4	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x90)
+#define KVM_REG_PPC_MAS6	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x91)
+#define KVM_REG_PPC_MMUCFG	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x92)
+/*
+ * TLBnCFG fields TLBnCFG_N_ENTRY and TLBnCFG_ASSOC can be changed only using
+ * KVM_CAP_SW_TLB ioctl
+ */
+#define KVM_REG_PPC_TLB0CFG	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x93)
+#define KVM_REG_PPC_TLB1CFG	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x94)
+#define KVM_REG_PPC_TLB2CFG	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x95)
+#define KVM_REG_PPC_TLB3CFG	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x96)
+#define KVM_REG_PPC_TLB0PS	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x97)
+#define KVM_REG_PPC_TLB1PS	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x98)
+#define KVM_REG_PPC_TLB2PS	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x99)
+#define KVM_REG_PPC_TLB3PS	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9a)
+#define KVM_REG_PPC_EPTCFG	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9b)
+
+/* PPC64 eXternal Interrupt Controller Specification */
+#define KVM_DEV_XICS_GRP_SOURCES	1	/* 64-bit source attributes */
+
+/* Layout of 64-bit source attribute values */
+#define  KVM_XICS_DESTINATION_SHIFT	0
+#define  KVM_XICS_DESTINATION_MASK	0xffffffffULL
+#define  KVM_XICS_PRIORITY_SHIFT	32
+#define  KVM_XICS_PRIORITY_MASK		0xff
+#define  KVM_XICS_LEVEL_SENSITIVE	(1ULL << 40)
+#define  KVM_XICS_MASKED		(1ULL << 41)
+#define  KVM_XICS_PENDING		(1ULL << 42)
+
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 172233eab799..b51a97cfedf8 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -480,6 +480,7 @@ int main(void)
 	DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
 	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
 	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
+	DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty));
 #endif
 #ifdef CONFIG_PPC_BOOK3S
 	DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
@@ -576,6 +577,8 @@ int main(void)
 	HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
 	HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore);
 	HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
+	HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
+	HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
 	HSTATE_FIELD(HSTATE_MMCR, host_mmcr);
 	HSTATE_FIELD(HSTATE_PMC, host_pmc);
 	HSTATE_FIELD(HSTATE_PURR, host_purr);
@@ -599,6 +602,7 @@ int main(void)
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
 	DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
 	DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr));
+	DEFINE(VCPU_CRIT_SAVE, offsetof(struct kvm_vcpu, arch.crit_save));
 #endif /* CONFIG_PPC_BOOK3S */
 #endif /* CONFIG_KVM */
 
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index ae9f433daabf..c60bbec25c1f 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -98,10 +98,14 @@ extern void __restore_cpu_e6500(void);
 				 PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP | \
 				 PPC_FEATURE_TRUE_LE | \
 				 PPC_FEATURE_PSERIES_PERFMON_COMPAT)
+#define COMMON_USER2_POWER7	(PPC_FEATURE2_DSCR)
 #define COMMON_USER_POWER8	(COMMON_USER_PPC64 | PPC_FEATURE_ARCH_2_06 |\
 				 PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP | \
 				 PPC_FEATURE_TRUE_LE | \
 				 PPC_FEATURE_PSERIES_PERFMON_COMPAT)
+#define COMMON_USER2_POWER8	(PPC_FEATURE2_ARCH_2_07 | \
+				 PPC_FEATURE2_HTM_COMP | PPC_FEATURE2_DSCR | \
+				 PPC_FEATURE2_ISEL | PPC_FEATURE2_TAR)
 #define COMMON_USER_PA6T	(COMMON_USER_PPC64 | PPC_FEATURE_PA6T |\
 				 PPC_FEATURE_TRUE_LE | \
 				 PPC_FEATURE_HAS_ALTIVEC_COMP)
@@ -428,6 +432,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "POWER7 (architected)",
 		.cpu_features		= CPU_FTRS_POWER7,
 		.cpu_user_features	= COMMON_USER_POWER7,
+		.cpu_user_features2	= COMMON_USER2_POWER7,
 		.mmu_features		= MMU_FTRS_POWER7,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
@@ -443,6 +448,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "POWER8 (architected)",
 		.cpu_features		= CPU_FTRS_POWER8,
 		.cpu_user_features	= COMMON_USER_POWER8,
+		.cpu_user_features2	= COMMON_USER2_POWER8,
 		.mmu_features		= MMU_FTRS_POWER8,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
@@ -458,6 +464,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "POWER7 (raw)",
 		.cpu_features		= CPU_FTRS_POWER7,
 		.cpu_user_features	= COMMON_USER_POWER7,
+		.cpu_user_features2	= COMMON_USER2_POWER7,
 		.mmu_features		= MMU_FTRS_POWER7,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
@@ -475,6 +482,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "POWER7+ (raw)",
 		.cpu_features		= CPU_FTRS_POWER7,
 		.cpu_user_features	= COMMON_USER_POWER7,
+		.cpu_user_features	= COMMON_USER2_POWER7,
 		.mmu_features		= MMU_FTRS_POWER7,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
@@ -492,6 +500,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "POWER8 (raw)",
 		.cpu_features		= CPU_FTRS_POWER8,
 		.cpu_user_features	= COMMON_USER_POWER8,
+		.cpu_user_features2	= COMMON_USER2_POWER8,
 		.mmu_features		= MMU_FTRS_POWER8,
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
@@ -1995,6 +2004,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_user_features	= COMMON_USER_BOOKE |
 			PPC_FEATURE_HAS_SPE_COMP |
 			PPC_FEATURE_HAS_EFP_SINGLE_COMP,
+		.cpu_user_features2	= PPC_FEATURE2_ISEL,
 		.mmu_features		= MMU_FTR_TYPE_FSL_E,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
@@ -2014,6 +2024,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 			PPC_FEATURE_HAS_SPE_COMP |
 			PPC_FEATURE_HAS_EFP_SINGLE_COMP |
 			PPC_FEATURE_HAS_EFP_DOUBLE_COMP,
+		.cpu_user_features2	= PPC_FEATURE2_ISEL,
 		.mmu_features		= MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS,
 		.icache_bsize		= 32,
 		.dcache_bsize		= 32,
@@ -2030,6 +2041,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "e500mc",
 		.cpu_features		= CPU_FTRS_E500MC,
 		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.cpu_user_features2	= PPC_FEATURE2_ISEL,
 		.mmu_features		= MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS |
 			MMU_FTR_USE_TLBILX,
 		.icache_bsize		= 64,
@@ -2048,6 +2060,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_name		= "e5500",
 		.cpu_features		= CPU_FTRS_E5500,
 		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+		.cpu_user_features2	= PPC_FEATURE2_ISEL,
 		.mmu_features		= MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS |
 			MMU_FTR_USE_TLBILX,
 		.icache_bsize		= 64,
@@ -2069,6 +2082,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_features		= CPU_FTRS_E6500,
 		.cpu_user_features	= COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU |
 			PPC_FEATURE_HAS_ALTIVEC_COMP,
+		.cpu_user_features2	= PPC_FEATURE2_ISEL,
 		.mmu_features		= MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS |
 			MMU_FTR_USE_TLBILX,
 		.icache_bsize		= 64,
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index e514de57a125..d22e73e4618b 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -439,8 +439,6 @@ ret_from_fork:
 ret_from_kernel_thread:
 	REST_NVGPRS(r1)
 	bl	schedule_tail
-	li	r3,0
-	stw	r3,0(r1)
 	mtlr	r14
 	mr	r3,r15
 	PPC440EP_ERR42
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 3fe5259e2fea..51cfb8fc301f 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -33,6 +33,7 @@
 #include <asm/irqflags.h>
 #include <asm/ftrace.h>
 #include <asm/hw_irq.h>
+#include <asm/context_tracking.h>
 
 /*
  * System calls.
@@ -150,7 +151,7 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
 	CURRENT_THREAD_INFO(r11, r1)
 	ld	r10,TI_FLAGS(r11)
 	andi.	r11,r10,_TIF_SYSCALL_T_OR_A
-	bne-	syscall_dotrace
+	bne	syscall_dotrace
 .Lsyscall_dotrace_cont:
 	cmpldi	0,r0,NR_syscalls
 	bge-	syscall_enosys
@@ -376,8 +377,6 @@ _GLOBAL(ret_from_fork)
 _GLOBAL(ret_from_kernel_thread)
 	bl	.schedule_tail
 	REST_NVGPRS(r1)
-	li	r3,0
-	std	r3,0(r1)
 	ld	r14, 0(r14)
 	mtlr	r14
 	mr	r3,r15
@@ -634,7 +633,7 @@ _GLOBAL(ret_from_except_lite)
 	andi.	r0,r4,_TIF_NEED_RESCHED
 	beq	1f
 	bl	.restore_interrupts
-	bl	.schedule
+	SCHEDULE_USER
 	b	.ret_from_except_lite
 
 1:	bl	.save_nvgprs
diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index 42a756eec9ff..645170a07ada 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -489,7 +489,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	 */
 
 	mfspr	r14,SPRN_DBSR		/* check single-step/branch taken */
-	andis.	r15,r14,DBSR_IC@h
+	andis.	r15,r14,(DBSR_IC|DBSR_BT)@h
 	beq+	1f
 
 	LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
@@ -500,7 +500,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	bge+	cr1,1f
 
 	/* here it looks like we got an inappropriate debug exception. */
-	lis	r14,DBSR_IC@h		/* clear the IC event */
+	lis	r14,(DBSR_IC|DBSR_BT)@h		/* clear the event */
 	rlwinm	r11,r11,0,~MSR_DE	/* clear DE in the CSRR1 value */
 	mtspr	SPRN_DBSR,r14
 	mtspr	SPRN_CSRR1,r11
@@ -555,7 +555,7 @@ kernel_dbg_exc:
 	 */
 
 	mfspr	r14,SPRN_DBSR		/* check single-step/branch taken */
-	andis.	r15,r14,DBSR_IC@h
+	andis.	r15,r14,(DBSR_IC|DBSR_BT)@h
 	beq+	1f
 
 	LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
@@ -566,7 +566,7 @@ kernel_dbg_exc:
 	bge+	cr1,1f
 
 	/* here it looks like we got an inappropriate debug exception. */
-	lis	r14,DBSR_IC@h		/* clear the IC event */
+	lis	r14,(DBSR_IC|DBSR_BT)@h		/* clear the event */
 	rlwinm	r11,r11,0,~MSR_DE	/* clear DE in the DSRR1 value */
 	mtspr	SPRN_DBSR,r14
 	mtspr	SPRN_DSRR1,r11
diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
index 466a2908bb63..611acdf30096 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -17,6 +17,7 @@
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/cpu.h>
+#include <linux/hardirq.h>
 
 #include <asm/page.h>
 #include <asm/current.h>
@@ -335,10 +336,13 @@ void default_machine_kexec(struct kimage *image)
 	pr_debug("kexec: Starting switchover sequence.\n");
 
 	/* switch to a staticly allocated stack.  Based on irq stack code.
+	 * We setup preempt_count to avoid using VMX in memcpy.
 	 * XXX: the task struct will likely be invalid once we do the copy!
 	 */
 	kexec_stack.thread_info.task = current_thread_info()->task;
 	kexec_stack.thread_info.flags = 0;
+	kexec_stack.thread_info.preempt_count = HARDIRQ_OFFSET;
+	kexec_stack.thread_info.cpu = current_thread_info()->cpu;
 
 	/* We need a static PACA, too; copy this CPU's PACA over and switch to
 	 * it.  Also poison per_cpu_offset to catch anyone using non-static
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index 19e096bd0e73..e469f30e6eeb 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -657,6 +657,17 @@ _GLOBAL(__ucmpdi2)
 	li	r3,2
 	blr
 
+_GLOBAL(__bswapdi2)
+	rotlwi  r9,r4,8
+	rotlwi  r10,r3,8
+	rlwimi  r9,r4,24,0,7
+	rlwimi  r10,r3,24,0,7
+	rlwimi  r9,r4,24,16,23
+	rlwimi  r10,r3,24,16,23
+	mr      r3,r9
+	mr      r4,r10
+	blr
+
 _GLOBAL(abs)
 	srawi	r4,r3,31
 	xor	r3,r3,r4
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index 5cfa8008693b..6820e45f557b 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -234,6 +234,17 @@ _GLOBAL(__flush_dcache_icache)
 	isync
 	blr
 
+_GLOBAL(__bswapdi2)
+	srdi	r8,r3,32
+	rlwinm	r7,r3,8,0xffffffff
+	rlwimi	r7,r3,24,0,7
+	rlwinm	r9,r8,8,0xffffffff
+	rlwimi	r7,r3,24,16,23
+	rlwimi	r9,r8,24,0,7
+	rlwimi	r9,r8,24,16,23
+	sldi	r7,r7,32
+	or	r3,r7,r9
+	blr
 
 #if defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_MAPLE)
 /*
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index f325dc923409..6053f037ef0a 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -359,7 +359,6 @@ static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp,
 				      enum pci_mmap_state mmap_state,
 				      int write_combine)
 {
-	unsigned long prot = pgprot_val(protection);
 
 	/* Write combine is always 0 on non-memory space mappings. On
 	 * memory space, if the user didn't pass 1, we check for a
@@ -376,9 +375,9 @@ static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp,
 
 	/* XXX would be nice to have a way to ask for write-through */
 	if (write_combine)
-		return pgprot_noncached_wc(prot);
+		return pgprot_noncached_wc(protection);
 	else
-		return pgprot_noncached(prot);
+		return pgprot_noncached(protection);
 }
 
 /*
@@ -786,22 +785,8 @@ void pci_process_bridge_OF_ranges(struct pci_controller *hose,
 				hose->isa_mem_size = size;
 			}
 
-			/* We get the PCI/Mem offset from the first range or
-			 * the, current one if the offset came from an ISA
-			 * hole. If they don't match, bugger.
-			 */
-			if (memno == 0 ||
-			    (isa_hole >= 0 && pci_addr != 0 &&
-			     hose->pci_mem_offset == isa_mb))
-				hose->pci_mem_offset = cpu_addr - pci_addr;
-			else if (pci_addr != 0 &&
-				 hose->pci_mem_offset != cpu_addr - pci_addr) {
-				printk(KERN_INFO
-				       " \\--> Skipped (offset mismatch) !\n");
-				continue;
-			}
-
 			/* Build resource */
+			hose->mem_offset[memno] = cpu_addr - pci_addr;
 			res = &hose->mem_resources[memno++];
 			res->flags = IORESOURCE_MEM;
 			if (pci_space & 0x40000000)
@@ -817,20 +802,6 @@ void pci_process_bridge_OF_ranges(struct pci_controller *hose,
 			res->child = NULL;
 		}
 	}
-
-	/* If there's an ISA hole and the pci_mem_offset is -not- matching
-	 * the ISA hole offset, then we need to remove the ISA hole from
-	 * the resource list for that brige
-	 */
-	if (isa_hole >= 0 && hose->pci_mem_offset != isa_mb) {
-		unsigned int next = isa_hole + 1;
-		printk(KERN_INFO " Removing ISA hole at 0x%016llx\n", isa_mb);
-		if (next < memno)
-			memmove(&hose->mem_resources[isa_hole],
-				&hose->mem_resources[next],
-				sizeof(struct resource) * (memno - next));
-		hose->mem_resources[--memno].flags = 0;
-	}
 }
 
 /* Decide whether to display the domain number in /proc */
@@ -845,6 +816,14 @@ int pci_proc_domain(struct pci_bus *bus)
 	return 1;
 }
 
+int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
+{
+	if (ppc_md.pcibios_root_bridge_prepare)
+		return ppc_md.pcibios_root_bridge_prepare(bridge);
+
+	return 0;
+}
+
 /* This header fixup will do the resource fixup for all devices as they are
  * probed, but not for bridge ranges
  */
@@ -908,6 +887,7 @@ static int pcibios_uninitialized_bridge_resource(struct pci_bus *bus,
 	struct pci_controller *hose = pci_bus_to_host(bus);
 	struct pci_dev *dev = bus->self;
 	resource_size_t offset;
+	struct pci_bus_region region;
 	u16 command;
 	int i;
 
@@ -917,10 +897,10 @@ static int pcibios_uninitialized_bridge_resource(struct pci_bus *bus,
 
 	/* Job is a bit different between memory and IO */
 	if (res->flags & IORESOURCE_MEM) {
-		/* If the BAR is non-0 (res != pci_mem_offset) then it's probably been
-		 * initialized by somebody
-		 */
-		if (res->start != hose->pci_mem_offset)
+		pcibios_resource_to_bus(dev, &region, res);
+
+		/* If the BAR is non-0 then it's probably been initialized */
+		if (region.start != 0)
 			return 0;
 
 		/* The BAR is 0, let's check if memory decoding is enabled on
@@ -932,11 +912,11 @@ static int pcibios_uninitialized_bridge_resource(struct pci_bus *bus,
 
 		/* Memory decoding is enabled and the BAR is 0. If any of the bridge
 		 * resources covers that starting address (0 then it's good enough for
-		 * us for memory
+		 * us for memory space)
 		 */
 		for (i = 0; i < 3; i++) {
 			if ((hose->mem_resources[i].flags & IORESOURCE_MEM) &&
-			    hose->mem_resources[i].start == hose->pci_mem_offset)
+			    hose->mem_resources[i].start == hose->mem_offset[i])
 				return 0;
 		}
 
@@ -1373,10 +1353,9 @@ static void __init pcibios_reserve_legacy_regions(struct pci_bus *bus)
 
  no_io:
 	/* Check for memory */
-	offset = hose->pci_mem_offset;
-	pr_debug("hose mem offset: %016llx\n", (unsigned long long)offset);
 	for (i = 0; i < 3; i++) {
 		pres = &hose->mem_resources[i];
+		offset = hose->mem_offset[i];
 		if (!(pres->flags & IORESOURCE_MEM))
 			continue;
 		pr_debug("hose mem res: %pR\n", pres);
@@ -1516,6 +1495,7 @@ static void pcibios_setup_phb_resources(struct pci_controller *hose,
 					struct list_head *resources)
 {
 	struct resource *res;
+	resource_size_t offset;
 	int i;
 
 	/* Hookup PHB IO resource */
@@ -1525,49 +1505,37 @@ static void pcibios_setup_phb_resources(struct pci_controller *hose,
 		printk(KERN_WARNING "PCI: I/O resource not set for host"
 		       " bridge %s (domain %d)\n",
 		       hose->dn->full_name, hose->global_number);
-#ifdef CONFIG_PPC32
-		/* Workaround for lack of IO resource only on 32-bit */
-		res->start = (unsigned long)hose->io_base_virt - isa_io_base;
-		res->end = res->start + IO_SPACE_LIMIT;
-		res->flags = IORESOURCE_IO;
-#endif /* CONFIG_PPC32 */
-	}
+	} else {
+		offset = pcibios_io_space_offset(hose);
 
-	pr_debug("PCI: PHB IO resource    = %016llx-%016llx [%lx]\n",
-		 (unsigned long long)res->start,
-		 (unsigned long long)res->end,
-		 (unsigned long)res->flags);
-	pci_add_resource_offset(resources, res, pcibios_io_space_offset(hose));
+		pr_debug("PCI: PHB IO resource    = %08llx-%08llx [%lx] off 0x%08llx\n",
+			 (unsigned long long)res->start,
+			 (unsigned long long)res->end,
+			 (unsigned long)res->flags,
+			 (unsigned long long)offset);
+		pci_add_resource_offset(resources, res, offset);
+	}
 
 	/* Hookup PHB Memory resources */
 	for (i = 0; i < 3; ++i) {
 		res = &hose->mem_resources[i];
 		if (!res->flags) {
-			if (i > 0)
-				continue;
 			printk(KERN_ERR "PCI: Memory resource 0 not set for "
 			       "host bridge %s (domain %d)\n",
 			       hose->dn->full_name, hose->global_number);
-#ifdef CONFIG_PPC32
-			/* Workaround for lack of MEM resource only on 32-bit */
-			res->start = hose->pci_mem_offset;
-			res->end = (resource_size_t)-1LL;
-			res->flags = IORESOURCE_MEM;
-#endif /* CONFIG_PPC32 */
+			continue;
 		}
+		offset = hose->mem_offset[i];
 
-		pr_debug("PCI: PHB MEM resource %d = %016llx-%016llx [%lx]\n", i,
+
+		pr_debug("PCI: PHB MEM resource %d = %08llx-%08llx [%lx] off 0x%08llx\n", i,
 			 (unsigned long long)res->start,
 			 (unsigned long long)res->end,
-			 (unsigned long)res->flags);
-		pci_add_resource_offset(resources, res, hose->pci_mem_offset);
-	}
-
-	pr_debug("PCI: PHB MEM offset     = %016llx\n",
-		 (unsigned long long)hose->pci_mem_offset);
-	pr_debug("PCI: PHB IO  offset     = %08lx\n",
-		 (unsigned long)hose->io_base_virt - _IO_BASE);
+			 (unsigned long)res->flags,
+			 (unsigned long long)offset);
 
+		pci_add_resource_offset(resources, res, offset);
+	}
 }
 
 /*
diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c
index e37c2152acf4..432459c817fa 100644
--- a/arch/powerpc/kernel/pci_32.c
+++ b/arch/powerpc/kernel/pci_32.c
@@ -295,7 +295,7 @@ long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn)
 	case IOBASE_BRIDGE_NUMBER:
 		return (long)hose->first_busno;
 	case IOBASE_MEMORY:
-		return (long)hose->pci_mem_offset;
+		return (long)hose->mem_offset[0];
 	case IOBASE_IO:
 		return (long)hose->io_base_phys;
 	case IOBASE_ISA_IO:
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index 51a133a78a09..873050d26840 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -246,7 +246,7 @@ long sys_pciconfig_iobase(long which, unsigned long in_bus,
 	case IOBASE_BRIDGE_NUMBER:
 		return (long)hose->first_busno;
 	case IOBASE_MEMORY:
-		return (long)hose->pci_mem_offset;
+		return (long)hose->mem_offset[0];
 	case IOBASE_IO:
 		return (long)hose->io_base_phys;
 	case IOBASE_ISA_IO:
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 78b8766fd79e..c29666586998 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -143,7 +143,8 @@ EXPORT_SYMBOL(__lshrdi3);
 int __ucmpdi2(unsigned long long, unsigned long long);
 EXPORT_SYMBOL(__ucmpdi2);
 #endif
-
+long long __bswapdi2(long long);
+EXPORT_SYMBOL(__bswapdi2);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memmove);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index ceb4e7b62cf4..a902723fdc69 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -339,6 +339,13 @@ static void set_debug_reg_defaults(struct thread_struct *thread)
 
 static void prime_debug_regs(struct thread_struct *thread)
 {
+	/*
+	 * We could have inherited MSR_DE from userspace, since
+	 * it doesn't get cleared on exception entry.  Make sure
+	 * MSR_DE is clear before we enable any debug events.
+	 */
+	mtmsr(mfmsr() & ~MSR_DE);
+
 	mtspr(SPRN_IAC1, thread->iac1);
 	mtspr(SPRN_IAC2, thread->iac2);
 #if CONFIG_PPC_ADV_DEBUG_IACS > 2
@@ -971,6 +978,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
 	 * do some house keeping and then return from the fork or clone
 	 * system call, using the stack frame created above.
 	 */
+	((unsigned long *)sp)[0] = 0;
 	sp -= sizeof(struct pt_regs);
 	kregs = (struct pt_regs *) sp;
 	sp -= STACK_FRAME_OVERHEAD;
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 3b14d320e69f..98c2fc198712 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -32,6 +32,7 @@
 #include <trace/syscall.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/perf_event.h>
+#include <linux/context_tracking.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -1788,6 +1789,8 @@ long do_syscall_trace_enter(struct pt_regs *regs)
 {
 	long ret = 0;
 
+	user_exit();
+
 	secure_computing_strict(regs->gpr[0]);
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE) &&
@@ -1832,4 +1835,6 @@ void do_syscall_trace_leave(struct pt_regs *regs)
 	step = test_thread_flag(TIF_SINGLESTEP);
 	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
 		tracehook_report_syscall_exit(regs, step);
+
+	user_enter();
 }
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 1fd6e7b2f390..52add6f3e201 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -19,6 +19,7 @@
 #include <linux/init.h>
 #include <linux/capability.h>
 #include <linux/delay.h>
+#include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/completion.h>
 #include <linux/cpumask.h>
@@ -807,6 +808,95 @@ static void rtas_percpu_suspend_me(void *info)
 	__rtas_suspend_cpu((struct rtas_suspend_me_data *)info, 1);
 }
 
+enum rtas_cpu_state {
+	DOWN,
+	UP,
+};
+
+#ifndef CONFIG_SMP
+static int rtas_cpu_state_change_mask(enum rtas_cpu_state state,
+				cpumask_var_t cpus)
+{
+	if (!cpumask_empty(cpus)) {
+		cpumask_clear(cpus);
+		return -EINVAL;
+	} else
+		return 0;
+}
+#else
+/* On return cpumask will be altered to indicate CPUs changed.
+ * CPUs with states changed will be set in the mask,
+ * CPUs with status unchanged will be unset in the mask. */
+static int rtas_cpu_state_change_mask(enum rtas_cpu_state state,
+				cpumask_var_t cpus)
+{
+	int cpu;
+	int cpuret = 0;
+	int ret = 0;
+
+	if (cpumask_empty(cpus))
+		return 0;
+
+	for_each_cpu(cpu, cpus) {
+		switch (state) {
+		case DOWN:
+			cpuret = cpu_down(cpu);
+			break;
+		case UP:
+			cpuret = cpu_up(cpu);
+			break;
+		}
+		if (cpuret) {
+			pr_debug("%s: cpu_%s for cpu#%d returned %d.\n",
+					__func__,
+					((state == UP) ? "up" : "down"),
+					cpu, cpuret);
+			if (!ret)
+				ret = cpuret;
+			if (state == UP) {
+				/* clear bits for unchanged cpus, return */
+				cpumask_shift_right(cpus, cpus, cpu);
+				cpumask_shift_left(cpus, cpus, cpu);
+				break;
+			} else {
+				/* clear bit for unchanged cpu, continue */
+				cpumask_clear_cpu(cpu, cpus);
+			}
+		}
+	}
+
+	return ret;
+}
+#endif
+
+int rtas_online_cpus_mask(cpumask_var_t cpus)
+{
+	int ret;
+
+	ret = rtas_cpu_state_change_mask(UP, cpus);
+
+	if (ret) {
+		cpumask_var_t tmp_mask;
+
+		if (!alloc_cpumask_var(&tmp_mask, GFP_TEMPORARY))
+			return ret;
+
+		/* Use tmp_mask to preserve cpus mask from first failure */
+		cpumask_copy(tmp_mask, cpus);
+		rtas_offline_cpus_mask(tmp_mask);
+		free_cpumask_var(tmp_mask);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(rtas_online_cpus_mask);
+
+int rtas_offline_cpus_mask(cpumask_var_t cpus)
+{
+	return rtas_cpu_state_change_mask(DOWN, cpus);
+}
+EXPORT_SYMBOL(rtas_offline_cpus_mask);
+
 int rtas_ibm_suspend_me(struct rtas_args *args)
 {
 	long state;
@@ -814,6 +904,8 @@ int rtas_ibm_suspend_me(struct rtas_args *args)
 	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
 	struct rtas_suspend_me_data data;
 	DECLARE_COMPLETION_ONSTACK(done);
+	cpumask_var_t offline_mask;
+	int cpuret;
 
 	if (!rtas_service_present("ibm,suspend-me"))
 		return -ENOSYS;
@@ -837,11 +929,24 @@ int rtas_ibm_suspend_me(struct rtas_args *args)
 		return 0;
 	}
 
+	if (!alloc_cpumask_var(&offline_mask, GFP_TEMPORARY))
+		return -ENOMEM;
+
 	atomic_set(&data.working, 0);
 	atomic_set(&data.done, 0);
 	atomic_set(&data.error, 0);
 	data.token = rtas_token("ibm,suspend-me");
 	data.complete = &done;
+
+	/* All present CPUs must be online */
+	cpumask_andnot(offline_mask, cpu_present_mask, cpu_online_mask);
+	cpuret = rtas_online_cpus_mask(offline_mask);
+	if (cpuret) {
+		pr_err("%s: Could not bring present CPUs online.\n", __func__);
+		atomic_set(&data.error, cpuret);
+		goto out;
+	}
+
 	stop_topology_update();
 
 	/* Call function on all CPUs.  One of us will make the
@@ -857,6 +962,14 @@ int rtas_ibm_suspend_me(struct rtas_args *args)
 
 	start_topology_update();
 
+	/* Take down CPUs not online prior to suspend */
+	cpuret = rtas_offline_cpus_mask(offline_mask);
+	if (cpuret)
+		pr_warn("%s: Could not restore CPUs to offline state.\n",
+				__func__);
+
+out:
+	free_cpumask_var(offline_mask);
 	return atomic_read(&data.error);
 }
 #else /* CONFIG_PPC_PSERIES */
diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c
index 5b3022470126..2f3cdb01506d 100644
--- a/arch/powerpc/kernel/rtas_flash.c
+++ b/arch/powerpc/kernel/rtas_flash.c
@@ -89,6 +89,7 @@
 
 /* Array sizes */
 #define VALIDATE_BUF_SIZE 4096    
+#define VALIDATE_MSG_LEN  256
 #define RTAS_MSG_MAXLEN   64
 
 /* Quirk - RTAS requires 4k list length and block size */
@@ -466,7 +467,7 @@ static void validate_flash(struct rtas_validate_flash_t *args_buf)
 }
 
 static int get_validate_flash_msg(struct rtas_validate_flash_t *args_buf, 
-		                   char *msg)
+		                   char *msg, int msglen)
 {
 	int n;
 
@@ -474,7 +475,8 @@ static int get_validate_flash_msg(struct rtas_validate_flash_t *args_buf,
 		n = sprintf(msg, "%d\n", args_buf->update_results);
 		if ((args_buf->update_results >= VALIDATE_CUR_UNKNOWN) ||
 		    (args_buf->update_results == VALIDATE_TMP_UPDATE))
-			n += sprintf(msg + n, "%s\n", args_buf->buf);
+			n += snprintf(msg + n, msglen - n, "%s\n",
+					args_buf->buf);
 	} else {
 		n = sprintf(msg, "%d\n", args_buf->status);
 	}
@@ -486,11 +488,11 @@ static ssize_t validate_flash_read(struct file *file, char __user *buf,
 {
 	struct rtas_validate_flash_t *const args_buf =
 		&rtas_validate_flash_data;
-	char msg[RTAS_MSG_MAXLEN];
+	char msg[VALIDATE_MSG_LEN];
 	int msglen;
 
 	mutex_lock(&rtas_validate_flash_mutex);
-	msglen = get_validate_flash_msg(args_buf, msg);
+	msglen = get_validate_flash_msg(args_buf, msg, VALIDATE_MSG_LEN);
 	mutex_unlock(&rtas_validate_flash_mutex);
 
 	return simple_read_from_buffer(buf, count, ppos, msg, msglen);
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index cf12eae02de5..577a8aa69c6e 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -13,6 +13,7 @@
 #include <linux/signal.h>
 #include <linux/uprobes.h>
 #include <linux/key.h>
+#include <linux/context_tracking.h>
 #include <asm/hw_breakpoint.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -24,7 +25,7 @@
  * through debug.exception-trace sysctl.
  */
 
-int show_unhandled_signals = 0;
+int show_unhandled_signals = 1;
 
 /*
  * Allocate space for the signal frame
@@ -159,6 +160,8 @@ static int do_signal(struct pt_regs *regs)
 
 void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
 {
+	user_exit();
+
 	if (thread_info_flags & _TIF_UPROBE)
 		uprobe_notify_resume(regs);
 
@@ -169,4 +172,6 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
 	}
+
+	user_enter();
 }
diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c
index cd6e19d263b3..8a285876aef8 100644
--- a/arch/powerpc/kernel/sys_ppc32.c
+++ b/arch/powerpc/kernel/sys_ppc32.c
@@ -126,11 +126,3 @@ asmlinkage long compat_sys_sync_file_range2(int fd, unsigned int flags,
 
 	return sys_sync_file_range(fd, offset, nbytes, flags);
 }
-
-asmlinkage long compat_sys_fanotify_mark(int fanotify_fd, unsigned int flags,
-					 unsigned mask_hi, unsigned mask_lo,
-					 int dfd, const char __user *pathname)
-{
-	u64 mask = ((u64)mask_hi << 32) | mask_lo;
-	return sys_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
-}
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 3ce1f864c2d3..e68a84568b8b 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -180,7 +180,7 @@ SYSFS_PMCSETUP(dscr, SPRN_DSCR);
 SYSFS_PMCSETUP(pir, SPRN_PIR);
 
 static DEVICE_ATTR(mmcra, 0600, show_mmcra, store_mmcra);
-static DEVICE_ATTR(spurr, 0600, show_spurr, NULL);
+static DEVICE_ATTR(spurr, 0400, show_spurr, NULL);
 static DEVICE_ATTR(dscr, 0600, show_dscr, store_dscr);
 static DEVICE_ATTR(purr, 0600, show_purr, store_purr);
 static DEVICE_ATTR(pir, 0400, show_pir, NULL);
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 37cc40ef5043..a7a648f6b750 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -35,6 +35,7 @@
 #include <linux/kdebug.h>
 #include <linux/debugfs.h>
 #include <linux/ratelimit.h>
+#include <linux/context_tracking.h>
 
 #include <asm/emulated_ops.h>
 #include <asm/pgtable.h>
@@ -667,6 +668,7 @@ int machine_check_generic(struct pt_regs *regs)
 
 void machine_check_exception(struct pt_regs *regs)
 {
+	enum ctx_state prev_state = exception_enter();
 	int recover = 0;
 
 	__get_cpu_var(irq_stat).mce_exceptions++;
@@ -683,7 +685,7 @@ void machine_check_exception(struct pt_regs *regs)
 		recover = cur_cpu_spec->machine_check(regs);
 
 	if (recover > 0)
-		return;
+		goto bail;
 
 #if defined(CONFIG_8xx) && defined(CONFIG_PCI)
 	/* the qspan pci read routines can cause machine checks -- Cort
@@ -693,20 +695,23 @@ void machine_check_exception(struct pt_regs *regs)
 	 * -- BenH
 	 */
 	bad_page_fault(regs, regs->dar, SIGBUS);
-	return;
+	goto bail;
 #endif
 
 	if (debugger_fault_handler(regs))
-		return;
+		goto bail;
 
 	if (check_io_access(regs))
-		return;
+		goto bail;
 
 	die("Machine check", regs, SIGBUS);
 
 	/* Must die if the interrupt is not recoverable */
 	if (!(regs->msr & MSR_RI))
 		panic("Unrecoverable Machine check");
+
+bail:
+	exception_exit(prev_state);
 }
 
 void SMIException(struct pt_regs *regs)
@@ -716,20 +721,29 @@ void SMIException(struct pt_regs *regs)
 
 void unknown_exception(struct pt_regs *regs)
 {
+	enum ctx_state prev_state = exception_enter();
+
 	printk("Bad trap at PC: %lx, SR: %lx, vector=%lx\n",
 	       regs->nip, regs->msr, regs->trap);
 
 	_exception(SIGTRAP, regs, 0, 0);
+
+	exception_exit(prev_state);
 }
 
 void instruction_breakpoint_exception(struct pt_regs *regs)
 {
+	enum ctx_state prev_state = exception_enter();
+
 	if (notify_die(DIE_IABR_MATCH, "iabr_match", regs, 5,
 					5, SIGTRAP) == NOTIFY_STOP)
-		return;
+		goto bail;
 	if (debugger_iabr_match(regs))
-		return;
+		goto bail;
 	_exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip);
+
+bail:
+	exception_exit(prev_state);
 }
 
 void RunModeException(struct pt_regs *regs)
@@ -739,15 +753,20 @@ void RunModeException(struct pt_regs *regs)
 
 void __kprobes single_step_exception(struct pt_regs *regs)
 {
+	enum ctx_state prev_state = exception_enter();
+
 	clear_single_step(regs);
 
 	if (notify_die(DIE_SSTEP, "single_step", regs, 5,
 					5, SIGTRAP) == NOTIFY_STOP)
-		return;
+		goto bail;
 	if (debugger_sstep(regs))
-		return;
+		goto bail;
 
 	_exception(SIGTRAP, regs, TRAP_TRACE, regs->nip);
+
+bail:
+	exception_exit(prev_state);
 }
 
 /*
@@ -970,7 +989,10 @@ static int emulate_instruction(struct pt_regs *regs)
 
 #ifdef CONFIG_PPC64
 	/* Emulate the mfspr rD, DSCR. */
-	if (((instword & PPC_INST_MFSPR_DSCR_MASK) == PPC_INST_MFSPR_DSCR) &&
+	if ((((instword & PPC_INST_MFSPR_DSCR_USER_MASK) ==
+		PPC_INST_MFSPR_DSCR_USER) ||
+	     ((instword & PPC_INST_MFSPR_DSCR_MASK) ==
+		PPC_INST_MFSPR_DSCR)) &&
 			cpu_has_feature(CPU_FTR_DSCR)) {
 		PPC_WARN_EMULATED(mfdscr, regs);
 		rd = (instword >> 21) & 0x1f;
@@ -978,7 +1000,10 @@ static int emulate_instruction(struct pt_regs *regs)
 		return 0;
 	}
 	/* Emulate the mtspr DSCR, rD. */
-	if (((instword & PPC_INST_MTSPR_DSCR_MASK) == PPC_INST_MTSPR_DSCR) &&
+	if ((((instword & PPC_INST_MTSPR_DSCR_USER_MASK) ==
+		PPC_INST_MTSPR_DSCR_USER) ||
+	     ((instword & PPC_INST_MTSPR_DSCR_MASK) ==
+		PPC_INST_MTSPR_DSCR)) &&
 			cpu_has_feature(CPU_FTR_DSCR)) {
 		PPC_WARN_EMULATED(mtdscr, regs);
 		rd = (instword >> 21) & 0x1f;
@@ -999,6 +1024,7 @@ int is_valid_bugaddr(unsigned long addr)
 
 void __kprobes program_check_exception(struct pt_regs *regs)
 {
+	enum ctx_state prev_state = exception_enter();
 	unsigned int reason = get_reason(regs);
 	extern int do_mathemu(struct pt_regs *regs);
 
@@ -1008,26 +1034,26 @@ void __kprobes program_check_exception(struct pt_regs *regs)
 	if (reason & REASON_FP) {
 		/* IEEE FP exception */
 		parse_fpe(regs);
-		return;
+		goto bail;
 	}
 	if (reason & REASON_TRAP) {
 		/* Debugger is first in line to stop recursive faults in
 		 * rcu_lock, notify_die, or atomic_notifier_call_chain */
 		if (debugger_bpt(regs))
-			return;
+			goto bail;
 
 		/* trap exception */
 		if (notify_die(DIE_BPT, "breakpoint", regs, 5, 5, SIGTRAP)
 				== NOTIFY_STOP)
-			return;
+			goto bail;
 
 		if (!(regs->msr & MSR_PR) &&  /* not user-mode */
 		    report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) {
 			regs->nip += 4;
-			return;
+			goto bail;
 		}
 		_exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip);
-		return;
+		goto bail;
 	}
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	if (reason & REASON_TM) {
@@ -1043,7 +1069,7 @@ void __kprobes program_check_exception(struct pt_regs *regs)
 		if (!user_mode(regs) &&
 		    report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) {
 			regs->nip += 4;
-			return;
+			goto bail;
 		}
 		/* If usermode caused this, it's done something illegal and
 		 * gets a SIGILL slap on the wrist.  We call it an illegal
@@ -1053,7 +1079,7 @@ void __kprobes program_check_exception(struct pt_regs *regs)
 		 */
 		if (user_mode(regs)) {
 			_exception(SIGILL, regs, ILL_ILLOPN, regs->nip);
-			return;
+			goto bail;
 		} else {
 			printk(KERN_EMERG "Unexpected TM Bad Thing exception "
 			       "at %lx (msr 0x%x)\n", regs->nip, reason);
@@ -1077,16 +1103,16 @@ void __kprobes program_check_exception(struct pt_regs *regs)
 	switch (do_mathemu(regs)) {
 	case 0:
 		emulate_single_step(regs);
-		return;
+		goto bail;
 	case 1: {
 			int code = 0;
 			code = __parse_fpscr(current->thread.fpscr.val);
 			_exception(SIGFPE, regs, code, regs->nip);
-			return;
+			goto bail;
 		}
 	case -EFAULT:
 		_exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip);
-		return;
+		goto bail;
 	}
 	/* fall through on any other errors */
 #endif /* CONFIG_MATH_EMULATION */
@@ -1097,10 +1123,10 @@ void __kprobes program_check_exception(struct pt_regs *regs)
 		case 0:
 			regs->nip += 4;
 			emulate_single_step(regs);
-			return;
+			goto bail;
 		case -EFAULT:
 			_exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip);
-			return;
+			goto bail;
 		}
 	}
 
@@ -1108,10 +1134,14 @@ void __kprobes program_check_exception(struct pt_regs *regs)
 		_exception(SIGILL, regs, ILL_PRVOPC, regs->nip);
 	else
 		_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
+
+bail:
+	exception_exit(prev_state);
 }
 
 void alignment_exception(struct pt_regs *regs)
 {
+	enum ctx_state prev_state = exception_enter();
 	int sig, code, fixed = 0;
 
 	/* We restore the interrupt state now */
@@ -1125,7 +1155,7 @@ void alignment_exception(struct pt_regs *regs)
 	if (fixed == 1) {
 		regs->nip += 4;	/* skip over emulated instruction */
 		emulate_single_step(regs);
-		return;
+		goto bail;
 	}
 
 	/* Operand address was bad */
@@ -1140,6 +1170,9 @@ void alignment_exception(struct pt_regs *regs)
 		_exception(sig, regs, code, regs->dar);
 	else
 		bad_page_fault(regs, regs->dar, sig);
+
+bail:
+	exception_exit(prev_state);
 }
 
 void StackOverflow(struct pt_regs *regs)
@@ -1168,23 +1201,32 @@ void trace_syscall(struct pt_regs *regs)
 
 void kernel_fp_unavailable_exception(struct pt_regs *regs)
 {
+	enum ctx_state prev_state = exception_enter();
+
 	printk(KERN_EMERG "Unrecoverable FP Unavailable Exception "
 			  "%lx at %lx\n", regs->trap, regs->nip);
 	die("Unrecoverable FP Unavailable Exception", regs, SIGABRT);
+
+	exception_exit(prev_state);
 }
 
 void altivec_unavailable_exception(struct pt_regs *regs)
 {
+	enum ctx_state prev_state = exception_enter();
+
 	if (user_mode(regs)) {
 		/* A user program has executed an altivec instruction,
 		   but this kernel doesn't support altivec. */
 		_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
-		return;
+		goto bail;
 	}
 
 	printk(KERN_EMERG "Unrecoverable VMX/Altivec Unavailable Exception "
 			"%lx at %lx\n", regs->trap, regs->nip);
 	die("Unrecoverable VMX/Altivec Unavailable Exception", regs, SIGABRT);
+
+bail:
+	exception_exit(prev_state);
 }
 
 void vsx_unavailable_exception(struct pt_regs *regs)
diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c
index 13b867093499..9d3fdcd66290 100644
--- a/arch/powerpc/kernel/udbg.c
+++ b/arch/powerpc/kernel/udbg.c
@@ -64,6 +64,9 @@ void __init udbg_early_init(void)
 	udbg_init_usbgecko();
 #elif defined(CONFIG_PPC_EARLY_DEBUG_WSP)
 	udbg_init_wsp();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_MEMCONS)
+	/* In memory console */
+	udbg_init_memcons();
 #elif defined(CONFIG_PPC_EARLY_DEBUG_EHV_BC)
 	udbg_init_ehv_bc();
 #elif defined(CONFIG_PPC_EARLY_DEBUG_PS3GELIC)
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 3d7fd21c65f9..2f5c6b6d6877 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -124,6 +124,18 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 	return kvmppc_set_sregs_ivor(vcpu, sregs);
 }
 
+int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
+			union kvmppc_one_reg *val)
+{
+	return -EINVAL;
+}
+
+int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
+		       union kvmppc_one_reg *val)
+{
+	return -EINVAL;
+}
+
 struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 {
 	struct kvmppc_vcpu_44x *vcpu_44x;
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 63c67ec72e43..eb643f862579 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -136,21 +136,41 @@ config KVM_E500V2
 	  If unsure, say N.
 
 config KVM_E500MC
-	bool "KVM support for PowerPC E500MC/E5500 processors"
+	bool "KVM support for PowerPC E500MC/E5500/E6500 processors"
 	depends on PPC_E500MC
 	select KVM
 	select KVM_MMIO
 	select KVM_BOOKE_HV
 	select MMU_NOTIFIER
 	---help---
-	  Support running unmodified E500MC/E5500 (32-bit) guest kernels in
-	  virtual machines on E500MC/E5500 host processors.
+	  Support running unmodified E500MC/E5500/E6500 guest kernels in
+	  virtual machines on E500MC/E5500/E6500 host processors.
 
 	  This module provides access to the hardware capabilities through
 	  a character device node named /dev/kvm.
 
 	  If unsure, say N.
 
+config KVM_MPIC
+	bool "KVM in-kernel MPIC emulation"
+	depends on KVM && E500
+	select HAVE_KVM_IRQCHIP
+	select HAVE_KVM_IRQ_ROUTING
+	select HAVE_KVM_MSI
+	help
+	  Enable support for emulating MPIC devices inside the
+          host kernel, rather than relying on userspace to emulate.
+          Currently, support is limited to certain versions of
+          Freescale's MPIC implementation.
+
+config KVM_XICS
+	bool "KVM in-kernel XICS emulation"
+	depends on KVM_BOOK3S_64 && !KVM_MPIC
+	---help---
+	  Include support for the XICS (eXternal Interrupt Controller
+	  Specification) interrupt controller architecture used on
+	  IBM POWER (pSeries) servers.
+
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index b772eded8c26..422de3f4d46c 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -72,12 +72,18 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
 	book3s_hv.o \
 	book3s_hv_interrupts.o \
 	book3s_64_mmu_hv.o
+kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
+	book3s_hv_rm_xics.o
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
 	book3s_hv_rmhandlers.o \
 	book3s_hv_rm_mmu.o \
 	book3s_64_vio_hv.o \
 	book3s_hv_ras.o \
-	book3s_hv_builtin.o
+	book3s_hv_builtin.o \
+	$(kvm-book3s_64-builtin-xics-objs-y)
+
+kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
+	book3s_xics.o
 
 kvm-book3s_64-module-objs := \
 	../../../virt/kvm/kvm_main.o \
@@ -86,6 +92,7 @@ kvm-book3s_64-module-objs := \
 	emulate.o \
 	book3s.o \
 	book3s_64_vio.o \
+	book3s_rtas.o \
 	$(kvm-book3s_64-objs-y)
 
 kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
@@ -103,6 +110,9 @@ kvm-book3s_32-objs := \
 	book3s_32_mmu.o
 kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)
 
+kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o
+kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(addprefix ../../../virt/kvm/, irqchip.o)
+
 kvm-objs := $(kvm-objs-m) $(kvm-objs-y)
 
 obj-$(CONFIG_KVM_440) += kvm.o
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index a4b645285240..700df6f1d32c 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -104,7 +104,7 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
 	return prio;
 }
 
-static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
+void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
 					  unsigned int vec)
 {
 	unsigned long old_pending = vcpu->arch.pending_exceptions;
@@ -160,8 +160,7 @@ void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
 	kvmppc_book3s_queue_irqprio(vcpu, vec);
 }
 
-void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
-                                  struct kvm_interrupt *irq)
+void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)
 {
 	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
 	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
@@ -530,6 +529,21 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 			val = get_reg_val(reg->id, vcpu->arch.vscr.u[3]);
 			break;
 #endif /* CONFIG_ALTIVEC */
+		case KVM_REG_PPC_DEBUG_INST: {
+			u32 opcode = INS_TW;
+			r = copy_to_user((u32 __user *)(long)reg->addr,
+					 &opcode, sizeof(u32));
+			break;
+		}
+#ifdef CONFIG_KVM_XICS
+		case KVM_REG_PPC_ICP_STATE:
+			if (!vcpu->arch.icp) {
+				r = -ENXIO;
+				break;
+			}
+			val = get_reg_val(reg->id, kvmppc_xics_get_icp(vcpu));
+			break;
+#endif /* CONFIG_KVM_XICS */
 		default:
 			r = -EINVAL;
 			break;
@@ -592,6 +606,16 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 			vcpu->arch.vscr.u[3] = set_reg_val(reg->id, val);
 			break;
 #endif /* CONFIG_ALTIVEC */
+#ifdef CONFIG_KVM_XICS
+		case KVM_REG_PPC_ICP_STATE:
+			if (!vcpu->arch.icp) {
+				r = -ENXIO;
+				break;
+			}
+			r = kvmppc_xics_set_icp(vcpu,
+						set_reg_val(reg->id, val));
+			break;
+#endif /* CONFIG_KVM_XICS */
 		default:
 			r = -EINVAL;
 			break;
@@ -607,6 +631,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+					struct kvm_guest_debug *dbg)
+{
+	return -EINVAL;
+}
+
 void kvmppc_decrementer_func(unsigned long data)
 {
 	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index da98e26f6e45..5880dfb31074 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -893,7 +893,10 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			/* Harvest R and C */
 			rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
 			*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
-			rev[i].guest_rpte = ptel | rcbits;
+			if (rcbits & ~rev[i].guest_rpte) {
+				rev[i].guest_rpte = ptel | rcbits;
+				note_hpte_modification(kvm, &rev[i]);
+			}
 		}
 		unlock_rmap(rmapp);
 		hptep[0] &= ~HPTE_V_HVLOCK;
@@ -976,7 +979,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 		/* Now check and modify the HPTE */
 		if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) {
 			kvmppc_clear_ref_hpte(kvm, hptep, i);
-			rev[i].guest_rpte |= HPTE_R_R;
+			if (!(rev[i].guest_rpte & HPTE_R_R)) {
+				rev[i].guest_rpte |= HPTE_R_R;
+				note_hpte_modification(kvm, &rev[i]);
+			}
 			ret = 1;
 		}
 		hptep[0] &= ~HPTE_V_HVLOCK;
@@ -1080,7 +1086,10 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
 			hptep[1] &= ~HPTE_R_C;
 			eieio();
 			hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
-			rev[i].guest_rpte |= HPTE_R_C;
+			if (!(rev[i].guest_rpte & HPTE_R_C)) {
+				rev[i].guest_rpte |= HPTE_R_C;
+				note_hpte_modification(kvm, &rev[i]);
+			}
 			ret = 1;
 		}
 		hptep[0] &= ~HPTE_V_HVLOCK;
@@ -1090,11 +1099,30 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
 	return ret;
 }
 
+static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
+			      struct kvm_memory_slot *memslot,
+			      unsigned long *map)
+{
+	unsigned long gfn;
+
+	if (!vpa->dirty || !vpa->pinned_addr)
+		return;
+	gfn = vpa->gpa >> PAGE_SHIFT;
+	if (gfn < memslot->base_gfn ||
+	    gfn >= memslot->base_gfn + memslot->npages)
+		return;
+
+	vpa->dirty = false;
+	if (map)
+		__set_bit_le(gfn - memslot->base_gfn, map);
+}
+
 long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			     unsigned long *map)
 {
 	unsigned long i;
 	unsigned long *rmapp;
+	struct kvm_vcpu *vcpu;
 
 	preempt_disable();
 	rmapp = memslot->arch.rmap;
@@ -1103,6 +1131,15 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			__set_bit_le(i, map);
 		++rmapp;
 	}
+
+	/* Harvest dirty bits from VPA and DTL updates */
+	/* Note: we never modify the SLB shadow buffer areas */
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		spin_lock(&vcpu->arch.vpa_update_lock);
+		harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map);
+		harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map);
+		spin_unlock(&vcpu->arch.vpa_update_lock);
+	}
 	preempt_enable();
 	return 0;
 }
@@ -1114,7 +1151,7 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
 	unsigned long gfn = gpa >> PAGE_SHIFT;
 	struct page *page, *pages[1];
 	int npages;
-	unsigned long hva, psize, offset;
+	unsigned long hva, offset;
 	unsigned long pa;
 	unsigned long *physp;
 	int srcu_idx;
@@ -1146,14 +1183,9 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
 	}
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
 
-	psize = PAGE_SIZE;
-	if (PageHuge(page)) {
-		page = compound_head(page);
-		psize <<= compound_order(page);
-	}
-	offset = gpa & (psize - 1);
+	offset = gpa & (PAGE_SIZE - 1);
 	if (nb_ret)
-		*nb_ret = psize - offset;
+		*nb_ret = PAGE_SIZE - offset;
 	return page_address(page) + offset;
 
  err:
@@ -1161,11 +1193,31 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
 	return NULL;
 }
 
-void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
+void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
+			     bool dirty)
 {
 	struct page *page = virt_to_page(va);
+	struct kvm_memory_slot *memslot;
+	unsigned long gfn;
+	unsigned long *rmap;
+	int srcu_idx;
 
 	put_page(page);
+
+	if (!dirty || !kvm->arch.using_mmu_notifiers)
+		return;
+
+	/* We need to mark this page dirty in the rmap chain */
+	gfn = gpa >> PAGE_SHIFT;
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	memslot = gfn_to_memslot(kvm, gfn);
+	if (memslot) {
+		rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
+		lock_rmap(rmap);
+		*rmap |= KVMPPC_RMAP_CHANGED;
+		unlock_rmap(rmap);
+	}
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
 }
 
 /*
@@ -1193,16 +1245,36 @@ struct kvm_htab_ctx {
 
 #define HPTE_SIZE	(2 * sizeof(unsigned long))
 
+/*
+ * Returns 1 if this HPT entry has been modified or has pending
+ * R/C bit changes.
+ */
+static int hpte_dirty(struct revmap_entry *revp, unsigned long *hptp)
+{
+	unsigned long rcbits_unset;
+
+	if (revp->guest_rpte & HPTE_GR_MODIFIED)
+		return 1;
+
+	/* Also need to consider changes in reference and changed bits */
+	rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
+	if ((hptp[0] & HPTE_V_VALID) && (hptp[1] & rcbits_unset))
+		return 1;
+
+	return 0;
+}
+
 static long record_hpte(unsigned long flags, unsigned long *hptp,
 			unsigned long *hpte, struct revmap_entry *revp,
 			int want_valid, int first_pass)
 {
 	unsigned long v, r;
+	unsigned long rcbits_unset;
 	int ok = 1;
 	int valid, dirty;
 
 	/* Unmodified entries are uninteresting except on the first pass */
-	dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
+	dirty = hpte_dirty(revp, hptp);
 	if (!first_pass && !dirty)
 		return 0;
 
@@ -1223,16 +1295,28 @@ static long record_hpte(unsigned long flags, unsigned long *hptp,
 		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
 			cpu_relax();
 		v = hptp[0];
+
+		/* re-evaluate valid and dirty from synchronized HPTE value */
+		valid = !!(v & HPTE_V_VALID);
+		dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
+
+		/* Harvest R and C into guest view if necessary */
+		rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
+		if (valid && (rcbits_unset & hptp[1])) {
+			revp->guest_rpte |= (hptp[1] & (HPTE_R_R | HPTE_R_C)) |
+				HPTE_GR_MODIFIED;
+			dirty = 1;
+		}
+
 		if (v & HPTE_V_ABSENT) {
 			v &= ~HPTE_V_ABSENT;
 			v |= HPTE_V_VALID;
+			valid = 1;
 		}
-		/* re-evaluate valid and dirty from synchronized HPTE value */
-		valid = !!(v & HPTE_V_VALID);
 		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
 			valid = 0;
-		r = revp->guest_rpte | (hptp[1] & (HPTE_R_R | HPTE_R_C));
-		dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
+
+		r = revp->guest_rpte;
 		/* only clear modified if this is the right sort of entry */
 		if (valid == want_valid && dirty) {
 			r &= ~HPTE_GR_MODIFIED;
@@ -1288,7 +1372,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
 		/* Skip uninteresting entries, i.e. clean on not-first pass */
 		if (!first_pass) {
 			while (i < kvm->arch.hpt_npte &&
-			       !(revp->guest_rpte & HPTE_GR_MODIFIED)) {
+			       !hpte_dirty(revp, hptp)) {
 				++i;
 				hptp += 2;
 				++revp;
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index 836c56975e21..1f6344c4408d 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -194,7 +194,9 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				run->papr_hcall.args[i] = gpr;
 			}
 
-			emulated = EMULATE_DO_PAPR;
+			run->exit_reason = KVM_EXIT_PAPR_HCALL;
+			vcpu->arch.hcall_needed = 1;
+			emulated = EMULATE_EXIT_USER;
 			break;
 		}
 #endif
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index f5416934932b..9de24f8e03c7 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -66,6 +66,31 @@
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
+void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+	int me;
+	int cpu = vcpu->cpu;
+	wait_queue_head_t *wqp;
+
+	wqp = kvm_arch_vcpu_wq(vcpu);
+	if (waitqueue_active(wqp)) {
+		wake_up_interruptible(wqp);
+		++vcpu->stat.halt_wakeup;
+	}
+
+	me = get_cpu();
+
+	/* CPU points to the first thread of the core */
+	if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) {
+		int real_cpu = cpu + vcpu->arch.ptid;
+		if (paca[real_cpu].kvm_hstate.xics_phys)
+			xics_wake_cpu(real_cpu);
+		else if (cpu_online(cpu))
+			smp_send_reschedule(cpu);
+	}
+	put_cpu();
+}
+
 /*
  * We use the vcpu_load/put functions to measure stolen time.
  * Stolen time is counted as time when either the vcpu is able to
@@ -259,7 +284,7 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
 			len = ((struct reg_vpa *)va)->length.hword;
 		else
 			len = ((struct reg_vpa *)va)->length.word;
-		kvmppc_unpin_guest_page(kvm, va);
+		kvmppc_unpin_guest_page(kvm, va, vpa, false);
 
 		/* Check length */
 		if (len > nb || len < sizeof(struct reg_vpa))
@@ -359,13 +384,13 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
 		va = NULL;
 		nb = 0;
 		if (gpa)
-			va = kvmppc_pin_guest_page(kvm, vpap->next_gpa, &nb);
+			va = kvmppc_pin_guest_page(kvm, gpa, &nb);
 		spin_lock(&vcpu->arch.vpa_update_lock);
 		if (gpa == vpap->next_gpa)
 			break;
 		/* sigh... unpin that one and try again */
 		if (va)
-			kvmppc_unpin_guest_page(kvm, va);
+			kvmppc_unpin_guest_page(kvm, va, gpa, false);
 	}
 
 	vpap->update_pending = 0;
@@ -375,12 +400,15 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
 		 * has changed the mappings underlying guest memory,
 		 * so unregister the region.
 		 */
-		kvmppc_unpin_guest_page(kvm, va);
+		kvmppc_unpin_guest_page(kvm, va, gpa, false);
 		va = NULL;
 	}
 	if (vpap->pinned_addr)
-		kvmppc_unpin_guest_page(kvm, vpap->pinned_addr);
+		kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa,
+					vpap->dirty);
+	vpap->gpa = gpa;
 	vpap->pinned_addr = va;
+	vpap->dirty = false;
 	if (va)
 		vpap->pinned_end = va + vpap->len;
 }
@@ -472,6 +500,7 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
 	/* order writing *dt vs. writing vpa->dtl_idx */
 	smp_wmb();
 	vpa->dtl_idx = ++vcpu->arch.dtl_index;
+	vcpu->arch.dtl.dirty = true;
 }
 
 int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
@@ -479,7 +508,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 	unsigned long req = kvmppc_get_gpr(vcpu, 3);
 	unsigned long target, ret = H_SUCCESS;
 	struct kvm_vcpu *tvcpu;
-	int idx;
+	int idx, rc;
 
 	switch (req) {
 	case H_ENTER:
@@ -515,6 +544,28 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 					kvmppc_get_gpr(vcpu, 5),
 					kvmppc_get_gpr(vcpu, 6));
 		break;
+	case H_RTAS:
+		if (list_empty(&vcpu->kvm->arch.rtas_tokens))
+			return RESUME_HOST;
+
+		rc = kvmppc_rtas_hcall(vcpu);
+
+		if (rc == -ENOENT)
+			return RESUME_HOST;
+		else if (rc == 0)
+			break;
+
+		/* Send the error out to userspace via KVM_RUN */
+		return rc;
+
+	case H_XIRR:
+	case H_CPPR:
+	case H_EOI:
+	case H_IPI:
+		if (kvmppc_xics_enabled(vcpu)) {
+			ret = kvmppc_xics_hcall(vcpu, req);
+			break;
+		} /* fallthrough */
 	default:
 		return RESUME_HOST;
 	}
@@ -913,15 +964,19 @@ out:
 	return ERR_PTR(err);
 }
 
+static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
+{
+	if (vpa->pinned_addr)
+		kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa,
+					vpa->dirty);
+}
+
 void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 {
 	spin_lock(&vcpu->arch.vpa_update_lock);
-	if (vcpu->arch.dtl.pinned_addr)
-		kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.dtl.pinned_addr);
-	if (vcpu->arch.slb_shadow.pinned_addr)
-		kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.slb_shadow.pinned_addr);
-	if (vcpu->arch.vpa.pinned_addr)
-		kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa.pinned_addr);
+	unpin_vpa(vcpu->kvm, &vcpu->arch.dtl);
+	unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
+	unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
 	spin_unlock(&vcpu->arch.vpa_update_lock);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vcpu);
@@ -955,7 +1010,6 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
 }
 
 extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
-extern void xics_wake_cpu(int cpu);
 
 static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
 				   struct kvm_vcpu *vcpu)
@@ -1330,9 +1384,12 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 			break;
 		vc->runner = vcpu;
 		n_ceded = 0;
-		list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
+		list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
 			if (!v->arch.pending_exceptions)
 				n_ceded += v->arch.ceded;
+			else
+				v->arch.ceded = 0;
+		}
 		if (n_ceded == vc->n_runnable)
 			kvmppc_vcore_blocked(vc);
 		else
@@ -1645,12 +1702,12 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
 				      struct kvm_userspace_memory_region *mem,
-				      struct kvm_memory_slot old)
+				      const struct kvm_memory_slot *old)
 {
 	unsigned long npages = mem->memory_size >> PAGE_SHIFT;
 	struct kvm_memory_slot *memslot;
 
-	if (npages && old.npages) {
+	if (npages && old->npages) {
 		/*
 		 * If modifying a memslot, reset all the rmap dirty bits.
 		 * If this is a new memslot, we don't need to do anything
@@ -1827,6 +1884,7 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 	cpumask_setall(&kvm->arch.need_tlb_flush);
 
 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+	INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
 
 	kvm->arch.rma = NULL;
 
@@ -1872,6 +1930,8 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
 		kvm->arch.rma = NULL;
 	}
 
+	kvmppc_rtas_tokens_free(kvm);
+
 	kvmppc_free_hpt(kvm);
 	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
 }
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 19c93bae1aea..6dcbb49105a4 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -97,17 +97,6 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
 }
 EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
 
-/*
- * Note modification of an HPTE; set the HPTE modified bit
- * if anyone is interested.
- */
-static inline void note_hpte_modification(struct kvm *kvm,
-					  struct revmap_entry *rev)
-{
-	if (atomic_read(&kvm->arch.hpte_mod_interest))
-		rev->guest_rpte |= HPTE_GR_MODIFIED;
-}
-
 /* Remove this HPTE from the chain for a real page */
 static void remove_revmap_chain(struct kvm *kvm, long pte_index,
 				struct revmap_entry *rev,
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
new file mode 100644
index 000000000000..b4b0082f761c
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -0,0 +1,406 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xics.h>
+#include <asm/debug.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+
+#include "book3s_xics.h"
+
+#define DEBUG_PASSUP
+
+static inline void rm_writeb(unsigned long paddr, u8 val)
+{
+	__asm__ __volatile__("sync; stbcix %0,0,%1"
+		: : "r" (val), "r" (paddr) : "memory");
+}
+
+static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
+				struct kvm_vcpu *this_vcpu)
+{
+	struct kvmppc_icp *this_icp = this_vcpu->arch.icp;
+	unsigned long xics_phys;
+	int cpu;
+
+	/* Mark the target VCPU as having an interrupt pending */
+	vcpu->stat.queue_intr++;
+	set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
+
+	/* Kick self ? Just set MER and return */
+	if (vcpu == this_vcpu) {
+		mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_MER);
+		return;
+	}
+
+	/* Check if the core is loaded, if not, too hard */
+	cpu = vcpu->cpu;
+	if (cpu < 0 || cpu >= nr_cpu_ids) {
+		this_icp->rm_action |= XICS_RM_KICK_VCPU;
+		this_icp->rm_kick_target = vcpu;
+		return;
+	}
+	/* In SMT cpu will always point to thread 0, we adjust it */
+	cpu += vcpu->arch.ptid;
+
+	/* Not too hard, then poke the target */
+	xics_phys = paca[cpu].kvm_hstate.xics_phys;
+	rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+}
+
+static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu)
+{
+	/* Note: Only called on self ! */
+	clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
+		  &vcpu->arch.pending_exceptions);
+	mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER);
+}
+
+static inline bool icp_rm_try_update(struct kvmppc_icp *icp,
+				     union kvmppc_icp_state old,
+				     union kvmppc_icp_state new)
+{
+	struct kvm_vcpu *this_vcpu = local_paca->kvm_hstate.kvm_vcpu;
+	bool success;
+
+	/* Calculate new output value */
+	new.out_ee = (new.xisr && (new.pending_pri < new.cppr));
+
+	/* Attempt atomic update */
+	success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
+	if (!success)
+		goto bail;
+
+	/*
+	 * Check for output state update
+	 *
+	 * Note that this is racy since another processor could be updating
+	 * the state already. This is why we never clear the interrupt output
+	 * here, we only ever set it. The clear only happens prior to doing
+	 * an update and only by the processor itself. Currently we do it
+	 * in Accept (H_XIRR) and Up_Cppr (H_XPPR).
+	 *
+	 * We also do not try to figure out whether the EE state has changed,
+	 * we unconditionally set it if the new state calls for it. The reason
+	 * for that is that we opportunistically remove the pending interrupt
+	 * flag when raising CPPR, so we need to set it back here if an
+	 * interrupt is still pending.
+	 */
+	if (new.out_ee)
+		icp_rm_set_vcpu_irq(icp->vcpu, this_vcpu);
+
+	/* Expose the state change for debug purposes */
+	this_vcpu->arch.icp->rm_dbgstate = new;
+	this_vcpu->arch.icp->rm_dbgtgt = icp->vcpu;
+
+ bail:
+	return success;
+}
+
+static inline int check_too_hard(struct kvmppc_xics *xics,
+				 struct kvmppc_icp *icp)
+{
+	return (xics->real_mode_dbg || icp->rm_action) ? H_TOO_HARD : H_SUCCESS;
+}
+
+static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			     u8 new_cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool resend;
+
+	/*
+	 * This handles several related states in one operation:
+	 *
+	 * ICP State: Down_CPPR
+	 *
+	 * Load CPPR with new value and if the XISR is 0
+	 * then check for resends:
+	 *
+	 * ICP State: Resend
+	 *
+	 * If MFRR is more favored than CPPR, check for IPIs
+	 * and notify ICS of a potential resend. This is done
+	 * asynchronously (when used in real mode, we will have
+	 * to exit here).
+	 *
+	 * We do not handle the complete Check_IPI as documented
+	 * here. In the PAPR, this state will be used for both
+	 * Set_MFRR and Down_CPPR. However, we know that we aren't
+	 * changing the MFRR state here so we don't need to handle
+	 * the case of an MFRR causing a reject of a pending irq,
+	 * this will have been handled when the MFRR was set in the
+	 * first place.
+	 *
+	 * Thus we don't have to handle rejects, only resends.
+	 *
+	 * When implementing real mode for HV KVM, resend will lead to
+	 * a H_TOO_HARD return and the whole transaction will be handled
+	 * in virtual mode.
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		/* Down_CPPR */
+		new_state.cppr = new_cppr;
+
+		/*
+		 * Cut down Resend / Check_IPI / IPI
+		 *
+		 * The logic is that we cannot have a pending interrupt
+		 * trumped by an IPI at this point (see above), so we
+		 * know that either the pending interrupt is already an
+		 * IPI (in which case we don't care to override it) or
+		 * it's either more favored than us or non existent
+		 */
+		if (new_state.mfrr < new_cppr &&
+		    new_state.mfrr <= new_state.pending_pri) {
+			new_state.pending_pri = new_state.mfrr;
+			new_state.xisr = XICS_IPI;
+		}
+
+		/* Latch/clear resend bit */
+		resend = new_state.need_resend;
+		new_state.need_resend = 0;
+
+	} while (!icp_rm_try_update(icp, old_state, new_state));
+
+	/*
+	 * Now handle resend checks. Those are asynchronous to the ICP
+	 * state update in HW (ie bus transactions) so we can handle them
+	 * separately here as well.
+	 */
+	if (resend)
+		icp->rm_action |= XICS_RM_CHECK_RESEND;
+}
+
+
+unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 xirr;
+
+	if (!xics || !xics->real_mode)
+		return H_TOO_HARD;
+
+	/* First clear the interrupt */
+	icp_rm_clr_vcpu_irq(icp->vcpu);
+
+	/*
+	 * ICP State: Accept_Interrupt
+	 *
+	 * Return the pending interrupt (if any) along with the
+	 * current CPPR, then clear the XISR & set CPPR to the
+	 * pending priority
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		xirr = old_state.xisr | (((u32)old_state.cppr) << 24);
+		if (!old_state.xisr)
+			break;
+		new_state.cppr = new_state.pending_pri;
+		new_state.pending_pri = 0xff;
+		new_state.xisr = 0;
+
+	} while (!icp_rm_try_update(icp, old_state, new_state));
+
+	/* Return the result in GPR4 */
+	vcpu->arch.gpr[4] = xirr;
+
+	return check_too_hard(xics, icp);
+}
+
+int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+		    unsigned long mfrr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp, *this_icp = vcpu->arch.icp;
+	u32 reject;
+	bool resend;
+	bool local;
+
+	if (!xics || !xics->real_mode)
+		return H_TOO_HARD;
+
+	local = this_icp->server_num == server;
+	if (local)
+		icp = this_icp;
+	else
+		icp = kvmppc_xics_find_server(vcpu->kvm, server);
+	if (!icp)
+		return H_PARAMETER;
+
+	/*
+	 * ICP state: Set_MFRR
+	 *
+	 * If the CPPR is more favored than the new MFRR, then
+	 * nothing needs to be done as there can be no XISR to
+	 * reject.
+	 *
+	 * If the CPPR is less favored, then we might be replacing
+	 * an interrupt, and thus need to possibly reject it as in
+	 *
+	 * ICP state: Check_IPI
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		/* Set_MFRR */
+		new_state.mfrr = mfrr;
+
+		/* Check_IPI */
+		reject = 0;
+		resend = false;
+		if (mfrr < new_state.cppr) {
+			/* Reject a pending interrupt if not an IPI */
+			if (mfrr <= new_state.pending_pri)
+				reject = new_state.xisr;
+			new_state.pending_pri = mfrr;
+			new_state.xisr = XICS_IPI;
+		}
+
+		if (mfrr > old_state.mfrr && mfrr > new_state.cppr) {
+			resend = new_state.need_resend;
+			new_state.need_resend = 0;
+		}
+	} while (!icp_rm_try_update(icp, old_state, new_state));
+
+	/* Pass rejects to virtual mode */
+	if (reject && reject != XICS_IPI) {
+		this_icp->rm_action |= XICS_RM_REJECT;
+		this_icp->rm_reject = reject;
+	}
+
+	/* Pass resends to virtual mode */
+	if (resend)
+		this_icp->rm_action |= XICS_RM_CHECK_RESEND;
+
+	return check_too_hard(xics, this_icp);
+}
+
+int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 reject;
+
+	if (!xics || !xics->real_mode)
+		return H_TOO_HARD;
+
+	/*
+	 * ICP State: Set_CPPR
+	 *
+	 * We can safely compare the new value with the current
+	 * value outside of the transaction as the CPPR is only
+	 * ever changed by the processor on itself
+	 */
+	if (cppr > icp->state.cppr) {
+		icp_rm_down_cppr(xics, icp, cppr);
+		goto bail;
+	} else if (cppr == icp->state.cppr)
+		return H_SUCCESS;
+
+	/*
+	 * ICP State: Up_CPPR
+	 *
+	 * The processor is raising its priority, this can result
+	 * in a rejection of a pending interrupt:
+	 *
+	 * ICP State: Reject_Current
+	 *
+	 * We can remove EE from the current processor, the update
+	 * transaction will set it again if needed
+	 */
+	icp_rm_clr_vcpu_irq(icp->vcpu);
+
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		reject = 0;
+		new_state.cppr = cppr;
+
+		if (cppr <= new_state.pending_pri) {
+			reject = new_state.xisr;
+			new_state.xisr = 0;
+			new_state.pending_pri = 0xff;
+		}
+
+	} while (!icp_rm_try_update(icp, old_state, new_state));
+
+	/* Pass rejects to virtual mode */
+	if (reject && reject != XICS_IPI) {
+		icp->rm_action |= XICS_RM_REJECT;
+		icp->rm_reject = reject;
+	}
+ bail:
+	return check_too_hard(xics, icp);
+}
+
+int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u32 irq = xirr & 0x00ffffff;
+	u16 src;
+
+	if (!xics || !xics->real_mode)
+		return H_TOO_HARD;
+
+	/*
+	 * ICP State: EOI
+	 *
+	 * Note: If EOI is incorrectly used by SW to lower the CPPR
+	 * value (ie more favored), we do not check for rejection of
+	 * a pending interrupt, this is a SW error and PAPR sepcifies
+	 * that we don't have to deal with it.
+	 *
+	 * The sending of an EOI to the ICS is handled after the
+	 * CPPR update
+	 *
+	 * ICP State: Down_CPPR which we handle
+	 * in a separate function as it's shared with H_CPPR.
+	 */
+	icp_rm_down_cppr(xics, icp, xirr >> 24);
+
+	/* IPIs have no EOI */
+	if (irq == XICS_IPI)
+		goto bail;
+	/*
+	 * EOI handling: If the interrupt is still asserted, we need to
+	 * resend it. We can take a lockless "peek" at the ICS state here.
+	 *
+	 * "Message" interrupts will never have "asserted" set
+	 */
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		goto bail;
+	state = &ics->irq_state[src];
+
+	/* Still asserted, resend it, we make it look like a reject */
+	if (state->asserted) {
+		icp->rm_action |= XICS_RM_REJECT;
+		icp->rm_reject = irq;
+	}
+ bail:
+	return check_too_hard(xics, icp);
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index e33d11f1b977..b02f91e4c70d 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -79,10 +79,6 @@ _GLOBAL(kvmppc_hv_entry_trampoline)
  *                                                                            *
  *****************************************************************************/
 
-#define XICS_XIRR		4
-#define XICS_QIRR		0xc
-#define XICS_IPI		2	/* interrupt source # for IPIs */
-
 /*
  * We come in here when wakened from nap mode on a secondary hw thread.
  * Relocation is off and most register values are lost.
@@ -101,50 +97,51 @@ kvm_start_guest:
 	li	r0,1
 	stb	r0,PACA_NAPSTATELOST(r13)
 
-	/* get vcpu pointer, NULL if we have no vcpu to run */
-	ld	r4,HSTATE_KVM_VCPU(r13)
-	cmpdi	cr1,r4,0
+	/* were we napping due to cede? */
+	lbz	r0,HSTATE_NAPPING(r13)
+	cmpwi	r0,0
+	bne	kvm_end_cede
+
+	/*
+	 * We weren't napping due to cede, so this must be a secondary
+	 * thread being woken up to run a guest, or being woken up due
+	 * to a stray IPI.  (Or due to some machine check or hypervisor
+	 * maintenance interrupt while the core is in KVM.)
+	 */
 
 	/* Check the wake reason in SRR1 to see why we got here */
 	mfspr	r3,SPRN_SRR1
 	rlwinm	r3,r3,44-31,0x7		/* extract wake reason field */
 	cmpwi	r3,4			/* was it an external interrupt? */
-	bne	27f
-
-	/*
-	 * External interrupt - for now assume it is an IPI, since we
-	 * should never get any other interrupts sent to offline threads.
-	 * Only do this for secondary threads.
-	 */
-	beq	cr1,25f
-	lwz	r3,VCPU_PTID(r4)
-	cmpwi	r3,0
-	beq	27f
-25:	ld	r5,HSTATE_XICS_PHYS(r13)
-	li	r0,0xff
-	li	r6,XICS_QIRR
-	li	r7,XICS_XIRR
+	bne	27f			/* if not */
+	ld	r5,HSTATE_XICS_PHYS(r13)
+	li	r7,XICS_XIRR		/* if it was an external interrupt, */
 	lwzcix	r8,r5,r7		/* get and ack the interrupt */
 	sync
 	clrldi.	r9,r8,40		/* get interrupt source ID. */
-	beq	27f			/* none there? */
-	cmpwi	r9,XICS_IPI
-	bne	26f
+	beq	28f			/* none there? */
+	cmpwi	r9,XICS_IPI		/* was it an IPI? */
+	bne	29f
+	li	r0,0xff
+	li	r6,XICS_MFRR
 	stbcix	r0,r5,r6		/* clear IPI */
-26:	stwcix	r8,r5,r7		/* EOI the interrupt */
-
-27:	/* XXX should handle hypervisor maintenance interrupts etc. here */
+	stwcix	r8,r5,r7		/* EOI the interrupt */
+	sync				/* order loading of vcpu after that */
 
-	/* reload vcpu pointer after clearing the IPI */
+	/* get vcpu pointer, NULL if we have no vcpu to run */
 	ld	r4,HSTATE_KVM_VCPU(r13)
 	cmpdi	r4,0
 	/* if we have no vcpu to run, go back to sleep */
 	beq	kvm_no_guest
+	b	kvmppc_hv_entry
 
-	/* were we napping due to cede? */
-	lbz	r0,HSTATE_NAPPING(r13)
-	cmpwi	r0,0
-	bne	kvm_end_cede
+27:	/* XXX should handle hypervisor maintenance interrupts etc. here */
+	b	kvm_no_guest
+28:	/* SRR1 said external but ICP said nope?? */
+	b	kvm_no_guest
+29:	/* External non-IPI interrupt to offline secondary thread? help?? */
+	stw	r8,HSTATE_SAVED_XIRR(r13)
+	b	kvm_no_guest
 
 .global kvmppc_hv_entry
 kvmppc_hv_entry:
@@ -260,6 +257,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	lwz	r5, LPPACA_YIELDCOUNT(r3)
 	addi	r5, r5, 1
 	stw	r5, LPPACA_YIELDCOUNT(r3)
+	li	r6, 1
+	stb	r6, VCPU_VPA_DIRTY(r4)
 25:
 	/* Load up DAR and DSISR */
 	ld	r5, VCPU_DAR(r4)
@@ -485,20 +484,20 @@ toc_tlbie_lock:
 	mtctr	r6
 	mtxer	r7
 
+	ld	r10, VCPU_PC(r4)
+	ld	r11, VCPU_MSR(r4)
 kvmppc_cede_reentry:		/* r4 = vcpu, r13 = paca */
 	ld	r6, VCPU_SRR0(r4)
 	ld	r7, VCPU_SRR1(r4)
-	ld	r10, VCPU_PC(r4)
-	ld	r11, VCPU_MSR(r4)	/* r11 = vcpu->arch.msr & ~MSR_HV */
 
+	/* r11 = vcpu->arch.msr & ~MSR_HV */
 	rldicl	r11, r11, 63 - MSR_HV_LG, 1
 	rotldi	r11, r11, 1 + MSR_HV_LG
 	ori	r11, r11, MSR_ME
 
 	/* Check if we can deliver an external or decrementer interrupt now */
 	ld	r0,VCPU_PENDING_EXC(r4)
-	li	r8,(1 << BOOK3S_IRQPRIO_EXTERNAL)
-	oris	r8,r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
+	lis	r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
 	and	r0,r0,r8
 	cmpdi	cr1,r0,0
 	andi.	r0,r11,MSR_EE
@@ -526,10 +525,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	/* Move SRR0 and SRR1 into the respective regs */
 5:	mtspr	SPRN_SRR0, r6
 	mtspr	SPRN_SRR1, r7
-	li	r0,0
-	stb	r0,VCPU_CEDED(r4)	/* cancel cede */
 
 fast_guest_return:
+	li	r0,0
+	stb	r0,VCPU_CEDED(r4)	/* cancel cede */
 	mtspr	SPRN_HSRR0,r10
 	mtspr	SPRN_HSRR1,r11
 
@@ -676,17 +675,99 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	cmpwi	r12,BOOK3S_INTERRUPT_SYSCALL
 	beq	hcall_try_real_mode
 
-	/* Check for mediated interrupts (could be done earlier really ...) */
+	/* Only handle external interrupts here on arch 206 and later */
 BEGIN_FTR_SECTION
-	cmpwi	r12,BOOK3S_INTERRUPT_EXTERNAL
-	bne+	1f
-	andi.	r0,r11,MSR_EE
-	beq	1f
-	mfspr	r5,SPRN_LPCR
-	andi.	r0,r5,LPCR_MER
-	bne	bounce_ext_interrupt
-1:
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+	b	ext_interrupt_to_host
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
+
+	/* External interrupt ? */
+	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
+	bne+	ext_interrupt_to_host
+
+	/* External interrupt, first check for host_ipi. If this is
+	 * set, we know the host wants us out so let's do it now
+	 */
+do_ext_interrupt:
+	lbz	r0, HSTATE_HOST_IPI(r13)
+	cmpwi	r0, 0
+	bne	ext_interrupt_to_host
+
+	/* Now read the interrupt from the ICP */
+	ld	r5, HSTATE_XICS_PHYS(r13)
+	li	r7, XICS_XIRR
+	cmpdi	r5, 0
+	beq-	ext_interrupt_to_host
+	lwzcix	r3, r5, r7
+	rlwinm.	r0, r3, 0, 0xffffff
+	sync
+	beq	3f		/* if nothing pending in the ICP */
+
+	/* We found something in the ICP...
+	 *
+	 * If it's not an IPI, stash it in the PACA and return to
+	 * the host, we don't (yet) handle directing real external
+	 * interrupts directly to the guest
+	 */
+	cmpwi	r0, XICS_IPI
+	bne	ext_stash_for_host
+
+	/* It's an IPI, clear the MFRR and EOI it */
+	li	r0, 0xff
+	li	r6, XICS_MFRR
+	stbcix	r0, r5, r6		/* clear the IPI */
+	stwcix	r3, r5, r7		/* EOI it */
+	sync
+
+	/* We need to re-check host IPI now in case it got set in the
+	 * meantime. If it's clear, we bounce the interrupt to the
+	 * guest
+	 */
+	lbz	r0, HSTATE_HOST_IPI(r13)
+	cmpwi	r0, 0
+	bne-	1f
+
+	/* Allright, looks like an IPI for the guest, we need to set MER */
+3:
+	/* Check if any CPU is heading out to the host, if so head out too */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	lwz	r0, VCORE_ENTRY_EXIT(r5)
+	cmpwi	r0, 0x100
+	bge	ext_interrupt_to_host
+
+	/* See if there is a pending interrupt for the guest */
+	mfspr	r8, SPRN_LPCR
+	ld	r0, VCPU_PENDING_EXC(r9)
+	/* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */
+	rldicl.	r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63
+	rldimi	r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH
+	beq	2f
+
+	/* And if the guest EE is set, we can deliver immediately, else
+	 * we return to the guest with MER set
+	 */
+	andi.	r0, r11, MSR_EE
+	beq	2f
+	mtspr	SPRN_SRR0, r10
+	mtspr	SPRN_SRR1, r11
+	li	r10, BOOK3S_INTERRUPT_EXTERNAL
+	li	r11, (MSR_ME << 1) | 1	/* synthesize MSR_SF | MSR_ME */
+	rotldi	r11, r11, 63
+2:	mr	r4, r9
+	mtspr	SPRN_LPCR, r8
+	b	fast_guest_return
+
+	/* We raced with the host, we need to resend that IPI, bummer */
+1:	li	r0, IPI_PRIORITY
+	stbcix	r0, r5, r6		/* set the IPI */
+	sync
+	b	ext_interrupt_to_host
+
+ext_stash_for_host:
+	/* It's not an IPI and it's for the host, stash it in the PACA
+	 * before exit, it will be picked up by the host ICP driver
+	 */
+	stw	r3, HSTATE_SAVED_XIRR(r13)
+ext_interrupt_to_host:
 
 guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
 	/* Save DEC */
@@ -829,7 +910,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	beq	44f
 	ld	r8,HSTATE_XICS_PHYS(r6)	/* get thread's XICS reg addr */
 	li	r0,IPI_PRIORITY
-	li	r7,XICS_QIRR
+	li	r7,XICS_MFRR
 	stbcix	r0,r7,r8		/* trigger the IPI */
 44:	srdi.	r3,r3,1
 	addi	r6,r6,PACA_SIZE
@@ -1018,6 +1099,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	lwz	r3, LPPACA_YIELDCOUNT(r8)
 	addi	r3, r3, 1
 	stw	r3, LPPACA_YIELDCOUNT(r8)
+	li	r3, 1
+	stb	r3, VCPU_VPA_DIRTY(r9)
 25:
 	/* Save PMU registers if requested */
 	/* r8 and cr0.eq are live here */
@@ -1350,11 +1433,19 @@ hcall_real_table:
 	.long	0		/* 0x58 */
 	.long	0		/* 0x5c */
 	.long	0		/* 0x60 */
-	.long	0		/* 0x64 */
-	.long	0		/* 0x68 */
-	.long	0		/* 0x6c */
-	.long	0		/* 0x70 */
-	.long	0		/* 0x74 */
+#ifdef CONFIG_KVM_XICS
+	.long	.kvmppc_rm_h_eoi - hcall_real_table
+	.long	.kvmppc_rm_h_cppr - hcall_real_table
+	.long	.kvmppc_rm_h_ipi - hcall_real_table
+	.long	0		/* 0x70 - H_IPOLL */
+	.long	.kvmppc_rm_h_xirr - hcall_real_table
+#else
+	.long	0		/* 0x64 - H_EOI */
+	.long	0		/* 0x68 - H_CPPR */
+	.long	0		/* 0x6c - H_IPI */
+	.long	0		/* 0x70 - H_IPOLL */
+	.long	0		/* 0x74 - H_XIRR */
+#endif
 	.long	0		/* 0x78 */
 	.long	0		/* 0x7c */
 	.long	0		/* 0x80 */
@@ -1405,15 +1496,6 @@ ignore_hdec:
 	mr	r4,r9
 	b	fast_guest_return
 
-bounce_ext_interrupt:
-	mr	r4,r9
-	mtspr	SPRN_SRR0,r10
-	mtspr	SPRN_SRR1,r11
-	li	r10,BOOK3S_INTERRUPT_EXTERNAL
-	li	r11,(MSR_ME << 1) | 1	/* synthesize MSR_SF | MSR_ME */
-	rotldi	r11,r11,63
-	b	fast_guest_return
-
 _GLOBAL(kvmppc_h_set_dabr)
 	std	r4,VCPU_DABR(r3)
 	/* Work around P7 bug where DABR can get corrupted on mtspr */
@@ -1519,6 +1601,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
 	b	.
 
 kvm_end_cede:
+	/* get vcpu pointer */
+	ld	r4, HSTATE_KVM_VCPU(r13)
+
 	/* Woken by external or decrementer interrupt */
 	ld	r1, HSTATE_HOST_R1(r13)
 
@@ -1558,6 +1643,16 @@ kvm_end_cede:
 	li	r0,0
 	stb	r0,HSTATE_NAPPING(r13)
 
+	/* Check the wake reason in SRR1 to see why we got here */
+	mfspr	r3, SPRN_SRR1
+	rlwinm	r3, r3, 44-31, 0x7	/* extract wake reason field */
+	cmpwi	r3, 4			/* was it an external interrupt? */
+	li	r12, BOOK3S_INTERRUPT_EXTERNAL
+	mr	r9, r4
+	ld	r10, VCPU_PC(r9)
+	ld	r11, VCPU_MSR(r9)
+	beq	do_ext_interrupt	/* if so */
+
 	/* see if any other thread is already exiting */
 	lwz	r0,VCORE_ENTRY_EXIT(r5)
 	cmpwi	r0,0x100
@@ -1577,8 +1672,7 @@ kvm_cede_prodded:
 
 	/* we've ceded but we want to give control to the host */
 kvm_cede_exit:
-	li	r3,H_TOO_HARD
-	blr
+	b	hcall_real_fallback
 
 	/* Try to handle a machine check in real mode */
 machine_check_realmode:
@@ -1626,7 +1720,7 @@ secondary_nap:
 	beq	37f
 	sync
 	li	r0, 0xff
-	li	r6, XICS_QIRR
+	li	r6, XICS_MFRR
 	stbcix	r0, r5, r6		/* clear the IPI */
 	stwcix	r3, r5, r7		/* EOI it */
 37:	sync
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index dbdc15aa8127..bdc40b8e77d9 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -762,9 +762,7 @@ program_interrupt:
 			run->exit_reason = KVM_EXIT_MMIO;
 			r = RESUME_HOST_NV;
 			break;
-		case EMULATE_DO_PAPR:
-			run->exit_reason = KVM_EXIT_PAPR_HCALL;
-			vcpu->arch.hcall_needed = 1;
+		case EMULATE_EXIT_USER:
 			r = RESUME_HOST_NV;
 			break;
 		default:
@@ -1283,7 +1281,7 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem,
-				struct kvm_memory_slot old)
+				const struct kvm_memory_slot *old)
 {
 }
 
@@ -1298,6 +1296,7 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 {
 #ifdef CONFIG_PPC64
 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+	INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
 #endif
 
 	if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index ee02b30878ed..b24309c6c2d5 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -227,6 +227,13 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
 	return EMULATE_DONE;
 }
 
+static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
+{
+	long rc = kvmppc_xics_hcall(vcpu, cmd);
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
 int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 {
 	switch (cmd) {
@@ -246,6 +253,20 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
 		vcpu->stat.halt_wakeup++;
 		return EMULATE_DONE;
+	case H_XIRR:
+	case H_CPPR:
+	case H_EOI:
+	case H_IPI:
+		if (kvmppc_xics_enabled(vcpu))
+			return kvmppc_h_pr_xics_hcall(vcpu, cmd);
+		break;
+	case H_RTAS:
+		if (list_empty(&vcpu->kvm->arch.rtas_tokens))
+			return RESUME_HOST;
+		if (kvmppc_rtas_hcall(vcpu))
+			break;
+		kvmppc_set_gpr(vcpu, 3, 0);
+		return EMULATE_DONE;
 	}
 
 	return EMULATE_FAIL;
diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c
new file mode 100644
index 000000000000..3219ba895246
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_rtas.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/err.h>
+
+#include <asm/uaccess.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/rtas.h>
+
+#ifdef CONFIG_KVM_XICS
+static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+	u32 irq, server, priority;
+	int rc;
+
+	if (args->nargs != 3 || args->nret != 1) {
+		rc = -3;
+		goto out;
+	}
+
+	irq = args->args[0];
+	server = args->args[1];
+	priority = args->args[2];
+
+	rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority);
+	if (rc)
+		rc = -3;
+out:
+	args->rets[0] = rc;
+}
+
+static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+	u32 irq, server, priority;
+	int rc;
+
+	if (args->nargs != 1 || args->nret != 3) {
+		rc = -3;
+		goto out;
+	}
+
+	irq = args->args[0];
+
+	server = priority = 0;
+	rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority);
+	if (rc) {
+		rc = -3;
+		goto out;
+	}
+
+	args->rets[1] = server;
+	args->rets[2] = priority;
+out:
+	args->rets[0] = rc;
+}
+
+static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+	u32 irq;
+	int rc;
+
+	if (args->nargs != 1 || args->nret != 1) {
+		rc = -3;
+		goto out;
+	}
+
+	irq = args->args[0];
+
+	rc = kvmppc_xics_int_off(vcpu->kvm, irq);
+	if (rc)
+		rc = -3;
+out:
+	args->rets[0] = rc;
+}
+
+static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+	u32 irq;
+	int rc;
+
+	if (args->nargs != 1 || args->nret != 1) {
+		rc = -3;
+		goto out;
+	}
+
+	irq = args->args[0];
+
+	rc = kvmppc_xics_int_on(vcpu->kvm, irq);
+	if (rc)
+		rc = -3;
+out:
+	args->rets[0] = rc;
+}
+#endif /* CONFIG_KVM_XICS */
+
+struct rtas_handler {
+	void (*handler)(struct kvm_vcpu *vcpu, struct rtas_args *args);
+	char *name;
+};
+
+static struct rtas_handler rtas_handlers[] = {
+#ifdef CONFIG_KVM_XICS
+	{ .name = "ibm,set-xive", .handler = kvm_rtas_set_xive },
+	{ .name = "ibm,get-xive", .handler = kvm_rtas_get_xive },
+	{ .name = "ibm,int-off",  .handler = kvm_rtas_int_off },
+	{ .name = "ibm,int-on",   .handler = kvm_rtas_int_on },
+#endif
+};
+
+struct rtas_token_definition {
+	struct list_head list;
+	struct rtas_handler *handler;
+	u64 token;
+};
+
+static int rtas_name_matches(char *s1, char *s2)
+{
+	struct kvm_rtas_token_args args;
+	return !strncmp(s1, s2, sizeof(args.name));
+}
+
+static int rtas_token_undefine(struct kvm *kvm, char *name)
+{
+	struct rtas_token_definition *d, *tmp;
+
+	lockdep_assert_held(&kvm->lock);
+
+	list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) {
+		if (rtas_name_matches(d->handler->name, name)) {
+			list_del(&d->list);
+			kfree(d);
+			return 0;
+		}
+	}
+
+	/* It's not an error to undefine an undefined token */
+	return 0;
+}
+
+static int rtas_token_define(struct kvm *kvm, char *name, u64 token)
+{
+	struct rtas_token_definition *d;
+	struct rtas_handler *h = NULL;
+	bool found;
+	int i;
+
+	lockdep_assert_held(&kvm->lock);
+
+	list_for_each_entry(d, &kvm->arch.rtas_tokens, list) {
+		if (d->token == token)
+			return -EEXIST;
+	}
+
+	found = false;
+	for (i = 0; i < ARRAY_SIZE(rtas_handlers); i++) {
+		h = &rtas_handlers[i];
+		if (rtas_name_matches(h->name, name)) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found)
+		return -ENOENT;
+
+	d = kzalloc(sizeof(*d), GFP_KERNEL);
+	if (!d)
+		return -ENOMEM;
+
+	d->handler = h;
+	d->token = token;
+
+	list_add_tail(&d->list, &kvm->arch.rtas_tokens);
+
+	return 0;
+}
+
+int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp)
+{
+	struct kvm_rtas_token_args args;
+	int rc;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+
+	mutex_lock(&kvm->lock);
+
+	if (args.token)
+		rc = rtas_token_define(kvm, args.name, args.token);
+	else
+		rc = rtas_token_undefine(kvm, args.name);
+
+	mutex_unlock(&kvm->lock);
+
+	return rc;
+}
+
+int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu)
+{
+	struct rtas_token_definition *d;
+	struct rtas_args args;
+	rtas_arg_t *orig_rets;
+	gpa_t args_phys;
+	int rc;
+
+	/* r4 contains the guest physical address of the RTAS args */
+	args_phys = kvmppc_get_gpr(vcpu, 4);
+
+	rc = kvm_read_guest(vcpu->kvm, args_phys, &args, sizeof(args));
+	if (rc)
+		goto fail;
+
+	/*
+	 * args->rets is a pointer into args->args. Now that we've
+	 * copied args we need to fix it up to point into our copy,
+	 * not the guest args. We also need to save the original
+	 * value so we can restore it on the way out.
+	 */
+	orig_rets = args.rets;
+	args.rets = &args.args[args.nargs];
+
+	mutex_lock(&vcpu->kvm->lock);
+
+	rc = -ENOENT;
+	list_for_each_entry(d, &vcpu->kvm->arch.rtas_tokens, list) {
+		if (d->token == args.token) {
+			d->handler->handler(vcpu, &args);
+			rc = 0;
+			break;
+		}
+	}
+
+	mutex_unlock(&vcpu->kvm->lock);
+
+	if (rc == 0) {
+		args.rets = orig_rets;
+		rc = kvm_write_guest(vcpu->kvm, args_phys, &args, sizeof(args));
+		if (rc)
+			goto fail;
+	}
+
+	return rc;
+
+fail:
+	/*
+	 * We only get here if the guest has called RTAS with a bogus
+	 * args pointer. That means we can't get to the args, and so we
+	 * can't fail the RTAS call. So fail right out to userspace,
+	 * which should kill the guest.
+	 */
+	return rc;
+}
+
+void kvmppc_rtas_tokens_free(struct kvm *kvm)
+{
+	struct rtas_token_definition *d, *tmp;
+
+	lockdep_assert_held(&kvm->lock);
+
+	list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) {
+		list_del(&d->list);
+		kfree(d);
+	}
+}
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
new file mode 100644
index 000000000000..f7a103756618
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -0,0 +1,1270 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+#include <linux/anon_inodes.h>
+
+#include <asm/uaccess.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xics.h>
+#include <asm/debug.h>
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "book3s_xics.h"
+
+#if 1
+#define XICS_DBG(fmt...) do { } while (0)
+#else
+#define XICS_DBG(fmt...) trace_printk(fmt)
+#endif
+
+#define ENABLE_REALMODE	true
+#define DEBUG_REALMODE	false
+
+/*
+ * LOCKING
+ * =======
+ *
+ * Each ICS has a mutex protecting the information about the IRQ
+ * sources and avoiding simultaneous deliveries if the same interrupt.
+ *
+ * ICP operations are done via a single compare & swap transaction
+ * (most ICP state fits in the union kvmppc_icp_state)
+ */
+
+/*
+ * TODO
+ * ====
+ *
+ * - To speed up resends, keep a bitmap of "resend" set bits in the
+ *   ICS
+ *
+ * - Speed up server# -> ICP lookup (array ? hash table ?)
+ *
+ * - Make ICS lockless as well, or at least a per-interrupt lock or hashed
+ *   locks array to improve scalability
+ */
+
+/* -- ICS routines -- */
+
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq);
+
+static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level,
+			   bool report_status)
+{
+	struct ics_irq_state *state;
+	struct kvmppc_ics *ics;
+	u16 src;
+
+	XICS_DBG("ics deliver %#x (level: %d)\n", irq, level);
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics) {
+		XICS_DBG("ics_deliver_irq: IRQ 0x%06x not found !\n", irq);
+		return -EINVAL;
+	}
+	state = &ics->irq_state[src];
+	if (!state->exists)
+		return -EINVAL;
+
+	if (report_status)
+		return state->asserted;
+
+	/*
+	 * We set state->asserted locklessly. This should be fine as
+	 * we are the only setter, thus concurrent access is undefined
+	 * to begin with.
+	 */
+	if (level == KVM_INTERRUPT_SET_LEVEL)
+		state->asserted = 1;
+	else if (level == KVM_INTERRUPT_UNSET) {
+		state->asserted = 0;
+		return 0;
+	}
+
+	/* Attempt delivery */
+	icp_deliver_irq(xics, NULL, irq);
+
+	return state->asserted;
+}
+
+static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
+			     struct kvmppc_icp *icp)
+{
+	int i;
+
+	mutex_lock(&ics->lock);
+
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		struct ics_irq_state *state = &ics->irq_state[i];
+
+		if (!state->resend)
+			continue;
+
+		XICS_DBG("resend %#x prio %#x\n", state->number,
+			      state->priority);
+
+		mutex_unlock(&ics->lock);
+		icp_deliver_irq(xics, icp, state->number);
+		mutex_lock(&ics->lock);
+	}
+
+	mutex_unlock(&ics->lock);
+}
+
+static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
+		       struct ics_irq_state *state,
+		       u32 server, u32 priority, u32 saved_priority)
+{
+	bool deliver;
+
+	mutex_lock(&ics->lock);
+
+	state->server = server;
+	state->priority = priority;
+	state->saved_priority = saved_priority;
+	deliver = false;
+	if ((state->masked_pending || state->resend) && priority != MASKED) {
+		state->masked_pending = 0;
+		deliver = true;
+	}
+
+	mutex_unlock(&ics->lock);
+
+	return deliver;
+}
+
+int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_icp *icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+
+	if (!xics)
+		return -ENODEV;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return -EINVAL;
+	state = &ics->irq_state[src];
+
+	icp = kvmppc_xics_find_server(kvm, server);
+	if (!icp)
+		return -EINVAL;
+
+	XICS_DBG("set_xive %#x server %#x prio %#x MP:%d RS:%d\n",
+		 irq, server, priority,
+		 state->masked_pending, state->resend);
+
+	if (write_xive(xics, ics, state, server, priority, priority))
+		icp_deliver_irq(xics, icp, irq);
+
+	return 0;
+}
+
+int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+
+	if (!xics)
+		return -ENODEV;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return -EINVAL;
+	state = &ics->irq_state[src];
+
+	mutex_lock(&ics->lock);
+	*server = state->server;
+	*priority = state->priority;
+	mutex_unlock(&ics->lock);
+
+	return 0;
+}
+
+int kvmppc_xics_int_on(struct kvm *kvm, u32 irq)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_icp *icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+
+	if (!xics)
+		return -ENODEV;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return -EINVAL;
+	state = &ics->irq_state[src];
+
+	icp = kvmppc_xics_find_server(kvm, state->server);
+	if (!icp)
+		return -EINVAL;
+
+	if (write_xive(xics, ics, state, state->server, state->saved_priority,
+		       state->saved_priority))
+		icp_deliver_irq(xics, icp, irq);
+
+	return 0;
+}
+
+int kvmppc_xics_int_off(struct kvm *kvm, u32 irq)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+
+	if (!xics)
+		return -ENODEV;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return -EINVAL;
+	state = &ics->irq_state[src];
+
+	write_xive(xics, ics, state, state->server, MASKED, state->priority);
+
+	return 0;
+}
+
+/* -- ICP routines, including hcalls -- */
+
+static inline bool icp_try_update(struct kvmppc_icp *icp,
+				  union kvmppc_icp_state old,
+				  union kvmppc_icp_state new,
+				  bool change_self)
+{
+	bool success;
+
+	/* Calculate new output value */
+	new.out_ee = (new.xisr && (new.pending_pri < new.cppr));
+
+	/* Attempt atomic update */
+	success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
+	if (!success)
+		goto bail;
+
+	XICS_DBG("UPD [%04x] - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+		 icp->server_num,
+		 old.cppr, old.mfrr, old.pending_pri, old.xisr,
+		 old.need_resend, old.out_ee);
+	XICS_DBG("UPD        - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+		 new.cppr, new.mfrr, new.pending_pri, new.xisr,
+		 new.need_resend, new.out_ee);
+	/*
+	 * Check for output state update
+	 *
+	 * Note that this is racy since another processor could be updating
+	 * the state already. This is why we never clear the interrupt output
+	 * here, we only ever set it. The clear only happens prior to doing
+	 * an update and only by the processor itself. Currently we do it
+	 * in Accept (H_XIRR) and Up_Cppr (H_XPPR).
+	 *
+	 * We also do not try to figure out whether the EE state has changed,
+	 * we unconditionally set it if the new state calls for it. The reason
+	 * for that is that we opportunistically remove the pending interrupt
+	 * flag when raising CPPR, so we need to set it back here if an
+	 * interrupt is still pending.
+	 */
+	if (new.out_ee) {
+		kvmppc_book3s_queue_irqprio(icp->vcpu,
+					    BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+		if (!change_self)
+			kvmppc_fast_vcpu_kick(icp->vcpu);
+	}
+ bail:
+	return success;
+}
+
+static void icp_check_resend(struct kvmppc_xics *xics,
+			     struct kvmppc_icp *icp)
+{
+	u32 icsid;
+
+	/* Order this load with the test for need_resend in the caller */
+	smp_rmb();
+	for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) {
+		struct kvmppc_ics *ics = xics->ics[icsid];
+
+		if (!test_and_clear_bit(icsid, icp->resend_map))
+			continue;
+		if (!ics)
+			continue;
+		ics_check_resend(xics, ics, icp);
+	}
+}
+
+static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
+			       u32 *reject)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool success;
+
+	XICS_DBG("try deliver %#x(P:%#x) to server %#x\n", irq, priority,
+		 icp->server_num);
+
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		*reject = 0;
+
+		/* See if we can deliver */
+		success = new_state.cppr > priority &&
+			new_state.mfrr > priority &&
+			new_state.pending_pri > priority;
+
+		/*
+		 * If we can, check for a rejection and perform the
+		 * delivery
+		 */
+		if (success) {
+			*reject = new_state.xisr;
+			new_state.xisr = irq;
+			new_state.pending_pri = priority;
+		} else {
+			/*
+			 * If we failed to deliver we set need_resend
+			 * so a subsequent CPPR state change causes us
+			 * to try a new delivery.
+			 */
+			new_state.need_resend = true;
+		}
+
+	} while (!icp_try_update(icp, old_state, new_state, false));
+
+	return success;
+}
+
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq)
+{
+	struct ics_irq_state *state;
+	struct kvmppc_ics *ics;
+	u32 reject;
+	u16 src;
+
+	/*
+	 * This is used both for initial delivery of an interrupt and
+	 * for subsequent rejection.
+	 *
+	 * Rejection can be racy vs. resends. We have evaluated the
+	 * rejection in an atomic ICP transaction which is now complete,
+	 * so potentially the ICP can already accept the interrupt again.
+	 *
+	 * So we need to retry the delivery. Essentially the reject path
+	 * boils down to a failed delivery. Always.
+	 *
+	 * Now the interrupt could also have moved to a different target,
+	 * thus we may need to re-do the ICP lookup as well
+	 */
+
+ again:
+	/* Get the ICS state and lock it */
+	ics = kvmppc_xics_find_ics(xics, new_irq, &src);
+	if (!ics) {
+		XICS_DBG("icp_deliver_irq: IRQ 0x%06x not found !\n", new_irq);
+		return;
+	}
+	state = &ics->irq_state[src];
+
+	/* Get a lock on the ICS */
+	mutex_lock(&ics->lock);
+
+	/* Get our server */
+	if (!icp || state->server != icp->server_num) {
+		icp = kvmppc_xics_find_server(xics->kvm, state->server);
+		if (!icp) {
+			pr_warn("icp_deliver_irq: IRQ 0x%06x server 0x%x not found !\n",
+				new_irq, state->server);
+			goto out;
+		}
+	}
+
+	/* Clear the resend bit of that interrupt */
+	state->resend = 0;
+
+	/*
+	 * If masked, bail out
+	 *
+	 * Note: PAPR doesn't mention anything about masked pending
+	 * when doing a resend, only when doing a delivery.
+	 *
+	 * However that would have the effect of losing a masked
+	 * interrupt that was rejected and isn't consistent with
+	 * the whole masked_pending business which is about not
+	 * losing interrupts that occur while masked.
+	 *
+	 * I don't differenciate normal deliveries and resends, this
+	 * implementation will differ from PAPR and not lose such
+	 * interrupts.
+	 */
+	if (state->priority == MASKED) {
+		XICS_DBG("irq %#x masked pending\n", new_irq);
+		state->masked_pending = 1;
+		goto out;
+	}
+
+	/*
+	 * Try the delivery, this will set the need_resend flag
+	 * in the ICP as part of the atomic transaction if the
+	 * delivery is not possible.
+	 *
+	 * Note that if successful, the new delivery might have itself
+	 * rejected an interrupt that was "delivered" before we took the
+	 * icp mutex.
+	 *
+	 * In this case we do the whole sequence all over again for the
+	 * new guy. We cannot assume that the rejected interrupt is less
+	 * favored than the new one, and thus doesn't need to be delivered,
+	 * because by the time we exit icp_try_to_deliver() the target
+	 * processor may well have alrady consumed & completed it, and thus
+	 * the rejected interrupt might actually be already acceptable.
+	 */
+	if (icp_try_to_deliver(icp, new_irq, state->priority, &reject)) {
+		/*
+		 * Delivery was successful, did we reject somebody else ?
+		 */
+		if (reject && reject != XICS_IPI) {
+			mutex_unlock(&ics->lock);
+			new_irq = reject;
+			goto again;
+		}
+	} else {
+		/*
+		 * We failed to deliver the interrupt we need to set the
+		 * resend map bit and mark the ICS state as needing a resend
+		 */
+		set_bit(ics->icsid, icp->resend_map);
+		state->resend = 1;
+
+		/*
+		 * If the need_resend flag got cleared in the ICP some time
+		 * between icp_try_to_deliver() atomic update and now, then
+		 * we know it might have missed the resend_map bit. So we
+		 * retry
+		 */
+		smp_mb();
+		if (!icp->state.need_resend) {
+			mutex_unlock(&ics->lock);
+			goto again;
+		}
+	}
+ out:
+	mutex_unlock(&ics->lock);
+}
+
+static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			  u8 new_cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool resend;
+
+	/*
+	 * This handles several related states in one operation:
+	 *
+	 * ICP State: Down_CPPR
+	 *
+	 * Load CPPR with new value and if the XISR is 0
+	 * then check for resends:
+	 *
+	 * ICP State: Resend
+	 *
+	 * If MFRR is more favored than CPPR, check for IPIs
+	 * and notify ICS of a potential resend. This is done
+	 * asynchronously (when used in real mode, we will have
+	 * to exit here).
+	 *
+	 * We do not handle the complete Check_IPI as documented
+	 * here. In the PAPR, this state will be used for both
+	 * Set_MFRR and Down_CPPR. However, we know that we aren't
+	 * changing the MFRR state here so we don't need to handle
+	 * the case of an MFRR causing a reject of a pending irq,
+	 * this will have been handled when the MFRR was set in the
+	 * first place.
+	 *
+	 * Thus we don't have to handle rejects, only resends.
+	 *
+	 * When implementing real mode for HV KVM, resend will lead to
+	 * a H_TOO_HARD return and the whole transaction will be handled
+	 * in virtual mode.
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		/* Down_CPPR */
+		new_state.cppr = new_cppr;
+
+		/*
+		 * Cut down Resend / Check_IPI / IPI
+		 *
+		 * The logic is that we cannot have a pending interrupt
+		 * trumped by an IPI at this point (see above), so we
+		 * know that either the pending interrupt is already an
+		 * IPI (in which case we don't care to override it) or
+		 * it's either more favored than us or non existent
+		 */
+		if (new_state.mfrr < new_cppr &&
+		    new_state.mfrr <= new_state.pending_pri) {
+			WARN_ON(new_state.xisr != XICS_IPI &&
+				new_state.xisr != 0);
+			new_state.pending_pri = new_state.mfrr;
+			new_state.xisr = XICS_IPI;
+		}
+
+		/* Latch/clear resend bit */
+		resend = new_state.need_resend;
+		new_state.need_resend = 0;
+
+	} while (!icp_try_update(icp, old_state, new_state, true));
+
+	/*
+	 * Now handle resend checks. Those are asynchronous to the ICP
+	 * state update in HW (ie bus transactions) so we can handle them
+	 * separately here too
+	 */
+	if (resend)
+		icp_check_resend(xics, icp);
+}
+
+static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 xirr;
+
+	/* First, remove EE from the processor */
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+	/*
+	 * ICP State: Accept_Interrupt
+	 *
+	 * Return the pending interrupt (if any) along with the
+	 * current CPPR, then clear the XISR & set CPPR to the
+	 * pending priority
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		xirr = old_state.xisr | (((u32)old_state.cppr) << 24);
+		if (!old_state.xisr)
+			break;
+		new_state.cppr = new_state.pending_pri;
+		new_state.pending_pri = 0xff;
+		new_state.xisr = 0;
+
+	} while (!icp_try_update(icp, old_state, new_state, true));
+
+	XICS_DBG("h_xirr vcpu %d xirr %#x\n", vcpu->vcpu_id, xirr);
+
+	return xirr;
+}
+
+static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+				 unsigned long mfrr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp;
+	u32 reject;
+	bool resend;
+	bool local;
+
+	XICS_DBG("h_ipi vcpu %d to server %lu mfrr %#lx\n",
+		 vcpu->vcpu_id, server, mfrr);
+
+	icp = vcpu->arch.icp;
+	local = icp->server_num == server;
+	if (!local) {
+		icp = kvmppc_xics_find_server(vcpu->kvm, server);
+		if (!icp)
+			return H_PARAMETER;
+	}
+
+	/*
+	 * ICP state: Set_MFRR
+	 *
+	 * If the CPPR is more favored than the new MFRR, then
+	 * nothing needs to be rejected as there can be no XISR to
+	 * reject.  If the MFRR is being made less favored then
+	 * there might be a previously-rejected interrupt needing
+	 * to be resent.
+	 *
+	 * If the CPPR is less favored, then we might be replacing
+	 * an interrupt, and thus need to possibly reject it as in
+	 *
+	 * ICP state: Check_IPI
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		/* Set_MFRR */
+		new_state.mfrr = mfrr;
+
+		/* Check_IPI */
+		reject = 0;
+		resend = false;
+		if (mfrr < new_state.cppr) {
+			/* Reject a pending interrupt if not an IPI */
+			if (mfrr <= new_state.pending_pri)
+				reject = new_state.xisr;
+			new_state.pending_pri = mfrr;
+			new_state.xisr = XICS_IPI;
+		}
+
+		if (mfrr > old_state.mfrr && mfrr > new_state.cppr) {
+			resend = new_state.need_resend;
+			new_state.need_resend = 0;
+		}
+	} while (!icp_try_update(icp, old_state, new_state, local));
+
+	/* Handle reject */
+	if (reject && reject != XICS_IPI)
+		icp_deliver_irq(xics, icp, reject);
+
+	/* Handle resend */
+	if (resend)
+		icp_check_resend(xics, icp);
+
+	return H_SUCCESS;
+}
+
+static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 reject;
+
+	XICS_DBG("h_cppr vcpu %d cppr %#lx\n", vcpu->vcpu_id, cppr);
+
+	/*
+	 * ICP State: Set_CPPR
+	 *
+	 * We can safely compare the new value with the current
+	 * value outside of the transaction as the CPPR is only
+	 * ever changed by the processor on itself
+	 */
+	if (cppr > icp->state.cppr)
+		icp_down_cppr(xics, icp, cppr);
+	else if (cppr == icp->state.cppr)
+		return;
+
+	/*
+	 * ICP State: Up_CPPR
+	 *
+	 * The processor is raising its priority, this can result
+	 * in a rejection of a pending interrupt:
+	 *
+	 * ICP State: Reject_Current
+	 *
+	 * We can remove EE from the current processor, the update
+	 * transaction will set it again if needed
+	 */
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		reject = 0;
+		new_state.cppr = cppr;
+
+		if (cppr <= new_state.pending_pri) {
+			reject = new_state.xisr;
+			new_state.xisr = 0;
+			new_state.pending_pri = 0xff;
+		}
+
+	} while (!icp_try_update(icp, old_state, new_state, true));
+
+	/*
+	 * Check for rejects. They are handled by doing a new delivery
+	 * attempt (see comments in icp_deliver_irq).
+	 */
+	if (reject && reject != XICS_IPI)
+		icp_deliver_irq(xics, icp, reject);
+}
+
+static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u32 irq = xirr & 0x00ffffff;
+	u16 src;
+
+	XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr);
+
+	/*
+	 * ICP State: EOI
+	 *
+	 * Note: If EOI is incorrectly used by SW to lower the CPPR
+	 * value (ie more favored), we do not check for rejection of
+	 * a pending interrupt, this is a SW error and PAPR sepcifies
+	 * that we don't have to deal with it.
+	 *
+	 * The sending of an EOI to the ICS is handled after the
+	 * CPPR update
+	 *
+	 * ICP State: Down_CPPR which we handle
+	 * in a separate function as it's shared with H_CPPR.
+	 */
+	icp_down_cppr(xics, icp, xirr >> 24);
+
+	/* IPIs have no EOI */
+	if (irq == XICS_IPI)
+		return H_SUCCESS;
+	/*
+	 * EOI handling: If the interrupt is still asserted, we need to
+	 * resend it. We can take a lockless "peek" at the ICS state here.
+	 *
+	 * "Message" interrupts will never have "asserted" set
+	 */
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics) {
+		XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq);
+		return H_PARAMETER;
+	}
+	state = &ics->irq_state[src];
+
+	/* Still asserted, resend it */
+	if (state->asserted)
+		icp_deliver_irq(xics, icp, irq);
+
+	return H_SUCCESS;
+}
+
+static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+
+	XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n",
+		 hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt);
+
+	if (icp->rm_action & XICS_RM_KICK_VCPU)
+		kvmppc_fast_vcpu_kick(icp->rm_kick_target);
+	if (icp->rm_action & XICS_RM_CHECK_RESEND)
+		icp_check_resend(xics, icp);
+	if (icp->rm_action & XICS_RM_REJECT)
+		icp_deliver_irq(xics, icp, icp->rm_reject);
+
+	icp->rm_action = 0;
+
+	return H_SUCCESS;
+}
+
+int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	unsigned long res;
+	int rc = H_SUCCESS;
+
+	/* Check if we have an ICP */
+	if (!xics || !vcpu->arch.icp)
+		return H_HARDWARE;
+
+	/* Check for real mode returning too hard */
+	if (xics->real_mode)
+		return kvmppc_xics_rm_complete(vcpu, req);
+
+	switch (req) {
+	case H_XIRR:
+		res = kvmppc_h_xirr(vcpu);
+		kvmppc_set_gpr(vcpu, 4, res);
+		break;
+	case H_CPPR:
+		kvmppc_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
+		break;
+	case H_EOI:
+		rc = kvmppc_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
+		break;
+	case H_IPI:
+		rc = kvmppc_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
+				  kvmppc_get_gpr(vcpu, 5));
+		break;
+	}
+
+	return rc;
+}
+
+
+/* -- Initialisation code etc. -- */
+
+static int xics_debug_show(struct seq_file *m, void *private)
+{
+	struct kvmppc_xics *xics = m->private;
+	struct kvm *kvm = xics->kvm;
+	struct kvm_vcpu *vcpu;
+	int icsid, i;
+
+	if (!kvm)
+		return 0;
+
+	seq_printf(m, "=========\nICP state\n=========\n");
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		struct kvmppc_icp *icp = vcpu->arch.icp;
+		union kvmppc_icp_state state;
+
+		if (!icp)
+			continue;
+
+		state.raw = ACCESS_ONCE(icp->state.raw);
+		seq_printf(m, "cpu server %#lx XIRR:%#x PPRI:%#x CPPR:%#x MFRR:%#x OUT:%d NR:%d\n",
+			   icp->server_num, state.xisr,
+			   state.pending_pri, state.cppr, state.mfrr,
+			   state.out_ee, state.need_resend);
+	}
+
+	for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) {
+		struct kvmppc_ics *ics = xics->ics[icsid];
+
+		if (!ics)
+			continue;
+
+		seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n",
+			   icsid);
+
+		mutex_lock(&ics->lock);
+
+		for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+			struct ics_irq_state *irq = &ics->irq_state[i];
+
+			seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n",
+				   irq->number, irq->server, irq->priority,
+				   irq->saved_priority, irq->asserted,
+				   irq->resend, irq->masked_pending);
+
+		}
+		mutex_unlock(&ics->lock);
+	}
+	return 0;
+}
+
+static int xics_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, xics_debug_show, inode->i_private);
+}
+
+static const struct file_operations xics_debug_fops = {
+	.open = xics_debug_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static void xics_debugfs_init(struct kvmppc_xics *xics)
+{
+	char *name;
+
+	name = kasprintf(GFP_KERNEL, "kvm-xics-%p", xics);
+	if (!name) {
+		pr_err("%s: no memory for name\n", __func__);
+		return;
+	}
+
+	xics->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root,
+					   xics, &xics_debug_fops);
+
+	pr_debug("%s: created %s\n", __func__, name);
+	kfree(name);
+}
+
+static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm,
+					struct kvmppc_xics *xics, int irq)
+{
+	struct kvmppc_ics *ics;
+	int i, icsid;
+
+	icsid = irq >> KVMPPC_XICS_ICS_SHIFT;
+
+	mutex_lock(&kvm->lock);
+
+	/* ICS already exists - somebody else got here first */
+	if (xics->ics[icsid])
+		goto out;
+
+	/* Create the ICS */
+	ics = kzalloc(sizeof(struct kvmppc_ics), GFP_KERNEL);
+	if (!ics)
+		goto out;
+
+	mutex_init(&ics->lock);
+	ics->icsid = icsid;
+
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		ics->irq_state[i].number = (icsid << KVMPPC_XICS_ICS_SHIFT) | i;
+		ics->irq_state[i].priority = MASKED;
+		ics->irq_state[i].saved_priority = MASKED;
+	}
+	smp_wmb();
+	xics->ics[icsid] = ics;
+
+	if (icsid > xics->max_icsid)
+		xics->max_icsid = icsid;
+
+ out:
+	mutex_unlock(&kvm->lock);
+	return xics->ics[icsid];
+}
+
+int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num)
+{
+	struct kvmppc_icp *icp;
+
+	if (!vcpu->kvm->arch.xics)
+		return -ENODEV;
+
+	if (kvmppc_xics_find_server(vcpu->kvm, server_num))
+		return -EEXIST;
+
+	icp = kzalloc(sizeof(struct kvmppc_icp), GFP_KERNEL);
+	if (!icp)
+		return -ENOMEM;
+
+	icp->vcpu = vcpu;
+	icp->server_num = server_num;
+	icp->state.mfrr = MASKED;
+	icp->state.pending_pri = MASKED;
+	vcpu->arch.icp = icp;
+
+	XICS_DBG("created server for vcpu %d\n", vcpu->vcpu_id);
+
+	return 0;
+}
+
+u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	union kvmppc_icp_state state;
+
+	if (!icp)
+		return 0;
+	state = icp->state;
+	return ((u64)state.cppr << KVM_REG_PPC_ICP_CPPR_SHIFT) |
+		((u64)state.xisr << KVM_REG_PPC_ICP_XISR_SHIFT) |
+		((u64)state.mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT) |
+		((u64)state.pending_pri << KVM_REG_PPC_ICP_PPRI_SHIFT);
+}
+
+int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
+{
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_ics *ics;
+	u8 cppr, mfrr, pending_pri;
+	u32 xisr;
+	u16 src;
+	bool resend;
+
+	if (!icp || !xics)
+		return -ENOENT;
+
+	cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT;
+	xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) &
+		KVM_REG_PPC_ICP_XISR_MASK;
+	mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT;
+	pending_pri = icpval >> KVM_REG_PPC_ICP_PPRI_SHIFT;
+
+	/* Require the new state to be internally consistent */
+	if (xisr == 0) {
+		if (pending_pri != 0xff)
+			return -EINVAL;
+	} else if (xisr == XICS_IPI) {
+		if (pending_pri != mfrr || pending_pri >= cppr)
+			return -EINVAL;
+	} else {
+		if (pending_pri >= mfrr || pending_pri >= cppr)
+			return -EINVAL;
+		ics = kvmppc_xics_find_ics(xics, xisr, &src);
+		if (!ics)
+			return -EINVAL;
+	}
+
+	new_state.raw = 0;
+	new_state.cppr = cppr;
+	new_state.xisr = xisr;
+	new_state.mfrr = mfrr;
+	new_state.pending_pri = pending_pri;
+
+	/*
+	 * Deassert the CPU interrupt request.
+	 * icp_try_update will reassert it if necessary.
+	 */
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+	/*
+	 * Note that if we displace an interrupt from old_state.xisr,
+	 * we don't mark it as rejected.  We expect userspace to set
+	 * the state of the interrupt sources to be consistent with
+	 * the ICP states (either before or afterwards, which doesn't
+	 * matter).  We do handle resends due to CPPR becoming less
+	 * favoured because that is necessary to end up with a
+	 * consistent state in the situation where userspace restores
+	 * the ICS states before the ICP states.
+	 */
+	do {
+		old_state = ACCESS_ONCE(icp->state);
+
+		if (new_state.mfrr <= old_state.mfrr) {
+			resend = false;
+			new_state.need_resend = old_state.need_resend;
+		} else {
+			resend = old_state.need_resend;
+			new_state.need_resend = 0;
+		}
+	} while (!icp_try_update(icp, old_state, new_state, false));
+
+	if (resend)
+		icp_check_resend(xics, icp);
+
+	return 0;
+}
+
+static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
+{
+	int ret;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *irqp;
+	u64 __user *ubufp = (u64 __user *) addr;
+	u16 idx;
+	u64 val, prio;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &idx);
+	if (!ics)
+		return -ENOENT;
+
+	irqp = &ics->irq_state[idx];
+	mutex_lock(&ics->lock);
+	ret = -ENOENT;
+	if (irqp->exists) {
+		val = irqp->server;
+		prio = irqp->priority;
+		if (prio == MASKED) {
+			val |= KVM_XICS_MASKED;
+			prio = irqp->saved_priority;
+		}
+		val |= prio << KVM_XICS_PRIORITY_SHIFT;
+		if (irqp->asserted)
+			val |= KVM_XICS_LEVEL_SENSITIVE | KVM_XICS_PENDING;
+		else if (irqp->masked_pending || irqp->resend)
+			val |= KVM_XICS_PENDING;
+		ret = 0;
+	}
+	mutex_unlock(&ics->lock);
+
+	if (!ret && put_user(val, ubufp))
+		ret = -EFAULT;
+
+	return ret;
+}
+
+static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
+{
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *irqp;
+	u64 __user *ubufp = (u64 __user *) addr;
+	u16 idx;
+	u64 val;
+	u8 prio;
+	u32 server;
+
+	if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS)
+		return -ENOENT;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &idx);
+	if (!ics) {
+		ics = kvmppc_xics_create_ics(xics->kvm, xics, irq);
+		if (!ics)
+			return -ENOMEM;
+	}
+	irqp = &ics->irq_state[idx];
+	if (get_user(val, ubufp))
+		return -EFAULT;
+
+	server = val & KVM_XICS_DESTINATION_MASK;
+	prio = val >> KVM_XICS_PRIORITY_SHIFT;
+	if (prio != MASKED &&
+	    kvmppc_xics_find_server(xics->kvm, server) == NULL)
+		return -EINVAL;
+
+	mutex_lock(&ics->lock);
+	irqp->server = server;
+	irqp->saved_priority = prio;
+	if (val & KVM_XICS_MASKED)
+		prio = MASKED;
+	irqp->priority = prio;
+	irqp->resend = 0;
+	irqp->masked_pending = 0;
+	irqp->asserted = 0;
+	if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE))
+		irqp->asserted = 1;
+	irqp->exists = 1;
+	mutex_unlock(&ics->lock);
+
+	if (val & KVM_XICS_PENDING)
+		icp_deliver_irq(xics, NULL, irqp->number);
+
+	return 0;
+}
+
+int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
+		bool line_status)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+
+	return ics_deliver_irq(xics, irq, level, line_status);
+}
+
+static int xics_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	struct kvmppc_xics *xics = dev->private;
+
+	switch (attr->group) {
+	case KVM_DEV_XICS_GRP_SOURCES:
+		return xics_set_source(xics, attr->attr, attr->addr);
+	}
+	return -ENXIO;
+}
+
+static int xics_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	struct kvmppc_xics *xics = dev->private;
+
+	switch (attr->group) {
+	case KVM_DEV_XICS_GRP_SOURCES:
+		return xics_get_source(xics, attr->attr, attr->addr);
+	}
+	return -ENXIO;
+}
+
+static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_XICS_GRP_SOURCES:
+		if (attr->attr >= KVMPPC_XICS_FIRST_IRQ &&
+		    attr->attr < KVMPPC_XICS_NR_IRQS)
+			return 0;
+		break;
+	}
+	return -ENXIO;
+}
+
+static void kvmppc_xics_free(struct kvm_device *dev)
+{
+	struct kvmppc_xics *xics = dev->private;
+	int i;
+	struct kvm *kvm = xics->kvm;
+
+	debugfs_remove(xics->dentry);
+
+	if (kvm)
+		kvm->arch.xics = NULL;
+
+	for (i = 0; i <= xics->max_icsid; i++)
+		kfree(xics->ics[i]);
+	kfree(xics);
+	kfree(dev);
+}
+
+static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
+{
+	struct kvmppc_xics *xics;
+	struct kvm *kvm = dev->kvm;
+	int ret = 0;
+
+	xics = kzalloc(sizeof(*xics), GFP_KERNEL);
+	if (!xics)
+		return -ENOMEM;
+
+	dev->private = xics;
+	xics->dev = dev;
+	xics->kvm = kvm;
+
+	/* Already there ? */
+	mutex_lock(&kvm->lock);
+	if (kvm->arch.xics)
+		ret = -EEXIST;
+	else
+		kvm->arch.xics = xics;
+	mutex_unlock(&kvm->lock);
+
+	if (ret)
+		return ret;
+
+	xics_debugfs_init(xics);
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+		/* Enable real mode support */
+		xics->real_mode = ENABLE_REALMODE;
+		xics->real_mode_dbg = DEBUG_REALMODE;
+	}
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
+	return 0;
+}
+
+struct kvm_device_ops kvm_xics_ops = {
+	.name = "kvm-xics",
+	.create = kvmppc_xics_create,
+	.destroy = kvmppc_xics_free,
+	.set_attr = xics_set_attr,
+	.get_attr = xics_get_attr,
+	.has_attr = xics_has_attr,
+};
+
+int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
+			     u32 xcpu)
+{
+	struct kvmppc_xics *xics = dev->private;
+	int r = -EBUSY;
+
+	if (dev->ops != &kvm_xics_ops)
+		return -EPERM;
+	if (xics->kvm != vcpu->kvm)
+		return -EPERM;
+	if (vcpu->arch.irq_type)
+		return -EBUSY;
+
+	r = kvmppc_xics_create_icp(vcpu, xcpu);
+	if (!r)
+		vcpu->arch.irq_type = KVMPPC_IRQ_XICS;
+
+	return r;
+}
+
+void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu)
+{
+	if (!vcpu->arch.icp)
+		return;
+	kfree(vcpu->arch.icp);
+	vcpu->arch.icp = NULL;
+	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
+}
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
new file mode 100644
index 000000000000..dd9326c5c19b
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _KVM_PPC_BOOK3S_XICS_H
+#define _KVM_PPC_BOOK3S_XICS_H
+
+/*
+ * We use a two-level tree to store interrupt source information.
+ * There are up to 1024 ICS nodes, each of which can represent
+ * 1024 sources.
+ */
+#define KVMPPC_XICS_MAX_ICS_ID	1023
+#define KVMPPC_XICS_ICS_SHIFT	10
+#define KVMPPC_XICS_IRQ_PER_ICS	(1 << KVMPPC_XICS_ICS_SHIFT)
+#define KVMPPC_XICS_SRC_MASK	(KVMPPC_XICS_IRQ_PER_ICS - 1)
+
+/*
+ * Interrupt source numbers below this are reserved, for example
+ * 0 is "no interrupt", and 2 is used for IPIs.
+ */
+#define KVMPPC_XICS_FIRST_IRQ	16
+#define KVMPPC_XICS_NR_IRQS	((KVMPPC_XICS_MAX_ICS_ID + 1) * \
+				 KVMPPC_XICS_IRQ_PER_ICS)
+
+/* Priority value to use for disabling an interrupt */
+#define MASKED	0xff
+
+/* State for one irq source */
+struct ics_irq_state {
+	u32 number;
+	u32 server;
+	u8  priority;
+	u8  saved_priority;
+	u8  resend;
+	u8  masked_pending;
+	u8  asserted; /* Only for LSI */
+	u8  exists;
+};
+
+/* Atomic ICP state, updated with a single compare & swap */
+union kvmppc_icp_state {
+	unsigned long raw;
+	struct {
+		u8 out_ee:1;
+		u8 need_resend:1;
+		u8 cppr;
+		u8 mfrr;
+		u8 pending_pri;
+		u32 xisr;
+	};
+};
+
+/* One bit per ICS */
+#define ICP_RESEND_MAP_SIZE	(KVMPPC_XICS_MAX_ICS_ID / BITS_PER_LONG + 1)
+
+struct kvmppc_icp {
+	struct kvm_vcpu *vcpu;
+	unsigned long server_num;
+	union kvmppc_icp_state state;
+	unsigned long resend_map[ICP_RESEND_MAP_SIZE];
+
+	/* Real mode might find something too hard, here's the action
+	 * it might request from virtual mode
+	 */
+#define XICS_RM_KICK_VCPU	0x1
+#define XICS_RM_CHECK_RESEND	0x2
+#define XICS_RM_REJECT		0x4
+	u32 rm_action;
+	struct kvm_vcpu *rm_kick_target;
+	u32  rm_reject;
+
+	/* Debug stuff for real mode */
+	union kvmppc_icp_state rm_dbgstate;
+	struct kvm_vcpu *rm_dbgtgt;
+};
+
+struct kvmppc_ics {
+	struct mutex lock;
+	u16 icsid;
+	struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
+};
+
+struct kvmppc_xics {
+	struct kvm *kvm;
+	struct kvm_device *dev;
+	struct dentry *dentry;
+	u32 max_icsid;
+	bool real_mode;
+	bool real_mode_dbg;
+	struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1];
+};
+
+static inline struct kvmppc_icp *kvmppc_xics_find_server(struct kvm *kvm,
+							 u32 nr)
+{
+	struct kvm_vcpu *vcpu = NULL;
+	int i;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (vcpu->arch.icp && nr == vcpu->arch.icp->server_num)
+			return vcpu->arch.icp;
+	}
+	return NULL;
+}
+
+static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics,
+						      u32 irq, u16 *source)
+{
+	u32 icsid = irq >> KVMPPC_XICS_ICS_SHIFT;
+	u16 src = irq & KVMPPC_XICS_SRC_MASK;
+	struct kvmppc_ics *ics;
+
+	if (source)
+		*source = src;
+	if (icsid > KVMPPC_XICS_MAX_ICS_ID)
+		return NULL;
+	ics = xics->ics[icsid];
+	if (!ics)
+		return NULL;
+	return ics;
+}
+
+
+#endif /* _KVM_PPC_BOOK3S_XICS_H */
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 020923e43134..1020119226db 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -222,8 +222,7 @@ void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
 	kvmppc_booke_queue_irqprio(vcpu, prio);
 }
 
-void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
-                                  struct kvm_interrupt *irq)
+void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)
 {
 	clear_bit(BOOKE_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
 	clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
@@ -347,7 +346,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 		keep_irq = true;
 	}
 
-	if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_enabled)
+	if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_flags)
 		update_epr = true;
 
 	switch (priority) {
@@ -428,8 +427,14 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 			set_guest_esr(vcpu, vcpu->arch.queued_esr);
 		if (update_dear == true)
 			set_guest_dear(vcpu, vcpu->arch.queued_dear);
-		if (update_epr == true)
-			kvm_make_request(KVM_REQ_EPR_EXIT, vcpu);
+		if (update_epr == true) {
+			if (vcpu->arch.epr_flags & KVMPPC_EPR_USER)
+				kvm_make_request(KVM_REQ_EPR_EXIT, vcpu);
+			else if (vcpu->arch.epr_flags & KVMPPC_EPR_KERNEL) {
+				BUG_ON(vcpu->arch.irq_type != KVMPPC_IRQ_MPIC);
+				kvmppc_mpic_set_epr(vcpu);
+			}
+		}
 
 		new_msr &= msr_mask;
 #if defined(CONFIG_64BIT)
@@ -746,6 +751,9 @@ static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		kvmppc_core_queue_program(vcpu, ESR_PIL);
 		return RESUME_HOST;
 
+	case EMULATE_EXIT_USER:
+		return RESUME_HOST;
+
 	default:
 		BUG();
 	}
@@ -1148,6 +1156,18 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	return r;
 }
 
+static void kvmppc_set_tsr(struct kvm_vcpu *vcpu, u32 new_tsr)
+{
+	u32 old_tsr = vcpu->arch.tsr;
+
+	vcpu->arch.tsr = new_tsr;
+
+	if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS))
+		arm_next_watchdog(vcpu);
+
+	update_timer_ints(vcpu);
+}
+
 /* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
@@ -1287,16 +1307,8 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
 		kvmppc_emulate_dec(vcpu);
 	}
 
-	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
-		u32 old_tsr = vcpu->arch.tsr;
-
-		vcpu->arch.tsr = sregs->u.e.tsr;
-
-		if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS))
-			arm_next_watchdog(vcpu);
-
-		update_timer_ints(vcpu);
-	}
+	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR)
+		kvmppc_set_tsr(vcpu, sregs->u.e.tsr);
 
 	return 0;
 }
@@ -1409,84 +1421,134 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
 int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 {
-	int r = -EINVAL;
+	int r = 0;
+	union kvmppc_one_reg val;
+	int size;
+	long int i;
+
+	size = one_reg_size(reg->id);
+	if (size > sizeof(val))
+		return -EINVAL;
 
 	switch (reg->id) {
 	case KVM_REG_PPC_IAC1:
 	case KVM_REG_PPC_IAC2:
 	case KVM_REG_PPC_IAC3:
-	case KVM_REG_PPC_IAC4: {
-		int iac = reg->id - KVM_REG_PPC_IAC1;
-		r = copy_to_user((u64 __user *)(long)reg->addr,
-				 &vcpu->arch.dbg_reg.iac[iac], sizeof(u64));
+	case KVM_REG_PPC_IAC4:
+		i = reg->id - KVM_REG_PPC_IAC1;
+		val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac[i]);
 		break;
-	}
 	case KVM_REG_PPC_DAC1:
-	case KVM_REG_PPC_DAC2: {
-		int dac = reg->id - KVM_REG_PPC_DAC1;
-		r = copy_to_user((u64 __user *)(long)reg->addr,
-				 &vcpu->arch.dbg_reg.dac[dac], sizeof(u64));
+	case KVM_REG_PPC_DAC2:
+		i = reg->id - KVM_REG_PPC_DAC1;
+		val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac[i]);
 		break;
-	}
 	case KVM_REG_PPC_EPR: {
 		u32 epr = get_guest_epr(vcpu);
-		r = put_user(epr, (u32 __user *)(long)reg->addr);
+		val = get_reg_val(reg->id, epr);
 		break;
 	}
 #if defined(CONFIG_64BIT)
 	case KVM_REG_PPC_EPCR:
-		r = put_user(vcpu->arch.epcr, (u32 __user *)(long)reg->addr);
+		val = get_reg_val(reg->id, vcpu->arch.epcr);
 		break;
 #endif
+	case KVM_REG_PPC_TCR:
+		val = get_reg_val(reg->id, vcpu->arch.tcr);
+		break;
+	case KVM_REG_PPC_TSR:
+		val = get_reg_val(reg->id, vcpu->arch.tsr);
+		break;
+	case KVM_REG_PPC_DEBUG_INST:
+		val = get_reg_val(reg->id, KVMPPC_INST_EHPRIV);
+		break;
 	default:
+		r = kvmppc_get_one_reg(vcpu, reg->id, &val);
 		break;
 	}
+
+	if (r)
+		return r;
+
+	if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size))
+		r = -EFAULT;
+
 	return r;
 }
 
 int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 {
-	int r = -EINVAL;
+	int r = 0;
+	union kvmppc_one_reg val;
+	int size;
+	long int i;
+
+	size = one_reg_size(reg->id);
+	if (size > sizeof(val))
+		return -EINVAL;
+
+	if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size))
+		return -EFAULT;
 
 	switch (reg->id) {
 	case KVM_REG_PPC_IAC1:
 	case KVM_REG_PPC_IAC2:
 	case KVM_REG_PPC_IAC3:
-	case KVM_REG_PPC_IAC4: {
-		int iac = reg->id - KVM_REG_PPC_IAC1;
-		r = copy_from_user(&vcpu->arch.dbg_reg.iac[iac],
-			     (u64 __user *)(long)reg->addr, sizeof(u64));
+	case KVM_REG_PPC_IAC4:
+		i = reg->id - KVM_REG_PPC_IAC1;
+		vcpu->arch.dbg_reg.iac[i] = set_reg_val(reg->id, val);
 		break;
-	}
 	case KVM_REG_PPC_DAC1:
-	case KVM_REG_PPC_DAC2: {
-		int dac = reg->id - KVM_REG_PPC_DAC1;
-		r = copy_from_user(&vcpu->arch.dbg_reg.dac[dac],
-			     (u64 __user *)(long)reg->addr, sizeof(u64));
+	case KVM_REG_PPC_DAC2:
+		i = reg->id - KVM_REG_PPC_DAC1;
+		vcpu->arch.dbg_reg.dac[i] = set_reg_val(reg->id, val);
 		break;
-	}
 	case KVM_REG_PPC_EPR: {
-		u32 new_epr;
-		r = get_user(new_epr, (u32 __user *)(long)reg->addr);
-		if (!r)
-			kvmppc_set_epr(vcpu, new_epr);
+		u32 new_epr = set_reg_val(reg->id, val);
+		kvmppc_set_epr(vcpu, new_epr);
 		break;
 	}
 #if defined(CONFIG_64BIT)
 	case KVM_REG_PPC_EPCR: {
-		u32 new_epcr;
-		r = get_user(new_epcr, (u32 __user *)(long)reg->addr);
-		if (r == 0)
-			kvmppc_set_epcr(vcpu, new_epcr);
+		u32 new_epcr = set_reg_val(reg->id, val);
+		kvmppc_set_epcr(vcpu, new_epcr);
 		break;
 	}
 #endif
+	case KVM_REG_PPC_OR_TSR: {
+		u32 tsr_bits = set_reg_val(reg->id, val);
+		kvmppc_set_tsr_bits(vcpu, tsr_bits);
+		break;
+	}
+	case KVM_REG_PPC_CLEAR_TSR: {
+		u32 tsr_bits = set_reg_val(reg->id, val);
+		kvmppc_clr_tsr_bits(vcpu, tsr_bits);
+		break;
+	}
+	case KVM_REG_PPC_TSR: {
+		u32 tsr = set_reg_val(reg->id, val);
+		kvmppc_set_tsr(vcpu, tsr);
+		break;
+	}
+	case KVM_REG_PPC_TCR: {
+		u32 tcr = set_reg_val(reg->id, val);
+		kvmppc_set_tcr(vcpu, tcr);
+		break;
+	}
 	default:
+		r = kvmppc_set_one_reg(vcpu, reg->id, &val);
 		break;
 	}
+
 	return r;
 }
 
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+					 struct kvm_guest_debug *dbg)
+{
+	return -EINVAL;
+}
+
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
 	return -ENOTSUPP;
@@ -1531,7 +1593,7 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem,
-				struct kvm_memory_slot old)
+				const struct kvm_memory_slot *old)
 {
 }
 
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index f4bb55c96517..2c6deb5ef2fe 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -54,8 +54,7 @@
                        (1<<BOOKE_INTERRUPT_DTLB_MISS) | \
                        (1<<BOOKE_INTERRUPT_ALIGNMENT))
 
-.macro KVM_HANDLER ivor_nr scratch srr0
-_GLOBAL(kvmppc_handler_\ivor_nr)
+.macro __KVM_HANDLER ivor_nr scratch srr0
 	/* Get pointer to vcpu and record exit number. */
 	mtspr	\scratch , r4
 	mfspr   r4, SPRN_SPRG_THREAD
@@ -76,6 +75,43 @@ _GLOBAL(kvmppc_handler_\ivor_nr)
 	bctr
 .endm
 
+.macro KVM_HANDLER ivor_nr scratch srr0
+_GLOBAL(kvmppc_handler_\ivor_nr)
+	__KVM_HANDLER \ivor_nr \scratch \srr0
+.endm
+
+.macro KVM_DBG_HANDLER ivor_nr scratch srr0
+_GLOBAL(kvmppc_handler_\ivor_nr)
+	mtspr   \scratch, r4
+	mfspr	r4, SPRN_SPRG_THREAD
+	lwz	r4, THREAD_KVM_VCPU(r4)
+	stw	r3, VCPU_CRIT_SAVE(r4)
+	mfcr	r3
+	mfspr	r4, SPRN_CSRR1
+	andi.	r4, r4, MSR_PR
+	bne	1f
+	/* debug interrupt happened in enter/exit path */
+	mfspr   r4, SPRN_CSRR1
+	rlwinm  r4, r4, 0, ~MSR_DE
+	mtspr   SPRN_CSRR1, r4
+	lis	r4, 0xffff
+	ori	r4, r4, 0xffff
+	mtspr	SPRN_DBSR, r4
+	mfspr	r4, SPRN_SPRG_THREAD
+	lwz	r4, THREAD_KVM_VCPU(r4)
+	mtcr	r3
+	lwz     r3, VCPU_CRIT_SAVE(r4)
+	mfspr   r4, \scratch
+	rfci
+1:	/* debug interrupt happened in guest */
+	mtcr	r3
+	mfspr	r4, SPRN_SPRG_THREAD
+	lwz	r4, THREAD_KVM_VCPU(r4)
+	lwz     r3, VCPU_CRIT_SAVE(r4)
+	mfspr   r4, \scratch
+	__KVM_HANDLER \ivor_nr \scratch \srr0
+.endm
+
 .macro KVM_HANDLER_ADDR ivor_nr
 	.long	kvmppc_handler_\ivor_nr
 .endm
@@ -100,7 +136,7 @@ KVM_HANDLER BOOKE_INTERRUPT_FIT SPRN_SPRG_RSCRATCH0 SPRN_SRR0
 KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
 KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0
 KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0
-KVM_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
+KVM_DBG_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
 KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
 KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0
 KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 6dd4de7802bf..ce6b73c29612 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -425,6 +425,20 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 	return kvmppc_set_sregs_ivor(vcpu, sregs);
 }
 
+int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
+			union kvmppc_one_reg *val)
+{
+	int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val);
+	return r;
+}
+
+int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
+		       union kvmppc_one_reg *val)
+{
+	int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val);
+	return r;
+}
+
 struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500;
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index 33db48a8ce24..c2e5e98453a6 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -23,6 +23,10 @@
 #include <asm/mmu-book3e.h>
 #include <asm/tlb.h>
 
+enum vcpu_ftr {
+	VCPU_FTR_MMU_V2
+};
+
 #define E500_PID_NUM   3
 #define E500_TLB_NUM   2
 
@@ -131,6 +135,10 @@ void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500);
 void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 
+int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
+				union kvmppc_one_reg *val);
+int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
+			       union kvmppc_one_reg *val);
 
 #ifdef CONFIG_KVM_E500V2
 unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500,
@@ -295,4 +303,18 @@ static inline unsigned int get_tlbmiss_tid(struct kvm_vcpu *vcpu)
 #define get_tlb_sts(gtlbe)              (MAS1_TS)
 #endif /* !BOOKE_HV */
 
+static inline bool has_feature(const struct kvm_vcpu *vcpu,
+			       enum vcpu_ftr ftr)
+{
+	bool has_ftr;
+	switch (ftr) {
+	case VCPU_FTR_MMU_V2:
+		has_ftr = ((vcpu->arch.mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2);
+		break;
+	default:
+		return false;
+	}
+	return has_ftr;
+}
+
 #endif /* KVM_E500_H */
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index e78f353a836a..b10a01243abd 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -284,6 +284,16 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
 	case SPRN_TLB1CFG:
 		*spr_val = vcpu->arch.tlbcfg[1];
 		break;
+	case SPRN_TLB0PS:
+		if (!has_feature(vcpu, VCPU_FTR_MMU_V2))
+			return EMULATE_FAIL;
+		*spr_val = vcpu->arch.tlbps[0];
+		break;
+	case SPRN_TLB1PS:
+		if (!has_feature(vcpu, VCPU_FTR_MMU_V2))
+			return EMULATE_FAIL;
+		*spr_val = vcpu->arch.tlbps[1];
+		break;
 	case SPRN_L1CSR0:
 		*spr_val = vcpu_e500->l1csr0;
 		break;
@@ -307,6 +317,15 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
 	case SPRN_MMUCFG:
 		*spr_val = vcpu->arch.mmucfg;
 		break;
+	case SPRN_EPTCFG:
+		if (!has_feature(vcpu, VCPU_FTR_MMU_V2))
+			return EMULATE_FAIL;
+		/*
+		 * Legacy Linux guests access EPTCFG register even if the E.PT
+		 * category is disabled in the VM. Give them a chance to live.
+		 */
+		*spr_val = vcpu->arch.eptcfg;
+		break;
 
 	/* extra exceptions */
 	case SPRN_IVOR32:
diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c
index 5c4475983f78..c41a5a96b558 100644
--- a/arch/powerpc/kvm/e500_mmu.c
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -596,6 +596,140 @@ int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 	return 0;
 }
 
+int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
+				union kvmppc_one_reg *val)
+{
+	int r = 0;
+	long int i;
+
+	switch (id) {
+	case KVM_REG_PPC_MAS0:
+		*val = get_reg_val(id, vcpu->arch.shared->mas0);
+		break;
+	case KVM_REG_PPC_MAS1:
+		*val = get_reg_val(id, vcpu->arch.shared->mas1);
+		break;
+	case KVM_REG_PPC_MAS2:
+		*val = get_reg_val(id, vcpu->arch.shared->mas2);
+		break;
+	case KVM_REG_PPC_MAS7_3:
+		*val = get_reg_val(id, vcpu->arch.shared->mas7_3);
+		break;
+	case KVM_REG_PPC_MAS4:
+		*val = get_reg_val(id, vcpu->arch.shared->mas4);
+		break;
+	case KVM_REG_PPC_MAS6:
+		*val = get_reg_val(id, vcpu->arch.shared->mas6);
+		break;
+	case KVM_REG_PPC_MMUCFG:
+		*val = get_reg_val(id, vcpu->arch.mmucfg);
+		break;
+	case KVM_REG_PPC_EPTCFG:
+		*val = get_reg_val(id, vcpu->arch.eptcfg);
+		break;
+	case KVM_REG_PPC_TLB0CFG:
+	case KVM_REG_PPC_TLB1CFG:
+	case KVM_REG_PPC_TLB2CFG:
+	case KVM_REG_PPC_TLB3CFG:
+		i = id - KVM_REG_PPC_TLB0CFG;
+		*val = get_reg_val(id, vcpu->arch.tlbcfg[i]);
+		break;
+	case KVM_REG_PPC_TLB0PS:
+	case KVM_REG_PPC_TLB1PS:
+	case KVM_REG_PPC_TLB2PS:
+	case KVM_REG_PPC_TLB3PS:
+		i = id - KVM_REG_PPC_TLB0PS;
+		*val = get_reg_val(id, vcpu->arch.tlbps[i]);
+		break;
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	return r;
+}
+
+int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
+			       union kvmppc_one_reg *val)
+{
+	int r = 0;
+	long int i;
+
+	switch (id) {
+	case KVM_REG_PPC_MAS0:
+		vcpu->arch.shared->mas0 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_MAS1:
+		vcpu->arch.shared->mas1 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_MAS2:
+		vcpu->arch.shared->mas2 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_MAS7_3:
+		vcpu->arch.shared->mas7_3 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_MAS4:
+		vcpu->arch.shared->mas4 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_MAS6:
+		vcpu->arch.shared->mas6 = set_reg_val(id, *val);
+		break;
+	/* Only allow MMU registers to be set to the config supported by KVM */
+	case KVM_REG_PPC_MMUCFG: {
+		u32 reg = set_reg_val(id, *val);
+		if (reg != vcpu->arch.mmucfg)
+			r = -EINVAL;
+		break;
+	}
+	case KVM_REG_PPC_EPTCFG: {
+		u32 reg = set_reg_val(id, *val);
+		if (reg != vcpu->arch.eptcfg)
+			r = -EINVAL;
+		break;
+	}
+	case KVM_REG_PPC_TLB0CFG:
+	case KVM_REG_PPC_TLB1CFG:
+	case KVM_REG_PPC_TLB2CFG:
+	case KVM_REG_PPC_TLB3CFG: {
+		/* MMU geometry (N_ENTRY/ASSOC) can be set only using SW_TLB */
+		u32 reg = set_reg_val(id, *val);
+		i = id - KVM_REG_PPC_TLB0CFG;
+		if (reg != vcpu->arch.tlbcfg[i])
+			r = -EINVAL;
+		break;
+	}
+	case KVM_REG_PPC_TLB0PS:
+	case KVM_REG_PPC_TLB1PS:
+	case KVM_REG_PPC_TLB2PS:
+	case KVM_REG_PPC_TLB3PS: {
+		u32 reg = set_reg_val(id, *val);
+		i = id - KVM_REG_PPC_TLB0PS;
+		if (reg != vcpu->arch.tlbps[i])
+			r = -EINVAL;
+		break;
+	}
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	return r;
+}
+
+static int vcpu_mmu_geometry_update(struct kvm_vcpu *vcpu,
+		struct kvm_book3e_206_tlb_params *params)
+{
+	vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+	if (params->tlb_sizes[0] <= 2048)
+		vcpu->arch.tlbcfg[0] |= params->tlb_sizes[0];
+	vcpu->arch.tlbcfg[0] |= params->tlb_ways[0] << TLBnCFG_ASSOC_SHIFT;
+
+	vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+	vcpu->arch.tlbcfg[1] |= params->tlb_sizes[1];
+	vcpu->arch.tlbcfg[1] |= params->tlb_ways[1] << TLBnCFG_ASSOC_SHIFT;
+	return 0;
+}
+
 int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 			      struct kvm_config_tlb *cfg)
 {
@@ -692,16 +826,8 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 	vcpu_e500->gtlb_offset[0] = 0;
 	vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0];
 
-	vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE;
-
-	vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
-	if (params.tlb_sizes[0] <= 2048)
-		vcpu->arch.tlbcfg[0] |= params.tlb_sizes[0];
-	vcpu->arch.tlbcfg[0] |= params.tlb_ways[0] << TLBnCFG_ASSOC_SHIFT;
-
-	vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
-	vcpu->arch.tlbcfg[1] |= params.tlb_sizes[1];
-	vcpu->arch.tlbcfg[1] |= params.tlb_ways[1] << TLBnCFG_ASSOC_SHIFT;
+	/* Update vcpu's MMU geometry based on SW_TLB input */
+	vcpu_mmu_geometry_update(vcpu, &params);
 
 	vcpu_e500->shared_tlb_pages = pages;
 	vcpu_e500->num_shared_tlb_pages = num_pages;
@@ -737,6 +863,39 @@ int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+/* Vcpu's MMU default configuration */
+static int vcpu_mmu_init(struct kvm_vcpu *vcpu,
+		       struct kvmppc_e500_tlb_params *params)
+{
+	/* Initialize RASIZE, PIDSIZE, NTLBS and MAVN fields with host values*/
+	vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE;
+
+	/* Initialize TLBnCFG fields with host values and SW_TLB geometry*/
+	vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) &
+			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+	vcpu->arch.tlbcfg[0] |= params[0].entries;
+	vcpu->arch.tlbcfg[0] |= params[0].ways << TLBnCFG_ASSOC_SHIFT;
+
+	vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) &
+			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+	vcpu->arch.tlbcfg[1] |= params[1].entries;
+	vcpu->arch.tlbcfg[1] |= params[1].ways << TLBnCFG_ASSOC_SHIFT;
+
+	if (has_feature(vcpu, VCPU_FTR_MMU_V2)) {
+		vcpu->arch.tlbps[0] = mfspr(SPRN_TLB0PS);
+		vcpu->arch.tlbps[1] = mfspr(SPRN_TLB1PS);
+
+		vcpu->arch.mmucfg &= ~MMUCFG_LRAT;
+
+		/* Guest mmu emulation currently doesn't handle E.PT */
+		vcpu->arch.eptcfg = 0;
+		vcpu->arch.tlbcfg[0] &= ~TLBnCFG_PT;
+		vcpu->arch.tlbcfg[1] &= ~TLBnCFG_IND;
+	}
+
+	return 0;
+}
+
 int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
 	struct kvm_vcpu *vcpu = &vcpu_e500->vcpu;
@@ -781,18 +940,7 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 	if (!vcpu_e500->g2h_tlb1_map)
 		goto err;
 
-	/* Init TLB configuration register */
-	vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) &
-			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
-	vcpu->arch.tlbcfg[0] |= vcpu_e500->gtlb_params[0].entries;
-	vcpu->arch.tlbcfg[0] |=
-		vcpu_e500->gtlb_params[0].ways << TLBnCFG_ASSOC_SHIFT;
-
-	vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) &
-			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
-	vcpu->arch.tlbcfg[1] |= vcpu_e500->gtlb_params[1].entries;
-	vcpu->arch.tlbcfg[1] |=
-		vcpu_e500->gtlb_params[1].ways << TLBnCFG_ASSOC_SHIFT;
+	vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params);
 
 	kvmppc_recalc_tlb1map_range(vcpu_e500);
 	return 0;
diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c
index 2f4baa074b2e..753cc99eff2b 100644
--- a/arch/powerpc/kvm/e500mc.c
+++ b/arch/powerpc/kvm/e500mc.c
@@ -177,6 +177,8 @@ int kvmppc_core_check_processor_compat(void)
 		r = 0;
 	else if (strcmp(cur_cpu_spec->cpu_name, "e5500") == 0)
 		r = 0;
+	else if (strcmp(cur_cpu_spec->cpu_name, "e6500") == 0)
+		r = 0;
 	else
 		r = -ENOTSUPP;
 
@@ -260,6 +262,20 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 	return kvmppc_set_sregs_ivor(vcpu, sregs);
 }
 
+int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
+			union kvmppc_one_reg *val)
+{
+	int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val);
+	return r;
+}
+
+int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
+		       union kvmppc_one_reg *val)
+{
+	int r = kvmppc_set_one_reg_e500_tlb(vcpu, id, val);
+	return r;
+}
+
 struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500;
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 7a73b6f72a8b..631a2650e4e4 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -38,6 +38,7 @@
 
 #define OP_31_XOP_TRAP      4
 #define OP_31_XOP_LWZX      23
+#define OP_31_XOP_DCBST     54
 #define OP_31_XOP_TRAP_64   68
 #define OP_31_XOP_DCBF      86
 #define OP_31_XOP_LBZX      87
@@ -370,6 +371,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs);
 			break;
 
+		case OP_31_XOP_DCBST:
 		case OP_31_XOP_DCBF:
 		case OP_31_XOP_DCBI:
 			/* Do nothing. The guest is performing dcbi because
diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h
new file mode 100644
index 000000000000..5a9a10b90762
--- /dev/null
+++ b/arch/powerpc/kvm/irq.h
@@ -0,0 +1,20 @@
+#ifndef __IRQ_H
+#define __IRQ_H
+
+#include <linux/kvm_host.h>
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+	int ret = 0;
+
+#ifdef CONFIG_KVM_MPIC
+	ret = ret || (kvm->arch.mpic != NULL);
+#endif
+#ifdef CONFIG_KVM_XICS
+	ret = ret || (kvm->arch.xics != NULL);
+#endif
+	smp_rmb();
+	return ret;
+}
+
+#endif
diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
new file mode 100644
index 000000000000..2861ae9eaae6
--- /dev/null
+++ b/arch/powerpc/kvm/mpic.c
@@ -0,0 +1,1853 @@
+/*
+ * OpenPIC emulation
+ *
+ * Copyright (c) 2004 Jocelyn Mayer
+ *               2011 Alexander Graf
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/kvm_host.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <asm/uaccess.h>
+#include <asm/mpic.h>
+#include <asm/kvm_para.h>
+#include <asm/kvm_host.h>
+#include <asm/kvm_ppc.h>
+#include "iodev.h"
+
+#define MAX_CPU     32
+#define MAX_SRC     256
+#define MAX_TMR     4
+#define MAX_IPI     4
+#define MAX_MSI     8
+#define MAX_IRQ     (MAX_SRC + MAX_IPI + MAX_TMR)
+#define VID         0x03	/* MPIC version ID */
+
+/* OpenPIC capability flags */
+#define OPENPIC_FLAG_IDR_CRIT     (1 << 0)
+#define OPENPIC_FLAG_ILR          (2 << 0)
+
+/* OpenPIC address map */
+#define OPENPIC_REG_SIZE             0x40000
+#define OPENPIC_GLB_REG_START        0x0
+#define OPENPIC_GLB_REG_SIZE         0x10F0
+#define OPENPIC_TMR_REG_START        0x10F0
+#define OPENPIC_TMR_REG_SIZE         0x220
+#define OPENPIC_MSI_REG_START        0x1600
+#define OPENPIC_MSI_REG_SIZE         0x200
+#define OPENPIC_SUMMARY_REG_START    0x3800
+#define OPENPIC_SUMMARY_REG_SIZE     0x800
+#define OPENPIC_SRC_REG_START        0x10000
+#define OPENPIC_SRC_REG_SIZE         (MAX_SRC * 0x20)
+#define OPENPIC_CPU_REG_START        0x20000
+#define OPENPIC_CPU_REG_SIZE         (0x100 + ((MAX_CPU - 1) * 0x1000))
+
+struct fsl_mpic_info {
+	int max_ext;
+};
+
+static struct fsl_mpic_info fsl_mpic_20 = {
+	.max_ext = 12,
+};
+
+static struct fsl_mpic_info fsl_mpic_42 = {
+	.max_ext = 12,
+};
+
+#define FRR_NIRQ_SHIFT    16
+#define FRR_NCPU_SHIFT     8
+#define FRR_VID_SHIFT      0
+
+#define VID_REVISION_1_2   2
+#define VID_REVISION_1_3   3
+
+#define VIR_GENERIC      0x00000000	/* Generic Vendor ID */
+
+#define GCR_RESET        0x80000000
+#define GCR_MODE_PASS    0x00000000
+#define GCR_MODE_MIXED   0x20000000
+#define GCR_MODE_PROXY   0x60000000
+
+#define TBCR_CI           0x80000000	/* count inhibit */
+#define TCCR_TOG          0x80000000	/* toggles when decrement to zero */
+
+#define IDR_EP_SHIFT      31
+#define IDR_EP_MASK       (1 << IDR_EP_SHIFT)
+#define IDR_CI0_SHIFT     30
+#define IDR_CI1_SHIFT     29
+#define IDR_P1_SHIFT      1
+#define IDR_P0_SHIFT      0
+
+#define ILR_INTTGT_MASK   0x000000ff
+#define ILR_INTTGT_INT    0x00
+#define ILR_INTTGT_CINT   0x01	/* critical */
+#define ILR_INTTGT_MCP    0x02	/* machine check */
+#define NUM_OUTPUTS       3
+
+#define MSIIR_OFFSET       0x140
+#define MSIIR_SRS_SHIFT    29
+#define MSIIR_SRS_MASK     (0x7 << MSIIR_SRS_SHIFT)
+#define MSIIR_IBS_SHIFT    24
+#define MSIIR_IBS_MASK     (0x1f << MSIIR_IBS_SHIFT)
+
+static int get_current_cpu(void)
+{
+#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE)
+	struct kvm_vcpu *vcpu = current->thread.kvm_vcpu;
+	return vcpu ? vcpu->arch.irq_cpu_id : -1;
+#else
+	/* XXX */
+	return -1;
+#endif
+}
+
+static int openpic_cpu_write_internal(void *opaque, gpa_t addr,
+				      u32 val, int idx);
+static int openpic_cpu_read_internal(void *opaque, gpa_t addr,
+				     u32 *ptr, int idx);
+
+enum irq_type {
+	IRQ_TYPE_NORMAL = 0,
+	IRQ_TYPE_FSLINT,	/* FSL internal interrupt -- level only */
+	IRQ_TYPE_FSLSPECIAL,	/* FSL timer/IPI interrupt, edge, no polarity */
+};
+
+struct irq_queue {
+	/* Round up to the nearest 64 IRQs so that the queue length
+	 * won't change when moving between 32 and 64 bit hosts.
+	 */
+	unsigned long queue[BITS_TO_LONGS((MAX_IRQ + 63) & ~63)];
+	int next;
+	int priority;
+};
+
+struct irq_source {
+	uint32_t ivpr;		/* IRQ vector/priority register */
+	uint32_t idr;		/* IRQ destination register */
+	uint32_t destmask;	/* bitmap of CPU destinations */
+	int last_cpu;
+	int output;		/* IRQ level, e.g. ILR_INTTGT_INT */
+	int pending;		/* TRUE if IRQ is pending */
+	enum irq_type type;
+	bool level:1;		/* level-triggered */
+	bool nomask:1;	/* critical interrupts ignore mask on some FSL MPICs */
+};
+
+#define IVPR_MASK_SHIFT       31
+#define IVPR_MASK_MASK        (1 << IVPR_MASK_SHIFT)
+#define IVPR_ACTIVITY_SHIFT   30
+#define IVPR_ACTIVITY_MASK    (1 << IVPR_ACTIVITY_SHIFT)
+#define IVPR_MODE_SHIFT       29
+#define IVPR_MODE_MASK        (1 << IVPR_MODE_SHIFT)
+#define IVPR_POLARITY_SHIFT   23
+#define IVPR_POLARITY_MASK    (1 << IVPR_POLARITY_SHIFT)
+#define IVPR_SENSE_SHIFT      22
+#define IVPR_SENSE_MASK       (1 << IVPR_SENSE_SHIFT)
+
+#define IVPR_PRIORITY_MASK     (0xF << 16)
+#define IVPR_PRIORITY(_ivprr_) ((int)(((_ivprr_) & IVPR_PRIORITY_MASK) >> 16))
+#define IVPR_VECTOR(opp, _ivprr_) ((_ivprr_) & (opp)->vector_mask)
+
+/* IDR[EP/CI] are only for FSL MPIC prior to v4.0 */
+#define IDR_EP      0x80000000	/* external pin */
+#define IDR_CI      0x40000000	/* critical interrupt */
+
+struct irq_dest {
+	struct kvm_vcpu *vcpu;
+
+	int32_t ctpr;		/* CPU current task priority */
+	struct irq_queue raised;
+	struct irq_queue servicing;
+
+	/* Count of IRQ sources asserting on non-INT outputs */
+	uint32_t outputs_active[NUM_OUTPUTS];
+};
+
+#define MAX_MMIO_REGIONS 10
+
+struct openpic {
+	struct kvm *kvm;
+	struct kvm_device *dev;
+	struct kvm_io_device mmio;
+	const struct mem_reg *mmio_regions[MAX_MMIO_REGIONS];
+	int num_mmio_regions;
+
+	gpa_t reg_base;
+	spinlock_t lock;
+
+	/* Behavior control */
+	struct fsl_mpic_info *fsl;
+	uint32_t model;
+	uint32_t flags;
+	uint32_t nb_irqs;
+	uint32_t vid;
+	uint32_t vir;		/* Vendor identification register */
+	uint32_t vector_mask;
+	uint32_t tfrr_reset;
+	uint32_t ivpr_reset;
+	uint32_t idr_reset;
+	uint32_t brr1;
+	uint32_t mpic_mode_mask;
+
+	/* Global registers */
+	uint32_t frr;		/* Feature reporting register */
+	uint32_t gcr;		/* Global configuration register  */
+	uint32_t pir;		/* Processor initialization register */
+	uint32_t spve;		/* Spurious vector register */
+	uint32_t tfrr;		/* Timer frequency reporting register */
+	/* Source registers */
+	struct irq_source src[MAX_IRQ];
+	/* Local registers per output pin */
+	struct irq_dest dst[MAX_CPU];
+	uint32_t nb_cpus;
+	/* Timer registers */
+	struct {
+		uint32_t tccr;	/* Global timer current count register */
+		uint32_t tbcr;	/* Global timer base count register */
+	} timers[MAX_TMR];
+	/* Shared MSI registers */
+	struct {
+		uint32_t msir;	/* Shared Message Signaled Interrupt Register */
+	} msi[MAX_MSI];
+	uint32_t max_irq;
+	uint32_t irq_ipi0;
+	uint32_t irq_tim0;
+	uint32_t irq_msi;
+};
+
+
+static void mpic_irq_raise(struct openpic *opp, struct irq_dest *dst,
+			   int output)
+{
+	struct kvm_interrupt irq = {
+		.irq = KVM_INTERRUPT_SET_LEVEL,
+	};
+
+	if (!dst->vcpu) {
+		pr_debug("%s: destination cpu %d does not exist\n",
+			 __func__, (int)(dst - &opp->dst[0]));
+		return;
+	}
+
+	pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id,
+		output);
+
+	if (output != ILR_INTTGT_INT)	/* TODO */
+		return;
+
+	kvm_vcpu_ioctl_interrupt(dst->vcpu, &irq);
+}
+
+static void mpic_irq_lower(struct openpic *opp, struct irq_dest *dst,
+			   int output)
+{
+	if (!dst->vcpu) {
+		pr_debug("%s: destination cpu %d does not exist\n",
+			 __func__, (int)(dst - &opp->dst[0]));
+		return;
+	}
+
+	pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id,
+		output);
+
+	if (output != ILR_INTTGT_INT)	/* TODO */
+		return;
+
+	kvmppc_core_dequeue_external(dst->vcpu);
+}
+
+static inline void IRQ_setbit(struct irq_queue *q, int n_IRQ)
+{
+	set_bit(n_IRQ, q->queue);
+}
+
+static inline void IRQ_resetbit(struct irq_queue *q, int n_IRQ)
+{
+	clear_bit(n_IRQ, q->queue);
+}
+
+static inline int IRQ_testbit(struct irq_queue *q, int n_IRQ)
+{
+	return test_bit(n_IRQ, q->queue);
+}
+
+static void IRQ_check(struct openpic *opp, struct irq_queue *q)
+{
+	int irq = -1;
+	int next = -1;
+	int priority = -1;
+
+	for (;;) {
+		irq = find_next_bit(q->queue, opp->max_irq, irq + 1);
+		if (irq == opp->max_irq)
+			break;
+
+		pr_debug("IRQ_check: irq %d set ivpr_pr=%d pr=%d\n",
+			irq, IVPR_PRIORITY(opp->src[irq].ivpr), priority);
+
+		if (IVPR_PRIORITY(opp->src[irq].ivpr) > priority) {
+			next = irq;
+			priority = IVPR_PRIORITY(opp->src[irq].ivpr);
+		}
+	}
+
+	q->next = next;
+	q->priority = priority;
+}
+
+static int IRQ_get_next(struct openpic *opp, struct irq_queue *q)
+{
+	/* XXX: optimize */
+	IRQ_check(opp, q);
+
+	return q->next;
+}
+
+static void IRQ_local_pipe(struct openpic *opp, int n_CPU, int n_IRQ,
+			   bool active, bool was_active)
+{
+	struct irq_dest *dst;
+	struct irq_source *src;
+	int priority;
+
+	dst = &opp->dst[n_CPU];
+	src = &opp->src[n_IRQ];
+
+	pr_debug("%s: IRQ %d active %d was %d\n",
+		__func__, n_IRQ, active, was_active);
+
+	if (src->output != ILR_INTTGT_INT) {
+		pr_debug("%s: output %d irq %d active %d was %d count %d\n",
+			__func__, src->output, n_IRQ, active, was_active,
+			dst->outputs_active[src->output]);
+
+		/* On Freescale MPIC, critical interrupts ignore priority,
+		 * IACK, EOI, etc.  Before MPIC v4.1 they also ignore
+		 * masking.
+		 */
+		if (active) {
+			if (!was_active &&
+			    dst->outputs_active[src->output]++ == 0) {
+				pr_debug("%s: Raise OpenPIC output %d cpu %d irq %d\n",
+					__func__, src->output, n_CPU, n_IRQ);
+				mpic_irq_raise(opp, dst, src->output);
+			}
+		} else {
+			if (was_active &&
+			    --dst->outputs_active[src->output] == 0) {
+				pr_debug("%s: Lower OpenPIC output %d cpu %d irq %d\n",
+					__func__, src->output, n_CPU, n_IRQ);
+				mpic_irq_lower(opp, dst, src->output);
+			}
+		}
+
+		return;
+	}
+
+	priority = IVPR_PRIORITY(src->ivpr);
+
+	/* Even if the interrupt doesn't have enough priority,
+	 * it is still raised, in case ctpr is lowered later.
+	 */
+	if (active)
+		IRQ_setbit(&dst->raised, n_IRQ);
+	else
+		IRQ_resetbit(&dst->raised, n_IRQ);
+
+	IRQ_check(opp, &dst->raised);
+
+	if (active && priority <= dst->ctpr) {
+		pr_debug("%s: IRQ %d priority %d too low for ctpr %d on CPU %d\n",
+			__func__, n_IRQ, priority, dst->ctpr, n_CPU);
+		active = 0;
+	}
+
+	if (active) {
+		if (IRQ_get_next(opp, &dst->servicing) >= 0 &&
+		    priority <= dst->servicing.priority) {
+			pr_debug("%s: IRQ %d is hidden by servicing IRQ %d on CPU %d\n",
+				__func__, n_IRQ, dst->servicing.next, n_CPU);
+		} else {
+			pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d/%d\n",
+				__func__, n_CPU, n_IRQ, dst->raised.next);
+			mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
+		}
+	} else {
+		IRQ_get_next(opp, &dst->servicing);
+		if (dst->raised.priority > dst->ctpr &&
+		    dst->raised.priority > dst->servicing.priority) {
+			pr_debug("%s: IRQ %d inactive, IRQ %d prio %d above %d/%d, CPU %d\n",
+				__func__, n_IRQ, dst->raised.next,
+				dst->raised.priority, dst->ctpr,
+				dst->servicing.priority, n_CPU);
+			/* IRQ line stays asserted */
+		} else {
+			pr_debug("%s: IRQ %d inactive, current prio %d/%d, CPU %d\n",
+				__func__, n_IRQ, dst->ctpr,
+				dst->servicing.priority, n_CPU);
+			mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
+		}
+	}
+}
+
+/* update pic state because registers for n_IRQ have changed value */
+static void openpic_update_irq(struct openpic *opp, int n_IRQ)
+{
+	struct irq_source *src;
+	bool active, was_active;
+	int i;
+
+	src = &opp->src[n_IRQ];
+	active = src->pending;
+
+	if ((src->ivpr & IVPR_MASK_MASK) && !src->nomask) {
+		/* Interrupt source is disabled */
+		pr_debug("%s: IRQ %d is disabled\n", __func__, n_IRQ);
+		active = false;
+	}
+
+	was_active = !!(src->ivpr & IVPR_ACTIVITY_MASK);
+
+	/*
+	 * We don't have a similar check for already-active because
+	 * ctpr may have changed and we need to withdraw the interrupt.
+	 */
+	if (!active && !was_active) {
+		pr_debug("%s: IRQ %d is already inactive\n", __func__, n_IRQ);
+		return;
+	}
+
+	if (active)
+		src->ivpr |= IVPR_ACTIVITY_MASK;
+	else
+		src->ivpr &= ~IVPR_ACTIVITY_MASK;
+
+	if (src->destmask == 0) {
+		/* No target */
+		pr_debug("%s: IRQ %d has no target\n", __func__, n_IRQ);
+		return;
+	}
+
+	if (src->destmask == (1 << src->last_cpu)) {
+		/* Only one CPU is allowed to receive this IRQ */
+		IRQ_local_pipe(opp, src->last_cpu, n_IRQ, active, was_active);
+	} else if (!(src->ivpr & IVPR_MODE_MASK)) {
+		/* Directed delivery mode */
+		for (i = 0; i < opp->nb_cpus; i++) {
+			if (src->destmask & (1 << i)) {
+				IRQ_local_pipe(opp, i, n_IRQ, active,
+					       was_active);
+			}
+		}
+	} else {
+		/* Distributed delivery mode */
+		for (i = src->last_cpu + 1; i != src->last_cpu; i++) {
+			if (i == opp->nb_cpus)
+				i = 0;
+
+			if (src->destmask & (1 << i)) {
+				IRQ_local_pipe(opp, i, n_IRQ, active,
+					       was_active);
+				src->last_cpu = i;
+				break;
+			}
+		}
+	}
+}
+
+static void openpic_set_irq(void *opaque, int n_IRQ, int level)
+{
+	struct openpic *opp = opaque;
+	struct irq_source *src;
+
+	if (n_IRQ >= MAX_IRQ) {
+		WARN_ONCE(1, "%s: IRQ %d out of range\n", __func__, n_IRQ);
+		return;
+	}
+
+	src = &opp->src[n_IRQ];
+	pr_debug("openpic: set irq %d = %d ivpr=0x%08x\n",
+		n_IRQ, level, src->ivpr);
+	if (src->level) {
+		/* level-sensitive irq */
+		src->pending = level;
+		openpic_update_irq(opp, n_IRQ);
+	} else {
+		/* edge-sensitive irq */
+		if (level) {
+			src->pending = 1;
+			openpic_update_irq(opp, n_IRQ);
+		}
+
+		if (src->output != ILR_INTTGT_INT) {
+			/* Edge-triggered interrupts shouldn't be used
+			 * with non-INT delivery, but just in case,
+			 * try to make it do something sane rather than
+			 * cause an interrupt storm.  This is close to
+			 * what you'd probably see happen in real hardware.
+			 */
+			src->pending = 0;
+			openpic_update_irq(opp, n_IRQ);
+		}
+	}
+}
+
+static void openpic_reset(struct openpic *opp)
+{
+	int i;
+
+	opp->gcr = GCR_RESET;
+	/* Initialise controller registers */
+	opp->frr = ((opp->nb_irqs - 1) << FRR_NIRQ_SHIFT) |
+	    (opp->vid << FRR_VID_SHIFT);
+
+	opp->pir = 0;
+	opp->spve = -1 & opp->vector_mask;
+	opp->tfrr = opp->tfrr_reset;
+	/* Initialise IRQ sources */
+	for (i = 0; i < opp->max_irq; i++) {
+		opp->src[i].ivpr = opp->ivpr_reset;
+		opp->src[i].idr = opp->idr_reset;
+
+		switch (opp->src[i].type) {
+		case IRQ_TYPE_NORMAL:
+			opp->src[i].level =
+			    !!(opp->ivpr_reset & IVPR_SENSE_MASK);
+			break;
+
+		case IRQ_TYPE_FSLINT:
+			opp->src[i].ivpr |= IVPR_POLARITY_MASK;
+			break;
+
+		case IRQ_TYPE_FSLSPECIAL:
+			break;
+		}
+	}
+	/* Initialise IRQ destinations */
+	for (i = 0; i < MAX_CPU; i++) {
+		opp->dst[i].ctpr = 15;
+		memset(&opp->dst[i].raised, 0, sizeof(struct irq_queue));
+		opp->dst[i].raised.next = -1;
+		memset(&opp->dst[i].servicing, 0, sizeof(struct irq_queue));
+		opp->dst[i].servicing.next = -1;
+	}
+	/* Initialise timers */
+	for (i = 0; i < MAX_TMR; i++) {
+		opp->timers[i].tccr = 0;
+		opp->timers[i].tbcr = TBCR_CI;
+	}
+	/* Go out of RESET state */
+	opp->gcr = 0;
+}
+
+static inline uint32_t read_IRQreg_idr(struct openpic *opp, int n_IRQ)
+{
+	return opp->src[n_IRQ].idr;
+}
+
+static inline uint32_t read_IRQreg_ilr(struct openpic *opp, int n_IRQ)
+{
+	if (opp->flags & OPENPIC_FLAG_ILR)
+		return opp->src[n_IRQ].output;
+
+	return 0xffffffff;
+}
+
+static inline uint32_t read_IRQreg_ivpr(struct openpic *opp, int n_IRQ)
+{
+	return opp->src[n_IRQ].ivpr;
+}
+
+static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ,
+				    uint32_t val)
+{
+	struct irq_source *src = &opp->src[n_IRQ];
+	uint32_t normal_mask = (1UL << opp->nb_cpus) - 1;
+	uint32_t crit_mask = 0;
+	uint32_t mask = normal_mask;
+	int crit_shift = IDR_EP_SHIFT - opp->nb_cpus;
+	int i;
+
+	if (opp->flags & OPENPIC_FLAG_IDR_CRIT) {
+		crit_mask = mask << crit_shift;
+		mask |= crit_mask | IDR_EP;
+	}
+
+	src->idr = val & mask;
+	pr_debug("Set IDR %d to 0x%08x\n", n_IRQ, src->idr);
+
+	if (opp->flags & OPENPIC_FLAG_IDR_CRIT) {
+		if (src->idr & crit_mask) {
+			if (src->idr & normal_mask) {
+				pr_debug("%s: IRQ configured for multiple output types, using critical\n",
+					__func__);
+			}
+
+			src->output = ILR_INTTGT_CINT;
+			src->nomask = true;
+			src->destmask = 0;
+
+			for (i = 0; i < opp->nb_cpus; i++) {
+				int n_ci = IDR_CI0_SHIFT - i;
+
+				if (src->idr & (1UL << n_ci))
+					src->destmask |= 1UL << i;
+			}
+		} else {
+			src->output = ILR_INTTGT_INT;
+			src->nomask = false;
+			src->destmask = src->idr & normal_mask;
+		}
+	} else {
+		src->destmask = src->idr;
+	}
+}
+
+static inline void write_IRQreg_ilr(struct openpic *opp, int n_IRQ,
+				    uint32_t val)
+{
+	if (opp->flags & OPENPIC_FLAG_ILR) {
+		struct irq_source *src = &opp->src[n_IRQ];
+
+		src->output = val & ILR_INTTGT_MASK;
+		pr_debug("Set ILR %d to 0x%08x, output %d\n", n_IRQ, src->idr,
+			src->output);
+
+		/* TODO: on MPIC v4.0 only, set nomask for non-INT */
+	}
+}
+
+static inline void write_IRQreg_ivpr(struct openpic *opp, int n_IRQ,
+				     uint32_t val)
+{
+	uint32_t mask;
+
+	/* NOTE when implementing newer FSL MPIC models: starting with v4.0,
+	 * the polarity bit is read-only on internal interrupts.
+	 */
+	mask = IVPR_MASK_MASK | IVPR_PRIORITY_MASK | IVPR_SENSE_MASK |
+	    IVPR_POLARITY_MASK | opp->vector_mask;
+
+	/* ACTIVITY bit is read-only */
+	opp->src[n_IRQ].ivpr =
+	    (opp->src[n_IRQ].ivpr & IVPR_ACTIVITY_MASK) | (val & mask);
+
+	/* For FSL internal interrupts, The sense bit is reserved and zero,
+	 * and the interrupt is always level-triggered.  Timers and IPIs
+	 * have no sense or polarity bits, and are edge-triggered.
+	 */
+	switch (opp->src[n_IRQ].type) {
+	case IRQ_TYPE_NORMAL:
+		opp->src[n_IRQ].level =
+		    !!(opp->src[n_IRQ].ivpr & IVPR_SENSE_MASK);
+		break;
+
+	case IRQ_TYPE_FSLINT:
+		opp->src[n_IRQ].ivpr &= ~IVPR_SENSE_MASK;
+		break;
+
+	case IRQ_TYPE_FSLSPECIAL:
+		opp->src[n_IRQ].ivpr &= ~(IVPR_POLARITY_MASK | IVPR_SENSE_MASK);
+		break;
+	}
+
+	openpic_update_irq(opp, n_IRQ);
+	pr_debug("Set IVPR %d to 0x%08x -> 0x%08x\n", n_IRQ, val,
+		opp->src[n_IRQ].ivpr);
+}
+
+static void openpic_gcr_write(struct openpic *opp, uint64_t val)
+{
+	if (val & GCR_RESET) {
+		openpic_reset(opp);
+		return;
+	}
+
+	opp->gcr &= ~opp->mpic_mode_mask;
+	opp->gcr |= val & opp->mpic_mode_mask;
+}
+
+static int openpic_gbl_write(void *opaque, gpa_t addr, u32 val)
+{
+	struct openpic *opp = opaque;
+	int err = 0;
+
+	pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
+	if (addr & 0xF)
+		return 0;
+
+	switch (addr) {
+	case 0x00:	/* Block Revision Register1 (BRR1) is Readonly */
+		break;
+	case 0x40:
+	case 0x50:
+	case 0x60:
+	case 0x70:
+	case 0x80:
+	case 0x90:
+	case 0xA0:
+	case 0xB0:
+		err = openpic_cpu_write_internal(opp, addr, val,
+						 get_current_cpu());
+		break;
+	case 0x1000:		/* FRR */
+		break;
+	case 0x1020:		/* GCR */
+		openpic_gcr_write(opp, val);
+		break;
+	case 0x1080:		/* VIR */
+		break;
+	case 0x1090:		/* PIR */
+		/*
+		 * This register is used to reset a CPU core --
+		 * let userspace handle it.
+		 */
+		err = -ENXIO;
+		break;
+	case 0x10A0:		/* IPI_IVPR */
+	case 0x10B0:
+	case 0x10C0:
+	case 0x10D0: {
+		int idx;
+		idx = (addr - 0x10A0) >> 4;
+		write_IRQreg_ivpr(opp, opp->irq_ipi0 + idx, val);
+		break;
+	}
+	case 0x10E0:		/* SPVE */
+		opp->spve = val & opp->vector_mask;
+		break;
+	default:
+		break;
+	}
+
+	return err;
+}
+
+static int openpic_gbl_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	struct openpic *opp = opaque;
+	u32 retval;
+	int err = 0;
+
+	pr_debug("%s: addr %#llx\n", __func__, addr);
+	retval = 0xFFFFFFFF;
+	if (addr & 0xF)
+		goto out;
+
+	switch (addr) {
+	case 0x1000:		/* FRR */
+		retval = opp->frr;
+		retval |= (opp->nb_cpus - 1) << FRR_NCPU_SHIFT;
+		break;
+	case 0x1020:		/* GCR */
+		retval = opp->gcr;
+		break;
+	case 0x1080:		/* VIR */
+		retval = opp->vir;
+		break;
+	case 0x1090:		/* PIR */
+		retval = 0x00000000;
+		break;
+	case 0x00:		/* Block Revision Register1 (BRR1) */
+		retval = opp->brr1;
+		break;
+	case 0x40:
+	case 0x50:
+	case 0x60:
+	case 0x70:
+	case 0x80:
+	case 0x90:
+	case 0xA0:
+	case 0xB0:
+		err = openpic_cpu_read_internal(opp, addr,
+			&retval, get_current_cpu());
+		break;
+	case 0x10A0:		/* IPI_IVPR */
+	case 0x10B0:
+	case 0x10C0:
+	case 0x10D0:
+		{
+			int idx;
+			idx = (addr - 0x10A0) >> 4;
+			retval = read_IRQreg_ivpr(opp, opp->irq_ipi0 + idx);
+		}
+		break;
+	case 0x10E0:		/* SPVE */
+		retval = opp->spve;
+		break;
+	default:
+		break;
+	}
+
+out:
+	pr_debug("%s: => 0x%08x\n", __func__, retval);
+	*ptr = retval;
+	return err;
+}
+
+static int openpic_tmr_write(void *opaque, gpa_t addr, u32 val)
+{
+	struct openpic *opp = opaque;
+	int idx;
+
+	addr += 0x10f0;
+
+	pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
+	if (addr & 0xF)
+		return 0;
+
+	if (addr == 0x10f0) {
+		/* TFRR */
+		opp->tfrr = val;
+		return 0;
+	}
+
+	idx = (addr >> 6) & 0x3;
+	addr = addr & 0x30;
+
+	switch (addr & 0x30) {
+	case 0x00:		/* TCCR */
+		break;
+	case 0x10:		/* TBCR */
+		if ((opp->timers[idx].tccr & TCCR_TOG) != 0 &&
+		    (val & TBCR_CI) == 0 &&
+		    (opp->timers[idx].tbcr & TBCR_CI) != 0)
+			opp->timers[idx].tccr &= ~TCCR_TOG;
+
+		opp->timers[idx].tbcr = val;
+		break;
+	case 0x20:		/* TVPR */
+		write_IRQreg_ivpr(opp, opp->irq_tim0 + idx, val);
+		break;
+	case 0x30:		/* TDR */
+		write_IRQreg_idr(opp, opp->irq_tim0 + idx, val);
+		break;
+	}
+
+	return 0;
+}
+
+static int openpic_tmr_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	struct openpic *opp = opaque;
+	uint32_t retval = -1;
+	int idx;
+
+	pr_debug("%s: addr %#llx\n", __func__, addr);
+	if (addr & 0xF)
+		goto out;
+
+	idx = (addr >> 6) & 0x3;
+	if (addr == 0x0) {
+		/* TFRR */
+		retval = opp->tfrr;
+		goto out;
+	}
+
+	switch (addr & 0x30) {
+	case 0x00:		/* TCCR */
+		retval = opp->timers[idx].tccr;
+		break;
+	case 0x10:		/* TBCR */
+		retval = opp->timers[idx].tbcr;
+		break;
+	case 0x20:		/* TIPV */
+		retval = read_IRQreg_ivpr(opp, opp->irq_tim0 + idx);
+		break;
+	case 0x30:		/* TIDE (TIDR) */
+		retval = read_IRQreg_idr(opp, opp->irq_tim0 + idx);
+		break;
+	}
+
+out:
+	pr_debug("%s: => 0x%08x\n", __func__, retval);
+	*ptr = retval;
+	return 0;
+}
+
+static int openpic_src_write(void *opaque, gpa_t addr, u32 val)
+{
+	struct openpic *opp = opaque;
+	int idx;
+
+	pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
+
+	addr = addr & 0xffff;
+	idx = addr >> 5;
+
+	switch (addr & 0x1f) {
+	case 0x00:
+		write_IRQreg_ivpr(opp, idx, val);
+		break;
+	case 0x10:
+		write_IRQreg_idr(opp, idx, val);
+		break;
+	case 0x18:
+		write_IRQreg_ilr(opp, idx, val);
+		break;
+	}
+
+	return 0;
+}
+
+static int openpic_src_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	struct openpic *opp = opaque;
+	uint32_t retval;
+	int idx;
+
+	pr_debug("%s: addr %#llx\n", __func__, addr);
+	retval = 0xFFFFFFFF;
+
+	addr = addr & 0xffff;
+	idx = addr >> 5;
+
+	switch (addr & 0x1f) {
+	case 0x00:
+		retval = read_IRQreg_ivpr(opp, idx);
+		break;
+	case 0x10:
+		retval = read_IRQreg_idr(opp, idx);
+		break;
+	case 0x18:
+		retval = read_IRQreg_ilr(opp, idx);
+		break;
+	}
+
+	pr_debug("%s: => 0x%08x\n", __func__, retval);
+	*ptr = retval;
+	return 0;
+}
+
+static int openpic_msi_write(void *opaque, gpa_t addr, u32 val)
+{
+	struct openpic *opp = opaque;
+	int idx = opp->irq_msi;
+	int srs, ibs;
+
+	pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val);
+	if (addr & 0xF)
+		return 0;
+
+	switch (addr) {
+	case MSIIR_OFFSET:
+		srs = val >> MSIIR_SRS_SHIFT;
+		idx += srs;
+		ibs = (val & MSIIR_IBS_MASK) >> MSIIR_IBS_SHIFT;
+		opp->msi[srs].msir |= 1 << ibs;
+		openpic_set_irq(opp, idx, 1);
+		break;
+	default:
+		/* most registers are read-only, thus ignored */
+		break;
+	}
+
+	return 0;
+}
+
+static int openpic_msi_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	struct openpic *opp = opaque;
+	uint32_t r = 0;
+	int i, srs;
+
+	pr_debug("%s: addr %#llx\n", __func__, addr);
+	if (addr & 0xF)
+		return -ENXIO;
+
+	srs = addr >> 4;
+
+	switch (addr) {
+	case 0x00:
+	case 0x10:
+	case 0x20:
+	case 0x30:
+	case 0x40:
+	case 0x50:
+	case 0x60:
+	case 0x70:		/* MSIRs */
+		r = opp->msi[srs].msir;
+		/* Clear on read */
+		opp->msi[srs].msir = 0;
+		openpic_set_irq(opp, opp->irq_msi + srs, 0);
+		break;
+	case 0x120:		/* MSISR */
+		for (i = 0; i < MAX_MSI; i++)
+			r |= (opp->msi[i].msir ? 1 : 0) << i;
+		break;
+	}
+
+	pr_debug("%s: => 0x%08x\n", __func__, r);
+	*ptr = r;
+	return 0;
+}
+
+static int openpic_summary_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	uint32_t r = 0;
+
+	pr_debug("%s: addr %#llx\n", __func__, addr);
+
+	/* TODO: EISR/EIMR */
+
+	*ptr = r;
+	return 0;
+}
+
+static int openpic_summary_write(void *opaque, gpa_t addr, u32 val)
+{
+	pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val);
+
+	/* TODO: EISR/EIMR */
+	return 0;
+}
+
+static int openpic_cpu_write_internal(void *opaque, gpa_t addr,
+				      u32 val, int idx)
+{
+	struct openpic *opp = opaque;
+	struct irq_source *src;
+	struct irq_dest *dst;
+	int s_IRQ, n_IRQ;
+
+	pr_debug("%s: cpu %d addr %#llx <= 0x%08x\n", __func__, idx,
+		addr, val);
+
+	if (idx < 0)
+		return 0;
+
+	if (addr & 0xF)
+		return 0;
+
+	dst = &opp->dst[idx];
+	addr &= 0xFF0;
+	switch (addr) {
+	case 0x40:		/* IPIDR */
+	case 0x50:
+	case 0x60:
+	case 0x70:
+		idx = (addr - 0x40) >> 4;
+		/* we use IDE as mask which CPUs to deliver the IPI to still. */
+		opp->src[opp->irq_ipi0 + idx].destmask |= val;
+		openpic_set_irq(opp, opp->irq_ipi0 + idx, 1);
+		openpic_set_irq(opp, opp->irq_ipi0 + idx, 0);
+		break;
+	case 0x80:		/* CTPR */
+		dst->ctpr = val & 0x0000000F;
+
+		pr_debug("%s: set CPU %d ctpr to %d, raised %d servicing %d\n",
+			__func__, idx, dst->ctpr, dst->raised.priority,
+			dst->servicing.priority);
+
+		if (dst->raised.priority <= dst->ctpr) {
+			pr_debug("%s: Lower OpenPIC INT output cpu %d due to ctpr\n",
+				__func__, idx);
+			mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
+		} else if (dst->raised.priority > dst->servicing.priority) {
+			pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d\n",
+				__func__, idx, dst->raised.next);
+			mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
+		}
+
+		break;
+	case 0x90:		/* WHOAMI */
+		/* Read-only register */
+		break;
+	case 0xA0:		/* IACK */
+		/* Read-only register */
+		break;
+	case 0xB0: {		/* EOI */
+		int notify_eoi;
+
+		pr_debug("EOI\n");
+		s_IRQ = IRQ_get_next(opp, &dst->servicing);
+
+		if (s_IRQ < 0) {
+			pr_debug("%s: EOI with no interrupt in service\n",
+				__func__);
+			break;
+		}
+
+		IRQ_resetbit(&dst->servicing, s_IRQ);
+		/* Notify listeners that the IRQ is over */
+		notify_eoi = s_IRQ;
+		/* Set up next servicing IRQ */
+		s_IRQ = IRQ_get_next(opp, &dst->servicing);
+		/* Check queued interrupts. */
+		n_IRQ = IRQ_get_next(opp, &dst->raised);
+		src = &opp->src[n_IRQ];
+		if (n_IRQ != -1 &&
+		    (s_IRQ == -1 ||
+		     IVPR_PRIORITY(src->ivpr) > dst->servicing.priority)) {
+			pr_debug("Raise OpenPIC INT output cpu %d irq %d\n",
+				idx, n_IRQ);
+			mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
+		}
+
+		spin_unlock(&opp->lock);
+		kvm_notify_acked_irq(opp->kvm, 0, notify_eoi);
+		spin_lock(&opp->lock);
+
+		break;
+	}
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static int openpic_cpu_write(void *opaque, gpa_t addr, u32 val)
+{
+	struct openpic *opp = opaque;
+
+	return openpic_cpu_write_internal(opp, addr, val,
+					 (addr & 0x1f000) >> 12);
+}
+
+static uint32_t openpic_iack(struct openpic *opp, struct irq_dest *dst,
+			     int cpu)
+{
+	struct irq_source *src;
+	int retval, irq;
+
+	pr_debug("Lower OpenPIC INT output\n");
+	mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
+
+	irq = IRQ_get_next(opp, &dst->raised);
+	pr_debug("IACK: irq=%d\n", irq);
+
+	if (irq == -1)
+		/* No more interrupt pending */
+		return opp->spve;
+
+	src = &opp->src[irq];
+	if (!(src->ivpr & IVPR_ACTIVITY_MASK) ||
+	    !(IVPR_PRIORITY(src->ivpr) > dst->ctpr)) {
+		pr_err("%s: bad raised IRQ %d ctpr %d ivpr 0x%08x\n",
+			__func__, irq, dst->ctpr, src->ivpr);
+		openpic_update_irq(opp, irq);
+		retval = opp->spve;
+	} else {
+		/* IRQ enter servicing state */
+		IRQ_setbit(&dst->servicing, irq);
+		retval = IVPR_VECTOR(opp, src->ivpr);
+	}
+
+	if (!src->level) {
+		/* edge-sensitive IRQ */
+		src->ivpr &= ~IVPR_ACTIVITY_MASK;
+		src->pending = 0;
+		IRQ_resetbit(&dst->raised, irq);
+	}
+
+	if ((irq >= opp->irq_ipi0) && (irq < (opp->irq_ipi0 + MAX_IPI))) {
+		src->destmask &= ~(1 << cpu);
+		if (src->destmask && !src->level) {
+			/* trigger on CPUs that didn't know about it yet */
+			openpic_set_irq(opp, irq, 1);
+			openpic_set_irq(opp, irq, 0);
+			/* if all CPUs knew about it, set active bit again */
+			src->ivpr |= IVPR_ACTIVITY_MASK;
+		}
+	}
+
+	return retval;
+}
+
+void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu)
+{
+	struct openpic *opp = vcpu->arch.mpic;
+	int cpu = vcpu->arch.irq_cpu_id;
+	unsigned long flags;
+
+	spin_lock_irqsave(&opp->lock, flags);
+
+	if ((opp->gcr & opp->mpic_mode_mask) == GCR_MODE_PROXY)
+		kvmppc_set_epr(vcpu, openpic_iack(opp, &opp->dst[cpu], cpu));
+
+	spin_unlock_irqrestore(&opp->lock, flags);
+}
+
+static int openpic_cpu_read_internal(void *opaque, gpa_t addr,
+				     u32 *ptr, int idx)
+{
+	struct openpic *opp = opaque;
+	struct irq_dest *dst;
+	uint32_t retval;
+
+	pr_debug("%s: cpu %d addr %#llx\n", __func__, idx, addr);
+	retval = 0xFFFFFFFF;
+
+	if (idx < 0)
+		goto out;
+
+	if (addr & 0xF)
+		goto out;
+
+	dst = &opp->dst[idx];
+	addr &= 0xFF0;
+	switch (addr) {
+	case 0x80:		/* CTPR */
+		retval = dst->ctpr;
+		break;
+	case 0x90:		/* WHOAMI */
+		retval = idx;
+		break;
+	case 0xA0:		/* IACK */
+		retval = openpic_iack(opp, dst, idx);
+		break;
+	case 0xB0:		/* EOI */
+		retval = 0;
+		break;
+	default:
+		break;
+	}
+	pr_debug("%s: => 0x%08x\n", __func__, retval);
+
+out:
+	*ptr = retval;
+	return 0;
+}
+
+static int openpic_cpu_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	struct openpic *opp = opaque;
+
+	return openpic_cpu_read_internal(opp, addr, ptr,
+					 (addr & 0x1f000) >> 12);
+}
+
+struct mem_reg {
+	int (*read)(void *opaque, gpa_t addr, u32 *ptr);
+	int (*write)(void *opaque, gpa_t addr, u32 val);
+	gpa_t start_addr;
+	int size;
+};
+
+static const struct mem_reg openpic_gbl_mmio = {
+	.write = openpic_gbl_write,
+	.read = openpic_gbl_read,
+	.start_addr = OPENPIC_GLB_REG_START,
+	.size = OPENPIC_GLB_REG_SIZE,
+};
+
+static const struct mem_reg openpic_tmr_mmio = {
+	.write = openpic_tmr_write,
+	.read = openpic_tmr_read,
+	.start_addr = OPENPIC_TMR_REG_START,
+	.size = OPENPIC_TMR_REG_SIZE,
+};
+
+static const struct mem_reg openpic_cpu_mmio = {
+	.write = openpic_cpu_write,
+	.read = openpic_cpu_read,
+	.start_addr = OPENPIC_CPU_REG_START,
+	.size = OPENPIC_CPU_REG_SIZE,
+};
+
+static const struct mem_reg openpic_src_mmio = {
+	.write = openpic_src_write,
+	.read = openpic_src_read,
+	.start_addr = OPENPIC_SRC_REG_START,
+	.size = OPENPIC_SRC_REG_SIZE,
+};
+
+static const struct mem_reg openpic_msi_mmio = {
+	.read = openpic_msi_read,
+	.write = openpic_msi_write,
+	.start_addr = OPENPIC_MSI_REG_START,
+	.size = OPENPIC_MSI_REG_SIZE,
+};
+
+static const struct mem_reg openpic_summary_mmio = {
+	.read = openpic_summary_read,
+	.write = openpic_summary_write,
+	.start_addr = OPENPIC_SUMMARY_REG_START,
+	.size = OPENPIC_SUMMARY_REG_SIZE,
+};
+
+static void add_mmio_region(struct openpic *opp, const struct mem_reg *mr)
+{
+	if (opp->num_mmio_regions >= MAX_MMIO_REGIONS) {
+		WARN(1, "kvm mpic: too many mmio regions\n");
+		return;
+	}
+
+	opp->mmio_regions[opp->num_mmio_regions++] = mr;
+}
+
+static void fsl_common_init(struct openpic *opp)
+{
+	int i;
+	int virq = MAX_SRC;
+
+	add_mmio_region(opp, &openpic_msi_mmio);
+	add_mmio_region(opp, &openpic_summary_mmio);
+
+	opp->vid = VID_REVISION_1_2;
+	opp->vir = VIR_GENERIC;
+	opp->vector_mask = 0xFFFF;
+	opp->tfrr_reset = 0;
+	opp->ivpr_reset = IVPR_MASK_MASK;
+	opp->idr_reset = 1 << 0;
+	opp->max_irq = MAX_IRQ;
+
+	opp->irq_ipi0 = virq;
+	virq += MAX_IPI;
+	opp->irq_tim0 = virq;
+	virq += MAX_TMR;
+
+	BUG_ON(virq > MAX_IRQ);
+
+	opp->irq_msi = 224;
+
+	for (i = 0; i < opp->fsl->max_ext; i++)
+		opp->src[i].level = false;
+
+	/* Internal interrupts, including message and MSI */
+	for (i = 16; i < MAX_SRC; i++) {
+		opp->src[i].type = IRQ_TYPE_FSLINT;
+		opp->src[i].level = true;
+	}
+
+	/* timers and IPIs */
+	for (i = MAX_SRC; i < virq; i++) {
+		opp->src[i].type = IRQ_TYPE_FSLSPECIAL;
+		opp->src[i].level = false;
+	}
+}
+
+static int kvm_mpic_read_internal(struct openpic *opp, gpa_t addr, u32 *ptr)
+{
+	int i;
+
+	for (i = 0; i < opp->num_mmio_regions; i++) {
+		const struct mem_reg *mr = opp->mmio_regions[i];
+
+		if (mr->start_addr > addr || addr >= mr->start_addr + mr->size)
+			continue;
+
+		return mr->read(opp, addr - mr->start_addr, ptr);
+	}
+
+	return -ENXIO;
+}
+
+static int kvm_mpic_write_internal(struct openpic *opp, gpa_t addr, u32 val)
+{
+	int i;
+
+	for (i = 0; i < opp->num_mmio_regions; i++) {
+		const struct mem_reg *mr = opp->mmio_regions[i];
+
+		if (mr->start_addr > addr || addr >= mr->start_addr + mr->size)
+			continue;
+
+		return mr->write(opp, addr - mr->start_addr, val);
+	}
+
+	return -ENXIO;
+}
+
+static int kvm_mpic_read(struct kvm_io_device *this, gpa_t addr,
+			 int len, void *ptr)
+{
+	struct openpic *opp = container_of(this, struct openpic, mmio);
+	int ret;
+	union {
+		u32 val;
+		u8 bytes[4];
+	} u;
+
+	if (addr & (len - 1)) {
+		pr_debug("%s: bad alignment %llx/%d\n",
+			 __func__, addr, len);
+		return -EINVAL;
+	}
+
+	spin_lock_irq(&opp->lock);
+	ret = kvm_mpic_read_internal(opp, addr - opp->reg_base, &u.val);
+	spin_unlock_irq(&opp->lock);
+
+	/*
+	 * Technically only 32-bit accesses are allowed, but be nice to
+	 * people dumping registers a byte at a time -- it works in real
+	 * hardware (reads only, not writes).
+	 */
+	if (len == 4) {
+		*(u32 *)ptr = u.val;
+		pr_debug("%s: addr %llx ret %d len 4 val %x\n",
+			 __func__, addr, ret, u.val);
+	} else if (len == 1) {
+		*(u8 *)ptr = u.bytes[addr & 3];
+		pr_debug("%s: addr %llx ret %d len 1 val %x\n",
+			 __func__, addr, ret, u.bytes[addr & 3]);
+	} else {
+		pr_debug("%s: bad length %d\n", __func__, len);
+		return -EINVAL;
+	}
+
+	return ret;
+}
+
+static int kvm_mpic_write(struct kvm_io_device *this, gpa_t addr,
+			  int len, const void *ptr)
+{
+	struct openpic *opp = container_of(this, struct openpic, mmio);
+	int ret;
+
+	if (len != 4) {
+		pr_debug("%s: bad length %d\n", __func__, len);
+		return -EOPNOTSUPP;
+	}
+	if (addr & 3) {
+		pr_debug("%s: bad alignment %llx/%d\n", __func__, addr, len);
+		return -EOPNOTSUPP;
+	}
+
+	spin_lock_irq(&opp->lock);
+	ret = kvm_mpic_write_internal(opp, addr - opp->reg_base,
+				      *(const u32 *)ptr);
+	spin_unlock_irq(&opp->lock);
+
+	pr_debug("%s: addr %llx ret %d val %x\n",
+		 __func__, addr, ret, *(const u32 *)ptr);
+
+	return ret;
+}
+
+static const struct kvm_io_device_ops mpic_mmio_ops = {
+	.read = kvm_mpic_read,
+	.write = kvm_mpic_write,
+};
+
+static void map_mmio(struct openpic *opp)
+{
+	kvm_iodevice_init(&opp->mmio, &mpic_mmio_ops);
+
+	kvm_io_bus_register_dev(opp->kvm, KVM_MMIO_BUS,
+				opp->reg_base, OPENPIC_REG_SIZE,
+				&opp->mmio);
+}
+
+static void unmap_mmio(struct openpic *opp)
+{
+	kvm_io_bus_unregister_dev(opp->kvm, KVM_MMIO_BUS, &opp->mmio);
+}
+
+static int set_base_addr(struct openpic *opp, struct kvm_device_attr *attr)
+{
+	u64 base;
+
+	if (copy_from_user(&base, (u64 __user *)(long)attr->addr, sizeof(u64)))
+		return -EFAULT;
+
+	if (base & 0x3ffff) {
+		pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx not aligned\n",
+			 __func__, base);
+		return -EINVAL;
+	}
+
+	if (base == opp->reg_base)
+		return 0;
+
+	mutex_lock(&opp->kvm->slots_lock);
+
+	unmap_mmio(opp);
+	opp->reg_base = base;
+
+	pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx\n",
+		 __func__, base);
+
+	if (base == 0)
+		goto out;
+
+	map_mmio(opp);
+
+out:
+	mutex_unlock(&opp->kvm->slots_lock);
+	return 0;
+}
+
+#define ATTR_SET		0
+#define ATTR_GET		1
+
+static int access_reg(struct openpic *opp, gpa_t addr, u32 *val, int type)
+{
+	int ret;
+
+	if (addr & 3)
+		return -ENXIO;
+
+	spin_lock_irq(&opp->lock);
+
+	if (type == ATTR_SET)
+		ret = kvm_mpic_write_internal(opp, addr, *val);
+	else
+		ret = kvm_mpic_read_internal(opp, addr, val);
+
+	spin_unlock_irq(&opp->lock);
+
+	pr_debug("%s: type %d addr %llx val %x\n", __func__, type, addr, *val);
+
+	return ret;
+}
+
+static int mpic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	struct openpic *opp = dev->private;
+	u32 attr32;
+
+	switch (attr->group) {
+	case KVM_DEV_MPIC_GRP_MISC:
+		switch (attr->attr) {
+		case KVM_DEV_MPIC_BASE_ADDR:
+			return set_base_addr(opp, attr);
+		}
+
+		break;
+
+	case KVM_DEV_MPIC_GRP_REGISTER:
+		if (get_user(attr32, (u32 __user *)(long)attr->addr))
+			return -EFAULT;
+
+		return access_reg(opp, attr->attr, &attr32, ATTR_SET);
+
+	case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
+		if (attr->attr > MAX_SRC)
+			return -EINVAL;
+
+		if (get_user(attr32, (u32 __user *)(long)attr->addr))
+			return -EFAULT;
+
+		if (attr32 != 0 && attr32 != 1)
+			return -EINVAL;
+
+		spin_lock_irq(&opp->lock);
+		openpic_set_irq(opp, attr->attr, attr32);
+		spin_unlock_irq(&opp->lock);
+		return 0;
+	}
+
+	return -ENXIO;
+}
+
+static int mpic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	struct openpic *opp = dev->private;
+	u64 attr64;
+	u32 attr32;
+	int ret;
+
+	switch (attr->group) {
+	case KVM_DEV_MPIC_GRP_MISC:
+		switch (attr->attr) {
+		case KVM_DEV_MPIC_BASE_ADDR:
+			mutex_lock(&opp->kvm->slots_lock);
+			attr64 = opp->reg_base;
+			mutex_unlock(&opp->kvm->slots_lock);
+
+			if (copy_to_user((u64 __user *)(long)attr->addr,
+					 &attr64, sizeof(u64)))
+				return -EFAULT;
+
+			return 0;
+		}
+
+		break;
+
+	case KVM_DEV_MPIC_GRP_REGISTER:
+		ret = access_reg(opp, attr->attr, &attr32, ATTR_GET);
+		if (ret)
+			return ret;
+
+		if (put_user(attr32, (u32 __user *)(long)attr->addr))
+			return -EFAULT;
+
+		return 0;
+
+	case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
+		if (attr->attr > MAX_SRC)
+			return -EINVAL;
+
+		spin_lock_irq(&opp->lock);
+		attr32 = opp->src[attr->attr].pending;
+		spin_unlock_irq(&opp->lock);
+
+		if (put_user(attr32, (u32 __user *)(long)attr->addr))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	return -ENXIO;
+}
+
+static int mpic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_MPIC_GRP_MISC:
+		switch (attr->attr) {
+		case KVM_DEV_MPIC_BASE_ADDR:
+			return 0;
+		}
+
+		break;
+
+	case KVM_DEV_MPIC_GRP_REGISTER:
+		return 0;
+
+	case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
+		if (attr->attr > MAX_SRC)
+			break;
+
+		return 0;
+	}
+
+	return -ENXIO;
+}
+
+static void mpic_destroy(struct kvm_device *dev)
+{
+	struct openpic *opp = dev->private;
+
+	dev->kvm->arch.mpic = NULL;
+	kfree(opp);
+}
+
+static int mpic_set_default_irq_routing(struct openpic *opp)
+{
+	struct kvm_irq_routing_entry *routing;
+
+	/* Create a nop default map, so that dereferencing it still works */
+	routing = kzalloc((sizeof(*routing)), GFP_KERNEL);
+	if (!routing)
+		return -ENOMEM;
+
+	kvm_set_irq_routing(opp->kvm, routing, 0, 0);
+
+	kfree(routing);
+	return 0;
+}
+
+static int mpic_create(struct kvm_device *dev, u32 type)
+{
+	struct openpic *opp;
+	int ret;
+
+	/* We only support one MPIC at a time for now */
+	if (dev->kvm->arch.mpic)
+		return -EINVAL;
+
+	opp = kzalloc(sizeof(struct openpic), GFP_KERNEL);
+	if (!opp)
+		return -ENOMEM;
+
+	dev->private = opp;
+	opp->kvm = dev->kvm;
+	opp->dev = dev;
+	opp->model = type;
+	spin_lock_init(&opp->lock);
+
+	add_mmio_region(opp, &openpic_gbl_mmio);
+	add_mmio_region(opp, &openpic_tmr_mmio);
+	add_mmio_region(opp, &openpic_src_mmio);
+	add_mmio_region(opp, &openpic_cpu_mmio);
+
+	switch (opp->model) {
+	case KVM_DEV_TYPE_FSL_MPIC_20:
+		opp->fsl = &fsl_mpic_20;
+		opp->brr1 = 0x00400200;
+		opp->flags |= OPENPIC_FLAG_IDR_CRIT;
+		opp->nb_irqs = 80;
+		opp->mpic_mode_mask = GCR_MODE_MIXED;
+
+		fsl_common_init(opp);
+
+		break;
+
+	case KVM_DEV_TYPE_FSL_MPIC_42:
+		opp->fsl = &fsl_mpic_42;
+		opp->brr1 = 0x00400402;
+		opp->flags |= OPENPIC_FLAG_ILR;
+		opp->nb_irqs = 196;
+		opp->mpic_mode_mask = GCR_MODE_PROXY;
+
+		fsl_common_init(opp);
+
+		break;
+
+	default:
+		ret = -ENODEV;
+		goto err;
+	}
+
+	ret = mpic_set_default_irq_routing(opp);
+	if (ret)
+		goto err;
+
+	openpic_reset(opp);
+
+	smp_wmb();
+	dev->kvm->arch.mpic = opp;
+
+	return 0;
+
+err:
+	kfree(opp);
+	return ret;
+}
+
+struct kvm_device_ops kvm_mpic_ops = {
+	.name = "kvm-mpic",
+	.create = mpic_create,
+	.destroy = mpic_destroy,
+	.set_attr = mpic_set_attr,
+	.get_attr = mpic_get_attr,
+	.has_attr = mpic_has_attr,
+};
+
+int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
+			     u32 cpu)
+{
+	struct openpic *opp = dev->private;
+	int ret = 0;
+
+	if (dev->ops != &kvm_mpic_ops)
+		return -EPERM;
+	if (opp->kvm != vcpu->kvm)
+		return -EPERM;
+	if (cpu < 0 || cpu >= MAX_CPU)
+		return -EPERM;
+
+	spin_lock_irq(&opp->lock);
+
+	if (opp->dst[cpu].vcpu) {
+		ret = -EEXIST;
+		goto out;
+	}
+	if (vcpu->arch.irq_type) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	opp->dst[cpu].vcpu = vcpu;
+	opp->nb_cpus = max(opp->nb_cpus, cpu + 1);
+
+	vcpu->arch.mpic = opp;
+	vcpu->arch.irq_cpu_id = cpu;
+	vcpu->arch.irq_type = KVMPPC_IRQ_MPIC;
+
+	/* This might need to be changed if GCR gets extended */
+	if (opp->mpic_mode_mask == GCR_MODE_PROXY)
+		vcpu->arch.epr_flags |= KVMPPC_EPR_KERNEL;
+
+out:
+	spin_unlock_irq(&opp->lock);
+	return ret;
+}
+
+/*
+ * This should only happen immediately before the mpic is destroyed,
+ * so we shouldn't need to worry about anything still trying to
+ * access the vcpu pointer.
+ */
+void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu)
+{
+	BUG_ON(!opp->dst[vcpu->arch.irq_cpu_id].vcpu);
+
+	opp->dst[vcpu->arch.irq_cpu_id].vcpu = NULL;
+}
+
+/*
+ * Return value:
+ *  < 0   Interrupt was ignored (masked or not delivered for other reasons)
+ *  = 0   Interrupt was coalesced (previous irq is still pending)
+ *  > 0   Number of CPUs interrupt was delivered to
+ */
+static int mpic_set_irq(struct kvm_kernel_irq_routing_entry *e,
+			struct kvm *kvm, int irq_source_id, int level,
+			bool line_status)
+{
+	u32 irq = e->irqchip.pin;
+	struct openpic *opp = kvm->arch.mpic;
+	unsigned long flags;
+
+	spin_lock_irqsave(&opp->lock, flags);
+	openpic_set_irq(opp, irq, level);
+	spin_unlock_irqrestore(&opp->lock, flags);
+
+	/* All code paths we care about don't check for the return value */
+	return 0;
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+		struct kvm *kvm, int irq_source_id, int level, bool line_status)
+{
+	struct openpic *opp = kvm->arch.mpic;
+	unsigned long flags;
+
+	spin_lock_irqsave(&opp->lock, flags);
+
+	/*
+	 * XXX We ignore the target address for now, as we only support
+	 *     a single MSI bank.
+	 */
+	openpic_msi_write(kvm->arch.mpic, MSIIR_OFFSET, e->msi.data);
+	spin_unlock_irqrestore(&opp->lock, flags);
+
+	/* All code paths we care about don't check for the return value */
+	return 0;
+}
+
+int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
+			  struct kvm_kernel_irq_routing_entry *e,
+			  const struct kvm_irq_routing_entry *ue)
+{
+	int r = -EINVAL;
+
+	switch (ue->type) {
+	case KVM_IRQ_ROUTING_IRQCHIP:
+		e->set = mpic_set_irq;
+		e->irqchip.irqchip = ue->u.irqchip.irqchip;
+		e->irqchip.pin = ue->u.irqchip.pin;
+		if (e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS)
+			goto out;
+		rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi;
+		break;
+	case KVM_IRQ_ROUTING_MSI:
+		e->set = kvm_set_msi;
+		e->msi.address_lo = ue->u.msi.address_lo;
+		e->msi.address_hi = ue->u.msi.address_hi;
+		e->msi.data = ue->u.msi.data;
+		break;
+	default:
+		goto out;
+	}
+
+	r = 0;
+out:
+	return r;
+}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 934413cd3a1b..6316ee336e88 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -25,6 +25,7 @@
 #include <linux/hrtimer.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
+#include <linux/file.h>
 #include <asm/cputable.h>
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
@@ -32,6 +33,7 @@
 #include <asm/cputhreads.h>
 #include <asm/irqflags.h>
 #include "timing.h"
+#include "irq.h"
 #include "../mm/mmu_decl.h"
 
 #define CREATE_TRACE_POINTS
@@ -317,6 +319,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_ENABLE_CAP:
 	case KVM_CAP_ONE_REG:
 	case KVM_CAP_IOEVENTFD:
+	case KVM_CAP_DEVICE_CTRL:
 		r = 1;
 		break;
 #ifndef CONFIG_KVM_BOOK3S_64_HV
@@ -326,6 +329,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
 	case KVM_CAP_SW_TLB:
 #endif
+#ifdef CONFIG_KVM_MPIC
+	case KVM_CAP_IRQ_MPIC:
+#endif
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -335,6 +341,10 @@ int kvm_dev_ioctl_check_extension(long ext)
 #ifdef CONFIG_PPC_BOOK3S_64
 	case KVM_CAP_SPAPR_TCE:
 	case KVM_CAP_PPC_ALLOC_HTAB:
+	case KVM_CAP_PPC_RTAS:
+#ifdef CONFIG_KVM_XICS
+	case KVM_CAP_IRQ_XICS:
+#endif
 		r = 1;
 		break;
 #endif /* CONFIG_PPC_BOOK3S_64 */
@@ -411,18 +421,17 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 }
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
-                                   struct kvm_memory_slot *memslot,
-                                   struct kvm_memory_slot old,
-                                   struct kvm_userspace_memory_region *mem,
-                                   bool user_alloc)
+				   struct kvm_memory_slot *memslot,
+				   struct kvm_userspace_memory_region *mem,
+				   enum kvm_mr_change change)
 {
 	return kvmppc_core_prepare_memory_region(kvm, memslot, mem);
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-               struct kvm_userspace_memory_region *mem,
-               struct kvm_memory_slot old,
-               bool user_alloc)
+				   struct kvm_userspace_memory_region *mem,
+				   const struct kvm_memory_slot *old,
+				   enum kvm_mr_change change)
 {
 	kvmppc_core_commit_memory_region(kvm, mem, old);
 }
@@ -460,6 +469,16 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 	tasklet_kill(&vcpu->arch.tasklet);
 
 	kvmppc_remove_vcpu_debugfs(vcpu);
+
+	switch (vcpu->arch.irq_type) {
+	case KVMPPC_IRQ_MPIC:
+		kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu);
+		break;
+	case KVMPPC_IRQ_XICS:
+		kvmppc_xics_free_icp(vcpu);
+		break;
+	}
+
 	kvmppc_core_vcpu_free(vcpu);
 }
 
@@ -532,12 +551,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 #endif
 }
 
-int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
-                                        struct kvm_guest_debug *dbg)
-{
-	return -EINVAL;
-}
-
 static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
                                      struct kvm_run *run)
 {
@@ -612,6 +625,8 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
 int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
                        unsigned int rt, unsigned int bytes, int is_bigendian)
 {
+	int idx, ret;
+
 	if (bytes > sizeof(run->mmio.data)) {
 		printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__,
 		       run->mmio.len);
@@ -627,8 +642,14 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	vcpu->mmio_is_write = 0;
 	vcpu->arch.mmio_sign_extend = 0;
 
-	if (!kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
-			     bytes, &run->mmio.data)) {
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	ret = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
+			      bytes, &run->mmio.data);
+
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+	if (!ret) {
 		kvmppc_complete_mmio_load(vcpu, run);
 		vcpu->mmio_needed = 0;
 		return EMULATE_DONE;
@@ -653,6 +674,7 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
                         u64 val, unsigned int bytes, int is_bigendian)
 {
 	void *data = run->mmio.data;
+	int idx, ret;
 
 	if (bytes > sizeof(run->mmio.data)) {
 		printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__,
@@ -682,9 +704,14 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		}
 	}
 
-	if (!kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
-			      bytes, &run->mmio.data)) {
-		kvmppc_complete_mmio_load(vcpu, run);
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	ret = kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
+			       bytes, &run->mmio.data);
+
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+	if (!ret) {
 		vcpu->mmio_needed = 0;
 		return EMULATE_DONE;
 	}
@@ -740,7 +767,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
 {
 	if (irq->irq == KVM_INTERRUPT_UNSET) {
-		kvmppc_core_dequeue_external(vcpu, irq);
+		kvmppc_core_dequeue_external(vcpu);
 		return 0;
 	}
 
@@ -770,7 +797,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 		break;
 	case KVM_CAP_PPC_EPR:
 		r = 0;
-		vcpu->arch.epr_enabled = cap->args[0];
+		if (cap->args[0])
+			vcpu->arch.epr_flags |= KVMPPC_EPR_USER;
+		else
+			vcpu->arch.epr_flags &= ~KVMPPC_EPR_USER;
 		break;
 #ifdef CONFIG_BOOKE
 	case KVM_CAP_PPC_BOOKE_WATCHDOG:
@@ -791,6 +821,44 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 		break;
 	}
 #endif
+#ifdef CONFIG_KVM_MPIC
+	case KVM_CAP_IRQ_MPIC: {
+		struct file *filp;
+		struct kvm_device *dev;
+
+		r = -EBADF;
+		filp = fget(cap->args[0]);
+		if (!filp)
+			break;
+
+		r = -EPERM;
+		dev = kvm_device_from_filp(filp);
+		if (dev)
+			r = kvmppc_mpic_connect_vcpu(dev, vcpu, cap->args[1]);
+
+		fput(filp);
+		break;
+	}
+#endif
+#ifdef CONFIG_KVM_XICS
+	case KVM_CAP_IRQ_XICS: {
+		struct file *filp;
+		struct kvm_device *dev;
+
+		r = -EBADF;
+		filp = fget(cap->args[0]);
+		if (!filp)
+			break;
+
+		r = -EPERM;
+		dev = kvm_device_from_filp(filp);
+		if (dev)
+			r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]);
+
+		fput(filp);
+		break;
+	}
+#endif /* CONFIG_KVM_XICS */
 	default:
 		r = -EINVAL;
 		break;
@@ -913,9 +981,22 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
 	return 0;
 }
 
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+			  bool line_status)
+{
+	if (!irqchip_in_kernel(kvm))
+		return -ENXIO;
+
+	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+					irq_event->irq, irq_event->level,
+					line_status);
+	return 0;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
                        unsigned int ioctl, unsigned long arg)
 {
+	struct kvm *kvm __maybe_unused = filp->private_data;
 	void __user *argp = (void __user *)arg;
 	long r;
 
@@ -934,7 +1015,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 #ifdef CONFIG_PPC_BOOK3S_64
 	case KVM_CREATE_SPAPR_TCE: {
 		struct kvm_create_spapr_tce create_tce;
-		struct kvm *kvm = filp->private_data;
 
 		r = -EFAULT;
 		if (copy_from_user(&create_tce, argp, sizeof(create_tce)))
@@ -946,8 +1026,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	case KVM_ALLOCATE_RMA: {
-		struct kvm *kvm = filp->private_data;
 		struct kvm_allocate_rma rma;
+		struct kvm *kvm = filp->private_data;
 
 		r = kvm_vm_ioctl_allocate_rma(kvm, &rma);
 		if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma)))
@@ -956,7 +1036,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	}
 
 	case KVM_PPC_ALLOCATE_HTAB: {
-		struct kvm *kvm = filp->private_data;
 		u32 htab_order;
 
 		r = -EFAULT;
@@ -973,7 +1052,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	}
 
 	case KVM_PPC_GET_HTAB_FD: {
-		struct kvm *kvm = filp->private_data;
 		struct kvm_get_htab_fd ghf;
 
 		r = -EFAULT;
@@ -986,7 +1064,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 #ifdef CONFIG_PPC_BOOK3S_64
 	case KVM_PPC_GET_SMMU_INFO: {
-		struct kvm *kvm = filp->private_data;
 		struct kvm_ppc_smmu_info info;
 
 		memset(&info, 0, sizeof(info));
@@ -995,6 +1072,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			r = -EFAULT;
 		break;
 	}
+	case KVM_PPC_RTAS_DEFINE_TOKEN: {
+		struct kvm *kvm = filp->private_data;
+
+		r = kvm_vm_ioctl_rtas_define_token(kvm, argp);
+		break;
+	}
 #endif /* CONFIG_PPC_BOOK3S_64 */
 	default:
 		r = -ENOTTY;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 229951ffc351..8726779e1409 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -32,6 +32,7 @@
 #include <linux/perf_event.h>
 #include <linux/magic.h>
 #include <linux/ratelimit.h>
+#include <linux/context_tracking.h>
 
 #include <asm/firmware.h>
 #include <asm/page.h>
@@ -196,6 +197,7 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
 int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 			    unsigned long error_code)
 {
+	enum ctx_state prev_state = exception_enter();
 	struct vm_area_struct * vma;
 	struct mm_struct *mm = current->mm;
 	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
@@ -204,6 +206,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 	int trap = TRAP(regs);
  	int is_exec = trap == 0x400;
 	int fault;
+	int rc = 0;
 
 #if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
 	/*
@@ -230,28 +233,30 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 	 * look at it
 	 */
 	if (error_code & ICSWX_DSI_UCT) {
-		int rc = acop_handle_fault(regs, address, error_code);
+		rc = acop_handle_fault(regs, address, error_code);
 		if (rc)
-			return rc;
+			goto bail;
 	}
 #endif /* CONFIG_PPC_ICSWX */
 
 	if (notify_page_fault(regs))
-		return 0;
+		goto bail;
 
 	if (unlikely(debugger_fault_handler(regs)))
-		return 0;
+		goto bail;
 
 	/* On a kernel SLB miss we can only check for a valid exception entry */
-	if (!user_mode(regs) && (address >= TASK_SIZE))
-		return SIGSEGV;
+	if (!user_mode(regs) && (address >= TASK_SIZE)) {
+		rc = SIGSEGV;
+		goto bail;
+	}
 
 #if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \
 			     defined(CONFIG_PPC_BOOK3S_64))
   	if (error_code & DSISR_DABRMATCH) {
 		/* breakpoint match */
 		do_break(regs, address, error_code);
-		return 0;
+		goto bail;
 	}
 #endif
 
@@ -260,8 +265,10 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		local_irq_enable();
 
 	if (in_atomic() || mm == NULL) {
-		if (!user_mode(regs))
-			return SIGSEGV;
+		if (!user_mode(regs)) {
+			rc = SIGSEGV;
+			goto bail;
+		}
 		/* in_atomic() in user mode is really bad,
 		   as is current->mm == NULL. */
 		printk(KERN_EMERG "Page fault in user mode with "
@@ -417,9 +424,11 @@ good_area:
 	 */
 	fault = handle_mm_fault(mm, vma, address, flags);
 	if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
-		int rc = mm_fault_error(regs, address, fault);
+		rc = mm_fault_error(regs, address, fault);
 		if (rc >= MM_FAULT_RETURN)
-			return rc;
+			goto bail;
+		else
+			rc = 0;
 	}
 
 	/*
@@ -454,7 +463,7 @@ good_area:
 	}
 
 	up_read(&mm->mmap_sem);
-	return 0;
+	goto bail;
 
 bad_area:
 	up_read(&mm->mmap_sem);
@@ -463,7 +472,7 @@ bad_area_nosemaphore:
 	/* User mode accesses cause a SIGSEGV */
 	if (user_mode(regs)) {
 		_exception(SIGSEGV, regs, code, address);
-		return 0;
+		goto bail;
 	}
 
 	if (is_exec && (error_code & DSISR_PROTFAULT))
@@ -471,7 +480,11 @@ bad_area_nosemaphore:
 				   " page (%lx) - exploit attempt? (uid: %d)\n",
 				   address, from_kuid(&init_user_ns, current_uid()));
 
-	return SIGSEGV;
+	rc = SIGSEGV;
+
+bail:
+	exception_exit(prev_state);
+	return rc;
 
 }
 
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 3e4c4ed19335..e303a6d74e3a 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -33,6 +33,7 @@
 #include <linux/init.h>
 #include <linux/signal.h>
 #include <linux/memblock.h>
+#include <linux/context_tracking.h>
 
 #include <asm/processor.h>
 #include <asm/pgtable.h>
@@ -954,6 +955,7 @@ void hash_failure_debug(unsigned long ea, unsigned long access,
  */
 int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 {
+	enum ctx_state prev_state = exception_enter();
 	pgd_t *pgdir;
 	unsigned long vsid;
 	struct mm_struct *mm;
@@ -973,7 +975,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 		mm = current->mm;
 		if (! mm) {
 			DBG_LOW(" user region with no mm !\n");
-			return 1;
+			rc = 1;
+			goto bail;
 		}
 		psize = get_slice_psize(mm, ea);
 		ssize = user_segment_size(ea);
@@ -992,19 +995,23 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 		/* Not a valid range
 		 * Send the problem up to do_page_fault 
 		 */
-		return 1;
+		rc = 1;
+		goto bail;
 	}
 	DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid);
 
 	/* Bad address. */
 	if (!vsid) {
 		DBG_LOW("Bad address!\n");
-		return 1;
+		rc = 1;
+		goto bail;
 	}
 	/* Get pgdir */
 	pgdir = mm->pgd;
-	if (pgdir == NULL)
-		return 1;
+	if (pgdir == NULL) {
+		rc = 1;
+		goto bail;
+	}
 
 	/* Check CPU locality */
 	tmp = cpumask_of(smp_processor_id());
@@ -1027,7 +1034,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 	ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
 	if (ptep == NULL || !pte_present(*ptep)) {
 		DBG_LOW(" no PTE !\n");
-		return 1;
+		rc = 1;
+		goto bail;
 	}
 
 	/* Add _PAGE_PRESENT to the required access perm */
@@ -1038,13 +1046,16 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 	 */
 	if (access & ~pte_val(*ptep)) {
 		DBG_LOW(" no access !\n");
-		return 1;
+		rc = 1;
+		goto bail;
 	}
 
 #ifdef CONFIG_HUGETLB_PAGE
-	if (hugeshift)
-		return __hash_page_huge(ea, access, vsid, ptep, trap, local,
+	if (hugeshift) {
+		rc = __hash_page_huge(ea, access, vsid, ptep, trap, local,
 					ssize, hugeshift, psize);
+		goto bail;
+	}
 #endif /* CONFIG_HUGETLB_PAGE */
 
 #ifndef CONFIG_PPC_64K_PAGES
@@ -1124,6 +1135,9 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 		pte_val(*(ptep + PTRS_PER_PTE)));
 #endif
 	DBG_LOW(" -> rc=%d\n", rc);
+
+bail:
+	exception_exit(prev_state);
 	return rc;
 }
 EXPORT_SYMBOL_GPL(hash_page);
@@ -1230,6 +1244,7 @@ void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
 	 * unmapping it first, it may see the speculated version.
 	 */
 	if (local && cpu_has_feature(CPU_FTR_TM) &&
+	    current->thread.regs &&
 	    MSR_TM_ACTIVE(current->thread.regs->msr)) {
 		tm_enable();
 		tm_abort(TM_CAUSE_TLBI);
@@ -1258,6 +1273,8 @@ void flush_hash_range(unsigned long number, int local)
  */
 void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
 {
+	enum ctx_state prev_state = exception_enter();
+
 	if (user_mode(regs)) {
 #ifdef CONFIG_PPC_SUBPAGE_PROT
 		if (rc == -2)
@@ -1267,6 +1284,8 @@ void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
 			_exception(SIGBUS, regs, BUS_ADRERR, address);
 	} else
 		bad_page_fault(regs, address, SIGBUS);
+
+	exception_exit(prev_state);
 }
 
 long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index c2787bf779ca..a90b9c458990 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -215,7 +215,8 @@ static void __meminit vmemmap_create_mapping(unsigned long start,
 					     unsigned long phys)
 {
 	int  mapped = htab_bolt_mapping(start, start + page_size, phys,
-					PAGE_KERNEL, mmu_vmemmap_psize,
+					pgprot_val(PAGE_KERNEL),
+					mmu_vmemmap_psize,
 					mmu_kernel_ssize);
 	BUG_ON(mapped < 0);
 }
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index c627843c5b2e..426180b84978 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -13,11 +13,13 @@
 #include <linux/perf_event.h>
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
+#include <linux/uaccess.h>
 #include <asm/reg.h>
 #include <asm/pmc.h>
 #include <asm/machdep.h>
 #include <asm/firmware.h>
 #include <asm/ptrace.h>
+#include <asm/code-patching.h>
 
 #define BHRB_MAX_ENTRIES	32
 #define BHRB_TARGET		0x0000000000000002
@@ -100,6 +102,10 @@ static inline int siar_valid(struct pt_regs *regs)
 	return 1;
 }
 
+static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
+static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
+void power_pmu_flush_branch_stack(void) {}
+static inline void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) {}
 #endif /* CONFIG_PPC32 */
 
 static bool regs_use_siar(struct pt_regs *regs)
@@ -308,6 +314,159 @@ static inline int siar_valid(struct pt_regs *regs)
 	return 1;
 }
 
+
+/* Reset all possible BHRB entries */
+static void power_pmu_bhrb_reset(void)
+{
+	asm volatile(PPC_CLRBHRB);
+}
+
+static void power_pmu_bhrb_enable(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
+
+	if (!ppmu->bhrb_nr)
+		return;
+
+	/* Clear BHRB if we changed task context to avoid data leaks */
+	if (event->ctx->task && cpuhw->bhrb_context != event->ctx) {
+		power_pmu_bhrb_reset();
+		cpuhw->bhrb_context = event->ctx;
+	}
+	cpuhw->bhrb_users++;
+}
+
+static void power_pmu_bhrb_disable(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
+
+	if (!ppmu->bhrb_nr)
+		return;
+
+	cpuhw->bhrb_users--;
+	WARN_ON_ONCE(cpuhw->bhrb_users < 0);
+
+	if (!cpuhw->disabled && !cpuhw->bhrb_users) {
+		/* BHRB cannot be turned off when other
+		 * events are active on the PMU.
+		 */
+
+		/* avoid stale pointer */
+		cpuhw->bhrb_context = NULL;
+	}
+}
+
+/* Called from ctxsw to prevent one process's branch entries to
+ * mingle with the other process's entries during context switch.
+ */
+void power_pmu_flush_branch_stack(void)
+{
+	if (ppmu->bhrb_nr)
+		power_pmu_bhrb_reset();
+}
+/* Calculate the to address for a branch */
+static __u64 power_pmu_bhrb_to(u64 addr)
+{
+	unsigned int instr;
+	int ret;
+	__u64 target;
+
+	if (is_kernel_addr(addr))
+		return branch_target((unsigned int *)addr);
+
+	/* Userspace: need copy instruction here then translate it */
+	pagefault_disable();
+	ret = __get_user_inatomic(instr, (unsigned int __user *)addr);
+	if (ret) {
+		pagefault_enable();
+		return 0;
+	}
+	pagefault_enable();
+
+	target = branch_target(&instr);
+	if ((!target) || (instr & BRANCH_ABSOLUTE))
+		return target;
+
+	/* Translate relative branch target from kernel to user address */
+	return target - (unsigned long)&instr + addr;
+}
+
+/* Processing BHRB entries */
+void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw)
+{
+	u64 val;
+	u64 addr;
+	int r_index, u_index, pred;
+
+	r_index = 0;
+	u_index = 0;
+	while (r_index < ppmu->bhrb_nr) {
+		/* Assembly read function */
+		val = read_bhrb(r_index++);
+		if (!val)
+			/* Terminal marker: End of valid BHRB entries */
+			break;
+		else {
+			addr = val & BHRB_EA;
+			pred = val & BHRB_PREDICTION;
+
+			if (!addr)
+				/* invalid entry */
+				continue;
+
+			/* Branches are read most recent first (ie. mfbhrb 0 is
+			 * the most recent branch).
+			 * There are two types of valid entries:
+			 * 1) a target entry which is the to address of a
+			 *    computed goto like a blr,bctr,btar.  The next
+			 *    entry read from the bhrb will be branch
+			 *    corresponding to this target (ie. the actual
+			 *    blr/bctr/btar instruction).
+			 * 2) a from address which is an actual branch.  If a
+			 *    target entry proceeds this, then this is the
+			 *    matching branch for that target.  If this is not
+			 *    following a target entry, then this is a branch
+			 *    where the target is given as an immediate field
+			 *    in the instruction (ie. an i or b form branch).
+			 *    In this case we need to read the instruction from
+			 *    memory to determine the target/to address.
+			 */
+
+			if (val & BHRB_TARGET) {
+				/* Target branches use two entries
+				 * (ie. computed gotos/XL form)
+				 */
+				cpuhw->bhrb_entries[u_index].to = addr;
+				cpuhw->bhrb_entries[u_index].mispred = pred;
+				cpuhw->bhrb_entries[u_index].predicted = ~pred;
+
+				/* Get from address in next entry */
+				val = read_bhrb(r_index++);
+				addr = val & BHRB_EA;
+				if (val & BHRB_TARGET) {
+					/* Shouldn't have two targets in a
+					   row.. Reset index and try again */
+					r_index--;
+					addr = 0;
+				}
+				cpuhw->bhrb_entries[u_index].from = addr;
+			} else {
+				/* Branches to immediate field 
+				   (ie I or B form) */
+				cpuhw->bhrb_entries[u_index].from = addr;
+				cpuhw->bhrb_entries[u_index].to =
+					power_pmu_bhrb_to(addr);
+				cpuhw->bhrb_entries[u_index].mispred = pred;
+				cpuhw->bhrb_entries[u_index].predicted = ~pred;
+			}
+			u_index++;
+
+		}
+	}
+	cpuhw->bhrb_stack.nr = u_index;
+	return;
+}
+
 #endif /* CONFIG_PPC64 */
 
 static void perf_event_interrupt(struct pt_regs *regs);
@@ -904,47 +1063,6 @@ static int collect_events(struct perf_event *group, int max_count,
 	return n;
 }
 
-/* Reset all possible BHRB entries */
-static void power_pmu_bhrb_reset(void)
-{
-	asm volatile(PPC_CLRBHRB);
-}
-
-void power_pmu_bhrb_enable(struct perf_event *event)
-{
-	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
-
-	if (!ppmu->bhrb_nr)
-		return;
-
-	/* Clear BHRB if we changed task context to avoid data leaks */
-	if (event->ctx->task && cpuhw->bhrb_context != event->ctx) {
-		power_pmu_bhrb_reset();
-		cpuhw->bhrb_context = event->ctx;
-	}
-	cpuhw->bhrb_users++;
-}
-
-void power_pmu_bhrb_disable(struct perf_event *event)
-{
-	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
-
-	if (!ppmu->bhrb_nr)
-		return;
-
-	cpuhw->bhrb_users--;
-	WARN_ON_ONCE(cpuhw->bhrb_users < 0);
-
-	if (!cpuhw->disabled && !cpuhw->bhrb_users) {
-		/* BHRB cannot be turned off when other
-		 * events are active on the PMU.
-		 */
-
-		/* avoid stale pointer */
-		cpuhw->bhrb_context = NULL;
-	}
-}
-
 /*
  * Add a event to the PMU.
  * If all events are not already frozen, then we disable and
@@ -1180,15 +1298,6 @@ int power_pmu_commit_txn(struct pmu *pmu)
 	return 0;
 }
 
-/* Called from ctxsw to prevent one process's branch entries to
- * mingle with the other process's entries during context switch.
- */
-void power_pmu_flush_branch_stack(void)
-{
-	if (ppmu->bhrb_nr)
-		power_pmu_bhrb_reset();
-}
-
 /*
  * Return 1 if we might be able to put event on a limited PMC,
  * or 0 if not.
@@ -1458,77 +1567,6 @@ struct pmu power_pmu = {
 	.flush_branch_stack = power_pmu_flush_branch_stack,
 };
 
-/* Processing BHRB entries */
-void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw)
-{
-	u64 val;
-	u64 addr;
-	int r_index, u_index, target, pred;
-
-	r_index = 0;
-	u_index = 0;
-	while (r_index < ppmu->bhrb_nr) {
-		/* Assembly read function */
-		val = read_bhrb(r_index);
-
-		/* Terminal marker: End of valid BHRB entries */
-		if (val == 0) {
-			break;
-		} else {
-			/* BHRB field break up */
-			addr = val & BHRB_EA;
-			pred = val & BHRB_PREDICTION;
-			target = val & BHRB_TARGET;
-
-			/* Probable Missed entry: Not applicable for POWER8 */
-			if ((addr == 0) && (target == 0) && (pred == 1)) {
-				r_index++;
-				continue;
-			}
-
-			/* Real Missed entry: Power8 based missed entry */
-			if ((addr == 0) && (target == 1) && (pred == 1)) {
-				r_index++;
-				continue;
-			}
-
-			/* Reserved condition: Not a valid entry  */
-			if ((addr == 0) && (target == 1) && (pred == 0)) {
-				r_index++;
-				continue;
-			}
-
-			/* Is a target address */
-			if (val & BHRB_TARGET) {
-				/* First address cannot be a target address */
-				if (r_index == 0) {
-					r_index++;
-					continue;
-				}
-
-				/* Update target address for the previous entry */
-				cpuhw->bhrb_entries[u_index - 1].to = addr;
-				cpuhw->bhrb_entries[u_index - 1].mispred = pred;
-				cpuhw->bhrb_entries[u_index - 1].predicted = ~pred;
-
-				/* Dont increment u_index */
-				r_index++;
-			} else {
-				/* Update address, flags for current entry */
-				cpuhw->bhrb_entries[u_index].from = addr;
-				cpuhw->bhrb_entries[u_index].mispred = pred;
-				cpuhw->bhrb_entries[u_index].predicted = ~pred;
-
-				/* Successfully popullated one entry */
-				u_index++;
-				r_index++;
-			}
-		}
-	}
-	cpuhw->bhrb_stack.nr = u_index;
-	return;
-}
-
 /*
  * A counter has overflowed; update its count and record
  * things if requested.  Note that interrupts are hard-disabled
diff --git a/arch/powerpc/platforms/40x/Kconfig b/arch/powerpc/platforms/40x/Kconfig
index bd40bbb15e14..6e287f1294fa 100644
--- a/arch/powerpc/platforms/40x/Kconfig
+++ b/arch/powerpc/platforms/40x/Kconfig
@@ -138,7 +138,6 @@ config PPC4xx_GPIO
 	bool "PPC4xx GPIO support"
 	depends on 40x
 	select ARCH_REQUIRE_GPIOLIB
-	select GENERIC_GPIO
 	help
 	  Enable gpiolib support for ppc40x based boards
 
diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig
index 7be93367d92f..d6c7506ec7d9 100644
--- a/arch/powerpc/platforms/44x/Kconfig
+++ b/arch/powerpc/platforms/44x/Kconfig
@@ -248,7 +248,6 @@ config PPC4xx_GPIO
 	bool "PPC4xx GPIO support"
 	depends on 44x
 	select ARCH_REQUIRE_GPIOLIB
-	select GENERIC_GPIO
 	help
 	  Enable gpiolib support for ppc440 based boards
 
diff --git a/arch/powerpc/platforms/85xx/Kconfig b/arch/powerpc/platforms/85xx/Kconfig
index 8f02b05f4c96..efdd37c775ad 100644
--- a/arch/powerpc/platforms/85xx/Kconfig
+++ b/arch/powerpc/platforms/85xx/Kconfig
@@ -203,7 +203,6 @@ config GE_IMP3A
 	select DEFAULT_UIMAGE
 	select SWIOTLB
 	select MMIO_NVRAM
-	select GENERIC_GPIO
 	select ARCH_REQUIRE_GPIOLIB
 	select GE_FPGA
 	help
@@ -328,7 +327,7 @@ config B4_QDS
 	select PPC_E500MC
 	select PHYS_64BIT
 	select SWIOTLB
-	select GENERIC_GPIO
+	select GPIOLIB
 	select ARCH_REQUIRE_GPIOLIB
 	select HAS_RAPIDIO
 	select PPC_EPAPR_HV_PIC
diff --git a/arch/powerpc/platforms/86xx/Kconfig b/arch/powerpc/platforms/86xx/Kconfig
index 7a6279e38213..1afd1e4a2dd2 100644
--- a/arch/powerpc/platforms/86xx/Kconfig
+++ b/arch/powerpc/platforms/86xx/Kconfig
@@ -37,7 +37,6 @@ config GEF_PPC9A
 	bool "GE PPC9A"
 	select DEFAULT_UIMAGE
 	select MMIO_NVRAM
-	select GENERIC_GPIO
 	select ARCH_REQUIRE_GPIOLIB
 	select GE_FPGA
 	help
@@ -47,7 +46,6 @@ config GEF_SBC310
 	bool "GE SBC310"
 	select DEFAULT_UIMAGE
 	select MMIO_NVRAM
-	select GENERIC_GPIO
 	select ARCH_REQUIRE_GPIOLIB
 	select GE_FPGA
 	help
@@ -57,7 +55,6 @@ config GEF_SBC610
 	bool "GE SBC610"
 	select DEFAULT_UIMAGE
 	select MMIO_NVRAM
-	select GENERIC_GPIO
 	select ARCH_REQUIRE_GPIOLIB
 	select GE_FPGA
 	select HAS_RAPIDIO
diff --git a/arch/powerpc/platforms/8xx/Kconfig b/arch/powerpc/platforms/8xx/Kconfig
index 1fb0b3cddeb3..8dec3c0911ad 100644
--- a/arch/powerpc/platforms/8xx/Kconfig
+++ b/arch/powerpc/platforms/8xx/Kconfig
@@ -114,7 +114,6 @@ config 8xx_COPYBACK
 
 config 8xx_GPIO
 	bool "GPIO API Support"
-	select GENERIC_GPIO
 	select ARCH_REQUIRE_GPIOLIB
 	help
 	  Saying Y here will cause the ports on an MPC8xx processor to be used
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index 34d224be93ba..b62aab3e22ec 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -128,7 +128,7 @@ config PPC_RTAS_DAEMON
 
 config RTAS_PROC
 	bool "Proc interface to RTAS"
-	depends on PPC_RTAS
+	depends on PPC_RTAS && PROC_FS
 	default y
 
 config RTAS_FLASH
@@ -302,7 +302,6 @@ config QUICC_ENGINE
 config QE_GPIO
 	bool "QE GPIO support"
 	depends on QUICC_ENGINE
-	select GENERIC_GPIO
 	select ARCH_REQUIRE_GPIOLIB
 	help
 	  Say Y here if you're going to use hardware that connects to the
@@ -315,7 +314,6 @@ config CPM2
 	select PPC_LIB_RHEAP
 	select PPC_PCI_CHOICE
 	select ARCH_REQUIRE_GPIOLIB
-	select GENERIC_GPIO
 	help
 	  The CPM2 (Communications Processor Module) is a coprocessor on
 	  embedded CPUs made by Freescale.  Selecting this option means that
@@ -353,7 +351,6 @@ config OF_RTC
 config SIMPLE_GPIO
 	bool "Support for simple, memory-mapped GPIO controllers"
 	depends on PPC
-	select GENERIC_GPIO
 	select ARCH_REQUIRE_GPIOLIB
 	help
 	  Say Y here to support simple, memory-mapped GPIO controllers.
@@ -364,7 +361,6 @@ config SIMPLE_GPIO
 config MCU_MPC8349EMITX
 	bool "MPC8349E-mITX MCU driver"
 	depends on I2C=y && PPC_83xx
-	select GENERIC_GPIO
 	select ARCH_REQUIRE_GPIOLIB
 	help
 	  Say Y here to enable soft power-off functionality on the Freescale
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index e56bb651da1a..946306b1bb4e 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -550,7 +550,7 @@ static struct iommu_table *cell_get_iommu_table(struct device *dev)
 	 */
 	iommu = cell_iommu_for_node(dev_to_node(dev));
 	if (iommu == NULL || list_empty(&iommu->windows)) {
-		printk(KERN_ERR "iommu: missing iommu for %s (node %d)\n",
+		dev_err(dev, "iommu: missing iommu for %s (node %d)\n",
 		       of_node_full_name(dev->of_node), dev_to_node(dev));
 		return NULL;
 	}
diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index 8b1213993b10..f85db3a69b4a 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -715,7 +715,7 @@ static ssize_t spu_stat_show(struct device *dev,
 		spu->stats.libassist);
 }
 
-static DEVICE_ATTR(stat, 0644, spu_stat_show, NULL);
+static DEVICE_ATTR(stat, 0444, spu_stat_show, NULL);
 
 #ifdef CONFIG_KEXEC
 
diff --git a/arch/powerpc/platforms/embedded6xx/mpc10x.h b/arch/powerpc/platforms/embedded6xx/mpc10x.h
index b30a6a3b5bd2..b290b63661f1 100644
--- a/arch/powerpc/platforms/embedded6xx/mpc10x.h
+++ b/arch/powerpc/platforms/embedded6xx/mpc10x.h
@@ -81,17 +81,6 @@
 #define	MPC10X_MAPB_PCI_MEM_OFFSET	(MPC10X_MAPB_ISA_MEM_BASE -	\
 					 MPC10X_MAPB_PCI_MEM_START)
 
-/* Set hose members to values appropriate for the mem map used */
-#define	MPC10X_SETUP_HOSE(hose, map) {					\
-	(hose)->pci_mem_offset = MPC10X_MAP##map##_PCI_MEM_OFFSET;	\
-	(hose)->io_space.start = MPC10X_MAP##map##_PCI_IO_START;	\
-	(hose)->io_space.end = MPC10X_MAP##map##_PCI_IO_END;		\
-	(hose)->mem_space.start = MPC10X_MAP##map##_PCI_MEM_START;	\
-	(hose)->mem_space.end = MPC10X_MAP##map##_PCI_MEM_END;		\
-	(hose)->io_base_virt = (void *)MPC10X_MAP##map##_ISA_IO_BASE;	\
-}
-
-
 /* Miscellaneous Configuration register offsets */
 #define	MPC10X_CFG_PIR_REG		0x09
 #define	MPC10X_CFG_PIR_HOST_BRIDGE	0x00
diff --git a/arch/powerpc/platforms/powermac/pci.c b/arch/powerpc/platforms/powermac/pci.c
index 2b8af75abc23..cf7009b8c7b6 100644
--- a/arch/powerpc/platforms/powermac/pci.c
+++ b/arch/powerpc/platforms/powermac/pci.c
@@ -824,6 +824,7 @@ static void __init parse_region_decode(struct pci_controller *hose,
 			hose->mem_resources[cur].name = hose->dn->full_name;
 			hose->mem_resources[cur].start = base;
 			hose->mem_resources[cur].end = end;
+			hose->mem_offset[cur] = 0;
 			DBG("  %d: 0x%08lx-0x%08lx\n", cur, base, end);
 		} else {
 			DBG("   :           -0x%08lx\n", end);
@@ -866,7 +867,6 @@ static void __init setup_u3_ht(struct pci_controller* hose)
 	hose->io_resource.start = 0;
 	hose->io_resource.end = 0x003fffff;
 	hose->io_resource.flags = IORESOURCE_IO;
-	hose->pci_mem_offset = 0;
 	hose->first_busno = 0;
 	hose->last_busno = 0xef;
 
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index aaa0dba49471..628c564ceadb 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -15,6 +15,7 @@
 #include <linux/of.h>
 #include <linux/of_platform.h>
 #include <linux/interrupt.h>
+#include <linux/slab.h>
 #include <asm/opal.h>
 #include <asm/firmware.h>
 
@@ -28,13 +29,14 @@ struct opal {
 static struct device_node *opal_node;
 static DEFINE_SPINLOCK(opal_write_lock);
 extern u64 opal_mc_secondary_handler[];
+static unsigned int *opal_irqs;
+static unsigned int opal_irq_count;
 
 int __init early_init_dt_scan_opal(unsigned long node,
 				   const char *uname, int depth, void *data)
 {
 	const void *basep, *entryp;
 	unsigned long basesz, entrysz;
-	u64 glue;
 
 	if (depth != 1 || strcmp(uname, "ibm,opal") != 0)
 		return 0;
@@ -54,13 +56,27 @@ int __init early_init_dt_scan_opal(unsigned long node,
 		 opal.entry, entryp, entrysz);
 
 	powerpc_firmware_features |= FW_FEATURE_OPAL;
-	if (of_flat_dt_is_compatible(node, "ibm,opal-v2")) {
+	if (of_flat_dt_is_compatible(node, "ibm,opal-v3")) {
+		powerpc_firmware_features |= FW_FEATURE_OPALv2;
+		powerpc_firmware_features |= FW_FEATURE_OPALv3;
+		printk("OPAL V3 detected !\n");
+	} else if (of_flat_dt_is_compatible(node, "ibm,opal-v2")) {
 		powerpc_firmware_features |= FW_FEATURE_OPALv2;
 		printk("OPAL V2 detected !\n");
 	} else {
 		printk("OPAL V1 detected !\n");
 	}
 
+	return 1;
+}
+
+static int __init opal_register_exception_handlers(void)
+{
+	u64 glue;
+
+	if (!(powerpc_firmware_features & FW_FEATURE_OPAL))
+		return -ENODEV;
+
 	/* Hookup some exception handlers. We use the fwnmi area at 0x7000
 	 * to provide the glue space to OPAL
 	 */
@@ -74,9 +90,11 @@ int __init early_init_dt_scan_opal(unsigned long node,
 	glue += 128;
 	opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue);
 
-	return 1;
+	return 0;
 }
 
+early_initcall(opal_register_exception_handlers);
+
 int opal_get_chars(uint32_t vtermno, char *buf, int count)
 {
 	s64 len, rc;
@@ -133,6 +151,13 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
 				rc == OPAL_BUSY_EVENT || rc == OPAL_SUCCESS)) {
 		len = total_len;
 		rc = opal_console_write(vtermno, &len, data);
+
+		/* Closed or other error drop */
+		if (rc != OPAL_SUCCESS && rc != OPAL_BUSY &&
+		    rc != OPAL_BUSY_EVENT) {
+			written = total_len;
+			break;
+		}
 		if (rc == OPAL_SUCCESS) {
 			total_len -= len;
 			data += len;
@@ -305,6 +330,8 @@ static int __init opal_init(void)
 	irqs = of_get_property(opal_node, "opal-interrupts", &irqlen);
 	pr_debug("opal: Found %d interrupts reserved for OPAL\n",
 		 irqs ? (irqlen / 4) : 0);
+	opal_irq_count = irqlen / 4;
+	opal_irqs = kzalloc(opal_irq_count * sizeof(unsigned int), GFP_KERNEL);
 	for (i = 0; irqs && i < (irqlen / 4); i++, irqs++) {
 		unsigned int hwirq = be32_to_cpup(irqs);
 		unsigned int irq = irq_create_mapping(NULL, hwirq);
@@ -316,7 +343,19 @@ static int __init opal_init(void)
 		if (rc)
 			pr_warning("opal: Error %d requesting irq %d"
 				   " (0x%x)\n", rc, irq, hwirq);
+		opal_irqs[i] = irq;
 	}
 	return 0;
 }
 subsys_initcall(opal_init);
+
+void opal_shutdown(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < opal_irq_count; i++) {
+		if (opal_irqs[i])
+			free_irq(opal_irqs[i], 0);
+		opal_irqs[i] = 0;
+	}
+}
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 8c6c9cf91c13..3937aaae5bc4 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -915,11 +915,14 @@ static void pnv_ioda_setup_pe_seg(struct pci_controller *hose,
 				index++;
 			}
 		} else if (res->flags & IORESOURCE_MEM) {
+			/* WARNING: Assumes M32 is mem region 0 in PHB. We need to
+			 * harden that algorithm when we start supporting M64
+			 */
 			region.start = res->start -
-				       hose->pci_mem_offset -
+				       hose->mem_offset[0] -
 				       phb->ioda.m32_pci_base;
 			region.end   = res->end -
-				       hose->pci_mem_offset -
+				       hose->mem_offset[0] -
 				       phb->ioda.m32_pci_base;
 			index = region.start / phb->ioda.m32_segsize;
 
@@ -1045,6 +1048,12 @@ static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus,
 	return phb->ioda.pe_rmap[(bus->number << 8) | devfn];
 }
 
+static void pnv_pci_ioda_shutdown(struct pnv_phb *phb)
+{
+	opal_pci_reset(phb->opal_id, OPAL_PCI_IODA_TABLE_RESET,
+		       OPAL_ASSERT_RESET);
+}
+
 void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type)
 {
 	struct pci_controller *hose;
@@ -1089,7 +1098,7 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type)
 	/* Detect specific models for error handling */
 	if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
 		phb->model = PNV_PHB_MODEL_P7IOC;
-	else if (of_device_is_compatible(np, "ibm,p8-pciex"))
+	else if (of_device_is_compatible(np, "ibm,power8-pciex"))
 		phb->model = PNV_PHB_MODEL_PHB3;
 	else
 		phb->model = PNV_PHB_MODEL_UNKNOWN;
@@ -1115,8 +1124,7 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type)
 	phb->ioda.m32_size += 0x10000;
 
 	phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe;
-	phb->ioda.m32_pci_base = hose->mem_resources[0].start -
-		hose->pci_mem_offset;
+	phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0];
 	phb->ioda.io_size = hose->pci_io_size;
 	phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe;
 	phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
@@ -1176,6 +1184,9 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type)
 	/* Setup TCEs */
 	phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
 
+	/* Setup shutdown function for kexec */
+	phb->shutdown = pnv_pci_ioda_shutdown;
+
 	/* Setup MSI support */
 	pnv_pci_init_ioda_msis(phb);
 
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 55dfca844ddf..163bd7422f1c 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -450,6 +450,18 @@ static void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
 		pnv_pci_dma_fallback_setup(hose, pdev);
 }
 
+void pnv_pci_shutdown(void)
+{
+	struct pci_controller *hose;
+
+	list_for_each_entry(hose, &hose_list, list_node) {
+		struct pnv_phb *phb = hose->private_data;
+
+		if (phb && phb->shutdown)
+			phb->shutdown(phb);
+	}
+}
+
 /* Fixup wrong class code in p7ioc and p8 root complex */
 static void pnv_p7ioc_rc_quirk(struct pci_dev *dev)
 {
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 48dc4bb856a1..25d76c4df50b 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -86,6 +86,7 @@ struct pnv_phb {
 	void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev);
 	void (*fixup_phb)(struct pci_controller *hose);
 	u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn);
+	void (*shutdown)(struct pnv_phb *phb);
 
 	union {
 		struct {
@@ -158,4 +159,5 @@ extern void pnv_pci_init_ioda_hub(struct device_node *np);
 extern void pnv_pci_init_ioda2_phb(struct device_node *np);
 extern void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
 					u64 *startp, u64 *endp);
+
 #endif /* __POWERNV_PCI_H */
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index 8a9df7f9667e..a1c6f83fc391 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -9,8 +9,10 @@ static inline void pnv_smp_init(void) { }
 
 #ifdef CONFIG_PCI
 extern void pnv_pci_init(void);
+extern void pnv_pci_shutdown(void);
 #else
 static inline void pnv_pci_init(void) { }
+static inline void pnv_pci_shutdown(void) { }
 #endif
 
 #endif /* _POWERNV_H */
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index db1ad1c8f68f..d4459bfc92f7 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -78,7 +78,9 @@ static void pnv_show_cpuinfo(struct seq_file *m)
 	if (root)
 		model = of_get_property(root, "model", NULL);
 	seq_printf(m, "machine\t\t: PowerNV %s\n", model);
-	if (firmware_has_feature(FW_FEATURE_OPALv2))
+	if (firmware_has_feature(FW_FEATURE_OPALv3))
+		seq_printf(m, "firmware\t: OPAL v3\n");
+	else if (firmware_has_feature(FW_FEATURE_OPALv2))
 		seq_printf(m, "firmware\t: OPAL v2\n");
 	else if (firmware_has_feature(FW_FEATURE_OPAL))
 		seq_printf(m, "firmware\t: OPAL v1\n");
@@ -126,6 +128,17 @@ static void pnv_progress(char *s, unsigned short hex)
 {
 }
 
+static void pnv_shutdown(void)
+{
+	/* Let the PCI code clear up IODA tables */
+	pnv_pci_shutdown();
+
+	/* And unregister all OPAL interrupts so they don't fire
+	 * up while we kexec
+	 */
+	opal_shutdown();
+}
+
 #ifdef CONFIG_KEXEC
 static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
 {
@@ -187,6 +200,7 @@ define_machine(powernv) {
 	.init_IRQ		= pnv_init_IRQ,
 	.show_cpuinfo		= pnv_show_cpuinfo,
 	.progress		= pnv_progress,
+	.machine_shutdown	= pnv_shutdown,
 	.power_save             = power7_idle,
 	.calibrate_decr		= generic_calibrate_decr,
 #ifdef CONFIG_KEXEC
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 0bdc735db16f..88c9459c3e07 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -71,16 +71,68 @@ int pnv_smp_kick_cpu(int nr)
 
 	BUG_ON(nr < 0 || nr >= NR_CPUS);
 
-	/* On OPAL v2 the CPU are still spinning inside OPAL itself,
-	 * get them back now
+	/*
+	 * If we already started or OPALv2 is not supported, we just
+	 * kick the CPU via the PACA
 	 */
-	if (!paca[nr].cpu_start && firmware_has_feature(FW_FEATURE_OPALv2)) {
-		pr_devel("OPAL: Starting CPU %d (HW 0x%x)...\n", nr, pcpu);
-		rc = opal_start_cpu(pcpu, start_here);
-		if (rc != OPAL_SUCCESS)
-			pr_warn("OPAL Error %ld starting CPU %d\n",
+	if (paca[nr].cpu_start || !firmware_has_feature(FW_FEATURE_OPALv2))
+		goto kick;
+
+	/*
+	 * At this point, the CPU can either be spinning on the way in
+	 * from kexec or be inside OPAL waiting to be started for the
+	 * first time. OPAL v3 allows us to query OPAL to know if it
+	 * has the CPUs, so we do that
+	 */
+	if (firmware_has_feature(FW_FEATURE_OPALv3)) {
+		uint8_t status;
+
+		rc = opal_query_cpu_status(pcpu, &status);
+		if (rc != OPAL_SUCCESS) {
+			pr_warn("OPAL Error %ld querying CPU %d state\n",
 				rc, nr);
+			return -ENODEV;
+		}
+
+		/*
+		 * Already started, just kick it, probably coming from
+		 * kexec and spinning
+		 */
+		if (status == OPAL_THREAD_STARTED)
+			goto kick;
+
+		/*
+		 * Available/inactive, let's kick it
+		 */
+		if (status == OPAL_THREAD_INACTIVE) {
+			pr_devel("OPAL: Starting CPU %d (HW 0x%x)...\n",
+				 nr, pcpu);
+			rc = opal_start_cpu(pcpu, start_here);
+			if (rc != OPAL_SUCCESS) {
+				pr_warn("OPAL Error %ld starting CPU %d\n",
+					rc, nr);
+				return -ENODEV;
+			}
+		} else {
+			/*
+			 * An unavailable CPU (or any other unknown status)
+			 * shouldn't be started. It should also
+			 * not be in the possible map but currently it can
+			 * happen
+			 */
+			pr_devel("OPAL: CPU %d (HW 0x%x) is unavailable"
+				 " (status %d)...\n", nr, pcpu, status);
+			return -ENODEV;
+		}
+	} else {
+		/*
+		 * On OPAL v2, we just kick it and hope for the best,
+		 * we must not test the error from opal_start_cpu() or
+		 * we would fail to get CPUs from kexec.
+		 */
+		opal_start_cpu(pcpu, start_here);
 	}
+ kick:
 	return smp_generic_kick_cpu(nr);
 }
 
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 9a0941bc4d31..023b288f895b 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -18,6 +18,7 @@ config PPC_PSERIES
 	select PPC_PCI_CHOICE if EXPERT
 	select ZLIB_DEFLATE
 	select PPC_DOORBELL
+	select HAVE_CONTEXT_TRACKING
 	default y
 
 config PPC_SPLPAR
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index e5b084723131..420524e6f8c9 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -24,6 +24,7 @@ static int query_token, change_token;
 #define RTAS_RESET_FN		2
 #define RTAS_CHANGE_MSI_FN	3
 #define RTAS_CHANGE_MSIX_FN	4
+#define RTAS_CHANGE_32MSI_FN	5
 
 static struct pci_dn *get_pdn(struct pci_dev *pdev)
 {
@@ -58,7 +59,8 @@ static int rtas_change_msi(struct pci_dn *pdn, u32 func, u32 num_irqs)
 
 	seq_num = 1;
 	do {
-		if (func == RTAS_CHANGE_MSI_FN || func == RTAS_CHANGE_MSIX_FN)
+		if (func == RTAS_CHANGE_MSI_FN || func == RTAS_CHANGE_MSIX_FN ||
+		    func == RTAS_CHANGE_32MSI_FN)
 			rc = rtas_call(change_token, 6, 4, rtas_ret, addr,
 					BUID_HI(buid), BUID_LO(buid),
 					func, num_irqs, seq_num);
@@ -426,9 +428,12 @@ static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type)
 	 */
 again:
 	if (type == PCI_CAP_ID_MSI) {
-		rc = rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, nvec);
+		if (pdn->force_32bit_msi)
+			rc = rtas_change_msi(pdn, RTAS_CHANGE_32MSI_FN, nvec);
+		else
+			rc = rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, nvec);
 
-		if (rc < 0) {
+		if (rc < 0 && !pdn->force_32bit_msi) {
 			pr_debug("rtas_msi: trying the old firmware call.\n");
 			rc = rtas_change_msi(pdn, RTAS_CHANGE_FN, nvec);
 		}
@@ -512,3 +517,13 @@ static int rtas_msi_init(void)
 	return 0;
 }
 arch_initcall(rtas_msi_init);
+
+static void quirk_radeon(struct pci_dev *dev)
+{
+	struct pci_dn *pdn = get_pdn(dev);
+
+	if (pdn)
+		pdn->force_32bit_msi = 1;
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x68f2, quirk_radeon);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0xaa68, quirk_radeon);
diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
index 0b580f413a9a..5f93856cdf47 100644
--- a/arch/powerpc/platforms/pseries/pci.c
+++ b/arch/powerpc/platforms/pseries/pci.c
@@ -108,3 +108,56 @@ static void fixup_winbond_82c105(struct pci_dev* dev)
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_WINBOND, PCI_DEVICE_ID_WINBOND_82C105,
 			 fixup_winbond_82c105);
+
+int pseries_root_bridge_prepare(struct pci_host_bridge *bridge)
+{
+	struct device_node *dn, *pdn;
+	struct pci_bus *bus;
+	const uint32_t *pcie_link_speed_stats;
+
+	bus = bridge->bus;
+
+	dn = pcibios_get_phb_of_node(bus);
+	if (!dn)
+		return 0;
+
+	for (pdn = dn; pdn != NULL; pdn = of_get_next_parent(pdn)) {
+		pcie_link_speed_stats = (const uint32_t *) of_get_property(pdn,
+			"ibm,pcie-link-speed-stats", NULL);
+		if (pcie_link_speed_stats)
+			break;
+	}
+
+	of_node_put(pdn);
+
+	if (!pcie_link_speed_stats) {
+		pr_err("no ibm,pcie-link-speed-stats property\n");
+		return 0;
+	}
+
+	switch (pcie_link_speed_stats[0]) {
+	case 0x01:
+		bus->max_bus_speed = PCIE_SPEED_2_5GT;
+		break;
+	case 0x02:
+		bus->max_bus_speed = PCIE_SPEED_5_0GT;
+		break;
+	default:
+		bus->max_bus_speed = PCI_SPEED_UNKNOWN;
+		break;
+	}
+
+	switch (pcie_link_speed_stats[1]) {
+	case 0x01:
+		bus->cur_bus_speed = PCIE_SPEED_2_5GT;
+		break;
+	case 0x02:
+		bus->cur_bus_speed = PCIE_SPEED_5_0GT;
+		break;
+	default:
+		bus->cur_bus_speed = PCI_SPEED_UNKNOWN;
+		break;
+	}
+
+	return 0;
+}
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index 8af71e4cc17f..c2a3a258001c 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -63,4 +63,8 @@ extern int dlpar_detach_node(struct device_node *);
 /* Snooze Delay, pseries_idle */
 DECLARE_PER_CPU(long, smt_snooze_delay);
 
+/* PCI root bridge prepare function override for pseries */
+struct pci_host_bridge;
+int pseries_root_bridge_prepare(struct pci_host_bridge *bridge);
+
 #endif /* _PSERIES_PSERIES_H */
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index ac932a9eb440..c11c8238797c 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -466,6 +466,8 @@ static void __init pSeries_setup_arch(void)
 	else
 		ppc_md.enable_pmcs = power4_enable_pmcs;
 
+	ppc_md.pcibios_root_bridge_prepare = pseries_root_bridge_prepare;
+
 	if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
 		long rc;
 		if ((rc = pSeries_enable_reloc_on_exc()) != H_SUCCESS) {
diff --git a/arch/powerpc/platforms/pseries/suspend.c b/arch/powerpc/platforms/pseries/suspend.c
index 47226e04126d..5f997e79d570 100644
--- a/arch/powerpc/platforms/pseries/suspend.c
+++ b/arch/powerpc/platforms/pseries/suspend.c
@@ -16,6 +16,7 @@
   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
   */
 
+#include <linux/cpu.h>
 #include <linux/delay.h>
 #include <linux/suspend.h>
 #include <linux/stat.h>
@@ -126,11 +127,15 @@ static ssize_t store_hibernate(struct device *dev,
 			       struct device_attribute *attr,
 			       const char *buf, size_t count)
 {
+	cpumask_var_t offline_mask;
 	int rc;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	if (!alloc_cpumask_var(&offline_mask, GFP_TEMPORARY))
+		return -ENOMEM;
+
 	stream_id = simple_strtoul(buf, NULL, 16);
 
 	do {
@@ -140,15 +145,32 @@ static ssize_t store_hibernate(struct device *dev,
 	} while (rc == -EAGAIN);
 
 	if (!rc) {
+		/* All present CPUs must be online */
+		cpumask_andnot(offline_mask, cpu_present_mask,
+				cpu_online_mask);
+		rc = rtas_online_cpus_mask(offline_mask);
+		if (rc) {
+			pr_err("%s: Could not bring present CPUs online.\n",
+					__func__);
+			goto out;
+		}
+
 		stop_topology_update();
 		rc = pm_suspend(PM_SUSPEND_MEM);
 		start_topology_update();
+
+		/* Take down CPUs not online prior to suspend */
+		if (!rtas_offline_cpus_mask(offline_mask))
+			pr_warn("%s: Could not restore CPUs to offline "
+					"state.\n", __func__);
 	}
 
 	stream_id = 0;
 
 	if (!rc)
 		rc = count;
+out:
+	free_cpumask_var(offline_mask);
 	return rc;
 }
 
diff --git a/arch/powerpc/platforms/wsp/ics.c b/arch/powerpc/platforms/wsp/ics.c
index 97fe82ee8633..2d3b1dd9571d 100644
--- a/arch/powerpc/platforms/wsp/ics.c
+++ b/arch/powerpc/platforms/wsp/ics.c
@@ -361,7 +361,7 @@ static int wsp_chip_set_affinity(struct irq_data *d,
 	xive = xive_set_server(xive, get_irq_server(ics, hw_irq));
 	wsp_ics_set_xive(ics, hw_irq, xive);
 
-	return 0;
+	return IRQ_SET_MASK_OK;
 }
 
 static struct irq_chip wsp_irq_chip = {
diff --git a/arch/powerpc/platforms/wsp/wsp_pci.c b/arch/powerpc/platforms/wsp/wsp_pci.c
index 8e22f561d171..62cb527493e7 100644
--- a/arch/powerpc/platforms/wsp/wsp_pci.c
+++ b/arch/powerpc/platforms/wsp/wsp_pci.c
@@ -502,7 +502,7 @@ static void __init wsp_pcie_configure_hw(struct pci_controller *hose)
 		 (~(hose->mem_resources[0].end -
 		    hose->mem_resources[0].start)) & 0x3ffffff0000ul);
 	out_be64(hose->cfg_data + PCIE_REG_M32A_START_ADDR,
-		 (hose->mem_resources[0].start - hose->pci_mem_offset) | 1);
+		 (hose->mem_resources[0].start - hose->mem_offset[0]) | 1);
 
 	/* Clear all TVT entries
 	 *
diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index b0a518e97599..99464a7bdb3b 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -64,6 +64,8 @@ endif
 
 obj-$(CONFIG_PPC_SCOM)		+= scom.o
 
+obj-$(CONFIG_PPC_EARLY_DEBUG_MEMCONS)	+= udbg_memcons.o
+
 subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 
 obj-$(CONFIG_PPC_XICS)		+= xics/
diff --git a/arch/powerpc/sysdev/ehv_pic.c b/arch/powerpc/sysdev/ehv_pic.c
index 6e0e1005227f..9cd0e60716fe 100644
--- a/arch/powerpc/sysdev/ehv_pic.c
+++ b/arch/powerpc/sysdev/ehv_pic.c
@@ -81,7 +81,7 @@ int ehv_pic_set_affinity(struct irq_data *d, const struct cpumask *dest,
 	ev_int_set_config(src, config, prio, cpuid);
 	spin_unlock_irqrestore(&ehv_pic_lock, flags);
 
-	return 0;
+	return IRQ_SET_MASK_OK;
 }
 
 static unsigned int ehv_pic_type_to_vecpri(unsigned int type)
diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index cffe7edac858..028ac1f71b51 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -178,7 +178,7 @@ static void setup_pci_atmu(struct pci_controller *hose)
 	struct ccsr_pci __iomem *pci = hose->private_data;
 	int i, j, n, mem_log, win_idx = 3, start_idx = 1, end_idx = 4;
 	u64 mem, sz, paddr_hi = 0;
-	u64 paddr_lo = ULLONG_MAX;
+	u64 offset = 0, paddr_lo = ULLONG_MAX;
 	u32 pcicsrbar = 0, pcicsrbar_sz;
 	u32 piwar = PIWAR_EN | PIWAR_PF | PIWAR_TGI_LOCAL |
 			PIWAR_READ_SNOOP | PIWAR_WRITE_SNOOP;
@@ -208,8 +208,9 @@ static void setup_pci_atmu(struct pci_controller *hose)
 		paddr_lo = min(paddr_lo, (u64)hose->mem_resources[i].start);
 		paddr_hi = max(paddr_hi, (u64)hose->mem_resources[i].end);
 
-		n = setup_one_atmu(pci, j, &hose->mem_resources[i],
-				   hose->pci_mem_offset);
+		/* We assume all memory resources have the same offset */
+		offset = hose->mem_offset[i];
+		n = setup_one_atmu(pci, j, &hose->mem_resources[i], offset);
 
 		if (n < 0 || j >= 5) {
 			pr_err("Ran out of outbound PCI ATMUs for resource %d!\n", i);
@@ -239,8 +240,8 @@ static void setup_pci_atmu(struct pci_controller *hose)
 	}
 
 	/* convert to pci address space */
-	paddr_hi -= hose->pci_mem_offset;
-	paddr_lo -= hose->pci_mem_offset;
+	paddr_hi -= offset;
+	paddr_lo -= offset;
 
 	if (paddr_hi == paddr_lo) {
 		pr_err("%s: No outbound window space\n", name);
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index d30e6a676c89..0a13ecb270c7 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -836,7 +836,7 @@ int mpic_set_affinity(struct irq_data *d, const struct cpumask *cpumask,
 			       mpic_physmask(mask));
 	}
 
-	return 0;
+	return IRQ_SET_MASK_OK;
 }
 
 static unsigned int mpic_type_to_vecpri(struct mpic *mpic, unsigned int type)
@@ -1001,8 +1001,12 @@ static int mpic_host_map(struct irq_domain *h, unsigned int virq,
 
 	if (hw == mpic->spurious_vec)
 		return -EINVAL;
-	if (mpic->protected && test_bit(hw, mpic->protected))
-		return -EINVAL;
+	if (mpic->protected && test_bit(hw, mpic->protected)) {
+		pr_warning("mpic: Mapping of source 0x%x failed, "
+			   "source protected by firmware !\n",\
+			   (unsigned int)hw);
+		return -EPERM;
+	}
 
 #ifdef CONFIG_SMP
 	else if (hw >= mpic->ipi_vecs[0]) {
@@ -1029,8 +1033,12 @@ static int mpic_host_map(struct irq_domain *h, unsigned int virq,
 	if (mpic_map_error_int(mpic, virq, hw))
 		return 0;
 
-	if (hw >= mpic->num_sources)
+	if (hw >= mpic->num_sources) {
+		pr_warning("mpic: Mapping of source 0x%x failed, "
+			   "source out of range !\n",\
+			   (unsigned int)hw);
 		return -EINVAL;
+	}
 
 	mpic_msi_reserve_hwirq(mpic, hw);
 
diff --git a/arch/powerpc/sysdev/ppc4xx_pci.c b/arch/powerpc/sysdev/ppc4xx_pci.c
index 56e8b3c3c890..64603a10b863 100644
--- a/arch/powerpc/sysdev/ppc4xx_pci.c
+++ b/arch/powerpc/sysdev/ppc4xx_pci.c
@@ -257,6 +257,7 @@ static void __init ppc4xx_configure_pci_PMMs(struct pci_controller *hose,
 	/* Setup outbound memory windows */
 	for (i = j = 0; i < 3; i++) {
 		struct resource *res = &hose->mem_resources[i];
+		resource_size_t offset = hose->mem_offset[i];
 
 		/* we only care about memory windows */
 		if (!(res->flags & IORESOURCE_MEM))
@@ -270,7 +271,7 @@ static void __init ppc4xx_configure_pci_PMMs(struct pci_controller *hose,
 		/* Configure the resource */
 		if (ppc4xx_setup_one_pci_PMM(hose, reg,
 					     res->start,
-					     res->start - hose->pci_mem_offset,
+					     res->start - offset,
 					     resource_size(res),
 					     res->flags,
 					     j) == 0) {
@@ -279,7 +280,7 @@ static void __init ppc4xx_configure_pci_PMMs(struct pci_controller *hose,
 			/* If the resource PCI address is 0 then we have our
 			 * ISA memory hole
 			 */
-			if (res->start == hose->pci_mem_offset)
+			if (res->start == offset)
 				found_isa_hole = 1;
 		}
 	}
@@ -457,6 +458,7 @@ static void __init ppc4xx_configure_pcix_POMs(struct pci_controller *hose,
 	/* Setup outbound memory windows */
 	for (i = j = 0; i < 3; i++) {
 		struct resource *res = &hose->mem_resources[i];
+		resource_size_t offset = hose->mem_offset[i];
 
 		/* we only care about memory windows */
 		if (!(res->flags & IORESOURCE_MEM))
@@ -470,7 +472,7 @@ static void __init ppc4xx_configure_pcix_POMs(struct pci_controller *hose,
 		/* Configure the resource */
 		if (ppc4xx_setup_one_pcix_POM(hose, reg,
 					      res->start,
-					      res->start - hose->pci_mem_offset,
+					      res->start - offset,
 					      resource_size(res),
 					      res->flags,
 					      j) == 0) {
@@ -479,7 +481,7 @@ static void __init ppc4xx_configure_pcix_POMs(struct pci_controller *hose,
 			/* If the resource PCI address is 0 then we have our
 			 * ISA memory hole
 			 */
-			if (res->start == hose->pci_mem_offset)
+			if (res->start == offset)
 				found_isa_hole = 1;
 		}
 	}
@@ -1792,6 +1794,7 @@ static void __init ppc4xx_configure_pciex_POMs(struct ppc4xx_pciex_port *port,
 	/* Setup outbound memory windows */
 	for (i = j = 0; i < 3; i++) {
 		struct resource *res = &hose->mem_resources[i];
+		resource_size_t offset = hose->mem_offset[i];
 
 		/* we only care about memory windows */
 		if (!(res->flags & IORESOURCE_MEM))
@@ -1805,7 +1808,7 @@ static void __init ppc4xx_configure_pciex_POMs(struct ppc4xx_pciex_port *port,
 		/* Configure the resource */
 		if (ppc4xx_setup_one_pciex_POM(port, hose, mbase,
 					       res->start,
-					       res->start - hose->pci_mem_offset,
+					       res->start - offset,
 					       resource_size(res),
 					       res->flags,
 					       j) == 0) {
@@ -1814,7 +1817,7 @@ static void __init ppc4xx_configure_pciex_POMs(struct ppc4xx_pciex_port *port,
 			/* If the resource PCI address is 0 then we have our
 			 * ISA memory hole
 			 */
-			if (res->start == hose->pci_mem_offset)
+			if (res->start == offset)
 				found_isa_hole = 1;
 		}
 	}
diff --git a/arch/powerpc/sysdev/udbg_memcons.c b/arch/powerpc/sysdev/udbg_memcons.c
new file mode 100644
index 000000000000..ce5a7b489e4b
--- /dev/null
+++ b/arch/powerpc/sysdev/udbg_memcons.c
@@ -0,0 +1,105 @@
+/*
+ * A udbg backend which logs messages and reads input from in memory
+ * buffers.
+ *
+ * The console output can be read from memcons_output which is a
+ * circular buffer whose next write position is stored in memcons.output_pos.
+ *
+ * Input may be passed by writing into the memcons_input buffer when it is
+ * empty. The input buffer is empty when both input_pos == input_start and
+ * *input_start == '\0'.
+ *
+ * Copyright (C) 2003-2005 Anton Blanchard and Milton Miller, IBM Corp
+ * Copyright (C) 2013 Alistair Popple, IBM Corp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <asm/barrier.h>
+#include <asm/page.h>
+#include <asm/processor.h>
+#include <asm/udbg.h>
+
+struct memcons {
+	char *output_start;
+	char *output_pos;
+	char *output_end;
+	char *input_start;
+	char *input_pos;
+	char *input_end;
+};
+
+static char memcons_output[CONFIG_PPC_MEMCONS_OUTPUT_SIZE];
+static char memcons_input[CONFIG_PPC_MEMCONS_INPUT_SIZE];
+
+struct memcons memcons = {
+	.output_start = memcons_output,
+	.output_pos = memcons_output,
+	.output_end = &memcons_output[CONFIG_PPC_MEMCONS_OUTPUT_SIZE],
+	.input_start = memcons_input,
+	.input_pos = memcons_input,
+	.input_end = &memcons_input[CONFIG_PPC_MEMCONS_INPUT_SIZE],
+};
+
+void memcons_putc(char c)
+{
+	char *new_output_pos;
+
+	*memcons.output_pos = c;
+	wmb();
+	new_output_pos = memcons.output_pos + 1;
+	if (new_output_pos >= memcons.output_end)
+		new_output_pos = memcons.output_start;
+
+	memcons.output_pos = new_output_pos;
+}
+
+int memcons_getc_poll(void)
+{
+	char c;
+	char *new_input_pos;
+
+	if (*memcons.input_pos) {
+		c = *memcons.input_pos;
+
+		new_input_pos = memcons.input_pos + 1;
+		if (new_input_pos >= memcons.input_end)
+			new_input_pos = memcons.input_start;
+		else if (*new_input_pos == '\0')
+			new_input_pos = memcons.input_start;
+
+		*memcons.input_pos = '\0';
+		wmb();
+		memcons.input_pos = new_input_pos;
+		return c;
+	}
+
+	return -1;
+}
+
+int memcons_getc(void)
+{
+	int c;
+
+	while (1) {
+		c = memcons_getc_poll();
+		if (c == -1)
+			cpu_relax();
+		else
+			break;
+	}
+
+	return c;
+}
+
+void udbg_init_memcons(void)
+{
+	udbg_putc = memcons_putc;
+	udbg_getc = memcons_getc;
+	udbg_getc_poll = memcons_getc_poll;
+}
diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c
index 89db29d17c25..7cd728b3b5e4 100644
--- a/arch/powerpc/sysdev/xics/icp-native.c
+++ b/arch/powerpc/sysdev/xics/icp-native.c
@@ -51,6 +51,12 @@ static struct icp_ipl __iomem *icp_native_regs[NR_CPUS];
 static inline unsigned int icp_native_get_xirr(void)
 {
 	int cpu = smp_processor_id();
+	unsigned int xirr;
+
+	/* Handled an interrupt latched by KVM */
+	xirr = kvmppc_get_xics_latch();
+	if (xirr)
+		return xirr;
 
 	return in_be32(&icp_native_regs[cpu]->xirr.word);
 }
@@ -138,6 +144,7 @@ static unsigned int icp_native_get_irq(void)
 
 static void icp_native_cause_ipi(int cpu, unsigned long data)
 {
+	kvmppc_set_host_ipi(cpu, 1);
 	icp_native_set_qirr(cpu, IPI_PRIORITY);
 }
 
@@ -151,6 +158,7 @@ static irqreturn_t icp_native_ipi_action(int irq, void *dev_id)
 {
 	int cpu = smp_processor_id();
 
+	kvmppc_set_host_ipi(cpu, 0);
 	icp_native_set_qirr(cpu, 0xff);
 
 	return smp_ipi_demux();
diff --git a/arch/powerpc/sysdev/xics/ics-opal.c b/arch/powerpc/sysdev/xics/ics-opal.c
index f7e8609df0d5..39d72212655e 100644
--- a/arch/powerpc/sysdev/xics/ics-opal.c
+++ b/arch/powerpc/sysdev/xics/ics-opal.c
@@ -148,7 +148,7 @@ static int ics_opal_set_affinity(struct irq_data *d,
 		       __func__, d->irq, hw_irq, server, rc);
 		return -1;
 	}
-	return 0;
+	return IRQ_SET_MASK_OK;
 }
 
 static struct irq_chip ics_opal_irq_chip = {