Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 3
-rw-r--r--  arch/x86/boot/compressed/eboot.c | 40
-rw-r--r--  arch/x86/boot/compressed/head_32.S | 6
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 8
-rw-r--r--  arch/x86/entry/entry_64.S | 4
-rw-r--r--  arch/x86/events/core.c | 24
-rw-r--r--  arch/x86/events/intel/core.c | 31
-rw-r--r--  arch/x86/events/intel/ds.c | 108
-rw-r--r--  arch/x86/events/intel/lbr.c | 70
-rw-r--r--  arch/x86/events/intel/pt.c | 24
-rw-r--r--  arch/x86/events/intel/pt.h | 5
-rw-r--r--  arch/x86/events/intel/rapl.c | 4
-rw-r--r--  arch/x86/events/intel/uncore.c | 11
-rw-r--r--  arch/x86/events/intel/uncore.h | 7
-rw-r--r--  arch/x86/events/intel/uncore_snb.c | 2
-rw-r--r--  arch/x86/events/intel/uncore_snbep.c | 626
-rw-r--r--  arch/x86/events/perf_event.h | 13
-rw-r--r--  arch/x86/include/asm/apic.h | 5
-rw-r--r--  arch/x86/include/asm/cmpxchg.h | 44
-rw-r--r--  arch/x86/include/asm/cpufeatures.h | 1
-rw-r--r--  arch/x86/include/asm/efi.h | 29
-rw-r--r--  arch/x86/include/asm/hypervisor.h | 4
-rw-r--r--  arch/x86/include/asm/mce.h | 66
-rw-r--r--  arch/x86/include/asm/mpspec.h | 3
-rw-r--r--  arch/x86/include/asm/paravirt.h | 18
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 7
-rw-r--r--  arch/x86/include/asm/pmem.h | 5
-rw-r--r--  arch/x86/include/asm/rwsem.h | 2
-rw-r--r--  arch/x86/include/asm/spinlock.h | 174
-rw-r--r--  arch/x86/include/asm/spinlock_types.h | 13
-rw-r--r--  arch/x86/include/asm/string_64.h | 19
-rw-r--r--  arch/x86/include/uapi/asm/mce.h | 2
-rw-r--r--  arch/x86/kernel/acpi/Makefile | 1
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 35
-rw-r--r--  arch/x86/kernel/acpi/cppc_msr.c | 58
-rw-r--r--  arch/x86/kernel/apic/apic.c | 97
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 4
-rw-r--r--  arch/x86/kernel/apic/probe_32.c | 2
-rw-r--r--  arch/x86/kernel/cpu/common.c | 23
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c | 11
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 44
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 204
-rw-r--r--  arch/x86/kernel/kvm.c | 245
-rw-r--r--  arch/x86/kernel/mpparse.c | 3
-rw-r--r--  arch/x86/kernel/paravirt-spinlocks.c | 7
-rw-r--r--  arch/x86/kernel/paravirt_patch_32.c | 4
-rw-r--r--  arch/x86/kernel/paravirt_patch_64.c | 4
-rw-r--r--  arch/x86/kernel/quirks.c | 31
-rw-r--r--  arch/x86/kernel/setup.c | 21
-rw-r--r--  arch/x86/kernel/smpboot.c | 59
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 2
-rw-r--r--  arch/x86/lib/memcpy_64.S | 6
-rw-r--r--  arch/x86/mm/amdtopology.c | 22
-rw-r--r--  arch/x86/mm/numa.c | 27
-rw-r--r--  arch/x86/platform/efi/efi-bgrt.c | 13
-rw-r--r--  arch/x86/platform/efi/efi.c | 198
-rw-r--r--  arch/x86/platform/efi/efi_64.c | 23
-rw-r--r--  arch/x86/platform/efi/quirks.c | 127
-rw-r--r--  arch/x86/ras/mce_amd_inj.c | 54
-rw-r--r--  arch/x86/xen/enlighten.c | 40
-rw-r--r--  arch/x86/xen/spinlock.c | 250
61 files changed, 1673 insertions, 1320 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2a83bc8b24c6..ac5944fa6da2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -706,7 +706,6 @@ config PARAVIRT_DEBUG
config PARAVIRT_SPINLOCKS
bool "Paravirtualization layer for spinlocks"
depends on PARAVIRT && SMP
- select UNINLINE_SPIN_UNLOCK if !QUEUED_SPINLOCKS
---help---
Paravirtualized spinlocks allow a pvops backend to replace the
spinlock implementation with something virtualization-friendly
@@ -719,7 +718,7 @@ config PARAVIRT_SPINLOCKS
config QUEUED_LOCK_STAT
bool "Paravirt queued spinlock statistics"
- depends on PARAVIRT_SPINLOCKS && DEBUG_FS && QUEUED_SPINLOCKS
+ depends on PARAVIRT_SPINLOCKS && DEBUG_FS
---help---
Enable the collection of statistical data on the slowpath
behavior of paravirtualized queued spinlocks and report
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 94dd4a31f5b3..cc69e37548db 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -29,22 +29,11 @@ __pure const struct efi_config *__efi_early(void)
static void setup_boot_services##bits(struct efi_config *c) \
{ \
efi_system_table_##bits##_t *table; \
- efi_boot_services_##bits##_t *bt; \
\
table = (typeof(table))sys_table; \
\
+ c->boot_services = table->boottime; \
c->text_output = table->con_out; \
- \
- bt = (typeof(bt))(unsigned long)(table->boottime); \
- \
- c->allocate_pool = bt->allocate_pool; \
- c->allocate_pages = bt->allocate_pages; \
- c->get_memory_map = bt->get_memory_map; \
- c->free_pool = bt->free_pool; \
- c->free_pages = bt->free_pages; \
- c->locate_handle = bt->locate_handle; \
- c->handle_protocol = bt->handle_protocol; \
- c->exit_boot_services = bt->exit_boot_services; \
}
BOOT_SERVICES(32);
BOOT_SERVICES(64);
@@ -286,29 +275,6 @@ void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
}
}
-static void find_bits(unsigned long mask, u8 *pos, u8 *size)
-{
- u8 first, len;
-
- first = 0;
- len = 0;
-
- if (mask) {
- while (!(mask & 0x1)) {
- mask = mask >> 1;
- first++;
- }
-
- while (mask & 0x1) {
- mask = mask >> 1;
- len++;
- }
- }
-
- *pos = first;
- *size = len;
-}
-
static efi_status_t
__setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom)
{
@@ -578,7 +544,7 @@ setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height)
efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID;
unsigned long nr_ugas;
u32 *handles = (u32 *)uga_handle;;
- efi_status_t status;
+ efi_status_t status = EFI_INVALID_PARAMETER;
int i;
first_uga = NULL;
@@ -623,7 +589,7 @@ setup_uga64(void **uga_handle, unsigned long size, u32 *width, u32 *height)
efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID;
unsigned long nr_ugas;
u64 *handles = (u64 *)uga_handle;;
- efi_status_t status;
+ efi_status_t status = EFI_INVALID_PARAMETER;
int i;
first_uga = NULL;
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 1038524270e7..fd0b6a272dd5 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -82,7 +82,7 @@ ENTRY(efi_pe_entry)
/* Relocate efi_config->call() */
leal efi32_config(%esi), %eax
- add %esi, 88(%eax)
+ add %esi, 32(%eax)
pushl %eax
call make_boot_params
@@ -108,7 +108,7 @@ ENTRY(efi32_stub_entry)
/* Relocate efi_config->call() */
leal efi32_config(%esi), %eax
- add %esi, 88(%eax)
+ add %esi, 32(%eax)
pushl %eax
2:
call efi_main
@@ -264,7 +264,7 @@ relocated:
#ifdef CONFIG_EFI_STUB
.data
efi32_config:
- .fill 11,8,0
+ .fill 4,8,0
.long efi_call_phys
.long 0
.byte 0
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 0d80a7ad65cd..efdfba21a5b2 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -265,7 +265,7 @@ ENTRY(efi_pe_entry)
/*
* Relocate efi_config->call().
*/
- addq %rbp, efi64_config+88(%rip)
+ addq %rbp, efi64_config+32(%rip)
movq %rax, %rdi
call make_boot_params
@@ -285,7 +285,7 @@ handover_entry:
* Relocate efi_config->call().
*/
movq efi_config(%rip), %rax
- addq %rbp, 88(%rax)
+ addq %rbp, 32(%rax)
2:
movq efi_config(%rip), %rdi
call efi_main
@@ -457,14 +457,14 @@ efi_config:
#ifdef CONFIG_EFI_MIXED
.global efi32_config
efi32_config:
- .fill 11,8,0
+ .fill 4,8,0
.quad efi64_thunk
.byte 0
#endif
.global efi64_config
efi64_config:
- .fill 11,8,0
+ .fill 4,8,0
.quad efi_call
.byte 1
#endif /* CONFIG_EFI_STUB */
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 80ab68a42621..fee1d95902b5 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1077,7 +1077,6 @@ ENTRY(error_entry)
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
-.Lerror_entry_from_usermode_swapgs:
/*
* We entered from user mode or we're pretending to have entered
* from user mode due to an IRET fault.
@@ -1120,7 +1119,8 @@ ENTRY(error_entry)
* gsbase and proceed. We'll fix up the exception and land in
* .Lgs_change's error handler with kernel gsbase.
*/
- jmp .Lerror_entry_from_usermode_swapgs
+ SWAPGS
+ jmp .Lerror_entry_done
.Lbstep_iret:
/* Fix truncated RIP */
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 0a8bd7fcdbed..d31735f37ed7 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1202,6 +1202,9 @@ static int x86_pmu_add(struct perf_event *event, int flags)
* If group events scheduling transaction was started,
* skip the schedulability test here, it will be performed
* at commit time (->commit_txn) as a whole.
+ *
+ * If commit fails, we'll call ->del() on all events
+ * for which ->add() was called.
*/
if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
goto done_collect;
@@ -1224,6 +1227,14 @@ done_collect:
cpuc->n_added += n - n0;
cpuc->n_txn += n - n0;
+ if (x86_pmu.add) {
+ /*
+ * This is before x86_pmu_enable() will call x86_pmu_start(),
+ * so we enable LBRs before an event needs them etc..
+ */
+ x86_pmu.add(event);
+ }
+
ret = 0;
out:
return ret;
@@ -1347,7 +1358,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;
/*
- * If we're called during a txn, we don't need to do anything.
+ * If we're called during a txn, we only need to undo x86_pmu.add.
* The events never got scheduled and ->cancel_txn will truncate
* the event_list.
*
@@ -1355,7 +1366,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
* an event added during that same TXN.
*/
if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
- return;
+ goto do_del;
/*
* Not a TXN, therefore cleanup properly.
@@ -1385,6 +1396,15 @@ static void x86_pmu_del(struct perf_event *event, int flags)
--cpuc->n_events;
perf_event_update_userpage(event);
+
+do_del:
+ if (x86_pmu.del) {
+ /*
+ * This is after x86_pmu_stop(); so we disable LBRs after any
+ * event can need them etc..
+ */
+ x86_pmu.del(event);
+ }
}
int x86_pmu_handle_irq(struct pt_regs *regs)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 4c9a79b9cd69..a3a9eb84b5cf 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -1906,13 +1906,6 @@ static void intel_pmu_disable_event(struct perf_event *event)
cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
cpuc->intel_cp_status &= ~(1ull << hwc->idx);
- /*
- * must disable before any actual event
- * because any event may be combined with LBR
- */
- if (needs_branch_stack(event))
- intel_pmu_lbr_disable(event);
-
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
intel_pmu_disable_fixed(hwc);
return;
@@ -1924,6 +1917,14 @@ static void intel_pmu_disable_event(struct perf_event *event)
intel_pmu_pebs_disable(event);
}
+static void intel_pmu_del_event(struct perf_event *event)
+{
+ if (needs_branch_stack(event))
+ intel_pmu_lbr_del(event);
+ if (event->attr.precise_ip)
+ intel_pmu_pebs_del(event);
+}
+
static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
{
int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
@@ -1967,12 +1968,6 @@ static void intel_pmu_enable_event(struct perf_event *event)
intel_pmu_enable_bts(hwc->config);
return;
}
- /*
- * must enabled before any actual event
- * because any event may be combined with LBR
- */
- if (needs_branch_stack(event))
- intel_pmu_lbr_enable(event);
if (event->attr.exclude_host)
cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
@@ -1993,6 +1988,14 @@ static void intel_pmu_enable_event(struct perf_event *event)
__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
}
+static void intel_pmu_add_event(struct perf_event *event)
+{
+ if (event->attr.precise_ip)
+ intel_pmu_pebs_add(event);
+ if (needs_branch_stack(event))
+ intel_pmu_lbr_add(event);
+}
+
/*
* Save and restart an expired event. Called by NMI contexts,
* so it has to be careful about preempting normal event ops:
@@ -3291,6 +3294,8 @@ static __initconst const struct x86_pmu intel_pmu = {
.enable_all = intel_pmu_enable_all,
.enable = intel_pmu_enable_event,
.disable = intel_pmu_disable_event,
+ .add = intel_pmu_add_event,
+ .del = intel_pmu_del_event,
.hw_config = intel_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 9b983a474253..0319311dbdbb 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -806,9 +806,65 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event)
return &emptyconstraint;
}
-static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc)
+/*
+ * We need the sched_task callback even for per-cpu events when we use
+ * the large interrupt threshold, such that we can provide PID and TID
+ * to PEBS samples.
+ */
+static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc)
+{
+ return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
+}
+
+static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
+{
+ struct debug_store *ds = cpuc->ds;
+ u64 threshold;
+
+ if (cpuc->n_pebs == cpuc->n_large_pebs) {
+ threshold = ds->pebs_absolute_maximum -
+ x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
+ } else {
+ threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+ }
+
+ ds->pebs_interrupt_threshold = threshold;
+}
+
+static void
+pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu)
+{
+ /*
+ * Make sure we get updated with the first PEBS
+ * event. It will trigger also during removal, but
+ * that does not hurt:
+ */
+ bool update = cpuc->n_pebs == 1;
+
+ if (needed_cb != pebs_needs_sched_cb(cpuc)) {
+ if (!needed_cb)
+ perf_sched_cb_inc(pmu);
+ else
+ perf_sched_cb_dec(pmu);
+
+ update = true;
+ }
+
+ if (update)
+ pebs_update_threshold(cpuc);
+}
+
+void intel_pmu_pebs_add(struct perf_event *event)
{
- return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ bool needed_cb = pebs_needs_sched_cb(cpuc);
+
+ cpuc->n_pebs++;
+ if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
+ cpuc->n_large_pebs++;
+
+ pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
}
void intel_pmu_pebs_enable(struct perf_event *event)
@@ -816,12 +872,9 @@ void intel_pmu_pebs_enable(struct perf_event *event)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
struct debug_store *ds = cpuc->ds;
- bool first_pebs;
- u64 threshold;
hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
- first_pebs = !pebs_is_enabled(cpuc);
cpuc->pebs_enabled |= 1ULL << hwc->idx;
if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
@@ -830,46 +883,34 @@ void intel_pmu_pebs_enable(struct perf_event *event)
cpuc->pebs_enabled |= 1ULL << 63;
/*
- * When the event is constrained enough we can use a larger
- * threshold and run the event with less frequent PMI.
+ * Use auto-reload if possible to save a MSR write in the PMI.
+ * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD.
*/
- if (hwc->flags & PERF_X86_EVENT_FREERUNNING) {
- threshold = ds->pebs_absolute_maximum -
- x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
-
- if (first_pebs)
- perf_sched_cb_inc(event->ctx->pmu);
- } else {
- threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
-
- /*
- * If not all events can use larger buffer,
- * roll back to threshold = 1
- */
- if (!first_pebs &&
- (ds->pebs_interrupt_threshold > threshold))
- perf_sched_cb_dec(event->ctx->pmu);
- }
-
- /* Use auto-reload if possible to save a MSR write in the PMI */
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
ds->pebs_event_reset[hwc->idx] =
(u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
}
+}
+
+void intel_pmu_pebs_del(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ bool needed_cb = pebs_needs_sched_cb(cpuc);
- if (first_pebs || ds->pebs_interrupt_threshold > threshold)
- ds->pebs_interrupt_threshold = threshold;
+ cpuc->n_pebs--;
+ if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
+ cpuc->n_large_pebs--;
+
+ pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
}
void intel_pmu_pebs_disable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- struct debug_store *ds = cpuc->ds;
- bool large_pebs = ds->pebs_interrupt_threshold >
- ds->pebs_buffer_base + x86_pmu.pebs_record_size;
- if (large_pebs)
+ if (cpuc->n_pebs == cpuc->n_large_pebs)
intel_pmu_drain_pebs_buffer();
cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
@@ -879,9 +920,6 @@ void intel_pmu_pebs_disable(struct perf_event *event)
else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
cpuc->pebs_enabled &= ~(1ULL << 63);
- if (large_pebs && !pebs_is_enabled(cpuc))
- perf_sched_cb_dec(event->ctx->pmu);
-
if (cpuc->enabled)
wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 707d358e0dff..fc6cf21c535e 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -380,7 +380,6 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
{
- struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx;
/*
@@ -390,31 +389,21 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
*/
task_ctx = ctx ? ctx->task_ctx_data : NULL;
if (task_ctx) {
- if (sched_in) {
+ if (sched_in)
__intel_pmu_lbr_restore(task_ctx);
- cpuc->lbr_context = ctx;
- } else {
+ else
__intel_pmu_lbr_save(task_ctx);
- }
return;
}
/*
- * When sampling the branck stack in system-wide, it may be
- * necessary to flush the stack on context switch. This happens
- * when the branch stack does not tag its entries with the pid
- * of the current task. Otherwise it becomes impossible to
- * associate a branch entry with a task. This ambiguity is more
- * likely to appear when the branch stack supports priv level
- * filtering and the user sets it to monitor only at the user
- * level (which could be a useful measurement in system-wide
- * mode). In that case, the risk is high of having a branch
- * stack with branch from multiple tasks.
- */
- if (sched_in) {
+ * Since a context switch can flip the address space and LBR entries
+ * are not tagged with an identifier, we need to wipe the LBR, even for
+ * per-cpu events. You simply cannot resolve the branches from the old
+ * address space.
+ */
+ if (sched_in)
intel_pmu_lbr_reset();
- cpuc->lbr_context = ctx;
- }
}
static inline bool branch_user_callstack(unsigned br_sel)
@@ -422,7 +411,7 @@ static inline bool branch_user_callstack(unsigned br_sel)
return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
}
-void intel_pmu_lbr_enable(struct perf_event *event)
+void intel_pmu_lbr_add(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx;
@@ -430,27 +419,38 @@ void intel_pmu_lbr_enable(struct perf_event *event)
if (!x86_pmu.lbr_nr)
return;
- /*
- * Reset the LBR stack if we changed task context to
- * avoid data leaks.
- */
- if (event->ctx->task && cpuc->lbr_context != event->ctx) {
- intel_pmu_lbr_reset();
- cpuc->lbr_context = event->ctx;
- }
cpuc->br_sel = event->hw.branch_reg.reg;
- if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
- event->ctx->task_ctx_data) {
+ if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) {
task_ctx = event->ctx->task_ctx_data;
task_ctx->lbr_callstack_users++;
}
- cpuc->lbr_users++;
+ /*
+ * Request pmu::sched_task() callback, which will fire inside the
+ * regular perf event scheduling, so that call will:
+ *
+ * - restore or wipe; when LBR-callstack,
+ * - wipe; otherwise,
+ *
+ * when this is from __perf_event_task_sched_in().
+ *
+ * However, if this is from perf_install_in_context(), no such callback
+ * will follow and we'll need to reset the LBR here if this is the
+ * first LBR event.
+ *
+ * The problem is, we cannot tell these cases apart... but we can
+ * exclude the biggest chunk of cases by looking at
+ * event->total_time_running. An event that has accrued runtime cannot
+ * be 'new'. Conversely, a new event can get installed through the
+ * context switch path for the first time.
+ */
perf_sched_cb_inc(event->ctx->pmu);
+ if (!cpuc->lbr_users++ && !event->total_time_running)
+ intel_pmu_lbr_reset();
}
-void intel_pmu_lbr_disable(struct perf_event *event)
+void intel_pmu_lbr_del(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx;
@@ -467,12 +467,6 @@ void intel_pmu_lbr_disable(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
perf_sched_cb_dec(event->ctx->pmu);
-
- if (cpuc->enabled && !cpuc->lbr_users) {
- __intel_pmu_lbr_disable();
- /* avoid stale pointer */
- cpuc->lbr_context = NULL;
- }
}
void intel_pmu_lbr_enable_all(bool pmi)
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 861a7d9cb60f..c5047b8f777b 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -69,6 +69,8 @@ static struct pt_cap_desc {
PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)),
PT_CAP(ip_filtering, 0, CR_EBX, BIT(2)),
PT_CAP(mtc, 0, CR_EBX, BIT(3)),
+ PT_CAP(ptwrite, 0, CR_EBX, BIT(4)),
+ PT_CAP(power_event_trace, 0, CR_EBX, BIT(5)),
PT_CAP(topa_output, 0, CR_ECX, BIT(0)),
PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)),
PT_CAP(single_range_output, 0, CR_ECX, BIT(2)),
@@ -259,10 +261,16 @@ fail:
#define RTIT_CTL_MTC (RTIT_CTL_MTC_EN | \
RTIT_CTL_MTC_RANGE)
+#define RTIT_CTL_PTW (RTIT_CTL_PTW_EN | \
+ RTIT_CTL_FUP_ON_PTW)
+
#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | \
RTIT_CTL_DISRETC | \
RTIT_CTL_CYC_PSB | \
- RTIT_CTL_MTC)
+ RTIT_CTL_MTC | \
+ RTIT_CTL_PWR_EVT_EN | \
+ RTIT_CTL_FUP_ON_PTW | \
+ RTIT_CTL_PTW_EN)
static bool pt_event_valid(struct perf_event *event)
{
@@ -311,6 +319,20 @@ static bool pt_event_valid(struct perf_event *event)
return false;
}
+ if (config & RTIT_CTL_PWR_EVT_EN &&
+ !pt_cap_get(PT_CAP_power_event_trace))
+ return false;
+
+ if (config & RTIT_CTL_PTW) {
+ if (!pt_cap_get(PT_CAP_ptwrite))
+ return false;
+
+ /* FUPonPTW without PTW doesn't make sense */
+ if ((config & RTIT_CTL_FUP_ON_PTW) &&
+ !(config & RTIT_CTL_PTW_EN))
+ return false;
+ }
+
return true;
}
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index efffa4a09f68..53473c21b554 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -26,11 +26,14 @@
#define RTIT_CTL_CYCLEACC BIT(1)
#define RTIT_CTL_OS BIT(2)
#define RTIT_CTL_USR BIT(3)
+#define RTIT_CTL_PWR_EVT_EN BIT(4)
+#define RTIT_CTL_FUP_ON_PTW BIT(5)
#define RTIT_CTL_CR3EN BIT(7)
#define RTIT_CTL_TOPA BIT(8)
#define RTIT_CTL_MTC_EN BIT(9)
#define RTIT_CTL_TSC_EN BIT(10)
#define RTIT_CTL_DISRETC BIT(11)
+#define RTIT_CTL_PTW_EN BIT(12)
#define RTIT_CTL_BRANCH_EN BIT(13)
#define RTIT_CTL_MTC_RANGE_OFFSET 14
#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET)
@@ -91,6 +94,8 @@ enum pt_capabilities {
PT_CAP_psb_cyc,
PT_CAP_ip_filtering,
PT_CAP_mtc,
+ PT_CAP_ptwrite,
+ PT_CAP_power_event_trace,
PT_CAP_topa_output,
PT_CAP_topa_multiple_entries,
PT_CAP_single_range_output,
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index 28865938aadf..b0f0e835a770 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -357,6 +357,8 @@ static int rapl_pmu_event_init(struct perf_event *event)
if (event->cpu < 0)
return -EINVAL;
+ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+
/*
* check event is known (determines counter)
*/
@@ -765,6 +767,8 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, skl_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, hsx_rapl_init),
+
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init),
{},
};
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 463dc7a5a6c3..d9844cc74486 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -664,6 +664,8 @@ static int uncore_pmu_event_init(struct perf_event *event)
event->cpu = box->cpu;
event->pmu_private = box;
+ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+
event->hw.idx = -1;
event->hw.last_tag = ~0ULL;
event->hw.extra_reg.idx = EXTRA_REG_NONE;
@@ -683,7 +685,8 @@ static int uncore_pmu_event_init(struct perf_event *event)
/* fixed counters have event field hardcoded to zero */
hwc->config = 0ULL;
} else {
- hwc->config = event->attr.config & pmu->type->event_mask;
+ hwc->config = event->attr.config &
+ (pmu->type->event_mask | ((u64)pmu->type->event_mask_ext << 32));
if (pmu->type->ops->hw_config) {
ret = pmu->type->ops->hw_config(box, event);
if (ret)
@@ -1321,6 +1324,11 @@ static const struct intel_uncore_init_fun skl_uncore_init __initconst = {
.pci_init = skl_uncore_pci_init,
};
+static const struct intel_uncore_init_fun skx_uncore_init __initconst = {
+ .cpu_init = skx_uncore_cpu_init,
+ .pci_init = skx_uncore_pci_init,
+};
+
static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP, nhm_uncore_init),
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM, nhm_uncore_init),
@@ -1343,6 +1351,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_uncore_init),
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP,skl_uncore_init),
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, skx_uncore_init),
{},
};
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 78b9c23e2d8d..ad986c1e29bc 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -44,6 +44,7 @@ struct intel_uncore_type {
unsigned perf_ctr;
unsigned event_ctl;
unsigned event_mask;
+ unsigned event_mask_ext;
unsigned fixed_ctr;
unsigned fixed_ctl;
unsigned box_ctl;
@@ -120,6 +121,7 @@ struct intel_uncore_box {
};
#define UNCORE_BOX_FLAG_INITIATED 0
+#define UNCORE_BOX_FLAG_CTL_OFFS8 1 /* event config registers are 8-byte apart */
struct uncore_event_desc {
struct kobj_attribute attr;
@@ -172,6 +174,9 @@ static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
static inline
unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
{
+ if (test_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags))
+ return idx * 8 + box->pmu->type->event_ctl;
+
return idx * 4 + box->pmu->type->event_ctl;
}
@@ -377,6 +382,8 @@ int bdx_uncore_pci_init(void);
void bdx_uncore_cpu_init(void);
int knl_uncore_pci_init(void);
void knl_uncore_cpu_init(void);
+int skx_uncore_pci_init(void);
+void skx_uncore_cpu_init(void);
/* perf_event_intel_uncore_nhmex.c */
void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 9d35ec0cb8fc..5f845eef9a4d 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -388,6 +388,8 @@ static int snb_uncore_imc_event_init(struct perf_event *event)
event->cpu = box->cpu;
event->pmu_private = box;
+ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+
event->hw.idx = -1;
event->hw.last_tag = ~0ULL;
event->hw.extra_reg.idx = EXTRA_REG_NONE;
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 8aee83bcf71f..272427700d48 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1,6 +1,10 @@
/* SandyBridge-EP/IvyTown uncore support */
#include "uncore.h"
+/* SNB-EP pci bus to socket mapping */
+#define SNBEP_CPUNODEID 0x40
+#define SNBEP_GIDNIDMAP 0x54
+
/* SNB-EP Box level control */
#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0)
#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1)
@@ -264,15 +268,72 @@
SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+/* SKX pci bus to socket mapping */
+#define SKX_CPUNODEID 0xc0
+#define SKX_GIDNIDMAP 0xd4
+
+/* SKX CHA */
+#define SKX_CHA_MSR_PMON_BOX_FILTER_TID (0x1ffULL << 0)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_LINK (0xfULL << 9)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_STATE (0x3ffULL << 17)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_REM (0x1ULL << 32)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_LOC (0x1ULL << 33)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_ALL_OPC (0x1ULL << 35)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_NM (0x1ULL << 36)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_NOT_NM (0x1ULL << 37)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_OPC0 (0x3ffULL << 41)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_OPC1 (0x3ffULL << 51)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63)
+
+/* SKX IIO */
+#define SKX_IIO0_MSR_PMON_CTL0 0xa48
+#define SKX_IIO0_MSR_PMON_CTR0 0xa41
+#define SKX_IIO0_MSR_PMON_BOX_CTL 0xa40
+#define SKX_IIO_MSR_OFFSET 0x20
+
+#define SKX_PMON_CTL_TRESH_MASK (0xff << 24)
+#define SKX_PMON_CTL_TRESH_MASK_EXT (0xf)
+#define SKX_PMON_CTL_CH_MASK (0xff << 4)
+#define SKX_PMON_CTL_FC_MASK (0x7 << 12)
+#define SKX_IIO_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_INVERT | \
+ SKX_PMON_CTL_TRESH_MASK)
+#define SKX_IIO_PMON_RAW_EVENT_MASK_EXT (SKX_PMON_CTL_TRESH_MASK_EXT | \
+ SKX_PMON_CTL_CH_MASK | \
+ SKX_PMON_CTL_FC_MASK)
+
+/* SKX IRP */
+#define SKX_IRP0_MSR_PMON_CTL0 0xa5b
+#define SKX_IRP0_MSR_PMON_CTR0 0xa59
+#define SKX_IRP0_MSR_PMON_BOX_CTL 0xa58
+#define SKX_IRP_MSR_OFFSET 0x20
+
+/* SKX UPI */
+#define SKX_UPI_PCI_PMON_CTL0 0x350
+#define SKX_UPI_PCI_PMON_CTR0 0x318
+#define SKX_UPI_PCI_PMON_BOX_CTL 0x378
+#define SKX_PMON_CTL_UMASK_EXT 0xff
+
+/* SKX M2M */
+#define SKX_M2M_PCI_PMON_CTL0 0x228
+#define SKX_M2M_PCI_PMON_CTR0 0x200
+#define SKX_M2M_PCI_PMON_BOX_CTL 0x258
+
DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7");
DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-39");
DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh9, thresh, "config:24-35");
DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29");
DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
@@ -280,6 +341,8 @@ DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31");
+DEFINE_UNCORE_FORMAT_ATTR(ch_mask, ch_mask, "config:36-43");
+DEFINE_UNCORE_FORMAT_ATTR(fc_mask, fc_mask, "config:44-46");
DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0");
DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5");
@@ -288,18 +351,26 @@ DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5");
DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8");
DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link4, filter_link, "config1:9-12");
DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23");
DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state5, filter_state, "config1:17-26");
+DEFINE_UNCORE_FORMAT_ATTR(filter_rem, filter_rem, "config1:32");
+DEFINE_UNCORE_FORMAT_ATTR(filter_loc, filter_loc, "config1:33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nm, filter_nm, "config1:36");
+DEFINE_UNCORE_FORMAT_ATTR(filter_not_nm, filter_not_nm, "config1:37");
DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33");
DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35");
DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37");
DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc_0, filter_opc0, "config1:41-50");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc_1, filter_opc1, "config1:51-60");
DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62");
DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61");
DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63");
@@ -1153,7 +1224,7 @@ static struct pci_driver snbep_uncore_pci_driver = {
/*
* build pci bus to socket mapping
*/
-static int snbep_pci2phy_map_init(int devid)
+static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse)
{
struct pci_dev *ubox_dev = NULL;
int i, bus, nodeid, segment;
@@ -1168,12 +1239,12 @@ static int snbep_pci2phy_map_init(int devid)
break;
bus = ubox_dev->bus->number;
/* get the Node ID of the local register */
- err = pci_read_config_dword(ubox_dev, 0x40, &config);
+ err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
if (err)
break;
nodeid = config;
/* get the Node ID mapping */
- err = pci_read_config_dword(ubox_dev, 0x54, &config);
+ err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
if (err)
break;
@@ -1207,11 +1278,20 @@ static int snbep_pci2phy_map_init(int devid)
raw_spin_lock(&pci2phy_map_lock);
list_for_each_entry(map, &pci2phy_map_head, list) {
i = -1;
- for (bus = 255; bus >= 0; bus--) {
- if (map->pbus_to_physid[bus] >= 0)
- i = map->pbus_to_physid[bus];
- else
- map->pbus_to_physid[bus] = i;
+ if (reverse) {
+ for (bus = 255; bus >= 0; bus--) {
+ if (map->pbus_to_physid[bus] >= 0)
+ i = map->pbus_to_physid[bus];
+ else
+ map->pbus_to_physid[bus] = i;
+ }
+ } else {
+ for (bus = 0; bus <= 255; bus++) {
+ if (map->pbus_to_physid[bus] >= 0)
+ i = map->pbus_to_physid[bus];
+ else
+ map->pbus_to_physid[bus] = i;
+ }
}
}
raw_spin_unlock(&pci2phy_map_lock);
@@ -1224,7 +1304,7 @@ static int snbep_pci2phy_map_init(int devid)
int snbep_uncore_pci_init(void)
{
- int ret = snbep_pci2phy_map_init(0x3ce0);
+ int ret = snbep_pci2phy_map_init(0x3ce0, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
if (ret)
return ret;
uncore_pci_uncores = snbep_pci_uncores;
@@ -1788,7 +1868,7 @@ static struct pci_driver ivbep_uncore_pci_driver = {
int ivbep_uncore_pci_init(void)
{
- int ret = snbep_pci2phy_map_init(0x0e1e);
+ int ret = snbep_pci2phy_map_init(0x0e1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
if (ret)
return ret;
uncore_pci_uncores = ivbep_pci_uncores;
@@ -2897,7 +2977,7 @@ static struct pci_driver hswep_uncore_pci_driver = {
int hswep_uncore_pci_init(void)
{
- int ret = snbep_pci2phy_map_init(0x2f1e);
+ int ret = snbep_pci2phy_map_init(0x2f1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
if (ret)
return ret;
uncore_pci_uncores = hswep_pci_uncores;
@@ -3186,7 +3266,7 @@ static struct pci_driver bdx_uncore_pci_driver = {
int bdx_uncore_pci_init(void)
{
- int ret = snbep_pci2phy_map_init(0x6f1e);
+ int ret = snbep_pci2phy_map_init(0x6f1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
if (ret)
return ret;
@@ -3196,3 +3276,525 @@ int bdx_uncore_pci_init(void)
}
/* end of BDX uncore support */
+
+/* SKX uncore support */
+
+static struct intel_uncore_type skx_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = HSWEP_U_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_U_MSR_PMON_CTL0,
+ .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &ivbep_uncore_ubox_format_group,
+};
+
+static struct attribute *skx_uncore_cha_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid4.attr,
+ &format_attr_filter_link4.attr,
+ &format_attr_filter_state5.attr,
+ &format_attr_filter_rem.attr,
+ &format_attr_filter_loc.attr,
+ &format_attr_filter_nm.attr,
+ &format_attr_filter_all_op.attr,
+ &format_attr_filter_not_nm.attr,
+ &format_attr_filter_opc_0.attr,
+ &format_attr_filter_opc_1.attr,
+ &format_attr_filter_nc.attr,
+ &format_attr_filter_c6.attr,
+ &format_attr_filter_isoc.attr,
+ NULL,
+};
+
+static struct attribute_group skx_uncore_chabox_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_cha_formats_attr,
+};
+
+static struct event_constraint skx_uncore_chabox_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg skx_uncore_cha_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8134, 0xffff, 0x4),
+};
+
+static u64 skx_cha_filter_mask(int fields)
+{
+ u64 mask = 0;
+
+ if (fields & 0x1)
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_TID;
+ if (fields & 0x2)
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_LINK;
+ if (fields & 0x4)
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_STATE;
+ return mask;
+}
+
+static struct event_constraint *
+skx_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, skx_cha_filter_mask);
+}
+
+static int skx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct extra_reg *er;
+ int idx = 0;
+
+ for (er = skx_uncore_cha_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ idx |= er->idx;
+ }
+
+ if (idx) {
+ reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+ HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & skx_cha_filter_mask(idx);
+ reg1->idx = idx;
+ }
+ return 0;
+}
+
+static struct intel_uncore_ops skx_uncore_chabox_ops = {
+ /* There is no frz_en for chabox ctl */
+ .init_box = ivbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = hswep_cbox_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = skx_cha_hw_config,
+ .get_constraint = skx_cha_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type skx_uncore_chabox = {
+ .name = "cha",
+ .num_counters = 4,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
+ .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = skx_uncore_chabox_constraints,
+ .ops = &skx_uncore_chabox_ops,
+ .format_group = &skx_uncore_chabox_format_group,
+};
+
+static struct attribute *skx_uncore_iio_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh9.attr,
+ &format_attr_ch_mask.attr,
+ &format_attr_fc_mask.attr,
+ NULL,
+};
+
+static struct attribute_group skx_uncore_iio_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_iio_formats_attr,
+};
+
+static struct event_constraint skx_uncore_iio_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x83, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x88, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x95, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xc5, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xd4, 0xc),
+ EVENT_CONSTRAINT_END
+};
+
+static void skx_iio_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops skx_uncore_iio_ops = {
+ .init_box = ivbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = skx_iio_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type skx_uncore_iio = {
+ .name = "iio",
+ .num_counters = 4,
+ .num_boxes = 5,
+ .perf_ctr_bits = 48,
+ .event_ctl = SKX_IIO0_MSR_PMON_CTL0,
+ .perf_ctr = SKX_IIO0_MSR_PMON_CTR0,
+ .event_mask = SKX_IIO_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SKX_IIO_PMON_RAW_EVENT_MASK_EXT,
+ .box_ctl = SKX_IIO0_MSR_PMON_BOX_CTL,
+ .msr_offset = SKX_IIO_MSR_OFFSET,
+ .constraints = skx_uncore_iio_constraints,
+ .ops = &skx_uncore_iio_ops,
+ .format_group = &skx_uncore_iio_format_group,
+};
+
+static struct attribute *skx_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute_group skx_uncore_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_formats_attr,
+};
+
+static struct intel_uncore_type skx_uncore_irp = {
+ .name = "irp",
+ .num_counters = 2,
+ .num_boxes = 5,
+ .perf_ctr_bits = 48,
+ .event_ctl = SKX_IRP0_MSR_PMON_CTL0,
+ .perf_ctr = SKX_IRP0_MSR_PMON_CTR0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SKX_IRP0_MSR_PMON_BOX_CTL,
+ .msr_offset = SKX_IRP_MSR_OFFSET,
+ .ops = &skx_uncore_iio_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct intel_uncore_ops skx_uncore_pcu_ops = {
+ IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = hswep_pcu_hw_config,
+ .get_constraint = snbep_pcu_get_constraint,
+ .put_constraint = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type skx_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_PCU_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &skx_uncore_pcu_ops,
+ .format_group = &snbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *skx_msr_uncores[] = {
+ &skx_uncore_ubox,
+ &skx_uncore_chabox,
+ &skx_uncore_iio,
+ &skx_uncore_irp,
+ &skx_uncore_pcu,
+ NULL,
+};
+
+static int skx_count_chabox(void)
+{
+ struct pci_dev *chabox_dev = NULL;
+ int bus, count = 0;
+
+ while (1) {
+ chabox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x208d, chabox_dev);
+ if (!chabox_dev)
+ break;
+ if (count == 0)
+ bus = chabox_dev->bus->number;
+ if (bus != chabox_dev->bus->number)
+ break;
+ count++;
+ }
+
+ pci_dev_put(chabox_dev);
+ return count;
+}
+
+void skx_uncore_cpu_init(void)
+{
+ skx_uncore_chabox.num_boxes = skx_count_chabox();
+ uncore_msr_uncores = skx_msr_uncores;
+}
+
+static struct intel_uncore_type skx_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = hswep_uncore_imc_events,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivbep_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct attribute *skx_upi_uncore_formats_attr[] = {
+ &format_attr_event_ext.attr,
+ &format_attr_umask_ext.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute_group skx_upi_uncore_format_group = {
+ .name = "format",
+ .attrs = skx_upi_uncore_formats_attr,
+};
+
+static void skx_upi_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+
+ __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags);
+ pci_write_config_dword(pdev, SKX_UPI_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
+}
+
+static struct intel_uncore_ops skx_upi_uncore_pci_ops = {
+ .init_box = skx_upi_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_uncore_pci_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+};
+
+static struct intel_uncore_type skx_uncore_upi = {
+ .name = "upi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SKX_UPI_PCI_PMON_CTR0,
+ .event_ctl = SKX_UPI_PCI_PMON_CTL0,
+ .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SKX_PMON_CTL_UMASK_EXT,
+ .box_ctl = SKX_UPI_PCI_PMON_BOX_CTL,
+ .ops = &skx_upi_uncore_pci_ops,
+ .format_group = &skx_upi_uncore_format_group,
+};
+
+static void skx_m2m_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+
+ __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags);
+ pci_write_config_dword(pdev, SKX_M2M_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
+}
+
+static struct intel_uncore_ops skx_m2m_uncore_pci_ops = {
+ .init_box = skx_m2m_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_uncore_pci_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+};
+
+static struct intel_uncore_type skx_uncore_m2m = {
+ .name = "m2m",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SKX_M2M_PCI_PMON_CTR0,
+ .event_ctl = SKX_M2M_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SKX_M2M_PCI_PMON_BOX_CTL,
+ .ops = &skx_m2m_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct event_constraint skx_uncore_m2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type skx_uncore_m2pcie = {
+ .name = "m2pcie",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 48,
+ .constraints = skx_uncore_m2pcie_constraints,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivbep_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct event_constraint skx_uncore_m3upi_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x1d, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x1e, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x40, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x4e, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x4f, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x50, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x51, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x52, 0x7),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type skx_uncore_m3upi = {
+ .name = "m3upi",
+ .num_counters = 3,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .constraints = skx_uncore_m3upi_constraints,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivbep_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+enum {
+ SKX_PCI_UNCORE_IMC,
+ SKX_PCI_UNCORE_M2M,
+ SKX_PCI_UNCORE_UPI,
+ SKX_PCI_UNCORE_M2PCIE,
+ SKX_PCI_UNCORE_M3UPI,
+};
+
+static struct intel_uncore_type *skx_pci_uncores[] = {
+ [SKX_PCI_UNCORE_IMC] = &skx_uncore_imc,
+ [SKX_PCI_UNCORE_M2M] = &skx_uncore_m2m,
+ [SKX_PCI_UNCORE_UPI] = &skx_uncore_upi,
+ [SKX_PCI_UNCORE_M2PCIE] = &skx_uncore_m2pcie,
+ [SKX_PCI_UNCORE_M3UPI] = &skx_uncore_m3upi,
+ NULL,
+};
+
+static const struct pci_device_id skx_uncore_pci_ids[] = {
+ { /* MC0 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2042),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 2, SKX_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC0 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2046),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 6, SKX_PCI_UNCORE_IMC, 1),
+ },
+ { /* MC0 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204a),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(11, 2, SKX_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC1 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2042),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 2, SKX_PCI_UNCORE_IMC, 3),
+ },
+ { /* MC1 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2046),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 6, SKX_PCI_UNCORE_IMC, 4),
+ },
+ { /* MC1 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204a),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(13, 2, SKX_PCI_UNCORE_IMC, 5),
+ },
+ { /* M2M0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2066),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 0, SKX_PCI_UNCORE_M2M, 0),
+ },
+ { /* M2M1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2066),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 0, SKX_PCI_UNCORE_M2M, 1),
+ },
+ { /* UPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(14, 0, SKX_PCI_UNCORE_UPI, 0),
+ },
+ { /* UPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(15, 0, SKX_PCI_UNCORE_UPI, 1),
+ },
+ { /* UPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(16, 0, SKX_PCI_UNCORE_UPI, 2),
+ },
+ { /* M2PCIe 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 1, SKX_PCI_UNCORE_M2PCIE, 0),
+ },
+ { /* M2PCIe 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(22, 1, SKX_PCI_UNCORE_M2PCIE, 1),
+ },
+ { /* M2PCIe 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(23, 1, SKX_PCI_UNCORE_M2PCIE, 2),
+ },
+ { /* M2PCIe 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 5, SKX_PCI_UNCORE_M2PCIE, 3),
+ },
+ { /* M3UPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 0, SKX_PCI_UNCORE_M3UPI, 0),
+ },
+ { /* M3UPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 1, SKX_PCI_UNCORE_M3UPI, 1),
+ },
+ { /* M3UPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 4, SKX_PCI_UNCORE_M3UPI, 2),
+ },
+ { /* end: all zeroes */ }
+};
+
+
+static struct pci_driver skx_uncore_pci_driver = {
+ .name = "skx_uncore",
+ .id_table = skx_uncore_pci_ids,
+};
+
+int skx_uncore_pci_init(void)
+{
+ /* need to double check pci address */
+ int ret = snbep_pci2phy_map_init(0x2014, SKX_CPUNODEID, SKX_GIDNIDMAP, false);
+
+ if (ret)
+ return ret;
+
+ uncore_pci_uncores = skx_pci_uncores;
+ uncore_pci_driver = &skx_uncore_pci_driver;
+ return 0;
+}
+
+/* end of SKX uncore support */
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 8c4a47706296..5874d8de1f8d 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -194,12 +194,13 @@ struct cpu_hw_events {
*/
struct debug_store *ds;
u64 pebs_enabled;
+ int n_pebs;
+ int n_large_pebs;
/*
* Intel LBR bits
*/
int lbr_users;
- void *lbr_context;
struct perf_branch_stack lbr_stack;
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
struct er_account *lbr_sel;
@@ -508,6 +509,8 @@ struct x86_pmu {
void (*enable_all)(int added);
void (*enable)(struct perf_event *);
void (*disable)(struct perf_event *);
+ void (*add)(struct perf_event *);
+ void (*del)(struct perf_event *);
int (*hw_config)(struct perf_event *event);
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
unsigned eventsel;
@@ -888,6 +891,10 @@ extern struct event_constraint intel_skl_pebs_event_constraints[];
struct event_constraint *intel_pebs_constraints(struct perf_event *event);
+void intel_pmu_pebs_add(struct perf_event *event);
+
+void intel_pmu_pebs_del(struct perf_event *event);
+
void intel_pmu_pebs_enable(struct perf_event *event);
void intel_pmu_pebs_disable(struct perf_event *event);
@@ -906,9 +913,9 @@ u64 lbr_from_signext_quirk_wr(u64 val);
void intel_pmu_lbr_reset(void);
-void intel_pmu_lbr_enable(struct perf_event *event);
+void intel_pmu_lbr_add(struct perf_event *event);
-void intel_pmu_lbr_disable(struct perf_event *event);
+void intel_pmu_lbr_del(struct perf_event *event);
void intel_pmu_lbr_enable_all(bool pmi);
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 124357773ffa..f5aaf6c83222 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -650,8 +650,8 @@ static inline void entering_ack_irq(void)
static inline void ipi_entering_ack_irq(void)
{
- ack_APIC_irq();
irq_enter();
+ ack_APIC_irq();
}
static inline void exiting_irq(void)
@@ -661,9 +661,8 @@ static inline void exiting_irq(void)
static inline void exiting_ack_irq(void)
{
- irq_exit();
- /* Ack only at the end to avoid potential reentry */
ack_APIC_irq();
+ irq_exit();
}
extern void ioapic_zap_locks(void);
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 9733361fed6f..97848cdfcb1a 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -158,53 +158,9 @@ extern void __add_wrong_size(void)
* value of "*ptr".
*
* xadd() is locked when multiple CPUs are online
- * xadd_sync() is always locked
- * xadd_local() is never locked
*/
#define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock)
#define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX)
-#define xadd_sync(ptr, inc) __xadd((ptr), (inc), "lock; ")
-#define xadd_local(ptr, inc) __xadd((ptr), (inc), "")
-
-#define __add(ptr, inc, lock) \
- ({ \
- __typeof__ (*(ptr)) __ret = (inc); \
- switch (sizeof(*(ptr))) { \
- case __X86_CASE_B: \
- asm volatile (lock "addb %b1, %0\n" \
- : "+m" (*(ptr)) : "qi" (inc) \
- : "memory", "cc"); \
- break; \
- case __X86_CASE_W: \
- asm volatile (lock "addw %w1, %0\n" \
- : "+m" (*(ptr)) : "ri" (inc) \
- : "memory", "cc"); \
- break; \
- case __X86_CASE_L: \
- asm volatile (lock "addl %1, %0\n" \
- : "+m" (*(ptr)) : "ri" (inc) \
- : "memory", "cc"); \
- break; \
- case __X86_CASE_Q: \
- asm volatile (lock "addq %1, %0\n" \
- : "+m" (*(ptr)) : "ri" (inc) \
- : "memory", "cc"); \
- break; \
- default: \
- __add_wrong_size(); \
- } \
- __ret; \
- })
-
-/*
- * add_*() adds "inc" to "*ptr"
- *
- * __add() takes a lock prefix
- * add_smp() is locked when multiple CPUs are online
- * add_sync() is always locked
- */
-#define add_smp(ptr, inc) __add((ptr), (inc), LOCK_PREFIX)
-#define add_sync(ptr, inc) __add((ptr), (inc), "lock; ")
#define __cmpxchg_double(pfx, p1, p2, o1, o2, n1, n2) \
({ \
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 92a8308b96f6..1188bc849ee3 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -106,7 +106,6 @@
#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
-#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index d0bb76d81402..389d700b961e 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -117,7 +117,6 @@ extern int __init efi_memblock_x86_reserve_range(void);
extern pgd_t * __init efi_call_phys_prolog(void);
extern void __init efi_call_phys_epilog(pgd_t *save_pgd);
extern void __init efi_print_memmap(void);
-extern void __init efi_unmap_memmap(void);
extern void __init efi_memory_uc(u64 addr, unsigned long size);
extern void __init efi_map_region(efi_memory_desc_t *md);
extern void __init efi_map_region_fixed(efi_memory_desc_t *md);
@@ -192,14 +191,7 @@ static inline efi_status_t efi_thunk_set_virtual_address_map(
struct efi_config {
u64 image_handle;
u64 table;
- u64 allocate_pool;
- u64 allocate_pages;
- u64 get_memory_map;
- u64 free_pool;
- u64 free_pages;
- u64 locate_handle;
- u64 handle_protocol;
- u64 exit_boot_services;
+ u64 boot_services;
u64 text_output;
efi_status_t (*call)(unsigned long, ...);
bool is64;
@@ -207,14 +199,27 @@ struct efi_config {
__pure const struct efi_config *__efi_early(void);
+static inline bool efi_is_64bit(void)
+{
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return false;
+
+ if (!IS_ENABLED(CONFIG_EFI_MIXED))
+ return true;
+
+ return __efi_early()->is64;
+}
+
#define efi_call_early(f, ...) \
- __efi_early()->call(__efi_early()->f, __VA_ARGS__);
+ __efi_early()->call(efi_is_64bit() ? \
+ ((efi_boot_services_64_t *)(unsigned long) \
+ __efi_early()->boot_services)->f : \
+ ((efi_boot_services_32_t *)(unsigned long) \
+ __efi_early()->boot_services)->f, __VA_ARGS__)
#define __efi_call_early(f, ...) \
__efi_early()->call((unsigned long)f, __VA_ARGS__);
-#define efi_is_64bit() __efi_early()->is64
-
extern bool efi_reboot_required(void);
#else
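
The reworked efi_call_early() now looks the requested function up in the firmware's boot services table at call time, using efi_is_64bit() to decide whether to read the table with the 32-bit or 64-bit layout. A minimal user-space sketch of that dispatch, with toy stand-in tables (not the real efi_boot_services_32/64_t layouts, which are much larger):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct boot_services_32 { uint32_t allocate_pool; };	/* toy 32-bit table: 4-byte slots */
struct boot_services_64 { uint64_t allocate_pool; };	/* toy 64-bit table: 8-byte slots */

struct efi_config_sketch {
	uint64_t boot_services;		/* raw pointer to whichever table the firmware handed us */
	bool	 is64;
};

/* Mirrors the macro: pick the slot out of the correctly-typed table. */
#define call_early_slot(cfg, f)						\
	((cfg)->is64 ?							\
	 (unsigned long)((struct boot_services_64 *)(unsigned long)(cfg)->boot_services)->f : \
	 (unsigned long)((struct boot_services_32 *)(unsigned long)(cfg)->boot_services)->f)

int main(void)
{
	struct boot_services_64 bs64 = { .allocate_pool = 0xdeadbeefULL };
	struct efi_config_sketch cfg = {
		.boot_services	= (uint64_t)(unsigned long)&bs64,
		.is64		= true,
	};

	printf("resolved slot = %#lx\n", call_early_slot(&cfg, allocate_pool));
	return 0;
}

The payoff of the change is visible in struct efi_config above: only the table base has to be stashed, instead of copying out every boot-services slot individually.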
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 055ea9941dd5..67942b6ad4b7 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -43,6 +43,9 @@ struct hypervisor_x86 {
/* X2APIC detection (run once per boot) */
bool (*x2apic_available)(void);
+
+ /* pin current vcpu to specified physical cpu (run rarely) */
+ void (*pin_vcpu)(int);
};
extern const struct hypervisor_x86 *x86_hyper;
@@ -56,6 +59,7 @@ extern const struct hypervisor_x86 x86_hyper_kvm;
extern void init_hypervisor(struct cpuinfo_x86 *c);
extern void init_hypervisor_platform(void);
extern bool hypervisor_x2apic_available(void);
+extern void hypervisor_pin_vcpu(int cpu);
#else
static inline void init_hypervisor(struct cpuinfo_x86 *c) { }
static inline void init_hypervisor_platform(void) { }
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 8bf766ef0e18..9bd7ff5ffbcc 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -40,9 +40,10 @@
#define MCI_STATUS_AR (1ULL<<55) /* Action required */
/* AMD-specific bits */
+#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */
+#define MCI_STATUS_SYNDV (1ULL<<53) /* synd reg. valid */
#define MCI_STATUS_DEFERRED (1ULL<<44) /* uncorrected error, deferred exception */
#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
-#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */
/*
* McaX field if set indicates a given bank supports MCA extensions:
@@ -110,6 +111,7 @@
#define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003
#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004
#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005
+#define MSR_AMD64_SMCA_MC0_SYND 0xc0002006
#define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008
#define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009
#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a
@@ -119,6 +121,7 @@
#define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_SYND(x) (MSR_AMD64_SMCA_MC0_SYND + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))
@@ -334,44 +337,47 @@ extern void apei_mce_report_mem_error(int corrected,
* Scalable MCA.
*/
#ifdef CONFIG_X86_MCE_AMD
-enum amd_ip_types {
- SMCA_F17H_CORE = 0, /* Core errors */
- SMCA_DF, /* Data Fabric */
- SMCA_UMC, /* Unified Memory Controller */
- SMCA_PB, /* Parameter Block */
- SMCA_PSP, /* Platform Security Processor */
- SMCA_SMU, /* System Management Unit */
- N_AMD_IP_TYPES
-};
-
-struct amd_hwid {
- const char *name;
- unsigned int hwid;
-};
-
-extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES];
-enum amd_core_mca_blocks {
+/* These may be used by multiple smca_hwid_mcatypes */
+enum smca_bank_types {
SMCA_LS = 0, /* Load Store */
SMCA_IF, /* Instruction Fetch */
- SMCA_L2_CACHE, /* L2 cache */
- SMCA_DE, /* Decoder unit */
- RES, /* Reserved */
- SMCA_EX, /* Execution unit */
+ SMCA_L2_CACHE, /* L2 Cache */
+ SMCA_DE, /* Decoder Unit */
+ SMCA_EX, /* Execution Unit */
SMCA_FP, /* Floating Point */
- SMCA_L3_CACHE, /* L3 cache */
- N_CORE_MCA_BLOCKS
+ SMCA_L3_CACHE, /* L3 Cache */
+ SMCA_CS, /* Coherent Slave */
+ SMCA_PIE, /* Power, Interrupts, etc. */
+ SMCA_UMC, /* Unified Memory Controller */
+ SMCA_PB, /* Parameter Block */
+ SMCA_PSP, /* Platform Security Processor */
+ SMCA_SMU, /* System Management Unit */
+ N_SMCA_BANK_TYPES
};
-extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS];
+struct smca_bank_name {
+ const char *name; /* Short name for sysfs */
+ const char *long_name; /* Long name for pretty-printing */
+};
+
+extern struct smca_bank_name smca_bank_names[N_SMCA_BANK_TYPES];
+
+#define HWID_MCATYPE(hwid, mcatype) ((hwid << 16) | mcatype)
-enum amd_df_mca_blocks {
- SMCA_CS = 0, /* Coherent Slave */
- SMCA_PIE, /* Power management, Interrupts, etc */
- N_DF_BLOCKS
+struct smca_hwid_mcatype {
+ unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
+ u32 hwid_mcatype; /* (hwid,mcatype) tuple */
+ u32 xec_bitmap; /* Bitmap of valid ExtErrorCodes; current max is 21. */
};
-extern const char * const amd_df_mcablock_names[N_DF_BLOCKS];
+struct smca_bank_info {
+ struct smca_hwid_mcatype *type;
+ u32 type_instance;
+};
+
+extern struct smca_bank_info smca_banks[MAX_NR_BANKS];
+
#endif
#endif /* _ASM_X86_MCE_H */
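
HWID_MCATYPE() packs a bank's hardware ID into the upper 16 bits and its MCA type into the lower 16, so the (hwid, mcatype) tuple can be matched with a single 32-bit compare against smca_hwid_mcatype.hwid_mcatype. A stand-alone illustration using the L2 cache values from the table added in mce_amd.c below:

#include <stdio.h>
#include <stdint.h>

#define HWID_MCATYPE(hwid, mcatype)	((hwid << 16) | mcatype)

int main(void)
{
	/* ZN core L2 cache bank: HWID 0xB0, MCATYPE 0x2 */
	uint32_t key = HWID_MCATYPE(0xB0, 0x2);		/* 0x00B00002 */

	unsigned int hwid    = key >> 16;		/* 0xB0 */
	unsigned int mcatype = key & 0xFFFF;		/* 0x2  */

	printf("key=%#x hwid=%#x mcatype=%#x\n", key, hwid, mcatype);
	return 0;
}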
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index b07233b64578..32007041ef8c 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -6,7 +6,6 @@
#include <asm/x86_init.h>
#include <asm/apicdef.h>
-extern int apic_version[];
extern int pic_mode;
#ifdef CONFIG_X86_32
@@ -40,6 +39,7 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES];
extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
extern unsigned int boot_cpu_physical_apicid;
+extern u8 boot_cpu_apic_version;
extern unsigned long mp_lapic_addr;
#ifdef CONFIG_X86_LOCAL_APIC
@@ -86,6 +86,7 @@ static inline void early_reserve_e820_mpc_new(void) { }
#endif
int generic_processor_info(int apicid, int version);
+int __generic_processor_info(int apicid, int version, bool enabled);
#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 91b6f4eed3fd..ce932812f142 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -657,8 +657,6 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
-#ifdef CONFIG_QUEUED_SPINLOCKS
-
static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
u32 val)
{
@@ -680,22 +678,6 @@ static __always_inline void pv_kick(int cpu)
PVOP_VCALL1(pv_lock_ops.kick, cpu);
}
-#else /* !CONFIG_QUEUED_SPINLOCKS */
-
-static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock,
- __ticket_t ticket)
-{
- PVOP_VCALLEE2(pv_lock_ops.lock_spinning, lock, ticket);
-}
-
-static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock,
- __ticket_t ticket)
-{
- PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket);
-}
-
-#endif /* CONFIG_QUEUED_SPINLOCKS */
-
#endif /* SMP && PARAVIRT_SPINLOCKS */
#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index fcf243f077ac..0f400c0e4979 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -300,23 +300,16 @@ struct pv_mmu_ops {
struct arch_spinlock;
#ifdef CONFIG_SMP
#include <asm/spinlock_types.h>
-#else
-typedef u16 __ticket_t;
#endif
struct qspinlock;
struct pv_lock_ops {
-#ifdef CONFIG_QUEUED_SPINLOCKS
void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val);
struct paravirt_callee_save queued_spin_unlock;
void (*wait)(u8 *ptr, u8 val);
void (*kick)(int cpu);
-#else /* !CONFIG_QUEUED_SPINLOCKS */
- struct paravirt_callee_save lock_spinning;
- void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket);
-#endif /* !CONFIG_QUEUED_SPINLOCKS */
};
/* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index 643eba42d620..2c1ebeb4d737 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -46,10 +46,7 @@ static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n)
static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n)
{
- if (static_cpu_has(X86_FEATURE_MCE_RECOVERY))
- return memcpy_mcsafe(dst, src, n);
- memcpy(dst, src, n);
- return 0;
+ return memcpy_mcsafe(dst, src, n);
}
/**
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index 8dbc762ad132..3d33a719f5c1 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -154,7 +154,7 @@ static inline bool __down_write_trylock(struct rw_semaphore *sem)
: "+m" (sem->count), "=&a" (tmp0), "=&r" (tmp1),
CC_OUT(e) (result)
: "er" (RWSEM_ACTIVE_WRITE_BIAS)
- : "memory", "cc");
+ : "memory");
return result;
}
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index be0a05913b91..921bea7a2708 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -20,187 +20,13 @@
* (the type definitions are in asm/spinlock_types.h)
*/
-#ifdef CONFIG_X86_32
-# define LOCK_PTR_REG "a"
-#else
-# define LOCK_PTR_REG "D"
-#endif
-
-#if defined(CONFIG_X86_32) && (defined(CONFIG_X86_PPRO_FENCE))
-/*
- * On PPro SMP, we use a locked operation to unlock
- * (PPro errata 66, 92)
- */
-# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
-#else
-# define UNLOCK_LOCK_PREFIX
-#endif
-
/* How long a lock should spin before we consider blocking */
#define SPIN_THRESHOLD (1 << 15)
extern struct static_key paravirt_ticketlocks_enabled;
static __always_inline bool static_key_false(struct static_key *key);
-#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm/qspinlock.h>
-#else
-
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-
-static inline void __ticket_enter_slowpath(arch_spinlock_t *lock)
-{
- set_bit(0, (volatile unsigned long *)&lock->tickets.head);
-}
-
-#else /* !CONFIG_PARAVIRT_SPINLOCKS */
-static __always_inline void __ticket_lock_spinning(arch_spinlock_t *lock,
- __ticket_t ticket)
-{
-}
-static inline void __ticket_unlock_kick(arch_spinlock_t *lock,
- __ticket_t ticket)
-{
-}
-
-#endif /* CONFIG_PARAVIRT_SPINLOCKS */
-static inline int __tickets_equal(__ticket_t one, __ticket_t two)
-{
- return !((one ^ two) & ~TICKET_SLOWPATH_FLAG);
-}
-
-static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock,
- __ticket_t head)
-{
- if (head & TICKET_SLOWPATH_FLAG) {
- arch_spinlock_t old, new;
-
- old.tickets.head = head;
- new.tickets.head = head & ~TICKET_SLOWPATH_FLAG;
- old.tickets.tail = new.tickets.head + TICKET_LOCK_INC;
- new.tickets.tail = old.tickets.tail;
-
- /* try to clear slowpath flag when there are no contenders */
- cmpxchg(&lock->head_tail, old.head_tail, new.head_tail);
- }
-}
-
-static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
-{
- return __tickets_equal(lock.tickets.head, lock.tickets.tail);
-}
-
-/*
- * Ticket locks are conceptually two parts, one indicating the current head of
- * the queue, and the other indicating the current tail. The lock is acquired
- * by atomically noting the tail and incrementing it by one (thus adding
- * ourself to the queue and noting our position), then waiting until the head
- * becomes equal to the the initial value of the tail.
- *
- * We use an xadd covering *both* parts of the lock, to increment the tail and
- * also load the position of the head, which takes care of memory ordering
- * issues and should be optimal for the uncontended case. Note the tail must be
- * in the high part, because a wide xadd increment of the low part would carry
- * up and contaminate the high part.
- */
-static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
-{
- register struct __raw_tickets inc = { .tail = TICKET_LOCK_INC };
-
- inc = xadd(&lock->tickets, inc);
- if (likely(inc.head == inc.tail))
- goto out;
-
- for (;;) {
- unsigned count = SPIN_THRESHOLD;
-
- do {
- inc.head = READ_ONCE(lock->tickets.head);
- if (__tickets_equal(inc.head, inc.tail))
- goto clear_slowpath;
- cpu_relax();
- } while (--count);
- __ticket_lock_spinning(lock, inc.tail);
- }
-clear_slowpath:
- __ticket_check_and_clear_slowpath(lock, inc.head);
-out:
- barrier(); /* make sure nothing creeps before the lock is taken */
-}
-
-static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
-{
- arch_spinlock_t old, new;
-
- old.tickets = READ_ONCE(lock->tickets);
- if (!__tickets_equal(old.tickets.head, old.tickets.tail))
- return 0;
-
- new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT);
- new.head_tail &= ~TICKET_SLOWPATH_FLAG;
-
- /* cmpxchg is a full barrier, so nothing can move before it */
- return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
-}
-
-static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
- if (TICKET_SLOWPATH_FLAG &&
- static_key_false(&paravirt_ticketlocks_enabled)) {
- __ticket_t head;
-
- BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS);
-
- head = xadd(&lock->tickets.head, TICKET_LOCK_INC);
-
- if (unlikely(head & TICKET_SLOWPATH_FLAG)) {
- head &= ~TICKET_SLOWPATH_FLAG;
- __ticket_unlock_kick(lock, (head + TICKET_LOCK_INC));
- }
- } else
- __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX);
-}
-
-static inline int arch_spin_is_locked(arch_spinlock_t *lock)
-{
- struct __raw_tickets tmp = READ_ONCE(lock->tickets);
-
- return !__tickets_equal(tmp.tail, tmp.head);
-}
-
-static inline int arch_spin_is_contended(arch_spinlock_t *lock)
-{
- struct __raw_tickets tmp = READ_ONCE(lock->tickets);
-
- tmp.head &= ~TICKET_SLOWPATH_FLAG;
- return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
-}
-#define arch_spin_is_contended arch_spin_is_contended
-
-static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
- unsigned long flags)
-{
- arch_spin_lock(lock);
-}
-
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- __ticket_t head = READ_ONCE(lock->tickets.head);
-
- for (;;) {
- struct __raw_tickets tmp = READ_ONCE(lock->tickets);
- /*
- * We need to check "unlocked" in a loop, tmp.head == head
- * can be false positive because of overflow.
- */
- if (__tickets_equal(tmp.head, tmp.tail) ||
- !__tickets_equal(tmp.head, head))
- break;
-
- cpu_relax();
- }
-}
-#endif /* CONFIG_QUEUED_SPINLOCKS */
/*
* Read-write spinlocks, allowing multiple readers
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index 65c3e37f879a..25311ebb446c 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -23,20 +23,7 @@ typedef u32 __ticketpair_t;
#define TICKET_SHIFT (sizeof(__ticket_t) * 8)
-#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm-generic/qspinlock_types.h>
-#else
-typedef struct arch_spinlock {
- union {
- __ticketpair_t head_tail;
- struct __raw_tickets {
- __ticket_t head, tail;
- } tickets;
- };
-} arch_spinlock_t;
-
-#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } }
-#endif /* CONFIG_QUEUED_SPINLOCKS */
#include <asm-generic/qrwlock_types.h>
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 90dbbd9666d4..a164862d77e3 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -2,6 +2,7 @@
#define _ASM_X86_STRING_64_H
#ifdef __KERNEL__
+#include <linux/jump_label.h>
/* Written 2002 by Andi Kleen */
@@ -78,6 +79,9 @@ int strcmp(const char *cs, const char *ct);
#define memset(s, c, n) __memset(s, c, n)
#endif
+__must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt);
+DECLARE_STATIC_KEY_FALSE(mcsafe_key);
+
/**
* memcpy_mcsafe - copy memory with indication if a machine check happened
*
@@ -86,10 +90,23 @@ int strcmp(const char *cs, const char *ct);
* @cnt: number of bytes to copy
*
* Low level memory copy function that catches machine checks
+ * We only call into the "safe" function on systems that can
+ * actually do machine check recovery. Everyone else can just
+ * use memcpy().
*
* Return 0 for success, -EFAULT for fail
*/
-int memcpy_mcsafe(void *dst, const void *src, size_t cnt);
+static __always_inline __must_check int
+memcpy_mcsafe(void *dst, const void *src, size_t cnt)
+{
+#ifdef CONFIG_X86_MCE
+ if (static_branch_unlikely(&mcsafe_key))
+ return memcpy_mcsafe_unrolled(dst, src, cnt);
+ else
+#endif
+ memcpy(dst, src, cnt);
+ return 0;
+}
#endif /* __KERNEL__ */
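
memcpy_mcsafe() is now a static inline that only calls the exception-handling copy when the mcsafe_key static branch has been enabled; everyone else gets a plain memcpy() and an unconditional 0 return. A rough user-space analogue of that shape, with an ordinary boolean standing in for the jump-label static key (a simplification, not the real static-branch machinery):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for mcsafe_key; flipped once at "boot" when recovery is usable. */
static bool mcsafe_enabled;

/* Stand-in for memcpy_mcsafe_unrolled(); the real one can return -EFAULT. */
static int memcpy_checked(void *dst, const void *src, size_t cnt)
{
	memcpy(dst, src, cnt);
	return 0;
}

static int memcpy_mcsafe_sketch(void *dst, const void *src, size_t cnt)
{
	if (mcsafe_enabled)
		return memcpy_checked(dst, src, cnt);

	memcpy(dst, src, cnt);	/* no recovery available: plain copy always "succeeds" */
	return 0;
}

int main(void)
{
	char dst[8];

	mcsafe_enabled = true;	/* e.g. a quirk found the RAS capability */
	printf("rc=%d\n", memcpy_mcsafe_sketch(dst, "abcdefg", 8));
	return 0;
}

The key is bumped either by mce=recovery (see mce.c below) or by the Xeon RAS-capability PCI quirks added to quirks.c.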
diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h
index 2184943341bf..69a6e07e3149 100644
--- a/arch/x86/include/uapi/asm/mce.h
+++ b/arch/x86/include/uapi/asm/mce.h
@@ -26,6 +26,8 @@ struct mce {
__u32 socketid; /* CPU socket ID */
__u32 apicid; /* CPU initial apic ID */
__u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
+ __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */
+ __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */
};
#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 3242e591fa82..26b78d86f25a 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,6 +1,7 @@
obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
obj-$(CONFIG_ACPI_APEI) += apei.o
+obj-$(CONFIG_ACPI_CPPC_LIB) += cppc_msr.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 90d84c3eee53..32a7d70913ac 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -176,15 +176,10 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled)
return -EINVAL;
}
- if (!enabled) {
- ++disabled_cpus;
- return -EINVAL;
- }
-
if (boot_cpu_physical_apicid != -1U)
- ver = apic_version[boot_cpu_physical_apicid];
+ ver = boot_cpu_apic_version;
- cpu = generic_processor_info(id, ver);
+ cpu = __generic_processor_info(id, ver, enabled);
if (cpu >= 0)
early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid;
@@ -282,6 +277,8 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
return -EINVAL;
+ acpi_table_print_madt_entry(header);
+
acpi_lapic_addr = lapic_addr_ovr->address;
return 0;
@@ -705,7 +702,7 @@ static void __init acpi_set_irq_model_ioapic(void)
#ifdef CONFIG_ACPI_HOTPLUG_CPU
#include <acpi/processor.h>
-static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
{
#ifdef CONFIG_ACPI_NUMA
int nid;
@@ -716,6 +713,7 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
numa_set_node(cpu, nid);
}
#endif
+ return 0;
}
int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu)
@@ -998,21 +996,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
if (!boot_cpu_has(X86_FEATURE_APIC))
return -ENODEV;
- /*
- * Note that the LAPIC address is obtained from the MADT (32-bit value)
- * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
- */
-
- count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
- acpi_parse_lapic_addr_ovr, 0);
- if (count < 0) {
- printk(KERN_ERR PREFIX
- "Error parsing LAPIC address override entry\n");
- return count;
- }
-
- register_lapic_address(acpi_lapic_addr);
-
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
acpi_parse_sapic, MAX_LOCAL_APIC);
@@ -1031,8 +1014,8 @@ static int __init acpi_parse_madt_lapic_entries(void)
return ret;
}
- x2count = madt_proc[0].count;
- count = madt_proc[1].count;
+ count = madt_proc[0].count;
+ x2count = madt_proc[1].count;
}
if (!count && !x2count) {
printk(KERN_ERR PREFIX "No LAPIC entries present\n");
@@ -1513,7 +1496,7 @@ void __init acpi_boot_table_init(void)
* If acpi_disabled, bail out
*/
if (acpi_disabled)
- return;
+ return;
/*
* Initialize the ACPI boot-time table parser.
diff --git a/arch/x86/kernel/acpi/cppc_msr.c b/arch/x86/kernel/acpi/cppc_msr.c
new file mode 100644
index 000000000000..6fb478bf82fd
--- /dev/null
+++ b/arch/x86/kernel/acpi/cppc_msr.c
@@ -0,0 +1,58 @@
+/*
+ * cppc_msr.c: MSR Interface for CPPC
+ * Copyright (c) 2016, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <acpi/cppc_acpi.h>
+#include <asm/msr.h>
+
+/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */
+
+bool cpc_ffh_supported(void)
+{
+ return true;
+}
+
+int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val)
+{
+ int err;
+
+ err = rdmsrl_safe_on_cpu(cpunum, reg->address, val);
+ if (!err) {
+ u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
+ reg->bit_offset);
+
+ *val &= mask;
+ *val >>= reg->bit_offset;
+ }
+ return err;
+}
+
+int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
+{
+ u64 rd_val;
+ int err;
+
+ err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val);
+ if (!err) {
+ u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
+ reg->bit_offset);
+
+ val <<= reg->bit_offset;
+ val &= mask;
+ rd_val &= ~mask;
+ rd_val |= val;
+ err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val);
+ }
+ return err;
+}
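
cpc_read_ffh() and cpc_write_ffh() treat the CPPC register as a bit field inside an MSR: build a mask from bit_offset and bit_width with GENMASK_ULL(), then shift the field out (read) or merge a new value in (write). The same arithmetic on a plain 64-bit value, with GENMASK_ULL() defined locally to mirror the kernel macro and an invented MSR value:

#include <stdio.h>
#include <stdint.h>

#define GENMASK_ULL(h, l) \
	(((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

int main(void)
{
	uint64_t msr = 0x0000000000A5F300ULL;	/* pretend MSR contents */
	unsigned int bit_offset = 8, bit_width = 16;
	uint64_t mask = GENMASK_ULL(bit_offset + bit_width - 1, bit_offset);

	/* read: isolate the field and shift it down (cpc_read_ffh) */
	uint64_t field = (msr & mask) >> bit_offset;
	printf("field = %#llx\n", (unsigned long long)field);	/* 0xa5f3 */

	/* write: clear the field, then merge the new value (cpc_write_ffh) */
	uint64_t newval = 0x1234;
	msr = (msr & ~mask) | ((newval << bit_offset) & mask);
	printf("msr   = %#llx\n", (unsigned long long)msr);	/* 0x123400 */
	return 0;
}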
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f3e9b2df4b16..f266b8a92a9e 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -64,6 +64,8 @@ unsigned disabled_cpus;
unsigned int boot_cpu_physical_apicid = -1U;
EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
+u8 boot_cpu_apic_version;
+
/*
* The highest APIC ID seen during enumeration.
*/
@@ -1374,7 +1376,6 @@ void setup_local_APIC(void)
* Actually disabling the focus CPU check just makes the hang less
	 * frequent as it makes the interrupt distribution model be more
* like LRU than MRU (the short-term load is more even across CPUs).
- * See also the comment in end_level_ioapic_irq(). --macro
*/
/*
@@ -1816,8 +1817,7 @@ void __init init_apic_mappings(void)
* since smp_sanity_check is prepared for such a case
* and disable smp mode
*/
- apic_version[new_apicid] =
- GET_APIC_VERSION(apic_read(APIC_LVR));
+ boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
}
}
@@ -1828,17 +1828,14 @@ void __init register_lapic_address(unsigned long address)
if (!x2apic_mode) {
set_fixmap_nocache(FIX_APIC_BASE, address);
apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- APIC_BASE, mp_lapic_addr);
+ APIC_BASE, address);
}
if (boot_cpu_physical_apicid == -1U) {
boot_cpu_physical_apicid = read_apic_id();
- apic_version[boot_cpu_physical_apicid] =
- GET_APIC_VERSION(apic_read(APIC_LVR));
+ boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
}
}
-int apic_version[MAX_LOCAL_APIC];
-
/*
* Local APIC interrupts
*/
@@ -2027,7 +2024,53 @@ void disconnect_bsp_APIC(int virt_wire_setup)
apic_write(APIC_LVT1, value);
}
-int generic_processor_info(int apicid, int version)
+/*
+ * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated
+ * contiguously, it equals the currently allocated maximum logical CPU ID plus 1.
+ * All allocated CPU IDs should be in [0, nr_logical_cpuids), so the maximum of
+ * nr_logical_cpuids is nr_cpu_ids.
+ *
+ * NOTE: Reserve 0 for BSP.
+ */
+static int nr_logical_cpuids = 1;
+
+/*
+ * Used to store mapping between logical CPU IDs and APIC IDs.
+ */
+static int cpuid_to_apicid[] = {
+ [0 ... NR_CPUS - 1] = -1,
+};
+
+/*
+ * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids
+ * and cpuid_to_apicid[] synchronized.
+ */
+static int allocate_logical_cpuid(int apicid)
+{
+ int i;
+
+ /*
+ * cpuid <-> apicid mapping is persistent, so when a cpu is up,
+ * check if the kernel has allocated a cpuid for it.
+ */
+ for (i = 0; i < nr_logical_cpuids; i++) {
+ if (cpuid_to_apicid[i] == apicid)
+ return i;
+ }
+
+ /* Allocate a new cpuid. */
+ if (nr_logical_cpuids >= nr_cpu_ids) {
+		WARN_ONCE(1, "Only %d processors supported. "
+ "Processor %d/0x%x and the rest are ignored.\n",
+ nr_cpu_ids - 1, nr_logical_cpuids, apicid);
+ return -1;
+ }
+
+ cpuid_to_apicid[nr_logical_cpuids] = apicid;
+ return nr_logical_cpuids++;
+}
+
+int __generic_processor_info(int apicid, int version, bool enabled)
{
int cpu, max = nr_cpu_ids;
bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
@@ -2102,8 +2145,16 @@ int generic_processor_info(int apicid, int version)
* for BSP.
*/
cpu = 0;
- } else
- cpu = cpumask_next_zero(-1, cpu_present_mask);
+
+ /* Logical cpuid 0 is reserved for BSP. */
+ cpuid_to_apicid[0] = apicid;
+ } else {
+ cpu = allocate_logical_cpuid(apicid);
+ if (cpu < 0) {
+ disabled_cpus++;
+ return -EINVAL;
+ }
+ }
/*
* This can happen on physical hotplug. The sanity check at boot time
@@ -2120,8 +2171,6 @@ int generic_processor_info(int apicid, int version)
return -ENOSPC;
}
- num_processors++;
-
/*
* Validate version
*/
@@ -2130,14 +2179,12 @@ int generic_processor_info(int apicid, int version)
cpu, apicid);
version = 0x10;
}
- apic_version[apicid] = version;
- if (version != apic_version[boot_cpu_physical_apicid]) {
+ if (version != boot_cpu_apic_version) {
pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
- apic_version[boot_cpu_physical_apicid], cpu, version);
+ boot_cpu_apic_version, cpu, version);
}
- physid_set(apicid, phys_cpu_present_map);
if (apicid > max_physical_apicid)
max_physical_apicid = apicid;
@@ -2150,11 +2197,23 @@ int generic_processor_info(int apicid, int version)
apic->x86_32_early_logical_apicid(cpu);
#endif
set_cpu_possible(cpu, true);
- set_cpu_present(cpu, true);
+
+ if (enabled) {
+ num_processors++;
+ physid_set(apicid, phys_cpu_present_map);
+ set_cpu_present(cpu, true);
+ } else {
+ disabled_cpus++;
+ }
return cpu;
}
+int generic_processor_info(int apicid, int version)
+{
+ return __generic_processor_info(apicid, version, true);
+}
+
int hard_smp_processor_id(void)
{
return read_apic_id();
@@ -2277,7 +2336,7 @@ int __init APIC_init_uniprocessor(void)
* Complain if the BIOS pretends there is one.
*/
if (!boot_cpu_has(X86_FEATURE_APIC) &&
- APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ APIC_INTEGRATED(boot_cpu_apic_version)) {
pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
boot_cpu_physical_apicid);
return -1;
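
allocate_logical_cpuid() makes the apicid-to-cpuid mapping persistent: an APIC ID that was seen before gets its old logical CPU ID back, and new ones are handed out contiguously until nr_cpu_ids is exhausted. A compact user-space sketch of that policy (the table size and the APIC ID values are invented for the example):

#include <stdio.h>

#define NR_CPUS_SKETCH	4

/* The kernel initializes its array with a [0 ... NR_CPUS - 1] range initializer. */
static int cpuid_to_apicid[NR_CPUS_SKETCH] = { -1, -1, -1, -1 };
static int nr_logical_cpuids = 1;	/* logical CPU 0 is reserved for the BSP */

static int allocate_logical_cpuid_sketch(int apicid)
{
	int i;

	/* Reuse an existing mapping: the apicid <-> cpuid relation is persistent. */
	for (i = 0; i < nr_logical_cpuids; i++)
		if (cpuid_to_apicid[i] == apicid)
			return i;

	if (nr_logical_cpuids >= NR_CPUS_SKETCH)	/* out of logical CPU IDs */
		return -1;

	cpuid_to_apicid[nr_logical_cpuids] = apicid;
	return nr_logical_cpuids++;
}

int main(void)
{
	cpuid_to_apicid[0] = 0x10;	/* the BSP's APIC ID */

	printf("%d\n", allocate_logical_cpuid_sketch(0x20));	/* 1: first AP */
	printf("%d\n", allocate_logical_cpuid_sketch(0x30));	/* 2: second AP */
	printf("%d\n", allocate_logical_cpuid_sketch(0x20));	/* 1: same APIC ID comes back */
	return 0;
}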
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7491f417a8e4..48e6d84f173e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1593,7 +1593,7 @@ void __init setup_ioapic_ids_from_mpc(void)
* no meaning without the serial APIC bus.
*/
if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ || APIC_XAPIC(boot_cpu_apic_version))
return;
setup_ioapic_ids_from_mpc_nocheck();
}
@@ -2423,7 +2423,7 @@ static int io_apic_get_unique_id(int ioapic, int apic_id)
static u8 io_apic_unique_id(int idx, u8 id)
{
if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
- !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ !APIC_XAPIC(boot_cpu_apic_version))
return io_apic_get_unique_id(idx, id);
else
return id;
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index e5fb2f086460..c48264e202fd 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -152,7 +152,7 @@ early_param("apic", parse_apic);
void __init default_setup_apic_routing(void)
{
- int version = apic_version[boot_cpu_physical_apicid];
+ int version = boot_cpu_apic_version;
if (num_possible_cpus() > 8) {
switch (boot_cpu_data.x86_vendor) {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 06919427d451..9bd910a7dd0a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -804,21 +804,20 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
identify_cpu_without_cpuid(c);
/* cyrix could have cpuid enabled via c_identify()*/
- if (!have_cpuid_p())
- return;
+ if (have_cpuid_p()) {
+ cpu_detect(c);
+ get_cpu_vendor(c);
+ get_cpu_cap(c);
- cpu_detect(c);
- get_cpu_vendor(c);
- get_cpu_cap(c);
-
- if (this_cpu->c_early_init)
- this_cpu->c_early_init(c);
+ if (this_cpu->c_early_init)
+ this_cpu->c_early_init(c);
- c->cpu_index = 0;
- filter_cpuid_features(c, false);
+ c->cpu_index = 0;
+ filter_cpuid_features(c, false);
- if (this_cpu->c_bsp_init)
- this_cpu->c_bsp_init(c);
+ if (this_cpu->c_bsp_init)
+ this_cpu->c_bsp_init(c);
+ }
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
fpu__init_system(c);
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 27e46658ebe3..35691a6b0d32 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -86,3 +86,14 @@ bool __init hypervisor_x2apic_available(void)
x86_hyper->x2apic_available &&
x86_hyper->x2apic_available();
}
+
+void hypervisor_pin_vcpu(int cpu)
+{
+ if (!x86_hyper)
+ return;
+
+ if (x86_hyper->pin_vcpu)
+ x86_hyper->pin_vcpu(cpu);
+ else
+ WARN_ONCE(1, "vcpu pinning requested but not supported!\n");
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 79d8ec849468..a7fdf453d895 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -41,6 +41,7 @@
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>
+#include <linux/jump_label.h>
#include <asm/processor.h>
#include <asm/traps.h>
@@ -292,6 +293,13 @@ static void print_mce(struct mce *m)
if (m->misc)
pr_cont("MISC %llx ", m->misc);
+ if (mce_flags.smca) {
+ if (m->synd)
+ pr_cont("SYND %llx ", m->synd);
+ if (m->ipid)
+ pr_cont("IPID %llx ", m->ipid);
+ }
+
pr_cont("\n");
/*
* Note this output is parsed by external tools and old fields
@@ -568,6 +576,7 @@ static void mce_read_aux(struct mce *m, int i)
{
if (m->status & MCI_STATUS_MISCV)
m->misc = mce_rdmsrl(msr_ops.misc(i));
+
if (m->status & MCI_STATUS_ADDRV) {
m->addr = mce_rdmsrl(msr_ops.addr(i));
@@ -579,6 +588,23 @@ static void mce_read_aux(struct mce *m, int i)
m->addr >>= shift;
m->addr <<= shift;
}
+
+ /*
+ * Extract [55:<lsb>] where lsb is the least significant
+ * *valid* bit of the address bits.
+ */
+ if (mce_flags.smca) {
+ u8 lsb = (m->addr >> 56) & 0x3f;
+
+ m->addr &= GENMASK_ULL(55, lsb);
+ }
+ }
+
+ if (mce_flags.smca) {
+ m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
+
+ if (m->status & MCI_STATUS_SYNDV)
+ m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
}
}
@@ -1633,17 +1659,6 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (c->x86 == 6 && c->x86_model == 45)
quirk_no_way_out = quirk_sandybridge_ifu;
- /*
- * MCG_CAP.MCG_SER_P is necessary but not sufficient to know
- * whether this processor will actually generate recoverable
- * machine checks. Check to see if this is an E7 model Xeon.
- * We can't do a model number check because E5 and E7 use the
- * same model number. E5 doesn't support recovery, E7 does.
- */
- if (mca_cfg.recovery || (mca_cfg.ser &&
- !strncmp(c->x86_model_id,
- "Intel(R) Xeon(R) CPU E7-", 24)))
- set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY);
}
if (cfg->monarch_timeout < 0)
cfg->monarch_timeout = 0;
@@ -2080,6 +2095,7 @@ void mce_disable_bank(int bank)
* mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
* mce=nobootlog Don't log MCEs from before booting.
* mce=bios_cmci_threshold Don't program the CMCI threshold
+ * mce=recovery Force-enable memcpy_mcsafe()
*/
static int __init mcheck_enable(char *str)
{
@@ -2676,8 +2692,14 @@ static int __init mcheck_debugfs_init(void)
static int __init mcheck_debugfs_init(void) { return -EINVAL; }
#endif
+DEFINE_STATIC_KEY_FALSE(mcsafe_key);
+EXPORT_SYMBOL_GPL(mcsafe_key);
+
static int __init mcheck_late_init(void)
{
+ if (mca_cfg.recovery)
+ static_branch_inc(&mcsafe_key);
+
mcheck_debugfs_init();
/*
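
On SMCA systems, bits [61:56] of MCA_ADDR encode how many low address bits are not valid, and mce_read_aux() keeps only the [55:lsb] span. Worked through on a concrete value (an invented address with an LSB field of 6, i.e. 64-byte granularity):

#include <stdio.h>
#include <stdint.h>

#define GENMASK_ULL(h, l) \
	(((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

int main(void)
{
	/* bits [61:56] = 6 -> the lowest 6 address bits are not valid */
	uint64_t raw  = (6ULL << 56) | 0x3456789abcdefULL;
	uint8_t  lsb  = (raw >> 56) & 0x3f;
	uint64_t addr = raw & GENMASK_ULL(55, lsb);

	printf("lsb=%u addr=%#llx\n", lsb, (unsigned long long)addr);
	/* -> lsb=6, addr=0x3456789abcdc0: bits [55:6] kept, the rest cleared */
	return 0;
}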
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 7b7f3be783d4..9b5403462936 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -20,6 +20,7 @@
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/smp.h>
+#include <linux/string.h>
#include <asm/amd_nb.h>
#include <asm/apic.h>
@@ -63,34 +64,71 @@ static const char * const th_names[] = {
"execution_unit",
};
-/* Define HWID to IP type mappings for Scalable MCA */
-struct amd_hwid amd_hwids[] = {
- [SMCA_F17H_CORE] = { "f17h_core", 0xB0 },
- [SMCA_DF] = { "data_fabric", 0x2E },
- [SMCA_UMC] = { "umc", 0x96 },
- [SMCA_PB] = { "param_block", 0x5 },
- [SMCA_PSP] = { "psp", 0xFF },
- [SMCA_SMU] = { "smu", 0x1 },
+static const char * const smca_umc_block_names[] = {
+ "dram_ecc",
+ "misc_umc"
};
-EXPORT_SYMBOL_GPL(amd_hwids);
-
-const char * const amd_core_mcablock_names[] = {
- [SMCA_LS] = "load_store",
- [SMCA_IF] = "insn_fetch",
- [SMCA_L2_CACHE] = "l2_cache",
- [SMCA_DE] = "decode_unit",
- [RES] = "",
- [SMCA_EX] = "execution_unit",
- [SMCA_FP] = "floating_point",
- [SMCA_L3_CACHE] = "l3_cache",
+
+struct smca_bank_name smca_bank_names[] = {
+ [SMCA_LS] = { "load_store", "Load Store Unit" },
+ [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
+ [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
+ [SMCA_DE] = { "decode_unit", "Decode Unit" },
+ [SMCA_EX] = { "execution_unit", "Execution Unit" },
+ [SMCA_FP] = { "floating_point", "Floating Point Unit" },
+ [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
+ [SMCA_CS] = { "coherent_slave", "Coherent Slave" },
+ [SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
+ [SMCA_UMC] = { "umc", "Unified Memory Controller" },
+ [SMCA_PB] = { "param_block", "Parameter Block" },
+ [SMCA_PSP] = { "psp", "Platform Security Processor" },
+ [SMCA_SMU] = { "smu", "System Management Unit" },
};
-EXPORT_SYMBOL_GPL(amd_core_mcablock_names);
+EXPORT_SYMBOL_GPL(smca_bank_names);
+
+static struct smca_hwid_mcatype smca_hwid_mcatypes[] = {
+ /* { bank_type, hwid_mcatype, xec_bitmap } */
+
+ /* ZN Core (HWID=0xB0) MCA types */
+ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF },
+ { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF },
+ { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF },
+ { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF },
+ /* HWID 0xB0 MCATYPE 0x4 is Reserved */
+ { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0x7FF },
+ { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F },
+ { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF },
+
+ /* Data Fabric MCA types */
+ { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF },
+ { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0xF },
+
+ /* Unified Memory Controller MCA type */
+ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0x3F },
+
+ /* Parameter Block MCA type */
+ { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 },
+
+ /* Platform Security Processor MCA type */
+ { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 },
-const char * const amd_df_mcablock_names[] = {
- [SMCA_CS] = "coherent_slave",
- [SMCA_PIE] = "pie",
+ /* System Management Unit MCA type */
+ { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 },
};
-EXPORT_SYMBOL_GPL(amd_df_mcablock_names);
+
+struct smca_bank_info smca_banks[MAX_NR_BANKS];
+EXPORT_SYMBOL_GPL(smca_banks);
+
+/*
+ * In SMCA-enabled processors, we can have multiple banks for a given IP type.
+ * So to define a unique name for each bank, we use a temporary string to append
+ * the MCA_IPID[InstanceId] to the type's name in get_name().
+ *
+ * InstanceId is 32 bits, i.e. at most 8 hex characters. Make sure
+ * MAX_MCATYPE_NAME_LEN is greater than 8 plus 1 (for the underscore) plus the
+ * length of the longest type name.
+ */
+#define MAX_MCATYPE_NAME_LEN 30
+static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */
@@ -108,6 +146,36 @@ void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
* CPU Initialization
*/
+static void get_smca_bank_info(unsigned int bank)
+{
+ unsigned int i, hwid_mcatype, cpu = smp_processor_id();
+ struct smca_hwid_mcatype *type;
+ u32 high, instanceId;
+ u16 hwid, mcatype;
+
+ /* Collect bank_info using CPU 0 for now. */
+ if (cpu)
+ return;
+
+ if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instanceId, &high)) {
+ pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
+ return;
+ }
+
+ hwid = high & MCI_IPID_HWID;
+ mcatype = (high & MCI_IPID_MCATYPE) >> 16;
+ hwid_mcatype = HWID_MCATYPE(hwid, mcatype);
+
+ for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
+ type = &smca_hwid_mcatypes[i];
+ if (hwid_mcatype == type->hwid_mcatype) {
+ smca_banks[bank].type = type;
+ smca_banks[bank].type_instance = instanceId;
+ break;
+ }
+ }
+}
+
struct thresh_restart {
struct threshold_block *b;
int reset;
@@ -293,7 +361,7 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
wrmsr(MSR_CU_DEF_ERR, low, high);
}
-static u32 get_block_address(u32 current_addr, u32 low, u32 high,
+static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 high,
unsigned int bank, unsigned int block)
{
u32 addr = 0, offset = 0;
@@ -309,13 +377,13 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high,
*/
u32 low, high;
- if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
+ if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
return addr;
if (!(low & MCI_CONFIG_MCAX))
return addr;
- if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
+ if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
(low & MASK_BLKPTR_LO))
addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
}
@@ -395,6 +463,20 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
*/
smca_high &= ~BIT(2);
+ /*
+ * SMCA sets the Deferred Error Interrupt type per bank.
+ *
+ * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
+ * if the DeferredIntType bit field is available.
+ *
+ * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
+	 * high portion of the MSR). The OS should set this to 0x1 to enable
+	 * an APIC-based interrupt. First, check that no interrupt type has
+	 * been set yet.
+ */
+ if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3))
+ smca_high |= BIT(5);
+
wrmsr(smca_addr, smca_low, smca_high);
}
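
The new MCA_CONFIG handling reads bit 5 of the low half (DeferredIntTypeSupported) and bits [6:5] of the high half (DeferredIntType), and selects type 0x1 (APIC) only if no type has been programmed yet. The bit test and set in isolation, with invented register values:

#include <stdio.h>
#include <stdint.h>

#define BIT(n)	(1U << (n))

int main(void)
{
	uint32_t smca_low  = BIT(5);	/* DeferredIntTypeSupported advertised */
	uint32_t smca_high = 0;		/* DeferredIntType (bits [6:5]) still 0 */

	if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3))
		smca_high |= BIT(5);	/* select the APIC-based deferred error interrupt */

	printf("smca_high=%#x\n", smca_high);	/* 0x20 */
	return 0;
}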
@@ -421,12 +503,15 @@ out:
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
u32 low = 0, high = 0, address = 0;
- unsigned int bank, block;
+ unsigned int bank, block, cpu = smp_processor_id();
int offset = -1;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
+ if (mce_flags.smca)
+ get_smca_bank_info(bank);
+
for (block = 0; block < NR_BLOCKS; ++block) {
- address = get_block_address(address, low, high, bank, block);
+ address = get_block_address(cpu, address, low, high, bank, block);
if (!address)
break;
@@ -476,9 +561,27 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
if (threshold_err)
m.misc = misc;
- if (m.status & MCI_STATUS_ADDRV)
+ if (m.status & MCI_STATUS_ADDRV) {
rdmsrl(msr_addr, m.addr);
+ /*
+ * Extract [55:<lsb>] where lsb is the least significant
+ * *valid* bit of the address bits.
+ */
+ if (mce_flags.smca) {
+ u8 lsb = (m.addr >> 56) & 0x3f;
+
+ m.addr &= GENMASK_ULL(55, lsb);
+ }
+ }
+
+ if (mce_flags.smca) {
+ rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid);
+
+ if (m.status & MCI_STATUS_SYNDV)
+ rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd);
+ }
+
mce_log(&m);
wrmsrl(msr_status, 0);
@@ -541,15 +644,14 @@ static void amd_deferred_error_interrupt(void)
static void amd_threshold_interrupt(void)
{
u32 low = 0, high = 0, address = 0;
- int cpu = smp_processor_id();
- unsigned int bank, block;
+ unsigned int bank, block, cpu = smp_processor_id();
/* assume first bank caused it */
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
for (block = 0; block < NR_BLOCKS; ++block) {
- address = get_block_address(address, low, high, bank, block);
+ address = get_block_address(cpu, address, low, high, bank, block);
if (!address)
break;
@@ -713,6 +815,34 @@ static struct kobj_type threshold_ktype = {
.default_attrs = default_attrs,
};
+static const char *get_name(unsigned int bank, struct threshold_block *b)
+{
+ unsigned int bank_type;
+
+ if (!mce_flags.smca) {
+ if (b && bank == 4)
+ return bank4_names(b);
+
+ return th_names[bank];
+ }
+
+ if (!smca_banks[bank].type)
+ return NULL;
+
+ bank_type = smca_banks[bank].type->bank_type;
+
+ if (b && bank_type == SMCA_UMC) {
+ if (b->block < ARRAY_SIZE(smca_umc_block_names))
+ return smca_umc_block_names[b->block];
+ return NULL;
+ }
+
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
+ "%s_%x", smca_bank_names[bank_type].name,
+ smca_banks[bank].type_instance);
+ return buf_mcatype;
+}
+
static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
unsigned int block, u32 address)
{
@@ -767,11 +897,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
err = kobject_init_and_add(&b->kobj, &threshold_ktype,
per_cpu(threshold_banks, cpu)[bank]->kobj,
- (bank == 4 ? bank4_names(b) : th_names[bank]));
+ get_name(bank, b));
if (err)
goto out_free;
recurse:
- address = get_block_address(address, low, high, bank, ++block);
+ address = get_block_address(cpu, address, low, high, bank, ++block);
if (!address)
return 0;
@@ -822,7 +952,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
struct device *dev = per_cpu(mce_device, cpu);
struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL;
- const char *name = th_names[bank];
+ const char *name = get_name(bank, NULL);
int err = 0;
if (is_shared_bank(bank)) {
@@ -869,7 +999,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
}
}
- err = allocate_threshold_blocks(cpu, bank, 0, MSR_IA32_MCx_MISC(bank));
+ err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank));
if (!err)
goto out;
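
get_name() composes per-bank sysfs names by appending MCA_IPID[InstanceId], printed in hex, to the bank type's short name; MAX_MCATYPE_NAME_LEN has to cover the longest such combination. A small stand-alone version of that formatting (the type name and instance value are chosen for the example):

#include <stdio.h>

#define MAX_MCATYPE_NAME_LEN	30

int main(void)
{
	char buf[MAX_MCATYPE_NAME_LEN];
	const char *type_name = "execution_unit";	/* longest short name: 14 chars */
	unsigned int instance = 0xdeadbeef;		/* 32-bit InstanceId -> up to 8 hex chars */

	/* 14 + 1 ('_') + 8 + 1 ('\0') = 24 <= 30, so the buffer always fits */
	snprintf(buf, sizeof(buf), "%s_%x", type_name, instance);
	printf("%s\n", buf);	/* execution_unit_deadbeef */
	return 0;
}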
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 1726c4c12336..865058d087ac 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -575,9 +575,6 @@ static void kvm_kick_cpu(int cpu)
kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
}
-
-#ifdef CONFIG_QUEUED_SPINLOCKS
-
#include <asm/qspinlock.h>
static void kvm_wait(u8 *ptr, u8 val)
@@ -606,243 +603,6 @@ out:
local_irq_restore(flags);
}
-#else /* !CONFIG_QUEUED_SPINLOCKS */
-
-enum kvm_contention_stat {
- TAKEN_SLOW,
- TAKEN_SLOW_PICKUP,
- RELEASED_SLOW,
- RELEASED_SLOW_KICKED,
- NR_CONTENTION_STATS
-};
-
-#ifdef CONFIG_KVM_DEBUG_FS
-#define HISTO_BUCKETS 30
-
-static struct kvm_spinlock_stats
-{
- u32 contention_stats[NR_CONTENTION_STATS];
- u32 histo_spin_blocked[HISTO_BUCKETS+1];
- u64 time_blocked;
-} spinlock_stats;
-
-static u8 zero_stats;
-
-static inline void check_zero(void)
-{
- u8 ret;
- u8 old;
-
- old = READ_ONCE(zero_stats);
- if (unlikely(old)) {
- ret = cmpxchg(&zero_stats, old, 0);
- /* This ensures only one fellow resets the stat */
- if (ret == old)
- memset(&spinlock_stats, 0, sizeof(spinlock_stats));
- }
-}
-
-static inline void add_stats(enum kvm_contention_stat var, u32 val)
-{
- check_zero();
- spinlock_stats.contention_stats[var] += val;
-}
-
-
-static inline u64 spin_time_start(void)
-{
- return sched_clock();
-}
-
-static void __spin_time_accum(u64 delta, u32 *array)
-{
- unsigned index;
-
- index = ilog2(delta);
- check_zero();
-
- if (index < HISTO_BUCKETS)
- array[index]++;
- else
- array[HISTO_BUCKETS]++;
-}
-
-static inline void spin_time_accum_blocked(u64 start)
-{
- u32 delta;
-
- delta = sched_clock() - start;
- __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
- spinlock_stats.time_blocked += delta;
-}
-
-static struct dentry *d_spin_debug;
-static struct dentry *d_kvm_debug;
-
-static struct dentry *kvm_init_debugfs(void)
-{
- d_kvm_debug = debugfs_create_dir("kvm-guest", NULL);
- if (!d_kvm_debug)
- printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n");
-
- return d_kvm_debug;
-}
-
-static int __init kvm_spinlock_debugfs(void)
-{
- struct dentry *d_kvm;
-
- d_kvm = kvm_init_debugfs();
- if (d_kvm == NULL)
- return -ENOMEM;
-
- d_spin_debug = debugfs_create_dir("spinlocks", d_kvm);
-
- debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
-
- debugfs_create_u32("taken_slow", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[TAKEN_SLOW]);
- debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
-
- debugfs_create_u32("released_slow", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[RELEASED_SLOW]);
- debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
-
- debugfs_create_u64("time_blocked", 0444, d_spin_debug,
- &spinlock_stats.time_blocked);
-
- debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
- spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
-
- return 0;
-}
-fs_initcall(kvm_spinlock_debugfs);
-#else /* !CONFIG_KVM_DEBUG_FS */
-static inline void add_stats(enum kvm_contention_stat var, u32 val)
-{
-}
-
-static inline u64 spin_time_start(void)
-{
- return 0;
-}
-
-static inline void spin_time_accum_blocked(u64 start)
-{
-}
-#endif /* CONFIG_KVM_DEBUG_FS */
-
-struct kvm_lock_waiting {
- struct arch_spinlock *lock;
- __ticket_t want;
-};
-
-/* cpus 'waiting' on a spinlock to become available */
-static cpumask_t waiting_cpus;
-
-/* Track spinlock on which a cpu is waiting */
-static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting);
-
-__visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
-{
- struct kvm_lock_waiting *w;
- int cpu;
- u64 start;
- unsigned long flags;
- __ticket_t head;
-
- if (in_nmi())
- return;
-
- w = this_cpu_ptr(&klock_waiting);
- cpu = smp_processor_id();
- start = spin_time_start();
-
- /*
- * Make sure an interrupt handler can't upset things in a
- * partially setup state.
- */
- local_irq_save(flags);
-
- /*
- * The ordering protocol on this is that the "lock" pointer
- * may only be set non-NULL if the "want" ticket is correct.
- * If we're updating "want", we must first clear "lock".
- */
- w->lock = NULL;
- smp_wmb();
- w->want = want;
- smp_wmb();
- w->lock = lock;
-
- add_stats(TAKEN_SLOW, 1);
-
- /*
- * This uses set_bit, which is atomic but we should not rely on its
- * reordering gurantees. So barrier is needed after this call.
- */
- cpumask_set_cpu(cpu, &waiting_cpus);
-
- barrier();
-
- /*
- * Mark entry to slowpath before doing the pickup test to make
- * sure we don't deadlock with an unlocker.
- */
- __ticket_enter_slowpath(lock);
-
- /* make sure enter_slowpath, which is atomic does not cross the read */
- smp_mb__after_atomic();
-
- /*
- * check again make sure it didn't become free while
- * we weren't looking.
- */
- head = READ_ONCE(lock->tickets.head);
- if (__tickets_equal(head, want)) {
- add_stats(TAKEN_SLOW_PICKUP, 1);
- goto out;
- }
-
- /*
- * halt until it's our turn and kicked. Note that we do safe halt
- * for irq enabled case to avoid hang when lock info is overwritten
- * in irq spinlock slowpath and no spurious interrupt occur to save us.
- */
- if (arch_irqs_disabled_flags(flags))
- halt();
- else
- safe_halt();
-
-out:
- cpumask_clear_cpu(cpu, &waiting_cpus);
- w->lock = NULL;
- local_irq_restore(flags);
- spin_time_accum_blocked(start);
-}
-PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
-
-/* Kick vcpu waiting on @lock->head to reach value @ticket */
-static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
-{
- int cpu;
-
- add_stats(RELEASED_SLOW, 1);
- for_each_cpu(cpu, &waiting_cpus) {
- const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu);
- if (READ_ONCE(w->lock) == lock &&
- READ_ONCE(w->want) == ticket) {
- add_stats(RELEASED_SLOW_KICKED, 1);
- kvm_kick_cpu(cpu);
- break;
- }
- }
-}
-
-#endif /* !CONFIG_QUEUED_SPINLOCKS */
-
/*
* Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
*/
@@ -854,16 +614,11 @@ void __init kvm_spinlock_init(void)
if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
return;
-#ifdef CONFIG_QUEUED_SPINLOCKS
__pv_init_lock_hash();
pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
pv_lock_ops.wait = kvm_wait;
pv_lock_ops.kick = kvm_kick_cpu;
-#else /* !CONFIG_QUEUED_SPINLOCKS */
- pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
- pv_lock_ops.unlock_kick = kvm_unlock_kick;
-#endif
}
static __init int kvm_spinlock_init_jump(void)
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 068c4a929de6..0f8d20497383 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -499,6 +499,9 @@ void __init default_get_smp_config(unsigned int early)
{
struct mpf_intel *mpf = mpf_found;
+ if (!smp_found_config)
+ return;
+
if (!mpf)
return;
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 1939a0269377..2c55a003b793 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -8,7 +8,6 @@
#include <asm/paravirt.h>
-#ifdef CONFIG_QUEUED_SPINLOCKS
__visible void __native_queued_spin_unlock(struct qspinlock *lock)
{
native_queued_spin_unlock(lock);
@@ -21,19 +20,13 @@ bool pv_is_native_spin_unlock(void)
return pv_lock_ops.queued_spin_unlock.func ==
__raw_callee_save___native_queued_spin_unlock;
}
-#endif
struct pv_lock_ops pv_lock_ops = {
#ifdef CONFIG_SMP
-#ifdef CONFIG_QUEUED_SPINLOCKS
.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
.wait = paravirt_nop,
.kick = paravirt_nop,
-#else /* !CONFIG_QUEUED_SPINLOCKS */
- .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop),
- .unlock_kick = paravirt_nop,
-#endif /* !CONFIG_QUEUED_SPINLOCKS */
#endif /* SMP */
};
EXPORT_SYMBOL(pv_lock_ops);
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 158dc0650d5d..920c6ae08592 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -10,7 +10,7 @@ DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
DEF_NATIVE(pv_cpu_ops, clts, "clts");
-#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
#endif
@@ -49,7 +49,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
PATCH_SITE(pv_cpu_ops, clts);
-#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
if (pv_is_native_spin_unlock()) {
start = start_pv_lock_ops_queued_spin_unlock;
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index e70087a04cc8..bb3840cedb4f 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -19,7 +19,7 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
DEF_NATIVE(, mov32, "mov %edi, %eax");
DEF_NATIVE(, mov64, "mov %rdi, %rax");
-#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
#endif
@@ -61,7 +61,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_cpu_ops, clts);
PATCH_SITE(pv_mmu_ops, flush_tlb_single);
PATCH_SITE(pv_cpu_ops, wbinvd);
-#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
if (pv_is_native_spin_unlock()) {
start = start_pv_lock_ops_queued_spin_unlock;
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index cc457ff818ad..51402a7e4ca6 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -626,3 +626,34 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3,
amd_disable_seq_and_redirect_scrub);
#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+#include <linux/jump_label.h>
+#include <asm/string_64.h>
+
+/* Ivy Bridge, Haswell, Broadwell */
+static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev)
+{
+ u32 capid0;
+
+ pci_read_config_dword(pdev, 0x84, &capid0);
+
+ if (capid0 & 0x10)
+ static_branch_inc(&mcsafe_key);
+}
+
+/* Skylake */
+static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev)
+{
+ u32 capid0;
+
+ pci_read_config_dword(pdev, 0x84, &capid0);
+
+ if ((capid0 & 0xc0) == 0xc0)
+ static_branch_inc(&mcsafe_key);
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap);
+#endif
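
Both quirks read CAPID0 from PCI config offset 0x84 and bump the mcsafe static branch when the recovery capability bits are present: bit 4 on the Brickland (Ivy Bridge/Haswell/Broadwell) Xeons, bits [7:6] both set on the Purley (Skylake) parts. The bit tests on their own, with an invented register value:

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t capid0 = 0xd0;	/* pretend value read from PCI config offset 0x84 */

	bool brickland_recovery = capid0 & 0x10;		/* bit 4 set */
	bool purley_recovery    = (capid0 & 0xc0) == 0xc0;	/* bits 7:6 both set */

	printf("brickland=%d purley=%d\n", brickland_recovery, purley_recovery);
	return 0;
}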
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3aabfdcbcb52..eeb094ea794a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1096,19 +1096,19 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
memblock_x86_fill();
- if (efi_enabled(EFI_BOOT)) {
+ reserve_bios_regions();
+
+ if (efi_enabled(EFI_MEMMAP)) {
efi_fake_memmap();
efi_find_mirror();
- }
-
- reserve_bios_regions();
+ efi_esrt_init();
- /*
- * The EFI specification says that boot service code won't be called
- * after ExitBootServices(). This is, in fact, a lie.
- */
- if (efi_enabled(EFI_MEMMAP))
+ /*
+ * The EFI specification says that boot service code won't be
+ * called after ExitBootServices(). This is, in fact, a lie.
+ */
efi_reserve_boot_services();
+ }
/* preallocate 4k for mptable mpc */
early_reserve_e820_mpc_new();
@@ -1219,8 +1219,7 @@ void __init setup_arch(char **cmdline_p)
/*
* get boot-time SMP configuration:
*/
- if (smp_found_config)
- get_smp_config();
+ get_smp_config();
prefill_possible_map();
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7e52f83d3a4b..7249dcf2cbcb 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -471,7 +471,7 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
return false;
}
-static struct sched_domain_topology_level numa_inside_package_topology[] = {
+static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
@@ -480,22 +480,23 @@ static struct sched_domain_topology_level numa_inside_package_topology[] = {
#endif
{ NULL, },
};
+
+static struct sched_domain_topology_level x86_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
/*
- * set_sched_topology() sets the topology internal to a CPU. The
- * NUMA topologies are layered on top of it to build the full
- * system topology.
- *
- * If NUMA nodes are observed to occur within a CPU package, this
- * function should be called. It forces the sched domain code to
- * only use the SMT level for the CPU portion of the topology.
- * This essentially falls back to relying on NUMA information
- * from the SRAT table to describe the entire system topology
- * (except for hyperthreads).
+ * Set if a package/die has multiple NUMA nodes inside.
+ * AMD Magny-Cours and Intel Cluster-on-Die have this.
*/
-static void primarily_use_numa_for_topology(void)
-{
- set_sched_topology(numa_inside_package_topology);
-}
+static bool x86_has_numa_in_package;
void set_cpu_sibling_map(int cpu)
{
@@ -558,7 +559,7 @@ void set_cpu_sibling_map(int cpu)
c->booted_cores = cpu_data(i).booted_cores;
}
if (match_die(c, o) && !topology_same_node(c, o))
- primarily_use_numa_for_topology();
+ x86_has_numa_in_package = true;
}
threads = cpumask_weight(topology_sibling_cpumask(cpu));
@@ -690,7 +691,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
* Give the other CPU some time to accept the IPI.
*/
udelay(200);
- if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
maxlvt = lapic_get_maxlvt();
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
@@ -717,7 +718,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
/*
* Be paranoid about clearing APIC errors.
*/
- if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
@@ -756,7 +757,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
* Determine this based on the APIC version.
* If we don't have an integrated APIC, don't send the STARTUP IPIs.
*/
- if (APIC_INTEGRATED(apic_version[phys_apicid]))
+ if (APIC_INTEGRATED(boot_cpu_apic_version))
num_starts = 2;
else
num_starts = 0;
@@ -993,7 +994,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
/*
* Be paranoid about clearing APIC errors.
*/
- if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
}
@@ -1248,7 +1249,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
/*
* If we couldn't find a local APIC, then get out of here now!
*/
- if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
+ if (APIC_INTEGRATED(boot_cpu_apic_version) &&
!boot_cpu_has(X86_FEATURE_APIC)) {
if (!disable_apic) {
pr_err("BIOS bug, local APIC #%d not detected!...\n",
@@ -1303,6 +1304,16 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
}
+
+ /*
+ * Set the 'default' x86 topology; this matches default_topology() in
+ * that it has NUMA nodes as a topology level. See also
+ * native_smp_cpus_done().
+ *
+ * Must be done before set_cpu_sibling_map() is run.
+ */
+ set_sched_topology(x86_topology);
+
set_cpu_sibling_map(0);
switch (smp_sanity_check(max_cpus)) {
@@ -1322,14 +1333,13 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
break;
}
- default_setup_apic_routing();
-
if (read_apic_id() != boot_cpu_physical_apicid) {
panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
read_apic_id(), boot_cpu_physical_apicid);
/* Or can we switch back to PIC here? */
}
+ default_setup_apic_routing();
cpu0_logical_apicid = apic_bsp_setup(false);
pr_info("CPU%d: ", 0);
@@ -1369,6 +1379,9 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
{
pr_debug("Boot done\n");
+ if (x86_has_numa_in_package)
+ set_sched_topology(x86_numa_in_package_topology);
+
nmi_selftest();
impress_friends();
setup_ioapic_dest();
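
For readers following the smpboot.c hunks above, a minimal sketch of what a sched_domain_topology_level table looks like and how it is installed; the table below is illustrative (an imaginary SMT-plus-package machine), not part of this patch, though the mask helpers and SD_INIT_NAME() are the same ones the patch uses.

#include <linux/init.h>
#include <linux/sched.h>
#include <linux/topology.h>

/*
 * Illustrative only: a topology table is a NULL-terminated array of
 * { cpumask fn, flags fn, name } levels.  set_sched_topology() replaces
 * the scheduler's whole table, which is why the patch can install
 * x86_topology early and swap in x86_numa_in_package_topology (which
 * drops the DIE level) from native_smp_cpus_done().
 */
static struct sched_domain_topology_level example_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};

static void __init example_install_topology(void)
{
	/* Must run before the sibling maps are built, as noted above. */
	set_sched_topology(example_topology);
}
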
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 95e49f6e4fc3..b2cee3d19477 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -38,7 +38,7 @@ EXPORT_SYMBOL(__copy_user_nocache);
EXPORT_SYMBOL(_copy_from_user);
EXPORT_SYMBOL(_copy_to_user);
-EXPORT_SYMBOL_GPL(memcpy_mcsafe);
+EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled);
EXPORT_SYMBOL(copy_page);
EXPORT_SYMBOL(clear_page);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 2ec0b0abbfaa..49e6ebac7e73 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -181,11 +181,11 @@ ENDPROC(memcpy_orig)
#ifndef CONFIG_UML
/*
- * memcpy_mcsafe - memory copy with machine check exception handling
+ * memcpy_mcsafe_unrolled - memory copy with machine check exception handling
* Note that we only catch machine checks when reading the source addresses.
* Writes to target are posted and don't generate machine checks.
*/
-ENTRY(memcpy_mcsafe)
+ENTRY(memcpy_mcsafe_unrolled)
cmpl $8, %edx
/* Less than 8 bytes? Go to byte copy loop */
jb .L_no_whole_words
@@ -273,7 +273,7 @@ ENTRY(memcpy_mcsafe)
.L_done_memcpy_trap:
xorq %rax, %rax
ret
-ENDPROC(memcpy_mcsafe)
+ENDPROC(memcpy_mcsafe_unrolled)
.section .fixup, "ax"
/* Return -EFAULT for any failure */
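
Since the unrolled copy is now exported under its own name, a caller-side sketch may help; the prototype is assumed from the register usage above (dst, src, count), and as the fixup section shows, the return value is 0 on success or -EFAULT if a machine check was taken on a source read.

#include <linux/string.h>
#include <linux/errno.h>

/*
 * Sketch only: copy from memory that may contain poisoned (uncorrectable)
 * data, e.g. persistent memory.  Assumed prototype:
 *   int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt);
 */
static int copy_from_possibly_poisoned(void *dst, const void *src, size_t len)
{
	if (memcpy_mcsafe_unrolled(dst, src, len))
		return -EIO;	/* -EFAULT from the fixup path: source data lost */

	return 0;
}
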
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index ba47524f56e8..d1c7de095808 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -52,21 +52,6 @@ static __init int find_northbridge(void)
return -ENOENT;
}
-static __init void early_get_boot_cpu_id(void)
-{
- /*
- * need to get the APIC ID of the BSP so can use that to
- * create apicid_to_node in amd_scan_nodes()
- */
-#ifdef CONFIG_X86_MPPARSE
- /*
- * get boot-time SMP configuration:
- */
- if (smp_found_config)
- early_get_smp_config();
-#endif
-}
-
int __init amd_numa_init(void)
{
u64 start = PFN_PHYS(0);
@@ -180,8 +165,11 @@ int __init amd_numa_init(void)
cores = 1 << bits;
apicid_base = 0;
- /* get the APIC ID of the BSP early for systems with apicid lifting */
- early_get_boot_cpu_id();
+ /*
+ * get boot-time SMP configuration:
+ */
+ early_get_smp_config();
+
if (boot_cpu_physical_apicid > 0) {
pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
apicid_base = boot_cpu_physical_apicid;
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index fb682108f4dc..3f35b48d1d9d 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -722,22 +722,19 @@ void __init x86_numa_init(void)
numa_init(dummy_numa_init);
}
-static __init int find_near_online_node(int node)
+static void __init init_memory_less_node(int nid)
{
- int n, val;
- int min_val = INT_MAX;
- int best_node = -1;
+ unsigned long zones_size[MAX_NR_ZONES] = {0};
+ unsigned long zholes_size[MAX_NR_ZONES] = {0};
- for_each_online_node(n) {
- val = node_distance(node, n);
+ /* Allocate and initialize node data. Memory-less node is now online. */
+ alloc_node_data(nid);
+ free_area_init_node(nid, zones_size, 0, zholes_size);
- if (val < min_val) {
- min_val = val;
- best_node = n;
- }
- }
-
- return best_node;
+ /*
+ * All zonelists will be built later in start_kernel() after per cpu
+ * areas are initialized.
+ */
}
/*
@@ -766,8 +763,10 @@ void __init init_cpu_to_node(void)
if (node == NUMA_NO_NODE)
continue;
+
if (!node_online(node))
- node = find_near_online_node(node);
+ init_memory_less_node(node);
+
numa_set_node(cpu, node);
}
}
diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c
index 6a2f5691b1ab..6aad870e8962 100644
--- a/arch/x86/platform/efi/efi-bgrt.c
+++ b/arch/x86/platform/efi/efi-bgrt.c
@@ -82,21 +82,12 @@ void __init efi_bgrt_init(void)
}
bgrt_image_size = bmp_header.size;
- bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL | __GFP_NOWARN);
+ bgrt_image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB);
if (!bgrt_image) {
- pr_notice("Ignoring BGRT: failed to allocate memory for image (wanted %zu bytes)\n",
- bgrt_image_size);
- return;
- }
-
- image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB);
- if (!image) {
pr_notice("Ignoring BGRT: failed to map image memory\n");
- kfree(bgrt_image);
bgrt_image = NULL;
return;
}
- memcpy(bgrt_image, image, bgrt_image_size);
- memunmap(image);
+ efi_mem_reserve(bgrt_tab->image_address, bgrt_image_size);
}
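
The BGRT change above is the pattern new code is expected to follow when it needs boot services data to outlive efi_free_boot_services(): map the region and call efi_mem_reserve() on it instead of copying it out. A hedged sketch of the same pattern for some other hypothetical firmware table (the physical address and size are placeholders):

#include <linux/efi.h>
#include <linux/io.h>

/*
 * Illustrative only.  efi_mem_reserve() tags the range EFI_MEMORY_RUNTIME
 * in efi.memmap so the quirks code will not free it later.
 */
static void * __init keep_firmware_table(phys_addr_t table_phys, size_t table_size)
{
	void *virt = memremap(table_phys, table_size, MEMREMAP_WB);

	if (!virt)
		return NULL;

	efi_mem_reserve(table_phys, table_size);
	return virt;
}
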
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 1fbb408e2e72..0955c70897ae 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -172,7 +172,9 @@ static void __init do_add_efi_memmap(void)
int __init efi_memblock_x86_reserve_range(void)
{
struct efi_info *e = &boot_params.efi_info;
+ struct efi_memory_map_data data;
phys_addr_t pmap;
+ int rv;
if (efi_enabled(EFI_PARAVIRT))
return 0;
@@ -187,11 +189,17 @@ int __init efi_memblock_x86_reserve_range(void)
#else
pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32));
#endif
- efi.memmap.phys_map = pmap;
- efi.memmap.nr_map = e->efi_memmap_size /
- e->efi_memdesc_size;
- efi.memmap.desc_size = e->efi_memdesc_size;
- efi.memmap.desc_version = e->efi_memdesc_version;
+ data.phys_map = pmap;
+ data.size = e->efi_memmap_size;
+ data.desc_size = e->efi_memdesc_size;
+ data.desc_version = e->efi_memdesc_version;
+
+ rv = efi_memmap_init_early(&data);
+ if (rv)
+ return rv;
+
+ if (add_efi_memmap)
+ do_add_efi_memmap();
WARN(efi.memmap.desc_version != 1,
"Unexpected EFI_MEMORY_DESCRIPTOR version %ld",
@@ -218,19 +226,6 @@ void __init efi_print_memmap(void)
}
}
-void __init efi_unmap_memmap(void)
-{
- unsigned long size;
-
- clear_bit(EFI_MEMMAP, &efi.flags);
-
- size = efi.memmap.nr_map * efi.memmap.desc_size;
- if (efi.memmap.map) {
- early_memunmap(efi.memmap.map, size);
- efi.memmap.map = NULL;
- }
-}
-
static int __init efi_systab_init(void *phys)
{
if (efi_enabled(EFI_64BIT)) {
@@ -414,33 +409,6 @@ static int __init efi_runtime_init(void)
return 0;
}
-static int __init efi_memmap_init(void)
-{
- unsigned long addr, size;
-
- if (efi_enabled(EFI_PARAVIRT))
- return 0;
-
- /* Map the EFI memory map */
- size = efi.memmap.nr_map * efi.memmap.desc_size;
- addr = (unsigned long)efi.memmap.phys_map;
-
- efi.memmap.map = early_memremap(addr, size);
- if (efi.memmap.map == NULL) {
- pr_err("Could not map the memory map!\n");
- return -ENOMEM;
- }
-
- efi.memmap.map_end = efi.memmap.map + size;
-
- if (add_efi_memmap)
- do_add_efi_memmap();
-
- set_bit(EFI_MEMMAP, &efi.flags);
-
- return 0;
-}
-
void __init efi_init(void)
{
efi_char16_t *c16;
@@ -498,16 +466,14 @@ void __init efi_init(void)
if (!efi_runtime_supported())
pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n");
else {
- if (efi_runtime_disabled() || efi_runtime_init())
+ if (efi_runtime_disabled() || efi_runtime_init()) {
+ efi_memmap_unmap();
return;
+ }
}
- if (efi_memmap_init())
- return;
if (efi_enabled(EFI_DBG))
efi_print_memmap();
-
- efi_esrt_init();
}
void __init efi_late_init(void)
@@ -624,42 +590,6 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md)
}
}
-static void __init save_runtime_map(void)
-{
-#ifdef CONFIG_KEXEC_CORE
- unsigned long desc_size;
- efi_memory_desc_t *md;
- void *tmp, *q = NULL;
- int count = 0;
-
- if (efi_enabled(EFI_OLD_MEMMAP))
- return;
-
- desc_size = efi.memmap.desc_size;
-
- for_each_efi_memory_desc(md) {
- if (!(md->attribute & EFI_MEMORY_RUNTIME) ||
- (md->type == EFI_BOOT_SERVICES_CODE) ||
- (md->type == EFI_BOOT_SERVICES_DATA))
- continue;
- tmp = krealloc(q, (count + 1) * desc_size, GFP_KERNEL);
- if (!tmp)
- goto out;
- q = tmp;
-
- memcpy(q + count * desc_size, md, desc_size);
- count++;
- }
-
- efi_runtime_map_setup(q, count, desc_size);
- return;
-
-out:
- kfree(q);
- pr_err("Error saving runtime map, efi runtime on kexec non-functional!!\n");
-#endif
-}
-
static void *realloc_pages(void *old_memmap, int old_shift)
{
void *ret;
@@ -745,6 +675,46 @@ static void *efi_map_next_entry(void *entry)
return entry;
}
+static bool should_map_region(efi_memory_desc_t *md)
+{
+ /*
+ * Runtime regions always require runtime mappings (obviously).
+ */
+ if (md->attribute & EFI_MEMORY_RUNTIME)
+ return true;
+
+ /*
+ * 32-bit EFI doesn't suffer from the bug that requires us to
+ * reserve boot services regions, and mixed mode support
+ * doesn't exist for 32-bit kernels.
+ */
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
+
+ /*
+ * Map all of RAM so that we can access arguments in the 1:1
+ * mapping when making EFI runtime calls.
+ */
+ if (IS_ENABLED(CONFIG_EFI_MIXED) && !efi_is_native()) {
+ if (md->type == EFI_CONVENTIONAL_MEMORY ||
+ md->type == EFI_LOADER_DATA ||
+ md->type == EFI_LOADER_CODE)
+ return true;
+ }
+
+ /*
+ * Map boot services regions as a workaround for buggy
+ * firmware that accesses them even when they shouldn't.
+ *
+ * See efi_{reserve,free}_boot_services().
+ */
+ if (md->type == EFI_BOOT_SERVICES_CODE ||
+ md->type == EFI_BOOT_SERVICES_DATA)
+ return true;
+
+ return false;
+}
+
/*
* Map the efi memory ranges of the runtime services and update new_mmap with
* virtual addresses.
@@ -761,13 +731,9 @@ static void * __init efi_map_regions(int *count, int *pg_shift)
p = NULL;
while ((p = efi_map_next_entry(p))) {
md = p;
- if (!(md->attribute & EFI_MEMORY_RUNTIME)) {
-#ifdef CONFIG_X86_64
- if (md->type != EFI_BOOT_SERVICES_CODE &&
- md->type != EFI_BOOT_SERVICES_DATA)
-#endif
- continue;
- }
+
+ if (!should_map_region(md))
+ continue;
efi_map_region(md);
get_systab_virt_addr(md);
@@ -803,7 +769,7 @@ static void __init kexec_enter_virtual_mode(void)
* non-native EFI
*/
if (!efi_is_native()) {
- efi_unmap_memmap();
+ efi_memmap_unmap();
clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
return;
}
@@ -823,7 +789,18 @@ static void __init kexec_enter_virtual_mode(void)
get_systab_virt_addr(md);
}
- save_runtime_map();
+ /*
+ * Unregister the early EFI memmap from efi_init() and install
+ * the new EFI memory map.
+ */
+ efi_memmap_unmap();
+
+ if (efi_memmap_init_late(efi.memmap.phys_map,
+ efi.memmap.desc_size * efi.memmap.nr_map)) {
+ pr_err("Failed to remap late EFI memory map\n");
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+ return;
+ }
BUG_ON(!efi.systab);
@@ -884,6 +861,7 @@ static void __init __efi_enter_virtual_mode(void)
int count = 0, pg_shift = 0;
void *new_memmap = NULL;
efi_status_t status;
+ phys_addr_t pa;
efi.systab = NULL;
@@ -901,11 +879,24 @@ static void __init __efi_enter_virtual_mode(void)
return;
}
- save_runtime_map();
+ pa = __pa(new_memmap);
+
+ /*
+ * Unregister the early EFI memmap from efi_init() and install
+ * the new EFI memory map that we are about to pass to the
+ * firmware via SetVirtualAddressMap().
+ */
+ efi_memmap_unmap();
+
+ if (efi_memmap_init_late(pa, efi.memmap.desc_size * count)) {
+ pr_err("Failed to remap late EFI memory map\n");
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+ return;
+ }
BUG_ON(!efi.systab);
- if (efi_setup_page_tables(__pa(new_memmap), 1 << pg_shift)) {
+ if (efi_setup_page_tables(pa, 1 << pg_shift)) {
clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
return;
}
@@ -917,14 +908,14 @@ static void __init __efi_enter_virtual_mode(void)
efi.memmap.desc_size * count,
efi.memmap.desc_size,
efi.memmap.desc_version,
- (efi_memory_desc_t *)__pa(new_memmap));
+ (efi_memory_desc_t *)pa);
} else {
status = efi_thunk_set_virtual_address_map(
efi_phys.set_virtual_address_map,
efi.memmap.desc_size * count,
efi.memmap.desc_size,
efi.memmap.desc_version,
- (efi_memory_desc_t *)__pa(new_memmap));
+ (efi_memory_desc_t *)pa);
}
if (status != EFI_SUCCESS) {
@@ -956,15 +947,6 @@ static void __init __efi_enter_virtual_mode(void)
efi_runtime_update_mappings();
efi_dump_pagetable();
- /*
- * We mapped the descriptor array into the EFI pagetable above
- * but we're not unmapping it here because if we're running in
- * EFI mixed mode we need all of memory to be accessible when
- * we pass parameters to the EFI runtime services in the
- * thunking code.
- */
- free_pages((unsigned long)new_memmap, pg_shift);
-
/* clean DUMMY object */
efi_delete_dummy_variable();
}
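
Tying the efi.c hunks together: both enter-virtual-mode paths now do the same early-to-late handover that efi_memblock_x86_reserve_range() starts. A condensed sketch of that sequence, using only the helpers and field names visible above (error handling trimmed; the function names here are illustrative):

#include <linux/efi.h>

/* Phase one, at boot: describe the physical map and let the core map it. */
static int __init example_memmap_early(phys_addr_t pmap, u64 map_size,
				       u64 desc_size, u32 desc_version)
{
	struct efi_memory_map_data data = {
		.phys_map     = pmap,
		.size         = map_size,
		.desc_size    = desc_size,
		.desc_version = desc_version,
	};

	return efi_memmap_init_early(&data);
}

/*
 * Phase two, around SetVirtualAddressMap(): drop the early mapping and
 * reinstall the map with a late (non-early) remap.
 */
static int example_memmap_late(void)
{
	efi_memmap_unmap();
	return efi_memmap_init_late(efi.memmap.phys_map,
				    efi.memmap.desc_size * efi.memmap.nr_map);
}
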
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 8dd3784eb075..58b0f801f66f 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -85,7 +85,7 @@ pgd_t * __init efi_call_phys_prolog(void)
early_code_mapping_set_exec(1);
n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
- save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL);
+ save_pgd = kmalloc_array(n_pgds, sizeof(*save_pgd), GFP_KERNEL);
for (pgd = 0; pgd < n_pgds; pgd++) {
save_pgd[pgd] = *pgd_offset_k(pgd * PGDIR_SIZE);
@@ -214,7 +214,6 @@ void efi_sync_low_kernel_mappings(void)
int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
{
unsigned long pfn, text;
- efi_memory_desc_t *md;
struct page *page;
unsigned npages;
pgd_t *pgd;
@@ -248,25 +247,6 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
if (!IS_ENABLED(CONFIG_EFI_MIXED) || efi_is_native())
return 0;
- /*
- * Map all of RAM so that we can access arguments in the 1:1
- * mapping when making EFI runtime calls.
- */
- for_each_efi_memory_desc(md) {
- if (md->type != EFI_CONVENTIONAL_MEMORY &&
- md->type != EFI_LOADER_DATA &&
- md->type != EFI_LOADER_CODE)
- continue;
-
- pfn = md->phys_addr >> PAGE_SHIFT;
- npages = md->num_pages;
-
- if (kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, npages, _PAGE_RW)) {
- pr_err("Failed to map 1:1 memory\n");
- return 1;
- }
- }
-
page = alloc_page(GFP_KERNEL|__GFP_DMA32);
if (!page)
panic("Unable to allocate EFI runtime stack < 4GB\n");
@@ -359,6 +339,7 @@ void __init efi_map_region(efi_memory_desc_t *md)
*/
void __init efi_map_region_fixed(efi_memory_desc_t *md)
{
+ __map_region(md, md->phys_addr);
__map_region(md, md->virt_addr);
}
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 89d1146f5a6f..10aca63a50d7 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -164,6 +164,75 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size,
EXPORT_SYMBOL_GPL(efi_query_variable_store);
/*
+ * The UEFI specification makes it clear that the operating system is
+ * free to do whatever it wants with boot services code after
+ * ExitBootServices() has been called. Ignoring this recommendation a
+ * significant bunch of EFI implementations continue calling into boot
+ * services code (SetVirtualAddressMap). In order to work around such
+ * buggy implementations we reserve boot services region during EFI
+ * init and make sure it stays executable. Then, after
+ * SetVirtualAddressMap(), it is discarded.
+ *
+ * However, some boot services regions contain data that is required
+ * by drivers, so we need to track which memory ranges can never be
+ * freed. This is done by tagging those regions with the
+ * EFI_MEMORY_RUNTIME attribute.
+ *
+ * Any driver that wants to mark a region as reserved must use
+ * efi_mem_reserve() which will insert a new EFI memory descriptor
+ * into efi.memmap (splitting existing regions if necessary) and tag
+ * it with EFI_MEMORY_RUNTIME.
+ */
+void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size)
+{
+ phys_addr_t new_phys, new_size;
+ struct efi_mem_range mr;
+ efi_memory_desc_t md;
+ int num_entries;
+ void *new;
+
+ if (efi_mem_desc_lookup(addr, &md)) {
+ pr_err("Failed to lookup EFI memory descriptor for %pa\n", &addr);
+ return;
+ }
+
+ if (addr + size > md.phys_addr + (md.num_pages << EFI_PAGE_SHIFT)) {
+ pr_err("Region spans EFI memory descriptors, %pa\n", &addr);
+ return;
+ }
+
+ size += addr % EFI_PAGE_SIZE;
+ size = round_up(size, EFI_PAGE_SIZE);
+ addr = round_down(addr, EFI_PAGE_SIZE);
+
+ mr.range.start = addr;
+ mr.range.end = addr + size - 1;
+ mr.attribute = md.attribute | EFI_MEMORY_RUNTIME;
+
+ num_entries = efi_memmap_split_count(&md, &mr.range);
+ num_entries += efi.memmap.nr_map;
+
+ new_size = efi.memmap.desc_size * num_entries;
+
+ new_phys = memblock_alloc(new_size, 0);
+ if (!new_phys) {
+ pr_err("Could not allocate boot services memmap\n");
+ return;
+ }
+
+ new = early_memremap(new_phys, new_size);
+ if (!new) {
+ pr_err("Failed to map new boot services memmap\n");
+ return;
+ }
+
+ efi_memmap_insert(&efi.memmap, new, &mr);
+ early_memunmap(new, new_size);
+
+ efi_memmap_install(new_phys, num_entries);
+}
+
+/*
* Helper function for efi_reserve_boot_services() to figure out if we
* can free regions in efi_free_boot_services().
*
@@ -184,15 +253,6 @@ static bool can_free_region(u64 start, u64 size)
return true;
}
-/*
- * The UEFI specification makes it clear that the operating system is free to do
- * whatever it wants with boot services code after ExitBootServices() has been
- * called. Ignoring this recommendation a significant bunch of EFI implementations
- * continue calling into boot services code (SetVirtualAddressMap). In order to
- * work around such buggy implementations we reserve boot services region during
- * EFI init and make sure it stays executable. Then, after SetVirtualAddressMap(), it
-* is discarded.
-*/
void __init efi_reserve_boot_services(void)
{
efi_memory_desc_t *md;
@@ -249,7 +309,10 @@ void __init efi_reserve_boot_services(void)
void __init efi_free_boot_services(void)
{
+ phys_addr_t new_phys, new_size;
efi_memory_desc_t *md;
+ int num_entries = 0;
+ void *new, *new_md;
for_each_efi_memory_desc(md) {
unsigned long long start = md->phys_addr;
@@ -257,12 +320,16 @@ void __init efi_free_boot_services(void)
size_t rm_size;
if (md->type != EFI_BOOT_SERVICES_CODE &&
- md->type != EFI_BOOT_SERVICES_DATA)
+ md->type != EFI_BOOT_SERVICES_DATA) {
+ num_entries++;
continue;
+ }
/* Do not free, someone else owns it: */
- if (md->attribute & EFI_MEMORY_RUNTIME)
+ if (md->attribute & EFI_MEMORY_RUNTIME) {
+ num_entries++;
continue;
+ }
/*
* Nasty quirk: if all sub-1MB memory is used for boot
@@ -287,7 +354,41 @@ void __init efi_free_boot_services(void)
free_bootmem_late(start, size);
}
- efi_unmap_memmap();
+ new_size = efi.memmap.desc_size * num_entries;
+ new_phys = memblock_alloc(new_size, 0);
+ if (!new_phys) {
+ pr_err("Failed to allocate new EFI memmap\n");
+ return;
+ }
+
+ new = memremap(new_phys, new_size, MEMREMAP_WB);
+ if (!new) {
+ pr_err("Failed to map new EFI memmap\n");
+ return;
+ }
+
+ /*
+ * Build a new EFI memmap that excludes any boot services
+ * regions that are not tagged EFI_MEMORY_RUNTIME, since those
+ * regions have now been freed.
+ */
+ new_md = new;
+ for_each_efi_memory_desc(md) {
+ if (!(md->attribute & EFI_MEMORY_RUNTIME) &&
+ (md->type == EFI_BOOT_SERVICES_CODE ||
+ md->type == EFI_BOOT_SERVICES_DATA))
+ continue;
+
+ memcpy(new_md, md, efi.memmap.desc_size);
+ new_md += efi.memmap.desc_size;
+ }
+
+ memunmap(new);
+
+ if (efi_memmap_install(new_phys, num_entries)) {
+ pr_err("Could not install new EFI memmap\n");
+ return;
+ }
}
/*
@@ -365,7 +466,7 @@ void __init efi_apply_memmap_quirks(void)
*/
if (!efi_runtime_supported()) {
pr_info("Setup done, disabling due to 32/64-bit mismatch\n");
- efi_unmap_memmap();
+ efi_memmap_unmap();
}
/* UV2+ BIOS has a fix for this issue. UV1 still needs the quirk. */
diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c
index 1104515d5ad2..1ac76479c266 100644
--- a/arch/x86/ras/mce_amd_inj.c
+++ b/arch/x86/ras/mce_amd_inj.c
@@ -68,6 +68,7 @@ static int inj_##reg##_set(void *data, u64 val) \
MCE_INJECT_SET(status);
MCE_INJECT_SET(misc);
MCE_INJECT_SET(addr);
+MCE_INJECT_SET(synd);
#define MCE_INJECT_GET(reg) \
static int inj_##reg##_get(void *data, u64 *val) \
@@ -81,10 +82,12 @@ static int inj_##reg##_get(void *data, u64 *val) \
MCE_INJECT_GET(status);
MCE_INJECT_GET(misc);
MCE_INJECT_GET(addr);
+MCE_INJECT_GET(synd);
DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
/*
* Caller needs to make sure this cpu doesn't disappear
@@ -243,27 +246,27 @@ static void toggle_nb_mca_mst_cpu(u16 nid)
static void prepare_msrs(void *info)
{
- struct mce i_mce = *(struct mce *)info;
- u8 b = i_mce.bank;
+ struct mce m = *(struct mce *)info;
+ u8 b = m.bank;
- wrmsrl(MSR_IA32_MCG_STATUS, i_mce.mcgstatus);
+ wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
if (boot_cpu_has(X86_FEATURE_SMCA)) {
- if (i_mce.inject_flags == DFR_INT_INJ) {
- wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), i_mce.status);
- wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), i_mce.addr);
+ if (m.inject_flags == DFR_INT_INJ) {
+ wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
+ wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
} else {
- wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), i_mce.status);
- wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), i_mce.addr);
+ wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
+ wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
}
- wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), i_mce.misc);
+ wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
+ wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
} else {
- wrmsrl(MSR_IA32_MCx_STATUS(b), i_mce.status);
- wrmsrl(MSR_IA32_MCx_ADDR(b), i_mce.addr);
- wrmsrl(MSR_IA32_MCx_MISC(b), i_mce.misc);
+ wrmsrl(MSR_IA32_MCx_STATUS(b), m.status);
+ wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr);
+ wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
}
-
}
static void do_inject(void)
@@ -275,6 +278,9 @@ static void do_inject(void)
if (i_mce.misc)
i_mce.status |= MCI_STATUS_MISCV;
+ if (i_mce.synd)
+ i_mce.status |= MCI_STATUS_SYNDV;
+
if (inj_type == SW_INJ) {
mce_inject_log(&i_mce);
return;
@@ -301,7 +307,9 @@ static void do_inject(void)
* only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for
* Fam10h and later BKDGs.
*/
- if (static_cpu_has(X86_FEATURE_AMD_DCM) && b == 4) {
+ if (static_cpu_has(X86_FEATURE_AMD_DCM) &&
+ b == 4 &&
+ boot_cpu_data.x86 < 0x17) {
toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu));
cpu = get_nbc_for_node(amd_get_nb_id(cpu));
}
@@ -371,6 +379,9 @@ static const char readme_msg[] =
"\t used for error thresholding purposes and its validity is indicated by\n"
"\t MCi_STATUS[MiscV].\n"
"\n"
+"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n"
+"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n"
+"\n"
"addr:\t Error address value to be written to MCi_ADDR. Log address information\n"
"\t associated with the error.\n"
"\n"
@@ -420,6 +431,7 @@ static struct dfs_node {
{ .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
@@ -428,7 +440,7 @@ static struct dfs_node {
static int __init init_mce_inject(void)
{
- int i;
+ unsigned int i;
u64 cap;
rdmsrl(MSR_IA32_MCG_CAP, cap);
@@ -452,26 +464,22 @@ static int __init init_mce_inject(void)
return 0;
err_dfs_add:
- while (--i >= 0)
+ while (i-- > 0)
debugfs_remove(dfs_fls[i].d);
debugfs_remove(dfs_inj);
dfs_inj = NULL;
- return -ENOMEM;
+ return -ENODEV;
}
static void __exit exit_mce_inject(void)
{
- int i;
- for (i = 0; i < ARRAY_SIZE(dfs_fls); i++)
- debugfs_remove(dfs_fls[i].d);
+ debugfs_remove_recursive(dfs_inj);
+ dfs_inj = NULL;
memset(&dfs_fls, 0, sizeof(dfs_fls));
-
- debugfs_remove(dfs_inj);
- dfs_inj = NULL;
}
module_init(init_mce_inject);
module_exit(exit_mce_inject);
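
One detail in the init_mce_inject() error path deserves a note: with `i` now unsigned, the unwind loop had to change from `while (--i >= 0)` to `while (i-- > 0)`, because the former condition can never be false for an unsigned counter. A standalone illustration (plain C, nothing here is from the patch):

#include <stdio.h>

/* Undo n partially-initialized slots, counting down with an unsigned index. */
static void unwind(unsigned int n)
{
	unsigned int i = n;

	/* Correct: test first, then decrement; the loop stops after i == 0. */
	while (i-- > 0)
		printf("undo slot %u\n", i);

	/*
	 * Wrong for unsigned types: "while (--i >= 0)" is always true, so it
	 * would wrap from 0 to UINT_MAX and never terminate.
	 */
}

int main(void)
{
	unwind(3);	/* prints slots 2, 1, 0 */
	return 0;
}
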
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index e2cf8fcea6bb..f1d2182e071f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1924,6 +1924,45 @@ static void xen_set_cpu_features(struct cpuinfo_x86 *c)
}
}
+static void xen_pin_vcpu(int cpu)
+{
+ static bool disable_pinning;
+ struct sched_pin_override pin_override;
+ int ret;
+
+ if (disable_pinning)
+ return;
+
+ pin_override.pcpu = cpu;
+ ret = HYPERVISOR_sched_op(SCHEDOP_pin_override, &pin_override);
+
+ /* Ignore errors when removing override. */
+ if (cpu < 0)
+ return;
+
+ switch (ret) {
+ case -ENOSYS:
+ pr_warn("Unable to pin on physical cpu %d. In case of problems consider vcpu pinning.\n",
+ cpu);
+ disable_pinning = true;
+ break;
+ case -EPERM:
+ WARN(1, "Trying to pin vcpu without having privilege to do so\n");
+ disable_pinning = true;
+ break;
+ case -EINVAL:
+ case -EBUSY:
+ pr_warn("Physical cpu %d not available for pinning. Check Xen cpu configuration.\n",
+ cpu);
+ break;
+ case 0:
+ break;
+ default:
+ WARN(1, "rc %d while trying to pin vcpu\n", ret);
+ disable_pinning = true;
+ }
+}
+
const struct hypervisor_x86 x86_hyper_xen = {
.name = "Xen",
.detect = xen_platform,
@@ -1932,6 +1971,7 @@ const struct hypervisor_x86 x86_hyper_xen = {
#endif
.x2apic_available = xen_x2apic_para_available,
.set_cpu_features = xen_set_cpu_features,
+ .pin_vcpu = xen_pin_vcpu,
};
EXPORT_SYMBOL(x86_hyper_xen);
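
xen_pin_vcpu() is only reached through the new pin_vcpu hook in struct hypervisor_x86. A hedged sketch of how a generic caller might dispatch through that hook; the wrapper function and NULL checks below are assumptions for illustration, not lines from this series:

#include <asm/hypervisor.h>

/*
 * Illustrative dispatch only.  'pcpu' is the physical CPU to pin the
 * current vCPU to; a negative value asks the hypervisor to drop an
 * existing override, which xen_pin_vcpu() above treats as best-effort.
 */
static void example_pin_current_vcpu(int pcpu)
{
	if (x86_hyper && x86_hyper->pin_vcpu)
		x86_hyper->pin_vcpu(pcpu);
}
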
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index f42e78de1e10..3d6e0064cbfc 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -21,8 +21,6 @@ static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
static DEFINE_PER_CPU(char *, irq_name);
static bool xen_pvspin = true;
-#ifdef CONFIG_QUEUED_SPINLOCKS
-
#include <asm/qspinlock.h>
static void xen_qlock_kick(int cpu)
@@ -71,207 +69,6 @@ static void xen_qlock_wait(u8 *byte, u8 val)
xen_poll_irq(irq);
}
-#else /* CONFIG_QUEUED_SPINLOCKS */
-
-enum xen_contention_stat {
- TAKEN_SLOW,
- TAKEN_SLOW_PICKUP,
- TAKEN_SLOW_SPURIOUS,
- RELEASED_SLOW,
- RELEASED_SLOW_KICKED,
- NR_CONTENTION_STATS
-};
-
-
-#ifdef CONFIG_XEN_DEBUG_FS
-#define HISTO_BUCKETS 30
-static struct xen_spinlock_stats
-{
- u32 contention_stats[NR_CONTENTION_STATS];
- u32 histo_spin_blocked[HISTO_BUCKETS+1];
- u64 time_blocked;
-} spinlock_stats;
-
-static u8 zero_stats;
-
-static inline void check_zero(void)
-{
- u8 ret;
- u8 old = READ_ONCE(zero_stats);
- if (unlikely(old)) {
- ret = cmpxchg(&zero_stats, old, 0);
- /* This ensures only one fellow resets the stat */
- if (ret == old)
- memset(&spinlock_stats, 0, sizeof(spinlock_stats));
- }
-}
-
-static inline void add_stats(enum xen_contention_stat var, u32 val)
-{
- check_zero();
- spinlock_stats.contention_stats[var] += val;
-}
-
-static inline u64 spin_time_start(void)
-{
- return xen_clocksource_read();
-}
-
-static void __spin_time_accum(u64 delta, u32 *array)
-{
- unsigned index = ilog2(delta);
-
- check_zero();
-
- if (index < HISTO_BUCKETS)
- array[index]++;
- else
- array[HISTO_BUCKETS]++;
-}
-
-static inline void spin_time_accum_blocked(u64 start)
-{
- u32 delta = xen_clocksource_read() - start;
-
- __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
- spinlock_stats.time_blocked += delta;
-}
-#else /* !CONFIG_XEN_DEBUG_FS */
-static inline void add_stats(enum xen_contention_stat var, u32 val)
-{
-}
-
-static inline u64 spin_time_start(void)
-{
- return 0;
-}
-
-static inline void spin_time_accum_blocked(u64 start)
-{
-}
-#endif /* CONFIG_XEN_DEBUG_FS */
-
-struct xen_lock_waiting {
- struct arch_spinlock *lock;
- __ticket_t want;
-};
-
-static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
-static cpumask_t waiting_cpus;
-
-__visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
-{
- int irq = __this_cpu_read(lock_kicker_irq);
- struct xen_lock_waiting *w = this_cpu_ptr(&lock_waiting);
- int cpu = smp_processor_id();
- u64 start;
- __ticket_t head;
- unsigned long flags;
-
- /* If kicker interrupts not initialized yet, just spin */
- if (irq == -1)
- return;
-
- start = spin_time_start();
-
- /*
- * Make sure an interrupt handler can't upset things in a
- * partially setup state.
- */
- local_irq_save(flags);
- /*
- * We don't really care if we're overwriting some other
- * (lock,want) pair, as that would mean that we're currently
- * in an interrupt context, and the outer context had
- * interrupts enabled. That has already kicked the VCPU out
- * of xen_poll_irq(), so it will just return spuriously and
- * retry with newly setup (lock,want).
- *
- * The ordering protocol on this is that the "lock" pointer
- * may only be set non-NULL if the "want" ticket is correct.
- * If we're updating "want", we must first clear "lock".
- */
- w->lock = NULL;
- smp_wmb();
- w->want = want;
- smp_wmb();
- w->lock = lock;
-
- /* This uses set_bit, which atomic and therefore a barrier */
- cpumask_set_cpu(cpu, &waiting_cpus);
- add_stats(TAKEN_SLOW, 1);
-
- /* clear pending */
- xen_clear_irq_pending(irq);
-
- /* Only check lock once pending cleared */
- barrier();
-
- /*
- * Mark entry to slowpath before doing the pickup test to make
- * sure we don't deadlock with an unlocker.
- */
- __ticket_enter_slowpath(lock);
-
- /* make sure enter_slowpath, which is atomic does not cross the read */
- smp_mb__after_atomic();
-
- /*
- * check again make sure it didn't become free while
- * we weren't looking
- */
- head = READ_ONCE(lock->tickets.head);
- if (__tickets_equal(head, want)) {
- add_stats(TAKEN_SLOW_PICKUP, 1);
- goto out;
- }
-
- /* Allow interrupts while blocked */
- local_irq_restore(flags);
-
- /*
- * If an interrupt happens here, it will leave the wakeup irq
- * pending, which will cause xen_poll_irq() to return
- * immediately.
- */
-
- /* Block until irq becomes pending (or perhaps a spurious wakeup) */
- xen_poll_irq(irq);
- add_stats(TAKEN_SLOW_SPURIOUS, !xen_test_irq_pending(irq));
-
- local_irq_save(flags);
-
- kstat_incr_irq_this_cpu(irq);
-out:
- cpumask_clear_cpu(cpu, &waiting_cpus);
- w->lock = NULL;
-
- local_irq_restore(flags);
-
- spin_time_accum_blocked(start);
-}
-PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning);
-
-static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
-{
- int cpu;
-
- add_stats(RELEASED_SLOW, 1);
-
- for_each_cpu(cpu, &waiting_cpus) {
- const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu);
-
- /* Make sure we read lock before want */
- if (READ_ONCE(w->lock) == lock &&
- READ_ONCE(w->want) == next) {
- add_stats(RELEASED_SLOW_KICKED, 1);
- xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
- break;
- }
- }
-}
-#endif /* CONFIG_QUEUED_SPINLOCKS */
-
static irqreturn_t dummy_handler(int irq, void *dev_id)
{
BUG();
@@ -334,16 +131,12 @@ void __init xen_init_spinlocks(void)
return;
}
printk(KERN_DEBUG "xen: PV spinlocks enabled\n");
-#ifdef CONFIG_QUEUED_SPINLOCKS
+
__pv_init_lock_hash();
pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
pv_lock_ops.wait = xen_qlock_wait;
pv_lock_ops.kick = xen_qlock_kick;
-#else
- pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning);
- pv_lock_ops.unlock_kick = xen_unlock_kick;
-#endif
}
/*
@@ -372,44 +165,3 @@ static __init int xen_parse_nopvspin(char *arg)
}
early_param("xen_nopvspin", xen_parse_nopvspin);
-#if defined(CONFIG_XEN_DEBUG_FS) && !defined(CONFIG_QUEUED_SPINLOCKS)
-
-static struct dentry *d_spin_debug;
-
-static int __init xen_spinlock_debugfs(void)
-{
- struct dentry *d_xen = xen_init_debugfs();
-
- if (d_xen == NULL)
- return -ENOMEM;
-
- if (!xen_pvspin)
- return 0;
-
- d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
-
- debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
-
- debugfs_create_u32("taken_slow", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[TAKEN_SLOW]);
- debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
- debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[TAKEN_SLOW_SPURIOUS]);
-
- debugfs_create_u32("released_slow", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[RELEASED_SLOW]);
- debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
-
- debugfs_create_u64("time_blocked", 0444, d_spin_debug,
- &spinlock_stats.time_blocked);
-
- debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
- spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
-
- return 0;
-}
-fs_initcall(xen_spinlock_debugfs);
-
-#endif /* CONFIG_XEN_DEBUG_FS */
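
With the ticket slowpath gone, the only contract Xen implements is the qspinlock wait/kick pair wired up in xen_init_spinlocks(). A rough sketch of that contract, with hypothetical halt/wake primitives standing in for the event-channel plumbing above:

#include <linux/types.h>
#include <linux/compiler.h>

/* Hypothetical primitives used only for illustration. */
extern void hypothetical_block_until_kicked(void);
extern void hypothetical_wake_cpu(int cpu);

/*
 * wait(byte, val): block only while *byte still holds 'val'; spurious
 * returns are fine, the generic slowpath re-checks the lock word.
 */
static void example_qlock_wait(u8 *byte, u8 val)
{
	if (READ_ONCE(*byte) != val)
		return;

	hypothetical_block_until_kicked();	/* stands in for xen_poll_irq() */
}

/* kick(cpu): wake a CPU that may be blocked in wait(), as the unlock IPI does. */
static void example_qlock_kick(int cpu)
{
	hypothetical_wake_cpu(cpu);
}
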