summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/alternative.c29
-rw-r--r--arch/x86/kernel/apic/apic.c6
-rw-r--r--arch/x86/kernel/cpu/common.c3
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c36
-rw-r--r--arch/x86/kernel/cpu/topology.c13
-rw-r--r--arch/x86/kernel/head_64.S28
-rw-r--r--arch/x86/kernel/smpboot.c201
-rw-r--r--arch/x86/kernel/traps.c2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S1
9 files changed, 194 insertions, 125 deletions
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a888ae0f01fb..e87da25d1236 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1182,7 +1182,7 @@ void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end)
poison_endbr(addr);
if (IS_ENABLED(CONFIG_FINEIBT))
- poison_cfi(addr - 16);
+ poison_cfi(addr - CFI_OFFSET);
}
}
@@ -1389,6 +1389,8 @@ extern u8 fineibt_preamble_end[];
#define fineibt_preamble_ud 0x13
#define fineibt_preamble_hash 5
+#define fineibt_prefix_size (fineibt_preamble_size - ENDBR_INSN_SIZE)
+
/*
* <fineibt_caller_start>:
* 0: b8 78 56 34 12 mov $0x12345678, %eax
@@ -1634,7 +1636,7 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end)
* have determined there are no indirect calls to it and we
* don't need no CFI either.
*/
- if (!is_endbr(addr + 16))
+ if (!is_endbr(addr + CFI_OFFSET))
continue;
hash = decode_preamble_hash(addr, &arity);
@@ -1642,6 +1644,15 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end)
addr, addr, 5, addr))
return -EINVAL;
+ /*
+ * FineIBT relies on being at func-16, so if the preamble is
+ * actually larger than that, place it the tail end.
+ *
+ * NOTE: this is possible with things like DEBUG_CALL_THUNKS
+ * and DEBUG_FORCE_FUNCTION_ALIGN_64B.
+ */
+ addr += CFI_OFFSET - fineibt_prefix_size;
+
text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
@@ -1664,10 +1675,10 @@ static void cfi_rewrite_endbr(s32 *start, s32 *end)
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- if (!exact_endbr(addr + 16))
+ if (!exact_endbr(addr + CFI_OFFSET))
continue;
- poison_endbr(addr + 16);
+ poison_endbr(addr + CFI_OFFSET);
}
}
@@ -1772,7 +1783,8 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
if (FINEIBT_WARN(fineibt_preamble_size, 20) ||
FINEIBT_WARN(fineibt_preamble_bhi + fineibt_bhi1_size, 20) ||
FINEIBT_WARN(fineibt_caller_size, 14) ||
- FINEIBT_WARN(fineibt_paranoid_size, 20))
+ FINEIBT_WARN(fineibt_paranoid_size, 20) ||
+ WARN_ON_ONCE(CFI_OFFSET < fineibt_prefix_size))
return;
if (cfi_mode == CFI_AUTO) {
@@ -1886,6 +1898,11 @@ static void poison_cfi(void *addr)
switch (cfi_mode) {
case CFI_FINEIBT:
/*
+ * FineIBT preamble is at func-16.
+ */
+ addr += CFI_OFFSET - fineibt_prefix_size;
+
+ /*
* FineIBT prefix should start with an ENDBR.
*/
if (!is_endbr(addr))
@@ -1923,8 +1940,6 @@ static void poison_cfi(void *addr)
}
}
-#define fineibt_prefix_size (fineibt_preamble_size - ENDBR_INSN_SIZE)
-
/*
* When regs->ip points to a 0xD6 byte in the FineIBT preamble,
* return true and fill out target and type.
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index d93f87f29d03..961714e6adae 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1894,6 +1894,7 @@ void __init check_x2apic(void)
static inline void try_to_enable_x2apic(int remap_mode) { }
static inline void __x2apic_enable(void) { }
+static inline void __x2apic_disable(void) { }
#endif /* !CONFIG_X86_X2APIC */
void __init enable_IR_x2apic(void)
@@ -2456,6 +2457,11 @@ static void lapic_resume(void *data)
if (x2apic_mode) {
__x2apic_enable();
} else {
+ if (x2apic_enabled()) {
+ pr_warn_once("x2apic: re-enabled by firmware during resume. Disabling\n");
+ __x2apic_disable();
+ }
+
/*
* Make sure the APICBASE points to the right address
*
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 1c3261cae40c..a8ff4376c286 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -95,6 +95,9 @@ EXPORT_SYMBOL(__max_dies_per_package);
unsigned int __max_logical_packages __ro_after_init = 1;
EXPORT_SYMBOL(__max_logical_packages);
+unsigned int __num_nodes_per_package __ro_after_init = 1;
+EXPORT_SYMBOL(__num_nodes_per_package);
+
unsigned int __num_cores_per_package __ro_after_init = 1;
EXPORT_SYMBOL(__num_cores_per_package);
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index e6a154240b8d..9bd87bae4983 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -364,7 +364,7 @@ void arch_mon_domain_online(struct rdt_resource *r, struct rdt_l3_mon_domain *d)
msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
}
-/* CPU models that support MSR_RMID_SNC_CONFIG */
+/* CPU models that support SNC and MSR_RMID_SNC_CONFIG */
static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
@@ -375,40 +375,14 @@ static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
{}
};
-/*
- * There isn't a simple hardware bit that indicates whether a CPU is running
- * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
- * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
- * the same NUMA node as CPU0.
- * It is not possible to accurately determine SNC state if the system is
- * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
- * to L3 caches. It will be OK if system is booted with hyperthreading
- * disabled (since this doesn't affect the ratio).
- */
static __init int snc_get_config(void)
{
- struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
- const cpumask_t *node0_cpumask;
- int cpus_per_node, cpus_per_l3;
- int ret;
-
- if (!x86_match_cpu(snc_cpu_ids) || !ci)
- return 1;
+ int ret = topology_num_nodes_per_package();
- cpus_read_lock();
- if (num_online_cpus() != num_present_cpus())
- pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
- cpus_read_unlock();
-
- node0_cpumask = cpumask_of_node(cpu_to_node(0));
-
- cpus_per_node = cpumask_weight(node0_cpumask);
- cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);
-
- if (!cpus_per_node || !cpus_per_l3)
+ if (ret > 1 && !x86_match_cpu(snc_cpu_ids)) {
+ pr_warn("CoD enabled system? Resctrl not supported\n");
return 1;
-
- ret = cpus_per_l3 / cpus_per_node;
+ }
/* sanity check: Only valid results are 1, 2, 3, 4, 6 */
switch (ret) {
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 23190a786d31..eafcb1fc185a 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -31,6 +31,7 @@
#include <asm/mpspec.h>
#include <asm/msr.h>
#include <asm/smp.h>
+#include <asm/numa.h>
#include "cpu.h"
@@ -492,11 +493,19 @@ void __init topology_init_possible_cpus(void)
set_nr_cpu_ids(allowed);
cnta = domain_weight(TOPO_PKG_DOMAIN);
- cntb = domain_weight(TOPO_DIE_DOMAIN);
__max_logical_packages = cnta;
+
+ pr_info("Max. logical packages: %3u\n", __max_logical_packages);
+
+ cntb = num_phys_nodes();
+ __num_nodes_per_package = DIV_ROUND_UP(cntb, cnta);
+
+ pr_info("Max. logical nodes: %3u\n", cntb);
+ pr_info("Num. nodes per package:%3u\n", __num_nodes_per_package);
+
+ cntb = domain_weight(TOPO_DIE_DOMAIN);
__max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta));
- pr_info("Max. logical packages: %3u\n", cnta);
pr_info("Max. logical dies: %3u\n", cntb);
pr_info("Max. dies per package: %3u\n", __max_dies_per_package);
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 21816b48537c..85d4a5094f6b 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -616,38 +616,10 @@ SYM_DATA(early_recursion_flag, .long 0)
.data
-#if defined(CONFIG_XEN_PV) || defined(CONFIG_PVH)
-SYM_DATA_START_PTI_ALIGNED(init_top_pgt)
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
- .org init_top_pgt + L4_PAGE_OFFSET*8, 0
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
- .org init_top_pgt + L4_START_KERNEL*8, 0
- /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
- .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
- .fill PTI_USER_PGD_FILL,8,0
-SYM_DATA_END(init_top_pgt)
-
-SYM_DATA_START_PAGE_ALIGNED(level3_ident_pgt)
- .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
- .fill 511, 8, 0
-SYM_DATA_END(level3_ident_pgt)
-SYM_DATA_START_PAGE_ALIGNED(level2_ident_pgt)
- /*
- * Since I easily can, map the first 1G.
- * Don't set NX because code runs from these pages.
- *
- * Note: This sets _PAGE_GLOBAL despite whether
- * the CPU supports it or it is enabled. But,
- * the CPU should ignore the bit.
- */
- PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
-SYM_DATA_END(level2_ident_pgt)
-#else
SYM_DATA_START_PTI_ALIGNED(init_top_pgt)
.fill 512,8,0
.fill PTI_USER_PGD_FILL,8,0
SYM_DATA_END(init_top_pgt)
-#endif
SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt)
.fill 511,8,0
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 5cd6950ab672..294a8ea60298 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -468,13 +468,6 @@ static int x86_cluster_flags(void)
}
#endif
-/*
- * Set if a package/die has multiple NUMA nodes inside.
- * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
- * Sub-NUMA Clustering have this.
- */
-static bool x86_has_numa_in_package;
-
static struct sched_domain_topology_level x86_topology[] = {
SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
#ifdef CONFIG_SCHED_CLUSTER
@@ -496,7 +489,7 @@ static void __init build_sched_topology(void)
* PKG domain since the NUMA domains will auto-magically create the
* right spanning domains based on the SLIT.
*/
- if (x86_has_numa_in_package) {
+ if (topology_num_nodes_per_package() > 1) {
unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2;
memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
@@ -513,33 +506,149 @@ static void __init build_sched_topology(void)
}
#ifdef CONFIG_NUMA
-static int sched_avg_remote_distance;
-static int avg_remote_numa_distance(void)
+/*
+ * Test if the on-trace cluster at (N,N) is symmetric.
+ * Uses upper triangle iteration to avoid obvious duplicates.
+ */
+static bool slit_cluster_symmetric(int N)
{
- int i, j;
- int distance, nr_remote, total_distance;
-
- if (sched_avg_remote_distance > 0)
- return sched_avg_remote_distance;
-
- nr_remote = 0;
- total_distance = 0;
- for_each_node_state(i, N_CPU) {
- for_each_node_state(j, N_CPU) {
- distance = node_distance(i, j);
-
- if (distance >= REMOTE_DISTANCE) {
- nr_remote++;
- total_distance += distance;
- }
+ int u = topology_num_nodes_per_package();
+
+ for (int k = 0; k < u; k++) {
+ for (int l = k; l < u; l++) {
+ if (node_distance(N + k, N + l) !=
+ node_distance(N + l, N + k))
+ return false;
}
}
- if (nr_remote)
- sched_avg_remote_distance = total_distance / nr_remote;
- else
- sched_avg_remote_distance = REMOTE_DISTANCE;
- return sched_avg_remote_distance;
+ return true;
+}
+
+/*
+ * Return the package-id of the cluster, or ~0 if indeterminate.
+ * Each node in the on-trace cluster should have the same package-id.
+ */
+static u32 slit_cluster_package(int N)
+{
+ int u = topology_num_nodes_per_package();
+ u32 pkg_id = ~0;
+
+ for (int n = 0; n < u; n++) {
+ const struct cpumask *cpus = cpumask_of_node(N + n);
+ int cpu;
+
+ for_each_cpu(cpu, cpus) {
+ u32 id = topology_logical_package_id(cpu);
+
+ if (pkg_id == ~0)
+ pkg_id = id;
+ if (pkg_id != id)
+ return ~0;
+ }
+ }
+
+ return pkg_id;
+}
+
+/*
+ * Validate the SLIT table is of the form expected for SNC, specifically:
+ *
+ * - each on-trace cluster should be symmetric,
+ * - each on-trace cluster should have a unique package-id.
+ *
+ * If you NUMA_EMU on top of SNC, you get to keep the pieces.
+ */
+static bool slit_validate(void)
+{
+ int u = topology_num_nodes_per_package();
+ u32 pkg_id, prev_pkg_id = ~0;
+
+ for (int pkg = 0; pkg < topology_max_packages(); pkg++) {
+ int n = pkg * u;
+
+ /*
+ * Ensure the on-trace cluster is symmetric and each cluster
+ * has a different package id.
+ */
+ if (!slit_cluster_symmetric(n))
+ return false;
+ pkg_id = slit_cluster_package(n);
+ if (pkg_id == ~0)
+ return false;
+ if (pkg && pkg_id == prev_pkg_id)
+ return false;
+
+ prev_pkg_id = pkg_id;
+ }
+
+ return true;
+}
+
+/*
+ * Compute a sanitized SLIT table for SNC; notably SNC-3 can end up with
+ * asymmetric off-trace clusters, reflecting physical assymmetries. However
+ * this leads to 'unfortunate' sched_domain configurations.
+ *
+ * For example dual socket GNR with SNC-3:
+ *
+ * node distances:
+ * node 0 1 2 3 4 5
+ * 0: 10 15 17 21 28 26
+ * 1: 15 10 15 23 26 23
+ * 2: 17 15 10 26 23 21
+ * 3: 21 28 26 10 15 17
+ * 4: 23 26 23 15 10 15
+ * 5: 26 23 21 17 15 10
+ *
+ * Fix things up by averaging out the off-trace clusters; resulting in:
+ *
+ * node 0 1 2 3 4 5
+ * 0: 10 15 17 24 24 24
+ * 1: 15 10 15 24 24 24
+ * 2: 17 15 10 24 24 24
+ * 3: 24 24 24 10 15 17
+ * 4: 24 24 24 15 10 15
+ * 5: 24 24 24 17 15 10
+ */
+static int slit_cluster_distance(int i, int j)
+{
+ static int slit_valid = -1;
+ int u = topology_num_nodes_per_package();
+ long d = 0;
+ int x, y;
+
+ if (slit_valid < 0) {
+ slit_valid = slit_validate();
+ if (!slit_valid)
+ pr_err(FW_BUG "SLIT table doesn't have the expected form for SNC -- fixup disabled!\n");
+ else
+ pr_info("Fixing up SNC SLIT table.\n");
+ }
+
+ /*
+ * Is this a unit cluster on the trace?
+ */
+ if ((i / u) == (j / u) || !slit_valid)
+ return node_distance(i, j);
+
+ /*
+ * Off-trace cluster.
+ *
+ * Notably average out the symmetric pair of off-trace clusters to
+ * ensure the resulting SLIT table is symmetric.
+ */
+ x = i - (i % u);
+ y = j - (j % u);
+
+ for (i = x; i < x + u; i++) {
+ for (j = y; j < y + u; j++) {
+ d += node_distance(i, j);
+ d += node_distance(j, i);
+ }
+ }
+
+ return d / (2*u*u);
}
int arch_sched_node_distance(int from, int to)
@@ -549,34 +658,14 @@ int arch_sched_node_distance(int from, int to)
switch (boot_cpu_data.x86_vfm) {
case INTEL_GRANITERAPIDS_X:
case INTEL_ATOM_DARKMONT_X:
-
- if (!x86_has_numa_in_package || topology_max_packages() == 1 ||
- d < REMOTE_DISTANCE)
+ if (topology_max_packages() == 1 ||
+ topology_num_nodes_per_package() < 3)
return d;
/*
- * With SNC enabled, there could be too many levels of remote
- * NUMA node distances, creating NUMA domain levels
- * including local nodes and partial remote nodes.
- *
- * Trim finer distance tuning for NUMA nodes in remote package
- * for the purpose of building sched domains. Group NUMA nodes
- * in the remote package in the same sched group.
- * Simplify NUMA domains and avoid extra NUMA levels including
- * different remote NUMA nodes and local nodes.
- *
- * GNR and CWF don't expect systems with more than 2 packages
- * and more than 2 hops between packages. Single average remote
- * distance won't be appropriate if there are more than 2
- * packages as average distance to different remote packages
- * could be different.
+ * Handle SNC-3 asymmetries.
*/
- WARN_ONCE(topology_max_packages() > 2,
- "sched: Expect only up to 2 packages for GNR or CWF, "
- "but saw %d packages when building sched domains.",
- topology_max_packages());
-
- d = avg_remote_numa_distance();
+ return slit_cluster_distance(from, to);
}
return d;
}
@@ -606,7 +695,7 @@ void set_cpu_sibling_map(int cpu)
o = &cpu_data(i);
if (match_pkg(c, o) && !topology_same_node(c, o))
- x86_has_numa_in_package = true;
+ WARN_ON_ONCE(topology_num_nodes_per_package() == 1);
if ((i == cpu) || (has_smt && match_smt(c, o)))
link_mask(topology_sibling_cpumask, cpu, i);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 5a6a772e0a6c..4dbff8ef9b1c 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -397,7 +397,7 @@ static inline void handle_invalid_op(struct pt_regs *regs)
ILL_ILLOPN, error_get_trap_addr(regs));
}
-static noinstr bool handle_bug(struct pt_regs *regs)
+noinstr bool handle_bug(struct pt_regs *regs)
{
unsigned long addr = regs->ip;
bool handled = false;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 3a24a3fc55f5..4711a35e706c 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -427,6 +427,7 @@ SECTIONS
.llvm_bb_addr_map : { *(.llvm_bb_addr_map) }
#endif
+ MODINFO
ELF_DETAILS
DISCARDS