summaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
authorJason Gunthorpe <jgg@nvidia.com>2025-11-04 14:29:59 -0400
committerJoerg Roedel <joerg.roedel@amd.com>2025-11-05 09:07:04 +0100
commit7c5b184db7145fd417785377337bd15c4fe1d0f4 (patch)
treed311910f9d8361d33b520377b75c73bf3b737b71 /include/linux
parentfd714986e4e46effa6697b13d32918fc59608ccb (diff)
genpt: Generic Page Table base API
The generic API is intended to be separated from the implementation of page table algorithms. It contains only accessors for walking and manipulating the table and helpers that are useful for building an implementation. Memory management is not in the generic API, but part of the implementation. Using a multi-compilation approach the implementation module would include headers in this order: common.h defs_FMT.h pt_defs.h FMT.h pt_common.h IMPLEMENTATION.h Where each compilation unit would have a combination of FMT and IMPLEMENTATION to produce a per-format per-implementation module. The API is designed so that the format headers have minimal logic, and default implementations are provided if the format doesn't include one. Generally formats provide their code via an inline function using the pattern: static inline FMTpt_XX(..) {} #define pt_XX FMTpt_XX The common code then enforces a function signature so that there is no drift in function arguments, or accidental polymorphic functions (as has been slightly troublesome in mm). Use of function-like #defines are avoided in the format even though many of the functions are small enough. Provide kdocs for the API surface. This is enough to implement the 8 initial format variations with all of their features: * Entries comprised of contiguous blocks of IO PTEs for larger page sizes (AMDv1, ARMv8) * Multi-level tables, up to 6 levels. Runtime selected top level * The size of the top table level can be selected at runtime (ARM's concatenated tables) * The number of levels in the table can optionally increase dynamically during map (AMDv1) * Optional leaf entries at any level * 32 bit/64 bit virtual and output addresses, using every bit * Sign extended addressing (x86) * Dirty tracking A basic simple format takes about 200 lines to declare the require inline functions. Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com> Reviewed-by: Samiullah Khawaja <skhawaja@google.com> Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com> Tested-by: Pasha Tatashin <pasha.tatashin@soleen.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/generic_pt/common.h135
1 files changed, 135 insertions, 0 deletions
diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h
new file mode 100644
index 000000000000..e69a75511313
--- /dev/null
+++ b/include/linux/generic_pt/common.h
@@ -0,0 +1,135 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#ifndef __GENERIC_PT_COMMON_H
+#define __GENERIC_PT_COMMON_H
+
+#include <linux/types.h>
+#include <linux/build_bug.h>
+#include <linux/bits.h>
+
+/**
+ * DOC: Generic Radix Page Table
+ *
+ * Generic Radix Page Table is a set of functions and helpers to efficiently
+ * parse radix style page tables typically seen in HW implementations. The
+ * interface is built to deliver similar code generation as the mm's pte/pmd/etc
+ * system by fully inlining the exact code required to handle each table level.
+ *
+ * Like the mm subsystem each format contributes its parsing implementation
+ * under common names and the common code implements the required algorithms.
+ *
+ * The system is divided into three logical levels:
+ *
+ * - The page table format and its manipulation functions
+ * - Generic helpers to give a consistent API regardless of underlying format
+ * - An algorithm implementation (e.g. IOMMU/DRM/KVM/MM)
+ *
+ * Multiple implementations are supported. The intention is to have the generic
+ * format code be re-usable for whatever specialized implementation is required.
+ * The generic code is solely about the format of the radix tree; it does not
+ * include memory allocation or higher level decisions that are left for the
+ * implementation.
+ *
+ * The generic framework supports a superset of functions across many HW
+ * implementations:
+ *
+ * - Entries comprised of contiguous blocks of IO PTEs for larger page sizes
+ * - Multi-level tables, up to 6 levels. Runtime selected top level
+ * - Runtime variable table level size (ARM's concatenated tables)
+ * - Expandable top level allowing dynamic sizing of table levels
+ * - Optional leaf entries at any level
+ * - 32-bit/64-bit virtual and output addresses, using every address bit
+ * - Dirty tracking
+ * - Sign extended addressing
+ */
+
+/**
+ * struct pt_common - struct for all page table implementations
+ */
+struct pt_common {
+ /**
+ * @top_of_table: Encodes the table top pointer and the top level in a
+ * single value. Must use READ_ONCE/WRITE_ONCE to access it. The lower
+ * bits of the aligned table pointer are used for the level.
+ */
+ uintptr_t top_of_table;
+ /**
+ * @max_oasz_lg2: Maximum number of bits the OA can contain. Upper bits
+ * must be zero. This may be less than what the page table format
+ * supports, but must not be more.
+ */
+ u8 max_oasz_lg2;
+ /**
+ * @max_vasz_lg2: Maximum number of bits the VA can contain. Upper bits
+ * are 0 or 1 depending on pt_full_va_prefix(). This may be less than
+ * what the page table format supports, but must not be more. When
+ * PT_FEAT_DYNAMIC_TOP is set this reflects the maximum VA capability.
+ */
+ u8 max_vasz_lg2;
+ /**
+ * @features: Bitmap of `enum pt_features`
+ */
+ unsigned int features;
+};
+
+/* Encoding parameters for top_of_table */
+enum {
+ PT_TOP_LEVEL_BITS = 3,
+ PT_TOP_LEVEL_MASK = GENMASK(PT_TOP_LEVEL_BITS - 1, 0),
+};
+
+/**
+ * enum pt_features - Features turned on in the table. Each symbol is a bit
+ * position.
+ */
+enum pt_features {
+ /**
+ * @PT_FEAT_FULL_VA: The table can span the full VA range from 0 to
+ * PT_VADDR_MAX.
+ */
+ PT_FEAT_FULL_VA,
+ /**
+ * @PT_FEAT_DYNAMIC_TOP: The table's top level can be increased
+ * dynamically during map. This requires HW support for atomically
+ * setting both the table top pointer and the starting table level.
+ */
+ PT_FEAT_DYNAMIC_TOP,
+ /**
+ * @PT_FEAT_SIGN_EXTEND: The top most bit of the valid VA range sign
+ * extends up to the full pt_vaddr_t. This divides the page table into
+ * three VA ranges::
+ *
+ * 0 -> 2^N - 1 Lower
+ * 2^N -> (MAX - 2^N - 1) Non-Canonical
+ * MAX - 2^N -> MAX Upper
+ *
+ * In this mode pt_common::max_vasz_lg2 includes the sign bit and the
+ * upper bits that don't fall within the translation are just validated.
+ *
+ * If not set there is no sign extension and valid VA goes from 0 to 2^N
+ * - 1.
+ */
+ PT_FEAT_SIGN_EXTEND,
+ /**
+ * @PT_FEAT_FLUSH_RANGE: IOTLB maintenance is done by flushing IOVA
+ * ranges which will clean out any walk cache or any IOPTE fully
+ * contained by the range. The optimization objective is to minimize the
+ * number of flushes even if ranges include IOVA gaps that do not need
+ * to be flushed.
+ */
+ PT_FEAT_FLUSH_RANGE,
+ /**
+ * @PT_FEAT_FLUSH_RANGE_NO_GAPS: Like PT_FEAT_FLUSH_RANGE except that
+ * the optimization objective is to only flush IOVA that has been
+ * changed. This mode is suitable for cases like hypervisor shadowing
+ * where flushing unchanged ranges may cause the hypervisor to reparse
+ * significant amount of page table.
+ */
+ PT_FEAT_FLUSH_RANGE_NO_GAPS,
+ /* private: */
+ PT_FEAT_FMT_START,
+};
+
+#endif