summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/dma/direct.c240
-rw-r--r--kernel/dma/pool.c4
-rw-r--r--kernel/dma/swiotlb.c50
-rw-r--r--kernel/events/core.c41
-rw-r--r--kernel/exit.c97
-rw-r--r--kernel/fork.c49
-rw-r--r--kernel/futex/core.c2
-rw-r--r--kernel/irq/generic-chip.c2
-rw-r--r--kernel/irq/manage.c8
-rw-r--r--kernel/irq/msi.c792
-rw-r--r--kernel/kallsyms.c1
-rw-r--r--kernel/kexec_core.c2
-rw-r--r--kernel/kthread.c89
-rw-r--r--kernel/livepatch/core.c29
-rw-r--r--kernel/livepatch/shadow.c6
-rw-r--r--kernel/module.c8
-rw-r--r--kernel/params.c4
-rw-r--r--kernel/profile.c73
-rw-r--r--kernel/ptrace.c2
-rw-r--r--kernel/rcu/rcutorture.c7
-rw-r--r--kernel/sched/core.c16
-rw-r--r--kernel/signal.c61
-rw-r--r--kernel/sys.c63
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl.c3
-rw-r--r--kernel/time/clocksource.c52
-rw-r--r--kernel/trace/Kconfig14
-rw-r--r--kernel/trace/blktrace.c20
-rw-r--r--kernel/trace/ftrace.c34
-rw-r--r--kernel/trace/ring_buffer.c7
-rw-r--r--kernel/trace/trace.c88
-rw-r--r--kernel/trace/trace.h83
-rw-r--r--kernel/trace/trace_eprobe.c38
-rw-r--r--kernel/trace/trace_events.c12
-rw-r--r--kernel/trace/trace_events_filter.c139
-rw-r--r--kernel/trace/trace_events_hist.c69
-rw-r--r--kernel/trace/trace_events_inject.c11
-rw-r--r--kernel/trace/trace_events_synth.c15
-rw-r--r--kernel/trace/trace_events_trigger.c424
-rw-r--r--kernel/trace/trace_hwlat.c6
-rw-r--r--kernel/trace/trace_kprobe.c43
-rw-r--r--kernel/trace/trace_osnoise.c26
-rw-r--r--kernel/trace/trace_output.c4
-rw-r--r--kernel/trace/trace_probe.c5
-rw-r--r--kernel/trace/trace_syscalls.c6
-rw-r--r--kernel/trace/trace_uprobe.c39
-rw-r--r--kernel/tsacct.c7
47 files changed, 1914 insertions, 878 deletions
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 4c6c5e0635e3..50f48e9e4598 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -75,15 +75,45 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
}
+static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size)
+{
+ if (!force_dma_unencrypted(dev))
+ return 0;
+ return set_memory_decrypted((unsigned long)vaddr, 1 << get_order(size));
+}
+
+static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size)
+{
+ int ret;
+
+ if (!force_dma_unencrypted(dev))
+ return 0;
+ ret = set_memory_encrypted((unsigned long)vaddr, 1 << get_order(size));
+ if (ret)
+ pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n");
+ return ret;
+}
+
static void __dma_direct_free_pages(struct device *dev, struct page *page,
size_t size)
{
- if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) &&
- swiotlb_free(dev, page, size))
+ if (swiotlb_free(dev, page, size))
return;
dma_free_contiguous(dev, page, size);
}
+static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size)
+{
+ struct page *page = swiotlb_alloc(dev, size);
+
+ if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+ swiotlb_free(dev, page, size);
+ return NULL;
+ }
+
+ return page;
+}
+
static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
gfp_t gfp)
{
@@ -93,18 +123,11 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
WARN_ON_ONCE(!PAGE_ALIGNED(size));
+ if (is_swiotlb_for_alloc(dev))
+ return dma_direct_alloc_swiotlb(dev, size);
+
gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
&phys_limit);
- if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) &&
- is_swiotlb_for_alloc(dev)) {
- page = swiotlb_alloc(dev, size);
- if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
- __dma_direct_free_pages(dev, page, size);
- return NULL;
- }
- return page;
- }
-
page = dma_alloc_contiguous(dev, size, gfp);
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
dma_free_contiguous(dev, page, size);
@@ -133,6 +156,15 @@ again:
return page;
}
+/*
+ * Check if a potentially blocking operation needs to dip into the atomic
+ * pools for the given device/gfp.
+ */
+static bool dma_direct_use_pool(struct device *dev, gfp_t gfp)
+{
+ return !gfpflags_allow_blocking(gfp) && !is_swiotlb_for_alloc(dev);
+}
+
static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp)
{
@@ -140,6 +172,9 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
u64 phys_mask;
void *ret;
+ if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)))
+ return NULL;
+
gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
&phys_mask);
page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok);
@@ -149,64 +184,103 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
return ret;
}
+static void *dma_direct_alloc_no_mapping(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp)
+{
+ struct page *page;
+
+ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
+ if (!page)
+ return NULL;
+
+ /* remove any dirty cache lines on the kernel alias */
+ if (!PageHighMem(page))
+ arch_dma_prep_coherent(page, size);
+
+ /* return the page pointer as the opaque cookie */
+ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
+ return page;
+}
+
void *dma_direct_alloc(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
+ bool remap = false, set_uncached = false;
struct page *page;
void *ret;
- int err;
size = PAGE_ALIGN(size);
if (attrs & DMA_ATTR_NO_WARN)
gfp |= __GFP_NOWARN;
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
- !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) {
- page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
- if (!page)
- return NULL;
- /* remove any dirty cache lines on the kernel alias */
- if (!PageHighMem(page))
- arch_dma_prep_coherent(page, size);
- *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
- /* return the page pointer as the opaque cookie */
- return page;
- }
+ !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev))
+ return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp);
- if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
- !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
- !dev_is_dma_coherent(dev) &&
- !is_swiotlb_for_alloc(dev))
- return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
+ if (!dev_is_dma_coherent(dev)) {
+ /*
+ * Fall back to the arch handler if it exists. This should
+ * eventually go away.
+ */
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
+ !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+ !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
+ !is_swiotlb_for_alloc(dev))
+ return arch_dma_alloc(dev, size, dma_handle, gfp,
+ attrs);
- if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
- !dev_is_dma_coherent(dev))
- return dma_alloc_from_global_coherent(dev, size, dma_handle);
+ /*
+ * If there is a global pool, always allocate from it for
+ * non-coherent devices.
+ */
+ if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL))
+ return dma_alloc_from_global_coherent(dev, size,
+ dma_handle);
+
+ /*
+ * Otherwise remap if the architecture is asking for it. But
+ * given that remapping memory is a blocking operation we'll
+ * instead have to dip into the atomic pools.
+ */
+ remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP);
+ if (remap) {
+ if (dma_direct_use_pool(dev, gfp))
+ return dma_direct_alloc_from_pool(dev, size,
+ dma_handle, gfp);
+ } else {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED))
+ return NULL;
+ set_uncached = true;
+ }
+ }
/*
- * Remapping or decrypting memory may block. If either is required and
- * we can't block, allocate the memory from the atomic pools.
- * If restricted DMA (i.e., is_swiotlb_for_alloc) is required, one must
- * set up another device coherent pool by shared-dma-pool and use
- * dma_alloc_from_dev_coherent instead.
+ * Decrypting memory may block, so allocate the memory from the atomic
+ * pools if we can't block.
*/
- if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
- !gfpflags_allow_blocking(gfp) &&
- (force_dma_unencrypted(dev) ||
- (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- !dev_is_dma_coherent(dev))) &&
- !is_swiotlb_for_alloc(dev))
+ if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
/* we always manually zero the memory once we are done */
page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
if (!page)
return NULL;
+ if (PageHighMem(page)) {
+ /*
+ * Depending on the cma= arguments and per-arch setup,
+ * dma_alloc_contiguous could return highmem pages.
+ * Without remapping there is no way to return them here, so
+ * log an error and fail.
+ */
+ if (!IS_ENABLED(CONFIG_DMA_REMAP)) {
+ dev_info(dev, "Rejecting highmem page from CMA.\n");
+ goto out_free_pages;
+ }
+ remap = true;
+ set_uncached = false;
+ }
- if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- !dev_is_dma_coherent(dev)) ||
- (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) {
+ if (remap) {
/* remove any dirty cache lines on the kernel alias */
arch_dma_prep_coherent(page, size);
@@ -216,56 +290,27 @@ void *dma_direct_alloc(struct device *dev, size_t size,
__builtin_return_address(0));
if (!ret)
goto out_free_pages;
- if (force_dma_unencrypted(dev)) {
- err = set_memory_decrypted((unsigned long)ret,
- 1 << get_order(size));
- if (err)
- goto out_free_pages;
- }
- memset(ret, 0, size);
- goto done;
- }
-
- if (PageHighMem(page)) {
- /*
- * Depending on the cma= arguments and per-arch setup
- * dma_alloc_contiguous could return highmem pages.
- * Without remapping there is no way to return them here,
- * so log an error and fail.
- */
- dev_info(dev, "Rejecting highmem page from CMA.\n");
- goto out_free_pages;
- }
-
- ret = page_address(page);
- if (force_dma_unencrypted(dev)) {
- err = set_memory_decrypted((unsigned long)ret,
- 1 << get_order(size));
- if (err)
+ } else {
+ ret = page_address(page);
+ if (dma_set_decrypted(dev, ret, size))
goto out_free_pages;
}
memset(ret, 0, size);
- if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
- !dev_is_dma_coherent(dev)) {
+ if (set_uncached) {
arch_dma_prep_coherent(page, size);
ret = arch_dma_set_uncached(ret, size);
if (IS_ERR(ret))
goto out_encrypt_pages;
}
-done:
+
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return ret;
out_encrypt_pages:
- if (force_dma_unencrypted(dev)) {
- err = set_memory_encrypted((unsigned long)page_address(page),
- 1 << get_order(size));
- /* If memory cannot be re-encrypted, it must be leaked */
- if (err)
- return NULL;
- }
+ if (dma_set_encrypted(dev, page_address(page), size))
+ return NULL;
out_free_pages:
__dma_direct_free_pages(dev, page, size);
return NULL;
@@ -304,13 +349,14 @@ void dma_direct_free(struct device *dev, size_t size,
dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
return;
- if (force_dma_unencrypted(dev))
- set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
-
- if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr))
+ if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) {
vunmap(cpu_addr);
- else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
- arch_dma_clear_uncached(cpu_addr, size);
+ } else {
+ if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
+ arch_dma_clear_uncached(cpu_addr, size);
+ if (dma_set_encrypted(dev, cpu_addr, 1 << page_order))
+ return;
+ }
__dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size);
}
@@ -321,9 +367,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
struct page *page;
void *ret;
- if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
- force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) &&
- !is_swiotlb_for_alloc(dev))
+ if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
page = __dma_direct_alloc_pages(dev, size, gfp);
@@ -341,11 +385,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
}
ret = page_address(page);
- if (force_dma_unencrypted(dev)) {
- if (set_memory_decrypted((unsigned long)ret,
- 1 << get_order(size)))
- goto out_free_pages;
- }
+ if (dma_set_decrypted(dev, ret, size))
+ goto out_free_pages;
memset(ret, 0, size);
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return page;
@@ -366,9 +407,8 @@ void dma_direct_free_pages(struct device *dev, size_t size,
dma_free_from_pool(dev, vaddr, size))
return;
- if (force_dma_unencrypted(dev))
- set_memory_encrypted((unsigned long)vaddr, 1 << page_order);
-
+ if (dma_set_encrypted(dev, vaddr, 1 << page_order))
+ return;
__dma_direct_free_pages(dev, page, size);
}
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 5f84e6cdb78e..4d40dcce7604 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -203,7 +203,7 @@ static int __init dma_atomic_pool_init(void)
GFP_KERNEL);
if (!atomic_pool_kernel)
ret = -ENOMEM;
- if (IS_ENABLED(CONFIG_ZONE_DMA)) {
+ if (has_managed_dma()) {
atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size,
GFP_KERNEL | GFP_DMA);
if (!atomic_pool_dma)
@@ -226,7 +226,7 @@ static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
if (prev == NULL) {
if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
return atomic_pool_dma32;
- if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
+ if (atomic_pool_dma && (gfp & GFP_DMA))
return atomic_pool_dma;
return atomic_pool_kernel;
}
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 8e840fbbed7c..f1e7ea160b43 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -50,6 +50,7 @@
#include <asm/io.h>
#include <asm/dma.h>
+#include <linux/io.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/iommu-helper.h>
@@ -72,6 +73,8 @@ enum swiotlb_force swiotlb_force;
struct io_tlb_mem io_tlb_default_mem;
+phys_addr_t swiotlb_unencrypted_base;
+
/*
* Max segment that we can provide which (if pages are contingous) will
* not be bounced (unless SWIOTLB_FORCE is set).
@@ -156,6 +159,34 @@ static inline unsigned long nr_slots(u64 val)
}
/*
+ * Remap swiotlb memory in the unencrypted physical address space
+ * when swiotlb_unencrypted_base is set. (e.g. for Hyper-V AMD SEV-SNP
+ * Isolation VMs).
+ */
+#ifdef CONFIG_HAS_IOMEM
+static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
+{
+ void *vaddr = NULL;
+
+ if (swiotlb_unencrypted_base) {
+ phys_addr_t paddr = mem->start + swiotlb_unencrypted_base;
+
+ vaddr = memremap(paddr, bytes, MEMREMAP_WB);
+ if (!vaddr)
+ pr_err("Failed to map the unencrypted memory %pa size %lx.\n",
+ &paddr, bytes);
+ }
+
+ return vaddr;
+}
+#else
+static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
+{
+ return NULL;
+}
+#endif
+
+/*
* Early SWIOTLB allocation may be too early to allow an architecture to
* perform the desired operations. This function allows the architecture to
* call SWIOTLB when the operations are possible. It needs to be called
@@ -172,7 +203,12 @@ void __init swiotlb_update_mem_attributes(void)
vaddr = phys_to_virt(mem->start);
bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
- memset(vaddr, 0, bytes);
+
+ mem->vaddr = swiotlb_mem_remap(mem, bytes);
+ if (!mem->vaddr)
+ mem->vaddr = vaddr;
+
+ memset(mem->vaddr, 0, bytes);
}
static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
@@ -196,7 +232,17 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
mem->slots[i].alloc_size = 0;
}
+
+ /*
+ * If swiotlb_unencrypted_base is set, the bounce buffer memory will
+ * be remapped and cleared in swiotlb_update_mem_attributes.
+ */
+ if (swiotlb_unencrypted_base)
+ return;
+
memset(vaddr, 0, bytes);
+ mem->vaddr = vaddr;
+ return;
}
int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
@@ -371,7 +417,7 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
phys_addr_t orig_addr = mem->slots[index].orig_addr;
size_t alloc_size = mem->slots[index].alloc_size;
unsigned long pfn = PFN_DOWN(orig_addr);
- unsigned char *vaddr = phys_to_virt(tlb_addr);
+ unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start;
unsigned int tlb_offset, orig_addr_offset;
if (orig_addr == INVALID_PHYS_ADDR)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1362b9b770d8..fc18664f49b0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6525,26 +6525,43 @@ static void perf_pending_event(struct irq_work *entry)
perf_swevent_put_recursion_context(rctx);
}
-/*
- * We assume there is only KVM supporting the callbacks.
- * Later on, we might change it to a list if there is
- * another virtualization implementation supporting the callbacks.
- */
-struct perf_guest_info_callbacks *perf_guest_cbs;
+#ifdef CONFIG_GUEST_PERF_EVENTS
+struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
+
+DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state);
+DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
+DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);
-int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
- perf_guest_cbs = cbs;
- return 0;
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
+ return;
+
+ rcu_assign_pointer(perf_guest_cbs, cbs);
+ static_call_update(__perf_guest_state, cbs->state);
+ static_call_update(__perf_guest_get_ip, cbs->get_ip);
+
+ /* Implementing ->handle_intel_pt_intr is optional. */
+ if (cbs->handle_intel_pt_intr)
+ static_call_update(__perf_guest_handle_intel_pt_intr,
+ cbs->handle_intel_pt_intr);
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
-int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
- perf_guest_cbs = NULL;
- return 0;
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
+ return;
+
+ rcu_assign_pointer(perf_guest_cbs, NULL);
+ static_call_update(__perf_guest_state, (void *)&__static_call_return0);
+ static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0);
+ static_call_update(__perf_guest_handle_intel_pt_intr,
+ (void *)&__static_call_return0);
+ synchronize_rcu();
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+#endif
static void
perf_output_sample_regs(struct perf_output_handle *handle,
diff --git a/kernel/exit.c b/kernel/exit.c
index f702a6a63686..b00a25bb4ab9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -116,7 +116,7 @@ static void __exit_signal(struct task_struct *tsk)
* then notify it:
*/
if (sig->notify_count > 0 && !--sig->notify_count)
- wake_up_process(sig->group_exit_task);
+ wake_up_process(sig->group_exec_task);
if (tsk == sig->curr_target)
sig->curr_target = next_thread(tsk);
@@ -697,7 +697,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
- wake_up_process(tsk->signal->group_exit_task);
+ wake_up_process(tsk->signal->group_exec_task);
write_unlock_irq(&tasklist_lock);
list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
@@ -735,37 +735,22 @@ void __noreturn do_exit(long code)
struct task_struct *tsk = current;
int group_dead;
- /*
- * We can get here from a kernel oops, sometimes with preemption off.
- * Start by checking for critical errors.
- * Then fix up important state like USER_DS and preemption.
- * Then do everything else.
- */
-
WARN_ON(blk_needs_flush_plug(tsk));
- if (unlikely(in_interrupt()))
- panic("Aiee, killing interrupt handler!");
- if (unlikely(!tsk->pid))
- panic("Attempted to kill the idle task!");
-
/*
- * If do_exit is called because this processes oopsed, it's possible
+ * If do_exit is called because this process oopsed, it's possible
* that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
* continuing. Amongst other possible reasons, this is to prevent
* mm_release()->clear_child_tid() from writing to a user-controlled
* kernel address.
+ *
+ * On up-to-date architectures force_uaccess_begin is a noop. On
+ * architectures that still have set_fs/get_fs, in addition to handling
+ * oopses this handles kernel threads that run as set_fs(KERNEL_DS) by
+ * default.
*/
force_uaccess_begin();
- if (unlikely(in_atomic())) {
- pr_info("note: %s[%d] exited with preempt_count %d\n",
- current->comm, task_pid_nr(current),
- preempt_count());
- preempt_count_set(PREEMPT_ENABLED);
- }
-
- profile_task_exit(tsk);
kcov_task_exit(tsk);
coredump_task_exit(tsk);
@@ -773,17 +758,6 @@ void __noreturn do_exit(long code)
validate_creds_for_do_exit(tsk);
- /*
- * We're taking recursive faults here in do_exit. Safest is to just
- * leave this task alone and wait for reboot.
- */
- if (unlikely(tsk->flags & PF_EXITING)) {
- pr_alert("Fixing recursive fault but reboot is needed!\n");
- futex_exit_recursive(tsk);
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule();
- }
-
io_uring_files_cancel();
exit_signals(tsk); /* sets PF_EXITING */
@@ -882,16 +856,46 @@ void __noreturn do_exit(long code)
lockdep_free_task(tsk);
do_task_dead();
}
-EXPORT_SYMBOL_GPL(do_exit);
-void complete_and_exit(struct completion *comp, long code)
+void __noreturn make_task_dead(int signr)
{
- if (comp)
- complete(comp);
+ /*
+ * Take the task off the cpu after something catastrophic has
+ * happened.
+ *
+ * We can get here from a kernel oops, sometimes with preemption off.
+ * Start by checking for critical errors.
+ * Then fix up important state like USER_DS and preemption.
+ * Then do everything else.
+ */
+ struct task_struct *tsk = current;
- do_exit(code);
+ if (unlikely(in_interrupt()))
+ panic("Aiee, killing interrupt handler!");
+ if (unlikely(!tsk->pid))
+ panic("Attempted to kill the idle task!");
+
+ if (unlikely(in_atomic())) {
+ pr_info("note: %s[%d] exited with preempt_count %d\n",
+ current->comm, task_pid_nr(current),
+ preempt_count());
+ preempt_count_set(PREEMPT_ENABLED);
+ }
+
+ /*
+ * We're taking recursive faults here in make_task_dead. Safest is to just
+ * leave this task alone and wait for reboot.
+ */
+ if (unlikely(tsk->flags & PF_EXITING)) {
+ pr_alert("Fixing recursive fault but reboot is needed!\n");
+ futex_exit_recursive(tsk);
+ tsk->exit_state = EXIT_DEAD;
+ refcount_inc(&tsk->rcu_users);
+ do_task_dead();
+ }
+
+ do_exit(signr);
}
-EXPORT_SYMBOL(complete_and_exit);
SYSCALL_DEFINE1(exit, int, error_code)
{
@@ -907,17 +911,19 @@ do_group_exit(int exit_code)
{
struct signal_struct *sig = current->signal;
- BUG_ON(exit_code & 0x80); /* core dumps don't get here */
-
- if (signal_group_exit(sig))
+ if (sig->flags & SIGNAL_GROUP_EXIT)
exit_code = sig->group_exit_code;
+ else if (sig->group_exec_task)
+ exit_code = 0;
else if (!thread_group_empty(current)) {
struct sighand_struct *const sighand = current->sighand;
spin_lock_irq(&sighand->siglock);
- if (signal_group_exit(sig))
+ if (sig->flags & SIGNAL_GROUP_EXIT)
/* Another thread got here before we took the lock. */
exit_code = sig->group_exit_code;
+ else if (sig->group_exec_task)
+ exit_code = 0;
else {
sig->group_exit_code = exit_code;
sig->flags = SIGNAL_GROUP_EXIT;
@@ -1012,7 +1018,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
return 0;
if (unlikely(wo->wo_flags & WNOWAIT)) {
- status = p->exit_code;
+ status = (p->signal->flags & SIGNAL_GROUP_EXIT)
+ ? p->signal->group_exit_code : p->exit_code;
get_task_struct(p);
read_unlock(&tasklist_lock);
sched_annotate_sleep();
diff --git a/kernel/fork.c b/kernel/fork.c
index 3244cc56b697..d75a528f7b21 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -42,6 +42,7 @@
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
@@ -365,12 +366,14 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
*new = data_race(*orig);
INIT_LIST_HEAD(&new->anon_vma_chain);
new->vm_next = new->vm_prev = NULL;
+ dup_vma_anon_name(orig, new);
}
return new;
}
void vm_area_free(struct vm_area_struct *vma)
{
+ free_vma_anon_name(vma);
kmem_cache_free(vm_area_cachep, vma);
}
@@ -754,9 +757,7 @@ void __put_task_struct(struct task_struct *tsk)
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
sched_core_free(tsk);
-
- if (!profile_handoff_task(tsk))
- free_task(tsk);
+ free_task(tsk);
}
EXPORT_SYMBOL_GPL(__put_task_struct);
@@ -950,7 +951,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
- tsk->pf_io_worker = NULL;
+ tsk->worker_private = NULL;
account_kernel_stack(tsk, 1);
@@ -1556,32 +1557,6 @@ out:
return error;
}
-static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
-{
-#ifdef CONFIG_BLOCK
- struct io_context *ioc = current->io_context;
- struct io_context *new_ioc;
-
- if (!ioc)
- return 0;
- /*
- * Share io context with parent, if CLONE_IO is set
- */
- if (clone_flags & CLONE_IO) {
- ioc_task_link(ioc);
- tsk->io_context = ioc;
- } else if (ioprio_valid(ioc->ioprio)) {
- new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
- if (unlikely(!new_ioc))
- return -ENOMEM;
-
- new_ioc->ioprio = ioc->ioprio;
- put_io_context(new_ioc);
- }
-#endif
- return 0;
-}
-
static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
{
struct sighand_struct *sig;
@@ -2032,12 +2007,6 @@ static __latent_entropy struct task_struct *copy_process(
siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
}
- /*
- * This _must_ happen before we call free_task(), i.e. before we jump
- * to any of the bad_fork_* labels. This is to avoid freeing
- * p->set_child_tid which is (ab)used as a kthread's data pointer for
- * kernel threads (PF_KTHREAD).
- */
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
/*
* Clear TID on mm_release()?
@@ -2118,12 +2087,16 @@ static __latent_entropy struct task_struct *copy_process(
p->io_context = NULL;
audit_set_context(p, NULL);
cgroup_fork(p);
+ if (p->flags & PF_KTHREAD) {
+ if (!set_kthread_struct(p))
+ goto bad_fork_cleanup_delayacct;
+ }
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
- goto bad_fork_cleanup_threadgroup_lock;
+ goto bad_fork_cleanup_delayacct;
}
#endif
#ifdef CONFIG_CPUSETS
@@ -2460,8 +2433,8 @@ bad_fork_cleanup_policy:
lockdep_free_task(p);
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
-bad_fork_cleanup_threadgroup_lock:
#endif
+bad_fork_cleanup_delayacct:
delayacct_tsk_free(p);
bad_fork_cleanup_count:
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 926c2bb752bc..51dd822a8060 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -1031,7 +1031,7 @@ static void futex_cleanup(struct task_struct *tsk)
* actually finished the futex cleanup. The worst case for this is that the
* waiter runs through the wait loop until the state becomes visible.
*
- * This is called from the recursive fault handling path in do_exit().
+ * This is called from the recursive fault handling path in make_task_dead().
*
* This is best effort. Either the futex exit code has run already or
* not. If the OWNER_DIED bit has been set on the futex then the waiter can
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 6f29bf4c8515..f0862eb6b506 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -451,7 +451,7 @@ static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
}
-struct irq_domain_ops irq_generic_chip_ops = {
+const struct irq_domain_ops irq_generic_chip_ops = {
.map = irq_map_generic_chip,
.unmap = irq_unmap_generic_chip,
.xlate = irq_domain_xlate_onetwocell,
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7405e384e5ed..f23ffd30385b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -486,7 +486,8 @@ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask)
}
EXPORT_SYMBOL_GPL(irq_force_affinity);
-int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
+int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m,
+ bool setaffinity)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
@@ -495,12 +496,11 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
return -EINVAL;
desc->affinity_hint = m;
irq_put_desc_unlock(desc, flags);
- /* set the initial affinity to prevent every interrupt being on CPU0 */
- if (m)
+ if (m && setaffinity)
__irq_set_affinity(irq, m, false);
return 0;
}
-EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
+EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint);
static void irq_affinity_notify(struct work_struct *work)
{
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 7f350ae59c5f..2bdfce5edafd 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -14,12 +14,15 @@
#include <linux/irqdomain.h>
#include <linux/msi.h>
#include <linux/slab.h>
+#include <linux/sysfs.h>
#include <linux/pci.h>
#include "internals.h"
+static inline int msi_sysfs_create_group(struct device *dev);
+
/**
- * alloc_msi_entry - Allocate an initialized msi_desc
+ * msi_alloc_desc - Allocate an initialized msi_desc
* @dev: Pointer to the device for which this is allocated
* @nvec: The number of vectors used in this entry
* @affinity: Optional pointer to an affinity mask array size of @nvec
@@ -29,34 +32,134 @@
*
* Return: pointer to allocated &msi_desc on success or %NULL on failure
*/
-struct msi_desc *alloc_msi_entry(struct device *dev, int nvec,
- const struct irq_affinity_desc *affinity)
+static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec,
+ const struct irq_affinity_desc *affinity)
{
- struct msi_desc *desc;
+ struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
- desc = kzalloc(sizeof(*desc), GFP_KERNEL);
if (!desc)
return NULL;
- INIT_LIST_HEAD(&desc->list);
desc->dev = dev;
desc->nvec_used = nvec;
if (affinity) {
- desc->affinity = kmemdup(affinity,
- nvec * sizeof(*desc->affinity), GFP_KERNEL);
+ desc->affinity = kmemdup(affinity, nvec * sizeof(*desc->affinity), GFP_KERNEL);
if (!desc->affinity) {
kfree(desc);
return NULL;
}
}
-
return desc;
}
-void free_msi_entry(struct msi_desc *entry)
+static void msi_free_desc(struct msi_desc *desc)
{
- kfree(entry->affinity);
- kfree(entry);
+ kfree(desc->affinity);
+ kfree(desc);
+}
+
+static int msi_insert_desc(struct msi_device_data *md, struct msi_desc *desc, unsigned int index)
+{
+ int ret;
+
+ desc->msi_index = index;
+ ret = xa_insert(&md->__store, index, desc, GFP_KERNEL);
+ if (ret)
+ msi_free_desc(desc);
+ return ret;
+}
+
+/**
+ * msi_add_msi_desc - Allocate and initialize an MSI descriptor
+ * @dev: Pointer to the device for which the descriptor is allocated
+ * @init_desc: Pointer to an MSI descriptor to initialize the new descriptor
+ *
+ * Return: 0 on success or an appropriate failure code.
+ */
+int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc)
+{
+ struct msi_desc *desc;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ desc = msi_alloc_desc(dev, init_desc->nvec_used, init_desc->affinity);
+ if (!desc)
+ return -ENOMEM;
+
+ /* Copy type specific data to the new descriptor. */
+ desc->pci = init_desc->pci;
+ return msi_insert_desc(dev->msi.data, desc, init_desc->msi_index);
+}
+
+/**
+ * msi_add_simple_msi_descs - Allocate and initialize MSI descriptors
+ * @dev: Pointer to the device for which the descriptors are allocated
+ * @index: Index for the first MSI descriptor
+ * @ndesc: Number of descriptors to allocate
+ *
+ * Return: 0 on success or an appropriate failure code.
+ */
+static int msi_add_simple_msi_descs(struct device *dev, unsigned int index, unsigned int ndesc)
+{
+ unsigned int idx, last = index + ndesc - 1;
+ struct msi_desc *desc;
+ int ret;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ for (idx = index; idx <= last; idx++) {
+ desc = msi_alloc_desc(dev, 1, NULL);
+ if (!desc)
+ goto fail_mem;
+ ret = msi_insert_desc(dev->msi.data, desc, idx);
+ if (ret)
+ goto fail;
+ }
+ return 0;
+
+fail_mem:
+ ret = -ENOMEM;
+fail:
+ msi_free_msi_descs_range(dev, MSI_DESC_NOTASSOCIATED, index, last);
+ return ret;
+}
+
+static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)
+{
+ switch (filter) {
+ case MSI_DESC_ALL:
+ return true;
+ case MSI_DESC_NOTASSOCIATED:
+ return !desc->irq;
+ case MSI_DESC_ASSOCIATED:
+ return !!desc->irq;
+ }
+ WARN_ON_ONCE(1);
+ return false;
+}
+
+/**
+ * msi_free_msi_descs_range - Free MSI descriptors of a device
+ * @dev: Device to free the descriptors
+ * @filter: Descriptor state filter
+ * @first_index: Index to start freeing from
+ * @last_index: Last index to be freed
+ */
+void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter,
+ unsigned int first_index, unsigned int last_index)
+{
+ struct xarray *xa = &dev->msi.data->__store;
+ struct msi_desc *desc;
+ unsigned long idx;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ xa_for_each_range(xa, idx, desc, first_index, last_index) {
+ if (msi_desc_match(desc, filter)) {
+ xa_erase(xa, idx);
+ msi_free_desc(desc);
+ }
+ }
}
void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
@@ -72,139 +175,290 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
}
EXPORT_SYMBOL_GPL(get_cached_msi_msg);
-static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr,
- char *buf)
+static void msi_device_data_release(struct device *dev, void *res)
{
- struct msi_desc *entry;
- bool is_msix = false;
- unsigned long irq;
- int retval;
+ struct msi_device_data *md = res;
- retval = kstrtoul(attr->attr.name, 10, &irq);
- if (retval)
- return retval;
+ WARN_ON_ONCE(!xa_empty(&md->__store));
+ xa_destroy(&md->__store);
+ dev->msi.data = NULL;
+}
- entry = irq_get_msi_desc(irq);
- if (!entry)
- return -ENODEV;
+/**
+ * msi_setup_device_data - Setup MSI device data
+ * @dev: Device for which MSI device data should be set up
+ *
+ * Return: 0 on success, appropriate error code otherwise
+ *
+ * This can be called more than once for @dev. If the MSI device data is
+ * already allocated the call succeeds. The allocated memory is
+ * automatically released when the device is destroyed.
+ */
+int msi_setup_device_data(struct device *dev)
+{
+ struct msi_device_data *md;
+ int ret;
- if (dev_is_pci(dev))
- is_msix = entry->msi_attrib.is_msix;
+ if (dev->msi.data)
+ return 0;
- return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi");
+ md = devres_alloc(msi_device_data_release, sizeof(*md), GFP_KERNEL);
+ if (!md)
+ return -ENOMEM;
+
+ ret = msi_sysfs_create_group(dev);
+ if (ret) {
+ devres_free(md);
+ return ret;
+ }
+
+ xa_init(&md->__store);
+ mutex_init(&md->mutex);
+ dev->msi.data = md;
+ devres_add(dev, md);
+ return 0;
+}
+
+/**
+ * msi_lock_descs - Lock the MSI descriptor storage of a device
+ * @dev: Device to operate on
+ */
+void msi_lock_descs(struct device *dev)
+{
+ mutex_lock(&dev->msi.data->mutex);
+}
+EXPORT_SYMBOL_GPL(msi_lock_descs);
+
+/**
+ * msi_unlock_descs - Unlock the MSI descriptor storage of a device
+ * @dev: Device to operate on
+ */
+void msi_unlock_descs(struct device *dev)
+{
+ /* Invalidate the index which was cached by the iterator */
+ dev->msi.data->__iter_idx = MSI_MAX_INDEX;
+ mutex_unlock(&dev->msi.data->mutex);
+}
+EXPORT_SYMBOL_GPL(msi_unlock_descs);
+
+static struct msi_desc *msi_find_desc(struct msi_device_data *md, enum msi_desc_filter filter)
+{
+ struct msi_desc *desc;
+
+ xa_for_each_start(&md->__store, md->__iter_idx, desc, md->__iter_idx) {
+ if (msi_desc_match(desc, filter))
+ return desc;
+ }
+ md->__iter_idx = MSI_MAX_INDEX;
+ return NULL;
}
/**
- * msi_populate_sysfs - Populate msi_irqs sysfs entries for devices
- * @dev: The device(PCI, platform etc) who will get sysfs entries
+ * msi_first_desc - Get the first MSI descriptor of a device
+ * @dev: Device to operate on
+ * @filter: Descriptor state filter
+ *
+ * Must be called with the MSI descriptor mutex held, i.e. msi_lock_descs()
+ * must be invoked before the call.
*
- * Return attribute_group ** so that specific bus MSI can save it to
- * somewhere during initilizing msi irqs. If devices has no MSI irq,
- * return NULL; if it fails to populate sysfs, return ERR_PTR
+ * Return: Pointer to the first MSI descriptor matching the search
+ * criteria, NULL if none found.
*/
-const struct attribute_group **msi_populate_sysfs(struct device *dev)
-{
- const struct attribute_group **msi_irq_groups;
- struct attribute **msi_attrs, *msi_attr;
- struct device_attribute *msi_dev_attr;
- struct attribute_group *msi_irq_group;
- struct msi_desc *entry;
- int ret = -ENOMEM;
- int num_msi = 0;
- int count = 0;
- int i;
+struct msi_desc *msi_first_desc(struct device *dev, enum msi_desc_filter filter)
+{
+ struct msi_device_data *md = dev->msi.data;
- /* Determine how many msi entries we have */
- for_each_msi_entry(entry, dev)
- num_msi += entry->nvec_used;
- if (!num_msi)
+ if (WARN_ON_ONCE(!md))
return NULL;
- /* Dynamically create the MSI attributes for the device */
- msi_attrs = kcalloc(num_msi + 1, sizeof(void *), GFP_KERNEL);
- if (!msi_attrs)
- return ERR_PTR(-ENOMEM);
-
- for_each_msi_entry(entry, dev) {
- for (i = 0; i < entry->nvec_used; i++) {
- msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL);
- if (!msi_dev_attr)
- goto error_attrs;
- msi_attrs[count] = &msi_dev_attr->attr;
-
- sysfs_attr_init(&msi_dev_attr->attr);
- msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d",
- entry->irq + i);
- if (!msi_dev_attr->attr.name)
- goto error_attrs;
- msi_dev_attr->attr.mode = 0444;
- msi_dev_attr->show = msi_mode_show;
- ++count;
+ lockdep_assert_held(&md->mutex);
+
+ md->__iter_idx = 0;
+ return msi_find_desc(md, filter);
+}
+EXPORT_SYMBOL_GPL(msi_first_desc);
+
+/**
+ * msi_next_desc - Get the next MSI descriptor of a device
+ * @dev: Device to operate on
+ * @filter: Descriptor state filter
+ *
+ * The first invocation of msi_next_desc() has to be preceded by a
+ * successful invocation of msi_first_desc(). Consecutive invocations are
+ * only valid if the previous one was successful. All these operations have
+ * to be done within the same MSI mutex held region.
+ *
+ * Return: Pointer to the next MSI descriptor matching the search
+ * criteria, NULL if none found.
+ */
+struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter)
+{
+ struct msi_device_data *md = dev->msi.data;
+
+ if (WARN_ON_ONCE(!md))
+ return NULL;
+
+ lockdep_assert_held(&md->mutex);
+
+ if (md->__iter_idx >= (unsigned long)MSI_MAX_INDEX)
+ return NULL;
+
+ md->__iter_idx++;
+ return msi_find_desc(md, filter);
+}
+EXPORT_SYMBOL_GPL(msi_next_desc);
+
+/**
+ * msi_get_virq - Return Linux interrupt number of a MSI interrupt
+ * @dev: Device to operate on
+ * @index: MSI interrupt index to look for (0-based)
+ *
+ * Return: The Linux interrupt number on success (> 0), 0 if not found
+ */
+unsigned int msi_get_virq(struct device *dev, unsigned int index)
+{
+ struct msi_desc *desc;
+ unsigned int ret = 0;
+ bool pcimsi;
+
+ if (!dev->msi.data)
+ return 0;
+
+ pcimsi = dev_is_pci(dev) ? to_pci_dev(dev)->msi_enabled : false;
+
+ msi_lock_descs(dev);
+ desc = xa_load(&dev->msi.data->__store, pcimsi ? 0 : index);
+ if (desc && desc->irq) {
+ /*
+ * PCI-MSI has only one descriptor for multiple interrupts.
+ * PCI-MSIX and platform MSI use a descriptor per
+ * interrupt.
+ */
+ if (pcimsi) {
+ if (index < desc->nvec_used)
+ ret = desc->irq + index;
+ } else {
+ ret = desc->irq;
}
}
+ msi_unlock_descs(dev);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(msi_get_virq);
- msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL);
- if (!msi_irq_group)
- goto error_attrs;
- msi_irq_group->name = "msi_irqs";
- msi_irq_group->attrs = msi_attrs;
+#ifdef CONFIG_SYSFS
+static struct attribute *msi_dev_attrs[] = {
+ NULL
+};
- msi_irq_groups = kcalloc(2, sizeof(void *), GFP_KERNEL);
- if (!msi_irq_groups)
- goto error_irq_group;
- msi_irq_groups[0] = msi_irq_group;
+static const struct attribute_group msi_irqs_group = {
+ .name = "msi_irqs",
+ .attrs = msi_dev_attrs,
+};
- ret = sysfs_create_groups(&dev->kobj, msi_irq_groups);
- if (ret)
- goto error_irq_groups;
-
- return msi_irq_groups;
-
-error_irq_groups:
- kfree(msi_irq_groups);
-error_irq_group:
- kfree(msi_irq_group);
-error_attrs:
- count = 0;
- msi_attr = msi_attrs[count];
- while (msi_attr) {
- msi_dev_attr = container_of(msi_attr, struct device_attribute, attr);
- kfree(msi_attr->name);
- kfree(msi_dev_attr);
- ++count;
- msi_attr = msi_attrs[count];
+static inline int msi_sysfs_create_group(struct device *dev)
+{
+ return devm_device_add_group(dev, &msi_irqs_group);
+}
+
+static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ /* MSI vs. MSIX is per device not per interrupt */
+ bool is_msix = dev_is_pci(dev) ? to_pci_dev(dev)->msix_enabled : false;
+
+ return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi");
+}
+
+static void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc)
+{
+ struct device_attribute *attrs = desc->sysfs_attrs;
+ int i;
+
+ if (!attrs)
+ return;
+
+ desc->sysfs_attrs = NULL;
+ for (i = 0; i < desc->nvec_used; i++) {
+ if (attrs[i].show)
+ sysfs_remove_file_from_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name);
+ kfree(attrs[i].attr.name);
}
- kfree(msi_attrs);
- return ERR_PTR(ret);
+ kfree(attrs);
}
+static int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc)
+{
+ struct device_attribute *attrs;
+ int ret, i;
+
+ attrs = kcalloc(desc->nvec_used, sizeof(*attrs), GFP_KERNEL);
+ if (!attrs)
+ return -ENOMEM;
+
+ desc->sysfs_attrs = attrs;
+ for (i = 0; i < desc->nvec_used; i++) {
+ sysfs_attr_init(&attrs[i].attr);
+ attrs[i].attr.name = kasprintf(GFP_KERNEL, "%d", desc->irq + i);
+ if (!attrs[i].attr.name) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ attrs[i].attr.mode = 0444;
+ attrs[i].show = msi_mode_show;
+
+ ret = sysfs_add_file_to_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name);
+ if (ret) {
+ attrs[i].show = NULL;
+ goto fail;
+ }
+ }
+ return 0;
+
+fail:
+ msi_sysfs_remove_desc(dev, desc);
+ return ret;
+}
+
+#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS
/**
- * msi_destroy_sysfs - Destroy msi_irqs sysfs entries for devices
- * @dev: The device(PCI, platform etc) who will remove sysfs entries
- * @msi_irq_groups: attribute_group for device msi_irqs entries
+ * msi_device_populate_sysfs - Populate msi_irqs sysfs entries for a device
+ * @dev: The device (PCI, platform etc) which will get sysfs entries
*/
-void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups)
-{
- struct device_attribute *dev_attr;
- struct attribute **msi_attrs;
- int count = 0;
-
- if (msi_irq_groups) {
- sysfs_remove_groups(&dev->kobj, msi_irq_groups);
- msi_attrs = msi_irq_groups[0]->attrs;
- while (msi_attrs[count]) {
- dev_attr = container_of(msi_attrs[count],
- struct device_attribute, attr);
- kfree(dev_attr->attr.name);
- kfree(dev_attr);
- ++count;
- }
- kfree(msi_attrs);
- kfree(msi_irq_groups[0]);
- kfree(msi_irq_groups);
+int msi_device_populate_sysfs(struct device *dev)
+{
+ struct msi_desc *desc;
+ int ret;
+
+ msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
+ if (desc->sysfs_attrs)
+ continue;
+ ret = msi_sysfs_populate_desc(dev, desc);
+ if (ret)
+ return ret;
}
+ return 0;
}
+/**
+ * msi_device_destroy_sysfs - Destroy msi_irqs sysfs entries for a device
+ * @dev: The device (PCI, platform etc) for which to remove
+ * sysfs entries
+ */
+void msi_device_destroy_sysfs(struct device *dev)
+{
+ struct msi_desc *desc;
+
+ msi_for_each_desc(desc, dev, MSI_DESC_ALL)
+ msi_sysfs_remove_desc(dev, desc);
+}
+#endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */
+#else /* CONFIG_SYSFS */
+static inline int msi_sysfs_create_group(struct device *dev) { return 0; }
+static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) { return 0; }
+static inline void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc) { }
+#endif /* !CONFIG_SYSFS */
+
#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
static inline void irq_chip_write_msi_msg(struct irq_data *data,
struct msi_msg *msg)
@@ -456,43 +710,38 @@ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
}
int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
- int virq, int nvec, msi_alloc_info_t *arg)
+ int virq_base, int nvec, msi_alloc_info_t *arg)
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
struct msi_desc *desc;
- int ret = 0;
+ int ret, virq;
- for_each_msi_entry(desc, dev) {
- /* Don't even try the multi-MSI brain damage. */
- if (WARN_ON(!desc->irq || desc->nvec_used != 1)) {
- ret = -EINVAL;
- break;
- }
+ msi_lock_descs(dev);
+ ret = msi_add_simple_msi_descs(dev, virq_base, nvec);
+ if (ret)
+ goto unlock;
- if (!(desc->irq >= virq && desc->irq < (virq + nvec)))
- continue;
+ for (virq = virq_base; virq < virq_base + nvec; virq++) {
+ desc = xa_load(&dev->msi.data->__store, virq);
+ desc->irq = virq;
ops->set_desc(arg, desc);
- /* Assumes the domain mutex is held! */
- ret = irq_domain_alloc_irqs_hierarchy(domain, desc->irq, 1,
- arg);
+ ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
if (ret)
- break;
+ goto fail;
- irq_set_msi_desc_off(desc->irq, 0, desc);
- }
-
- if (ret) {
- /* Mop up the damage */
- for_each_msi_entry(desc, dev) {
- if (!(desc->irq >= virq && desc->irq < (virq + nvec)))
- continue;
-
- irq_domain_free_irqs_common(domain, desc->irq, 1);
- }
+ irq_set_msi_desc(virq, desc);
}
+ msi_unlock_descs(dev);
+ return 0;
+fail:
+ for (--virq; virq >= virq_base; virq--)
+ irq_domain_free_irqs_common(domain, virq, 1);
+ msi_free_msi_descs_range(dev, MSI_DESC_ALL, virq_base, virq_base + nvec - 1);
+unlock:
+ msi_unlock_descs(dev);
return ret;
}
@@ -531,8 +780,59 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
* Checking the first MSI descriptor is sufficient. MSIX supports
* masking and MSI does so when the can_mask attribute is set.
*/
- desc = first_msi_entry(dev);
- return desc->msi_attrib.is_msix || desc->msi_attrib.can_mask;
+ desc = msi_first_desc(dev, MSI_DESC_ALL);
+ return desc->pci.msi_attrib.is_msix || desc->pci.msi_attrib.can_mask;
+}
+
+static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc,
+ int allocated)
+{
+ switch(domain->bus_token) {
+ case DOMAIN_BUS_PCI_MSI:
+ case DOMAIN_BUS_VMD_MSI:
+ if (IS_ENABLED(CONFIG_PCI_MSI))
+ break;
+ fallthrough;
+ default:
+ return -ENOSPC;
+ }
+
+ /* Let a failed PCI multi MSI allocation retry */
+ if (desc->nvec_used > 1)
+ return 1;
+
+ /* If there was a successful allocation let the caller know */
+ return allocated ? allocated : -ENOSPC;
+}
+
+#define VIRQ_CAN_RESERVE 0x01
+#define VIRQ_ACTIVATE 0x02
+#define VIRQ_NOMASK_QUIRK 0x04
+
+static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflags)
+{
+ struct irq_data *irqd = irq_domain_get_irq_data(domain, virq);
+ int ret;
+
+ if (!(vflags & VIRQ_CAN_RESERVE)) {
+ irqd_clr_can_reserve(irqd);
+ if (vflags & VIRQ_NOMASK_QUIRK)
+ irqd_set_msi_nomask_quirk(irqd);
+ }
+
+ if (!(vflags & VIRQ_ACTIVATE))
+ return 0;
+
+ ret = irq_domain_activate_irq(irqd, vflags & VIRQ_CAN_RESERVE);
+ if (ret)
+ return ret;
+ /*
+ * If the interrupt uses reservation mode, clear the activated bit
+ * so request_irq() will assign the final vector.
+ */
+ if (vflags & VIRQ_CAN_RESERVE)
+ irqd_clr_activated(irqd);
+ return 0;
}
int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
@@ -540,83 +840,103 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
- struct irq_data *irq_data;
- struct msi_desc *desc;
msi_alloc_info_t arg = { };
+ unsigned int vflags = 0;
+ struct msi_desc *desc;
+ int allocated = 0;
int i, ret, virq;
- bool can_reserve;
ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
if (ret)
return ret;
- for_each_msi_entry(desc, dev) {
+ /*
+ * This flag is set by the PCI layer as we need to activate
+ * the MSI entries before the PCI layer enables MSI in the
+ * card. Otherwise the card latches a random msi message.
+ */
+ if (info->flags & MSI_FLAG_ACTIVATE_EARLY)
+ vflags |= VIRQ_ACTIVATE;
+
+ /*
+ * Interrupt can use a reserved vector and will not occupy
+ * a real device vector until the interrupt is requested.
+ */
+ if (msi_check_reservation_mode(domain, info, dev)) {
+ vflags |= VIRQ_CAN_RESERVE;
+ /*
+ * MSI affinity setting requires a special quirk (X86) when
+ * reservation mode is active.
+ */
+ if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
+ vflags |= VIRQ_NOMASK_QUIRK;
+ }
+
+ msi_for_each_desc(desc, dev, MSI_DESC_NOTASSOCIATED) {
ops->set_desc(&arg, desc);
virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
dev_to_node(dev), &arg, false,
desc->affinity);
- if (virq < 0) {
- ret = -ENOSPC;
- if (ops->handle_error)
- ret = ops->handle_error(domain, desc, ret);
- if (ops->msi_finish)
- ops->msi_finish(&arg, ret);
- return ret;
- }
+ if (virq < 0)
+ return msi_handle_pci_fail(domain, desc, allocated);
for (i = 0; i < desc->nvec_used; i++) {
irq_set_msi_desc_off(virq, i, desc);
irq_debugfs_copy_devname(virq + i, dev);
+ ret = msi_init_virq(domain, virq + i, vflags);
+ if (ret)
+ return ret;
+ }
+ if (info->flags & MSI_FLAG_DEV_SYSFS) {
+ ret = msi_sysfs_populate_desc(dev, desc);
+ if (ret)
+ return ret;
}
+ allocated++;
}
+ return 0;
+}
- if (ops->msi_finish)
- ops->msi_finish(&arg, 0);
+static int msi_domain_add_simple_msi_descs(struct msi_domain_info *info,
+ struct device *dev,
+ unsigned int num_descs)
+{
+ if (!(info->flags & MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS))
+ return 0;
- can_reserve = msi_check_reservation_mode(domain, info, dev);
+ return msi_add_simple_msi_descs(dev, 0, num_descs);
+}
- /*
- * This flag is set by the PCI layer as we need to activate
- * the MSI entries before the PCI layer enables MSI in the
- * card. Otherwise the card latches a random msi message.
- */
- if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY))
- goto skip_activate;
-
- for_each_msi_vector(desc, i, dev) {
- if (desc->irq == i) {
- virq = desc->irq;
- dev_dbg(dev, "irq [%d-%d] for MSI\n",
- virq, virq + desc->nvec_used - 1);
- }
+/**
+ * msi_domain_alloc_irqs_descs_locked - Allocate interrupts from a MSI interrupt domain
+ * @domain: The domain to allocate from
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @nvec: The number of interrupts to allocate
+ *
+ * Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
+ * pair. Use this for MSI irqdomains which implement their own vector
+ * allocation/free.
+ *
+ * Return: %0 on success or an error code.
+ */
+int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device *dev,
+ int nvec)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+ int ret;
- irq_data = irq_domain_get_irq_data(domain, i);
- if (!can_reserve) {
- irqd_clr_can_reserve(irq_data);
- if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
- irqd_set_msi_nomask_quirk(irq_data);
- }
- ret = irq_domain_activate_irq(irq_data, can_reserve);
- if (ret)
- goto cleanup;
- }
+ lockdep_assert_held(&dev->msi.data->mutex);
-skip_activate:
- /*
- * If these interrupts use reservation mode, clear the activated bit
- * so request_irq() will assign the final vector.
- */
- if (can_reserve) {
- for_each_msi_vector(desc, i, dev) {
- irq_data = irq_domain_get_irq_data(domain, i);
- irqd_clr_activated(irq_data);
- }
- }
- return 0;
+ ret = msi_domain_add_simple_msi_descs(info, dev, nvec);
+ if (ret)
+ return ret;
-cleanup:
- msi_domain_free_irqs(domain, dev);
+ ret = ops->domain_alloc_irqs(domain, dev, nvec);
+ if (ret)
+ msi_domain_free_irqs_descs_locked(domain, dev);
return ret;
}
@@ -629,52 +949,78 @@ cleanup:
*
* Return: %0 on success or an error code.
*/
-int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
- int nvec)
+int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec)
{
- struct msi_domain_info *info = domain->host_data;
- struct msi_domain_ops *ops = info->ops;
+ int ret;
- return ops->domain_alloc_irqs(domain, dev, nvec);
+ msi_lock_descs(dev);
+ ret = msi_domain_alloc_irqs_descs_locked(domain, dev, nvec);
+ msi_unlock_descs(dev);
+ return ret;
}
void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
{
- struct irq_data *irq_data;
+ struct msi_domain_info *info = domain->host_data;
+ struct irq_data *irqd;
struct msi_desc *desc;
int i;
- for_each_msi_vector(desc, i, dev) {
- irq_data = irq_domain_get_irq_data(domain, i);
- if (irqd_is_activated(irq_data))
- irq_domain_deactivate_irq(irq_data);
- }
-
- for_each_msi_entry(desc, dev) {
- /*
- * We might have failed to allocate an MSI early
- * enough that there is no IRQ associated to this
- * entry. If that's the case, don't do anything.
- */
- if (desc->irq) {
- irq_domain_free_irqs(desc->irq, desc->nvec_used);
- desc->irq = 0;
+ /* Only handle MSI entries which have an interrupt associated */
+ msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
+ /* Make sure all interrupts are deactivated */
+ for (i = 0; i < desc->nvec_used; i++) {
+ irqd = irq_domain_get_irq_data(domain, desc->irq + i);
+ if (irqd && irqd_is_activated(irqd))
+ irq_domain_deactivate_irq(irqd);
}
+
+ irq_domain_free_irqs(desc->irq, desc->nvec_used);
+ if (info->flags & MSI_FLAG_DEV_SYSFS)
+ msi_sysfs_remove_desc(dev, desc);
+ desc->irq = 0;
}
}
+static void msi_domain_free_msi_descs(struct msi_domain_info *info,
+ struct device *dev)
+{
+ if (info->flags & MSI_FLAG_FREE_MSI_DESCS)
+ msi_free_msi_descs(dev);
+}
+
/**
- * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev
+ * msi_domain_free_irqs_descs_locked - Free interrupts from a MSI interrupt @domain associated to @dev
* @domain: The domain to managing the interrupts
* @dev: Pointer to device struct of the device for which the interrupts
* are free
+ *
+ * Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
+ * pair. Use this for MSI irqdomains which implement their own vector
+ * allocation.
*/
-void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev)
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
- return ops->domain_free_irqs(domain, dev);
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ ops->domain_free_irqs(domain, dev);
+ msi_domain_free_msi_descs(info, dev);
+}
+
+/**
+ * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev
+ * @domain: The domain managing the interrupts
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ */
+void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+{
+ msi_lock_descs(dev);
+ msi_domain_free_irqs_descs_locked(domain, dev);
+ msi_unlock_descs(dev);
}
/**
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 3011bc33a5ba..951c93216fc4 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -243,6 +243,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
ret = fn(data, namebuf, NULL, kallsyms_sym_address(i));
if (ret != 0)
return ret;
+ cond_resched();
}
return 0;
}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 5a5d192a89ac..68480f731192 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -81,7 +81,7 @@ int kexec_should_crash(struct task_struct *p)
if (crash_kexec_post_notifiers)
return 0;
/*
- * There are 4 panic() calls in do_exit() path, each of which
+ * There are 4 panic() calls in make_task_dead() path, each of which
* corresponds to each of these 4 conditions.
*/
if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 7113003fab63..a2c156ee8275 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -52,6 +52,7 @@ struct kthread_create_info
struct kthread {
unsigned long flags;
unsigned int cpu;
+ int result;
int (*threadfn)(void *);
void *data;
mm_segment_t oldfs;
@@ -71,7 +72,7 @@ enum KTHREAD_BITS {
static inline struct kthread *to_kthread(struct task_struct *k)
{
WARN_ON(!(k->flags & PF_KTHREAD));
- return (__force void *)k->set_child_tid;
+ return k->worker_private;
}
/*
@@ -79,7 +80,7 @@ static inline struct kthread *to_kthread(struct task_struct *k)
*
* Per construction; when:
*
- * (p->flags & PF_KTHREAD) && p->set_child_tid
+ * (p->flags & PF_KTHREAD) && p->worker_private
*
* the task is both a kthread and struct kthread is persistent. However
* PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and
@@ -87,26 +88,29 @@ static inline struct kthread *to_kthread(struct task_struct *k)
*/
static inline struct kthread *__to_kthread(struct task_struct *p)
{
- void *kthread = (__force void *)p->set_child_tid;
+ void *kthread = p->worker_private;
if (kthread && !(p->flags & PF_KTHREAD))
kthread = NULL;
return kthread;
}
-void set_kthread_struct(struct task_struct *p)
+bool set_kthread_struct(struct task_struct *p)
{
struct kthread *kthread;
- if (__to_kthread(p))
- return;
+ if (WARN_ON_ONCE(to_kthread(p)))
+ return false;
kthread = kzalloc(sizeof(*kthread), GFP_KERNEL);
- /*
- * We abuse ->set_child_tid to avoid the new member and because it
- * can't be wrongly copied by copy_process(). We also rely on fact
- * that the caller can't exec, so PF_KTHREAD can't be cleared.
- */
- p->set_child_tid = (__force void __user *)kthread;
+ if (!kthread)
+ return false;
+
+ init_completion(&kthread->exited);
+ init_completion(&kthread->parked);
+ p->vfork_done = &kthread->exited;
+
+ p->worker_private = kthread;
+ return true;
}
void free_kthread_struct(struct task_struct *k)
@@ -114,13 +118,13 @@ void free_kthread_struct(struct task_struct *k)
struct kthread *kthread;
/*
- * Can be NULL if this kthread was created by kernel_thread()
- * or if kmalloc() in kthread() failed.
+ * Can be NULL if kmalloc() in set_kthread_struct() failed.
*/
kthread = to_kthread(k);
#ifdef CONFIG_BLK_CGROUP
WARN_ON_ONCE(kthread && kthread->blkcg_css);
#endif
+ k->worker_private = NULL;
kfree(kthread);
}
@@ -268,6 +272,44 @@ void kthread_parkme(void)
}
EXPORT_SYMBOL_GPL(kthread_parkme);
+/**
+ * kthread_exit - Cause the current kthread to return @result to kthread_stop().
+ * @result: The integer value to return to kthread_stop().
+ *
+ * While kthread_exit can be called directly, it exists so that
+ * functions which do some additional work in non-modular code such as
+ * module_put_and_kthread_exit can be implemented.
+ *
+ * Does not return.
+ */
+void __noreturn kthread_exit(long result)
+{
+ struct kthread *kthread = to_kthread(current);
+ kthread->result = result;
+ do_exit(0);
+}
+
+/**
+ * kthread_complete_and_exit - Exit the current kthread.
+ * @comp: Completion to complete
+ * @code: The integer value to return to kthread_stop().
+ *
+ * If present, complete @comp and then return @code to kthread_stop().
+ *
+ * A kernel thread whose module may be removed after the completion of
+ * @comp can use this function to exit safely.
+ *
+ * Does not return.
+ */
+void __noreturn kthread_complete_and_exit(struct completion *comp, long code)
+{
+ if (comp)
+ complete(comp);
+
+ kthread_exit(code);
+}
+EXPORT_SYMBOL(kthread_complete_and_exit);
+
static int kthread(void *_create)
{
static const struct sched_param param = { .sched_priority = 0 };
@@ -279,27 +321,17 @@ static int kthread(void *_create)
struct kthread *self;
int ret;
- set_kthread_struct(current);
self = to_kthread(current);
/* If user was SIGKILLed, I release the structure. */
done = xchg(&create->done, NULL);
if (!done) {
kfree(create);
- do_exit(-EINTR);
- }
-
- if (!self) {
- create->result = ERR_PTR(-ENOMEM);
- complete(done);
- do_exit(-ENOMEM);
+ kthread_exit(-EINTR);
}
self->threadfn = threadfn;
self->data = data;
- init_completion(&self->exited);
- init_completion(&self->parked);
- current->vfork_done = &self->exited;
/*
* The new thread inherited kthreadd's priority and CPU mask. Reset
@@ -326,7 +358,7 @@ static int kthread(void *_create)
__kthread_parkme(self);
ret = threadfn(data);
}
- do_exit(ret);
+ kthread_exit(ret);
}
/* called from kernel_clone() to get node information for about to be created task */
@@ -523,6 +555,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
to_kthread(p)->cpu = cpu;
return p;
}
+EXPORT_SYMBOL(kthread_create_on_cpu);
void kthread_set_per_cpu(struct task_struct *k, int cpu)
{
@@ -627,7 +660,7 @@ EXPORT_SYMBOL_GPL(kthread_park);
* instead of calling wake_up_process(): the thread will exit without
* calling threadfn().
*
- * If threadfn() may call do_exit() itself, the caller must ensure
+ * If threadfn() may call kthread_exit() itself, the caller must ensure
* task_struct can't go away.
*
* Returns the result of threadfn(), or %-EINTR if wake_up_process()
@@ -646,7 +679,7 @@ int kthread_stop(struct task_struct *k)
kthread_unpark(k);
wake_up_process(k);
wait_for_completion(&kthread->exited);
- ret = k->exit_code;
+ ret = kthread->result;
put_task_struct(k);
trace_sched_kthread_stop_ret(ret);
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 335d988bd811..585494ec464f 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -862,14 +862,11 @@ static void klp_init_object_early(struct klp_patch *patch,
list_add_tail(&obj->node, &patch->obj_list);
}
-static int klp_init_patch_early(struct klp_patch *patch)
+static void klp_init_patch_early(struct klp_patch *patch)
{
struct klp_object *obj;
struct klp_func *func;
- if (!patch->objs)
- return -EINVAL;
-
INIT_LIST_HEAD(&patch->list);
INIT_LIST_HEAD(&patch->obj_list);
kobject_init(&patch->kobj, &klp_ktype_patch);
@@ -879,20 +876,12 @@ static int klp_init_patch_early(struct klp_patch *patch)
init_completion(&patch->finish);
klp_for_each_object_static(patch, obj) {
- if (!obj->funcs)
- return -EINVAL;
-
klp_init_object_early(patch, obj);
klp_for_each_func_static(obj, func) {
klp_init_func_early(obj, func);
}
}
-
- if (!try_module_get(patch->mod))
- return -ENODEV;
-
- return 0;
}
static int klp_init_patch(struct klp_patch *patch)
@@ -1024,10 +1013,17 @@ err:
int klp_enable_patch(struct klp_patch *patch)
{
int ret;
+ struct klp_object *obj;
- if (!patch || !patch->mod)
+ if (!patch || !patch->mod || !patch->objs)
return -EINVAL;
+ klp_for_each_object_static(patch, obj) {
+ if (!obj->funcs)
+ return -EINVAL;
+ }
+
+
if (!is_livepatch_module(patch->mod)) {
pr_err("module %s is not marked as a livepatch module\n",
patch->mod->name);
@@ -1051,12 +1047,13 @@ int klp_enable_patch(struct klp_patch *patch)
return -EINVAL;
}
- ret = klp_init_patch_early(patch);
- if (ret) {
+ if (!try_module_get(patch->mod)) {
mutex_unlock(&klp_mutex);
- return ret;
+ return -ENODEV;
}
+ klp_init_patch_early(patch);
+
ret = klp_init_patch(patch);
if (ret)
goto err;
diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
index e5c9fb295ba9..c2e724d97ddf 100644
--- a/kernel/livepatch/shadow.c
+++ b/kernel/livepatch/shadow.c
@@ -272,12 +272,12 @@ void klp_shadow_free(void *obj, unsigned long id, klp_shadow_dtor_t dtor)
EXPORT_SYMBOL_GPL(klp_shadow_free);
/**
- * klp_shadow_free_all() - detach and free all <*, id> shadow variables
+ * klp_shadow_free_all() - detach and free all <_, id> shadow variables
* @id: data identifier
* @dtor: custom callback that can be used to unregister the variable
* and/or free data that the shadow variable points to (optional)
*
- * This function releases the memory for all <*, id> shadow variable
+ * This function releases the memory for all <_, id> shadow variable
* instances, callers should stop referencing them accordingly.
*/
void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor)
@@ -288,7 +288,7 @@ void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor)
spin_lock_irqsave(&klp_shadow_lock, flags);
- /* Delete all <*, id> from hash */
+ /* Delete all <_, id> from hash */
hash_for_each(klp_shadow_hash, i, shadow, node) {
if (klp_shadow_match(shadow, shadow->obj, id))
klp_shadow_free_struct(shadow, dtor);
diff --git a/kernel/module.c b/kernel/module.c
index 387ee77bdbd6..24dab046e16c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -337,12 +337,12 @@ static inline void add_taint_module(struct module *mod, unsigned flag,
* A thread that wants to hold a reference to a module only while it
* is running can call this to safely exit. nfsd and lockd use this.
*/
-void __noreturn __module_put_and_exit(struct module *mod, long code)
+void __noreturn __module_put_and_kthread_exit(struct module *mod, long code)
{
module_put(mod);
- do_exit(code);
+ kthread_exit(code);
}
-EXPORT_SYMBOL(__module_put_and_exit);
+EXPORT_SYMBOL(__module_put_and_kthread_exit);
/* Find a module section: 0 means not found. */
static unsigned int find_sec(const struct load_info *info, const char *name)
@@ -4512,6 +4512,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
mod, kallsyms_symbol_value(sym));
if (ret != 0)
goto out;
+
+ cond_resched();
}
}
out:
diff --git a/kernel/params.c b/kernel/params.c
index 8299bd764e42..5b92310425c5 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -926,9 +926,9 @@ static const struct sysfs_ops module_sysfs_ops = {
.store = module_attr_store,
};
-static int uevent_filter(struct kset *kset, struct kobject *kobj)
+static int uevent_filter(struct kobject *kobj)
{
- struct kobj_type *ktype = get_ktype(kobj);
+ const struct kobj_type *ktype = get_ktype(kobj);
if (ktype == &module_ktype)
return 1;
diff --git a/kernel/profile.c b/kernel/profile.c
index eb9c7f0f5ac5..37640a0bd8a3 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -133,79 +133,6 @@ int __ref profile_init(void)
return -ENOMEM;
}
-/* Profile event notifications */
-
-static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
-static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
-static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
-
-void profile_task_exit(struct task_struct *task)
-{
- blocking_notifier_call_chain(&task_exit_notifier, 0, task);
-}
-
-int profile_handoff_task(struct task_struct *task)
-{
- int ret;
- ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
- return (ret == NOTIFY_OK) ? 1 : 0;
-}
-
-void profile_munmap(unsigned long addr)
-{
- blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
-}
-
-int task_handoff_register(struct notifier_block *n)
-{
- return atomic_notifier_chain_register(&task_free_notifier, n);
-}
-EXPORT_SYMBOL_GPL(task_handoff_register);
-
-int task_handoff_unregister(struct notifier_block *n)
-{
- return atomic_notifier_chain_unregister(&task_free_notifier, n);
-}
-EXPORT_SYMBOL_GPL(task_handoff_unregister);
-
-int profile_event_register(enum profile_type type, struct notifier_block *n)
-{
- int err = -EINVAL;
-
- switch (type) {
- case PROFILE_TASK_EXIT:
- err = blocking_notifier_chain_register(
- &task_exit_notifier, n);
- break;
- case PROFILE_MUNMAP:
- err = blocking_notifier_chain_register(
- &munmap_notifier, n);
- break;
- }
-
- return err;
-}
-EXPORT_SYMBOL_GPL(profile_event_register);
-
-int profile_event_unregister(enum profile_type type, struct notifier_block *n)
-{
- int err = -EINVAL;
-
- switch (type) {
- case PROFILE_TASK_EXIT:
- err = blocking_notifier_chain_unregister(
- &task_exit_notifier, n);
- break;
- case PROFILE_MUNMAP:
- err = blocking_notifier_chain_unregister(
- &munmap_notifier, n);
- break;
- }
-
- return err;
-}
-EXPORT_SYMBOL_GPL(profile_event_unregister);
-
#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
/*
* Each cpu has a pair of open-addressed hashtables for pending
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f8589bf8d7dc..eea265082e97 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -419,8 +419,6 @@ static int ptrace_attach(struct task_struct *task, long request,
if (task->ptrace)
goto unlock_tasklist;
- if (seize)
- flags |= PT_SEIZED;
task->ptrace = flags;
ptrace_link(task, current);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 33ea446101b3..422f7e4cc08d 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2031,9 +2031,8 @@ static int rcutorture_booster_init(unsigned int cpu)
mutex_lock(&boost_mutex);
rcu_torture_disable_rt_throttle();
VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task");
- boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
- cpu_to_node(cpu),
- "rcu_torture_boost");
+ boost_tasks[cpu] = kthread_run_on_cpu(rcu_torture_boost, NULL,
+ cpu, "rcu_torture_boost_%u");
if (IS_ERR(boost_tasks[cpu])) {
retval = PTR_ERR(boost_tasks[cpu]);
VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed");
@@ -2042,8 +2041,6 @@ static int rcutorture_booster_init(unsigned int cpu)
mutex_unlock(&boost_mutex);
return retval;
}
- kthread_bind(boost_tasks[cpu], cpu);
- wake_up_process(boost_tasks[cpu]);
mutex_unlock(&boost_mutex);
return 0;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 83872f95a1ea..2e4ae00e52d1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8642,14 +8642,6 @@ void __init init_idle(struct task_struct *idle, int cpu)
__sched_fork(0, idle);
- /*
- * The idle task doesn't need the kthread struct to function, but it
- * is dressed up as a per-CPU kthread and thus needs to play the part
- * if we want to avoid special-casing it in code that deals with per-CPU
- * kthreads.
- */
- set_kthread_struct(idle);
-
raw_spin_lock_irqsave(&idle->pi_lock, flags);
raw_spin_rq_lock(rq);
@@ -9469,6 +9461,14 @@ void __init sched_init(void)
enter_lazy_tlb(&init_mm, current);
/*
+ * The idle task doesn't need the kthread struct to function, but it
+ * is dressed up as a per-CPU kthread and thus needs to play the part
+ * if we want to avoid special-casing it in code that deals with per-CPU
+ * kthreads.
+ */
+ WARN_ON(!set_kthread_struct(current));
+
+ /*
* Make us the idle thread. Technically, schedule() should not be
* called from this thread, however somewhere below it might be,
* but because we are the idle thread, we just pick up running again
diff --git a/kernel/signal.c b/kernel/signal.c
index dfcee3888b00..38602738866e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -626,7 +626,8 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
*
* All callers have to hold the siglock.
*/
-int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *info)
+int dequeue_signal(struct task_struct *tsk, sigset_t *mask,
+ kernel_siginfo_t *info, enum pid_type *type)
{
bool resched_timer = false;
int signr;
@@ -634,8 +635,10 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *in
/* We only dequeue private signals from ourselves, we don't let
* signalfd steal them
*/
+ *type = PIDTYPE_PID;
signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer);
if (!signr) {
+ *type = PIDTYPE_TGID;
signr = __dequeue_signal(&tsk->signal->shared_pending,
mask, info, &resched_timer);
#ifdef CONFIG_POSIX_TIMERS
@@ -903,8 +906,8 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
struct task_struct *t;
sigset_t flush;
- if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
- if (!(signal->flags & SIGNAL_GROUP_EXIT))
+ if (signal->flags & SIGNAL_GROUP_EXIT) {
+ if (signal->core_state)
return sig == SIGKILL;
/*
* The process is in the middle of dying, nothing to do.
@@ -1029,7 +1032,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
* then start taking the whole group down immediately.
*/
if (sig_fatal(p, sig) &&
- !(signal->flags & SIGNAL_GROUP_EXIT) &&
+ (signal->core_state || !(signal->flags & SIGNAL_GROUP_EXIT)) &&
!sigismember(&t->real_blocked, sig) &&
(sig == SIGKILL || !p->ptrace)) {
/*
@@ -1820,6 +1823,7 @@ int force_sig_perf(void __user *addr, u32 type, u64 sig_data)
* force_sig_seccomp - signals the task to allow in-process syscall emulation
* @syscall: syscall number to send to userland
* @reason: filter-supplied reason code to send to userland (via si_errno)
+ * @force_coredump: true to trigger a coredump
*
* Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
*/
@@ -2383,7 +2387,8 @@ static bool do_signal_stop(int signr)
WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);
if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
- unlikely(signal_group_exit(sig)))
+ unlikely(sig->flags & SIGNAL_GROUP_EXIT) ||
+ unlikely(sig->group_exec_task))
return false;
/*
* There is no group stop already in progress. We must
@@ -2544,7 +2549,7 @@ static void do_freezer_trap(void)
freezable_schedule();
}
-static int ptrace_signal(int signr, kernel_siginfo_t *info)
+static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
{
/*
* We do not check sig_kernel_stop(signr) but set this marker
@@ -2584,8 +2589,9 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info)
}
/* If the (new) signal is now blocked, requeue it. */
- if (sigismember(&current->blocked, signr)) {
- send_signal(signr, info, current, PIDTYPE_PID);
+ if (sigismember(&current->blocked, signr) ||
+ fatal_signal_pending(current)) {
+ send_signal(signr, info, current, type);
signr = 0;
}
@@ -2684,18 +2690,20 @@ relock:
goto relock;
}
- /* Has this task already been marked for death? */
- if (signal_group_exit(signal)) {
- ksig->info.si_signo = signr = SIGKILL;
- sigdelset(&current->pending.signal, SIGKILL);
- trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO,
- &sighand->action[SIGKILL - 1]);
- recalc_sigpending();
- goto fatal;
- }
-
for (;;) {
struct k_sigaction *ka;
+ enum pid_type type;
+
+ /* Has this task already been marked for death? */
+ if ((signal->flags & SIGNAL_GROUP_EXIT) ||
+ signal->group_exec_task) {
+ ksig->info.si_signo = signr = SIGKILL;
+ sigdelset(&current->pending.signal, SIGKILL);
+ trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO,
+ &sighand->action[SIGKILL - 1]);
+ recalc_sigpending();
+ goto fatal;
+ }
if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
do_signal_stop(0))
@@ -2728,16 +2736,18 @@ relock:
* so that the instruction pointer in the signal stack
* frame points to the faulting instruction.
*/
+ type = PIDTYPE_PID;
signr = dequeue_synchronous_signal(&ksig->info);
if (!signr)
- signr = dequeue_signal(current, &current->blocked, &ksig->info);
+ signr = dequeue_signal(current, &current->blocked,
+ &ksig->info, &type);
if (!signr)
break; /* will return 0 */
if (unlikely(current->ptrace) && (signr != SIGKILL) &&
!(sighand->action[signr -1].sa.sa_flags & SA_IMMUTABLE)) {
- signr = ptrace_signal(signr, &ksig->info);
+ signr = ptrace_signal(signr, &ksig->info, type);
if (!signr)
continue;
}
@@ -2863,13 +2873,13 @@ out:
}
/**
- * signal_delivered -
+ * signal_delivered - called after signal delivery to update blocked signals
* @ksig: kernel signal struct
* @stepping: nonzero if debugger single-step or block-step in use
*
* This function should be called when a signal has successfully been
* delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask
- * is always blocked, and the signal itself is blocked unless %SA_NODEFER
+ * is always blocked), and the signal itself is blocked unless %SA_NODEFER
* is set in @ksig->ka.sa.sa_flags. Tracing is notified.
*/
static void signal_delivered(struct ksignal *ksig, int stepping)
@@ -2942,7 +2952,7 @@ void exit_signals(struct task_struct *tsk)
*/
cgroup_threadgroup_change_begin(tsk);
- if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
+ if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
tsk->flags |= PF_EXITING;
cgroup_threadgroup_change_end(tsk);
return;
@@ -3562,6 +3572,7 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info,
ktime_t *to = NULL, timeout = KTIME_MAX;
struct task_struct *tsk = current;
sigset_t mask = *which;
+ enum pid_type type;
int sig, ret = 0;
if (ts) {
@@ -3578,7 +3589,7 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info,
signotset(&mask);
spin_lock_irq(&tsk->sighand->siglock);
- sig = dequeue_signal(tsk, &mask, info);
+ sig = dequeue_signal(tsk, &mask, info, &type);
if (!sig && timeout) {
/*
* None ready, temporarily unblock those we're interested
@@ -3597,7 +3608,7 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info,
spin_lock_irq(&tsk->sighand->siglock);
__set_task_blocked(tsk, &tsk->real_blocked);
sigemptyset(&tsk->real_blocked);
- sig = dequeue_signal(tsk, &mask, info);
+ sig = dequeue_signal(tsk, &mask, info, &type);
}
spin_unlock_irq(&tsk->sighand->siglock);
diff --git a/kernel/sys.c b/kernel/sys.c
index 8fdac0d90504..2450a9f33cb0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2261,6 +2261,66 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
+#ifdef CONFIG_ANON_VMA_NAME
+
+#define ANON_VMA_NAME_MAX_LEN 80
+#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]"
+
+static inline bool is_valid_name_char(char ch)
+{
+ /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */
+ return ch > 0x1f && ch < 0x7f &&
+ !strchr(ANON_VMA_NAME_INVALID_CHARS, ch);
+}
+
+static int prctl_set_vma(unsigned long opt, unsigned long addr,
+ unsigned long size, unsigned long arg)
+{
+ struct mm_struct *mm = current->mm;
+ const char __user *uname;
+ char *name, *pch;
+ int error;
+
+ switch (opt) {
+ case PR_SET_VMA_ANON_NAME:
+ uname = (const char __user *)arg;
+ if (uname) {
+ name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
+
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ for (pch = name; *pch != '\0'; pch++) {
+ if (!is_valid_name_char(*pch)) {
+ kfree(name);
+ return -EINVAL;
+ }
+ }
+ } else {
+ /* Reset the name */
+ name = NULL;
+ }
+
+ mmap_write_lock(mm);
+ error = madvise_set_anon_name(mm, addr, size, name);
+ mmap_write_unlock(mm);
+ kfree(name);
+ break;
+ default:
+ error = -EINVAL;
+ }
+
+ return error;
+}
+
+#else /* CONFIG_ANON_VMA_NAME */
+static int prctl_set_vma(unsigned long opt, unsigned long start,
+ unsigned long size, unsigned long arg)
+{
+ return -EINVAL;
+}
+#endif /* CONFIG_ANON_VMA_NAME */
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2530,6 +2590,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = sched_core_share_pid(arg2, arg3, arg4, arg5);
break;
#endif
+ case PR_SET_VMA:
+ error = prctl_set_vma(arg2, arg3, arg4, arg5);
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index d1944258cfc0..a492f159624f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -297,6 +297,7 @@ COND_SYSCALL(get_mempolicy);
COND_SYSCALL(set_mempolicy);
COND_SYSCALL(migrate_pages);
COND_SYSCALL(move_pages);
+COND_SYSCALL(set_mempolicy_home_node);
COND_SYSCALL(perf_event_open);
COND_SYSCALL(accept4);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d7ed1dffa426..ef77be575d87 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -123,6 +123,7 @@ static unsigned long long_max = LONG_MAX;
static int one_hundred = 100;
static int two_hundred = 200;
static int one_thousand = 1000;
+static int three_thousand = 3000;
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
@@ -2960,7 +2961,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = watermark_scale_factor_sysctl_handler,
.extra1 = SYSCTL_ONE,
- .extra2 = &one_thousand,
+ .extra2 = &three_thousand,
},
{
.procname = "percpu_pagelist_high_fraction",
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index b8a14d2fb5ba..b7e52a642948 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -107,7 +107,7 @@ static u64 suspend_start;
* This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
* a lower bound for cs->uncertainty_margin values when registering clocks.
*/
-#define WATCHDOG_MAX_SKEW (50 * NSEC_PER_USEC)
+#define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC)
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
static void clocksource_watchdog_work(struct work_struct *work);
@@ -199,23 +199,30 @@ void clocksource_mark_unstable(struct clocksource *cs)
spin_unlock_irqrestore(&watchdog_lock, flags);
}
-ulong max_cswd_read_retries = 3;
+ulong max_cswd_read_retries = 2;
module_param(max_cswd_read_retries, ulong, 0644);
EXPORT_SYMBOL_GPL(max_cswd_read_retries);
static int verify_n_cpus = 8;
module_param(verify_n_cpus, int, 0644);
-static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
+enum wd_read_status {
+ WD_READ_SUCCESS,
+ WD_READ_UNSTABLE,
+ WD_READ_SKIP
+};
+
+static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
{
unsigned int nretries;
- u64 wd_end, wd_delta;
- int64_t wd_delay;
+ u64 wd_end, wd_end2, wd_delta;
+ int64_t wd_delay, wd_seq_delay;
for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) {
local_irq_disable();
*wdnow = watchdog->read(watchdog);
*csnow = cs->read(cs);
wd_end = watchdog->read(watchdog);
+ wd_end2 = watchdog->read(watchdog);
local_irq_enable();
wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask);
@@ -226,13 +233,34 @@ static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
smp_processor_id(), watchdog->name, nretries);
}
- return true;
+ return WD_READ_SUCCESS;
}
+
+ /*
+ * Now compute the delay between consecutive watchdog reads to
+ * see if there is too much external interference that causes
+ * significant delay in reading both clocksource and watchdog.
+ *
+ * If consecutive WD read-back delay > WATCHDOG_MAX_SKEW/2,
+ * report system busy, reinit the watchdog and skip the current
+ * watchdog test.
+ */
+ wd_delta = clocksource_delta(wd_end2, wd_end, watchdog->mask);
+ wd_seq_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, watchdog->shift);
+ if (wd_seq_delay > WATCHDOG_MAX_SKEW/2)
+ goto skip_test;
}
pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n",
smp_processor_id(), watchdog->name, wd_delay, nretries);
- return false;
+ return WD_READ_UNSTABLE;
+
+skip_test:
+ pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n",
+ smp_processor_id(), watchdog->name, wd_seq_delay);
+ pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n",
+ cs->name, wd_delay);
+ return WD_READ_SKIP;
}
static u64 csnow_mid;
@@ -356,6 +384,7 @@ static void clocksource_watchdog(struct timer_list *unused)
int next_cpu, reset_pending;
int64_t wd_nsec, cs_nsec;
struct clocksource *cs;
+ enum wd_read_status read_ret;
u32 md;
spin_lock(&watchdog_lock);
@@ -373,9 +402,12 @@ static void clocksource_watchdog(struct timer_list *unused)
continue;
}
- if (!cs_watchdog_read(cs, &csnow, &wdnow)) {
- /* Clock readout unreliable, so give it up. */
- __clocksource_unstable(cs);
+ read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
+
+ if (read_ret != WD_READ_SUCCESS) {
+ if (read_ret == WD_READ_UNSTABLE)
+ /* Clock readout unreliable, so give it up. */
+ __clocksource_unstable(cs);
continue;
}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 420ff4bc67fd..f468767bc287 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -915,6 +915,20 @@ config EVENT_TRACE_TEST_SYSCALLS
TBD - enable a way to actually call the syscalls as we test their
events
+config FTRACE_SORT_STARTUP_TEST
+ bool "Verify compile time sorting of ftrace functions"
+ depends on DYNAMIC_FTRACE
+ depends on BUILDTIME_TABLE_SORT
+ help
+ Sorting of the mcount_loc sections, which ftrace uses to find
+ where to patch functions for tracing and other callbacks, is
+ done at compile time. But if the sort is not done correctly,
+ it will cause non-deterministic failures.
+ When this is set, the sorted sections will be verified that they
+ are indeed sorted and will warn if they are not.
+
+ If unsure, say N
+
config RING_BUFFER_STARTUP_TEST
bool "Ring buffer startup self test"
depends on RING_BUFFER
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 1183c88634aa..af68a67179b4 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -34,7 +34,7 @@ static struct trace_array *blk_tr;
static bool blk_tracer_enabled __read_mostly;
static LIST_HEAD(running_trace_list);
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);
+static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(running_trace_lock);
/* Select an alternative, minimalistic output than the original one */
#define TRACE_BLK_OPT_CLASSIC 0x1
@@ -121,12 +121,12 @@ static void trace_note_tsk(struct task_struct *tsk)
struct blk_trace *bt;
tsk->btrace_seq = blktrace_seq;
- spin_lock_irqsave(&running_trace_lock, flags);
+ raw_spin_lock_irqsave(&running_trace_lock, flags);
list_for_each_entry(bt, &running_trace_list, running_list) {
trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
sizeof(tsk->comm), 0);
}
- spin_unlock_irqrestore(&running_trace_lock, flags);
+ raw_spin_unlock_irqrestore(&running_trace_lock, flags);
}
static void trace_note_time(struct blk_trace *bt)
@@ -666,9 +666,9 @@ static int __blk_trace_startstop(struct request_queue *q, int start)
blktrace_seq++;
smp_mb();
bt->trace_state = Blktrace_running;
- spin_lock_irq(&running_trace_lock);
+ raw_spin_lock_irq(&running_trace_lock);
list_add(&bt->running_list, &running_trace_list);
- spin_unlock_irq(&running_trace_lock);
+ raw_spin_unlock_irq(&running_trace_lock);
trace_note_time(bt);
ret = 0;
@@ -676,9 +676,9 @@ static int __blk_trace_startstop(struct request_queue *q, int start)
} else {
if (bt->trace_state == Blktrace_running) {
bt->trace_state = Blktrace_stopped;
- spin_lock_irq(&running_trace_lock);
+ raw_spin_lock_irq(&running_trace_lock);
list_del_init(&bt->running_list);
- spin_unlock_irq(&running_trace_lock);
+ raw_spin_unlock_irq(&running_trace_lock);
relay_flush(bt->rchan);
ret = 0;
}
@@ -1045,7 +1045,7 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev,
}
r.device_from = cpu_to_be32(dev);
- r.device_to = cpu_to_be32(disk_devt(rq->rq_disk));
+ r.device_to = cpu_to_be32(disk_devt(rq->q->disk));
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
@@ -1608,9 +1608,9 @@ static int blk_trace_remove_queue(struct request_queue *q)
if (bt->trace_state == Blktrace_running) {
bt->trace_state = Blktrace_stopped;
- spin_lock_irq(&running_trace_lock);
+ raw_spin_lock_irq(&running_trace_lock);
list_del_init(&bt->running_list);
- spin_unlock_irq(&running_trace_lock);
+ raw_spin_unlock_irq(&running_trace_lock);
relay_flush(bt->rchan);
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index be5f6b32a012..6163b6f762f7 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6394,6 +6394,27 @@ static int ftrace_cmp_ips(const void *a, const void *b)
return 0;
}
+#ifdef CONFIG_FTRACE_SORT_STARTUP_TEST
+static void test_is_sorted(unsigned long *start, unsigned long count)
+{
+ int i;
+
+ for (i = 1; i < count; i++) {
+ if (WARN(start[i - 1] > start[i],
+ "[%d] %pS at %lx is not sorted with %pS at %lx\n", i,
+ (void *)start[i - 1], start[i - 1],
+ (void *)start[i], start[i]))
+ break;
+ }
+ if (i == count)
+ pr_info("ftrace section at %px sorted properly\n", start);
+}
+#else
+static void test_is_sorted(unsigned long *start, unsigned long count)
+{
+}
+#endif
+
static int ftrace_process_locs(struct module *mod,
unsigned long *start,
unsigned long *end)
@@ -6412,8 +6433,17 @@ static int ftrace_process_locs(struct module *mod,
if (!count)
return 0;
- sort(start, count, sizeof(*start),
- ftrace_cmp_ips, NULL);
+ /*
+ * Sorting mcount in vmlinux at build time depends on
+ * CONFIG_BUILDTIME_TABLE_SORT, while mcount loc in
+ * modules cannot be sorted at build time.
+ */
+ if (!IS_ENABLED(CONFIG_BUILDTIME_TABLE_SORT) || mod) {
+ sort(start, count, sizeof(*start),
+ ftrace_cmp_ips, NULL);
+ } else {
+ test_is_sorted(start, count);
+ }
start_pg = ftrace_allocate_pages(count);
if (!start_pg)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2699e9e562b1..05dfc7a12d3d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5898,16 +5898,13 @@ static __init int test_ringbuffer(void)
rb_data[cpu].buffer = buffer;
rb_data[cpu].cpu = cpu;
rb_data[cpu].cnt = cpu;
- rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
- "rbtester/%d", cpu);
+ rb_threads[cpu] = kthread_run_on_cpu(rb_test, &rb_data[cpu],
+ cpu, "rbtester/%u");
if (WARN_ON(IS_ERR(rb_threads[cpu]))) {
pr_cont("FAILED\n");
ret = PTR_ERR(rb_threads[cpu]);
goto out_free;
}
-
- kthread_bind(rb_threads[cpu], cpu);
- wake_up_process(rb_threads[cpu]);
}
/* Now create the rb hammer! */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 78ea542ce3bc..a569a0cb81ee 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -980,6 +980,8 @@ __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *ev
ring_buffer_write(buffer, event->array[0], &event->array[1]);
/* Release the temp buffer */
this_cpu_dec(trace_buffered_event_cnt);
+ /* ring_buffer_unlock_commit() enables preemption */
+ preempt_enable_notrace();
} else
ring_buffer_unlock_commit(buffer, event);
}
@@ -2601,6 +2603,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
trace_flags |= TRACE_FLAG_HARDIRQ;
if (in_serving_softirq())
trace_flags |= TRACE_FLAG_SOFTIRQ;
+ if (softirq_count() >> (SOFTIRQ_SHIFT + 1))
+ trace_flags |= TRACE_FLAG_BH_OFF;
if (tif_need_resched())
trace_flags |= TRACE_FLAG_NEED_RESCHED;
@@ -2745,8 +2749,8 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
*current_rb = tr->array_buffer.buffer;
if (!tr->no_filter_buffering_ref &&
- (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
- (entry = this_cpu_read(trace_buffered_event))) {
+ (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED))) {
+ preempt_disable_notrace();
/*
* Filtering is on, so try to use the per cpu buffer first.
* This buffer will simulate a ring_buffer_event,
@@ -2764,33 +2768,38 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
* is still quicker than no copy on match, but having
* to discard out of the ring buffer on a failed match.
*/
- int max_len = PAGE_SIZE - struct_size(entry, array, 1);
+ if ((entry = __this_cpu_read(trace_buffered_event))) {
+ int max_len = PAGE_SIZE - struct_size(entry, array, 1);
- val = this_cpu_inc_return(trace_buffered_event_cnt);
+ val = this_cpu_inc_return(trace_buffered_event_cnt);
- /*
- * Preemption is disabled, but interrupts and NMIs
- * can still come in now. If that happens after
- * the above increment, then it will have to go
- * back to the old method of allocating the event
- * on the ring buffer, and if the filter fails, it
- * will have to call ring_buffer_discard_commit()
- * to remove it.
- *
- * Need to also check the unlikely case that the
- * length is bigger than the temp buffer size.
- * If that happens, then the reserve is pretty much
- * guaranteed to fail, as the ring buffer currently
- * only allows events less than a page. But that may
- * change in the future, so let the ring buffer reserve
- * handle the failure in that case.
- */
- if (val == 1 && likely(len <= max_len)) {
- trace_event_setup(entry, type, trace_ctx);
- entry->array[0] = len;
- return entry;
+ /*
+ * Preemption is disabled, but interrupts and NMIs
+ * can still come in now. If that happens after
+ * the above increment, then it will have to go
+ * back to the old method of allocating the event
+ * on the ring buffer, and if the filter fails, it
+ * will have to call ring_buffer_discard_commit()
+ * to remove it.
+ *
+ * Need to also check the unlikely case that the
+ * length is bigger than the temp buffer size.
+ * If that happens, then the reserve is pretty much
+ * guaranteed to fail, as the ring buffer currently
+ * only allows events less than a page. But that may
+ * change in the future, so let the ring buffer reserve
+ * handle the failure in that case.
+ */
+ if (val == 1 && likely(len <= max_len)) {
+ trace_event_setup(entry, type, trace_ctx);
+ entry->array[0] = len;
+ /* Return with preemption disabled */
+ return entry;
+ }
+ this_cpu_dec(trace_buffered_event_cnt);
}
- this_cpu_dec(trace_buffered_event_cnt);
+ /* __trace_buffer_lock_reserve() disables preemption */
+ preempt_enable_notrace();
}
entry = __trace_buffer_lock_reserve(*current_rb, type, len,
@@ -4183,7 +4192,7 @@ unsigned long trace_total_entries(struct trace_array *tr)
static void print_lat_help_header(struct seq_file *m)
{
seq_puts(m, "# _------=> CPU# \n"
- "# / _-----=> irqs-off \n"
+ "# / _-----=> irqs-off/BH-disabled\n"
"# | / _----=> need-resched \n"
"# || / _---=> hardirq/softirq \n"
"# ||| / _--=> preempt-depth \n"
@@ -4224,7 +4233,7 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file
print_event_info(buf, m);
- seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space);
+ seq_printf(m, "# %.*s _-----=> irqs-off/BH-disabled\n", prec, space);
seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space);
seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space);
seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space);
@@ -4834,6 +4843,12 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp)
return 0;
}
+static int tracing_mark_open(struct inode *inode, struct file *filp)
+{
+ stream_open(inode, filp);
+ return tracing_open_generic_tr(inode, filp);
+}
+
static int tracing_release(struct inode *inode, struct file *file)
{
struct trace_array *tr = inode->i_private;
@@ -5635,7 +5650,7 @@ static const char readme_msg[] =
"\t - a numeric literal: e.g. ms_per_sec=1000,\n"
"\t - an arithmetic expression: e.g. time_secs=current_timestamp/1000\n"
"\n"
- "\t hist trigger aritmethic expressions support addition(+), subtraction(-),\n"
+ "\t hist trigger arithmetic expressions support addition(+), subtraction(-),\n"
"\t multiplication(*) and division(/) operators. An operand can be either a\n"
"\t variable reference, field or numeric literal.\n"
"\n"
@@ -6718,10 +6733,9 @@ waitagain:
cnt = PAGE_SIZE - 1;
/* reset all but tr, trace, and overruns */
- memset_startat(iter, 0, seq);
+ trace_iterator_reset(iter);
cpumask_clear(iter->started);
trace_seq_init(&iter->seq);
- iter->pos = -1;
trace_event_read_lock();
trace_access_lock(iter->cpu_file);
@@ -7110,9 +7124,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (tt)
event_triggers_post_call(tr->trace_marker_file, tt);
- if (written > 0)
- *fpos += written;
-
return written;
}
@@ -7171,9 +7182,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
__buffer_unlock_commit(buffer, event);
- if (written > 0)
- *fpos += written;
-
return written;
}
@@ -7573,16 +7581,14 @@ static const struct file_operations tracing_free_buffer_fops = {
};
static const struct file_operations tracing_mark_fops = {
- .open = tracing_open_generic_tr,
+ .open = tracing_mark_open,
.write = tracing_mark_write,
- .llseek = generic_file_llseek,
.release = tracing_release_generic_tr,
};
static const struct file_operations tracing_mark_raw_fops = {
- .open = tracing_open_generic_tr,
+ .open = tracing_mark_open,
.write = tracing_mark_raw_write,
- .llseek = generic_file_llseek,
.release = tracing_release_generic_tr,
};
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 38715aa6cfdf..d038ddbf1bea 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -83,6 +83,9 @@ enum trace_type {
#undef __dynamic_array
#define __dynamic_array(type, item) type item[];
+#undef __rel_dynamic_array
+#define __rel_dynamic_array(type, item) type item[];
+
#undef F_STRUCT
#define F_STRUCT(args...) args
@@ -1334,10 +1337,12 @@ __trace_event_discard_commit(struct trace_buffer *buffer,
struct ring_buffer_event *event)
{
if (this_cpu_read(trace_buffered_event) == event) {
- /* Simply release the temp buffer */
+ /* Simply release the temp buffer and enable preemption */
this_cpu_dec(trace_buffered_event_cnt);
+ preempt_enable_notrace();
return;
}
+ /* ring_buffer_discard_commit() enables preemption */
ring_buffer_discard_commit(buffer, event);
}
@@ -1465,6 +1470,7 @@ struct filter_pred {
static inline bool is_string_field(struct ftrace_event_field *field)
{
return field->filter_type == FILTER_DYN_STRING ||
+ field->filter_type == FILTER_RDYN_STRING ||
field->filter_type == FILTER_STATIC_STRING ||
field->filter_type == FILTER_PTR_STRING ||
field->filter_type == FILTER_COMM;
@@ -1572,15 +1578,13 @@ extern int event_enable_trigger_print(struct seq_file *m,
struct event_trigger_data *data);
extern void event_enable_trigger_free(struct event_trigger_ops *ops,
struct event_trigger_data *data);
-extern int event_enable_trigger_func(struct event_command *cmd_ops,
- struct trace_event_file *file,
- char *glob, char *cmd, char *param);
+extern int event_enable_trigger_parse(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd, char *param);
extern int event_enable_register_trigger(char *glob,
- struct event_trigger_ops *ops,
struct event_trigger_data *data,
struct trace_event_file *file);
extern void event_enable_unregister_trigger(char *glob,
- struct event_trigger_ops *ops,
struct event_trigger_data *test,
struct trace_event_file *file);
extern void trigger_data_free(struct event_trigger_data *data);
@@ -1606,6 +1610,30 @@ get_named_trigger_data(struct event_trigger_data *data);
extern int register_event_command(struct event_command *cmd);
extern int unregister_event_command(struct event_command *cmd);
extern int register_trigger_hist_enable_disable_cmds(void);
+extern bool event_trigger_check_remove(const char *glob);
+extern bool event_trigger_empty_param(const char *param);
+extern int event_trigger_separate_filter(char *param_and_filter, char **param,
+ char **filter, bool param_required);
+extern struct event_trigger_data *
+event_trigger_alloc(struct event_command *cmd_ops,
+ char *cmd,
+ char *param,
+ void *private_data);
+extern int event_trigger_parse_num(char *trigger,
+ struct event_trigger_data *trigger_data);
+extern int event_trigger_set_filter(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *param,
+ struct event_trigger_data *trigger_data);
+extern void event_trigger_reset_filter(struct event_command *cmd_ops,
+ struct event_trigger_data *trigger_data);
+extern int event_trigger_register(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob,
+ char *cmd,
+ char *trigger,
+ struct event_trigger_data *trigger_data,
+ int *n_registered);
/**
* struct event_trigger_ops - callbacks for trace event triggers
@@ -1613,10 +1641,20 @@ extern int register_trigger_hist_enable_disable_cmds(void);
* The methods in this structure provide per-event trigger hooks for
* various trigger operations.
*
+ * The @init and @free methods are used during trigger setup and
+ * teardown, typically called from an event_command's @parse()
+ * function implementation.
+ *
+ * The @print method is used to print the trigger spec.
+ *
+ * The @trigger method is the function that actually implements the
+ * trigger and is called in the context of the triggering event
+ * whenever that event occurs.
+ *
* All the methods below, except for @init() and @free(), must be
* implemented.
*
- * @func: The trigger 'probe' function called when the triggering
+ * @trigger: The trigger 'probe' function called when the triggering
* event occurs. The data passed into this callback is the data
* that was supplied to the event_command @reg() function that
* registered the trigger (see struct event_command) along with
@@ -1645,9 +1683,10 @@ extern int register_trigger_hist_enable_disable_cmds(void);
* (see trace_event_triggers.c).
*/
struct event_trigger_ops {
- void (*func)(struct event_trigger_data *data,
- struct trace_buffer *buffer, void *rec,
- struct ring_buffer_event *rbe);
+ void (*trigger)(struct event_trigger_data *data,
+ struct trace_buffer *buffer,
+ void *rec,
+ struct ring_buffer_event *rbe);
int (*init)(struct event_trigger_ops *ops,
struct event_trigger_data *data);
void (*free)(struct event_trigger_ops *ops,
@@ -1696,7 +1735,7 @@ struct event_trigger_ops {
* All the methods below, except for @set_filter() and @unreg_all(),
* must be implemented.
*
- * @func: The callback function responsible for parsing and
+ * @parse: The callback function responsible for parsing and
* registering the trigger written to the 'trigger' file by the
* user. It allocates the trigger instance and registers it with
* the appropriate trace event. It makes use of the other
@@ -1731,21 +1770,24 @@ struct event_trigger_ops {
*
* @get_trigger_ops: The callback function invoked to retrieve the
* event_trigger_ops implementation associated with the command.
+ * This callback function allows a single event_command to
+ * support multiple trigger implementations via different sets of
+ * event_trigger_ops, depending on the value of the @param
+ * string.
*/
struct event_command {
struct list_head list;
char *name;
enum event_trigger_type trigger_type;
int flags;
- int (*func)(struct event_command *cmd_ops,
- struct trace_event_file *file,
- char *glob, char *cmd, char *params);
+ int (*parse)(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd,
+ char *param_and_filter);
int (*reg)(char *glob,
- struct event_trigger_ops *ops,
struct event_trigger_data *data,
struct trace_event_file *file);
void (*unreg)(char *glob,
- struct event_trigger_ops *ops,
struct event_trigger_data *data,
struct trace_event_file *file);
void (*unreg_all)(struct trace_event_file *file);
@@ -1926,14 +1968,7 @@ extern struct trace_iterator *tracepoint_print_iter;
*/
static __always_inline void trace_iterator_reset(struct trace_iterator *iter)
{
- const size_t offset = offsetof(struct trace_iterator, seq);
-
- /*
- * Keep gcc from complaining about overwriting more than just one
- * member in the structure.
- */
- memset((char *)iter + offset, 0, sizeof(struct trace_iterator) - offset);
-
+ memset_startat(iter, 0, seq);
iter->pos = -1;
}
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 928867f527e7..191db32dec46 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -489,18 +489,12 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec)
if (trace_trigger_soft_disabled(edata->file))
return;
- fbuffer.trace_ctx = tracing_gen_ctx();
- fbuffer.trace_file = edata->file;
-
dsize = get_eprobe_size(&edata->ep->tp, rec);
- fbuffer.regs = NULL;
-
- fbuffer.event =
- trace_event_buffer_lock_reserve(&fbuffer.buffer, edata->file,
- call->event.type,
- sizeof(*entry) + edata->ep->tp.size + dsize,
- fbuffer.trace_ctx);
- if (!fbuffer.event)
+
+ entry = trace_event_buffer_reserve(&fbuffer, edata->file,
+ sizeof(*entry) + edata->ep->tp.size + dsize);
+
+ if (!entry)
return;
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
@@ -549,29 +543,29 @@ static void eprobe_trigger_func(struct event_trigger_data *data,
}
static struct event_trigger_ops eprobe_trigger_ops = {
- .func = eprobe_trigger_func,
+ .trigger = eprobe_trigger_func,
.print = eprobe_trigger_print,
.init = eprobe_trigger_init,
.free = eprobe_trigger_free,
};
-static int eprobe_trigger_cmd_func(struct event_command *cmd_ops,
- struct trace_event_file *file,
- char *glob, char *cmd, char *param)
+static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd, char *param)
{
return -1;
}
-static int eprobe_trigger_reg_func(char *glob, struct event_trigger_ops *ops,
- struct event_trigger_data *data,
- struct trace_event_file *file)
+static int eprobe_trigger_reg_func(char *glob,
+ struct event_trigger_data *data,
+ struct trace_event_file *file)
{
return -1;
}
-static void eprobe_trigger_unreg_func(char *glob, struct event_trigger_ops *ops,
- struct event_trigger_data *data,
- struct trace_event_file *file)
+static void eprobe_trigger_unreg_func(char *glob,
+ struct event_trigger_data *data,
+ struct trace_event_file *file)
{
}
@@ -586,7 +580,7 @@ static struct event_command event_trigger_cmd = {
.name = "eprobe",
.trigger_type = ETT_EVENT_EPROBE,
.flags = EVENT_CMD_FL_NEEDS_REC,
- .func = eprobe_trigger_cmd_func,
+ .parse = eprobe_trigger_cmd_parse,
.reg = eprobe_trigger_reg_func,
.unreg = eprobe_trigger_unreg_func,
.unreg_all = NULL,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 92be9cb1d7d4..3147614c1812 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -3461,10 +3461,8 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
entry = trace_create_file("enable", TRACE_MODE_WRITE, d_events,
tr, &ftrace_tr_enable_fops);
- if (!entry) {
- pr_warn("Could not create tracefs 'enable' entry\n");
+ if (!entry)
return -ENOMEM;
- }
/* There are not as crucial, just warn if they are not created */
@@ -3480,17 +3478,13 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n");
/* ring buffer internal formats */
- entry = trace_create_file("header_page", TRACE_MODE_READ, d_events,
+ trace_create_file("header_page", TRACE_MODE_READ, d_events,
ring_buffer_print_page_header,
&ftrace_show_header_fops);
- if (!entry)
- pr_warn("Could not create tracefs 'header_page' entry\n");
- entry = trace_create_file("header_event", TRACE_MODE_READ, d_events,
+ trace_create_file("header_event", TRACE_MODE_READ, d_events,
ring_buffer_print_entry_header,
&ftrace_show_header_fops);
- if (!entry)
- pr_warn("Could not create tracefs 'header_event' entry\n");
tr->event_dir = d_events;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index c9124038b140..b458a9afa2c0 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -5,6 +5,7 @@
* Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
*/
+#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
@@ -654,6 +655,52 @@ DEFINE_EQUALITY_PRED(32);
DEFINE_EQUALITY_PRED(16);
DEFINE_EQUALITY_PRED(8);
+/* user space strings temp buffer */
+#define USTRING_BUF_SIZE 1024
+
+struct ustring_buffer {
+ char buffer[USTRING_BUF_SIZE];
+};
+
+static __percpu struct ustring_buffer *ustring_per_cpu;
+
+/*
+ * test_string - safely fetch a kernel string referenced by an event field
+ * @str: untrusted kernel address read from the event record
+ *
+ * Copies the string at @str into this CPU's scratch buffer using a
+ * non-faulting copy, so a stale or bogus pointer cannot oops the filter.
+ *
+ * Returns the per-CPU buffer holding the NUL-terminated copy, or NULL if
+ * the scratch buffer was never allocated or @str could not be read.
+ */
+static __always_inline char *test_string(char *str)
+{
+ struct ustring_buffer *ubuf;
+ char *kstr;
+
+ if (!ustring_per_cpu)
+ return NULL;
+
+ ubuf = this_cpu_ptr(ustring_per_cpu);
+ kstr = ubuf->buffer;
+
+ /*
+ * For safety, do not trust the string pointer. The nofault copy
+ * returns a negative errno on failure and the copied length
+ * (including the NUL) on success, so failure must be detected with
+ * "< 0"; testing for zero would never reject a bad pointer.
+ */
+ if (strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE) < 0)
+ return NULL;
+ return kstr;
+}
+
+/*
+ * test_ustring - safely fetch a user space string referenced by an event field
+ * @str: untrusted address read from the event record, treated as a
+ *       user space pointer
+ *
+ * Copies the string at @str into this CPU's scratch buffer using a
+ * non-faulting user copy.
+ *
+ * Returns the per-CPU buffer holding the NUL-terminated copy, or NULL if
+ * the scratch buffer was never allocated or @str could not be read.
+ */
+static __always_inline char *test_ustring(char *str)
+{
+ struct ustring_buffer *ubuf;
+ char __user *ustr;
+ char *kstr;
+
+ if (!ustring_per_cpu)
+ return NULL;
+
+ ubuf = this_cpu_ptr(ustring_per_cpu);
+ kstr = ubuf->buffer;
+
+ /*
+ * user space address? The nofault copy returns a negative errno on
+ * failure and the copied length (including the NUL) on success, so
+ * only a negative value means the access failed.
+ */
+ ustr = (char __user *)str;
+ if (strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE) < 0)
+ return NULL;
+
+ return kstr;
+}
+
/* Filter predicate for fixed sized arrays of characters */
static int filter_pred_string(struct filter_pred *pred, void *event)
{
@@ -667,19 +714,43 @@ static int filter_pred_string(struct filter_pred *pred, void *event)
return match;
}
-/* Filter predicate for char * pointers */
-static int filter_pred_pchar(struct filter_pred *pred, void *event)
+static __always_inline int filter_pchar(struct filter_pred *pred, char *str)
{
- char **addr = (char **)(event + pred->offset);
int cmp, match;
- int len = strlen(*addr) + 1; /* including tailing '\0' */
+ int len;
- cmp = pred->regex.match(*addr, &pred->regex, len);
+ len = strlen(str) + 1; /* including tailing '\0' */
+ cmp = pred->regex.match(str, &pred->regex, len);
match = cmp ^ pred->not;
return match;
}
+/* Filter predicate for char * pointers */
+static int filter_pred_pchar(struct filter_pred *pred, void *event)
+{
+ char **addr = (char **)(event + pred->offset);
+ char *str;
+
+ str = test_string(*addr);
+ if (!str)
+ return 0;
+
+ return filter_pchar(pred, str);
+}
+
+/* Filter predicate for char * pointers in user space */
+static int filter_pred_pchar_user(struct filter_pred *pred, void *event)
+{
+ char **addr = (char **)(event + pred->offset);
+ char *str;
+
+ str = test_ustring(*addr);
+ if (!str)
+ return 0;
+
+ return filter_pchar(pred, str);
+}
/*
* Filter predicate for dynamic sized arrays of characters.
@@ -706,6 +777,29 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event)
return match;
}
+/*
+ * Filter predicate for relative dynamic sized arrays of characters.
+ * These are implemented through a list of strings at the end
+ * of the entry, the same way as dynamic strings.
+ * The difference is that the relative one records the location offset
+ * from the end of the field itself, not from the start of the event entry.
+ */
+static int filter_pred_strrelloc(struct filter_pred *pred, void *event)
+{
+ u32 *item = (u32 *)(event + pred->offset);
+ u32 str_item = *item;
+ int str_loc = str_item & 0xffff;
+ int str_len = str_item >> 16;
+ char *addr = (char *)(&item[1]) + str_loc;
+ int cmp, match;
+
+ cmp = pred->regex.match(addr, &pred->regex, str_len);
+
+ match = cmp ^ pred->not;
+
+ return match;
+}
+
/* Filter predicate for CPUs. */
static int filter_pred_cpu(struct filter_pred *pred, void *event)
{
@@ -756,7 +850,7 @@ static int filter_pred_none(struct filter_pred *pred, void *event)
*
* Note:
* - @str might not be NULL-terminated if it's of type DYN_STRING
- * or STATIC_STRING, unless @len is zero.
+ * RDYN_STRING, or STATIC_STRING, unless @len is zero.
*/
static int regex_match_full(char *str, struct regex *r, int len)
@@ -1083,6 +1177,9 @@ int filter_assign_type(const char *type)
if (strstr(type, "__data_loc") && strstr(type, "char"))
return FILTER_DYN_STRING;
+ if (strstr(type, "__rel_loc") && strstr(type, "char"))
+ return FILTER_RDYN_STRING;
+
if (strchr(type, '[') && strstr(type, "char"))
return FILTER_STATIC_STRING;
@@ -1158,6 +1255,7 @@ static int parse_pred(const char *str, void *data,
struct filter_pred *pred = NULL;
char num_buf[24]; /* Big enough to hold an address */
char *field_name;
+ bool ustring = false;
char q;
u64 val;
int len;
@@ -1192,6 +1290,12 @@ static int parse_pred(const char *str, void *data,
return -EINVAL;
}
+ /* See if the field is a user space string */
+ if ((len = str_has_prefix(str + i, ".ustring"))) {
+ ustring = true;
+ i += len;
+ }
+
while (isspace(str[i]))
i++;
@@ -1318,10 +1422,24 @@ static int parse_pred(const char *str, void *data,
pred->fn = filter_pred_string;
pred->regex.field_len = field->size;
- } else if (field->filter_type == FILTER_DYN_STRING)
+ } else if (field->filter_type == FILTER_DYN_STRING) {
pred->fn = filter_pred_strloc;
- else
- pred->fn = filter_pred_pchar;
+ } else if (field->filter_type == FILTER_RDYN_STRING)
+ pred->fn = filter_pred_strrelloc;
+ else {
+
+ if (!ustring_per_cpu) {
+ /* Once allocated, keep it around for good */
+ ustring_per_cpu = alloc_percpu(struct ustring_buffer);
+ if (!ustring_per_cpu)
+ goto err_mem;
+ }
+
+ if (ustring)
+ pred->fn = filter_pred_pchar_user;
+ else
+ pred->fn = filter_pred_pchar;
+ }
/* go past the last quote */
i++;
@@ -1387,6 +1505,9 @@ static int parse_pred(const char *str, void *data,
err_free:
kfree(pred);
return -EINVAL;
+err_mem:
+ kfree(pred);
+ return -ENOMEM;
}
enum {
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 319f9c8ca7e7..5e6a988a8a51 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -217,6 +217,20 @@ static u64 hist_field_dynstring(struct hist_field *hist_field,
return (u64)(unsigned long)addr;
}
+static u64 hist_field_reldynstring(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ u32 *item = event + hist_field->field->offset;
+ u32 str_item = *item;
+ int str_loc = str_item & 0xffff;
+ char *addr = (char *)&item[1] + str_loc;
+
+ return (u64)(unsigned long)addr;
+}
+
static u64 hist_field_pstring(struct hist_field *hist_field,
struct tracing_map_elt *elt,
struct trace_buffer *buffer,
@@ -1956,8 +1970,10 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
if (field->filter_type == FILTER_STATIC_STRING) {
hist_field->fn = hist_field_string;
hist_field->size = field->size;
- } else if (field->filter_type == FILTER_DYN_STRING)
+ } else if (field->filter_type == FILTER_DYN_STRING) {
hist_field->fn = hist_field_dynstring;
+ } else if (field->filter_type == FILTER_RDYN_STRING)
+ hist_field->fn = hist_field_reldynstring;
else
hist_field->fn = hist_field_pstring;
} else {
@@ -2745,9 +2761,9 @@ static char *find_trigger_filter(struct hist_trigger_data *hist_data,
}
static struct event_command trigger_hist_cmd;
-static int event_hist_trigger_func(struct event_command *cmd_ops,
- struct trace_event_file *file,
- char *glob, char *cmd, char *param);
+static int event_hist_trigger_parse(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd, char *param);
static bool compatible_keys(struct hist_trigger_data *target_hist_data,
struct hist_trigger_data *hist_data,
@@ -2950,8 +2966,8 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
var_hist->hist_data = hist_data;
/* Create the new histogram with our variable */
- ret = event_hist_trigger_func(&trigger_hist_cmd, file,
- "", "hist", cmd);
+ ret = event_hist_trigger_parse(&trigger_hist_cmd, file,
+ "", "hist", cmd);
if (ret) {
kfree(cmd);
kfree(var_hist->cmd);
@@ -4961,7 +4977,8 @@ static inline void add_to_key(char *compound_key, void *key,
struct ftrace_event_field *field;
field = key_field->field;
- if (field->filter_type == FILTER_DYN_STRING)
+ if (field->filter_type == FILTER_DYN_STRING ||
+ field->filter_type == FILTER_RDYN_STRING)
size = *(u32 *)(rec + field->offset) >> 16;
else if (field->filter_type == FILTER_STATIC_STRING)
size = field->size;
@@ -5712,8 +5729,8 @@ static void unregister_field_var_hists(struct hist_trigger_data *hist_data)
for (i = 0; i < hist_data->n_field_var_hists; i++) {
file = hist_data->field_var_hists[i]->hist_data->event_file;
cmd = hist_data->field_var_hists[i]->cmd;
- ret = event_hist_trigger_func(&trigger_hist_cmd, file,
- "!hist", "hist", cmd);
+ ret = event_hist_trigger_parse(&trigger_hist_cmd, file,
+ "!hist", "hist", cmd);
WARN_ON_ONCE(ret < 0);
}
}
@@ -5742,7 +5759,7 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops,
}
static struct event_trigger_ops event_hist_trigger_ops = {
- .func = event_hist_trigger,
+ .trigger = event_hist_trigger,
.print = event_hist_trigger_print,
.init = event_hist_trigger_init,
.free = event_hist_trigger_free,
@@ -5776,7 +5793,7 @@ static void event_hist_trigger_named_free(struct event_trigger_ops *ops,
}
static struct event_trigger_ops event_hist_trigger_named_ops = {
- .func = event_hist_trigger,
+ .trigger = event_hist_trigger,
.print = event_hist_trigger_print,
.init = event_hist_trigger_named_init,
.free = event_hist_trigger_named_free,
@@ -5893,7 +5910,7 @@ static bool hist_trigger_match(struct event_trigger_data *data,
return true;
}
-static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
+static int hist_register_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file)
{
@@ -6045,7 +6062,7 @@ static bool hist_trigger_check_refs(struct event_trigger_data *data,
return false;
}
-static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
+static void hist_unregister_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file)
{
@@ -6129,9 +6146,9 @@ static void hist_unreg_all(struct trace_event_file *file)
}
}
-static int event_hist_trigger_func(struct event_command *cmd_ops,
- struct trace_event_file *file,
- char *glob, char *cmd, char *param)
+static int event_hist_trigger_parse(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd, char *param)
{
unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT;
struct event_trigger_data *trigger_data;
@@ -6245,7 +6262,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
goto out_free;
}
- cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
+ cmd_ops->unreg(glob+1, trigger_data, file);
se_name = trace_event_name(file->event_call);
se = find_synth_event(se_name);
if (se)
@@ -6254,7 +6271,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
goto out_free;
}
- ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
+ ret = cmd_ops->reg(glob, trigger_data, file);
/*
* The above returns on success the # of triggers registered,
* but if it didn't register any it returns zero. Consider no
@@ -6297,7 +6314,7 @@ enable:
return ret;
out_unreg:
- cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
+ cmd_ops->unreg(glob+1, trigger_data, file);
out_free:
if (cmd_ops->set_filter)
cmd_ops->set_filter(NULL, trigger_data, NULL);
@@ -6314,7 +6331,7 @@ static struct event_command trigger_hist_cmd = {
.name = "hist",
.trigger_type = ETT_EVENT_HIST,
.flags = EVENT_CMD_FL_NEEDS_REC,
- .func = event_hist_trigger_func,
+ .parse = event_hist_trigger_parse,
.reg = hist_register_trigger,
.unreg = hist_unregister_trigger,
.unreg_all = hist_unreg_all,
@@ -6366,28 +6383,28 @@ hist_enable_count_trigger(struct event_trigger_data *data,
}
static struct event_trigger_ops hist_enable_trigger_ops = {
- .func = hist_enable_trigger,
+ .trigger = hist_enable_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
static struct event_trigger_ops hist_enable_count_trigger_ops = {
- .func = hist_enable_count_trigger,
+ .trigger = hist_enable_count_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
static struct event_trigger_ops hist_disable_trigger_ops = {
- .func = hist_enable_trigger,
+ .trigger = hist_enable_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
static struct event_trigger_ops hist_disable_count_trigger_ops = {
- .func = hist_enable_count_trigger,
+ .trigger = hist_enable_count_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
@@ -6429,7 +6446,7 @@ static void hist_enable_unreg_all(struct trace_event_file *file)
static struct event_command trigger_hist_enable_cmd = {
.name = ENABLE_HIST_STR,
.trigger_type = ETT_HIST_ENABLE,
- .func = event_enable_trigger_func,
+ .parse = event_enable_trigger_parse,
.reg = event_enable_register_trigger,
.unreg = event_enable_unregister_trigger,
.unreg_all = hist_enable_unreg_all,
@@ -6440,7 +6457,7 @@ static struct event_command trigger_hist_enable_cmd = {
static struct event_command trigger_hist_disable_cmd = {
.name = DISABLE_HIST_STR,
.trigger_type = ETT_HIST_ENABLE,
- .func = event_enable_trigger_func,
+ .parse = event_enable_trigger_parse,
.reg = event_enable_register_trigger,
.unreg = event_enable_unregister_trigger,
.unreg_all = hist_enable_unreg_all,
diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c
index c188045c5f97..d6b4935a78c0 100644
--- a/kernel/trace/trace_events_inject.c
+++ b/kernel/trace/trace_events_inject.c
@@ -168,10 +168,14 @@ static void *trace_alloc_entry(struct trace_event_call *call, int *size)
continue;
if (field->filter_type == FILTER_STATIC_STRING)
continue;
- if (field->filter_type == FILTER_DYN_STRING) {
+ if (field->filter_type == FILTER_DYN_STRING ||
+ field->filter_type == FILTER_RDYN_STRING) {
u32 *str_item;
int str_loc = entry_size & 0xffff;
+ if (field->filter_type == FILTER_RDYN_STRING)
+ str_loc -= field->offset + field->size;
+
str_item = (u32 *)(entry + field->offset);
*str_item = str_loc; /* string length is 0. */
} else {
@@ -214,7 +218,8 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry)
if (field->filter_type == FILTER_STATIC_STRING) {
strlcpy(entry + field->offset, addr, field->size);
- } else if (field->filter_type == FILTER_DYN_STRING) {
+ } else if (field->filter_type == FILTER_DYN_STRING ||
+ field->filter_type == FILTER_RDYN_STRING) {
int str_len = strlen(addr) + 1;
int str_loc = entry_size & 0xffff;
u32 *str_item;
@@ -229,6 +234,8 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry)
strlcpy(entry + (entry_size - str_len), addr, str_len);
str_item = (u32 *)(entry + field->offset);
+ if (field->filter_type == FILTER_RDYN_STRING)
+ str_loc -= field->offset + field->size;
*str_item = (str_len << 16) | str_loc;
} else {
char **paddr;
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index ca9c13b2ecf4..154db74dadbc 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -1979,7 +1979,7 @@ EXPORT_SYMBOL_GPL(synth_event_add_next_val);
/**
* synth_event_add_val - Add a named field's value to an open synth trace
* @field_name: The name of the synthetic event field value to set
- * @val: The value to set the next field to
+ * @val: The value to set the named field to
* @trace_state: A pointer to object tracking the piecewise trace state
*
* Set the value of the named field in an event that's been opened by
@@ -2054,6 +2054,13 @@ static int create_synth_event(const char *raw_command)
last_cmd_set(raw_command);
+ name = raw_command;
+
+ /* Don't try to process if not our system */
+ if (name[0] != 's' || name[1] != ':')
+ return -ECANCELED;
+ name += 2;
+
p = strpbrk(raw_command, " \t");
if (!p) {
synth_err(SYNTH_ERR_INVALID_CMD, 0);
@@ -2062,12 +2069,6 @@ static int create_synth_event(const char *raw_command)
fields = skip_spaces(p);
- name = raw_command;
-
- if (name[0] != 's' || name[1] != ':')
- return -ECANCELED;
- name += 2;
-
/* This interface accepts group name prefix */
if (strchr(name, '/')) {
len = str_has_prefix(name, SYNTH_SYSTEM "/");
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 3d5c07239a2a..d00fee705f9c 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -68,7 +68,7 @@ event_triggers_call(struct trace_event_file *file,
if (data->paused)
continue;
if (!rec) {
- data->ops->func(data, buffer, rec, event);
+ data->ops->trigger(data, buffer, rec, event);
continue;
}
filter = rcu_dereference_sched(data->filter);
@@ -78,7 +78,7 @@ event_triggers_call(struct trace_event_file *file,
tt |= data->cmd_ops->trigger_type;
continue;
}
- data->ops->func(data, buffer, rec, event);
+ data->ops->trigger(data, buffer, rec, event);
}
return tt;
}
@@ -106,7 +106,7 @@ event_triggers_post_call(struct trace_event_file *file,
if (data->paused)
continue;
if (data->cmd_ops->trigger_type & tt)
- data->ops->func(data, NULL, NULL, NULL);
+ data->ops->trigger(data, NULL, NULL, NULL);
}
}
EXPORT_SYMBOL_GPL(event_triggers_post_call);
@@ -245,7 +245,7 @@ int trigger_process_regex(struct trace_event_file *file, char *buff)
mutex_lock(&trigger_cmd_mutex);
list_for_each_entry(p, &trigger_commands, list) {
if (strcmp(p->name, command) == 0) {
- ret = p->func(p, file, buff, command, next);
+ ret = p->parse(p, file, buff, command, next);
goto out_unlock;
}
}
@@ -540,7 +540,6 @@ void update_cond_flag(struct trace_event_file *file)
/**
* register_trigger - Generic event_command @reg implementation
* @glob: The raw string used to register the trigger
- * @ops: The trigger ops associated with the trigger
* @data: Trigger-specific data to associate with the trigger
* @file: The trace_event_file associated with the event
*
@@ -551,7 +550,7 @@ void update_cond_flag(struct trace_event_file *file)
*
* Return: 0 on success, errno otherwise
*/
-static int register_trigger(char *glob, struct event_trigger_ops *ops,
+static int register_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file)
{
@@ -589,7 +588,6 @@ out:
/**
* unregister_trigger - Generic event_command @unreg implementation
* @glob: The raw string used to register the trigger
- * @ops: The trigger ops associated with the trigger
* @test: Trigger-specific data used to find the trigger to remove
* @file: The trace_event_file associated with the event
*
@@ -598,7 +596,7 @@ out:
* Usually used directly as the @unreg method in event command
* implementations.
*/
-static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
+static void unregister_trigger(char *glob,
struct event_trigger_data *test,
struct trace_event_file *file)
{
@@ -621,8 +619,350 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
data->ops->free(data->ops, data);
}
+/*
+ * Event trigger parsing helper functions.
+ *
+ * These functions help make it easier to write an event trigger
+ * parsing function i.e. the struct event_command.parse() callback
+ * function responsible for parsing and registering a trigger command
+ * written to the 'trigger' file.
+ *
+ * A trigger command (or just 'trigger' for short) takes the form:
+ * [trigger] [if filter]
+ *
+ * The struct event_command.parse() callback (and other struct
+ * event_command functions) refer to several components of a trigger
+ * command. Those same components are referenced by the event trigger
+ * parsing helper functions defined below. These components are:
+ *
+ * cmd - the trigger command name
+ * glob - the trigger command name optionally prefaced with '!'
+ * param_and_filter - text following cmd and ':'
+ * param - text following cmd and ':' and stripped of filter
+ * filter - the optional filter text following (and including) 'if'
+ *
+ * To illustrate the use of these components, here are some concrete
+ * examples. For the following triggers:
+ *
+ * echo 'traceon:5 if pid == 0' > trigger
+ * - 'traceon' is both cmd and glob
+ * - '5 if pid == 0' is the param_and_filter
+ * - '5' is the param
+ * - 'if pid == 0' is the filter
+ *
+ * echo 'enable_event:sys:event:n' > trigger
+ * - 'enable_event' is both cmd and glob
+ * - 'sys:event:n' is the param_and_filter
+ * - 'sys:event:n' is the param
+ * - there is no filter
+ *
+ * echo 'hist:keys=pid if prio > 50' > trigger
+ * - 'hist' is both cmd and glob
+ * - 'keys=pid if prio > 50' is the param_and_filter
+ * - 'keys=pid' is the param
+ * - 'if prio > 50' is the filter
+ *
+ * echo '!enable_event:sys:event:n' > trigger
+ * - 'enable_event' is the cmd
+ * - '!enable_event' is the glob
+ * - 'sys:event:n' is the param_and_filter
+ * - 'sys:event:n' is the param
+ * - there is no filter
+ *
+ * echo 'traceoff' > trigger
+ * - 'traceoff' is both cmd and glob
+ * - there is no param_and_filter
+ * - there is no param
+ * - there is no filter
+ *
+ * There are a few different categories of event trigger covered by
+ * these helpers:
+ *
+ * - triggers that don't require a parameter e.g. traceon
+ * - triggers that do require a parameter e.g. enable_event and hist
+ * - triggers that though they may not require a param may support an
+ * optional 'n' param (n = number of times the trigger should fire)
+ * e.g.: traceon:5 or enable_event:sys:event:n
+ * - triggers that do not support an 'n' param e.g. hist
+ *
+ * These functions can be used or ignored as necessary - it all
+ * depends on the complexity of the trigger, and the granularity of
+ * the functions supported reflects the fact that some implementations
+ * may need to customize certain aspects of their implementations and
+ * won't need certain functions. For instance, the hist trigger
+ * implementation doesn't use event_trigger_separate_filter() because
+ * it has special requirements for handling the filter.
+ */
+
+/**
+ * event_trigger_check_remove - check whether an event trigger specifies remove
+ * @glob: The trigger command string, with optional remove(!) operator
+ *
+ * The event trigger callback implementations pass in 'glob' as a
+ * parameter. This is the command name either with or without a
+ * remove(!) operator. This function simply parses the glob and
+ * determines whether the command corresponds to a trigger removal or
+ * a trigger addition.
+ *
+ * Return: true if this is a remove command, false otherwise
+ */
+bool event_trigger_check_remove(const char *glob)
+{
+ return (glob && glob[0] == '!') ? true : false;
+}
+
+/**
+ * event_trigger_empty_param - check whether the param is empty
+ * @param: The trigger param string
+ *
+ * The event trigger callback implementations pass in 'param' as a
+ * parameter. This corresponds to the string following the command
+ * name minus the command name. This function can be called by a
+ * callback implementation for any command that requires a param; a
+ * callback that doesn't require a param can ignore it.
+ *
+ * Return: true if this is an empty param, false otherwise
+ */
+bool event_trigger_empty_param(const char *param)
+{
+ return !param;
+}
+
+/**
+ * event_trigger_separate_filter - separate an event trigger from a filter
+ * @param_and_filter: String containing trigger and possibly filter
+ * @param: outparam, will be filled with a pointer to the trigger
+ * @filter: outparam, will be filled with a pointer to the filter
+ * @param_required: Specifies whether or not the param string is required
+ *
+ * Given a param string of the form '[trigger] [if filter]', this
+ * function separates the filter from the trigger and returns the
+ * trigger in @param and the filter in @filter. Either the @param
+ * or the @filter may be set to NULL by this function - if not set to
+ * NULL, they will contain strings corresponding to the trigger and
+ * filter.
+ *
+ * There are two cases that need to be handled with respect to the
+ * passed-in param: either the param is required, or it is not
+ * required. If @param_required is set, and there's no param, it will
+ * return -EINVAL. If @param_required is not set and there's a param
+ * that starts with a number, that corresponds to the case of a
+ * trigger with :n (n = number of times the trigger should fire) and
+ * the parsing continues normally; otherwise the function just returns
+ * and assumes param just contains a filter and there's nothing else
+ * to do.
+ *
+ * Return: 0 on success, errno otherwise
+ */
+int event_trigger_separate_filter(char *param_and_filter, char **param,
+ char **filter, bool param_required)
+{
+ int ret = 0;
+
+ *param = *filter = NULL;
+
+ if (!param_and_filter) {
+ if (param_required)
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Here we check for an optional param. The only legal
+ * optional param is :n, and if that's the case, continue
+ * below. Otherwise we assume what's left is a filter and
+ * return it as the filter string for the caller to deal with.
+ */
+ if (!param_required && param_and_filter && !isdigit(param_and_filter[0])) {
+ *filter = param_and_filter;
+ goto out;
+ }
+
+ /*
+ * Separate the param from the filter (param [if filter]).
+ * Here we have either an optional :n param or a required
+ * param and an optional filter.
+ */
+ *param = strsep(&param_and_filter, " \t");
+
+ /*
+ * Here we have a filter, though it may be empty.
+ */
+ if (param_and_filter) {
+ *filter = skip_spaces(param_and_filter);
+ if (!**filter)
+ *filter = NULL;
+ }
+out:
+ return ret;
+}
+
+/**
+ * event_trigger_alloc - allocate and init event_trigger_data for a trigger
+ * @cmd_ops: The event_command operations for the trigger
+ * @cmd: The cmd string
+ * @param: The param string
+ * @private_data: User data to associate with the event trigger
+ *
+ * Allocate an event_trigger_data instance and initialize it. The
+ * @cmd_ops are used along with the @cmd and @param to get the
+ * trigger_ops to assign to the event_trigger_data. @private_data can
+ * also be passed in and associated with the event_trigger_data.
+ *
+ * Use event_trigger_free() to free an event_trigger_data object.
+ *
+ * Return: The trigger_data object on success, NULL otherwise
+ */
+struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops,
+ char *cmd,
+ char *param,
+ void *private_data)
+{
+ struct event_trigger_data *trigger_data;
+ struct event_trigger_ops *trigger_ops;
+
+ trigger_ops = cmd_ops->get_trigger_ops(cmd, param);
+
+ trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
+ if (!trigger_data)
+ return NULL;
+
+ trigger_data->count = -1;
+ trigger_data->ops = trigger_ops;
+ trigger_data->cmd_ops = cmd_ops;
+ trigger_data->private_data = private_data;
+
+ INIT_LIST_HEAD(&trigger_data->list);
+ INIT_LIST_HEAD(&trigger_data->named_list);
+ RCU_INIT_POINTER(trigger_data->filter, NULL);
+
+ return trigger_data;
+}
+
+/**
+ * event_trigger_parse_num - parse and return the number param for a trigger
+ * @param: The param string
+ * @trigger_data: The trigger_data for the trigger
+ *
+ * Parse the :n (n = number of times the trigger should fire) param
+ * and set the count variable in the trigger_data to the parsed count.
+ *
+ * Return: 0 on success, errno otherwise
+ */
+int event_trigger_parse_num(char *param,
+ struct event_trigger_data *trigger_data)
+{
+ char *number;
+ int ret = 0;
+
+ if (param) {
+ number = strsep(&param, ":");
+
+ if (!strlen(number))
+ return -EINVAL;
+
+ /*
+ * We use the callback data field (which is a pointer)
+ * as our counter.
+ */
+ ret = kstrtoul(number, 0, &trigger_data->count);
+ }
+
+ return ret;
+}
+
+/**
+ * event_trigger_set_filter - set an event trigger's filter
+ * @cmd_ops: The event_command operations for the trigger
+ * @file: The event file for the trigger's event
+ * @param: The string containing the filter
+ * @trigger_data: The trigger_data for the trigger
+ *
+ * Set the filter for the trigger. If the filter is NULL, just return
+ * without error.
+ *
+ * Return: 0 on success, errno otherwise
+ */
+int event_trigger_set_filter(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *param,
+ struct event_trigger_data *trigger_data)
+{
+ if (param && cmd_ops->set_filter)
+ return cmd_ops->set_filter(param, trigger_data, file);
+
+ return 0;
+}
+
+/**
+ * event_trigger_reset_filter - reset an event trigger's filter
+ * @cmd_ops: The event_command operations for the trigger
+ * @trigger_data: The trigger_data for the trigger
+ *
+ * Reset the filter for the trigger to no filter.
+ */
+void event_trigger_reset_filter(struct event_command *cmd_ops,
+ struct event_trigger_data *trigger_data)
+{
+ if (cmd_ops->set_filter)
+ cmd_ops->set_filter(NULL, trigger_data, NULL);
+}
+
+/**
+ * event_trigger_register - register an event trigger
+ * @cmd_ops: The event_command operations for the trigger
+ * @file: The event file for the trigger's event
+ * @glob: The trigger command string, with optional remove(!) operator
+ * @cmd: The cmd string
+ * @param: The param string
+ * @trigger_data: The trigger_data for the trigger
+ * @n_registered: optional outparam, the number of triggers registered
+ *
+ * Register an event trigger. The @cmd_ops are used to call the
+ * cmd_ops->reg() function which actually does the registration. The
+ * cmd_ops->reg() function returns the number of triggers registered,
+ * which is assigned to n_registered, if n_registered is non-NULL.
+ *
+ * Return: 0 on success, errno otherwise
+ */
+int event_trigger_register(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob,
+ char *cmd,
+ char *param,
+ struct event_trigger_data *trigger_data,
+ int *n_registered)
+{
+ int ret;
+
+ if (n_registered)
+ *n_registered = 0;
+
+ ret = cmd_ops->reg(glob, trigger_data, file);
+ /*
+ * The above returns on success the # of functions enabled,
+ * but if it didn't find any functions it returns zero.
+ * Consider no functions a failure too.
+ */
+ if (!ret) {
+ cmd_ops->unreg(glob, trigger_data, file);
+ ret = -ENOENT;
+ } else if (ret > 0) {
+ if (n_registered)
+ *n_registered = ret;
+ /* Just return zero, not the number of enabled functions */
+ ret = 0;
+ }
+
+ return ret;
+}
+
+/*
+ * End event trigger parsing helper functions.
+ */
+
/**
- * event_trigger_callback - Generic event_command @func implementation
+ * event_trigger_parse - Generic event_command @parse implementation
* @cmd_ops: The command ops, used for trigger registration
* @file: The trace_event_file associated with the event
* @glob: The raw string used to register the trigger
@@ -632,15 +972,15 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
* Common implementation for event command parsing and trigger
* instantiation.
*
- * Usually used directly as the @func method in event command
+ * Usually used directly as the @parse method in event command
* implementations.
*
* Return: 0 on success, errno otherwise
*/
static int
-event_trigger_callback(struct event_command *cmd_ops,
- struct trace_event_file *file,
- char *glob, char *cmd, char *param)
+event_trigger_parse(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd, char *param)
{
struct event_trigger_data *trigger_data;
struct event_trigger_ops *trigger_ops;
@@ -673,7 +1013,7 @@ event_trigger_callback(struct event_command *cmd_ops,
INIT_LIST_HEAD(&trigger_data->named_list);
if (glob[0] == '!') {
- cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
+ cmd_ops->unreg(glob+1, trigger_data, file);
kfree(trigger_data);
ret = 0;
goto out;
@@ -708,14 +1048,14 @@ event_trigger_callback(struct event_command *cmd_ops,
out_reg:
/* Up the trigger_data count to make sure reg doesn't free it on failure */
event_trigger_init(trigger_ops, trigger_data);
- ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
+ ret = cmd_ops->reg(glob, trigger_data, file);
/*
* The above returns on success the # of functions enabled,
* but if it didn't find any functions it returns zero.
* Consider no functions a failure too.
*/
if (!ret) {
- cmd_ops->unreg(glob, trigger_ops, trigger_data, file);
+ cmd_ops->unreg(glob, trigger_data, file);
ret = -ENOENT;
} else if (ret > 0)
ret = 0;
@@ -1023,28 +1363,28 @@ traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
}
static struct event_trigger_ops traceon_trigger_ops = {
- .func = traceon_trigger,
+ .trigger = traceon_trigger,
.print = traceon_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
static struct event_trigger_ops traceon_count_trigger_ops = {
- .func = traceon_count_trigger,
+ .trigger = traceon_count_trigger,
.print = traceon_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
static struct event_trigger_ops traceoff_trigger_ops = {
- .func = traceoff_trigger,
+ .trigger = traceoff_trigger,
.print = traceoff_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
static struct event_trigger_ops traceoff_count_trigger_ops = {
- .func = traceoff_count_trigger,
+ .trigger = traceoff_count_trigger,
.print = traceoff_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
@@ -1069,7 +1409,7 @@ onoff_get_trigger_ops(char *cmd, char *param)
static struct event_command trigger_traceon_cmd = {
.name = "traceon",
.trigger_type = ETT_TRACE_ONOFF,
- .func = event_trigger_callback,
+ .parse = event_trigger_parse,
.reg = register_trigger,
.unreg = unregister_trigger,
.get_trigger_ops = onoff_get_trigger_ops,
@@ -1080,7 +1420,7 @@ static struct event_command trigger_traceoff_cmd = {
.name = "traceoff",
.trigger_type = ETT_TRACE_ONOFF,
.flags = EVENT_CMD_FL_POST_TRIGGER,
- .func = event_trigger_callback,
+ .parse = event_trigger_parse,
.reg = register_trigger,
.unreg = unregister_trigger,
.get_trigger_ops = onoff_get_trigger_ops,
@@ -1116,14 +1456,14 @@ snapshot_count_trigger(struct event_trigger_data *data,
}
static int
-register_snapshot_trigger(char *glob, struct event_trigger_ops *ops,
+register_snapshot_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file)
{
if (tracing_alloc_snapshot_instance(file->tr) != 0)
return 0;
- return register_trigger(glob, ops, data, file);
+ return register_trigger(glob, data, file);
}
static int
@@ -1135,14 +1475,14 @@ snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
}
static struct event_trigger_ops snapshot_trigger_ops = {
- .func = snapshot_trigger,
+ .trigger = snapshot_trigger,
.print = snapshot_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
static struct event_trigger_ops snapshot_count_trigger_ops = {
- .func = snapshot_count_trigger,
+ .trigger = snapshot_count_trigger,
.print = snapshot_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
@@ -1157,7 +1497,7 @@ snapshot_get_trigger_ops(char *cmd, char *param)
static struct event_command trigger_snapshot_cmd = {
.name = "snapshot",
.trigger_type = ETT_SNAPSHOT,
- .func = event_trigger_callback,
+ .parse = event_trigger_parse,
.reg = register_snapshot_trigger,
.unreg = unregister_trigger,
.get_trigger_ops = snapshot_get_trigger_ops,
@@ -1226,14 +1566,14 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
}
static struct event_trigger_ops stacktrace_trigger_ops = {
- .func = stacktrace_trigger,
+ .trigger = stacktrace_trigger,
.print = stacktrace_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
static struct event_trigger_ops stacktrace_count_trigger_ops = {
- .func = stacktrace_count_trigger,
+ .trigger = stacktrace_count_trigger,
.print = stacktrace_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
@@ -1249,7 +1589,7 @@ static struct event_command trigger_stacktrace_cmd = {
.name = "stacktrace",
.trigger_type = ETT_STACKTRACE,
.flags = EVENT_CMD_FL_POST_TRIGGER,
- .func = event_trigger_callback,
+ .parse = event_trigger_parse,
.reg = register_trigger,
.unreg = unregister_trigger,
.get_trigger_ops = stacktrace_get_trigger_ops,
@@ -1353,36 +1693,36 @@ void event_enable_trigger_free(struct event_trigger_ops *ops,
}
static struct event_trigger_ops event_enable_trigger_ops = {
- .func = event_enable_trigger,
+ .trigger = event_enable_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
static struct event_trigger_ops event_enable_count_trigger_ops = {
- .func = event_enable_count_trigger,
+ .trigger = event_enable_count_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
static struct event_trigger_ops event_disable_trigger_ops = {
- .func = event_enable_trigger,
+ .trigger = event_enable_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
static struct event_trigger_ops event_disable_count_trigger_ops = {
- .func = event_enable_count_trigger,
+ .trigger = event_enable_count_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
-int event_enable_trigger_func(struct event_command *cmd_ops,
- struct trace_event_file *file,
- char *glob, char *cmd, char *param)
+int event_enable_trigger_parse(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd, char *param)
{
struct trace_event_file *event_enable_file;
struct enable_trigger_data *enable_data;
@@ -1455,7 +1795,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
trigger_data->private_data = enable_data;
if (glob[0] == '!') {
- cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
+ cmd_ops->unreg(glob+1, trigger_data, file);
kfree(trigger_data);
kfree(enable_data);
ret = 0;
@@ -1502,7 +1842,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
ret = trace_event_enable_disable(event_enable_file, 1, 1);
if (ret < 0)
goto out_put;
- ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
+ ret = cmd_ops->reg(glob, trigger_data, file);
/*
* The above returns on success the # of functions enabled,
* but if it didn't find any functions it returns zero.
@@ -1532,7 +1872,6 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
}
int event_enable_register_trigger(char *glob,
- struct event_trigger_ops *ops,
struct event_trigger_data *data,
struct trace_event_file *file)
{
@@ -1574,7 +1913,6 @@ out:
}
void event_enable_unregister_trigger(char *glob,
- struct event_trigger_ops *ops,
struct event_trigger_data *test,
struct trace_event_file *file)
{
@@ -1628,7 +1966,7 @@ event_enable_get_trigger_ops(char *cmd, char *param)
static struct event_command trigger_enable_cmd = {
.name = ENABLE_EVENT_STR,
.trigger_type = ETT_EVENT_ENABLE,
- .func = event_enable_trigger_func,
+ .parse = event_enable_trigger_parse,
.reg = event_enable_register_trigger,
.unreg = event_enable_unregister_trigger,
.get_trigger_ops = event_enable_get_trigger_ops,
@@ -1638,7 +1976,7 @@ static struct event_command trigger_enable_cmd = {
static struct event_command trigger_disable_cmd = {
.name = DISABLE_EVENT_STR,
.trigger_type = ETT_EVENT_ENABLE,
- .func = event_enable_trigger_func,
+ .parse = event_enable_trigger_parse,
.reg = event_enable_register_trigger,
.unreg = event_enable_unregister_trigger,
.get_trigger_ops = event_enable_get_trigger_ops,
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 56bb7b890578..d440ddd5fd8b 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -491,18 +491,14 @@ static void stop_per_cpu_kthreads(void)
static int start_cpu_kthread(unsigned int cpu)
{
struct task_struct *kthread;
- char comm[24];
- snprintf(comm, 24, "hwlatd/%d", cpu);
-
- kthread = kthread_create_on_cpu(kthread_fn, NULL, cpu, comm);
+ kthread = kthread_run_on_cpu(kthread_fn, NULL, cpu, "hwlatd/%u");
if (IS_ERR(kthread)) {
pr_err(BANNER "could not start sampling thread\n");
return -ENOMEM;
}
per_cpu(hwlat_per_cpu_data, cpu).kthread = kthread;
- wake_up_process(kthread);
return 0;
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 4e1257f50aa3..508f14af4f2c 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -328,11 +328,9 @@ static inline int __enable_trace_kprobe(struct trace_kprobe *tk)
static void __disable_trace_kprobe(struct trace_probe *tp)
{
- struct trace_probe *pos;
struct trace_kprobe *tk;
- list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
- tk = container_of(pos, struct trace_kprobe, tp);
+ list_for_each_entry(tk, trace_probe_probe_list(tp), tp.list) {
if (!trace_kprobe_is_registered(tk))
continue;
if (trace_kprobe_is_return(tk))
@@ -349,7 +347,7 @@ static void __disable_trace_kprobe(struct trace_probe *tp)
static int enable_trace_kprobe(struct trace_event_call *call,
struct trace_event_file *file)
{
- struct trace_probe *pos, *tp;
+ struct trace_probe *tp;
struct trace_kprobe *tk;
bool enabled;
int ret = 0;
@@ -370,8 +368,7 @@ static int enable_trace_kprobe(struct trace_event_call *call,
if (enabled)
return 0;
- list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
- tk = container_of(pos, struct trace_kprobe, tp);
+ list_for_each_entry(tk, trace_probe_probe_list(tp), tp.list) {
if (trace_kprobe_has_gone(tk))
continue;
ret = __enable_trace_kprobe(tk);
@@ -560,11 +557,9 @@ static bool trace_kprobe_has_same_kprobe(struct trace_kprobe *orig,
struct trace_kprobe *comp)
{
struct trace_probe_event *tpe = orig->tp.event;
- struct trace_probe *pos;
int i;
- list_for_each_entry(pos, &tpe->probes, list) {
- orig = container_of(pos, struct trace_kprobe, tp);
+ list_for_each_entry(orig, &tpe->probes, tp.list) {
if (strcmp(trace_kprobe_symbol(orig),
trace_kprobe_symbol(comp)) ||
trace_kprobe_offset(orig) != trace_kprobe_offset(comp))
@@ -1176,15 +1171,18 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
{
struct dyn_event *ev = v;
struct trace_kprobe *tk;
+ unsigned long nmissed;
if (!is_trace_kprobe(ev))
return 0;
tk = to_trace_kprobe(ev);
+ nmissed = trace_kprobe_is_return(tk) ?
+ tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed;
seq_printf(m, " %-44s %15lu %15lu\n",
trace_probe_name(&tk->tp),
trace_kprobe_nhit(tk),
- tk->rp.kp.nmissed);
+ nmissed);
return 0;
}
@@ -1384,17 +1382,11 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
if (trace_trigger_soft_disabled(trace_file))
return;
- fbuffer.trace_ctx = tracing_gen_ctx();
- fbuffer.trace_file = trace_file;
-
dsize = __get_data_size(&tk->tp, regs);
- fbuffer.event =
- trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file,
- call->event.type,
- sizeof(*entry) + tk->tp.size + dsize,
- fbuffer.trace_ctx);
- if (!fbuffer.event)
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file,
+ sizeof(*entry) + tk->tp.size + dsize);
+ if (!entry)
return;
fbuffer.regs = regs;
@@ -1431,16 +1423,11 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
if (trace_trigger_soft_disabled(trace_file))
return;
- fbuffer.trace_ctx = tracing_gen_ctx();
- fbuffer.trace_file = trace_file;
-
dsize = __get_data_size(&tk->tp, regs);
- fbuffer.event =
- trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file,
- call->event.type,
- sizeof(*entry) + tk->tp.size + dsize,
- fbuffer.trace_ctx);
- if (!fbuffer.event)
+
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file,
+ sizeof(*entry) + tk->tp.size + dsize);
+ if (!entry)
return;
fbuffer.regs = regs;
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 7520d43aed55..870a08da5b48 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -138,8 +138,7 @@ static void osnoise_unregister_instance(struct trace_array *tr)
if (!found)
return;
- synchronize_rcu();
- kfree(inst);
+ kvfree_rcu(inst);
}
/*
@@ -1701,7 +1700,7 @@ static int start_kthread(unsigned int cpu)
snprintf(comm, 24, "osnoise/%d", cpu);
}
- kthread = kthread_create_on_cpu(main, NULL, cpu, comm);
+ kthread = kthread_run_on_cpu(main, NULL, cpu, comm);
if (IS_ERR(kthread)) {
pr_err(BANNER "could not start sampling thread\n");
@@ -1710,7 +1709,6 @@ static int start_kthread(unsigned int cpu)
}
per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread;
- wake_up_process(kthread);
return 0;
}
@@ -2123,6 +2121,13 @@ out_unhook_irq:
return -EINVAL;
}
+static void osnoise_unhook_events(void)
+{
+ unhook_thread_events();
+ unhook_softirq_events();
+ unhook_irq_events();
+}
+
/*
* osnoise_workload_start - start the workload and hook to events
*/
@@ -2155,7 +2160,14 @@ static int osnoise_workload_start(void)
retval = start_per_cpu_kthreads();
if (retval) {
- unhook_irq_events();
+ trace_osnoise_callback_enabled = false;
+ /*
+ * Make sure that ftrace_nmi_enter/exit() see
+ * trace_osnoise_callback_enabled as false before continuing.
+ */
+ barrier();
+
+ osnoise_unhook_events();
return retval;
}
@@ -2186,9 +2198,7 @@ static void osnoise_workload_stop(void)
stop_per_cpu_kthreads();
- unhook_irq_events();
- unhook_softirq_events();
- unhook_thread_events();
+ osnoise_unhook_events();
}
static void osnoise_tracer_start(struct trace_array *tr)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 3547e7176ff7..8aa493d25c73 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -445,14 +445,18 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
char irqs_off;
int hardirq;
int softirq;
+ int bh_off;
int nmi;
nmi = entry->flags & TRACE_FLAG_NMI;
hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+ bh_off = entry->flags & TRACE_FLAG_BH_OFF;
irqs_off =
+ (entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' :
(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+ bh_off ? 'b' :
(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
'.';
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 3ed2a3f37297..73d90179b51b 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -356,6 +356,8 @@ static int __parse_imm_string(char *str, char **pbuf, int offs)
return -EINVAL;
}
*pbuf = kstrndup(str, len - 1, GFP_KERNEL);
+ if (!*pbuf)
+ return -ENOMEM;
return 0;
}
@@ -1138,8 +1140,7 @@ int trace_probe_remove_file(struct trace_probe *tp,
return -ENOENT;
list_del_rcu(&link->list);
- synchronize_rcu();
- kfree(link);
+ kvfree_rcu(link);
if (list_empty(&tp->event->files))
trace_probe_clear_flag(tp, TP_FLAG_TRACE);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8bfcd3b09422..f755bde42fd0 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -323,8 +323,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
trace_ctx = tracing_gen_ctx();
- buffer = tr->array_buffer.buffer;
- event = trace_buffer_lock_reserve(buffer,
+ event = trace_event_buffer_lock_reserve(&buffer, trace_file,
sys_data->enter_event->event.type, size, trace_ctx);
if (!event)
return;
@@ -367,8 +366,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
trace_ctx = tracing_gen_ctx();
- buffer = tr->array_buffer.buffer;
- event = trace_buffer_lock_reserve(buffer,
+ event = trace_event_buffer_lock_reserve(&buffer, trace_file,
sys_data->exit_event->event.type, sizeof(*entry),
trace_ctx);
if (!event)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 4f35514a48f3..9711589273cd 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -410,12 +410,10 @@ static bool trace_uprobe_has_same_uprobe(struct trace_uprobe *orig,
struct trace_uprobe *comp)
{
struct trace_probe_event *tpe = orig->tp.event;
- struct trace_probe *pos;
struct inode *comp_inode = d_real_inode(comp->path.dentry);
int i;
- list_for_each_entry(pos, &tpe->probes, list) {
- orig = container_of(pos, struct trace_uprobe, tp);
+ list_for_each_entry(orig, &tpe->probes, tp.list) {
if (comp_inode != d_real_inode(orig->path.dentry) ||
comp->offset != orig->offset)
continue;
@@ -950,8 +948,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
struct trace_event_file *trace_file)
{
struct uprobe_trace_entry_head *entry;
- struct trace_buffer *buffer;
- struct ring_buffer_event *event;
+ struct trace_event_buffer fbuffer;
void *data;
int size, esize;
struct trace_event_call *call = trace_probe_event_call(&tu->tp);
@@ -966,12 +963,10 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
size = esize + tu->tp.size + dsize;
- event = trace_event_buffer_lock_reserve(&buffer, trace_file,
- call->event.type, size, 0);
- if (!event)
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
+ if (!entry)
return;
- entry = ring_buffer_event_data(event);
if (is_ret_probe(tu)) {
entry->vaddr[0] = func;
entry->vaddr[1] = instruction_pointer(regs);
@@ -983,7 +978,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
memcpy(data, ucb->buf, tu->tp.size + dsize);
- event_trigger_unlock_commit(trace_file, buffer, event, entry, 0);
+ trace_event_buffer_commit(&fbuffer);
}
/* uprobe handler */
@@ -1076,14 +1071,12 @@ static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter)
static void __probe_event_disable(struct trace_probe *tp)
{
- struct trace_probe *pos;
struct trace_uprobe *tu;
tu = container_of(tp, struct trace_uprobe, tp);
WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter));
- list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
- tu = container_of(pos, struct trace_uprobe, tp);
+ list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
if (!tu->inode)
continue;
@@ -1095,7 +1088,7 @@ static void __probe_event_disable(struct trace_probe *tp)
static int probe_event_enable(struct trace_event_call *call,
struct trace_event_file *file, filter_func_t filter)
{
- struct trace_probe *pos, *tp;
+ struct trace_probe *tp;
struct trace_uprobe *tu;
bool enabled;
int ret;
@@ -1130,8 +1123,7 @@ static int probe_event_enable(struct trace_event_call *call,
if (ret)
goto err_flags;
- list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
- tu = container_of(pos, struct trace_uprobe, tp);
+ list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
ret = trace_uprobe_enable(tu, filter);
if (ret) {
__probe_event_disable(tp);
@@ -1276,7 +1268,7 @@ static bool trace_uprobe_filter_add(struct trace_uprobe_filter *filter,
static int uprobe_perf_close(struct trace_event_call *call,
struct perf_event *event)
{
- struct trace_probe *pos, *tp;
+ struct trace_probe *tp;
struct trace_uprobe *tu;
int ret = 0;
@@ -1288,8 +1280,7 @@ static int uprobe_perf_close(struct trace_event_call *call,
if (trace_uprobe_filter_remove(tu->tp.event->filter, event))
return 0;
- list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
- tu = container_of(pos, struct trace_uprobe, tp);
+ list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
if (ret)
break;
@@ -1301,7 +1292,7 @@ static int uprobe_perf_close(struct trace_event_call *call,
static int uprobe_perf_open(struct trace_event_call *call,
struct perf_event *event)
{
- struct trace_probe *pos, *tp;
+ struct trace_probe *tp;
struct trace_uprobe *tu;
int err = 0;
@@ -1313,8 +1304,7 @@ static int uprobe_perf_open(struct trace_event_call *call,
if (trace_uprobe_filter_add(tu->tp.event->filter, event))
return 0;
- list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
- tu = container_of(pos, struct trace_uprobe, tp);
+ list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
if (err) {
uprobe_perf_close(call, event);
@@ -1620,6 +1610,11 @@ create_local_trace_uprobe(char *name, unsigned long offs,
tu->path = path;
tu->ref_ctr_offset = ref_ctr_offset;
tu->filename = kstrdup(name, GFP_KERNEL);
+ if (!tu->filename) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
init_trace_event_call(tu);
ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index f00de83d0246..1d261fbe367b 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -38,11 +38,10 @@ void bacct_add_tsk(struct user_namespace *user_ns,
stats->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX);
stats->ac_btime64 = btime;
- if (thread_group_leader(tsk)) {
+ if (tsk->flags & PF_EXITING)
stats->ac_exitcode = tsk->exit_code;
- if (tsk->flags & PF_FORKNOEXEC)
- stats->ac_flag |= AFORK;
- }
+ if (thread_group_leader(tsk) && (tsk->flags & PF_FORKNOEXEC))
+ stats->ac_flag |= AFORK;
if (tsk->flags & PF_SUPERPRIV)
stats->ac_flag |= ASU;
if (tsk->flags & PF_DUMPCORE)