summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/arena.c2
-rw-r--r--kernel/bpf/arraymap.c2
-rw-r--r--kernel/bpf/bloom_filter.c2
-rw-r--r--kernel/bpf/bpf_insn_array.c2
-rw-r--r--kernel/bpf/bpf_local_storage.c75
-rw-r--r--kernel/bpf/cpumap.c17
-rw-r--r--kernel/bpf/devmap.c47
-rw-r--r--kernel/bpf/hashtab.c86
-rw-r--r--kernel/bpf/local_storage.c2
-rw-r--r--kernel/bpf/lpm_trie.c2
-rw-r--r--kernel/bpf/memalloc.c58
-rw-r--r--kernel/bpf/syscall.c2
-rw-r--r--kernel/bpf/tnum.c56
-rw-r--r--kernel/bpf/verifier.c30
-rw-r--r--kernel/configs/debug.config1
-rw-r--r--kernel/dma/direct.h2
-rw-r--r--kernel/events/core.c83
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/kcsan/kcsan_test.c2
-rw-r--r--kernel/liveupdate/luo_file.c41
-rw-r--r--kernel/rseq.c8
-rw-r--r--kernel/sched/core.c1
-rw-r--r--kernel/sched/ext.c4
-rw-r--r--kernel/sched/fair.c150
-rw-r--r--kernel/sched/sched.h11
-rw-r--r--kernel/time/time.c19
-rw-r--r--kernel/trace/bpf_trace.c4
27 files changed, 549 insertions, 162 deletions
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 144f30e740e8..f355cf1c1a16 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -303,7 +303,7 @@ static long arena_map_update_elem(struct bpf_map *map, void *key,
return -EOPNOTSUPP;
}
-static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf,
+static int arena_map_check_btf(struct bpf_map *map, const struct btf *btf,
const struct btf_type *key_type, const struct btf_type *value_type)
{
return 0;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 26763df6134a..33de68c95ad8 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -548,7 +548,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
-static int array_map_check_btf(const struct bpf_map *map,
+static int array_map_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index 35e1ddca74d2..b73336c976b7 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -180,7 +180,7 @@ static long bloom_map_update_elem(struct bpf_map *map, void *key,
return -EINVAL;
}
-static int bloom_map_check_btf(const struct bpf_map *map,
+static int bloom_map_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c
index c0286f25ca3c..a2f84afe6f7c 100644
--- a/kernel/bpf/bpf_insn_array.c
+++ b/kernel/bpf/bpf_insn_array.c
@@ -98,7 +98,7 @@ static long insn_array_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
}
-static int insn_array_check_btf(const struct bpf_map *map,
+static int insn_array_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index b28f07d3a0db..9c96a4477f81 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -107,14 +107,12 @@ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
- /* If RCU Tasks Trace grace period implies RCU grace period, do
- * kfree(), else do kfree_rcu().
+ /*
+ * RCU Tasks Trace grace period implies RCU grace period, do
+ * kfree() directly.
*/
local_storage = container_of(rcu, struct bpf_local_storage, rcu);
- if (rcu_trace_implies_rcu_gp())
- kfree(local_storage);
- else
- kfree_rcu(local_storage, rcu);
+ kfree(local_storage);
}
/* Handle use_kmalloc_nolock == false */
@@ -138,10 +136,11 @@ static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
- if (rcu_trace_implies_rcu_gp())
- bpf_local_storage_free_rcu(rcu);
- else
- call_rcu(rcu, bpf_local_storage_free_rcu);
+ /*
+ * RCU Tasks Trace grace period implies RCU grace period, so
+ * invoke bpf_local_storage_free_rcu() directly.
+ */
+ bpf_local_storage_free_rcu(rcu);
}
static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
@@ -164,16 +163,29 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
bpf_local_storage_free_trace_rcu);
}
-/* rcu tasks trace callback for use_kmalloc_nolock == false */
-static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+/* rcu callback for use_kmalloc_nolock == false */
+static void __bpf_selem_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
- if (rcu_trace_implies_rcu_gp())
- kfree(selem);
- else
- kfree_rcu(selem, rcu);
+ /* bpf_selem_unlink_nofail may have already cleared smap and freed fields. */
+ smap = rcu_dereference_check(SDATA(selem)->smap, 1);
+
+ if (smap)
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ kfree(selem);
+}
+
+/* rcu tasks trace callback for use_kmalloc_nolock == false */
+static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+{
+ /*
+ * RCU Tasks Trace grace period implies RCU grace period, so
+ * invoke __bpf_selem_free_rcu() directly.
+ */
+ __bpf_selem_free_rcu(rcu);
}
/* Handle use_kmalloc_nolock == false */
@@ -181,7 +193,7 @@ static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
bool vanilla_rcu)
{
if (vanilla_rcu)
- kfree_rcu(selem, rcu);
+ call_rcu(&selem->rcu, __bpf_selem_free_rcu);
else
call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
}
@@ -195,37 +207,29 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
/* The bpf_local_storage_map_free will wait for rcu_barrier */
smap = rcu_dereference_check(SDATA(selem)->smap, 1);
- if (smap) {
- migrate_disable();
+ if (smap)
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
- migrate_enable();
- }
kfree_nolock(selem);
}
static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{
- if (rcu_trace_implies_rcu_gp())
- bpf_selem_free_rcu(rcu);
- else
- call_rcu(rcu, bpf_selem_free_rcu);
+ /*
+ * RCU Tasks Trace grace period implies RCU grace period, so
+ * invoke bpf_selem_free_rcu() directly.
+ */
+ bpf_selem_free_rcu(rcu);
}
void bpf_selem_free(struct bpf_local_storage_elem *selem,
bool reuse_now)
{
- struct bpf_local_storage_map *smap;
-
- smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
-
if (!selem->use_kmalloc_nolock) {
/*
* No uptr will be unpin even when reuse_now == false since uptr
* is only supported in task local storage, where
* smap->use_kmalloc_nolock == true.
*/
- if (smap)
- bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
__bpf_selem_free(selem, reuse_now);
return;
}
@@ -797,7 +801,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
return 0;
}
-int bpf_local_storage_map_check_btf(const struct bpf_map *map,
+int bpf_local_storage_map_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
@@ -958,10 +962,9 @@ restart:
*/
synchronize_rcu();
- if (smap->use_kmalloc_nolock) {
- rcu_barrier_tasks_trace();
- rcu_barrier();
- }
+ /* smap remains in use regardless of kmalloc_nolock, so wait unconditionally. */
+ rcu_barrier_tasks_trace();
+ rcu_barrier();
kvfree(smap->buckets);
bpf_map_area_free(smap);
}
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 04171fbc39cb..32b43cb9061b 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -29,6 +29,7 @@
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
+#include <linux/local_lock.h>
#include <linux/completion.h>
#include <trace/events/xdp.h>
#include <linux/btf_ids.h>
@@ -52,6 +53,7 @@ struct xdp_bulk_queue {
struct list_head flush_node;
struct bpf_cpu_map_entry *obj;
unsigned int count;
+ local_lock_t bq_lock;
};
/* Struct for every remote "destination" CPU in map */
@@ -451,6 +453,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
for_each_possible_cpu(i) {
bq = per_cpu_ptr(rcpu->bulkq, i);
bq->obj = rcpu;
+ local_lock_init(&bq->bq_lock);
}
/* Alloc queue */
@@ -722,6 +725,8 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
struct ptr_ring *q;
int i;
+ lockdep_assert_held(&bq->bq_lock);
+
if (unlikely(!bq->count))
return;
@@ -749,11 +754,15 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
}
/* Runs under RCU-read-side, plus in softirq under NAPI protection.
- * Thus, safe percpu variable access.
+ * Thus, safe percpu variable access. PREEMPT_RT relies on
+ * local_lock_nested_bh() to serialise access to the per-CPU bq.
*/
static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
- struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
+ struct xdp_bulk_queue *bq;
+
+ local_lock_nested_bh(&rcpu->bulkq->bq_lock);
+ bq = this_cpu_ptr(rcpu->bulkq);
if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
bq_flush_to_queue(bq);
@@ -774,6 +783,8 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
list_add(&bq->flush_node, flush_list);
}
+
+ local_unlock_nested_bh(&rcpu->bulkq->bq_lock);
}
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
@@ -810,7 +821,9 @@ void __cpu_map_flush(struct list_head *flush_list)
struct xdp_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+ local_lock_nested_bh(&bq->obj->bulkq->bq_lock);
bq_flush_to_queue(bq);
+ local_unlock_nested_bh(&bq->obj->bulkq->bq_lock);
/* If already running, costs spin_lock_irqsave + smb_mb */
wake_up_process(bq->obj->kthread);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 2625601de76e..3d619d01088e 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -45,6 +45,7 @@
* types of devmap; only the lookup and insertion is different.
*/
#include <linux/bpf.h>
+#include <linux/local_lock.h>
#include <net/xdp.h>
#include <linux/filter.h>
#include <trace/events/xdp.h>
@@ -60,6 +61,7 @@ struct xdp_dev_bulk_queue {
struct net_device *dev_rx;
struct bpf_prog *xdp_prog;
unsigned int count;
+ local_lock_t bq_lock;
};
struct bpf_dtab_netdev {
@@ -381,6 +383,8 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
int to_send = cnt;
int i;
+ lockdep_assert_held(&bq->bq_lock);
+
if (unlikely(!cnt))
return;
@@ -425,10 +429,12 @@ void __dev_flush(struct list_head *flush_list)
struct xdp_dev_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+ local_lock_nested_bh(&bq->dev->xdp_bulkq->bq_lock);
bq_xmit_all(bq, XDP_XMIT_FLUSH);
bq->dev_rx = NULL;
bq->xdp_prog = NULL;
__list_del_clearprev(&bq->flush_node);
+ local_unlock_nested_bh(&bq->dev->xdp_bulkq->bq_lock);
}
}
@@ -451,12 +457,16 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
/* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu
* variable access, and map elements stick around. See comment above
- * xdp_do_flush() in filter.c.
+ * xdp_do_flush() in filter.c. PREEMPT_RT relies on local_lock_nested_bh()
+ * to serialise access to the per-CPU bq.
*/
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
- struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
+ struct xdp_dev_bulk_queue *bq;
+
+ local_lock_nested_bh(&dev->xdp_bulkq->bq_lock);
+ bq = this_cpu_ptr(dev->xdp_bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
bq_xmit_all(bq, 0);
@@ -477,6 +487,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
}
bq->q[bq->count++] = xdpf;
+
+ local_unlock_nested_bh(&dev->xdp_bulkq->bq_lock);
}
static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
@@ -588,18 +600,22 @@ static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifin
}
/* Get ifindex of each upper device. 'indexes' must be able to hold at
- * least MAX_NEST_DEV elements.
- * Returns the number of ifindexes added.
+ * least 'max' elements.
+ * Returns the number of ifindexes added, or -EOVERFLOW if there are too
+ * many upper devices.
*/
-static int get_upper_ifindexes(struct net_device *dev, int *indexes)
+static int get_upper_ifindexes(struct net_device *dev, int *indexes, int max)
{
struct net_device *upper;
struct list_head *iter;
int n = 0;
netdev_for_each_upper_dev_rcu(dev, upper, iter) {
+ if (n >= max)
+ return -EOVERFLOW;
indexes[n++] = upper->ifindex;
}
+
return n;
}
@@ -615,7 +631,11 @@ int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
int err;
if (exclude_ingress) {
- num_excluded = get_upper_ifindexes(dev_rx, excluded_devices);
+ num_excluded = get_upper_ifindexes(dev_rx, excluded_devices,
+ ARRAY_SIZE(excluded_devices) - 1);
+ if (num_excluded < 0)
+ return num_excluded;
+
excluded_devices[num_excluded++] = dev_rx->ifindex;
}
@@ -733,7 +753,11 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
int err;
if (exclude_ingress) {
- num_excluded = get_upper_ifindexes(dev, excluded_devices);
+ num_excluded = get_upper_ifindexes(dev, excluded_devices,
+ ARRAY_SIZE(excluded_devices) - 1);
+ if (num_excluded < 0)
+ return num_excluded;
+
excluded_devices[num_excluded++] = dev->ifindex;
}
@@ -1115,8 +1139,13 @@ static int dev_map_notification(struct notifier_block *notifier,
if (!netdev->xdp_bulkq)
return NOTIFY_BAD;
- for_each_possible_cpu(cpu)
- per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
+ for_each_possible_cpu(cpu) {
+ struct xdp_dev_bulk_queue *bq;
+
+ bq = per_cpu_ptr(netdev->xdp_bulkq, cpu);
+ bq->dev = netdev;
+ local_lock_init(&bq->bq_lock);
+ }
break;
case NETDEV_UNREGISTER:
/* This rcu_read_lock/unlock pair is needed because
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3b9d297a53be..bc6bc8bb871d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -125,6 +125,11 @@ struct htab_elem {
char key[] __aligned(8);
};
+struct htab_btf_record {
+ struct btf_record *record;
+ u32 key_size;
+};
+
static inline bool htab_is_prealloc(const struct bpf_htab *htab)
{
return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
@@ -457,6 +462,83 @@ static int htab_map_alloc_check(union bpf_attr *attr)
return 0;
}
+static void htab_mem_dtor(void *obj, void *ctx)
+{
+ struct htab_btf_record *hrec = ctx;
+ struct htab_elem *elem = obj;
+ void *map_value;
+
+ if (IS_ERR_OR_NULL(hrec->record))
+ return;
+
+ map_value = htab_elem_value(elem, hrec->key_size);
+ bpf_obj_free_fields(hrec->record, map_value);
+}
+
+static void htab_pcpu_mem_dtor(void *obj, void *ctx)
+{
+ void __percpu *pptr = *(void __percpu **)obj;
+ struct htab_btf_record *hrec = ctx;
+ int cpu;
+
+ if (IS_ERR_OR_NULL(hrec->record))
+ return;
+
+ for_each_possible_cpu(cpu)
+ bpf_obj_free_fields(hrec->record, per_cpu_ptr(pptr, cpu));
+}
+
+static void htab_dtor_ctx_free(void *ctx)
+{
+ struct htab_btf_record *hrec = ctx;
+
+ btf_record_free(hrec->record);
+ kfree(ctx);
+}
+
+static int htab_set_dtor(struct bpf_htab *htab, void (*dtor)(void *, void *))
+{
+ u32 key_size = htab->map.key_size;
+ struct bpf_mem_alloc *ma;
+ struct htab_btf_record *hrec;
+ int err;
+
+ /* No need for dtors. */
+ if (IS_ERR_OR_NULL(htab->map.record))
+ return 0;
+
+ hrec = kzalloc(sizeof(*hrec), GFP_KERNEL);
+ if (!hrec)
+ return -ENOMEM;
+ hrec->key_size = key_size;
+ hrec->record = btf_record_dup(htab->map.record);
+ if (IS_ERR(hrec->record)) {
+ err = PTR_ERR(hrec->record);
+ kfree(hrec);
+ return err;
+ }
+ ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma;
+ bpf_mem_alloc_set_dtor(ma, dtor, htab_dtor_ctx_free, hrec);
+ return 0;
+}
+
+static int htab_map_check_btf(struct bpf_map *map, const struct btf *btf,
+ const struct btf_type *key_type, const struct btf_type *value_type)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ if (htab_is_prealloc(htab))
+ return 0;
+ /*
+ * We must set the dtor using this callback, as map's BTF record is not
+ * populated in htab_map_alloc(), so it will always appear as NULL.
+ */
+ if (htab_is_percpu(htab))
+ return htab_set_dtor(htab, htab_pcpu_mem_dtor);
+ else
+ return htab_set_dtor(htab, htab_mem_dtor);
+}
+
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -2281,6 +2363,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_seq_show_elem = htab_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_check_btf = htab_map_check_btf,
.map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab),
.map_btf_id = &htab_map_btf_ids[0],
@@ -2303,6 +2386,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_seq_show_elem = htab_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_check_btf = htab_map_check_btf,
.map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_lru),
.map_btf_id = &htab_map_btf_ids[0],
@@ -2482,6 +2566,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_check_btf = htab_map_check_btf,
.map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_percpu),
.map_btf_id = &htab_map_btf_ids[0],
@@ -2502,6 +2587,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_check_btf = htab_map_check_btf,
.map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_lru_percpu),
.map_btf_id = &htab_map_btf_ids[0],
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 1ccbf28b2ad9..8fca0c64f7b1 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -364,7 +364,7 @@ static long cgroup_storage_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
}
-static int cgroup_storage_check_btf(const struct bpf_map *map,
+static int cgroup_storage_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 1adeb4d3b8cf..0f57608b385d 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -751,7 +751,7 @@ free_stack:
return err;
}
-static int trie_check_btf(const struct bpf_map *map,
+static int trie_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index bd45dda9dc35..682a9f34214b 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -102,6 +102,8 @@ struct bpf_mem_cache {
int percpu_size;
bool draining;
struct bpf_mem_cache *tgt;
+ void (*dtor)(void *obj, void *ctx);
+ void *dtor_ctx;
/* list of objects to be freed after RCU GP */
struct llist_head free_by_rcu;
@@ -260,12 +262,14 @@ static void free_one(void *obj, bool percpu)
kfree(obj);
}
-static int free_all(struct llist_node *llnode, bool percpu)
+static int free_all(struct bpf_mem_cache *c, struct llist_node *llnode, bool percpu)
{
struct llist_node *pos, *t;
int cnt = 0;
llist_for_each_safe(pos, t, llnode) {
+ if (c->dtor)
+ c->dtor((void *)pos + LLIST_NODE_SZ, c->dtor_ctx);
free_one(pos, percpu);
cnt++;
}
@@ -276,7 +280,7 @@ static void __free_rcu(struct rcu_head *head)
{
struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);
- free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
+ free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
atomic_set(&c->call_rcu_ttrace_in_progress, 0);
}
@@ -308,7 +312,7 @@ static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
if (unlikely(READ_ONCE(c->draining))) {
llnode = llist_del_all(&c->free_by_rcu_ttrace);
- free_all(llnode, !!c->percpu_size);
+ free_all(c, llnode, !!c->percpu_size);
}
return;
}
@@ -417,7 +421,7 @@ static void check_free_by_rcu(struct bpf_mem_cache *c)
dec_active(c, &flags);
if (unlikely(READ_ONCE(c->draining))) {
- free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
+ free_all(c, llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
atomic_set(&c->call_rcu_in_progress, 0);
} else {
call_rcu_hurry(&c->rcu, __free_by_rcu);
@@ -635,13 +639,13 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
* Except for waiting_for_gp_ttrace list, there are no concurrent operations
* on these lists, so it is safe to use __llist_del_all().
*/
- free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
- free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
- free_all(__llist_del_all(&c->free_llist), percpu);
- free_all(__llist_del_all(&c->free_llist_extra), percpu);
- free_all(__llist_del_all(&c->free_by_rcu), percpu);
- free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
- free_all(llist_del_all(&c->waiting_for_gp), percpu);
+ free_all(c, llist_del_all(&c->free_by_rcu_ttrace), percpu);
+ free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), percpu);
+ free_all(c, __llist_del_all(&c->free_llist), percpu);
+ free_all(c, __llist_del_all(&c->free_llist_extra), percpu);
+ free_all(c, __llist_del_all(&c->free_by_rcu), percpu);
+ free_all(c, __llist_del_all(&c->free_llist_extra_rcu), percpu);
+ free_all(c, llist_del_all(&c->waiting_for_gp), percpu);
}
static void check_mem_cache(struct bpf_mem_cache *c)
@@ -680,6 +684,9 @@ static void check_leaked_objs(struct bpf_mem_alloc *ma)
static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
{
+ /* We can free dtor ctx only once all callbacks are done using it. */
+ if (ma->dtor_ctx_free)
+ ma->dtor_ctx_free(ma->dtor_ctx);
check_leaked_objs(ma);
free_percpu(ma->cache);
free_percpu(ma->caches);
@@ -1014,3 +1021,32 @@ int bpf_mem_alloc_check_size(bool percpu, size_t size)
return 0;
}
+
+void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma, void (*dtor)(void *obj, void *ctx),
+ void (*dtor_ctx_free)(void *ctx), void *ctx)
+{
+ struct bpf_mem_caches *cc;
+ struct bpf_mem_cache *c;
+ int cpu, i;
+
+ ma->dtor_ctx_free = dtor_ctx_free;
+ ma->dtor_ctx = ctx;
+
+ if (ma->cache) {
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(ma->cache, cpu);
+ c->dtor = dtor;
+ c->dtor_ctx = ctx;
+ }
+ }
+ if (ma->caches) {
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(ma->caches, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ c->dtor = dtor;
+ c->dtor_ctx = ctx;
+ }
+ }
+ }
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0378e83b4099..274039e36465 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1234,7 +1234,7 @@ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
}
EXPORT_SYMBOL_GPL(bpf_obj_name_cpy);
-int map_check_no_btf(const struct bpf_map *map,
+int map_check_no_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 26fbfbb01700..4abc359b3db0 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -269,3 +269,59 @@ struct tnum tnum_bswap64(struct tnum a)
{
return TNUM(swab64(a.value), swab64(a.mask));
}
+
+/* Given tnum t, and a number z such that tmin <= z < tmax, where tmin
+ * is the smallest member of t (= t.value) and tmax is the largest
+ * member of t (= t.value | t.mask), returns the smallest member of t
+ * larger than z.
+ *
+ * For example,
+ * t = x11100x0
+ * z = 11110001 (241)
+ * result = 11110010 (242)
+ *
+ * Note: if this function is called with z >= tmax, it just returns
+ * early with tmax; if this function is called with z < tmin, the
+ * algorithm already returns tmin.
+ */
+u64 tnum_step(struct tnum t, u64 z)
+{
+ u64 tmax, j, p, q, r, s, v, u, w, res;
+ u8 k;
+
+ tmax = t.value | t.mask;
+
+ /* if z >= largest member of t, return largest member of t */
+ if (z >= tmax)
+ return tmax;
+
+ /* if z < smallest member of t, return smallest member of t */
+ if (z < t.value)
+ return t.value;
+
+ /* keep t's known bits, and match all unknown bits to z */
+ j = t.value | (z & t.mask);
+
+ if (j > z) {
+ p = ~z & t.value & ~t.mask;
+ k = fls64(p); /* k is the most-significant 0-to-1 flip */
+ q = U64_MAX << k;
+ r = q & z; /* positions > k matched to z */
+ s = ~q & t.value; /* positions <= k matched to t.value */
+ v = r | s;
+ res = v;
+ } else {
+ p = z & ~t.value & ~t.mask;
+ k = fls64(p); /* k is the most-significant 1-to-0 flip */
+ q = U64_MAX << k;
+ r = q & t.mask & z; /* unknown positions > k, matched to z */
+ s = q & ~t.mask; /* known positions > k, set to 1 */
+ v = r | s;
+ /* add 1 to unknown positions > k to make value greater than z */
+ u = v + (1ULL << k);
+ /* extract bits in unknown positions > k from u, rest from t.value */
+ w = (u & t.mask) | t.value;
+ res = w;
+ }
+ return res;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bb12ba020649..401d6c4960ec 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2379,6 +2379,9 @@ static void __update_reg32_bounds(struct bpf_reg_state *reg)
static void __update_reg64_bounds(struct bpf_reg_state *reg)
{
+ u64 tnum_next, tmax;
+ bool umin_in_tnum;
+
/* min signed is max(sign bit) | min(other bits) */
reg->smin_value = max_t(s64, reg->smin_value,
reg->var_off.value | (reg->var_off.mask & S64_MIN));
@@ -2388,6 +2391,33 @@ static void __update_reg64_bounds(struct bpf_reg_state *reg)
reg->umin_value = max(reg->umin_value, reg->var_off.value);
reg->umax_value = min(reg->umax_value,
reg->var_off.value | reg->var_off.mask);
+
+ /* Check if u64 and tnum overlap in a single value */
+ tnum_next = tnum_step(reg->var_off, reg->umin_value);
+ umin_in_tnum = (reg->umin_value & ~reg->var_off.mask) == reg->var_off.value;
+ tmax = reg->var_off.value | reg->var_off.mask;
+ if (umin_in_tnum && tnum_next > reg->umax_value) {
+ /* The u64 range and the tnum only overlap in umin.
+ * u64: ---[xxxxxx]-----
+ * tnum: --xx----------x-
+ */
+ ___mark_reg_known(reg, reg->umin_value);
+ } else if (!umin_in_tnum && tnum_next == tmax) {
+ /* The u64 range and the tnum only overlap in the maximum value
+ * represented by the tnum, called tmax.
+ * u64: ---[xxxxxx]-----
+ * tnum: xx-----x--------
+ */
+ ___mark_reg_known(reg, tmax);
+ } else if (!umin_in_tnum && tnum_next <= reg->umax_value &&
+ tnum_step(reg->var_off, tnum_next) > reg->umax_value) {
+ /* The u64 range and the tnum only overlap in between umin
+ * (excluded) and umax.
+ * u64: ---[xxxxxx]-----
+ * tnum: xx----x-------x-
+ */
+ ___mark_reg_known(reg, tnum_next);
+ }
}
static void __update_reg_bounds(struct bpf_reg_state *reg)
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index 774702591d26..307c97ac5fa9 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -29,7 +29,6 @@ CONFIG_SECTION_MISMATCH_WARN_ONLY=y
# CONFIG_UBSAN_ALIGNMENT is not set
# CONFIG_UBSAN_DIV_ZERO is not set
# CONFIG_UBSAN_TRAP is not set
-# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set
CONFIG_DEBUG_FS=y
CONFIG_DEBUG_FS_ALLOW_ALL=y
CONFIG_DEBUG_IRQFLAGS=y
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index f476c63b668c..e89f175e9c2d 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -85,7 +85,7 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
if (is_swiotlb_force_bounce(dev)) {
if (attrs & DMA_ATTR_MMIO)
- goto err_overflow;
+ return DMA_MAPPING_ERROR;
return swiotlb_map(dev, phys, size, dir, attrs);
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ac70d68217b6..1f5699b339ec 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4138,7 +4138,8 @@ static int merge_sched_in(struct perf_event *event, void *data)
if (*perf_event_fasync(event))
event->pending_kill = POLL_ERR;
- perf_event_wakeup(event);
+ event->pending_wakeup = 1;
+ irq_work_queue(&event->pending_irq);
} else {
struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu);
@@ -7464,28 +7465,28 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
ret = perf_mmap_aux(vma, event, nr_pages);
if (ret)
return ret;
- }
- /*
- * Since pinned accounting is per vm we cannot allow fork() to copy our
- * vma.
- */
- vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
- vma->vm_ops = &perf_mmap_vmops;
+ /*
+ * Since pinned accounting is per vm we cannot allow fork() to copy our
+ * vma.
+ */
+ vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
+ vma->vm_ops = &perf_mmap_vmops;
- mapped = get_mapped(event, event_mapped);
- if (mapped)
- mapped(event, vma->vm_mm);
+ mapped = get_mapped(event, event_mapped);
+ if (mapped)
+ mapped(event, vma->vm_mm);
- /*
- * Try to map it into the page table. On fail, invoke
- * perf_mmap_close() to undo the above, as the callsite expects
- * full cleanup in this case and therefore does not invoke
- * vmops::close().
- */
- ret = map_range(event->rb, vma);
- if (ret)
- perf_mmap_close(vma);
+ /*
+ * Try to map it into the page table. On fail, invoke
+ * perf_mmap_close() to undo the above, as the callsite expects
+ * full cleanup in this case and therefore does not invoke
+ * vmops::close().
+ */
+ ret = map_range(event->rb, vma);
+ if (ret)
+ perf_mmap_close(vma);
+ }
return ret;
}
@@ -10776,6 +10777,13 @@ int perf_event_overflow(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
+ /*
+ * Entry point from hardware PMI, interrupts should be disabled here.
+ * This serializes us against perf_event_remove_from_context() in
+ * things like perf_event_release_kernel().
+ */
+ lockdep_assert_irqs_disabled();
+
return __perf_event_overflow(event, 1, data, regs);
}
@@ -10852,6 +10860,19 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
{
struct hw_perf_event *hwc = &event->hw;
+ /*
+ * This is:
+ * - software preempt
+ * - tracepoint preempt
+ * - tp_target_task irq (ctx->lock)
+ * - uprobes preempt/irq
+ * - kprobes preempt/irq
+ * - hw_breakpoint irq
+ *
+ * Any of these are sufficient to hold off RCU and thus ensure @event
+ * exists.
+ */
+ lockdep_assert_preemption_disabled();
local64_add(nr, &event->count);
if (!regs)
@@ -10860,6 +10881,16 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
if (!is_sampling_event(event))
return;
+ /*
+ * Serialize against event_function_call() IPIs like normal overflow
+ * event handling. Specifically, must not allow
+ * perf_event_release_kernel() -> perf_remove_from_context() to make
+ * progress and 'release' the event from under us.
+ */
+ guard(irqsave)();
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
+
if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
data->period = nr;
return perf_swevent_overflow(event, 1, data, regs);
@@ -11358,6 +11389,11 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct perf_sample_data data;
struct perf_event *event;
+ /*
+ * Per being a tracepoint, this runs with preemption disabled.
+ */
+ lockdep_assert_preemption_disabled();
+
struct perf_raw_record raw = {
.frag = {
.size = entry_size,
@@ -11690,6 +11726,11 @@ void perf_bp_event(struct perf_event *bp, void *data)
struct perf_sample_data sample;
struct pt_regs *regs = data;
+ /*
+ * Exception context, will have interrupts disabled.
+ */
+ lockdep_assert_irqs_disabled();
+
perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
if (!bp->hw.state && !perf_exclude_event(bp, regs))
@@ -12154,7 +12195,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
if (regs && !perf_exclude_event(event, regs)) {
if (!(event->attr.exclude_idle && is_idle_task(current)))
- if (__perf_event_overflow(event, 1, &data, regs))
+ if (perf_event_overflow(event, &data, regs))
ret = HRTIMER_NORESTART;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index e832da9d15a4..65113a304518 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -3085,7 +3085,7 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
return 0;
/* don't need lock here; in the worst case we'll do useless copy */
- if (fs->users == 1)
+ if (!(unshare_flags & CLONE_NEWNS) && fs->users == 1)
return 0;
*new_fsp = copy_fs_struct(fs);
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 79e655ea4ca1..ae758150ccb9 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -168,7 +168,7 @@ static bool __report_matches(const struct expect_report *r)
if (!report_available())
return false;
- expect = kmalloc_obj(observed.lines);
+ expect = (typeof(expect))kmalloc_obj(observed.lines);
if (WARN_ON(!expect))
return false;
diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
index 8c79058253e1..5acee4174bf0 100644
--- a/kernel/liveupdate/luo_file.c
+++ b/kernel/liveupdate/luo_file.c
@@ -134,9 +134,12 @@ static LIST_HEAD(luo_file_handler_list);
* state that is not preserved. Set by the handler's .preserve()
* callback, and must be freed in the handler's .unpreserve()
* callback.
- * @retrieved: A flag indicating whether a user/kernel in the new kernel has
+ * @retrieve_status: Status code indicating whether a user/kernel in the new kernel has
* successfully called retrieve() on this file. This prevents
- * multiple retrieval attempts.
+ * multiple retrieval attempts. A value of 0 means a retrieve()
+ * has not been attempted, a positive value means the retrieve()
+ * was successful, and a negative value means the retrieve()
+ * failed, and the value is the error code of the call.
* @mutex: A mutex that protects the fields of this specific instance
* (e.g., @retrieved, @file), ensuring that operations like
* retrieving or finishing a file are atomic.
@@ -161,7 +164,7 @@ struct luo_file {
struct file *file;
u64 serialized_data;
void *private_data;
- bool retrieved;
+ int retrieve_status;
struct mutex mutex;
struct list_head list;
u64 token;
@@ -298,7 +301,6 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
luo_file->file = file;
luo_file->fh = fh;
luo_file->token = token;
- luo_file->retrieved = false;
mutex_init(&luo_file->mutex);
args.handler = fh;
@@ -577,7 +579,12 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
return -ENOENT;
guard(mutex)(&luo_file->mutex);
- if (luo_file->retrieved) {
+ if (luo_file->retrieve_status < 0) {
+ /* Retrieve was attempted and it failed. Return the error code. */
+ return luo_file->retrieve_status;
+ }
+
+ if (luo_file->retrieve_status > 0) {
/*
* Someone is asking for this file again, so get a reference
* for them.
@@ -590,16 +597,19 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
args.handler = luo_file->fh;
args.serialized_data = luo_file->serialized_data;
err = luo_file->fh->ops->retrieve(&args);
- if (!err) {
- luo_file->file = args.file;
-
- /* Get reference so we can keep this file in LUO until finish */
- get_file(luo_file->file);
- *filep = luo_file->file;
- luo_file->retrieved = true;
+ if (err) {
+ /* Keep the error code for later use. */
+ luo_file->retrieve_status = err;
+ return err;
}
- return err;
+ luo_file->file = args.file;
+ /* Get reference so we can keep this file in LUO until finish */
+ get_file(luo_file->file);
+ *filep = luo_file->file;
+ luo_file->retrieve_status = 1;
+
+ return 0;
}
static int luo_file_can_finish_one(struct luo_file_set *file_set,
@@ -615,7 +625,7 @@ static int luo_file_can_finish_one(struct luo_file_set *file_set,
args.handler = luo_file->fh;
args.file = luo_file->file;
args.serialized_data = luo_file->serialized_data;
- args.retrieved = luo_file->retrieved;
+ args.retrieve_status = luo_file->retrieve_status;
can_finish = luo_file->fh->ops->can_finish(&args);
}
@@ -632,7 +642,7 @@ static void luo_file_finish_one(struct luo_file_set *file_set,
args.handler = luo_file->fh;
args.file = luo_file->file;
args.serialized_data = luo_file->serialized_data;
- args.retrieved = luo_file->retrieved;
+ args.retrieve_status = luo_file->retrieve_status;
luo_file->fh->ops->finish(&args);
luo_flb_file_finish(luo_file->fh);
@@ -788,7 +798,6 @@ int luo_file_deserialize(struct luo_file_set *file_set,
luo_file->file = NULL;
luo_file->serialized_data = file_ser[i].data;
luo_file->token = file_ser[i].token;
- luo_file->retrieved = false;
mutex_init(&luo_file->mutex);
list_add_tail(&luo_file->list, &file_set->files_list);
}
diff --git a/kernel/rseq.c b/kernel/rseq.c
index b0973d19f366..38d3ef540760 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -80,6 +80,7 @@
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/types.h>
+#include <linux/rseq.h>
#include <asm/ptrace.h>
#define CREATE_TRACE_POINTS
@@ -449,13 +450,14 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
* auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
* size, the required alignment is the original struct rseq alignment.
*
- * In order to be valid, rseq_len is either the original rseq size, or
- * large enough to contain all supported fields, as communicated to
+	 * The rseq_len is required to be greater than or equal to the original
+	 * rseq size. In order to be valid, rseq_len is either the original rseq
+	 * size, or large enough to contain all supported fields, as communicated to
* user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
*/
if (rseq_len < ORIG_RSEQ_SIZE ||
(rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
- (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
+ (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) ||
rseq_len < offsetof(struct rseq, end))))
return -EINVAL;
if (!access_ok(rseq, rseq_len))
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 759777694c78..b7f77c165a6e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6830,6 +6830,7 @@ static void __sched notrace __schedule(int sched_mode)
/* SCX must consult the BPF scheduler to tell if rq is empty */
if (!rq->nr_running && !scx_enabled()) {
next = prev;
+ rq->next_class = &idle_sched_class;
goto picked;
}
} else if (!preempt && prev_state) {
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 62b1f3ac5630..06cc0a4aec66 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2460,7 +2460,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
/* see kick_cpus_irq_workfn() */
smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
- rq->next_class = &ext_sched_class;
+ rq_modified_begin(rq, &ext_sched_class);
rq_unpin_lock(rq, rf);
balance_one(rq, prev);
@@ -2475,7 +2475,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
* If @force_scx is true, always try to pick a SCHED_EXT task,
* regardless of any higher-priority sched classes activity.
*/
- if (!force_scx && sched_class_above(rq->next_class, &ext_sched_class))
+ if (!force_scx && rq_modified_above(rq, &ext_sched_class))
return RETRY_TASK;
keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index eea99ec01a3f..bf948db905ed 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -589,6 +589,21 @@ static inline bool entity_before(const struct sched_entity *a,
return vruntime_cmp(a->deadline, "<", b->deadline);
}
+/*
+ * Per avg_vruntime() below, cfs_rq::zero_vruntime is only slightly stale
+ * and this value should be no more than two lag bounds. Which puts it in the
+ * general order of:
+ *
+ * (slice + TICK_NSEC) << NICE_0_LOAD_SHIFT
+ *
+ * which is around 44 bits in size (on 64bit); that is 20 for
+ * NICE_0_LOAD_SHIFT, another 20 for NSEC_PER_MSEC and then a handful for
+ * however many msec the actual slice+tick ends up being.
+ *
+ * (disregarding the actual divide-by-weight part makes for the worst case
+ * weight of 2, which nicely cancels vs the fuzz in zero_vruntime not actually
+ * being the zero-lag point).
+ */
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime);
@@ -676,41 +691,65 @@ sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
static inline
-void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+void update_zero_vruntime(struct cfs_rq *cfs_rq, s64 delta)
{
/*
- * v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight
+ * v' = v + d ==> sum_w_vruntime' = sum_w_vruntime - d*sum_weight
*/
cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta;
+ cfs_rq->zero_vruntime += delta;
}
/*
- * Specifically: avg_runtime() + 0 must result in entity_eligible() := true
+ * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true
* For this to be so, the result of this function must have a left bias.
+ *
+ * Called in:
+ * - place_entity() -- before enqueue
+ * - update_entity_lag() -- before dequeue
+ * - entity_tick()
+ *
+ * This means it is one entry 'behind' but that puts it close enough to where
+ * the bound on entity_key() is at most two lag bounds.
*/
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- s64 avg = cfs_rq->sum_w_vruntime;
- long load = cfs_rq->sum_weight;
+ long weight = cfs_rq->sum_weight;
+ s64 delta = 0;
- if (curr && curr->on_rq) {
- unsigned long weight = scale_load_down(curr->load.weight);
+ if (curr && !curr->on_rq)
+ curr = NULL;
- avg += entity_key(cfs_rq, curr) * weight;
- load += weight;
- }
+ if (weight) {
+ s64 runtime = cfs_rq->sum_w_vruntime;
+
+ if (curr) {
+ unsigned long w = scale_load_down(curr->load.weight);
+
+ runtime += entity_key(cfs_rq, curr) * w;
+ weight += w;
+ }
- if (load) {
/* sign flips effective floor / ceiling */
- if (avg < 0)
- avg -= (load - 1);
- avg = div_s64(avg, load);
+ if (runtime < 0)
+ runtime -= (weight - 1);
+
+ delta = div_s64(runtime, weight);
+ } else if (curr) {
+ /*
+ * When there is but one element, it is the average.
+ */
+ delta = curr->vruntime - cfs_rq->zero_vruntime;
}
- return cfs_rq->zero_vruntime + avg;
+ update_zero_vruntime(cfs_rq, delta);
+
+ return cfs_rq->zero_vruntime;
}
+static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq);
+
/*
* lag_i = S - s_i = w_i * (V - v_i)
*
@@ -724,17 +763,16 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
* EEVDF gives the following limit for a steady state system:
*
* -r_max < lag < max(r_max, q)
- *
- * XXX could add max_slice to the augmented data to track this.
*/
static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC;
s64 vlag, limit;
WARN_ON_ONCE(!se->on_rq);
vlag = avg_vruntime(cfs_rq) - se->vruntime;
- limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+ limit = calc_delta_fair(max_slice, se);
se->vlag = clamp(vlag, -limit, limit);
}
@@ -777,16 +815,6 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
return vruntime_eligible(cfs_rq, se->vruntime);
}
-static void update_zero_vruntime(struct cfs_rq *cfs_rq)
-{
- u64 vruntime = avg_vruntime(cfs_rq);
- s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime);
-
- sum_w_vruntime_update(cfs_rq, delta);
-
- cfs_rq->zero_vruntime = vruntime;
-}
-
static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
{
struct sched_entity *root = __pick_root_entity(cfs_rq);
@@ -802,6 +830,21 @@ static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
return min_slice;
}
+static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *root = __pick_root_entity(cfs_rq);
+ struct sched_entity *curr = cfs_rq->curr;
+ u64 max_slice = 0ULL;
+
+ if (curr && curr->on_rq)
+ max_slice = curr->slice;
+
+ if (root)
+ max_slice = max(max_slice, root->max_slice);
+
+ return max_slice;
+}
+
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
{
return entity_before(__node_2_se(a), __node_2_se(b));
@@ -826,6 +869,15 @@ static inline void __min_slice_update(struct sched_entity *se, struct rb_node *n
}
}
+static inline void __max_slice_update(struct sched_entity *se, struct rb_node *node)
+{
+ if (node) {
+ struct sched_entity *rse = __node_2_se(node);
+ if (rse->max_slice > se->max_slice)
+ se->max_slice = rse->max_slice;
+ }
+}
+
/*
* se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
*/
@@ -833,6 +885,7 @@ static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
{
u64 old_min_vruntime = se->min_vruntime;
u64 old_min_slice = se->min_slice;
+ u64 old_max_slice = se->max_slice;
struct rb_node *node = &se->run_node;
se->min_vruntime = se->vruntime;
@@ -843,8 +896,13 @@ static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
__min_slice_update(se, node->rb_right);
__min_slice_update(se, node->rb_left);
+ se->max_slice = se->slice;
+ __max_slice_update(se, node->rb_right);
+ __max_slice_update(se, node->rb_left);
+
return se->min_vruntime == old_min_vruntime &&
- se->min_slice == old_min_slice;
+ se->min_slice == old_min_slice &&
+ se->max_slice == old_max_slice;
}
RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
@@ -856,7 +914,6 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
sum_w_vruntime_add(cfs_rq, se);
- update_zero_vruntime(cfs_rq);
se->min_vruntime = se->vruntime;
se->min_slice = se->slice;
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
@@ -868,7 +925,6 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
&min_vruntime_cb);
sum_w_vruntime_sub(cfs_rq, se);
- update_zero_vruntime(cfs_rq);
}
struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
@@ -3790,6 +3846,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
bool curr = cfs_rq->curr == se;
+ bool rel_vprot = false;
+ u64 vprot;
if (se->on_rq) {
/* commit outstanding execution time */
@@ -3797,6 +3855,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
update_entity_lag(cfs_rq, se);
se->deadline -= se->vruntime;
se->rel_deadline = 1;
+ if (curr && protect_slice(se)) {
+ vprot = se->vprot - se->vruntime;
+ rel_vprot = true;
+ }
+
cfs_rq->nr_queued--;
if (!curr)
__dequeue_entity(cfs_rq, se);
@@ -3812,6 +3875,9 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
if (se->rel_deadline)
se->deadline = div_s64(se->deadline * se->load.weight, weight);
+ if (rel_vprot)
+ vprot = div_s64(vprot * se->load.weight, weight);
+
update_load_set(&se->load, weight);
do {
@@ -3823,6 +3889,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
place_entity(cfs_rq, se, 0);
+ if (rel_vprot)
+ se->vprot = se->vruntime + vprot;
update_load_add(&cfs_rq->load, se->load.weight);
if (!curr)
__enqueue_entity(cfs_rq, se);
@@ -5420,7 +5488,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
}
static void
-set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, bool first)
{
clear_buddies(cfs_rq, se);
@@ -5435,7 +5503,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
- set_protect_slice(cfs_rq, se);
+ if (first)
+ set_protect_slice(cfs_rq, se);
}
update_stats_curr_start(cfs_rq, se);
@@ -5524,6 +5593,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
update_load_avg(cfs_rq, curr, UPDATE_TG);
update_cfs_group(curr);
+ /*
+ * Pulls along cfs_rq::zero_vruntime.
+ */
+ avg_vruntime(cfs_rq);
+
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
@@ -8948,13 +9022,13 @@ again:
pse = parent_entity(pse);
}
if (se_depth >= pse_depth) {
- set_next_entity(cfs_rq_of(se), se);
+ set_next_entity(cfs_rq_of(se), se, true);
se = parent_entity(se);
}
}
put_prev_entity(cfs_rq, pse);
- set_next_entity(cfs_rq, se);
+ set_next_entity(cfs_rq, se, true);
__set_next_task_fair(rq, p, true);
}
@@ -12908,7 +12982,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
t0 = sched_clock_cpu(this_cpu);
__sched_balance_update_blocked_averages(this_rq);
- this_rq->next_class = &fair_sched_class;
+ rq_modified_begin(this_rq, &fair_sched_class);
raw_spin_rq_unlock(this_rq);
for_each_domain(this_cpu, sd) {
@@ -12975,7 +13049,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
pulled_task = 1;
/* If a higher prio class was modified, restart the pick */
- if (sched_class_above(this_rq->next_class, &fair_sched_class))
+ if (rq_modified_above(this_rq, &fair_sched_class))
pulled_task = -1;
out:
@@ -13568,7 +13642,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- set_next_entity(cfs_rq, se);
+ set_next_entity(cfs_rq, se, first);
/* ensure bandwidth has been allocated on our new cfs_rq */
account_cfs_rq_runtime(cfs_rq, 0);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b82fb70a9d54..43bbf0693cca 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2748,6 +2748,17 @@ static inline const struct sched_class *next_active_class(const struct sched_cla
#define sched_class_above(_a, _b) ((_a) < (_b))
+static inline void rq_modified_begin(struct rq *rq, const struct sched_class *class)
+{
+ if (sched_class_above(rq->next_class, class))
+ rq->next_class = class;
+}
+
+static inline bool rq_modified_above(struct rq *rq, const struct sched_class *class)
+{
+ return sched_class_above(rq->next_class, class);
+}
+
static inline bool sched_stop_runnable(struct rq *rq)
{
return rq->stop && task_on_rq_queued(rq->stop);
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 0ba8e3c50d62..36fd2313ae7e 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -365,20 +365,16 @@ SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp)
}
#endif
+#if HZ > MSEC_PER_SEC || (MSEC_PER_SEC % HZ)
/**
* jiffies_to_msecs - Convert jiffies to milliseconds
* @j: jiffies value
*
- * Avoid unnecessary multiplications/divisions in the
- * two most common HZ cases.
- *
* Return: milliseconds value
*/
unsigned int jiffies_to_msecs(const unsigned long j)
{
-#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
- return (MSEC_PER_SEC / HZ) * j;
-#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+#if HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
#else
# if BITS_PER_LONG == 32
@@ -390,7 +386,9 @@ unsigned int jiffies_to_msecs(const unsigned long j)
#endif
}
EXPORT_SYMBOL(jiffies_to_msecs);
+#endif
+#if (USEC_PER_SEC % HZ)
/**
* jiffies_to_usecs - Convert jiffies to microseconds
* @j: jiffies value
@@ -405,17 +403,14 @@ unsigned int jiffies_to_usecs(const unsigned long j)
*/
BUILD_BUG_ON(HZ > USEC_PER_SEC);
-#if !(USEC_PER_SEC % HZ)
- return (USEC_PER_SEC / HZ) * j;
-#else
-# if BITS_PER_LONG == 32
+#if BITS_PER_LONG == 32
return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
-# else
+#else
return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
-# endif
#endif
}
EXPORT_SYMBOL(jiffies_to_usecs);
+#endif
/**
* mktime64 - Converts date to seconds.
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 9bc0dfd235af..0b040a417442 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2454,8 +2454,10 @@ static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link,
struct seq_file *seq)
{
struct bpf_kprobe_multi_link *kmulti_link;
+ bool has_cookies;
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
+ has_cookies = !!kmulti_link->cookies;
seq_printf(seq,
"kprobe_cnt:\t%u\n"
@@ -2467,7 +2469,7 @@ static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link,
for (int i = 0; i < kmulti_link->cnt; i++) {
seq_printf(seq,
"%llu\t %pS\n",
- kmulti_link->cookies[i],
+ has_cookies ? kmulti_link->cookies[i] : 0,
(void *)kmulti_link->addrs[i]);
}
}