summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/sysctl/kernel.rst11
-rw-r--r--include/linux/rseq_entry.h38
-rw-r--r--include/linux/rseq_types.h2
-rw-r--r--kernel/rseq.c132
4 files changed, 170 insertions, 13 deletions
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 239da22c4e28..b09d18e0f75b 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -1248,6 +1248,17 @@ reboot-cmd (SPARC only)
ROM/Flash boot loader. Maybe to tell it what to do after
rebooting. ???
+rseq_slice_extension_nsec
+=========================
+
+A task executing inside a critical section can request that its scheduling
+be delayed via the prctl(PR_RSEQ_SLICE_EXTENSION_SET) mechanism. This sets
+the maximum allowed extension in nanoseconds before scheduling of the task
+is enforced. The default value is 10000ns (10us). The possible range is
+10000ns (10us) to 50000ns (50us).
+
+This value has a direct correlation to the worst-case scheduling latency;
+increase it at your own risk.
sched_energy_aware
==================
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index 54d8e338b26e..8d04611056aa 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -87,8 +87,24 @@ static __always_inline bool rseq_slice_extension_enabled(void)
{
return static_branch_likely(&rseq_slice_extension_key);
}
+
+extern unsigned int rseq_slice_ext_nsecs;
+bool __rseq_arm_slice_extension_timer(void);
+
+static __always_inline bool rseq_arm_slice_extension_timer(void)
+{
+ if (!rseq_slice_extension_enabled())
+ return false;
+
+ if (likely(!current->rseq.slice.state.granted))
+ return false;
+
+ return __rseq_arm_slice_extension_timer();
+}
+
#else /* CONFIG_RSEQ_SLICE_EXTENSION */
static inline bool rseq_slice_extension_enabled(void) { return false; }
+static inline bool rseq_arm_slice_extension_timer(void) { return false; }
#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
@@ -543,17 +559,19 @@ static __always_inline void clear_tif_rseq(void) { }
static __always_inline bool
rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
- if (likely(!test_tif_rseq(ti_work)))
- return false;
-
- if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
- current->rseq.event.slowpath = true;
- set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
- return true;
+ if (unlikely(test_tif_rseq(ti_work))) {
+ if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
+ current->rseq.event.slowpath = true;
+ set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+ return true;
+ }
+ clear_tif_rseq();
}
-
- clear_tif_rseq();
- return false;
+ /*
+ * Arm the slice extension timer if nothing to do anymore and the
+ * task really goes out to user space.
+ */
+ return rseq_arm_slice_extension_timer();
}
#else /* CONFIG_GENERIC_ENTRY */
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 8c540e775161..8a2e76c5d2a8 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -89,10 +89,12 @@ union rseq_slice_state {
/**
* struct rseq_slice - Status information for rseq time slice extension
* @state: Time slice extension state
+ * @expires: The time when a grant expires
* @yielded: Indicator for rseq_slice_yield()
*/
struct rseq_slice {
union rseq_slice_state state;
+ u64 expires;
u8 yielded;
};
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 8aa4821e3979..275d70114107 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -71,6 +71,8 @@
#define RSEQ_BUILD_SLOW_PATH
#include <linux/debugfs.h>
+#include <linux/hrtimer.h>
+#include <linux/percpu.h>
#include <linux/prctl.h>
#include <linux/ratelimit.h>
#include <linux/rseq_entry.h>
@@ -500,8 +502,91 @@ efault:
}
#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+struct slice_timer {
+ struct hrtimer timer;
+ void *cookie;
+};
+
+unsigned int rseq_slice_ext_nsecs __read_mostly = 10 * NSEC_PER_USEC;
+static DEFINE_PER_CPU(struct slice_timer, slice_timer);
DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);
+/*
+ * When the timer expires and the task is still in user space, the return
+ * from interrupt will revoke the grant and schedule. If the task already
+ * entered the kernel via a syscall and the timer fires before the syscall
+ * work was able to cancel it, then depending on the preemption model this
+ * will either reschedule on return from interrupt or in the syscall work
+ * below.
+ */
+static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr)
+{
+ struct slice_timer *st = container_of(tmr, struct slice_timer, timer);
+
+ /*
+ * Validate that the task which armed the timer is still on the
+ * CPU. It could have been scheduled out without canceling the
+ * timer.
+ */
+ if (st->cookie == current && current->rseq.slice.state.granted) {
+ rseq_stat_inc(rseq_stats.s_expired);
+ set_need_resched_current();
+ }
+ return HRTIMER_NORESTART;
+}
+
+bool __rseq_arm_slice_extension_timer(void)
+{
+ struct slice_timer *st = this_cpu_ptr(&slice_timer);
+ struct task_struct *curr = current;
+
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * This check prevents a task, which got a time slice extension
+ * granted, from exceeding the maximum scheduling latency when the
+ * grant expired before going out to user space. Don't bother to
+ * clear the grant here, it will be cleaned up automatically before
+ * going out to user space after being scheduled back in.
+ */
+ if ((unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns()))) {
+ set_need_resched_current();
+ return true;
+ }
+
+ /*
+ * Store the task pointer as a cookie for comparison in the timer
+ * function. This is safe as the timer is CPU local and cannot be
+ * in the expiry function at this point.
+ */
+ st->cookie = curr;
+ hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD);
+ /* Arm the syscall entry work */
+ set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
+ return false;
+}
+
+static void rseq_cancel_slice_extension_timer(void)
+{
+ struct slice_timer *st = this_cpu_ptr(&slice_timer);
+
+ /*
+ * st->cookie can be safely read as preemption is disabled and the
+ * timer is CPU local.
+ *
+ * As this is most probably the first expiring timer, the cancel is
+ * expensive as it has to reprogram the hardware, but that's less
+ * expensive than going through a full hrtimer_interrupt() cycle
+ * for nothing.
+ *
+ * hrtimer_try_to_cancel() is sufficient here as the timer is CPU
+ * local and once the hrtimer code disabled interrupts the timer
+ * callback cannot be running.
+ */
+ if (st->cookie == current)
+ hrtimer_try_to_cancel(&st->timer);
+}
+
static inline void rseq_slice_set_need_resched(struct task_struct *curr)
{
/*
@@ -563,11 +648,14 @@ void rseq_syscall_enter_work(long syscall)
return;
/*
- * Required to make set_tsk_need_resched() correct on PREEMPT[RT]
- * kernels. Leaving the scope will reschedule on preemption models
- * FULL, LAZY and RT if necessary.
+ * Required to stabilize the per CPU timer pointer and to make
+ * set_tsk_need_resched() correct on PREEMPT[RT] kernels.
+ *
+ * Leaving the scope will reschedule on preemption models FULL,
+ * LAZY and RT if necessary.
*/
scoped_guard(preempt) {
+ rseq_cancel_slice_extension_timer();
/*
* Now that preemption is disabled, quickly check whether
* the task was already rescheduled before arriving here.
@@ -665,6 +753,31 @@ SYSCALL_DEFINE0(rseq_slice_yield)
return yielded;
}
+#ifdef CONFIG_SYSCTL
+static const unsigned int rseq_slice_ext_nsecs_min = 10 * NSEC_PER_USEC;
+static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC;
+
+static const struct ctl_table rseq_slice_ext_sysctl[] = {
+ {
+ .procname = "rseq_slice_extension_nsec",
+ .data = &rseq_slice_ext_nsecs,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = (unsigned int *)&rseq_slice_ext_nsecs_min,
+ .extra2 = (unsigned int *)&rseq_slice_ext_nsecs_max,
+ },
+};
+
+static void rseq_slice_sysctl_init(void)
+{
+ if (rseq_slice_extension_enabled())
+ register_sysctl_init("kernel", rseq_slice_ext_sysctl);
+}
+#else /* CONFIG_SYSCTL */
+static inline void rseq_slice_sysctl_init(void) { }
+#endif /* !CONFIG_SYSCTL */
+
static int __init rseq_slice_cmdline(char *str)
{
bool on;
@@ -677,4 +790,17 @@ static int __init rseq_slice_cmdline(char *str)
return 1;
}
__setup("rseq_slice_ext=", rseq_slice_cmdline);
+
+static int __init rseq_slice_init(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired,
+ CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD);
+ }
+ rseq_slice_sysctl_init();
+ return 0;
+}
+device_initcall(rseq_slice_init);
#endif /* CONFIG_RSEQ_SLICE_EXTENSION */