From 237bbd29f7a049d310d907f4b2716a7feef9abf3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:37:03 -0700 Subject: KEYS: prevent creating a different user's keyrings It was possible for an unprivileged user to create the user and user session keyrings for another user. For example: sudo -u '#3000' sh -c 'keyctl add keyring _uid.4000 "" @u keyctl add keyring _uid_ses.4000 "" @u sleep 15' & sleep 1 sudo -u '#4000' keyctl describe @u sudo -u '#4000' keyctl describe @us This is problematic because these "fake" keyrings won't have the right permissions. In particular, the user who created them first will own them and will have full access to them via the possessor permissions, which can be used to compromise the security of a user's keys: -4: alswrv-----v------------ 3000 0 keyring: _uid.4000 -5: alswrv-----v------------ 3000 0 keyring: _uid_ses.4000 Fix it by marking user and user session keyrings with a flag KEY_FLAG_UID_KEYRING. Then, when searching for a user or user session keyring by name, skip all keyrings that don't have the flag set. 
Fixes: 69664cf16af4 ("keys: don't generate user and user session keyrings unless they're accessed") Cc: [v2.6.26+] Signed-off-by: Eric Biggers Signed-off-by: David Howells --- include/linux/key.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/key.h b/include/linux/key.h index 044114185120..e315e16b6ff8 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -187,6 +187,7 @@ struct key { #define KEY_FLAG_BUILTIN 8 /* set if key is built in to the kernel */ #define KEY_FLAG_ROOT_CAN_INVAL 9 /* set if key can be invalidated by root without permission */ #define KEY_FLAG_KEEP 10 /* set if key should not be removed */ +#define KEY_FLAG_UID_KEYRING 11 /* set if key is a user or user session keyring */ /* the key type and key description string * - the desc is used to match a key against search criteria @@ -243,6 +244,7 @@ extern struct key *key_alloc(struct key_type *type, #define KEY_ALLOC_NOT_IN_QUOTA 0x0002 /* not in quota */ #define KEY_ALLOC_BUILT_IN 0x0004 /* Key is built into kernel */ #define KEY_ALLOC_BYPASS_RESTRICTION 0x0008 /* Override the check on restricted keyrings */ +#define KEY_ALLOC_UID_KEYRING 0x0010 /* allocating a user or user session keyring */ extern void key_revoke(struct key *key); extern void key_invalidate(struct key *key); -- cgit v1.2.3 From 5acb3cc2c2e9d3020a4fee43763c6463767f1572 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 20 Sep 2017 13:12:20 -0600 Subject: blktrace: Fix potential deadlock between delete & sysfs ops The lockdep code had reported the following unsafe locking scenario: CPU0 CPU1 ---- ---- lock(s_active#228); lock(&bdev->bd_mutex/1); lock(s_active#228); lock(&bdev->bd_mutex); *** DEADLOCK *** The deadlock may happen when one task (CPU1) is trying to delete a partition in a block device and another task (CPU0) is accessing tracing sysfs file (e.g. /sys/block/dm-1/trace/act_mask) in that partition. The s_active isn't an actual lock. 
It is a reference count (kn->count) on the sysfs (kernfs) file. Removal of a sysfs file, however, requires a wait until all the references are gone. The reference count is treated like a rwsem using lockdep instrumentation code. The fact that a thread is in the sysfs callback method or in the ioctl call means there is a reference to the opened sysfs or device file. That should prevent the underlying block structure from being removed. Instead of using bd_mutex in the block_device structure, a new blk_trace_mutex is now added to the request_queue structure to protect access to the blk_trace structure. Suggested-by: Christoph Hellwig Signed-off-by: Waiman Long Acked-by: Steven Rostedt (VMware) Fix typo in patch subject line, and prune a comment detailing how the code used to work. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 460294bb0fa5..02fa42d24b52 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -551,6 +551,7 @@ struct request_queue { int node; #ifdef CONFIG_BLK_DEV_IO_TRACE struct blk_trace *blk_trace; + struct mutex blk_trace_mutex; #endif /* * for flush operations -- cgit v1.2.3 From c98cb3bd882119e7e1a7c8df2f1eacfcc701450b Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 16:27:25 -0700 Subject: nvme.h: remove FC transport-specific error values The NVM express group rescinded the reserved range for the transport. Remove the FC-centric values that had been defined. 
Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 87723c86f136..2440be32be1d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1127,19 +1127,6 @@ enum { NVME_SC_UNWRITTEN_BLOCK = 0x287, NVME_SC_DNR = 0x4000, - - - /* - * FC Transport-specific error status values for NVME commands - * - * Transport-specific status code values must be in the range 0xB0..0xBF - */ - - /* Generic FC failure - catchall */ - NVME_SC_FC_TRANSPORT_ERROR = 0x00B0, - - /* I/O failure due to FC ABTS'd */ - NVME_SC_FC_TRANSPORT_ABORTED = 0x00B1, }; struct nvme_completion { -- cgit v1.2.3 From d85cf207499e6740ab9c490ff4f360af5c432d23 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 13:20:23 -0700 Subject: nvme: add transport SGL definitions Add transport SGL definitions from NVMe TP 4008, required for the final NVMe-FC standard. 
Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2440be32be1d..9310ce77d8e1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -471,12 +471,14 @@ enum nvme_opcode { * * @NVME_SGL_FMT_ADDRESS: absolute address of the data block * @NVME_SGL_FMT_OFFSET: relative offset of the in-capsule data block + * @NVME_SGL_FMT_TRANSPORT_A: transport defined format, value 0xA * @NVME_SGL_FMT_INVALIDATE: RDMA transport specific remote invalidation * request subtype */ enum { NVME_SGL_FMT_ADDRESS = 0x00, NVME_SGL_FMT_OFFSET = 0x01, + NVME_SGL_FMT_TRANSPORT_A = 0x0A, NVME_SGL_FMT_INVALIDATE = 0x0f, }; @@ -490,12 +492,16 @@ enum { * * For struct nvme_keyed_sgl_desc: * @NVME_KEY_SGL_FMT_DATA_DESC: keyed data block descriptor + * + * Transport-specific SGL types: + * @NVME_TRANSPORT_SGL_DATA_DESC: Transport SGL data dlock descriptor */ enum { NVME_SGL_FMT_DATA_DESC = 0x00, NVME_SGL_FMT_SEG_DESC = 0x02, NVME_SGL_FMT_LAST_SEG_DESC = 0x03, NVME_KEY_SGL_FMT_DATA_DESC = 0x04, + NVME_TRANSPORT_SGL_DATA_DESC = 0x05, }; struct nvme_sgl_desc { -- cgit v1.2.3 From fe59493240169a2cc3f445ae5f2a2308fda06b63 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 11 Sep 2017 14:29:15 +0200 Subject: PCI: Add dummy pci_acs_enabled() for CONFIG_PCI=n build If CONFIG_PCI=n and gcc (e.g. 4.1.2) decides not to inline get_pci_function_alias_group(), the build fails with: drivers/iommu/iommu.o: In function `get_pci_function_alias_group': iommu.c:(.text+0xfdc): undefined reference to `pci_acs_enabled' Due to the various dummies for PCI calls in the CONFIG_PCI=n case, pci_acs_enabled() is never called, but not all versions of gcc are smart enough to realize that. 
While explicitly marking get_pci_function_alias_group() inline would fix the build, this would inflate the code for the CONFIG_PCI=y case, as get_pci_function_alias_group() is a not-so-small function called from two places. Hence fix the issue by introducing a dummy for pci_acs_enabled() instead. Fixes: 0ae349a0f33f ("iommu/qcom: Add qcom_iommu") Signed-off-by: Geert Uytterhoeven Signed-off-by: Bjorn Helgaas Reviewed-by: Alex Williamson --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index f68c58a93dd0..f4f8ee5a7362 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1685,6 +1685,8 @@ static inline int pci_get_new_domain_nr(void) { return -ENOSYS; } #define dev_is_pci(d) (false) #define dev_is_pf(d) (false) +static inline bool pci_acs_enabled(struct pci_dev *pdev, u16 acs_flags) +{ return false; } #endif /* CONFIG_PCI */ /* Include architecture-dependent settings and functions */ -- cgit v1.2.3 From 6b71f9e1e849f82abb4a8d54ce7f4b1c71f19ac4 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 20 Sep 2017 11:07:26 -0700 Subject: nvmet-fc: sync header templates with comments Comments were incorrect: - defer_rcv was in host port template. moved to target port template - Added Mandatory statements for target port template items Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme-fc-driver.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 9c5cb4480806..a726f96010d5 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -346,11 +346,6 @@ struct nvme_fc_remote_port { * indicating an FC transport Aborted status. * Entrypoint is Mandatory. 
* - * @defer_rcv: Called by the transport to signal the LLLD that it has - * begun processing of a previously received NVME CMD IU. The LLDD - * is now free to re-use the rcv buffer associated with the - * nvmefc_tgt_fcp_req. - * * @max_hw_queues: indicates the maximum number of hw queues the LLDD * supports for cpu affinitization. * Value is Mandatory. Must be at least 1. @@ -806,11 +801,19 @@ struct nvmet_fc_target_port { * outstanding operation (if there was one) to complete, then will * call the fcp_req_release() callback to return the command's * exchange context back to the LLDD. + * Entrypoint is Mandatory. * * @fcp_req_release: Called by the transport to return a nvmefc_tgt_fcp_req * to the LLDD after all operations on the fcp operation are complete. * This may be due to the command completing or upon completion of * abort cleanup. + * Entrypoint is Mandatory. + * + * @defer_rcv: Called by the transport to signal the LLLD that it has + * begun processing of a previously received NVME CMD IU. The LLDD + * is now free to re-use the rcv buffer associated with the + * nvmefc_tgt_fcp_req. + * Entrypoint is Optional. * * @max_hw_queues: indicates the maximum number of hw queues the LLDD * supports for cpu affinitization. -- cgit v1.2.3 From fac1c2040203363eab6c6e86ce883cb71390418f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:15 +0200 Subject: smp/hotplug: Add state diagram Add a state diagram to clarify when which states are run where. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.661598270@infradead.org --- include/linux/cpuhotplug.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index f24bfb2b9a2d..477b2e6f60f7 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -3,6 +3,24 @@ #include +/* + * CPU-up CPU-down + * + * BP AP BP AP + * + * OFFLINE OFFLINE + * | ^ + * v | + * BRINGUP_CPU->AP_OFFLINE BRINGUP_CPU <- AP_IDLE_DEAD (idle thread/play_dead) + * | AP_OFFLINE + * v (IRQ-off) ,---------------^ + * AP_ONLNE | (stop_machine) + * | TEARDOWN_CPU <- AP_ONLINE_IDLE + * | ^ + * v | + * AP_ACTIVE AP_ACTIVE + */ + enum cpuhp_state { CPUHP_OFFLINE, CPUHP_CREATE_THREADS, -- cgit v1.2.3 From 1db49484f21ed0fcdadd0635a3669f5f386546fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:21 +0200 Subject: smp/hotplug: Hotplug state fail injection Add a sysfs file to one-time fail a specific state. This can be used to test the state rollback code paths. Something like this (hotplug-up.sh): #!/bin/bash echo 0 > /debug/sched_debug echo 1 > /debug/tracing/events/cpuhp/enable ALL_STATES=`cat /sys/devices/system/cpu/hotplug/states | cut -d':' -f1` STATES=${1:-$ALL_STATES} for state in $STATES do echo 0 > /sys/devices/system/cpu/cpu1/online echo 0 > /debug/tracing/trace echo Fail state: $state echo $state > /sys/devices/system/cpu/cpu1/hotplug/fail cat /sys/devices/system/cpu/cpu1/hotplug/fail echo 1 > /sys/devices/system/cpu/cpu1/online cat /debug/tracing/trace > hotfail-${state}.trace sleep 1 done Can be used to test for all possible rollback (barring multi-instance) scenarios on CPU-up, CPU-down is a trivial modification of the above. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.972581715@infradead.org --- include/linux/cpuhotplug.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 477b2e6f60f7..6d508767e144 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -22,7 +22,8 @@ */ enum cpuhp_state { - CPUHP_OFFLINE, + CPUHP_INVALID = -1, + CPUHP_OFFLINE = 0, CPUHP_CREATE_THREADS, CPUHP_PERF_PREPARE, CPUHP_PERF_X86_PREPARE, -- cgit v1.2.3 From 50ce6312f293e129eedf2affc7bd791c71d8287e Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 26 Sep 2017 19:32:52 +0100 Subject: iommu: Fix comment for iommu_ops.map_sg The definition of map_sg was split during a recent addition to iommu_ops. Put it back together. Fixes: add02cfdc9bc ("iommu: Introduce Interface for IOMMU TLB Flushing") Signed-off-by: Jean-Philippe Brucker Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a7f2ac689d29..41b8c5757859 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -167,11 +167,11 @@ struct iommu_resv_region { * @map: map a physically contiguous memory region to an iommu domain * @unmap: unmap a physically contiguous memory region from an iommu domain * @map_sg: map a scatter-gather list of physically contiguous memory chunks + * to an iommu domain * @flush_tlb_all: Synchronously flush all hardware TLBs for this domain * @tlb_range_add: Add a given iova range to the flush queue for this domain * @tlb_sync: Flush all queued ranges from the hardware TLBs and empty flush * queue - * to an iommu domain * @iova_to_phys: translate iova to physical address * 
@add_device: add device to iommu grouping * @remove_device: remove device from iommu grouping -- cgit v1.2.3 From 686fef928bba6be13cabe639f154af7d72b63120 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 28 Sep 2017 06:38:17 -0700 Subject: timer: Prepare to change timer callback argument type Modern kernel callback systems pass the structure associated with a given callback to the callback function. The timer callback remains one of the legacy cases where an arbitrary unsigned long argument continues to be passed as the callback argument. This has several problems: - This bloats the timer_list structure with a normally redundant .data field. - No type checking is being performed, forcing callbacks to do explicit type casts of the unsigned long argument into the object that was passed, rather than using container_of(), as done in most of the other callback infrastructure. - Neighboring buffer overflows can overwrite both the .function and the .data field, providing attackers with a way to elevate from a buffer overflow into a simplistic ROP-like mechanism that allows calling arbitrary functions with a controlled first argument. - For future Control Flow Integrity work, this creates a unique function prototype for timer callbacks, instead of allowing them to continue to be clustered with other void functions that take a single unsigned long argument. This adds a new timer initialization API, which will ultimately replace the existing setup_timer(), setup_{deferrable,pinned,etc}_timer() family, named timer_setup() (to mirror hrtimer_setup(), making instances of its use much easier to grep for). In order to support the migration of existing timers into the new callback arguments, timer_setup() casts its arguments to the existing legacy types, and explicitly passes the timer pointer as the legacy data argument. 
Once all setup_*timer() callers have been replaced with timer_setup(), the casts can be removed, and the data argument can be dropped with the timer expiration code changed to just pass the timer to the callback directly. Since the regular pattern of using container_of() during local variable declaration repeats the need for the variable type declaration to be included, this adds a helper modeled after other from_*() helpers that wrap container_of(), named from_timer(). This helper uses typeof(*variable), removing the type redundancy and minimizing the need for line wraps in forthcoming conversions from "unsigned long data" to "struct timer_list *" in the timer callbacks: -void callback(unsigned long data) +void callback(struct timer_list *t) { - struct some_data_structure *local = (struct some_data_structure *)data; + struct some_data_structure *local = from_timer(local, t, timer); Finally, in order to support the handful of timer users that perform open-coded assignments of the .function (and .data) fields, provide cast macros (TIMER_FUNC_TYPE and TIMER_DATA_TYPE) that can be used temporarily. Once conversion has been completed, these can be globally trivially removed. 
Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20170928133817.GA113410@beast --- include/linux/timer.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index e6789b8757d5..6383c528b148 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -168,6 +168,20 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define setup_pinned_deferrable_timer_on_stack(timer, fn, data) \ __setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED) +#define TIMER_DATA_TYPE unsigned long +#define TIMER_FUNC_TYPE void (*)(TIMER_DATA_TYPE) + +static inline void timer_setup(struct timer_list *timer, + void (*callback)(struct timer_list *), + unsigned int flags) +{ + __setup_timer(timer, (TIMER_FUNC_TYPE)callback, + (TIMER_DATA_TYPE)timer, flags); +} + +#define from_timer(var, callback_timer, timer_fieldname) \ + container_of(callback_timer, typeof(*var), timer_fieldname) + /** * timer_pending - is a timer pending? * @timer: the timer in question -- cgit v1.2.3 From 1593baab910da72480d651ea7bf2ce6e3a25a484 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:09:26 +0200 Subject: sched/debug: Implement consistent task-state printing Currently get_task_state() and task_state_to_char() report different states, create a number of common helpers and unify the reported state space. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 92fb8dd5a9e4..163a0b738908 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1243,17 +1243,29 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) return task_pgrp_nr_ns(tsk, &init_pid_ns); } -static inline char task_state_to_char(struct task_struct *task) +static inline unsigned int __get_task_state(struct task_struct *tsk) { - const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - unsigned long state = task->state; + unsigned int tsk_state = READ_ONCE(tsk->state); + unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; - state = state ? __ffs(state) + 1 : 0; + if (tsk_state == TASK_PARKED) + state = TASK_INTERRUPTIBLE; - /* Make sure the string lines up properly with the number of task states: */ - BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1); + return fls(state); +} + +static inline char __task_state_to_char(unsigned int state) +{ + static const char state_char[] = "RSDTtXZ"; + + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != sizeof(state_char) - 2); - return state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'; + return state_char[state]; +} + +static inline char task_state_to_char(struct task_struct *tsk) +{ + return __task_state_to_char(__get_task_state(tsk)); } /** -- cgit v1.2.3 From 92c4bc9f9cd92a8581e36bc5105f03b569f37e36 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:13:36 +0200 Subject: sched/debug: Convert TASK_state to hex Bit patterns are easier in hex. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 163a0b738908..69bed5339ffa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -65,23 +65,23 @@ struct task_group; */ /* Used in tsk->state: */ -#define TASK_RUNNING 0 -#define TASK_INTERRUPTIBLE 1 -#define TASK_UNINTERRUPTIBLE 2 -#define __TASK_STOPPED 4 -#define __TASK_TRACED 8 +#define TASK_RUNNING 0x0000 +#define TASK_INTERRUPTIBLE 0x0001 +#define TASK_UNINTERRUPTIBLE 0x0002 +#define __TASK_STOPPED 0x0004 +#define __TASK_TRACED 0x0008 /* Used in tsk->exit_state: */ -#define EXIT_DEAD 16 -#define EXIT_ZOMBIE 32 +#define EXIT_DEAD 0x0010 +#define EXIT_ZOMBIE 0x0020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->state again: */ -#define TASK_DEAD 64 -#define TASK_WAKEKILL 128 -#define TASK_WAKING 256 -#define TASK_PARKED 512 -#define TASK_NOLOAD 1024 -#define TASK_NEW 2048 -#define TASK_STATE_MAX 4096 +#define TASK_DEAD 0x0040 +#define TASK_WAKEKILL 0x0080 +#define TASK_WAKING 0x0100 +#define TASK_PARKED 0x0200 +#define TASK_NOLOAD 0x0400 +#define TASK_NEW 0x0800 +#define TASK_STATE_MAX 0x1000 #define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" -- cgit v1.2.3 From efb40f588b4370ffaeffafbd50f6ff213d954254 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:19:53 +0200 Subject: sched/tracing: Fix trace_sched_switch task-state printing Convert trace_sched_switch to use the common task-state helpers and fix the "X" and "Z" order, possibly they ended up in the wrong order because TASK_REPORT has them in the wrong order too. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 69bed5339ffa..a2fe636b6825 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -99,7 +99,7 @@ struct task_group; /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ - __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD) + __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE) #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) -- cgit v1.2.3 From 5f6ad26ea353fdf3dad2328052cbee49e0b9c5b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:23:31 +0200 Subject: sched/tracing: Use common task-state helpers Remove yet another task-state char instance. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index a2fe636b6825..bc7807933415 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -83,8 +83,6 @@ struct task_group; #define TASK_NEW 0x0800 #define TASK_STATE_MAX 0x1000 -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" - /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) -- cgit v1.2.3 From 06eb61844d841d0032a9950ce7f8e783ee49c0d0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:30:40 +0200 Subject: sched/debug: Add explicit TASK_IDLE printing Markus reported that kthreads that idle using TASK_IDLE instead of TASK_INTERRUPTIBLE are reported in 
as TASK_UNINTERRUPTIBLE and things like htop mark those red. This is undesirable, so add an explicit state for TASK_IDLE. Reported-by: Markus Trippelsdorf Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index bc7807933415..286fc1117046 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1241,22 +1241,30 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) return task_pgrp_nr_ns(tsk, &init_pid_ns); } +#define TASK_REPORT_IDLE (TASK_REPORT + 1) +#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) + static inline unsigned int __get_task_state(struct task_struct *tsk) { unsigned int tsk_state = READ_ONCE(tsk->state); unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; + BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); + if (tsk_state == TASK_PARKED) state = TASK_INTERRUPTIBLE; + if (tsk_state == TASK_IDLE) + state = TASK_REPORT_IDLE; + return fls(state); } static inline char __task_state_to_char(unsigned int state) { - static const char state_char[] = "RSDTtXZ"; + static const char state_char[] = "RSDTtXZI"; - BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != sizeof(state_char) - 2); + BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); return state_char[state]; } -- cgit v1.2.3 From 8ef9925b02c23e3838d5e593c5cf37984141150f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:37:28 +0200 Subject: sched/debug: Add explicit TASK_PARKED printing Currently TASK_PARKED is masqueraded as TASK_INTERRUPTIBLE, give it its own print state because it will not in fact get woken by regular wakeups and is a long-term state. 
This requires moving TASK_PARKED into the TASK_REPORT mask, and since that latter needs to be a contiguous bitmask, we need to shuffle the bits around a bit. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 286fc1117046..26a7df4e558c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -75,10 +75,10 @@ struct task_group; #define EXIT_ZOMBIE 0x0020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->state again: */ -#define TASK_DEAD 0x0040 -#define TASK_WAKEKILL 0x0080 -#define TASK_WAKING 0x0100 -#define TASK_PARKED 0x0200 +#define TASK_PARKED 0x0040 +#define TASK_DEAD 0x0080 +#define TASK_WAKEKILL 0x0100 +#define TASK_WAKING 0x0200 #define TASK_NOLOAD 0x0400 #define TASK_NEW 0x0800 #define TASK_STATE_MAX 0x1000 @@ -97,7 +97,8 @@ struct task_group; /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ - __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE) + __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ + TASK_PARKED) #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) @@ -1251,9 +1252,6 @@ static inline unsigned int __get_task_state(struct task_struct *tsk) BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); - if (tsk_state == TASK_PARKED) - state = TASK_INTERRUPTIBLE; - if (tsk_state == TASK_IDLE) state = TASK_REPORT_IDLE; @@ -1262,7 +1260,7 @@ static inline unsigned int __get_task_state(struct task_struct *tsk) static inline char __task_state_to_char(unsigned int state) { - static const char state_char[] = "RSDTtXZI"; + static const char state_char[] = "RSDTtXZPI"; BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); -- cgit v1.2.3 From 
d4d7b4ad2f05c03fb25252aea66f9f3cd7cfbe06 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 26 Oct 2017 10:44:07 +0100 Subject: irqchip/gic-v3-its: Setup VLPI properties at map time So far, we require the hypervisor to update the VLPI properties once the VLPI mapping has been established. While this makes it easy for the ITS driver, it creates a window where an incoming interrupt can be delivered with an unknown set of properties. Not very nice. Instead, let's add a "properties" field to the mapping structure, and use that to configure the VLPI before it actually gets mapped. Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v4.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index 43cde15f221b..447da8ca2156 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -71,12 +71,14 @@ struct its_vpe { * @vm: Pointer to the GICv4 notion of a VM * @vpe: Pointer to the GICv4 notion of a virtual CPU (VPE) * @vintid: Virtual LPI number + * @properties: Priority and enable bits (as written in the prop table) * @db_enabled: Is the VPE doorbell to be generated? */ struct its_vlpi_map { struct its_vm *vm; struct its_vpe *vpe; u32 vintid; + u8 properties; bool db_enabled; }; -- cgit v1.2.3 From 4b82130077d93539c9fbb0f5eee21965cea9cfe9 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Mon, 30 Oct 2017 10:15:00 +0800 Subject: irqdomain: Update the comments of fwnode field of irq_domain structure Commit: f110711a6053 ("irqdomain: Convert irqdomain-%3Eof_node to fwnode") converted of_node field to fwnode, but didn't update its comments. Update it. 
Fixes: f110711a6053 ("irqdomain: Convert irqdomain-%3Eof_node to fwnode") Signed-off-by: Dou Liyang Signed-off-by: Marc Zyngier --- include/linux/irqdomain.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index df162f7a4aad..ce48a23d621f 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -138,8 +138,8 @@ struct irq_domain_chip_generic; * @mapcount: The number of mapped interrupts * * Optional elements - * @of_node: Pointer to device tree nodes associated with the irq_domain. Used - * when decoding device tree interrupt specifiers. + * @fwnode: Pointer to firmware node associated with the irq_domain. Pretty easy + * to swap it for the of_node via the irq_domain_get_of_node accessor * @gc: Pointer to a list of generic chips. There is a helper function for * setting up one or more generic chips for interrupt controllers * drivers using the generic chip library which uses this pointer. -- cgit v1.2.3 From da61fcf9d62a05f3508f5646d353a9c2604bac76 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 31 Oct 2017 09:41:45 -0700 Subject: irqchip: mips-gic: Use irq_cpu_online to (un)mask all-VP(E) IRQs The gic_all_vpes_local_irq_controller chip currently attempts to operate on all CPUs/VPs in the system when masking or unmasking an interrupt. This has a few drawbacks: - In multi-cluster systems we may not always have access to all CPUs in the system. When all CPUs in a cluster are powered down that cluster's GIC may also power down, in which case we cannot configure its state. - Relatedly, if we power down a cluster after having configured interrupts for CPUs within it then the cluster's GIC may lose state & we need to reconfigure it. The current approach doesn't take this into account. - It's wasteful if we run Linux on fewer VPs than are present in the system. 
For example if we run a uniprocessor kernel on CPU0 of a system with 16 CPUs then there's no point in us configuring CPUs 1-15. - The implementation is also lacking in that it expects the range 0..gic_vpes-1 to represent valid Linux CPU numbers which may not always be the case - for example if we run on a system with more VPs than the kernel is configured to support. Fix all of these issues by only configuring the affected interrupts for CPUs which are online at the time, and recording the configuration in a new struct gic_all_vpes_chip_data for later use by CPUs being brought online. We register a CPU hotplug state (reusing CPUHP_AP_IRQ_GIC_STARTING which the ARM GIC driver uses, and which seems suitably generic for reuse with the MIPS GIC) and execute irq_cpu_online() in order to configure the interrupts on the newly onlined CPU. Signed-off-by: Paul Burton Cc: Jason Cooper Cc: Marc Zyngier Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Signed-off-by: Marc Zyngier --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 6d508767e144..1966a45bc453 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -98,6 +98,7 @@ enum cpuhp_state { CPUHP_AP_IRQ_HIP04_STARTING, CPUHP_AP_IRQ_ARMADA_XP_STARTING, CPUHP_AP_IRQ_BCM2836_STARTING, + CPUHP_AP_IRQ_MIPS_GIC_STARTING, CPUHP_AP_ARM_MVEBU_COHERENCY, CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, CPUHP_AP_PERF_X86_STARTING, -- cgit v1.2.3