From 731bb3118f859d2a68444a9ae580681522d32bc0 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 22 Jan 2026 16:35:56 -0800 Subject: Revert "PCI/TSM: Report active IDE streams" The proposed ABI failed to account for multiple host bridges with the same stream name. The fix needs to namespace streams or otherwise link back to the host bridge, but a change like that is too big for a fix. Given this ABI never saw a released kernel, delete it for now and bring it back later with this issue addressed. Reported-by: Xu Yilun Reported-by: Yi Lai Closes: http://lore.kernel.org/20251223085601.2607455-1-yilun.xu@linux.intel.com Link: http://patch.msgid.link/6972c872acbb9_1d3310035@dwillia2-mobl4.notmuch Signed-off-by: Dan Williams --- include/linux/pci-ide.h | 2 -- include/linux/tsm.h | 3 --- 2 files changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h index 37a1ad9501b0..5d4d56ed088d 100644 --- a/include/linux/pci-ide.h +++ b/include/linux/pci-ide.h @@ -82,7 +82,6 @@ struct pci_ide_regs { * @host_bridge_stream: allocated from host bridge @ide_stream_ida pool * @stream_id: unique Stream ID (within Partner Port pairing) * @name: name of the established Selective IDE Stream in sysfs - * @tsm_dev: For TSM established IDE, the TSM device context * * Negative @stream_id values indicate "uninitialized" on the * expectation that with TSM established IDE the TSM owns the stream_id @@ -94,7 +93,6 @@ struct pci_ide { u8 host_bridge_stream; int stream_id; const char *name; - struct tsm_dev *tsm_dev; }; /* diff --git a/include/linux/tsm.h b/include/linux/tsm.h index a3b7ab668eff..22e05b2aac69 100644 --- a/include/linux/tsm.h +++ b/include/linux/tsm.h @@ -123,7 +123,4 @@ int tsm_report_unregister(const struct tsm_report_ops *ops); struct tsm_dev *tsm_register(struct device *parent, struct pci_tsm_ops *ops); void tsm_unregister(struct tsm_dev *tsm_dev); struct tsm_dev *find_tsm_dev(int id); -struct pci_ide; -int tsm_ide_stream_register(struct pci_ide *ide); -void tsm_ide_stream_unregister(struct pci_ide *ide); #endif /* __TSM_H */ -- cgit v1.2.3 From 8370af2019dee9ca004ca7c5e36b1f629ecb1e39 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Wed, 14 Jan 2026 19:14:55 +0800 Subject: PCI/IDE: Fix off by one error calculating VF RID range The VF ID range of an SR-IOV device is [0, num_VFs - 1]. pci_ide_stream_alloc() mistakenly uses num_VFs to represent the last ID. Fix that off by one error to stay in bounds of the range. Fixes: 1e4d2ff3ae45 ("PCI/IDE: Add IDE establishment helpers") Signed-off-by: Li Ming Reviewed-by: Xu Yilun Link: https://patch.msgid.link/20260114111455.550984-1-ming.li@zohomail.com Signed-off-by: Dan Williams --- include/linux/pci-ide.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h index 5d4d56ed088d..ae07d9f699c0 100644 --- a/include/linux/pci-ide.h +++ b/include/linux/pci-ide.h @@ -26,7 +26,7 @@ enum pci_ide_partner_select { /** * struct pci_ide_partner - Per port pair Selective IDE Stream settings * @rid_start: Partner Port Requester ID range start - * @rid_end: Partner Port Requester ID range end + * @rid_end: Partner Port Requester ID range end (inclusive) * @stream_index: Selective IDE Stream Register Block selection * @mem_assoc: PCI bus memory address association for targeting peer partner * @pref_assoc: PCI bus prefetchable memory address association for -- cgit v1.2.3 From 13e00fdc9236bd4d0bff4109d2983171fbcb74c4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 28 Jan 2026 14:15:38 +0000 Subject: net: add skb_header_pointer_careful() helper This variant of skb_header_pointer() should be used in contexts where @offset argument is user-controlled and could be negative. Negative offsets are supported, as long as the zone starts between skb->head and skb->data. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260128141539.3404400-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 86737076101d..112e48970338 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4301,6 +4301,18 @@ skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) skb_headlen(skb), buffer); } +/* Variant of skb_header_pointer() where @offset is user-controlled + * and potentially negative. + */ +static inline void * __must_check +skb_header_pointer_careful(const struct sk_buff *skb, int offset, + int len, void *buffer) +{ + if (unlikely(offset < 0 && -offset > skb_headroom(skb))) + return NULL; + return skb_header_pointer(skb, offset, len, buffer); +} + static inline void * __must_check skb_pointer_if_linear(const struct sk_buff *skb, int offset, int len) { -- cgit v1.2.3 From 47ee94efccf6732e4ef1a815c451aacaf1464757 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 2 Feb 2026 10:39:45 +0100 Subject: sched/mmcid: Protect transition on weakly ordered systems Shrikanth reported a hard lockup which he observed once. The stack trace shows the following CID related participants: watchdog: CPU 23 self-detected hard LOCKUP @ mm_get_cid+0xe8/0x188 NIP: mm_get_cid+0xe8/0x188 LR: mm_get_cid+0x108/0x188 mm_cid_switch_to+0x3c4/0x52c __schedule+0x47c/0x700 schedule_idle+0x3c/0x64 do_idle+0x160/0x1b0 cpu_startup_entry+0x48/0x50 start_secondary+0x284/0x288 start_secondary_prolog+0x10/0x14 watchdog: CPU 11 self-detected hard LOCKUP @ plpar_hcall_norets_notrace+0x18/0x2c NIP: plpar_hcall_norets_notrace+0x18/0x2c LR: queued_spin_lock_slowpath+0xd88/0x15d0 _raw_spin_lock+0x80/0xa0 raw_spin_rq_lock_nested+0x3c/0xf8 mm_cid_fixup_cpus_to_tasks+0xc8/0x28c sched_mm_cid_exit+0x108/0x22c do_exit+0xf4/0x5d0 make_task_dead+0x0/0x178 system_call_exception+0x128/0x390 system_call_vectored_common+0x15c/0x2ec The task on CPU11 is running the CID ownership mode change fixup function and is stuck on a runqueue lock. The task on CPU23 is trying to get a CID from the pool with the same runqueue lock held, but the pool is empty. After decoding a similar issue in the opposite direction switching from per task to per CPU mode the tool which models the possible scenarios failed to come up with a similar loop hole. This showed up only once, was not reproducible and according to tooling not related to a overlooked scheduling scenario permutation. But the fact that it was observed on a PowerPC system gave the right hint: PowerPC is a weakly ordered architecture. The transition mechanism does: WRITE_ONCE(mm->mm_cid.transit, MM_CID_TRANSIT); WRITE_ONCE(mm->mm_cid.percpu, new_mode); fixup() WRITE_ONCE(mm->mm_cid.transit, 0); mm_cid_schedin() does: if (!READ_ONCE(mm->mm_cid.percpu)) ... cid |= READ_ONCE(mm->mm_cid.transit); so weakly ordered systems can observe percpu == false and transit == 0 even if the fixup function has not yet completed. As a consequence the task will not drop the CID when scheduling out before the fixup is completed, which means the CID space can be exhausted and the next task scheduling in will loop in mm_get_cid() and the fixup thread can livelock on the held runqueue lock as above. This could obviously be solved by using: smp_store_release(&mm->mm_cid.percpu, true); and smp_load_acquire(&mm->mm_cid.percpu); but that brings a memory barrier back into the scheduler hotpath, which was just designed out by the CID rewrite. That can be completely avoided by combining the per CPU mode and the transit storage into a single mm_cid::mode member and ordering the stores against the fixup functions to prevent the CPU from reordering them. That makes the update of both states atomic and a concurrent read observes always consistent state. The price is an additional AND operation in mm_cid_schedin() to evaluate the per CPU or the per task path, but that's in the noise even on strongly ordered architectures as the actual load can be significantly more expensive and the conditional branch evaluation is there anyway. Fixes: fbd0e71dc370 ("sched/mmcid: Provide CID ownership mode fixup functions") Closes: https://lore.kernel.org/bdfea828-4585-40e8-8835-247c6a8a76b0@linux.ibm.com Reported-by: Shrikanth Hegde Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20260201192834.965217106@kernel.org --- include/linux/rseq_types.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 332dc14b81c9..ef0811379c54 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -121,8 +121,7 @@ struct mm_cid_pcpu { /** * struct mm_mm_cid - Storage for per MM CID data * @pcpu: Per CPU storage for CIDs associated to a CPU - * @percpu: Set, when CIDs are in per CPU mode - * @transit: Set to MM_CID_TRANSIT during a mode change transition phase + * @mode: Indicates per CPU and transition mode * @max_cids: The exclusive maximum CID value for allocation and convergence * @irq_work: irq_work to handle the affinity mode change case * @work: Regular work to handle the affinity mode change case @@ -139,8 +138,7 @@ struct mm_cid_pcpu { struct mm_mm_cid { /* Hotpath read mostly members */ struct mm_cid_pcpu __percpu *pcpu; - unsigned int percpu; - unsigned int transit; + unsigned int mode; unsigned int max_cids; /* Rarely used. Moves @lock and @mutex into the second cacheline */ -- cgit v1.2.3 From 7987cce375ac8ce98e170a77aa2399f2cf6eb99f Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Tue, 3 Feb 2026 14:54:46 -0800 Subject: ceph: fix NULL pointer dereference in ceph_mds_auth_match() The CephFS kernel client has regression starting from 6.18-rc1. We have issue in ceph_mds_auth_match() if fs_name == NULL: const char fs_name = mdsc->fsc->mount_options->mds_namespace; ... if (auth->match.fs_name && strcmp(auth->match.fs_name, fs_name)) { / fsname mismatch, try next one */ return 0; } Patrick Donnelly suggested that: In summary, we should definitely start decoding `fs_name` from the MDSMap and do strict authorizations checks against it. Note that the `-o mds_namespace=foo` should only be used for selecting the file system to mount and nothing else. It's possible no mds_namespace is specified but the kernel will mount the only file system that exists which may have name "foo". This patch reworks ceph_mdsmap_decode() and namespace_equals() with the goal of supporting the suggested concept. Now struct ceph_mdsmap contains m_fs_name field that receives copy of extracted FS name by ceph_extract_encoded_string(). For the case of "old" CephFS file systems, it is used "cephfs" name. [ idryomov: replace redundant %*pE with %s in ceph_mdsmap_decode(), get rid of a series of strlen() calls in ceph_namespace_match(), drop changes to namespace_equals() body to avoid treating empty mds_namespace as equal, drop changes to ceph_mdsc_handle_fsmap() as namespace_equals() isn't an equivalent substitution there ] Cc: stable@vger.kernel.org Fixes: 22c73d52a6d0 ("ceph: fix multifs mds auth caps issue") Link: https://tracker.ceph.com/issues/73886 Signed-off-by: Viacheslav Dubeyko Reviewed-by: Patrick Donnelly Tested-by: Patrick Donnelly Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index c7f2c63b3bc3..08e5dbe15ca4 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -31,6 +31,12 @@ #define CEPH_INO_CEPH 2 /* hidden .ceph dir */ #define CEPH_INO_GLOBAL_SNAPREALM 3 /* global dummy snaprealm */ +/* + * name for "old" CephFS file systems, + * see ceph.git e2b151d009640114b2565c901d6f41f6cd5ec652 + */ +#define CEPH_OLD_FS_NAME "cephfs" + /* arbitrary limit on max # of monitors (cluster of 3 is typical) */ #define CEPH_MAX_MON 31 -- cgit v1.2.3 From ab10815472fcbc2c772dc21a979460b7f74f0145 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Fri, 23 Jan 2026 11:26:56 +0100 Subject: livepatch: Fix having __klp_objects relics in non-livepatch modules The linker script scripts/module.lds.S specifies that all input __klp_objects sections should be consolidated into an output section of the same name, and start/stop symbols should be created to enable scripts/livepatch/init.c to locate this data. This start/stop pattern is not ideal for modules because the symbols are created even if no __klp_objects input sections are present. Consequently, a dummy __klp_objects section also appears in the resulting module. This unnecessarily pollutes non-livepatch modules. Instead, since modules are relocatable files, the usual method for locating consolidated data in a module is to read its section table. This approach avoids the aforementioned problem. The klp_modinfo already stores a copy of the entire section table with the final addresses. Introduce a helper function that scripts/livepatch/init.c can call to obtain the location of the __klp_objects section from this data. Fixes: dd590d4d57eb ("objtool/klp: Introduce klp diff subcommand for diffing object files") Signed-off-by: Petr Pavlu Acked-by: Joe Lawrence Acked-by: Miroslav Benes Reviewed-by: Aaron Tomlin Link: https://patch.msgid.link/20260123102825.3521961-2-petr.pavlu@suse.com Signed-off-by: Josh Poimboeuf --- include/linux/livepatch.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 772919e8096a..ba9e3988c07c 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -175,6 +175,9 @@ int klp_enable_patch(struct klp_patch *); int klp_module_coming(struct module *mod); void klp_module_going(struct module *mod); +void *klp_find_section_by_name(const struct module *mod, const char *name, + size_t *sec_size); + void klp_copy_process(struct task_struct *child); void klp_update_patch_state(struct task_struct *task); -- cgit v1.2.3 From b5cbacd7f86f4f62b8813688c8e73be94e8e1951 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 29 Jan 2026 13:53:40 -0800 Subject: procfs: avoid fetching build ID while holding VMA lock Fix PROCMAP_QUERY to fetch optional build ID only after dropping mmap_lock or per-VMA lock, whichever was used to lock VMA under question, to avoid deadlock reported by syzbot: -> #1 (&mm->mmap_lock){++++}-{4:4}: __might_fault+0xed/0x170 _copy_to_iter+0x118/0x1720 copy_page_to_iter+0x12d/0x1e0 filemap_read+0x720/0x10a0 blkdev_read_iter+0x2b5/0x4e0 vfs_read+0x7f4/0xae0 ksys_read+0x12a/0x250 do_syscall_64+0xcb/0xf80 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #0 (&sb->s_type->i_mutex_key#8){++++}-{4:4}: __lock_acquire+0x1509/0x26d0 lock_acquire+0x185/0x340 down_read+0x98/0x490 blkdev_read_iter+0x2a7/0x4e0 __kernel_read+0x39a/0xa90 freader_fetch+0x1d5/0xa80 __build_id_parse.isra.0+0xea/0x6a0 do_procmap_query+0xd75/0x1050 procfs_procmap_ioctl+0x7a/0xb0 __x64_sys_ioctl+0x18e/0x210 do_syscall_64+0xcb/0xf80 entry_SYSCALL_64_after_hwframe+0x77/0x7f other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- rlock(&mm->mmap_lock); lock(&sb->s_type->i_mutex_key#8); lock(&mm->mmap_lock); rlock(&sb->s_type->i_mutex_key#8); *** DEADLOCK *** This seems to be exacerbated (as we haven't seen these syzbot reports before that) by the recent: 777a8560fd29 ("lib/buildid: use __kernel_read() for sleepable context") To make this safe, we need to grab file refcount while VMA is still locked, but other than that everything is pretty straightforward. Internal build_id_parse() API assumes VMA is passed, but it only needs the underlying file reference, so just add another variant build_id_parse_file() that expects file passed directly. [akpm@linux-foundation.org: fix up kerneldoc] Link: https://lkml.kernel.org/r/20260129215340.3742283-1-andrii@kernel.org Fixes: ed5d583a88a9 ("fs/procfs: implement efficient VMA querying API for /proc//maps") Signed-off-by: Andrii Nakryiko Reported-by: Reviewed-by: Suren Baghdasaryan Tested-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Eduard Zingerman Cc: Hao Luo Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Martin KaFai Lau Cc: Song Liu Cc: Stanislav Fomichev Cc: Yonghong Song Cc: Signed-off-by: Andrew Morton --- include/linux/buildid.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/buildid.h b/include/linux/buildid.h index 831c1b4b626c..7acc06b22fb7 100644 --- a/include/linux/buildid.h +++ b/include/linux/buildid.h @@ -7,7 +7,10 @@ #define BUILD_ID_SIZE_MAX 20 struct vm_area_struct; +struct file; + int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size); +int build_id_parse_file(struct file *file, unsigned char *build_id, __u32 *size); int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size); int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size); -- cgit v1.2.3