diff options
Diffstat (limited to 'include/linux')
32 files changed, 508 insertions, 155 deletions
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 50b47eba7d01..e7195750d21b 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -105,6 +105,12 @@ ARM_SMCCC_SMC_32, \ 0, 0x3fff) +/* C1-Pro erratum 4193714: SME DVMSync early acknowledgement */ +#define ARM_SMCCC_CPU_WORKAROUND_4193714 \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_CPU, 0x10) + #define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ ARM_SMCCC_SMC_32, \ diff --git a/include/linux/bio.h b/include/linux/bio.h index 97d747320b35..dc17780d6c1e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -475,7 +475,8 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen); +int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen, + size_t minsize); void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty); extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b4b703c90ca9..cd191c5fdb0a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2917,7 +2917,13 @@ int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size); #ifndef CONFIG_BPF_JIT_ALWAYS_ON -void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); +int bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); +s32 bpf_call_args_imm(s16 idx); +#else +static inline s32 bpf_call_args_imm(s16 idx) +{ + return 0; +} #endif struct btf *bpf_get_btf_vmlinux(void); @@ -3725,6 +3731,7 @@ extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; +extern const struct bpf_func_proto bpf_sk_setsockopt_nodelay_proto; extern const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_find_vma_proto; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b148f816f25b..185b2aa43a42 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -729,6 +729,7 @@ struct bpf_subprog_info { */ s16 fastcall_stack_off; bool has_tail_call: 1; + bool might_throw: 1; bool tail_call_reachable: 1; bool has_ld_abs: 1; bool is_cb: 1; @@ -1308,6 +1309,7 @@ void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); bool bpf_subprog_is_global(const struct bpf_verifier_env *env, int subprog); int bpf_find_subprog(struct bpf_verifier_env *env, int off); +bool bpf_is_throw_kfunc(struct bpf_insn *insn); int bpf_compute_const_regs(struct bpf_verifier_env *env); int bpf_prune_dead_branches(struct bpf_verifier_env *env); int bpf_check_cfg(struct bpf_verifier_env *env); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f6d037a30fd8..c5648fcf74e2 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -777,6 +777,7 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) /* * cgroup scalable recursive statistics. */ +void __css_rstat_updated(struct cgroup_subsys_state *css, int cpu); void css_rstat_updated(struct cgroup_subsys_state *css, int cpu); void css_rstat_flush(struct cgroup_subsys_state *css); diff --git a/include/linux/efi.h b/include/linux/efi.h index 72e76ec54641..ccbc35479684 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -61,7 +61,7 @@ typedef void *efi_handle_t; /* * The UEFI spec and EDK2 reference implementation both define EFI_GUID as - * struct { u32 a; u16; b; u16 c; u8 d[8]; }; and so the implied alignment + * struct { u32 a; u16 b; u16 c; u8 d[8]; }; and so the implied alignment * is 32 bits not 8 bits like our guid_t. In some cases (i.e., on 32-bit ARM), * this means that firmware services invoked by the kernel may assume that * efi_guid_t* arguments are 32-bit aligned, and use memory accessors that diff --git a/include/linux/filter.h b/include/linux/filter.h index 1ec6d5ba64cc..88a241aac36a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1151,9 +1151,6 @@ bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); -#define __bpf_call_base_args \ - ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ - (void *)__bpf_call_base) struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 0a3bcd1718f3..be1b38c981d4 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -94,6 +94,7 @@ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num); int register_fprobe_syms(struct fprobe *fp, const char **syms, int num); int unregister_fprobe(struct fprobe *fp); +int unregister_fprobe_async(struct fprobe *fp); bool fprobe_is_registered(struct fprobe *fp); int fprobe_count_ips_from_filter(const char *filter, const char *notfilter); #else @@ -113,6 +114,10 @@ static inline int unregister_fprobe(struct fprobe *fp) { return -EOPNOTSUPP; } +static inline int unregister_fprobe_async(struct fprobe *fp) +{ + return -EOPNOTSUPP; +} static inline bool fprobe_is_registered(struct fprobe *fp) { return false; diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 80b38fbf2121..31df7608737e 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -208,6 +208,7 @@ struct fwnode_operations { static inline void fwnode_init(struct fwnode_handle *fwnode, const struct fwnode_operations *ops) { + fwnode->secondary = NULL; fwnode->ops = ops; INIT_LIST_HEAD(&fwnode->consumers); INIT_LIST_HEAD(&fwnode->suppliers); diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 6c75df30a281..cd4972a7c97c 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -273,11 +273,11 @@ enum { * * %__GFP_ZERO returns a zeroed page on success. * - * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself - * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that - * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting - * memory tags at the same time as zeroing memory has minimal additional - * performance impact. + * %__GFP_ZEROTAGS zeroes memory tags at allocation time. Setting memory tags at + * the same time as zeroing memory (e.g., with __GFP_ZERO) has minimal + * additional performance impact. However, __GFP_ZEROTAGS also zeroes the tags + * even if memory is not getting zeroed at allocation time (e.g., + * with init_on_free). * * %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation. * Used for userspace and vmalloc pages; the latter are unpoisoned by diff --git a/include/linux/hid.h b/include/linux/hid.h index 442a80d79e89..bfb9859f391e 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1030,6 +1030,8 @@ struct hid_field *hid_find_field(struct hid_device *hdev, unsigned int report_ty int hid_set_field(struct hid_field *, unsigned, __s32); int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt); +int hid_safe_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, + size_t bufsize, u32 size, int interrupt); struct hid_field *hidinput_get_led_field(struct hid_device *hid); unsigned int hidinput_count_leds(struct hid_device *hid); __s32 hidinput_calc_abs_res(const struct hid_field *field, __u16 code); @@ -1298,8 +1300,8 @@ static inline u32 hid_report_len(struct hid_report *report) return DIV_ROUND_UP(report->size, 8) + (report->id > 0); } -int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, - int interrupt); +int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, + size_t bufsize, u32 size, int interrupt); /* HID quirks API */ unsigned long hid_lookup_quirk(const struct hid_device *hdev); diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index a2e47dbcf82c..19fffa4574a4 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -72,8 +72,8 @@ struct hid_ops { int (*hid_hw_output_report)(struct hid_device *hdev, __u8 *buf, size_t len, u64 source, bool from_bpf); int (*hid_input_report)(struct hid_device *hid, enum hid_report_type type, - u8 *data, u32 size, int interrupt, u64 source, bool from_bpf, - bool lock_already_taken); + u8 *data, size_t bufsize, u32 size, int interrupt, u64 source, + bool from_bpf, bool lock_already_taken); struct module *owner; const struct bus_type *bus_type; }; @@ -200,7 +200,8 @@ struct hid_bpf { #ifdef CONFIG_HID_BPF u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, u8 *data, - u32 *size, int interrupt, u64 source, bool from_bpf); + size_t *buf_size, u32 *size, int interrupt, u64 source, + bool from_bpf); int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, u32 size, enum hid_report_type rtype, @@ -215,8 +216,11 @@ int hid_bpf_device_init(struct hid_device *hid); const u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, const u8 *rdesc, unsigned int *size); #else /* CONFIG_HID_BPF */ static inline u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, - u8 *data, u32 *size, int interrupt, - u64 source, bool from_bpf) { return data; } + u8 *data, size_t *buf_size, u32 *size, + int interrupt, u64 source, bool from_bpf) +{ + return data; +} static inline int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, unsigned char reportnum, u8 *buf, u32 size, enum hid_report_type rtype, diff --git a/include/linux/highmem.h b/include/linux/highmem.h index af03db851a1d..d7aac9de1c8a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -347,10 +347,11 @@ static inline void clear_highpage_kasan_tagged(struct page *page) #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGES -/* Return false to let people know we did not initialize the pages */ -static inline bool tag_clear_highpages(struct page *page, int numpages) +/* Returns true if the caller has to initialize the pages */ +static inline bool tag_clear_highpages(struct page *page, int numpages, + bool clear_pages) { - return false; + return clear_pages; } #endif diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h index 94c06bf214fb..15f02422e9ca 100644 --- a/include/linux/intel_tpmi.h +++ b/include/linux/intel_tpmi.h @@ -28,6 +28,12 @@ enum intel_tpmi_id { TPMI_INFO_ID = 0x81, /* Special ID for PCI BDF and Package ID information */ }; +#define TPMI_CORE_INIT 0 +#define TPMI_CORE_EXIT 1 + +int tpmi_register_notifier(struct notifier_block *nb); +int tpmi_unregister_notifier(struct notifier_block *nb); + struct oobmsm_plat_info *tpmi_get_platform_data(struct auxiliary_device *auxdev); struct resource *tpmi_get_resource_at_index(struct auxiliary_device *auxdev, int index); int tpmi_get_resource_count(struct auxiliary_device *auxdev); diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 167fba7dbf04..1fabf0f5ea8e 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -218,14 +218,6 @@ static __always_inline void __exit_to_user_mode_validate(void) lockdep_sys_exit(); } -/* Temporary workaround to keep ARM64 alive */ -static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) -{ - __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK); - rseq_exit_to_user_mode_legacy(); - __exit_to_user_mode_validate(); -} - /** * syscall_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required * @regs: Pointer to pt_regs on entry stack diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h index 40d2fce68294..f78787e654f4 100644 --- a/include/linux/irqchip/arm-gic-v5.h +++ b/include/linux/irqchip/arm-gic-v5.h @@ -425,9 +425,6 @@ struct gicv5_its_itt_cfg { void gicv5_init_lpis(u32 max); void gicv5_deinit_lpis(void); -int gicv5_alloc_lpi(void); -void gicv5_free_lpi(u32 lpi); - void __init gicv5_its_of_probe(struct device_node *parent); void __init gicv5_its_acpi_probe(void); #endif diff --git a/include/linux/libata.h b/include/linux/libata.h index 5c085ef4eda7..127229fbd1a6 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -371,6 +371,7 @@ enum { /* return values for ->qc_defer */ ATA_DEFER_LINK = 1, ATA_DEFER_PORT = 2, + ATA_DEFER_LINK_EXCL = 3, /* desc_len for ata_eh_info and context */ ATA_EH_DESC_LEN = 80, @@ -854,6 +855,9 @@ struct ata_link { unsigned int sata_spd; /* current SATA PHY speed */ enum ata_lpm_policy lpm_policy; + struct work_struct deferred_qc_work; + struct ata_queued_cmd *deferred_qc; + /* record runtime error info, protected by host_set lock */ struct ata_eh_info eh_info; /* EH context */ @@ -899,9 +903,6 @@ struct ata_port { u64 qc_active; int nr_active_links; /* #links with active qcs */ - struct work_struct deferred_qc_work; - struct ata_queued_cmd *deferred_qc; - struct ata_link link; /* host default link */ struct ata_link *slave_link; /* see ata_slave_link_init() */ diff --git a/include/linux/list.h b/include/linux/list.h index 00ea8e5fb88b..09d979976b3b 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -191,6 +191,29 @@ static inline void list_add_tail(struct list_head *new, struct list_head *head) __list_add(new, head->prev, head); } +/** + * list_add_tail_release - add a new entry with release barrier + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head, using a release barrier to set + * the ->next pointer that points to it. This is useful for implementing + * queues, in particular one that the elements will be walked through forwards + * locklessly. + */ +static inline void list_add_tail_release(struct list_head *new, + struct list_head *head) +{ + struct list_head *prev = head->prev; + + if (__list_add_valid(new, prev, head)) { + new->next = head; + new->prev = prev; + head->prev = new; + smp_store_release(&prev->next, new); + } +} + /* * Delete a list entry by making the prev/next entries * point to each other. @@ -645,6 +668,20 @@ static inline void list_splice_tail_init(struct list_head *list, }) /** + * list_first_entry_or_null_acquire - get the first element from a list with barrier + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_head within the struct. + * + * Note that if the list is empty, it returns NULL. + */ +#define list_first_entry_or_null_acquire(ptr, type, member) ({ \ + struct list_head *head__ = (ptr); \ + struct list_head *pos__ = smp_load_acquire(&head__->next); \ + pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ +}) + +/** * list_last_entry_or_null - get the last element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index a81b46af5118..5a1c5c336fa4 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -305,9 +305,11 @@ struct xt_counters *xt_counters_alloc(unsigned int counters); struct xt_table *xt_register_table(struct net *net, const struct xt_table *table, + const struct nf_hook_ops *template_ops, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); -void *xt_unregister_table(struct xt_table *table); +void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name); +struct xt_table *xt_unregister_table_exit(struct net *net, u8 af, const char *name); struct xt_table_info *xt_replace_table(struct xt_table *table, unsigned int num_counters, diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h index a40aaf645fa4..05631a25e622 100644 --- a/include/linux/netfilter_arp/arp_tables.h +++ b/include/linux/netfilter_arp/arp_tables.h @@ -53,7 +53,6 @@ int arpt_register_table(struct net *net, const struct xt_table *table, const struct arpt_replace *repl, const struct nf_hook_ops *ops); void arpt_unregister_table(struct net *net, const char *name); -void arpt_unregister_table_pre_exit(struct net *net, const char *name); extern unsigned int arpt_do_table(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h index 132b0e4a6d4d..13593391d605 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -26,7 +26,6 @@ int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, const struct nf_hook_ops *ops); -void ipt_unregister_table_pre_exit(struct net *net, const char *name); void ipt_unregister_table_exit(struct net *net, const char *name); /* Standard entry. */ diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index 8b8885a73c76..c6d5b927830d 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -27,7 +27,6 @@ extern void *ip6t_alloc_initial_table(const struct xt_table *); int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, const struct nf_hook_ops *ops); -void ip6t_unregister_table_pre_exit(struct net *net, const char *name); void ip6t_unregister_table_exit(struct net *net, const char *name); extern unsigned int ip6t_do_table(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index ba17ac5bf356..243c0f737938 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -62,8 +62,8 @@ struct netfs_inode { struct fscache_cookie *cache; #endif struct mutex wb_lock; /* Writeback serialisation */ - loff_t remote_i_size; /* Size of the remote file */ - loff_t zero_point; /* Size after which we assume there's no data + loff_t _remote_i_size; /* Size of the remote file */ + loff_t _zero_point; /* Size after which we assume there's no data * on the server */ atomic_t io_count; /* Number of outstanding reqs */ unsigned long flags; @@ -252,7 +252,7 @@ struct netfs_io_request { unsigned long long collected_to; /* Point we've collected to */ unsigned long long cleaned_to; /* Position we've cleaned folios to */ unsigned long long abandon_to; /* Position to abandon folios to */ - pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ + const struct folio *no_unlock_folio; /* Don't unlock this folio after read */ unsigned int direct_bv_count; /* Number of elements in direct_bv[] */ unsigned int debug_id; unsigned int rsize; /* Maximum read size (0 for none) */ @@ -475,6 +475,254 @@ static inline struct netfs_inode *netfs_inode(struct inode *inode) } /** + * netfs_read_remote_i_size - Read remote_i_size safely + * @inode: The inode to access + * + * Read remote_i_size safely without the potential for tearing on 32-bit + * arches. + * + * NOTE: in a 32bit arch with a preemptable kernel and an UP compile the + * i_size_read/write must be atomic with respect to the local cpu (unlike with + * preempt disabled), but they don't need to be atomic with respect to other + * cpus like in true SMP (so they need either to either locally disable irq + * around the read or for example on x86 they can be still implemented as a + * cmpxchg8b without the need of the lock prefix). For SMP compiles and 64bit + * archs it makes no difference if preempt is enabled or not. + */ +static inline unsigned long long netfs_read_remote_i_size(const struct inode *inode) +{ + const struct netfs_inode *ictx = container_of(inode, struct netfs_inode, inode); + unsigned long long remote_i_size; + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + unsigned int seq; + + do { + seq = read_seqcount_begin(&inode->i_size_seqcount); + remote_i_size = ictx->_remote_i_size; + } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + remote_i_size = ictx->_remote_i_size; + preempt_enable(); +#else + /* Pairs with smp_store_release() in netfs_write_remote_i_size() */ + remote_i_size = smp_load_acquire(&ictx->_remote_i_size); +#endif + return remote_i_size; +} + +/* + * netfs_write_remote_i_size - Set remote_i_size safely + * @inode: The inode to access + * @remote_i_size: The new value for the size of the file on the server + * + * Set remote_i_size safely without the potential for tearing on 32-bit arches. + * + * Context: The caller must hold inode->i_lock. + * + * NOTE: unlike netfs_read_remote_i_size(), netfs_write_remote_i_size() does + * need locking around it (normally i_rwsem), otherwise on 32bit/SMP an update + * of i_size_seqcount can be lost, resulting in subsequent i_size_read() calls + * spinning forever. + */ +static inline void netfs_write_remote_i_size(struct inode *inode, + unsigned long long remote_i_size) +{ + struct netfs_inode *ictx = netfs_inode(inode); + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&inode->i_size_seqcount); + ictx->_remote_i_size = remote_i_size; + write_seqcount_end(&inode->i_size_seqcount); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + ictx->_remote_i_size = remote_i_size; + preempt_enable(); +#else + /* + * Pairs with smp_load_acquire() in netfs_read_remote_i_size() to + * ensure changes related to inode size (such as page contents) are + * visible before we see the changed inode size. + */ + smp_store_release(&ictx->_remote_i_size, remote_i_size); +#endif +} + +/** + * netfs_read_zero_point - Read zero_point safely + * @inode: The inode to access + * + * Read zero_point safely without the potential for tearing on 32-bit + * arches. + * + * NOTE: in a 32bit arch with a preemptable kernel and an UP compile the + * i_size_read/write must be atomic with respect to the local cpu (unlike with + * preempt disabled), but they don't need to be atomic with respect to other + * cpus like in true SMP (so they need either to either locally disable irq + * around the read or for example on x86 they can be still implemented as a + * cmpxchg8b without the need of the lock prefix). For SMP compiles and 64bit + * archs it makes no difference if preempt is enabled or not. + */ +static inline unsigned long long netfs_read_zero_point(const struct inode *inode) +{ + struct netfs_inode *ictx = container_of(inode, struct netfs_inode, inode); + unsigned long long zero_point; + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + unsigned int seq; + + do { + seq = read_seqcount_begin(&inode->i_size_seqcount); + zero_point = ictx->_zero_point; + } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + zero_point = ictx->_zero_point; + preempt_enable(); +#else + /* Pairs with smp_store_release() in netfs_write_zero_point() */ + zero_point = smp_load_acquire(&ictx->_zero_point); +#endif + return zero_point; +} + +/* + * netfs_write_zero_point - Set zero_point safely + * @inode: The inode to access + * @zero_point: The new value for the point beyond which the server has no data + * + * Set zero_point safely without the potential for tearing on 32-bit arches. + * + * Context: The caller must hold inode->i_lock. + * + * NOTE: unlike netfs_read_zero_point(), netfs_write_zero_point() does need + * locking around it (normally i_rwsem), otherwise on 32bit/SMP an update of + * i_size_seqcount can be lost, resulting in subsequent read calls spinning + * forever. + */ +static inline void netfs_write_zero_point(struct inode *inode, + unsigned long long zero_point) +{ + struct netfs_inode *ictx = netfs_inode(inode); + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&inode->i_size_seqcount); + ictx->_zero_point = zero_point; + write_seqcount_end(&inode->i_size_seqcount); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + ictx->_zero_point = zero_point; + preempt_enable(); +#else + /* + * Pairs with smp_load_acquire() in netfs_read_zero_point() to + * ensure changes related to inode size (such as page contents) are + * visible before we see the changed inode size. + */ + smp_store_release(&ictx->_zero_point, zero_point); +#endif +} + +/** + * netfs_read_sizes - Read remote_i_size and zero_point safely + * @inode: The inode to access + * @i_size: Where to return the local file size. + * @remote_i_size: Where to return the size of the file on the server + * @zero_point: Where to return the the point beyond which the server has no data + * + * Read remote_i_size and zero_point safely without the potential for tearing + * on 32-bit arches. + * + * NOTE: in a 32bit arch with a preemptable kernel and an UP compile the + * i_size_read/write must be atomic with respect to the local cpu (unlike with + * preempt disabled), but they don't need to be atomic with respect to other + * cpus like in true SMP (so they need either to either locally disable irq + * around the read or for example on x86 they can be still implemented as a + * cmpxchg8b without the need of the lock prefix). For SMP compiles and 64bit + * archs it makes no difference if preempt is enabled or not. + */ +static inline void netfs_read_sizes(const struct inode *inode, + unsigned long long *i_size, + unsigned long long *remote_i_size, + unsigned long long *zero_point) +{ + const struct netfs_inode *ictx = container_of(inode, struct netfs_inode, inode); +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + unsigned int seq; + + do { + seq = read_seqcount_begin(&inode->i_size_seqcount); + *i_size = inode->i_size; + *remote_i_size = ictx->_remote_i_size; + *zero_point = ictx->_zero_point; + } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + *i_size = inode->i_size; + *remote_i_size = ictx->_remote_i_size; + *zero_point = ictx->_zero_point; + preempt_enable(); +#else + /* Pairs with smp_store_release() in i_size_write() */ + *i_size = smp_load_acquire(&inode->i_size); + /* Pairs with smp_store_release() in netfs_write_remote_i_size() */ + *remote_i_size = smp_load_acquire(&ictx->_remote_i_size); + /* Pairs with smp_store_release() in netfs_write_zero_point() */ + *zero_point = smp_load_acquire(&ictx->_zero_point); +#endif +} + +/* + * netfs_write_sizes - Set i_size, remote_i_size and zero_point safely + * @inode: The inode to access + * @i_size: The new value for the local size of the file + * @remote_i_size: The new value for the size of the file on the server + * @zero_point: The new value for the point beyond which the server has no data + * + * Set both remote_i_size and zero_point safely without the potential for + * tearing on 32-bit arches. + * + * Context: The caller must hold inode->i_lock. + * + * NOTE: unlike netfs_read_zero_point(), netfs_write_zero_point() does need + * locking around it (normally i_rwsem), otherwise on 32bit/SMP an update of + * i_size_seqcount can be lost, resulting in subsequent read calls spinning + * forever. + */ +static inline void netfs_write_sizes(struct inode *inode, + unsigned long long i_size, + unsigned long long remote_i_size, + unsigned long long zero_point) +{ + struct netfs_inode *ictx = netfs_inode(inode); + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&inode->i_size_seqcount); + inode->i_size = i_size; + ictx->_remote_i_size = remote_i_size; + ictx->_zero_point = zero_point; + write_seqcount_end(&inode->i_size_seqcount); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + inode->i_size = i_size; + ictx->_remote_i_size = remote_i_size; + ictx->_zero_point = zero_point; + preempt_enable(); +#else + /* + * Pairs with smp_load_acquire() in i_size_read(), + * netfs_read_remote_i_size() and netfs_read_zero_point() to ensure + * changes related to inode size (such as page contents) are visible + * before we see the changed inode size. + */ + smp_store_release(&inode->i_size, i_size); + smp_store_release(&ictx->_remote_i_size, remote_i_size); + smp_store_release(&ictx->_zero_point, zero_point); +#endif +} + +/** * netfs_inode_init - Initialise a netfslib inode context * @ctx: The netfs inode to initialise * @ops: The netfs's operations list @@ -488,8 +736,8 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, bool use_zero_point) { ctx->ops = ops; - ctx->remote_i_size = i_size_read(&ctx->inode); - ctx->zero_point = LLONG_MAX; + ctx->_remote_i_size = i_size_read(&ctx->inode); + ctx->_zero_point = LLONG_MAX; ctx->flags = 0; atomic_set(&ctx->io_count, 0); #if IS_ENABLED(CONFIG_FSCACHE) @@ -498,7 +746,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, mutex_init(&ctx->wb_lock); /* ->releasepage() drives zero_point */ if (use_zero_point) { - ctx->zero_point = ctx->remote_i_size; + ctx->_zero_point = ctx->_remote_i_size; mapping_set_release_always(ctx->inode.i_mapping); } } @@ -511,13 +759,40 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, * * Inform the netfs lib that a file got resized so that it can adjust its state. */ -static inline void netfs_resize_file(struct netfs_inode *ctx, loff_t new_i_size, +static inline void netfs_resize_file(struct netfs_inode *ictx, + unsigned long long new_i_size, bool changed_on_server) { +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + struct inode *inode = &ictx->inode; + + preempt_disable(); + write_seqcount_begin(&inode->i_size_seqcount); + if (changed_on_server) + ictx->_remote_i_size = new_i_size; + if (new_i_size < ictx->_zero_point) + ictx->_zero_point = new_i_size; + write_seqcount_end(&inode->i_size_seqcount); + preempt_enable(); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); if (changed_on_server) - ctx->remote_i_size = new_i_size; - if (new_i_size < ctx->zero_point) - ctx->zero_point = new_i_size; + ictx->_remote_i_size = new_i_size; + if (new_i_size < ictx->_zero_point) + ictx->_zero_point = new_i_size; + preempt_enable(); +#else + /* + * Pairs with smp_load_acquire() in netfs_read_remote_i_size and + * netfs_read_zero_point() to ensure changes related to inode size + * (such as page contents) are visible before we see the changed inode + * size. + */ + if (changed_on_server) + smp_store_release(&ictx->_remote_i_size, new_i_size); + if (new_i_size < ictx->_zero_point) + smp_store_release(&ictx->_zero_point, new_i_size); +#endif } /** diff --git a/include/linux/rseq.h b/include/linux/rseq.h index b9d62fc2140d..7ef79b25e714 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -9,6 +9,11 @@ void __rseq_handle_slowpath(struct pt_regs *regs); +static __always_inline bool rseq_v2(struct task_struct *t) +{ + return IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && likely(t->rseq.event.has_rseq > 1); +} + /* Invoked from resume_user_mode_work() */ static inline void rseq_handle_slowpath(struct pt_regs *regs) { @@ -16,8 +21,7 @@ static inline void rseq_handle_slowpath(struct pt_regs *regs) if (current->rseq.event.slowpath) __rseq_handle_slowpath(regs); } else { - /* '&' is intentional to spare one conditional branch */ - if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) + if (current->rseq.event.sched_switch && current->rseq.event.has_rseq) __rseq_handle_slowpath(regs); } } @@ -30,9 +34,9 @@ void __rseq_signal_deliver(int sig, struct pt_regs *regs); */ static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { - /* '&' is intentional to spare one conditional branch */ - if (current->rseq.event.has_rseq & current->rseq.event.user_irq) + if (rseq_v2(current)) { + /* has_rseq is implied in rseq_v2() */ + if (current->rseq.event.user_irq) __rseq_signal_deliver(ksig->sig, regs); } else { if (current->rseq.event.has_rseq) @@ -50,15 +54,22 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t) { struct rseq_event *ev = &t->rseq.event; - if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + /* + * Only apply the user_irq optimization for RSEQ ABI V2 registrations. + * Legacy users like TCMalloc rely on the original ABI V1 behaviour + * which updates IDs on every context swtich. + */ + if (rseq_v2(t)) { /* - * Avoid a boat load of conditionals by using simple logic - * to determine whether NOTIFY_RESUME needs to be raised. + * Avoid a boat load of conditionals by using simple logic to + * determine whether TIF_NOTIFY_RESUME or TIF_RSEQ needs to be + * raised. * - * It's required when the CPU or MM CID has changed or - * the entry was from user space. + * It's required when the CPU or MM CID has changed or the entry + * was via interrupt from user space. ev->has_rseq does not have + * to be evaluated here because rseq_v2() implies has_rseq. */ - bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq; + bool raise = ev->user_irq | ev->ids_changed; if (raise) { ev->sched_switch = true; @@ -66,6 +77,7 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t) } } else { if (ev->has_rseq) { + t->rseq.event.ids_changed = true; t->rseq.event.sched_switch = true; rseq_raise_notify_resume(t); } @@ -119,6 +131,8 @@ static inline void rseq_virt_userspace_exit(void) static inline void rseq_reset(struct task_struct *t) { + /* Protect against preemption and membarrier IPI */ + guard(irqsave)(); memset(&t->rseq, 0, sizeof(t->rseq)); t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED; } @@ -159,6 +173,7 @@ static inline unsigned int rseq_alloc_align(void) } #else /* CONFIG_RSEQ */ +static inline bool rseq_v2(struct task_struct *t) { return false; } static inline void rseq_handle_slowpath(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index f11ebd34f8b9..63bc72086e75 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -111,6 +111,20 @@ static __always_inline void rseq_slice_clear_grant(struct task_struct *t) t->rseq.slice.state.granted = false; } +/* + * Open coded, so it can be invoked within a user access region. + * + * This clears the user space state of the time slice extensions field only when + * the task has registered the optimized RSEQ_ABI V2. Some legacy registrations, + * e.g. TCMalloc, have conflicting non-ABI fields in struct RSEQ, which would be + * overwritten by an unconditional write. + */ +#define rseq_slice_clear_user(rseq, efault) \ +do { \ + if (rseq_slice_extension_enabled()) \ + unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); \ +} while (0) + static __always_inline bool __rseq_grant_slice_extension(bool work_pending) { struct task_struct *curr = current; @@ -230,10 +244,10 @@ static __always_inline bool rseq_slice_extension_enabled(void) { return false; } static __always_inline bool rseq_arm_slice_extension_timer(void) { return false; } static __always_inline void rseq_slice_clear_grant(struct task_struct *t) { } static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } +#define rseq_slice_clear_user(rseq, efault) do { } while (0) #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); -bool rseq_debug_validate_ids(struct task_struct *t); static __always_inline void rseq_note_user_irq_entry(void) { @@ -353,43 +367,6 @@ efault: return false; } -/* - * On debug kernels validate that user space did not mess with it if the - * debug branch is enabled. - */ -bool rseq_debug_validate_ids(struct task_struct *t) -{ - struct rseq __user *rseq = t->rseq.usrptr; - u32 cpu_id, uval, node_id; - - /* - * On the first exit after registering the rseq region CPU ID is - * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0! - */ - node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ? - cpu_to_node(t->rseq.ids.cpu_id) : 0; - - scoped_user_read_access(rseq, efault) { - unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault); - if (cpu_id != t->rseq.ids.cpu_id) - goto die; - unsafe_get_user(uval, &rseq->cpu_id, efault); - if (uval != cpu_id) - goto die; - unsafe_get_user(uval, &rseq->node_id, efault); - if (uval != node_id) - goto die; - unsafe_get_user(uval, &rseq->mm_cid, efault); - if (uval != t->rseq.ids.mm_cid) - goto die; - } - return true; -die: - t->rseq.event.fatal = true; -efault: - return false; -} - #endif /* RSEQ_BUILD_SLOW_PATH */ /* @@ -499,37 +476,50 @@ efault: * faults in task context are fatal too. */ static rseq_inline -bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, - u32 node_id, u64 *csaddr) +bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, u64 *csaddr) { struct rseq __user *rseq = t->rseq.usrptr; - if (static_branch_unlikely(&rseq_debug_enabled)) { - if (!rseq_debug_validate_ids(t)) - return false; - } - scoped_user_rw_access(rseq, efault) { + /* Validate the R/O fields for debug and optimized mode */ + if (static_branch_unlikely(&rseq_debug_enabled) || rseq_v2(t)) { + u32 cpu_id, uval; + + unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault); + if (cpu_id != t->rseq.ids.cpu_id) + goto die; + unsafe_get_user(uval, &rseq->cpu_id, efault); + if (uval != cpu_id) + goto die; + unsafe_get_user(uval, &rseq->node_id, efault); + if (uval != t->rseq.ids.node_id) + goto die; + unsafe_get_user(uval, &rseq->mm_cid, efault); + if (uval != t->rseq.ids.mm_cid) + goto die; + } + unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault); unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault); - unsafe_put_user(node_id, &rseq->node_id, efault); + unsafe_put_user(ids->node_id, &rseq->node_id, efault); unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault); if (csaddr) unsafe_get_user(*csaddr, &rseq->rseq_cs, efault); - /* Open coded, so it's in the same user access region */ - if (rseq_slice_extension_enabled()) { - /* Unconditionally clear it, no point in conditionals */ - unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); - } + /* RSEQ ABI V2 only operations */ + if (rseq_v2(t)) + rseq_slice_clear_user(rseq, efault); } rseq_slice_clear_grant(t); /* Cache the new values */ - t->rseq.ids.cpu_cid = ids->cpu_cid; + t->rseq.ids = *ids; rseq_stat_inc(rseq_stats.ids); rseq_trace_update(t, ids); return true; + +die: + t->rseq.event.fatal = true; efault: return false; } @@ -539,11 +529,11 @@ efault: * is in a critical section. */ static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs, - struct rseq_ids *ids, u32 node_id) + struct rseq_ids *ids) { u64 csaddr; - if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr)) + if (!rseq_set_ids_get_csaddr(t, ids, &csaddr)) return false; /* @@ -612,6 +602,14 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t * interrupts disabled */ guard(pagefault)(); + /* + * This optimization is only valid when the task registered for the + * optimized RSEQ_ABI_V2 variant. Some legacy users rely on the original + * RSEQ implementation behaviour which unconditionally updated the IDs. + * rseq_sched_switch_event() ensures that legacy registrations always + * have both sched_switch and ids_changed set, which is compatible with + * the historical TIF_NOTIFY_RESUME behaviour. + */ if (likely(!t->rseq.event.ids_changed)) { struct rseq __user *rseq = t->rseq.usrptr; /* @@ -623,11 +621,9 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t scoped_user_rw_access(rseq, efault) { unsafe_get_user(csaddr, &rseq->rseq_cs, efault); - /* Open coded, so it's in the same user access region */ - if (rseq_slice_extension_enabled()) { - /* Unconditionally clear it, no point in conditionals */ - unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); - } + /* RSEQ ABI V2 only operations */ + if (rseq_v2(t)) + rseq_slice_clear_user(rseq, efault); } rseq_slice_clear_grant(t); @@ -640,12 +636,12 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t } struct rseq_ids ids = { - .cpu_id = task_cpu(t), - .mm_cid = task_mm_cid(t), + .cpu_id = task_cpu(t), + .mm_cid = task_mm_cid(t), + .node_id = cpu_to_node(ids.cpu_id), }; - u32 node_id = cpu_to_node(ids.cpu_id); - return rseq_update_usr(t, regs, &ids, node_id); + return rseq_update_usr(t, regs, &ids); efault: return false; } @@ -753,24 +749,6 @@ static __always_inline void rseq_irqentry_exit_to_user_mode(void) ev->events = 0; } -/* Required to keep ARM64 working */ -static __always_inline void rseq_exit_to_user_mode_legacy(void) -{ - struct rseq_event *ev = ¤t->rseq.event; - - rseq_stat_inc(rseq_stats.exit); - - if (static_branch_unlikely(&rseq_debug_enabled)) - WARN_ON_ONCE(ev->sched_switch); - - /* - * Ensure that event (especially user_irq) is cleared when the - * interrupt did not result in a schedule and therefore the - * rseq processing did not clear it. - */ - ev->events = 0; -} - void __rseq_debug_syscall_return(struct pt_regs *regs); static __always_inline void rseq_debug_syscall_return(struct pt_regs *regs) @@ -786,7 +764,6 @@ static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned } static inline void rseq_syscall_exit_to_user_mode(void) { } static inline void rseq_irqentry_exit_to_user_mode(void) { } -static inline void rseq_exit_to_user_mode_legacy(void) { } static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } #endif /* !CONFIG_RSEQ */ diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 0b42045988db..85739a63e85e 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -9,6 +9,12 @@ #ifdef CONFIG_RSEQ struct rseq; +/* + * rseq_event::has_rseq contains the ABI version number so preserving it + * in AND operations requires a mask. + */ +#define RSEQ_HAS_RSEQ_VERSION_MASK 0xff + /** * struct rseq_event - Storage for rseq related event management * @all: Compound to initialize and clear the data efficiently @@ -17,7 +23,8 @@ struct rseq; * exit to user * @ids_changed: Indicator that IDs need to be updated * @user_irq: True on interrupt entry from user mode - * @has_rseq: True if the task has a rseq pointer installed + * @has_rseq: Greater than 0 if the task has a rseq pointer installed. + * Contains the RSEQ version number * @error: Compound error code for the slow path to analyze * @fatal: User space data corrupted or invalid * @slowpath: Indicator that slow path processing via TIF_NOTIFY_RESUME @@ -59,8 +66,9 @@ struct rseq_event { * compiler emit a single compare on 64-bit * @cpu_id: The CPU ID which was written last to user space * @mm_cid: The MM CID which was written last to user space + * @node_id: The node ID which was written last to user space * - * @cpu_id and @mm_cid are updated when the data is written to user space. + * @cpu_id, @mm_cid and @node_id are updated when the data is written to user space. */ struct rseq_ids { union { @@ -70,6 +78,7 @@ struct rseq_ids { u32 mm_cid; }; }; + u32 node_id; }; /** diff --git a/include/linux/sched.h b/include/linux/sched.h index 368c7b4d7cb5..ee06cba5c6f5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1002,6 +1002,9 @@ struct task_struct { unsigned sched_rt_mutex:1; #endif + /* Save user-dumpable when mm goes away */ + unsigned user_dumpable:1; + /* Bit to tell TOMOYO we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 1198138cb839..273538200a44 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -33,6 +33,15 @@ struct root_domain; extern void dl_add_task_root_domain(struct task_struct *p); extern void dl_clear_root_domain(struct root_domain *rd); extern void dl_clear_root_domain_cpu(int cpu); +/* + * Return whether moving DL task @p to @new_mask requires moving DL + * bandwidth accounting between root domains. This helper is specific to + * DL bandwidth move accounting semantics and is shared by + * cpuset_can_attach() and set_cpus_allowed_dl() so both paths use the + * same source root-domain test. + */ +extern bool dl_task_needs_bw_move(struct task_struct *p, + const struct cpumask *new_mask); extern u64 dl_cookie; extern bool dl_bw_visited(int cpu, u64 cookie); diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index adb9a4de068a..2129e18ada58 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -101,24 +101,27 @@ enum scx_ent_flags { SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ SCX_TASK_IMMED = 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */ - SCX_TASK_OFF_TASKS = 1 << 6, /* removed from scx_tasks by sched_ext_dead() */ /* - * Bits 8 and 9 are used to carry task state: + * Bits 8 to 10 are used to carry task state: * * NONE ops.init_task() not called yet + * INIT_BEGIN ops.init_task() in flight; see sched_ext_dead() * INIT ops.init_task() succeeded, but task can be cancelled * READY fully initialized, but not in sched_ext * ENABLED fully initialized and in sched_ext + * DEAD terminal state set by sched_ext_dead() */ - SCX_TASK_STATE_SHIFT = 8, /* bits 8 and 9 are used to carry task state */ - SCX_TASK_STATE_BITS = 2, + SCX_TASK_STATE_SHIFT = 8, + SCX_TASK_STATE_BITS = 3, SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, SCX_TASK_NONE = 0 << SCX_TASK_STATE_SHIFT, - SCX_TASK_INIT = 1 << SCX_TASK_STATE_SHIFT, - SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, - SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, + SCX_TASK_INIT_BEGIN = 1 << SCX_TASK_STATE_SHIFT, + SCX_TASK_INIT = 2 << SCX_TASK_STATE_SHIFT, + SCX_TASK_READY = 3 << SCX_TASK_STATE_SHIFT, + SCX_TASK_ENABLED = 4 << SCX_TASK_STATE_SHIFT, + SCX_TASK_DEAD = 5 << SCX_TASK_STATE_SHIFT, /* * Bits 12 and 13 are used to carry reenqueue reason. In addition to diff --git a/include/linux/slab.h b/include/linux/slab.h index 15a60b501b95..2b5ab488e96b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -1234,6 +1234,9 @@ void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long alig extern void kvfree(const void *addr); DEFINE_FREE(kvfree, void *, if (!IS_ERR_OR_NULL(_T)) kvfree(_T)) +extern void kvfree_atomic(const void *addr); +DEFINE_FREE(kvfree_atomic, void *, if (!IS_ERR_OR_NULL(_T)) kvfree_atomic(_T)) + extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); diff --git a/include/linux/soc/airoha/airoha_offload.h b/include/linux/soc/airoha/airoha_offload.h index d01ef4a6b3d7..7589fccfeef6 100644 --- a/include/linux/soc/airoha/airoha_offload.h +++ b/include/linux/soc/airoha/airoha_offload.h @@ -71,9 +71,9 @@ static inline void airoha_ppe_dev_check_skb(struct airoha_ppe_dev *dev, #define NPU_RX1_DESC_NUM 512 /* CTRL */ -#define NPU_RX_DMA_DESC_LAST_MASK BIT(27) -#define NPU_RX_DMA_DESC_LEN_MASK GENMASK(26, 14) -#define NPU_RX_DMA_DESC_CUR_LEN_MASK GENMASK(13, 1) +#define NPU_RX_DMA_DESC_LAST_MASK BIT(29) +#define NPU_RX_DMA_DESC_LEN_MASK GENMASK(28, 15) +#define NPU_RX_DMA_DESC_CUR_LEN_MASK GENMASK(14, 1) #define NPU_RX_DMA_DESC_DONE_MASK BIT(0) /* INFO */ #define NPU_RX_DMA_PKT_COUNT_MASK GENMASK(31, 29) diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 2ebba746c18f..89165b769e5c 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -21,7 +21,7 @@ #define VFIO_PCI_CORE_H #define VFIO_PCI_OFFSET_SHIFT 40 -#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_OFFSET_TO_INDEX(off) ((u64)(off) >> VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) |
