From c1a371cf80fbc06280cc0064ca99a39d0428ded3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 13 Apr 2020 09:14:35 -0700 Subject: printk: fix global comment Fix typo/spello. Signed-off-by: Randy Dunlap Reviewed-by: Sergey Senozhatsky Signed-off-by: Jiri Kosina --- kernel/printk/printk_safe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index b4045e782743..08a8bf778990 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -20,7 +20,7 @@ * is later flushed into the main ring buffer via IRQ work. * * The alternative implementation is chosen transparently - * by examinig current printk() context mask stored in @printk_context + * by examining current printk() context mask stored in @printk_context * per-CPU variable. * * The implementation allows to flush the strings also from another CPU. -- cgit v1.2.3 From b6cf8b3f3312e2ffe38f23c7c692eb9389277a9a Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 9 Jul 2020 15:29:42 +0206 Subject: printk: add lockless ringbuffer Introduce a multi-reader multi-writer lockless ringbuffer for storing the kernel log messages. Readers and writers may use their API from any context (including scheduler and NMI). This ringbuffer will make it possible to decouple printk() callers from any context, locking, or console constraints. It also makes it possible for readers to have full access to the ringbuffer contents at any time and context (for example from any panic situation). The printk_ringbuffer is made up of 3 internal ringbuffers: desc_ring: A ring of descriptors. A descriptor contains all record meta data (sequence number, timestamp, loglevel, etc.) as well as internal state information about the record and logical positions specifying where in the other ringbuffers the text and dictionary strings are located. text_data_ring: A ring of data blocks. A data block consists of an unsigned long integer (ID) that maps to a desc_ring index followed by the text string of the record. dict_data_ring: A ring of data blocks. A data block consists of an unsigned long integer (ID) that maps to a desc_ring index followed by the dictionary string of the record. The internal state information of a descriptor is the key element to allow readers and writers to locklessly synchronize access to the data. Co-developed-by: Petr Mladek Signed-off-by: John Ogness Reviewed-by: Petr Mladek Reviewed-by: Paul E. McKenney Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200709132344.760-3-john.ogness@linutronix.de --- kernel/printk/Makefile | 1 + kernel/printk/printk_ringbuffer.c | 1687 +++++++++++++++++++++++++++++++++++++ kernel/printk/printk_ringbuffer.h | 399 +++++++++ 3 files changed, 2087 insertions(+) create mode 100644 kernel/printk/printk_ringbuffer.c create mode 100644 kernel/printk/printk_ringbuffer.h (limited to 'kernel') diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 4d052fc6bcde..eee3dc9b60a9 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -2,3 +2,4 @@ obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o +obj-$(CONFIG_PRINTK) += printk_ringbuffer.o diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c new file mode 100644 index 000000000000..7355ca99e852 --- /dev/null +++ b/kernel/printk/printk_ringbuffer.c @@ -0,0 +1,1687 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include "printk_ringbuffer.h" + +/** + * DOC: printk_ringbuffer overview + * + * Data Structure + * -------------- + * The printk_ringbuffer is made up of 3 internal ringbuffers: + * + * desc_ring + * A ring of descriptors. A descriptor contains all record meta data + * (sequence number, timestamp, loglevel, etc.) as well as internal state + * information about the record and logical positions specifying where in + * the other ringbuffers the text and dictionary strings are located. + * + * text_data_ring + * A ring of data blocks. A data block consists of an unsigned long + * integer (ID) that maps to a desc_ring index followed by the text + * string of the record. + * + * dict_data_ring + * A ring of data blocks. A data block consists of an unsigned long + * integer (ID) that maps to a desc_ring index followed by the dictionary + * string of the record. + * + * The internal state information of a descriptor is the key element to allow + * readers and writers to locklessly synchronize access to the data. + * + * Implementation + * -------------- + * + * Descriptor Ring + * ~~~~~~~~~~~~~~~ + * The descriptor ring is an array of descriptors. A descriptor contains all + * the meta data of a printk record as well as blk_lpos structs pointing to + * associated text and dictionary data blocks (see "Data Rings" below). Each + * descriptor is assigned an ID that maps directly to index values of the + * descriptor array and has a state. The ID and the state are bitwise combined + * into a single descriptor field named @state_var, allowing ID and state to + * be synchronously and atomically updated. + * + * Descriptors have three states: + * + * reserved + * A writer is modifying the record. + * + * committed + * The record and all its data are complete and available for reading. + * + * reusable + * The record exists, but its text and/or dictionary data may no longer + * be available. + * + * Querying the @state_var of a record requires providing the ID of the + * descriptor to query. This can yield a possible fourth (pseudo) state: + * + * miss + * The descriptor being queried has an unexpected ID. + * + * The descriptor ring has a @tail_id that contains the ID of the oldest + * descriptor and @head_id that contains the ID of the newest descriptor. + * + * When a new descriptor should be created (and the ring is full), the tail + * descriptor is invalidated by first transitioning to the reusable state and + * then invalidating all tail data blocks up to and including the data blocks + * associated with the tail descriptor (for text and dictionary rings). Then + * @tail_id is advanced, followed by advancing @head_id. And finally the + * @state_var of the new descriptor is initialized to the new ID and reserved + * state. + * + * The @tail_id can only be advanced if the new @tail_id would be in the + * committed or reusable queried state. This makes it possible that a valid + * sequence number of the tail is always available. + * + * Data Rings + * ~~~~~~~~~~ + * The two data rings (text and dictionary) function identically. They exist + * separately so that their buffer sizes can be individually set and they do + * not affect one another. + * + * Data rings are byte arrays composed of data blocks. Data blocks are + * referenced by blk_lpos structs that point to the logical position of the + * beginning of a data block and the beginning of the next adjacent data + * block. Logical positions are mapped directly to index values of the byte + * array ringbuffer. + * + * Each data block consists of an ID followed by the writer data. The ID is + * the identifier of a descriptor that is associated with the data block. A + * given data block is considered valid if all of the following conditions + * are met: + * + * 1) The descriptor associated with the data block is in the committed + * queried state. + * + * 2) The blk_lpos struct within the descriptor associated with the data + * block references back to the same data block. + * + * 3) The data block is within the head/tail logical position range. + * + * If the writer data of a data block would extend beyond the end of the + * byte array, only the ID of the data block is stored at the logical + * position and the full data block (ID and writer data) is stored at the + * beginning of the byte array. The referencing blk_lpos will point to the + * ID before the wrap and the next data block will be at the logical + * position adjacent the full data block after the wrap. + * + * Data rings have a @tail_lpos that points to the beginning of the oldest + * data block and a @head_lpos that points to the logical position of the + * next (not yet existing) data block. + * + * When a new data block should be created (and the ring is full), tail data + * blocks will first be invalidated by putting their associated descriptors + * into the reusable state and then pushing the @tail_lpos forward beyond + * them. Then the @head_lpos is pushed forward and is associated with a new + * descriptor. If a data block is not valid, the @tail_lpos cannot be + * advanced beyond it. + * + * Usage + * ----- + * Here are some simple examples demonstrating writers and readers. For the + * examples a global ringbuffer (test_rb) is available (which is not the + * actual ringbuffer used by printk):: + * + * DEFINE_PRINTKRB(test_rb, 15, 5, 3); + * + * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of + * 1 MiB (2 ^ (15 + 5)) for text data and 256 KiB (2 ^ (15 + 3)) for + * dictionary data. + * + * Sample writer code:: + * + * const char *dictstr = "dictionary text"; + * const char *textstr = "message text"; + * struct prb_reserved_entry e; + * struct printk_record r; + * + * // specify how much to allocate + * prb_rec_init_wr(&r, strlen(textstr) + 1, strlen(dictstr) + 1); + * + * if (prb_reserve(&e, &test_rb, &r)) { + * snprintf(r.text_buf, r.text_buf_size, "%s", textstr); + * + * // dictionary allocation may have failed + * if (r.dict_buf) + * snprintf(r.dict_buf, r.dict_buf_size, "%s", dictstr); + * + * r.info->ts_nsec = local_clock(); + * + * prb_commit(&e); + * } + * + * Sample reader code:: + * + * struct printk_info info; + * struct printk_record r; + * char text_buf[32]; + * char dict_buf[32]; + * u64 seq; + * + * prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf), + * &dict_buf[0], sizeof(dict_buf)); + * + * prb_for_each_record(0, &test_rb, &seq, &r) { + * if (info.seq != seq) + * pr_warn("lost %llu records\n", info.seq - seq); + * + * if (info.text_len > r.text_buf_size) { + * pr_warn("record %llu text truncated\n", info.seq); + * text_buf[r.text_buf_size - 1] = 0; + * } + * + * if (info.dict_len > r.dict_buf_size) { + * pr_warn("record %llu dict truncated\n", info.seq); + * dict_buf[r.dict_buf_size - 1] = 0; + * } + * + * pr_info("%llu: %llu: %s;%s\n", info.seq, info.ts_nsec, + * &text_buf[0], info.dict_len ? &dict_buf[0] : ""); + * } + * + * Note that additional less convenient reader functions are available to + * allow complex record access. + * + * ABA Issues + * ~~~~~~~~~~ + * To help avoid ABA issues, descriptors are referenced by IDs (array index + * values combined with tagged bits counting array wraps) and data blocks are + * referenced by logical positions (array index values combined with tagged + * bits counting array wraps). However, on 32-bit systems the number of + * tagged bits is relatively small such that an ABA incident is (at least + * theoretically) possible. For example, if 4 million maximally sized (1KiB) + * printk messages were to occur in NMI context on a 32-bit system, the + * interrupted context would not be able to recognize that the 32-bit integer + * completely wrapped and thus represents a different data block than the one + * the interrupted context expects. + * + * To help combat this possibility, additional state checking is performed + * (such as using cmpxchg() even though set() would suffice). These extra + * checks are commented as such and will hopefully catch any ABA issue that + * a 32-bit system might experience. + * + * Memory Barriers + * ~~~~~~~~~~~~~~~ + * Multiple memory barriers are used. To simplify proving correctness and + * generating litmus tests, lines of code related to memory barriers + * (loads, stores, and the associated memory barriers) are labeled:: + * + * LMM(function:letter) + * + * Comments reference the labels using only the "function:letter" part. + * + * The memory barrier pairs and their ordering are: + * + * desc_reserve:D / desc_reserve:B + * push descriptor tail (id), then push descriptor head (id) + * + * desc_reserve:D / data_push_tail:B + * push data tail (lpos), then set new descriptor reserved (state) + * + * desc_reserve:D / desc_push_tail:C + * push descriptor tail (id), then set new descriptor reserved (state) + * + * desc_reserve:D / prb_first_seq:C + * push descriptor tail (id), then set new descriptor reserved (state) + * + * desc_reserve:F / desc_read:D + * set new descriptor id and reserved (state), then allow writer changes + * + * data_alloc:A / desc_read:D + * set old descriptor reusable (state), then modify new data block area + * + * data_alloc:A / data_push_tail:B + * push data tail (lpos), then modify new data block area + * + * prb_commit:B / desc_read:B + * store writer changes, then set new descriptor committed (state) + * + * data_push_tail:D / data_push_tail:A + * set descriptor reusable (state), then push data tail (lpos) + * + * desc_push_tail:B / desc_reserve:D + * set descriptor reusable (state), then push descriptor tail (id) + */ + +#define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits) +#define DATA_SIZE_MASK(data_ring) (DATA_SIZE(data_ring) - 1) + +#define DESCS_COUNT(desc_ring) _DESCS_COUNT((desc_ring)->count_bits) +#define DESCS_COUNT_MASK(desc_ring) (DESCS_COUNT(desc_ring) - 1) + +/* Determine the data array index from a logical position. */ +#define DATA_INDEX(data_ring, lpos) ((lpos) & DATA_SIZE_MASK(data_ring)) + +/* Determine the desc array index from an ID or sequence number. */ +#define DESC_INDEX(desc_ring, n) ((n) & DESCS_COUNT_MASK(desc_ring)) + +/* Determine how many times the data array has wrapped. */ +#define DATA_WRAPS(data_ring, lpos) ((lpos) >> (data_ring)->size_bits) + +/* Get the logical position at index 0 of the current wrap. */ +#define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \ +((lpos) & ~DATA_SIZE_MASK(data_ring)) + +/* Get the ID for the same index of the previous wrap as the given ID. */ +#define DESC_ID_PREV_WRAP(desc_ring, id) \ +DESC_ID((id) - DESCS_COUNT(desc_ring)) + +/* + * A data block: mapped directly to the beginning of the data block area + * specified as a logical position within the data ring. + * + * @id: the ID of the associated descriptor + * @data: the writer data + * + * Note that the size of a data block is only known by its associated + * descriptor. + */ +struct prb_data_block { + unsigned long id; + char data[0]; +}; + +/* + * Return the descriptor associated with @n. @n can be either a + * descriptor ID or a sequence number. + */ +static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n) +{ + return &desc_ring->descs[DESC_INDEX(desc_ring, n)]; +} + +static struct prb_data_block *to_block(struct prb_data_ring *data_ring, + unsigned long begin_lpos) +{ + return (void *)&data_ring->data[DATA_INDEX(data_ring, begin_lpos)]; +} + +/* + * Increase the data size to account for data block meta data plus any + * padding so that the adjacent data block is aligned on the ID size. + */ +static unsigned int to_blk_size(unsigned int size) +{ + struct prb_data_block *db = NULL; + + size += sizeof(*db); + size = ALIGN(size, sizeof(db->id)); + return size; +} + +/* + * Sanity checker for reserve size. The ringbuffer code assumes that a data + * block does not exceed the maximum possible size that could fit within the + * ringbuffer. This function provides that basic size check so that the + * assumption is safe. + * + * Writers are also not allowed to write 0-sized (data-less) records. Such + * records are used only internally by the ringbuffer. + */ +static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size) +{ + struct prb_data_block *db = NULL; + + /* + * Writers are not allowed to write data-less records. Such records + * are used only internally by the ringbuffer to denote records where + * their data failed to allocate or have been lost. + */ + if (size == 0) + return false; + + /* + * Ensure the alignment padded size could possibly fit in the data + * array. The largest possible data block must still leave room for + * at least the ID of the next block. + */ + size = to_blk_size(size); + if (size > DATA_SIZE(data_ring) - sizeof(db->id)) + return false; + + return true; +} + +/* The possible responses of a descriptor state-query. */ +enum desc_state { + desc_miss, /* ID mismatch */ + desc_reserved, /* reserved, in use by writer */ + desc_committed, /* committed, writer is done */ + desc_reusable, /* free, not yet used by any writer */ +}; + +/* Query the state of a descriptor. */ +static enum desc_state get_desc_state(unsigned long id, + unsigned long state_val) +{ + if (id != DESC_ID(state_val)) + return desc_miss; + + if (state_val & DESC_REUSE_MASK) + return desc_reusable; + + if (state_val & DESC_COMMITTED_MASK) + return desc_committed; + + return desc_reserved; +} + +/* + * Get a copy of a specified descriptor and its queried state. A descriptor + * that is not in the committed or reusable state must be considered garbage + * by the reader. + */ +static enum desc_state desc_read(struct prb_desc_ring *desc_ring, + unsigned long id, struct prb_desc *desc_out) +{ + struct prb_desc *desc = to_desc(desc_ring, id); + atomic_long_t *state_var = &desc->state_var; + enum desc_state d_state; + unsigned long state_val; + + /* Check the descriptor state. */ + state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */ + d_state = get_desc_state(id, state_val); + if (d_state != desc_committed && d_state != desc_reusable) + return d_state; + + /* + * Guarantee the state is loaded before copying the descriptor + * content. This avoids copying obsolete descriptor content that might + * not apply to the descriptor state. This pairs with prb_commit:B. + * + * Memory barrier involvement: + * + * If desc_read:A reads from prb_commit:B, then desc_read:C reads + * from prb_commit:A. + * + * Relies on: + * + * WMB from prb_commit:A to prb_commit:B + * matching + * RMB from desc_read:A to desc_read:C + */ + smp_rmb(); /* LMM(desc_read:B) */ + + /* + * Copy the descriptor data. The data is not valid until the + * state has been re-checked. + */ + memcpy(desc_out, desc, sizeof(*desc_out)); /* LMM(desc_read:C) */ + + /* + * 1. Guarantee the descriptor content is loaded before re-checking + * the state. This avoids reading an obsolete descriptor state + * that may not apply to the copied content. This pairs with + * desc_reserve:F. + * + * Memory barrier involvement: + * + * If desc_read:C reads from desc_reserve:G, then desc_read:E + * reads from desc_reserve:F. + * + * Relies on: + * + * WMB from desc_reserve:F to desc_reserve:G + * matching + * RMB from desc_read:C to desc_read:E + * + * 2. Guarantee the record data is loaded before re-checking the + * state. This avoids reading an obsolete descriptor state that may + * not apply to the copied data. This pairs with data_alloc:A. + * + * Memory barrier involvement: + * + * If copy_data:A reads from data_alloc:B, then desc_read:E + * reads from desc_make_reusable:A. + * + * Relies on: + * + * MB from desc_make_reusable:A to data_alloc:B + * matching + * RMB from desc_read:C to desc_read:E + * + * Note: desc_make_reusable:A and data_alloc:B can be different + * CPUs. However, the data_alloc:B CPU (which performs the + * full memory barrier) must have previously seen + * desc_make_reusable:A. + */ + smp_rmb(); /* LMM(desc_read:D) */ + + /* Re-check the descriptor state. */ + state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */ + return get_desc_state(id, state_val); +} + +/* + * Take a specified descriptor out of the committed state by attempting + * the transition from committed to reusable. Either this context or some + * other context will have been successful. + */ +static void desc_make_reusable(struct prb_desc_ring *desc_ring, + unsigned long id) +{ + unsigned long val_committed = id | DESC_COMMITTED_MASK; + unsigned long val_reusable = val_committed | DESC_REUSE_MASK; + struct prb_desc *desc = to_desc(desc_ring, id); + atomic_long_t *state_var = &desc->state_var; + + atomic_long_cmpxchg_relaxed(state_var, val_committed, + val_reusable); /* LMM(desc_make_reusable:A) */ +} + +/* + * Given a data ring (text or dict), put the associated descriptor of each + * data block from @lpos_begin until @lpos_end into the reusable state. + * + * If there is any problem making the associated descriptor reusable, either + * the descriptor has not yet been committed or another writer context has + * already pushed the tail lpos past the problematic data block. Regardless, + * on error the caller can re-load the tail lpos to determine the situation. + */ +static bool data_make_reusable(struct printk_ringbuffer *rb, + struct prb_data_ring *data_ring, + unsigned long lpos_begin, + unsigned long lpos_end, + unsigned long *lpos_out) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct prb_data_blk_lpos *blk_lpos; + struct prb_data_block *blk; + enum desc_state d_state; + struct prb_desc desc; + unsigned long id; + + /* + * Using the provided @data_ring, point @blk_lpos to the correct + * blk_lpos within the local copy of the descriptor. + */ + if (data_ring == &rb->text_data_ring) + blk_lpos = &desc.text_blk_lpos; + else + blk_lpos = &desc.dict_blk_lpos; + + /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */ + while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) { + blk = to_block(data_ring, lpos_begin); + + /* + * Load the block ID from the data block. This is a data race + * against a writer that may have newly reserved this data + * area. If the loaded value matches a valid descriptor ID, + * the blk_lpos of that descriptor will be checked to make + * sure it points back to this data block. If the check fails, + * the data area has been recycled by another writer. + */ + id = blk->id; /* LMM(data_make_reusable:A) */ + + d_state = desc_read(desc_ring, id, &desc); /* LMM(data_make_reusable:B) */ + + switch (d_state) { + case desc_miss: + return false; + case desc_reserved: + return false; + case desc_committed: + /* + * This data block is invalid if the descriptor + * does not point back to it. + */ + if (blk_lpos->begin != lpos_begin) + return false; + desc_make_reusable(desc_ring, id); + break; + case desc_reusable: + /* + * This data block is invalid if the descriptor + * does not point back to it. + */ + if (blk_lpos->begin != lpos_begin) + return false; + break; + } + + /* Advance @lpos_begin to the next data block. */ + lpos_begin = blk_lpos->next; + } + + *lpos_out = lpos_begin; + return true; +} + +/* + * Advance the data ring tail to at least @lpos. This function puts + * descriptors into the reusable state if the tail is pushed beyond + * their associated data block. + */ +static bool data_push_tail(struct printk_ringbuffer *rb, + struct prb_data_ring *data_ring, + unsigned long lpos) +{ + unsigned long tail_lpos_new; + unsigned long tail_lpos; + unsigned long next_lpos; + + /* If @lpos is not valid, there is nothing to do. */ + if (lpos == INVALID_LPOS) + return true; + + /* + * Any descriptor states that have transitioned to reusable due to the + * data tail being pushed to this loaded value will be visible to this + * CPU. This pairs with data_push_tail:D. + * + * Memory barrier involvement: + * + * If data_push_tail:A reads from data_push_tail:D, then this CPU can + * see desc_make_reusable:A. + * + * Relies on: + * + * MB from desc_make_reusable:A to data_push_tail:D + * matches + * READFROM from data_push_tail:D to data_push_tail:A + * thus + * READFROM from desc_make_reusable:A to this CPU + */ + tail_lpos = atomic_long_read(&data_ring->tail_lpos); /* LMM(data_push_tail:A) */ + + /* + * Loop until the tail lpos is at or beyond @lpos. This condition + * may already be satisfied, resulting in no full memory barrier + * from data_push_tail:D being performed. However, since this CPU + * sees the new tail lpos, any descriptor states that transitioned to + * the reusable state must already be visible. + */ + while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) { + /* + * Make all descriptors reusable that are associated with + * data blocks before @lpos. + */ + if (!data_make_reusable(rb, data_ring, tail_lpos, lpos, + &next_lpos)) { + /* + * 1. Guarantee the block ID loaded in + * data_make_reusable() is performed before + * reloading the tail lpos. The failed + * data_make_reusable() may be due to a newly + * recycled data area causing the tail lpos to + * have been previously pushed. This pairs with + * data_alloc:A. + * + * Memory barrier involvement: + * + * If data_make_reusable:A reads from data_alloc:B, + * then data_push_tail:C reads from + * data_push_tail:D. + * + * Relies on: + * + * MB from data_push_tail:D to data_alloc:B + * matching + * RMB from data_make_reusable:A to + * data_push_tail:C + * + * Note: data_push_tail:D and data_alloc:B can be + * different CPUs. However, the data_alloc:B + * CPU (which performs the full memory + * barrier) must have previously seen + * data_push_tail:D. + * + * 2. Guarantee the descriptor state loaded in + * data_make_reusable() is performed before + * reloading the tail lpos. The failed + * data_make_reusable() may be due to a newly + * recycled descriptor causing the tail lpos to + * have been previously pushed. This pairs with + * desc_reserve:D. + * + * Memory barrier involvement: + * + * If data_make_reusable:B reads from + * desc_reserve:F, then data_push_tail:C reads + * from data_push_tail:D. + * + * Relies on: + * + * MB from data_push_tail:D to desc_reserve:F + * matching + * RMB from data_make_reusable:B to + * data_push_tail:C + * + * Note: data_push_tail:D and desc_reserve:F can + * be different CPUs. However, the + * desc_reserve:F CPU (which performs the + * full memory barrier) must have previously + * seen data_push_tail:D. + */ + smp_rmb(); /* LMM(data_push_tail:B) */ + + tail_lpos_new = atomic_long_read(&data_ring->tail_lpos + ); /* LMM(data_push_tail:C) */ + if (tail_lpos_new == tail_lpos) + return false; + + /* Another CPU pushed the tail. Try again. */ + tail_lpos = tail_lpos_new; + continue; + } + + /* + * Guarantee any descriptor states that have transitioned to + * reusable are stored before pushing the tail lpos. A full + * memory barrier is needed since other CPUs may have made + * the descriptor states reusable. This pairs with + * data_push_tail:A. + */ + if (atomic_long_try_cmpxchg(&data_ring->tail_lpos, &tail_lpos, + next_lpos)) { /* LMM(data_push_tail:D) */ + break; + } + } + + return true; +} + +/* + * Advance the desc ring tail. This function advances the tail by one + * descriptor, thus invalidating the oldest descriptor. Before advancing + * the tail, the tail descriptor is made reusable and all data blocks up to + * and including the descriptor's data block are invalidated (i.e. the data + * ring tail is pushed past the data block of the descriptor being made + * reusable). + */ +static bool desc_push_tail(struct printk_ringbuffer *rb, + unsigned long tail_id) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + enum desc_state d_state; + struct prb_desc desc; + + d_state = desc_read(desc_ring, tail_id, &desc); + + switch (d_state) { + case desc_miss: + /* + * If the ID is exactly 1 wrap behind the expected, it is + * in the process of being reserved by another writer and + * must be considered reserved. + */ + if (DESC_ID(atomic_long_read(&desc.state_var)) == + DESC_ID_PREV_WRAP(desc_ring, tail_id)) { + return false; + } + + /* + * The ID has changed. Another writer must have pushed the + * tail and recycled the descriptor already. Success is + * returned because the caller is only interested in the + * specified tail being pushed, which it was. + */ + return true; + case desc_reserved: + return false; + case desc_committed: + desc_make_reusable(desc_ring, tail_id); + break; + case desc_reusable: + break; + } + + /* + * Data blocks must be invalidated before their associated + * descriptor can be made available for recycling. Invalidating + * them later is not possible because there is no way to trust + * data blocks once their associated descriptor is gone. + */ + + if (!data_push_tail(rb, &rb->text_data_ring, desc.text_blk_lpos.next)) + return false; + if (!data_push_tail(rb, &rb->dict_data_ring, desc.dict_blk_lpos.next)) + return false; + + /* + * Check the next descriptor after @tail_id before pushing the tail + * to it because the tail must always be in a committed or reusable + * state. The implementation of prb_first_seq() relies on this. + * + * A successful read implies that the next descriptor is less than or + * equal to @head_id so there is no risk of pushing the tail past the + * head. + */ + d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc); /* LMM(desc_push_tail:A) */ + + if (d_state == desc_committed || d_state == desc_reusable) { + /* + * Guarantee any descriptor states that have transitioned to + * reusable are stored before pushing the tail ID. This allows + * verifying the recycled descriptor state. A full memory + * barrier is needed since other CPUs may have made the + * descriptor states reusable. This pairs with desc_reserve:D. + */ + atomic_long_cmpxchg(&desc_ring->tail_id, tail_id, + DESC_ID(tail_id + 1)); /* LMM(desc_push_tail:B) */ + } else { + /* + * Guarantee the last state load from desc_read() is before + * reloading @tail_id in order to see a new tail ID in the + * case that the descriptor has been recycled. This pairs + * with desc_reserve:D. + * + * Memory barrier involvement: + * + * If desc_push_tail:A reads from desc_reserve:F, then + * desc_push_tail:D reads from desc_push_tail:B. + * + * Relies on: + * + * MB from desc_push_tail:B to desc_reserve:F + * matching + * RMB from desc_push_tail:A to desc_push_tail:D + * + * Note: desc_push_tail:B and desc_reserve:F can be different + * CPUs. However, the desc_reserve:F CPU (which performs + * the full memory barrier) must have previously seen + * desc_push_tail:B. + */ + smp_rmb(); /* LMM(desc_push_tail:C) */ + + /* + * Re-check the tail ID. The descriptor following @tail_id is + * not in an allowed tail state. But if the tail has since + * been moved by another CPU, then it does not matter. + */ + if (atomic_long_read(&desc_ring->tail_id) == tail_id) /* LMM(desc_push_tail:D) */ + return false; + } + + return true; +} + +/* Reserve a new descriptor, invalidating the oldest if necessary. */ +static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + unsigned long prev_state_val; + unsigned long id_prev_wrap; + struct prb_desc *desc; + unsigned long head_id; + unsigned long id; + + head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */ + + do { + desc = to_desc(desc_ring, head_id); + + id = DESC_ID(head_id + 1); + id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id); + + /* + * Guarantee the head ID is read before reading the tail ID. + * Since the tail ID is updated before the head ID, this + * guarantees that @id_prev_wrap is never ahead of the tail + * ID. This pairs with desc_reserve:D. + * + * Memory barrier involvement: + * + * If desc_reserve:A reads from desc_reserve:D, then + * desc_reserve:C reads from desc_push_tail:B. + * + * Relies on: + * + * MB from desc_push_tail:B to desc_reserve:D + * matching + * RMB from desc_reserve:A to desc_reserve:C + * + * Note: desc_push_tail:B and desc_reserve:D can be different + * CPUs. However, the desc_reserve:D CPU (which performs + * the full memory barrier) must have previously seen + * desc_push_tail:B. + */ + smp_rmb(); /* LMM(desc_reserve:B) */ + + if (id_prev_wrap == atomic_long_read(&desc_ring->tail_id + )) { /* LMM(desc_reserve:C) */ + /* + * Make space for the new descriptor by + * advancing the tail. + */ + if (!desc_push_tail(rb, id_prev_wrap)) + return false; + } + + /* + * 1. Guarantee the tail ID is read before validating the + * recycled descriptor state. A read memory barrier is + * sufficient for this. This pairs with desc_push_tail:B. + * + * Memory barrier involvement: + * + * If desc_reserve:C reads from desc_push_tail:B, then + * desc_reserve:E reads from desc_make_reusable:A. + * + * Relies on: + * + * MB from desc_make_reusable:A to desc_push_tail:B + * matching + * RMB from desc_reserve:C to desc_reserve:E + * + * Note: desc_make_reusable:A and desc_push_tail:B can be + * different CPUs. However, the desc_push_tail:B CPU + * (which performs the full memory barrier) must have + * previously seen desc_make_reusable:A. + * + * 2. Guarantee the tail ID is stored before storing the head + * ID. This pairs with desc_reserve:B. + * + * 3. Guarantee any data ring tail changes are stored before + * recycling the descriptor. Data ring tail changes can + * happen via desc_push_tail()->data_push_tail(). A full + * memory barrier is needed since another CPU may have + * pushed the data ring tails. This pairs with + * data_push_tail:B. + * + * 4. Guarantee a new tail ID is stored before recycling the + * descriptor. A full memory barrier is needed since + * another CPU may have pushed the tail ID. This pairs + * with desc_push_tail:C and this also pairs with + * prb_first_seq:C. + */ + } while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id, + id)); /* LMM(desc_reserve:D) */ + + desc = to_desc(desc_ring, id); + + /* + * If the descriptor has been recycled, verify the old state val. + * See "ABA Issues" about why this verification is performed. + */ + prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */ + if (prev_state_val && + prev_state_val != (id_prev_wrap | DESC_COMMITTED_MASK | DESC_REUSE_MASK)) { + WARN_ON_ONCE(1); + return false; + } + + /* + * Assign the descriptor a new ID and set its state to reserved. + * See "ABA Issues" about why cmpxchg() instead of set() is used. + * + * Guarantee the new descriptor ID and state is stored before making + * any other changes. A write memory barrier is sufficient for this. + * This pairs with desc_read:D. + */ + if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val, + id | 0)) { /* LMM(desc_reserve:F) */ + WARN_ON_ONCE(1); + return false; + } + + /* Now data in @desc can be modified: LMM(desc_reserve:G) */ + + *id_out = id; + return true; +} + +/* Determine the end of a data block. */ +static unsigned long get_next_lpos(struct prb_data_ring *data_ring, + unsigned long lpos, unsigned int size) +{ + unsigned long begin_lpos; + unsigned long next_lpos; + + begin_lpos = lpos; + next_lpos = lpos + size; + + /* First check if the data block does not wrap. */ + if (DATA_WRAPS(data_ring, begin_lpos) == DATA_WRAPS(data_ring, next_lpos)) + return next_lpos; + + /* Wrapping data blocks store their data at the beginning. */ + return (DATA_THIS_WRAP_START_LPOS(data_ring, next_lpos) + size); +} + +/* + * Allocate a new data block, invalidating the oldest data block(s) + * if necessary. This function also associates the data block with + * a specified descriptor. + */ +static char *data_alloc(struct printk_ringbuffer *rb, + struct prb_data_ring *data_ring, unsigned int size, + struct prb_data_blk_lpos *blk_lpos, unsigned long id) +{ + struct prb_data_block *blk; + unsigned long begin_lpos; + unsigned long next_lpos; + + if (size == 0) { + /* Specify a data-less block. */ + blk_lpos->begin = INVALID_LPOS; + blk_lpos->next = INVALID_LPOS; + return NULL; + } + + size = to_blk_size(size); + + begin_lpos = atomic_long_read(&data_ring->head_lpos); + + do { + next_lpos = get_next_lpos(data_ring, begin_lpos, size); + + if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring))) { + /* Failed to allocate, specify a data-less block. */ + blk_lpos->begin = INVALID_LPOS; + blk_lpos->next = INVALID_LPOS; + return NULL; + } + + /* + * 1. Guarantee any descriptor states that have transitioned + * to reusable are stored before modifying the newly + * allocated data area. A full memory barrier is needed + * since other CPUs may have made the descriptor states + * reusable. See data_push_tail:A about why the reusable + * states are visible. This pairs with desc_read:D. + * + * 2. Guarantee any updated tail lpos is stored before + * modifying the newly allocated data area. Another CPU may + * be in data_make_reusable() and is reading a block ID + * from this area. data_make_reusable() can handle reading + * a garbage block ID value, but then it must be able to + * load a new tail lpos. A full memory barrier is needed + * since other CPUs may have updated the tail lpos. This + * pairs with data_push_tail:B. + */ + } while (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &begin_lpos, + next_lpos)); /* LMM(data_alloc:A) */ + + blk = to_block(data_ring, begin_lpos); + blk->id = id; /* LMM(data_alloc:B) */ + + if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) { + /* Wrapping data blocks store their data at the beginning. */ + blk = to_block(data_ring, 0); + + /* + * Store the ID on the wrapped block for consistency. + * The printk_ringbuffer does not actually use it. + */ + blk->id = id; + } + + blk_lpos->begin = begin_lpos; + blk_lpos->next = next_lpos; + + return &blk->data[0]; +} + +/* Return the number of bytes used by a data block. */ +static unsigned int space_used(struct prb_data_ring *data_ring, + struct prb_data_blk_lpos *blk_lpos) +{ + if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) { + /* Data block does not wrap. */ + return (DATA_INDEX(data_ring, blk_lpos->next) - + DATA_INDEX(data_ring, blk_lpos->begin)); + } + + /* + * For wrapping data blocks, the trailing (wasted) space is + * also counted. + */ + return (DATA_INDEX(data_ring, blk_lpos->next) + + DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin)); +} + +/** + * prb_reserve() - Reserve space in the ringbuffer. + * + * @e: The entry structure to setup. + * @rb: The ringbuffer to reserve data in. + * @r: The record structure to allocate buffers for. + * + * This is the public function available to writers to reserve data. + * + * The writer specifies the text and dict sizes to reserve by setting the + * @text_buf_size and @dict_buf_size fields of @r, respectively. Dictionaries + * are optional, so @dict_buf_size is allowed to be 0. To ensure proper + * initialization of @r, prb_rec_init_wr() should be used. + * + * Context: Any context. Disables local interrupts on success. + * Return: true if at least text data could be allocated, otherwise false. + * + * On success, the fields @info, @text_buf, @dict_buf of @r will be set by + * this function and should be filled in by the writer before committing. Also + * on success, prb_record_text_space() can be used on @e to query the actual + * space used for the text data block. + * + * If the function fails to reserve dictionary space (but all else succeeded), + * it will still report success. In that case @dict_buf is set to NULL and + * @dict_buf_size is set to 0. Writers must check this before writing to + * dictionary space. + * + * @info->text_len and @info->dict_len will already be set to @text_buf_size + * and @dict_buf_size, respectively. If dictionary space reservation fails, + * @info->dict_len is set to 0. + */ +bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, + struct printk_record *r) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct prb_desc *d; + unsigned long id; + + if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) + goto fail; + + /* Records are allowed to not have dictionaries. */ + if (r->dict_buf_size) { + if (!data_check_size(&rb->dict_data_ring, r->dict_buf_size)) + goto fail; + } + + /* + * Descriptors in the reserved state act as blockers to all further + * reservations once the desc_ring has fully wrapped. Disable + * interrupts during the reserve/commit window in order to minimize + * the likelihood of this happening. + */ + local_irq_save(e->irqflags); + + if (!desc_reserve(rb, &id)) { + /* Descriptor reservation failures are tracked. */ + atomic_long_inc(&rb->fail); + local_irq_restore(e->irqflags); + goto fail; + } + + d = to_desc(desc_ring, id); + + /* + * Set the @e fields here so that prb_commit() can be used if + * text data allocation fails. + */ + e->rb = rb; + e->id = id; + + /* + * Initialize the sequence number if it has "never been set". + * Otherwise just increment it by a full wrap. + * + * @seq is considered "never been set" if it has a value of 0, + * _except_ for @descs[0], which was specially setup by the ringbuffer + * initializer and therefore is always considered as set. + * + * See the "Bootstrap" comment block in printk_ringbuffer.h for + * details about how the initializer bootstraps the descriptors. + */ + if (d->info.seq == 0 && DESC_INDEX(desc_ring, id) != 0) + d->info.seq = DESC_INDEX(desc_ring, id); + else + d->info.seq += DESCS_COUNT(desc_ring); + + r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, + &d->text_blk_lpos, id); + /* If text data allocation fails, a data-less record is committed. */ + if (r->text_buf_size && !r->text_buf) { + d->info.text_len = 0; + d->info.dict_len = 0; + prb_commit(e); + /* prb_commit() re-enabled interrupts. */ + goto fail; + } + + r->dict_buf = data_alloc(rb, &rb->dict_data_ring, r->dict_buf_size, + &d->dict_blk_lpos, id); + /* + * If dict data allocation fails, the caller can still commit + * text. But dictionary information will not be available. + */ + if (r->dict_buf_size && !r->dict_buf) + r->dict_buf_size = 0; + + r->info = &d->info; + + /* Set default values for the sizes. */ + d->info.text_len = r->text_buf_size; + d->info.dict_len = r->dict_buf_size; + + /* Record full text space used by record. */ + e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); + + return true; +fail: + /* Make it clear to the caller that the reserve failed. */ + memset(r, 0, sizeof(*r)); + return false; +} + +/** + * prb_commit() - Commit (previously reserved) data to the ringbuffer. + * + * @e: The entry containing the reserved data information. + * + * This is the public function available to writers to commit data. + * + * Context: Any context. Enables local interrupts. + */ +void prb_commit(struct prb_reserved_entry *e) +{ + struct prb_desc_ring *desc_ring = &e->rb->desc_ring; + struct prb_desc *d = to_desc(desc_ring, e->id); + unsigned long prev_state_val = e->id | 0; + + /* Now the writer has finished all writing: LMM(prb_commit:A) */ + + /* + * Set the descriptor as committed. See "ABA Issues" about why + * cmpxchg() instead of set() is used. + * + * Guarantee all record data is stored before the descriptor state + * is stored as committed. A write memory barrier is sufficient for + * this. This pairs with desc_read:B. + */ + if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, + e->id | DESC_COMMITTED_MASK)) { /* LMM(prb_commit:B) */ + WARN_ON_ONCE(1); + } + + /* Restore interrupts, the reserve/commit window is finished. */ + local_irq_restore(e->irqflags); +} + +/* + * Given @blk_lpos, return a pointer to the writer data from the data block + * and calculate the size of the data part. A NULL pointer is returned if + * @blk_lpos specifies values that could never be legal. + * + * This function (used by readers) performs strict validation on the lpos + * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is + * triggered if an internal error is detected. + */ +static char *get_data(struct prb_data_ring *data_ring, + struct prb_data_blk_lpos *blk_lpos, + unsigned int *data_size) +{ + struct prb_data_block *db; + + /* Data-less data block description. */ + if (blk_lpos->begin == INVALID_LPOS && + blk_lpos->next == INVALID_LPOS) { + return NULL; + } + + /* Regular data block: @begin less than @next and in same wrap. */ + if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) && + blk_lpos->begin < blk_lpos->next) { + db = to_block(data_ring, blk_lpos->begin); + *data_size = blk_lpos->next - blk_lpos->begin; + + /* Wrapping data block: @begin is one wrap behind @next. */ + } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) == + DATA_WRAPS(data_ring, blk_lpos->next)) { + db = to_block(data_ring, 0); + *data_size = DATA_INDEX(data_ring, blk_lpos->next); + + /* Illegal block description. */ + } else { + WARN_ON_ONCE(1); + return NULL; + } + + /* A valid data block will always be aligned to the ID size. */ + if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) || + WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) { + return NULL; + } + + /* A valid data block will always have at least an ID. */ + if (WARN_ON_ONCE(*data_size < sizeof(db->id))) + return NULL; + + /* Subtract block ID space from size to reflect data size. */ + *data_size -= sizeof(db->id); + + return &db->data[0]; +} + +/* + * Count the number of lines in provided text. All text has at least 1 line + * (even if @text_size is 0). Each '\n' processed is counted as an additional + * line. + */ +static unsigned int count_lines(char *text, unsigned int text_size) +{ + unsigned int next_size = text_size; + unsigned int line_count = 1; + char *next = text; + + while (next_size) { + next = memchr(next, '\n', next_size); + if (!next) + break; + line_count++; + next++; + next_size = text_size - (next - text); + } + + return line_count; +} + +/* + * Given @blk_lpos, copy an expected @len of data into the provided buffer. + * If @line_count is provided, count the number of lines in the data. + * + * This function (used by readers) performs strict validation on the data + * size to possibly detect bugs in the writer code. A WARN_ON_ONCE() is + * triggered if an internal error is detected. + */ +static bool copy_data(struct prb_data_ring *data_ring, + struct prb_data_blk_lpos *blk_lpos, u16 len, char *buf, + unsigned int buf_size, unsigned int *line_count) +{ + unsigned int data_size; + char *data; + + /* Caller might not want any data. */ + if ((!buf || !buf_size) && !line_count) + return true; + + data = get_data(data_ring, blk_lpos, &data_size); + if (!data) + return false; + + /* + * Actual cannot be less than expected. It can be more than expected + * because of the trailing alignment padding. + */ + if (WARN_ON_ONCE(data_size < (unsigned int)len)) { + pr_warn_once("wrong data size (%u, expecting %hu) for data: %.*s\n", + data_size, len, data_size, data); + return false; + } + + /* Caller interested in the line count? */ + if (line_count) + *line_count = count_lines(data, data_size); + + /* Caller interested in the data content? */ + if (!buf || !buf_size) + return true; + + data_size = min_t(u16, buf_size, len); + + if (!WARN_ON_ONCE(!data_size)) + memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */ + return true; +} + +/* + * This is an extended version of desc_read(). It gets a copy of a specified + * descriptor. However, it also verifies that the record is committed and has + * the sequence number @seq. On success, 0 is returned. + * + * Error return values: + * -EINVAL: A committed record with sequence number @seq does not exist. + * -ENOENT: A committed record with sequence number @seq exists, but its data + * is not available. This is a valid record, so readers should + * continue with the next record. + */ +static int desc_read_committed_seq(struct prb_desc_ring *desc_ring, + unsigned long id, u64 seq, + struct prb_desc *desc_out) +{ + struct prb_data_blk_lpos *blk_lpos = &desc_out->text_blk_lpos; + enum desc_state d_state; + + d_state = desc_read(desc_ring, id, desc_out); + + /* + * An unexpected @id (desc_miss) or @seq mismatch means the record + * does not exist. A descriptor in the reserved state means the + * record does not yet exist for the reader. + */ + if (d_state == desc_miss || + d_state == desc_reserved || + desc_out->info.seq != seq) { + return -EINVAL; + } + + /* + * A descriptor in the reusable state may no longer have its data + * available; report it as a data-less record. Or the record may + * actually be a data-less record. + */ + if (d_state == desc_reusable || + (blk_lpos->begin == INVALID_LPOS && blk_lpos->next == INVALID_LPOS)) { + return -ENOENT; + } + + return 0; +} + +/* + * Copy the ringbuffer data from the record with @seq to the provided + * @r buffer. On success, 0 is returned. + * + * See desc_read_committed_seq() for error return values. + */ +static int prb_read(struct printk_ringbuffer *rb, u64 seq, + struct printk_record *r, unsigned int *line_count) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct prb_desc *rdesc = to_desc(desc_ring, seq); + atomic_long_t *state_var = &rdesc->state_var; + struct prb_desc desc; + unsigned long id; + int err; + + /* Extract the ID, used to specify the descriptor to read. */ + id = DESC_ID(atomic_long_read(state_var)); + + /* Get a local copy of the correct descriptor (if available). */ + err = desc_read_committed_seq(desc_ring, id, seq, &desc); + + /* + * If @r is NULL, the caller is only interested in the availability + * of the record. + */ + if (err || !r) + return err; + + /* If requested, copy meta data. */ + if (r->info) + memcpy(r->info, &desc.info, sizeof(*(r->info))); + + /* Copy text data. If it fails, this is a data-less record. */ + if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, desc.info.text_len, + r->text_buf, r->text_buf_size, line_count)) { + return -ENOENT; + } + + /* + * Copy dict data. Although this should not fail, dict data is not + * important. So if it fails, modify the copied meta data to report + * that there is no dict data, thus silently dropping the dict data. + */ + if (!copy_data(&rb->dict_data_ring, &desc.dict_blk_lpos, desc.info.dict_len, + r->dict_buf, r->dict_buf_size, NULL)) { + if (r->info) + r->info->dict_len = 0; + } + + /* Ensure the record is still committed and has the same @seq. */ + return desc_read_committed_seq(desc_ring, id, seq, &desc); +} + +/* Get the sequence number of the tail descriptor. */ +static u64 prb_first_seq(struct printk_ringbuffer *rb) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + enum desc_state d_state; + struct prb_desc desc; + unsigned long id; + + for (;;) { + id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */ + + d_state = desc_read(desc_ring, id, &desc); /* LMM(prb_first_seq:B) */ + + /* + * This loop will not be infinite because the tail is + * _always_ in the committed or reusable state. + */ + if (d_state == desc_committed || d_state == desc_reusable) + break; + + /* + * Guarantee the last state load from desc_read() is before + * reloading @tail_id in order to see a new tail in the case + * that the descriptor has been recycled. This pairs with + * desc_reserve:D. + * + * Memory barrier involvement: + * + * If prb_first_seq:B reads from desc_reserve:F, then + * prb_first_seq:A reads from desc_push_tail:B. + * + * Relies on: + * + * MB from desc_push_tail:B to desc_reserve:F + * matching + * RMB prb_first_seq:B to prb_first_seq:A + */ + smp_rmb(); /* LMM(prb_first_seq:C) */ + } + + return desc.info.seq; +} + +/* + * Non-blocking read of a record. Updates @seq to the last committed record + * (which may have no data). + * + * See the description of prb_read_valid() and prb_read_valid_info() + * for details. + */ +static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, + struct printk_record *r, unsigned int *line_count) +{ + u64 tail_seq; + int err; + + while ((err = prb_read(rb, *seq, r, line_count))) { + tail_seq = prb_first_seq(rb); + + if (*seq < tail_seq) { + /* + * Behind the tail. Catch up and try again. This + * can happen for -ENOENT and -EINVAL cases. + */ + *seq = tail_seq; + + } else if (err == -ENOENT) { + /* Record exists, but no data available. Skip. */ + (*seq)++; + + } else { + /* Non-existent/non-committed record. Must stop. */ + return false; + } + } + + return true; +} + +/** + * prb_read_valid() - Non-blocking read of a requested record or (if gone) + * the next available record. + * + * @rb: The ringbuffer to read from. + * @seq: The sequence number of the record to read. + * @r: A record data buffer to store the read record to. + * + * This is the public function available to readers to read a record. + * + * The reader provides the @info, @text_buf, @dict_buf buffers of @r to be + * filled in. Any of the buffer pointers can be set to NULL if the reader + * is not interested in that data. To ensure proper initialization of @r, + * prb_rec_init_rd() should be used. + * + * Context: Any context. + * Return: true if a record was read, otherwise false. + * + * On success, the reader must check r->info.seq to see which record was + * actually read. This allows the reader to detect dropped records. + * + * Failure means @seq refers to a not yet written record. + */ +bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, + struct printk_record *r) +{ + return _prb_read_valid(rb, &seq, r, NULL); +} + +/** + * prb_read_valid_info() - Non-blocking read of meta data for a requested + * record or (if gone) the next available record. + * + * @rb: The ringbuffer to read from. + * @seq: The sequence number of the record to read. + * @info: A buffer to store the read record meta data to. + * @line_count: A buffer to store the number of lines in the record text. + * + * This is the public function available to readers to read only the + * meta data of a record. + * + * The reader provides the @info, @line_count buffers to be filled in. + * Either of the buffer pointers can be set to NULL if the reader is not + * interested in that data. + * + * Context: Any context. + * Return: true if a record's meta data was read, otherwise false. + * + * On success, the reader must check info->seq to see which record meta data + * was actually read. This allows the reader to detect dropped records. + * + * Failure means @seq refers to a not yet written record. + */ +bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, + struct printk_info *info, unsigned int *line_count) +{ + struct printk_record r; + + prb_rec_init_rd(&r, info, NULL, 0, NULL, 0); + + return _prb_read_valid(rb, &seq, &r, line_count); +} + +/** + * prb_first_valid_seq() - Get the sequence number of the oldest available + * record. + * + * @rb: The ringbuffer to get the sequence number from. + * + * This is the public function available to readers to see what the + * first/oldest valid sequence number is. + * + * This provides readers a starting point to begin iterating the ringbuffer. + * + * Context: Any context. + * Return: The sequence number of the first/oldest record or, if the + * ringbuffer is empty, 0 is returned. + */ +u64 prb_first_valid_seq(struct printk_ringbuffer *rb) +{ + u64 seq = 0; + + if (!_prb_read_valid(rb, &seq, NULL, NULL)) + return 0; + + return seq; +} + +/** + * prb_next_seq() - Get the sequence number after the last available record. + * + * @rb: The ringbuffer to get the sequence number from. + * + * This is the public function available to readers to see what the next + * newest sequence number available to readers will be. + * + * This provides readers a sequence number to jump to if all currently + * available records should be skipped. + * + * Context: Any context. + * Return: The sequence number of the next newest (not yet available) record + * for readers. + */ +u64 prb_next_seq(struct printk_ringbuffer *rb) +{ + u64 seq = 0; + + /* Search forward from the oldest descriptor. */ + while (_prb_read_valid(rb, &seq, NULL, NULL)) + seq++; + + return seq; +} + +/** + * prb_init() - Initialize a ringbuffer to use provided external buffers. + * + * @rb: The ringbuffer to initialize. + * @text_buf: The data buffer for text data. + * @textbits: The size of @text_buf as a power-of-2 value. + * @dict_buf: The data buffer for dictionary data. + * @dictbits: The size of @dict_buf as a power-of-2 value. + * @descs: The descriptor buffer for ringbuffer records. + * @descbits: The count of @descs items as a power-of-2 value. + * + * This is the public function available to writers to setup a ringbuffer + * during runtime using provided buffers. + * + * This must match the initialization of DEFINE_PRINTKRB(). + * + * Context: Any context. + */ +void prb_init(struct printk_ringbuffer *rb, + char *text_buf, unsigned int textbits, + char *dict_buf, unsigned int dictbits, + struct prb_desc *descs, unsigned int descbits) +{ + memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(descs[0])); + + rb->desc_ring.count_bits = descbits; + rb->desc_ring.descs = descs; + atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits)); + atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits)); + + rb->text_data_ring.size_bits = textbits; + rb->text_data_ring.data = text_buf; + atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits)); + atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits)); + + rb->dict_data_ring.size_bits = dictbits; + rb->dict_data_ring.data = dict_buf; + atomic_long_set(&rb->dict_data_ring.head_lpos, BLK0_LPOS(dictbits)); + atomic_long_set(&rb->dict_data_ring.tail_lpos, BLK0_LPOS(dictbits)); + + atomic_long_set(&rb->fail, 0); + + descs[0].info.seq = -(u64)_DESCS_COUNT(descbits); + + descs[_DESCS_COUNT(descbits) - 1].info.seq = 0; + atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits)); + descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = INVALID_LPOS; + descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = INVALID_LPOS; + descs[_DESCS_COUNT(descbits) - 1].dict_blk_lpos.begin = INVALID_LPOS; + descs[_DESCS_COUNT(descbits) - 1].dict_blk_lpos.next = INVALID_LPOS; +} + +/** + * prb_record_text_space() - Query the full actual used ringbuffer space for + * the text data of a reserved entry. + * + * @e: The successfully reserved entry to query. + * + * This is the public function available to writers to see how much actual + * space is used in the ringbuffer to store the text data of the specified + * entry. + * + * This function is only valid if @e has been successfully reserved using + * prb_reserve(). + * + * Context: Any context. + * Return: The size in bytes used by the text data of the associated record. + */ +unsigned int prb_record_text_space(struct prb_reserved_entry *e) +{ + return e->text_space; +} diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h new file mode 100644 index 000000000000..3e46a7423c13 --- /dev/null +++ b/kernel/printk/printk_ringbuffer.h @@ -0,0 +1,399 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _KERNEL_PRINTK_RINGBUFFER_H +#define _KERNEL_PRINTK_RINGBUFFER_H + +#include + +/* + * Meta information about each stored message. + * + * All fields are set and used by the printk code except for + * @seq, @text_len, @dict_len, which are set and/or modified + * by the ringbuffer code. + */ +struct printk_info { + u64 seq; /* sequence number */ + u64 ts_nsec; /* timestamp in nanoseconds */ + u16 text_len; /* length of text message */ + u16 dict_len; /* length of dictionary message */ + u8 facility; /* syslog facility */ + u8 flags:5; /* internal record flags */ + u8 level:3; /* syslog level */ + u32 caller_id; /* thread id or processor id */ +}; + +/* + * A structure providing the buffers, used by writers and readers. + * + * Writers: + * Using prb_rec_init_wr(), a writer sets @text_buf_size and @dict_buf_size + * before calling prb_reserve(). On success, prb_reserve() sets @info, + * @text_buf, @dict_buf to buffers reserved for that writer. + * + * Readers: + * Using prb_rec_init_rd(), a reader sets all fields before calling + * prb_read_valid(). Note that the reader provides the @info, @text_buf, + * @dict_buf buffers. On success, the struct pointed to by @info will be + * filled and the char arrays pointed to by @text_buf and @dict_buf will + * be filled with text and dict data. + */ +struct printk_record { + struct printk_info *info; + char *text_buf; + char *dict_buf; + unsigned int text_buf_size; + unsigned int dict_buf_size; +}; + +/* Specifies the logical position and span of a data block. */ +struct prb_data_blk_lpos { + unsigned long begin; + unsigned long next; +}; + +/* + * A descriptor: the complete meta-data for a record. + * + * @state_var: A bitwise combination of descriptor ID and descriptor state. + */ +struct prb_desc { + struct printk_info info; + atomic_long_t state_var; + struct prb_data_blk_lpos text_blk_lpos; + struct prb_data_blk_lpos dict_blk_lpos; +}; + +/* A ringbuffer of "ID + data" elements. */ +struct prb_data_ring { + unsigned int size_bits; + char *data; + atomic_long_t head_lpos; + atomic_long_t tail_lpos; +}; + +/* A ringbuffer of "struct prb_desc" elements. */ +struct prb_desc_ring { + unsigned int count_bits; + struct prb_desc *descs; + atomic_long_t head_id; + atomic_long_t tail_id; +}; + +/* + * The high level structure representing the printk ringbuffer. + * + * @fail: Count of failed prb_reserve() calls where not even a data-less + * record was created. + */ +struct printk_ringbuffer { + struct prb_desc_ring desc_ring; + struct prb_data_ring text_data_ring; + struct prb_data_ring dict_data_ring; + atomic_long_t fail; +}; + +/* + * Used by writers as a reserve/commit handle. + * + * @rb: Ringbuffer where the entry is reserved. + * @irqflags: Saved irq flags to restore on entry commit. + * @id: ID of the reserved descriptor. + * @text_space: Total occupied buffer space in the text data ring, including + * ID, alignment padding, and wrapping data blocks. + * + * This structure is an opaque handle for writers. Its contents are only + * to be used by the ringbuffer implementation. + */ +struct prb_reserved_entry { + struct printk_ringbuffer *rb; + unsigned long irqflags; + unsigned long id; + unsigned int text_space; +}; + +#define _DATA_SIZE(sz_bits) (1UL << (sz_bits)) +#define _DESCS_COUNT(ct_bits) (1U << (ct_bits)) +#define DESC_SV_BITS (sizeof(unsigned long) * 8) +#define DESC_COMMITTED_MASK (1UL << (DESC_SV_BITS - 1)) +#define DESC_REUSE_MASK (1UL << (DESC_SV_BITS - 2)) +#define DESC_FLAGS_MASK (DESC_COMMITTED_MASK | DESC_REUSE_MASK) +#define DESC_ID_MASK (~DESC_FLAGS_MASK) +#define DESC_ID(sv) ((sv) & DESC_ID_MASK) +#define INVALID_LPOS 1 + +#define INVALID_BLK_LPOS \ +{ \ + .begin = INVALID_LPOS, \ + .next = INVALID_LPOS, \ +} + +/* + * Descriptor Bootstrap + * + * The descriptor array is minimally initialized to allow immediate usage + * by readers and writers. The requirements that the descriptor array + * initialization must satisfy: + * + * Req1 + * The tail must point to an existing (committed or reusable) descriptor. + * This is required by the implementation of prb_first_seq(). + * + * Req2 + * Readers must see that the ringbuffer is initially empty. + * + * Req3 + * The first record reserved by a writer is assigned sequence number 0. + * + * To satisfy Req1, the tail initially points to a descriptor that is + * minimally initialized (having no data block, i.e. data-less with the + * data block's lpos @begin and @next values set to INVALID_LPOS). + * + * To satisfy Req2, the initial tail descriptor is initialized to the + * reusable state. Readers recognize reusable descriptors as existing + * records, but skip over them. + * + * To satisfy Req3, the last descriptor in the array is used as the initial + * head (and tail) descriptor. This allows the first record reserved by a + * writer (head + 1) to be the first descriptor in the array. (Only the first + * descriptor in the array could have a valid sequence number of 0.) + * + * The first time a descriptor is reserved, it is assigned a sequence number + * with the value of the array index. A "first time reserved" descriptor can + * be recognized because it has a sequence number of 0 but does not have an + * index of 0. (Only the first descriptor in the array could have a valid + * sequence number of 0.) After the first reservation, all future reservations + * (recycling) simply involve incrementing the sequence number by the array + * count. + * + * Hack #1 + * Only the first descriptor in the array is allowed to have the sequence + * number 0. In this case it is not possible to recognize if it is being + * reserved the first time (set to index value) or has been reserved + * previously (increment by the array count). This is handled by _always_ + * incrementing the sequence number by the array count when reserving the + * first descriptor in the array. In order to satisfy Req3, the sequence + * number of the first descriptor in the array is initialized to minus + * the array count. Then, upon the first reservation, it is incremented + * to 0, thus satisfying Req3. + * + * Hack #2 + * prb_first_seq() can be called at any time by readers to retrieve the + * sequence number of the tail descriptor. However, due to Req2 and Req3, + * initially there are no records to report the sequence number of + * (sequence numbers are u64 and there is nothing less than 0). To handle + * this, the sequence number of the initial tail descriptor is initialized + * to 0. Technically this is incorrect, because there is no record with + * sequence number 0 (yet) and the tail descriptor is not the first + * descriptor in the array. But it allows prb_read_valid() to correctly + * report the existence of a record for _any_ given sequence number at all + * times. Bootstrapping is complete when the tail is pushed the first + * time, thus finally pointing to the first descriptor reserved by a + * writer, which has the assigned sequence number 0. + */ + +/* + * Initiating Logical Value Overflows + * + * Both logical position (lpos) and ID values can be mapped to array indexes + * but may experience overflows during the lifetime of the system. To ensure + * that printk_ringbuffer can handle the overflows for these types, initial + * values are chosen that map to the correct initial array indexes, but will + * result in overflows soon. + * + * BLK0_LPOS + * The initial @head_lpos and @tail_lpos for data rings. It is at index + * 0 and the lpos value is such that it will overflow on the first wrap. + * + * DESC0_ID + * The initial @head_id and @tail_id for the desc ring. It is at the last + * index of the descriptor array (see Req3 above) and the ID value is such + * that it will overflow on the second wrap. + */ +#define BLK0_LPOS(sz_bits) (-(_DATA_SIZE(sz_bits))) +#define DESC0_ID(ct_bits) DESC_ID(-(_DESCS_COUNT(ct_bits) + 1)) +#define DESC0_SV(ct_bits) (DESC_COMMITTED_MASK | DESC_REUSE_MASK | DESC0_ID(ct_bits)) + +/* + * Define a ringbuffer with an external text data buffer. The same as + * DEFINE_PRINTKRB() but requires specifying an external buffer for the + * text data. + * + * Note: The specified external buffer must be of the size: + * 2 ^ (descbits + avgtextbits) + */ +#define _DEFINE_PRINTKRB(name, descbits, avgtextbits, avgdictbits, text_buf) \ +static char _##name##_dict[1U << ((avgdictbits) + (descbits))] \ + __aligned(__alignof__(unsigned long)); \ +static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \ + /* this will be the first record reserved by a writer */ \ + [0] = { \ + .info = { \ + /* will be incremented to 0 on the first reservation */ \ + .seq = -(u64)_DESCS_COUNT(descbits), \ + }, \ + }, \ + /* the initial head and tail */ \ + [_DESCS_COUNT(descbits) - 1] = { \ + .info = { \ + /* reports the first seq value during the bootstrap phase */ \ + .seq = 0, \ + }, \ + /* reusable */ \ + .state_var = ATOMIC_INIT(DESC0_SV(descbits)), \ + /* no associated data block */ \ + .text_blk_lpos = INVALID_BLK_LPOS, \ + .dict_blk_lpos = INVALID_BLK_LPOS, \ + }, \ +}; \ +static struct printk_ringbuffer name = { \ + .desc_ring = { \ + .count_bits = descbits, \ + .descs = &_##name##_descs[0], \ + .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \ + .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \ + }, \ + .text_data_ring = { \ + .size_bits = (avgtextbits) + (descbits), \ + .data = text_buf, \ + .head_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ + .tail_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ + }, \ + .dict_data_ring = { \ + .size_bits = (avgtextbits) + (descbits), \ + .data = &_##name##_dict[0], \ + .head_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ + .tail_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ + }, \ + .fail = ATOMIC_LONG_INIT(0), \ +} + +/** + * DEFINE_PRINTKRB() - Define a ringbuffer. + * + * @name: The name of the ringbuffer variable. + * @descbits: The number of descriptors as a power-of-2 value. + * @avgtextbits: The average text data size per record as a power-of-2 value. + * @avgdictbits: The average dictionary data size per record as a + * power-of-2 value. + * + * This is a macro for defining a ringbuffer and all internal structures + * such that it is ready for immediate use. See _DEFINE_PRINTKRB() for a + * variant where the text data buffer can be specified externally. + */ +#define DEFINE_PRINTKRB(name, descbits, avgtextbits, avgdictbits) \ +static char _##name##_text[1U << ((avgtextbits) + (descbits))] \ + __aligned(__alignof__(unsigned long)); \ +_DEFINE_PRINTKRB(name, descbits, avgtextbits, avgdictbits, &_##name##_text[0]) + +/* Writer Interface */ + +/** + * prb_rec_init_wd() - Initialize a buffer for writing records. + * + * @r: The record to initialize. + * @text_buf_size: The needed text buffer size. + * @dict_buf_size: The needed dictionary buffer size. + * + * Initialize all the fields that a writer is interested in. If + * @dict_buf_size is 0, a dictionary buffer will not be reserved. + * @text_buf_size must be greater than 0. + * + * Note that although @dict_buf_size may be initialized to non-zero, + * its value must be rechecked after a successful call to prb_reserve() + * to verify a dictionary buffer was actually reserved. Dictionary buffer + * reservation is allowed to fail. + */ +static inline void prb_rec_init_wr(struct printk_record *r, + unsigned int text_buf_size, + unsigned int dict_buf_size) +{ + r->info = NULL; + r->text_buf = NULL; + r->dict_buf = NULL; + r->text_buf_size = text_buf_size; + r->dict_buf_size = dict_buf_size; +} + +bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, + struct printk_record *r); +void prb_commit(struct prb_reserved_entry *e); + +void prb_init(struct printk_ringbuffer *rb, + char *text_buf, unsigned int text_buf_size, + char *dict_buf, unsigned int dict_buf_size, + struct prb_desc *descs, unsigned int descs_count_bits); +unsigned int prb_record_text_space(struct prb_reserved_entry *e); + +/* Reader Interface */ + +/** + * prb_rec_init_rd() - Initialize a buffer for reading records. + * + * @r: The record to initialize. + * @info: A buffer to store record meta-data. + * @text_buf: A buffer to store text data. + * @text_buf_size: The size of @text_buf. + * @dict_buf: A buffer to store dictionary data. + * @dict_buf_size: The size of @dict_buf. + * + * Initialize all the fields that a reader is interested in. All arguments + * (except @r) are optional. Only record data for arguments that are + * non-NULL or non-zero will be read. + */ +static inline void prb_rec_init_rd(struct printk_record *r, + struct printk_info *info, + char *text_buf, unsigned int text_buf_size, + char *dict_buf, unsigned int dict_buf_size) +{ + r->info = info; + r->text_buf = text_buf; + r->dict_buf = dict_buf; + r->text_buf_size = text_buf_size; + r->dict_buf_size = dict_buf_size; +} + +/** + * prb_for_each_record() - Iterate over the records of a ringbuffer. + * + * @from: The sequence number to begin with. + * @rb: The ringbuffer to iterate over. + * @s: A u64 to store the sequence number on each iteration. + * @r: A printk_record to store the record on each iteration. + * + * This is a macro for conveniently iterating over a ringbuffer. + * Note that @s may not be the sequence number of the record on each + * iteration. For the sequence number, @r->info->seq should be checked. + * + * Context: Any context. + */ +#define prb_for_each_record(from, rb, s, r) \ +for ((s) = from; prb_read_valid(rb, s, r); (s) = (r)->info->seq + 1) + +/** + * prb_for_each_info() - Iterate over the meta data of a ringbuffer. + * + * @from: The sequence number to begin with. + * @rb: The ringbuffer to iterate over. + * @s: A u64 to store the sequence number on each iteration. + * @i: A printk_info to store the record meta data on each iteration. + * @lc: An unsigned int to store the text line count of each record. + * + * This is a macro for conveniently iterating over a ringbuffer. + * Note that @s may not be the sequence number of the record on each + * iteration. For the sequence number, @r->info->seq should be checked. + * + * Context: Any context. + */ +#define prb_for_each_info(from, rb, s, i, lc) \ +for ((s) = from; prb_read_valid_info(rb, s, i, lc); (s) = (i)->seq + 1) + +bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, + struct printk_record *r); +bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, + struct printk_info *info, unsigned int *line_count); + +u64 prb_first_valid_seq(struct printk_ringbuffer *rb); +u64 prb_next_seq(struct printk_ringbuffer *rb); + +#endif /* _KERNEL_PRINTK_RINGBUFFER_H */ -- cgit v1.2.3 From 8749efc0c0c325bf0c948c0b11d77bd3e497ead5 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 9 Jul 2020 15:29:43 +0206 Subject: Revert "printk: lock/unlock console only for new logbuf entries" This reverts commit 3ac37a93fa9217e576bebfd4ba3e80edaaeb2289. This optimization will not apply once the transition to a lockless printk is complete. Rather than porting this optimization through the transition only to remove it anyway, just revert it now to simplify the transition. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200709132344.760-4-john.ogness@linutronix.de --- kernel/printk/printk.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b71eaf5f5a86..1b41e1b98221 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1981,9 +1981,8 @@ asmlinkage int vprintk_emit(int facility, int level, const char *fmt, va_list args) { int printed_len; - bool in_sched = false, pending_output; + bool in_sched = false; unsigned long flags; - u64 curr_log_seq; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) @@ -1999,13 +1998,11 @@ asmlinkage int vprintk_emit(int facility, int level, /* This stops the holder of console_sem just where we want him */ logbuf_lock_irqsave(flags); - curr_log_seq = log_next_seq; printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args); - pending_output = (curr_log_seq != log_next_seq); logbuf_unlock_irqrestore(flags); /* If called from the scheduler, we can not call up(). */ - if (!in_sched && pending_output) { + if (!in_sched) { /* * Disable preemption to avoid being preempted while holding * console_sem which would prevent anyone from printing to @@ -2022,8 +2019,7 @@ asmlinkage int vprintk_emit(int facility, int level, preempt_enable(); } - if (pending_output) - wake_up_klogd(); + wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); -- cgit v1.2.3 From 896fbe20b4e2333fb55cc9b9b783ebcc49eee7c7 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 9 Jul 2020 15:29:44 +0206 Subject: printk: use the lockless ringbuffer Replace the existing ringbuffer usage and implementation with lockless ringbuffer usage. Even though the new ringbuffer does not require locking, all existing locking is left in place. Therefore, this change is purely replacing the underlining ringbuffer. Changes that exist due to the ringbuffer replacement: - The VMCOREINFO has been updated for the new structures. - Dictionary data is now stored in a separate data buffer from the human-readable messages. The dictionary data buffer is set to the same size as the message buffer. Therefore, the total required memory for both dictionary and message data is 2 * (2 ^ CONFIG_LOG_BUF_SHIFT) for the initial static buffers and 2 * log_buf_len (the kernel parameter) for the dynamic buffers. - Record meta-data is now stored in a separate array of descriptors. This is an additional 72 * (2 ^ (CONFIG_LOG_BUF_SHIFT - 5)) bytes for the static array and 72 * (log_buf_len >> 5) bytes for the dynamic array. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200709132344.760-5-john.ogness@linutronix.de --- kernel/printk/printk.c | 940 ++++++++++++++++++++++++++----------------------- 1 file changed, 493 insertions(+), 447 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1b41e1b98221..fec71229169e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -55,6 +55,7 @@ #define CREATE_TRACE_POINTS #include +#include "printk_ringbuffer.h" #include "console_cmdline.h" #include "braille.h" #include "internal.h" @@ -294,30 +295,24 @@ enum con_msg_format_flags { static int console_msg_format = MSG_FORMAT_DEFAULT; /* - * The printk log buffer consists of a chain of concatenated variable - * length records. Every record starts with a record header, containing - * the overall length of the record. + * The printk log buffer consists of a sequenced collection of records, each + * containing variable length message and dictionary text. Every record + * also contains its own meta-data (@info). * - * The heads to the first and last entry in the buffer, as well as the - * sequence numbers of these entries are maintained when messages are - * stored. + * Every record meta-data carries the timestamp in microseconds, as well as + * the standard userspace syslog level and syslog facility. The usual kernel + * messages use LOG_KERN; userspace-injected messages always carry a matching + * syslog facility, by default LOG_USER. The origin of every message can be + * reliably determined that way. * - * If the heads indicate available messages, the length in the header - * tells the start next message. A length == 0 for the next message - * indicates a wrap-around to the beginning of the buffer. + * The human readable log message of a record is available in @text, the + * length of the message text in @text_len. The stored message is not + * terminated. * - * Every record carries the monotonic timestamp in microseconds, as well as - * the standard userspace syslog level and syslog facility. The usual - * kernel messages use LOG_KERN; userspace-injected messages always carry - * a matching syslog facility, by default LOG_USER. The origin of every - * message can be reliably determined that way. - * - * The human readable log message directly follows the message header. The - * length of the message text is stored in the header, the stored message - * is not terminated. - * - * Optionally, a message can carry a dictionary of properties (key/value pairs), - * to provide userspace with a machine-readable message context. + * Optionally, a record can carry a dictionary of properties (key/value + * pairs), to provide userspace with a machine-readable message context. The + * length of the dictionary is available in @dict_len. The dictionary is not + * terminated. * * Examples for well-defined, commonly used property names are: * DEVICE=b12:8 device identifier @@ -331,21 +326,19 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; * follows directly after a '=' character. Every property is terminated by * a '\0' character. The last property is not terminated. * - * Example of a message structure: - * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec - * 0008 34 00 record is 52 bytes long - * 000a 0b 00 text is 11 bytes long - * 000c 1f 00 dictionary is 23 bytes long - * 000e 03 00 LOG_KERN (facility) LOG_ERR (level) - * 0010 69 74 27 73 20 61 20 6c "it's a l" - * 69 6e 65 "ine" - * 001b 44 45 56 49 43 "DEVIC" - * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D" - * 52 49 56 45 52 3d 62 75 "RIVER=bu" - * 67 "g" - * 0032 00 00 00 padding to next message header - * - * The 'struct printk_log' buffer header must never be directly exported to + * Example of record values: + * record.text_buf = "it's a line" (unterminated) + * record.dict_buf = "DEVICE=b8:2\0DRIVER=bug" (unterminated) + * record.info.seq = 56 + * record.info.ts_nsec = 36863 + * record.info.text_len = 11 + * record.info.dict_len = 22 + * record.info.facility = 0 (LOG_KERN) + * record.info.flags = 0 + * record.info.level = 3 (LOG_ERR) + * record.info.caller_id = 299 (task 299) + * + * The 'struct printk_info' buffer must never be directly exported to * userspace, it is a kernel-private implementation detail that might * need to be changed in the future, when the requirements change. * @@ -365,23 +358,6 @@ enum log_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; -struct printk_log { - u64 ts_nsec; /* timestamp in nanoseconds */ - u16 len; /* length of entire record */ - u16 text_len; /* length of text buffer */ - u16 dict_len; /* length of dictionary buffer */ - u8 facility; /* syslog facility */ - u8 flags:5; /* internal record flags */ - u8 level:3; /* syslog level */ -#ifdef CONFIG_PRINTK_CALLER - u32 caller_id; /* thread id or processor id */ -#endif -} -#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS -__packed __aligned(4) -#endif -; - /* * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken * within the scheduler's rq lock. It must be released before calling @@ -421,26 +397,16 @@ DEFINE_RAW_SPINLOCK(logbuf_lock); DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; -static u32 syslog_idx; static size_t syslog_partial; static bool syslog_time; -/* index and sequence number of the first record stored in the buffer */ -static u64 log_first_seq; -static u32 log_first_idx; - -/* index and sequence number of the next record to store in the buffer */ -static u64 log_next_seq; -static u32 log_next_idx; - /* the next printk record to write to the console */ static u64 console_seq; -static u32 console_idx; static u64 exclusive_console_stop_seq; +static unsigned long console_dropped; /* the next printk record to read after the last 'clear' command */ static u64 clear_seq; -static u32 clear_idx; #ifdef CONFIG_PRINTK_CALLER #define PREFIX_MAX 48 @@ -453,13 +419,31 @@ static u32 clear_idx; #define LOG_FACILITY(v) ((v) >> 3 & 0xff) /* record buffer */ -#define LOG_ALIGN __alignof__(struct printk_log) +#define LOG_ALIGN __alignof__(unsigned long) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) #define LOG_BUF_LEN_MAX (u32)(1 << 31) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; +/* + * Define the average message size. This only affects the number of + * descriptors that will be available. Underestimating is better than + * overestimating (too many available descriptors is better than not enough). + * The dictionary buffer will be the same size as the text buffer. + */ +#define PRB_AVGBITS 5 /* 32 character average length */ + +#if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS +#error CONFIG_LOG_BUF_SHIFT value too small. +#endif +_DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS, + PRB_AVGBITS, PRB_AVGBITS, &__log_buf[0]); + +static struct printk_ringbuffer printk_rb_dynamic; + +static struct printk_ringbuffer *prb = &printk_rb_static; + /* * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before * per_cpu_areas are initialised. This variable is set to true when @@ -484,108 +468,6 @@ u32 log_buf_len_get(void) return log_buf_len; } -/* human readable text of the record */ -static char *log_text(const struct printk_log *msg) -{ - return (char *)msg + sizeof(struct printk_log); -} - -/* optional key/value pair dictionary attached to the record */ -static char *log_dict(const struct printk_log *msg) -{ - return (char *)msg + sizeof(struct printk_log) + msg->text_len; -} - -/* get record by index; idx must point to valid msg */ -static struct printk_log *log_from_idx(u32 idx) -{ - struct printk_log *msg = (struct printk_log *)(log_buf + idx); - - /* - * A length == 0 record is the end of buffer marker. Wrap around and - * read the message at the start of the buffer. - */ - if (!msg->len) - return (struct printk_log *)log_buf; - return msg; -} - -/* get next record; idx must point to valid msg */ -static u32 log_next(u32 idx) -{ - struct printk_log *msg = (struct printk_log *)(log_buf + idx); - - /* length == 0 indicates the end of the buffer; wrap */ - /* - * A length == 0 record is the end of buffer marker. Wrap around and - * read the message at the start of the buffer as *this* one, and - * return the one after that. - */ - if (!msg->len) { - msg = (struct printk_log *)log_buf; - return msg->len; - } - return idx + msg->len; -} - -/* - * Check whether there is enough free space for the given message. - * - * The same values of first_idx and next_idx mean that the buffer - * is either empty or full. - * - * If the buffer is empty, we must respect the position of the indexes. - * They cannot be reset to the beginning of the buffer. - */ -static int logbuf_has_space(u32 msg_size, bool empty) -{ - u32 free; - - if (log_next_idx > log_first_idx || empty) - free = max(log_buf_len - log_next_idx, log_first_idx); - else - free = log_first_idx - log_next_idx; - - /* - * We need space also for an empty header that signalizes wrapping - * of the buffer. - */ - return free >= msg_size + sizeof(struct printk_log); -} - -static int log_make_free_space(u32 msg_size) -{ - while (log_first_seq < log_next_seq && - !logbuf_has_space(msg_size, false)) { - /* drop old messages until we have enough contiguous space */ - log_first_idx = log_next(log_first_idx); - log_first_seq++; - } - - if (clear_seq < log_first_seq) { - clear_seq = log_first_seq; - clear_idx = log_first_idx; - } - - /* sequence numbers are equal, so the log buffer is empty */ - if (logbuf_has_space(msg_size, log_first_seq == log_next_seq)) - return 0; - - return -ENOMEM; -} - -/* compute the message size including the padding bytes */ -static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) -{ - u32 size; - - size = sizeof(struct printk_log) + text_len + dict_len; - *pad_len = (-size) & (LOG_ALIGN - 1); - size += *pad_len; - - return size; -} - /* * Define how much of the log buffer we could take at maximum. The value * must be greater than two. Note that only half of the buffer is available @@ -594,22 +476,23 @@ static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) #define MAX_LOG_TAKE_PART 4 static const char trunc_msg[] = ""; -static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, - u16 *dict_len, u32 *pad_len) +static void truncate_msg(u16 *text_len, u16 *trunc_msg_len) { /* * The message should not take the whole buffer. Otherwise, it might * get removed too soon. */ u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; + if (*text_len > max_text_len) *text_len = max_text_len; - /* enable the warning message */ + + /* enable the warning message (if there is room) */ *trunc_msg_len = strlen(trunc_msg); - /* disable the "dict" completely */ - *dict_len = 0; - /* compute the size again, count also the warning message */ - return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len); + if (*text_len >= *trunc_msg_len) + *text_len -= *trunc_msg_len; + else + *trunc_msg_len = 0; } /* insert record into the buffer, discard old ones, update heads */ @@ -618,60 +501,40 @@ static int log_store(u32 caller_id, int facility, int level, const char *dict, u16 dict_len, const char *text, u16 text_len) { - struct printk_log *msg; - u32 size, pad_len; + struct prb_reserved_entry e; + struct printk_record r; u16 trunc_msg_len = 0; - /* number of '\0' padding bytes to next message */ - size = msg_used_size(text_len, dict_len, &pad_len); + prb_rec_init_wr(&r, text_len, dict_len); - if (log_make_free_space(size)) { + if (!prb_reserve(&e, prb, &r)) { /* truncate the message if it is too long for empty buffer */ - size = truncate_msg(&text_len, &trunc_msg_len, - &dict_len, &pad_len); + truncate_msg(&text_len, &trunc_msg_len); + prb_rec_init_wr(&r, text_len + trunc_msg_len, dict_len); /* survive when the log buffer is too small for trunc_msg */ - if (log_make_free_space(size)) + if (!prb_reserve(&e, prb, &r)) return 0; } - if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { - /* - * This message + an additional empty header does not fit - * at the end of the buffer. Add an empty header with len == 0 - * to signify a wrap around. - */ - memset(log_buf + log_next_idx, 0, sizeof(struct printk_log)); - log_next_idx = 0; - } - /* fill message */ - msg = (struct printk_log *)(log_buf + log_next_idx); - memcpy(log_text(msg), text, text_len); - msg->text_len = text_len; - if (trunc_msg_len) { - memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len); - msg->text_len += trunc_msg_len; - } - memcpy(log_dict(msg), dict, dict_len); - msg->dict_len = dict_len; - msg->facility = facility; - msg->level = level & 7; - msg->flags = flags & 0x1f; + memcpy(&r.text_buf[0], text, text_len); + if (trunc_msg_len) + memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); + if (r.dict_buf) + memcpy(&r.dict_buf[0], dict, dict_len); + r.info->facility = facility; + r.info->level = level & 7; + r.info->flags = flags & 0x1f; if (ts_nsec > 0) - msg->ts_nsec = ts_nsec; + r.info->ts_nsec = ts_nsec; else - msg->ts_nsec = local_clock(); -#ifdef CONFIG_PRINTK_CALLER - msg->caller_id = caller_id; -#endif - memset(log_dict(msg) + dict_len, 0, pad_len); - msg->len = size; + r.info->ts_nsec = local_clock(); + r.info->caller_id = caller_id; /* insert message */ - log_next_idx += msg->len; - log_next_seq++; + prb_commit(&e); - return msg->text_len; + return (text_len + trunc_msg_len); } int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); @@ -723,13 +586,13 @@ static void append_char(char **pp, char *e, char c) *(*pp)++ = c; } -static ssize_t msg_print_ext_header(char *buf, size_t size, - struct printk_log *msg, u64 seq) +static ssize_t info_print_ext_header(char *buf, size_t size, + struct printk_info *info) { - u64 ts_usec = msg->ts_nsec; + u64 ts_usec = info->ts_nsec; char caller[20]; #ifdef CONFIG_PRINTK_CALLER - u32 id = msg->caller_id; + u32 id = info->caller_id; snprintf(caller, sizeof(caller), ",caller=%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); @@ -740,8 +603,8 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, do_div(ts_usec, 1000); return scnprintf(buf, size, "%u,%llu,%llu,%c%s;", - (msg->facility << 3) | msg->level, seq, ts_usec, - msg->flags & LOG_CONT ? 'c' : '-', caller); + (info->facility << 3) | info->level, info->seq, + ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller); } static ssize_t msg_print_ext_body(char *buf, size_t size, @@ -795,10 +658,14 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { u64 seq; - u32 idx; struct ratelimit_state rs; struct mutex lock; char buf[CONSOLE_EXT_LOG_MAX]; + + struct printk_info info; + char text_buf[CONSOLE_EXT_LOG_MAX]; + char dict_buf[CONSOLE_EXT_LOG_MAX]; + struct printk_record record; }; static __printf(3, 4) __cold @@ -881,7 +748,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct devkmsg_user *user = file->private_data; - struct printk_log *msg; + struct printk_record *r = &user->record; size_t len; ssize_t ret; @@ -893,7 +760,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, return ret; logbuf_lock_irq(); - while (user->seq == log_next_seq) { + if (!prb_read_valid(prb, user->seq, r)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; logbuf_unlock_irq(); @@ -902,30 +769,26 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, logbuf_unlock_irq(); ret = wait_event_interruptible(log_wait, - user->seq != log_next_seq); + prb_read_valid(prb, user->seq, r)); if (ret) goto out; logbuf_lock_irq(); } - if (user->seq < log_first_seq) { + if (user->seq < prb_first_valid_seq(prb)) { /* our last seen message is gone, return error and reset */ - user->idx = log_first_idx; - user->seq = log_first_seq; + user->seq = prb_first_valid_seq(prb); ret = -EPIPE; logbuf_unlock_irq(); goto out; } - msg = log_from_idx(user->idx); - len = msg_print_ext_header(user->buf, sizeof(user->buf), - msg, user->seq); + len = info_print_ext_header(user->buf, sizeof(user->buf), r->info); len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len, - log_dict(msg), msg->dict_len, - log_text(msg), msg->text_len); + &r->dict_buf[0], r->info->dict_len, + &r->text_buf[0], r->info->text_len); - user->idx = log_next(user->idx); - user->seq++; + user->seq = r->info->seq + 1; logbuf_unlock_irq(); if (len > count) { @@ -957,8 +820,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) switch (whence) { case SEEK_SET: /* the first record */ - user->idx = log_first_idx; - user->seq = log_first_seq; + user->seq = prb_first_valid_seq(prb); break; case SEEK_DATA: /* @@ -966,13 +828,11 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) * like issued by 'dmesg -c'. Reading /dev/kmsg itself * changes no global state, and does not clear anything. */ - user->idx = clear_idx; user->seq = clear_seq; break; case SEEK_END: /* after the last record */ - user->idx = log_next_idx; - user->seq = log_next_seq; + user->seq = prb_next_seq(prb); break; default: ret = -EINVAL; @@ -992,9 +852,9 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); logbuf_lock_irq(); - if (user->seq < log_next_seq) { + if (prb_read_valid(prb, user->seq, NULL)) { /* return error when data has vanished underneath us */ - if (user->seq < log_first_seq) + if (user->seq < prb_first_valid_seq(prb)) ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; else ret = EPOLLIN|EPOLLRDNORM; @@ -1029,9 +889,12 @@ static int devkmsg_open(struct inode *inode, struct file *file) mutex_init(&user->lock); + prb_rec_init_rd(&user->record, &user->info, + &user->text_buf[0], sizeof(user->text_buf), + &user->dict_buf[0], sizeof(user->dict_buf)); + logbuf_lock_irq(); - user->idx = log_first_idx; - user->seq = log_first_seq; + user->seq = prb_first_valid_seq(prb); logbuf_unlock_irq(); file->private_data = user; @@ -1072,23 +935,52 @@ const struct file_operations kmsg_fops = { */ void log_buf_vmcoreinfo_setup(void) { - VMCOREINFO_SYMBOL(log_buf); - VMCOREINFO_SYMBOL(log_buf_len); - VMCOREINFO_SYMBOL(log_first_idx); - VMCOREINFO_SYMBOL(clear_idx); - VMCOREINFO_SYMBOL(log_next_idx); + VMCOREINFO_SYMBOL(prb); + VMCOREINFO_SYMBOL(printk_rb_static); + VMCOREINFO_SYMBOL(clear_seq); + /* - * Export struct printk_log size and field offsets. User space tools can + * Export struct size and field offsets. User space tools can * parse it and detect any changes to structure down the line. */ - VMCOREINFO_STRUCT_SIZE(printk_log); - VMCOREINFO_OFFSET(printk_log, ts_nsec); - VMCOREINFO_OFFSET(printk_log, len); - VMCOREINFO_OFFSET(printk_log, text_len); - VMCOREINFO_OFFSET(printk_log, dict_len); -#ifdef CONFIG_PRINTK_CALLER - VMCOREINFO_OFFSET(printk_log, caller_id); -#endif + + VMCOREINFO_STRUCT_SIZE(printk_ringbuffer); + VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring); + VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring); + VMCOREINFO_OFFSET(printk_ringbuffer, dict_data_ring); + VMCOREINFO_OFFSET(printk_ringbuffer, fail); + + VMCOREINFO_STRUCT_SIZE(prb_desc_ring); + VMCOREINFO_OFFSET(prb_desc_ring, count_bits); + VMCOREINFO_OFFSET(prb_desc_ring, descs); + VMCOREINFO_OFFSET(prb_desc_ring, head_id); + VMCOREINFO_OFFSET(prb_desc_ring, tail_id); + + VMCOREINFO_STRUCT_SIZE(prb_desc); + VMCOREINFO_OFFSET(prb_desc, info); + VMCOREINFO_OFFSET(prb_desc, state_var); + VMCOREINFO_OFFSET(prb_desc, text_blk_lpos); + VMCOREINFO_OFFSET(prb_desc, dict_blk_lpos); + + VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos); + VMCOREINFO_OFFSET(prb_data_blk_lpos, begin); + VMCOREINFO_OFFSET(prb_data_blk_lpos, next); + + VMCOREINFO_STRUCT_SIZE(printk_info); + VMCOREINFO_OFFSET(printk_info, seq); + VMCOREINFO_OFFSET(printk_info, ts_nsec); + VMCOREINFO_OFFSET(printk_info, text_len); + VMCOREINFO_OFFSET(printk_info, dict_len); + VMCOREINFO_OFFSET(printk_info, caller_id); + + VMCOREINFO_STRUCT_SIZE(prb_data_ring); + VMCOREINFO_OFFSET(prb_data_ring, size_bits); + VMCOREINFO_OFFSET(prb_data_ring, data); + VMCOREINFO_OFFSET(prb_data_ring, head_lpos); + VMCOREINFO_OFFSET(prb_data_ring, tail_lpos); + + VMCOREINFO_SIZE(atomic_long_t); + VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter); } #endif @@ -1166,11 +1058,48 @@ static void __init set_percpu_data_ready(void) __printk_percpu_data_ready = true; } +static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, + struct printk_record *r) +{ + struct prb_reserved_entry e; + struct printk_record dest_r; + + prb_rec_init_wr(&dest_r, r->info->text_len, r->info->dict_len); + + if (!prb_reserve(&e, rb, &dest_r)) + return 0; + + memcpy(&dest_r.text_buf[0], &r->text_buf[0], dest_r.text_buf_size); + if (dest_r.dict_buf) { + memcpy(&dest_r.dict_buf[0], &r->dict_buf[0], + dest_r.dict_buf_size); + } + dest_r.info->facility = r->info->facility; + dest_r.info->level = r->info->level; + dest_r.info->flags = r->info->flags; + dest_r.info->ts_nsec = r->info->ts_nsec; + dest_r.info->caller_id = r->info->caller_id; + + prb_commit(&e); + + return prb_record_text_space(&e); +} + +static char setup_text_buf[CONSOLE_EXT_LOG_MAX] __initdata; +static char setup_dict_buf[CONSOLE_EXT_LOG_MAX] __initdata; + void __init setup_log_buf(int early) { + unsigned int new_descs_count; + struct prb_desc *new_descs; + struct printk_info info; + struct printk_record r; + size_t new_descs_size; unsigned long flags; + char *new_dict_buf; char *new_log_buf; unsigned int free; + u64 seq; /* * Some archs call setup_log_buf() multiple times - first is very @@ -1189,21 +1118,70 @@ void __init setup_log_buf(int early) if (!new_log_buf_len) return; + new_descs_count = new_log_buf_len >> PRB_AVGBITS; + if (new_descs_count == 0) { + pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len); + return; + } + new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); if (unlikely(!new_log_buf)) { - pr_err("log_buf_len: %lu bytes not available\n", - new_log_buf_len); + pr_err("log_buf_len: %lu text bytes not available\n", + new_log_buf_len); return; } + new_dict_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); + if (unlikely(!new_dict_buf)) { + pr_err("log_buf_len: %lu dict bytes not available\n", + new_log_buf_len); + memblock_free(__pa(new_log_buf), new_log_buf_len); + return; + } + + new_descs_size = new_descs_count * sizeof(struct prb_desc); + new_descs = memblock_alloc(new_descs_size, LOG_ALIGN); + if (unlikely(!new_descs)) { + pr_err("log_buf_len: %zu desc bytes not available\n", + new_descs_size); + memblock_free(__pa(new_dict_buf), new_log_buf_len); + memblock_free(__pa(new_log_buf), new_log_buf_len); + return; + } + + prb_rec_init_rd(&r, &info, + &setup_text_buf[0], sizeof(setup_text_buf), + &setup_dict_buf[0], sizeof(setup_dict_buf)); + + prb_init(&printk_rb_dynamic, + new_log_buf, ilog2(new_log_buf_len), + new_dict_buf, ilog2(new_log_buf_len), + new_descs, ilog2(new_descs_count)); + logbuf_lock_irqsave(flags); + log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; - free = __LOG_BUF_LEN - log_next_idx; - memcpy(log_buf, __log_buf, __LOG_BUF_LEN); + + free = __LOG_BUF_LEN; + prb_for_each_record(0, &printk_rb_static, seq, &r) + free -= add_to_rb(&printk_rb_dynamic, &r); + + /* + * This is early enough that everything is still running on the + * boot CPU and interrupts are disabled. So no new messages will + * appear during the transition to the dynamic buffer. + */ + prb = &printk_rb_dynamic; + logbuf_unlock_irqrestore(flags); + if (seq != prb_next_seq(&printk_rb_static)) { + pr_err("dropped %llu messages\n", + prb_next_seq(&printk_rb_static) - seq); + } + pr_info("log_buf_len: %u bytes\n", log_buf_len); pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); @@ -1313,18 +1291,18 @@ static size_t print_caller(u32 id, char *buf) #define print_caller(id, buf) 0 #endif -static size_t print_prefix(const struct printk_log *msg, bool syslog, - bool time, char *buf) +static size_t info_print_prefix(const struct printk_info *info, bool syslog, + bool time, char *buf) { size_t len = 0; if (syslog) - len = print_syslog((msg->facility << 3) | msg->level, buf); + len = print_syslog((info->facility << 3) | info->level, buf); if (time) - len += print_time(msg->ts_nsec, buf + len); + len += print_time(info->ts_nsec, buf + len); - len += print_caller(msg->caller_id, buf + len); + len += print_caller(info->caller_id, buf + len); if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) { buf[len++] = ' '; @@ -1334,72 +1312,143 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog, return len; } -static size_t msg_print_text(const struct printk_log *msg, bool syslog, - bool time, char *buf, size_t size) +/* + * Prepare the record for printing. The text is shifted within the given + * buffer to avoid a need for another one. The following operations are + * done: + * + * - Add prefix for each line. + * - Add the trailing newline that has been removed in vprintk_store(). + * - Drop truncated lines that do not longer fit into the buffer. + * + * Return: The length of the updated/prepared text, including the added + * prefixes and the newline. The dropped line(s) are not counted. + */ +static size_t record_print_text(struct printk_record *r, bool syslog, + bool time) { - const char *text = log_text(msg); - size_t text_size = msg->text_len; - size_t len = 0; + size_t text_len = r->info->text_len; + size_t buf_size = r->text_buf_size; + char *text = r->text_buf; char prefix[PREFIX_MAX]; - const size_t prefix_len = print_prefix(msg, syslog, time, prefix); + bool truncated = false; + size_t prefix_len; + size_t line_len; + size_t len = 0; + char *next; - do { - const char *next = memchr(text, '\n', text_size); - size_t text_len; + prefix_len = info_print_prefix(r->info, syslog, time, prefix); + /* + * @text_len: bytes of unprocessed text + * @line_len: bytes of current line _without_ newline + * @text: pointer to beginning of current line + * @len: number of bytes prepared in r->text_buf + */ + for (;;) { + next = memchr(text, '\n', text_len); if (next) { - text_len = next - text; - next++; - text_size -= next - text; + line_len = next - text; } else { - text_len = text_size; + /* Drop truncated line(s). */ + if (truncated) + break; + line_len = text_len; } - if (buf) { - if (prefix_len + text_len + 1 >= size - len) + /* + * Truncate the text if there is not enough space to add the + * prefix and a trailing newline. + */ + if (len + prefix_len + text_len + 1 > buf_size) { + /* Drop even the current line if no space. */ + if (len + prefix_len + line_len + 1 > buf_size) break; - memcpy(buf + len, prefix, prefix_len); - len += prefix_len; - memcpy(buf + len, text, text_len); - len += text_len; - buf[len++] = '\n'; - } else { - /* SYSLOG_ACTION_* buffer size only calculation */ - len += prefix_len + text_len + 1; + text_len = buf_size - len - prefix_len - 1; + truncated = true; } - text = next; - } while (text); + memmove(text + prefix_len, text, text_len); + memcpy(text, prefix, prefix_len); + + len += prefix_len + line_len + 1; + + if (text_len == line_len) { + /* + * Add the trailing newline removed in + * vprintk_store(). + */ + text[prefix_len + line_len] = '\n'; + break; + } + + /* + * Advance beyond the added prefix and the related line with + * its newline. + */ + text += prefix_len + line_len + 1; + + /* + * The remaining text has only decreased by the line with its + * newline. + * + * Note that @text_len can become zero. It happens when @text + * ended with a newline (either due to truncation or the + * original string ending with "\n\n"). The loop is correctly + * repeated and (if not truncated) an empty line with a prefix + * will be prepared. + */ + text_len -= line_len + 1; + } return len; } +static size_t get_record_print_text_size(struct printk_info *info, + unsigned int line_count, + bool syslog, bool time) +{ + char prefix[PREFIX_MAX]; + size_t prefix_len; + + prefix_len = info_print_prefix(info, syslog, time, prefix); + + /* + * Each line will be preceded with a prefix. The intermediate + * newlines are already within the text, but a final trailing + * newline will be added. + */ + return ((prefix_len * line_count) + info->text_len + 1); +} + static int syslog_print(char __user *buf, int size) { + struct printk_info info; + struct printk_record r; char *text; - struct printk_log *msg; int len = 0; text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; + prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX, NULL, 0); + while (size > 0) { size_t n; size_t skip; logbuf_lock_irq(); - if (syslog_seq < log_first_seq) { - /* messages are gone, move to first one */ - syslog_seq = log_first_seq; - syslog_idx = log_first_idx; - syslog_partial = 0; - } - if (syslog_seq == log_next_seq) { + if (!prb_read_valid(prb, syslog_seq, &r)) { logbuf_unlock_irq(); break; } + if (r.info->seq != syslog_seq) { + /* message is gone, move to next valid one */ + syslog_seq = r.info->seq; + syslog_partial = 0; + } /* * To keep reading/counting partial line consistent, @@ -1409,13 +1458,10 @@ static int syslog_print(char __user *buf, int size) syslog_time = printk_time; skip = syslog_partial; - msg = log_from_idx(syslog_idx); - n = msg_print_text(msg, true, syslog_time, text, - LOG_LINE_MAX + PREFIX_MAX); + n = record_print_text(&r, true, syslog_time); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ - syslog_idx = log_next(syslog_idx); - syslog_seq++; + syslog_seq = r.info->seq + 1; n -= syslog_partial; syslog_partial = 0; } else if (!len){ @@ -1446,11 +1492,12 @@ static int syslog_print(char __user *buf, int size) static int syslog_print_all(char __user *buf, int size, bool clear) { + struct printk_info info; + unsigned int line_count; + struct printk_record r; char *text; int len = 0; - u64 next_seq; u64 seq; - u32 idx; bool time; text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); @@ -1463,38 +1510,28 @@ static int syslog_print_all(char __user *buf, int size, bool clear) * Find first record that fits, including all following records, * into the user-provided buffer for this dump. */ - seq = clear_seq; - idx = clear_idx; - while (seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); - - len += msg_print_text(msg, true, time, NULL, 0); - idx = log_next(idx); - seq++; - } + prb_for_each_info(clear_seq, prb, seq, &info, &line_count) + len += get_record_print_text_size(&info, line_count, true, time); /* move first record forward until length fits into the buffer */ - seq = clear_seq; - idx = clear_idx; - while (len > size && seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); - - len -= msg_print_text(msg, true, time, NULL, 0); - idx = log_next(idx); - seq++; + prb_for_each_info(clear_seq, prb, seq, &info, &line_count) { + if (len <= size) + break; + len -= get_record_print_text_size(&info, line_count, true, time); } - /* last message fitting into this dump */ - next_seq = log_next_seq; + prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX, NULL, 0); len = 0; - while (len >= 0 && seq < next_seq) { - struct printk_log *msg = log_from_idx(idx); - int textlen = msg_print_text(msg, true, time, text, - LOG_LINE_MAX + PREFIX_MAX); + prb_for_each_record(seq, prb, seq, &r) { + int textlen; - idx = log_next(idx); - seq++; + textlen = record_print_text(&r, true, time); + + if (len + textlen > size) { + seq--; + break; + } logbuf_unlock_irq(); if (copy_to_user(buf + len, text, textlen)) @@ -1503,17 +1540,12 @@ static int syslog_print_all(char __user *buf, int size, bool clear) len += textlen; logbuf_lock_irq(); - if (seq < log_first_seq) { - /* messages are gone, move to next one */ - seq = log_first_seq; - idx = log_first_idx; - } + if (len < 0) + break; } - if (clear) { - clear_seq = log_next_seq; - clear_idx = log_next_idx; - } + if (clear) + clear_seq = seq; logbuf_unlock_irq(); kfree(text); @@ -1523,8 +1555,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) static void syslog_clear(void) { logbuf_lock_irq(); - clear_seq = log_next_seq; - clear_idx = log_next_idx; + clear_seq = prb_next_seq(prb); logbuf_unlock_irq(); } @@ -1551,7 +1582,7 @@ int do_syslog(int type, char __user *buf, int len, int source) if (!access_ok(buf, len)) return -EFAULT; error = wait_event_interruptible(log_wait, - syslog_seq != log_next_seq); + prb_read_valid(prb, syslog_seq, NULL)); if (error) return error; error = syslog_print(buf, len); @@ -1600,10 +1631,9 @@ int do_syslog(int type, char __user *buf, int len, int source) /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: logbuf_lock_irq(); - if (syslog_seq < log_first_seq) { + if (syslog_seq < prb_first_valid_seq(prb)) { /* messages are gone, move to first one */ - syslog_seq = log_first_seq; - syslog_idx = log_first_idx; + syslog_seq = prb_first_valid_seq(prb); syslog_partial = 0; } if (source == SYSLOG_FROM_PROC) { @@ -1612,20 +1642,18 @@ int do_syslog(int type, char __user *buf, int len, int source) * for pending data, not the size; return the count of * records, not the length. */ - error = log_next_seq - syslog_seq; + error = prb_next_seq(prb) - syslog_seq; } else { - u64 seq = syslog_seq; - u32 idx = syslog_idx; bool time = syslog_partial ? syslog_time : printk_time; - - while (seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); - - error += msg_print_text(msg, true, time, NULL, - 0); + struct printk_info info; + unsigned int line_count; + u64 seq; + + prb_for_each_info(syslog_seq, prb, seq, &info, + &line_count) { + error += get_record_print_text_size(&info, line_count, + true, time); time = printk_time; - idx = log_next(idx); - seq++; } error -= syslog_partial; } @@ -1796,10 +1824,22 @@ static int console_trylock_spinning(void) static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) { + static char dropped_text[64]; + size_t dropped_len = 0; struct console *con; trace_console_rcuidle(text, len); + if (!console_drivers) + return; + + if (console_dropped) { + dropped_len = snprintf(dropped_text, sizeof(dropped_text), + "** %lu printk messages dropped **\n", + console_dropped); + console_dropped = 0; + } + for_each_console(con) { if (exclusive_console && con != exclusive_console) continue; @@ -1812,8 +1852,11 @@ static void call_console_drivers(const char *ext_text, size_t ext_len, continue; if (con->flags & CON_EXTENDED) con->write(con, ext_text, ext_len); - else + else { + if (dropped_len) + con->write(con, dropped_text, dropped_len); con->write(con, text, len); + } } } @@ -2076,21 +2119,24 @@ EXPORT_SYMBOL(printk); #define PREFIX_MAX 0 #define printk_time false +#define prb_read_valid(rb, seq, r) false +#define prb_first_valid_seq(rb) 0 + static u64 syslog_seq; -static u32 syslog_idx; static u64 console_seq; -static u32 console_idx; static u64 exclusive_console_stop_seq; -static u64 log_first_seq; -static u32 log_first_idx; -static u64 log_next_seq; -static char *log_text(const struct printk_log *msg) { return NULL; } -static char *log_dict(const struct printk_log *msg) { return NULL; } -static struct printk_log *log_from_idx(u32 idx) { return NULL; } -static u32 log_next(u32 idx) { return 0; } -static ssize_t msg_print_ext_header(char *buf, size_t size, - struct printk_log *msg, - u64 seq) { return 0; } +static unsigned long console_dropped; + +static size_t record_print_text(const struct printk_record *r, + bool syslog, bool time) +{ + return 0; +} +static ssize_t info_print_ext_header(char *buf, size_t size, + struct printk_info *info) +{ + return 0; +} static ssize_t msg_print_ext_body(char *buf, size_t size, char *dict, size_t dict_len, char *text, size_t text_len) { return 0; } @@ -2098,8 +2144,6 @@ static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(void) { return 0; } static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} -static size_t msg_print_text(const struct printk_log *msg, bool syslog, - bool time, char *buf, size_t size) { return 0; } static bool suppress_message_printing(int level) { return false; } #endif /* CONFIG_PRINTK */ @@ -2384,14 +2428,19 @@ void console_unlock(void) { static char ext_text[CONSOLE_EXT_LOG_MAX]; static char text[LOG_LINE_MAX + PREFIX_MAX]; + static char dict[LOG_LINE_MAX]; unsigned long flags; bool do_cond_resched, retry; + struct printk_info info; + struct printk_record r; if (console_suspended) { up_console_sem(); return; } + prb_rec_init_rd(&r, &info, text, sizeof(text), dict, sizeof(dict)); + /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may @@ -2422,35 +2471,26 @@ again: } for (;;) { - struct printk_log *msg; size_t ext_len = 0; size_t len; printk_safe_enter_irqsave(flags); raw_spin_lock(&logbuf_lock); - if (console_seq < log_first_seq) { - len = snprintf(text, sizeof(text), - "** %llu printk messages dropped **\n", - log_first_seq - console_seq); - - /* messages are gone, move to first one */ - console_seq = log_first_seq; - console_idx = log_first_idx; - } else { - len = 0; - } skip: - if (console_seq == log_next_seq) + if (!prb_read_valid(prb, console_seq, &r)) break; - msg = log_from_idx(console_idx); - if (suppress_message_printing(msg->level)) { + if (console_seq != r.info->seq) { + console_dropped += r.info->seq - console_seq; + console_seq = r.info->seq; + } + + if (suppress_message_printing(r.info->level)) { /* * Skip record we have buffered and already printed * directly to the console when we received it, and * record that has level above the console loglevel. */ - console_idx = log_next(console_idx); console_seq++; goto skip; } @@ -2461,19 +2501,24 @@ skip: exclusive_console = NULL; } - len += msg_print_text(msg, - console_msg_format & MSG_FORMAT_SYSLOG, - printk_time, text + len, sizeof(text) - len); + /* + * Handle extended console text first because later + * record_print_text() will modify the record buffer in-place. + */ if (nr_ext_console_drivers) { - ext_len = msg_print_ext_header(ext_text, + ext_len = info_print_ext_header(ext_text, sizeof(ext_text), - msg, console_seq); + r.info); ext_len += msg_print_ext_body(ext_text + ext_len, sizeof(ext_text) - ext_len, - log_dict(msg), msg->dict_len, - log_text(msg), msg->text_len); + &r.dict_buf[0], + r.info->dict_len, + &r.text_buf[0], + r.info->text_len); } - console_idx = log_next(console_idx); + len = record_print_text(&r, + console_msg_format & MSG_FORMAT_SYSLOG, + printk_time); console_seq++; raw_spin_unlock(&logbuf_lock); @@ -2513,7 +2558,7 @@ skip: * flush, no worries. */ raw_spin_lock(&logbuf_lock); - retry = console_seq != log_next_seq; + retry = prb_read_valid(prb, console_seq, NULL); raw_spin_unlock(&logbuf_lock); printk_safe_exit_irqrestore(flags); @@ -2582,8 +2627,7 @@ void console_flush_on_panic(enum con_flush_mode mode) unsigned long flags; logbuf_lock_irqsave(flags); - console_seq = log_first_seq; - console_idx = log_first_idx; + console_seq = prb_first_valid_seq(prb); logbuf_unlock_irqrestore(flags); } console_unlock(); @@ -2826,7 +2870,6 @@ void register_console(struct console *newcon) exclusive_console = newcon; exclusive_console_stop_seq = console_seq; console_seq = syslog_seq; - console_idx = syslog_idx; logbuf_unlock_irqrestore(flags); } console_unlock(); @@ -3215,9 +3258,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) logbuf_lock_irqsave(flags); dumper->cur_seq = clear_seq; - dumper->cur_idx = clear_idx; - dumper->next_seq = log_next_seq; - dumper->next_idx = log_next_idx; + dumper->next_seq = prb_next_seq(prb); logbuf_unlock_irqrestore(flags); /* invoke dumper which will iterate over records */ @@ -3251,28 +3292,33 @@ void kmsg_dump(enum kmsg_dump_reason reason) bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, char *line, size_t size, size_t *len) { - struct printk_log *msg; + struct printk_info info; + unsigned int line_count; + struct printk_record r; size_t l = 0; bool ret = false; + prb_rec_init_rd(&r, &info, line, size, NULL, 0); + if (!dumper->active) goto out; - if (dumper->cur_seq < log_first_seq) { - /* messages are gone, move to first available one */ - dumper->cur_seq = log_first_seq; - dumper->cur_idx = log_first_idx; - } - - /* last entry */ - if (dumper->cur_seq >= log_next_seq) - goto out; + /* Read text or count text lines? */ + if (line) { + if (!prb_read_valid(prb, dumper->cur_seq, &r)) + goto out; + l = record_print_text(&r, syslog, printk_time); + } else { + if (!prb_read_valid_info(prb, dumper->cur_seq, + &info, &line_count)) { + goto out; + } + l = get_record_print_text_size(&info, line_count, syslog, + printk_time); - msg = log_from_idx(dumper->cur_idx); - l = msg_print_text(msg, syslog, printk_time, line, size); + } - dumper->cur_idx = log_next(dumper->cur_idx); - dumper->cur_seq++; + dumper->cur_seq = r.info->seq + 1; ret = true; out: if (len) @@ -3333,23 +3379,25 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, char *buf, size_t size, size_t *len) { + struct printk_info info; + unsigned int line_count; + struct printk_record r; unsigned long flags; u64 seq; - u32 idx; u64 next_seq; - u32 next_idx; size_t l = 0; bool ret = false; bool time = printk_time; - if (!dumper->active) + prb_rec_init_rd(&r, &info, buf, size, NULL, 0); + + if (!dumper->active || !buf || !size) goto out; logbuf_lock_irqsave(flags); - if (dumper->cur_seq < log_first_seq) { + if (dumper->cur_seq < prb_first_valid_seq(prb)) { /* messages are gone, move to first available one */ - dumper->cur_seq = log_first_seq; - dumper->cur_idx = log_first_idx; + dumper->cur_seq = prb_first_valid_seq(prb); } /* last entry */ @@ -3360,41 +3408,41 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, /* calculate length of entire buffer */ seq = dumper->cur_seq; - idx = dumper->cur_idx; - while (seq < dumper->next_seq) { - struct printk_log *msg = log_from_idx(idx); - - l += msg_print_text(msg, true, time, NULL, 0); - idx = log_next(idx); - seq++; + while (prb_read_valid_info(prb, seq, &info, &line_count)) { + if (r.info->seq >= dumper->next_seq) + break; + l += get_record_print_text_size(&info, line_count, true, time); + seq = r.info->seq + 1; } /* move first record forward until length fits into the buffer */ seq = dumper->cur_seq; - idx = dumper->cur_idx; - while (l >= size && seq < dumper->next_seq) { - struct printk_log *msg = log_from_idx(idx); - - l -= msg_print_text(msg, true, time, NULL, 0); - idx = log_next(idx); - seq++; + while (l >= size && prb_read_valid_info(prb, seq, + &info, &line_count)) { + if (r.info->seq >= dumper->next_seq) + break; + l -= get_record_print_text_size(&info, line_count, true, time); + seq = r.info->seq + 1; } /* last message in next interation */ next_seq = seq; - next_idx = idx; + /* actually read text into the buffer now */ l = 0; - while (seq < dumper->next_seq) { - struct printk_log *msg = log_from_idx(idx); + while (prb_read_valid(prb, seq, &r)) { + if (r.info->seq >= dumper->next_seq) + break; + + l += record_print_text(&r, syslog, time); + + /* adjust record to store to remaining buffer space */ + prb_rec_init_rd(&r, &info, buf + l, size - l, NULL, 0); - l += msg_print_text(msg, syslog, time, buf + l, size - l); - idx = log_next(idx); - seq++; + seq = r.info->seq + 1; } dumper->next_seq = next_seq; - dumper->next_idx = next_idx; ret = true; logbuf_unlock_irqrestore(flags); out: @@ -3417,9 +3465,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) { dumper->cur_seq = clear_seq; - dumper->cur_idx = clear_idx; - dumper->next_seq = log_next_seq; - dumper->next_idx = log_next_idx; + dumper->next_seq = prb_next_seq(prb); } /** -- cgit v1.2.3 From 31ed1b5dff46fc5c9347338b7e65973f34e9713f Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Fri, 3 Jul 2020 16:29:37 +0200 Subject: kernel/params.c: Align last argument with a tab MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The second and third arguments are aligned with tabs, so do the same for the fourth. Cc: linux-kernel@vger.kernel.org Signed-off-by: Paul Menzel Reviewed-by: Christian König Signed-off-by: Christian König Link: https://lore.kernel.org/patchwork/patch/1267600/ --- kernel/params.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 8e56f8b12d8f..111eee82b999 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -233,14 +233,14 @@ char *parse_args(const char *doing, EXPORT_SYMBOL(param_ops_##name) -STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); -STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); -STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); -STANDARD_PARAM_DEF(long, long, "%li", kstrtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); -STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); +STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); +STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); +STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); +STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); +STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); +STANDARD_PARAM_DEF(long, long, "%li", kstrtol); +STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); +STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); int param_set_charp(const char *val, const struct kernel_param *kp) { -- cgit v1.2.3 From 7d8365771ffb0edc336f2cd45e96ef8214a83dca Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Fri, 3 Jul 2020 16:29:38 +0200 Subject: moduleparams: Add hexint type parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For bitmasks printing values in hex is more convenient. Prefix with `0x` to make it clear, that it’s a hex value, and pad it out. Using the helper for `amdgpu.ppfeaturemask`, it will look like below. Before: $ more /sys/module/amdgpu/parameters/ppfeaturemask 4294950911 After: $ more /sys/module/amdgpu/parameters/ppfeaturemask 0xffffbfff Cc: linux-kernel@vger.kernel.org Cc: amd-gfx@lists.freedesktop.org Signed-off-by: Paul Menzel Acked-by: Linus Torvalds Reviewed-by: Christian König Signed-off-by: Christian König Link: https://patchwork.freedesktop.org/patch/374726/ --- kernel/params.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 111eee82b999..3835fb82c64b 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -233,14 +233,15 @@ char *parse_args(const char *doing, EXPORT_SYMBOL(param_ops_##name) -STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); -STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); -STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); -STANDARD_PARAM_DEF(long, long, "%li", kstrtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); -STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); +STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); +STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); +STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); +STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); +STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); +STANDARD_PARAM_DEF(long, long, "%li", kstrtol); +STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); +STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); +STANDARD_PARAM_DEF(hexint, unsigned int, "%#08x", kstrtouint); int param_set_charp(const char *val, const struct kernel_param *kp) { -- cgit v1.2.3 From 547bbf7d214fff2d29bc885ea7efea0fbc127791 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 6 Aug 2020 20:32:27 -0700 Subject: kernel: printk: delete repeated words in comments Drop repeated words "the" in kernel/printk/. Signed-off-by: Randy Dunlap Cc: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200807033227.8349-1-rdunlap@infradead.org --- kernel/printk/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9b75f6bfc333..e4987ebe21b4 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2416,7 +2416,7 @@ void console_unlock(void) * * console_trylock() is not able to detect the preemptive * context reliably. Therefore the value must be stored before - * and cleared after the the "again" goto label. + * and cleared after the "again" goto label. */ do_cond_resched = console_may_schedule; again: @@ -3332,7 +3332,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); * @len: length of line placed into buffer * * Start at the end of the kmsg buffer and fill the provided buffer - * with as many of the the *youngest* kmsg records that fit into it. + * with as many of the *youngest* kmsg records that fit into it. * If the buffer is large enough, all available kmsg records will be * copied with a single call. * -- cgit v1.2.3 From 6b87024f76bc755354c18116880dc3b632447cbd Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Mon, 3 Aug 2020 13:34:38 +0100 Subject: audit: change unnecessary globals into statics Variables sig_pid, audit_sig_uid and audit_sig_sid are only used in the audit.c file across the kernel Hence it appears no reason for declaring them as globals This patch removes their global declarations from the .h file and change them into static in the .c file. Signed-off-by: Jules Irenge Signed-off-by: Paul Moore --- kernel/audit.c | 6 +++--- kernel/audit.h | 4 ---- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 7efaece534a9..5cf4781d5546 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -123,9 +123,9 @@ static u32 audit_backlog_limit = 64; static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; /* The identity of the user shutting down the audit system. */ -kuid_t audit_sig_uid = INVALID_UID; -pid_t audit_sig_pid = -1; -u32 audit_sig_sid = 0; +static kuid_t audit_sig_uid = INVALID_UID; +static pid_t audit_sig_pid = -1; +static u32 audit_sig_sid = 0; /* Records can be lost in several ways: 0) [suppressed in audit_alloc] diff --git a/kernel/audit.h b/kernel/audit.h index ddc22878433d..3b9c0945225a 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -327,10 +327,6 @@ static inline int audit_signal_info_syscall(struct task_struct *t) extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); -extern pid_t audit_sig_pid; -extern kuid_t audit_sig_uid; -extern u32 audit_sig_sid; - extern int audit_filter(int msgtype, unsigned int listtype); extern void audit_ctl_lock(void); -- cgit v1.2.3 From 265c32072b0ce4156f40527a969d335d03681b30 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Mon, 3 Aug 2020 13:34:39 +0100 Subject: audit: uninitialize variable audit_sig_sid Checkpatch tool reports "ERROR: do not initialise globals/statics to 0" To fix this, audit_sig_sid is uninitialized As this is stored in the .bss section, the compiler can initialize the variable automatically. Signed-off-by: Jules Irenge Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 5cf4781d5546..86f2b76e3d4e 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -125,7 +125,7 @@ static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; /* The identity of the user shutting down the audit system. */ static kuid_t audit_sig_uid = INVALID_UID; static pid_t audit_sig_pid = -1; -static u32 audit_sig_sid = 0; +static u32 audit_sig_sid; /* Records can be lost in several ways: 0) [suppressed in audit_alloc] -- cgit v1.2.3 From 9f0c4fa111dc909ca545c45ea20ec84da555ce16 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 23 Jul 2020 10:11:10 -0700 Subject: perf/core: Add a new PERF_EV_CAP_SIBLING event capability Current perf assumes that events in a group are independent. Close an event doesn't impact the value of the other events in the same group. If the closed event is a member, after the event closure, other events are still running like a group. If the closed event is a leader, other events are running as singleton events. Add PERF_EV_CAP_SIBLING to allow events to indicate they require being part of a group, and when the leader dies they cannot exist independently. Suggested-by: Peter Zijlstra Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200723171117.9918-8-kan.liang@linux.intel.com --- kernel/events/core.c | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5bfe8e3c6e44..57efe3b21e29 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2133,8 +2133,24 @@ static inline struct list_head *get_event_list(struct perf_event *event) return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active; } +/* + * Events that have PERF_EV_CAP_SIBLING require being part of a group and + * cannot exist on their own, schedule them out and move them into the ERROR + * state. Also see _perf_event_enable(), it will not be able to recover + * this ERROR state. + */ +static inline void perf_remove_sibling_event(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + event_sched_out(event, cpuctx, ctx); + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); +} + static void perf_group_detach(struct perf_event *event) { + struct perf_event *leader = event->group_leader; struct perf_event *sibling, *tmp; struct perf_event_context *ctx = event->ctx; @@ -2153,7 +2169,7 @@ static void perf_group_detach(struct perf_event *event) /* * If this is a sibling, remove it from its group. */ - if (event->group_leader != event) { + if (leader != event) { list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; goto out; @@ -2166,6 +2182,9 @@ static void perf_group_detach(struct perf_event *event) */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { + if (sibling->event_caps & PERF_EV_CAP_SIBLING) + perf_remove_sibling_event(sibling); + sibling->group_leader = sibling; list_del_init(&sibling->sibling_list); @@ -2183,10 +2202,10 @@ static void perf_group_detach(struct perf_event *event) } out: - perf_event__header_size(event->group_leader); - - for_each_sibling_event(tmp, event->group_leader) + for_each_sibling_event(tmp, leader) perf_event__header_size(tmp); + + perf_event__header_size(leader); } static bool is_orphaned_event(struct perf_event *event) @@ -2979,6 +2998,7 @@ static void _perf_event_enable(struct perf_event *event) raw_spin_lock_irq(&ctx->lock); if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state < PERF_EVENT_STATE_ERROR) { +out: raw_spin_unlock_irq(&ctx->lock); return; } @@ -2990,8 +3010,16 @@ static void _perf_event_enable(struct perf_event *event) * has gone back into error state, as distinct from the task having * been scheduled away before the cross-call arrived. */ - if (event->state == PERF_EVENT_STATE_ERROR) + if (event->state == PERF_EVENT_STATE_ERROR) { + /* + * Detached SIBLING events cannot leave ERROR state. + */ + if (event->event_caps & PERF_EV_CAP_SIBLING && + event->group_leader == event) + goto out; + event->state = PERF_EVENT_STATE_OFF; + } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_enable, NULL); -- cgit v1.2.3 From cfe7ddcbd72dc67ce5749cc6f451a2b0c6aec5b5 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 17 Aug 2020 12:29:47 +0100 Subject: ARM, sched/topology: Remove SD_SHARE_POWERDOMAIN This flag was introduced in 2014 by commit: d77b3ed5c9f8 ("sched: Add a new SD_SHARE_POWERDOMAIN for sched_domain") but AFAIA it was never leveraged by the scheduler. The closest thing I can think of is EAS caring about frequency domains, and it does that by leveraging performance domains. Remove the flag. No change in functionality is expected. Suggested-by: Morten Rasmussen Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Reviewed-by: Dietmar Eggemann Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200817113003.20802-2-valentin.schneider@arm.com --- kernel/sched/topology.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 007b0a6b0152..fe0396978fea 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -148,8 +148,7 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_SHARE_POWERDOMAIN)) { + SD_SHARE_PKG_RESOURCES)) { if (sd->groups != sd->groups->next) return 0; } @@ -180,8 +179,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_ASYM_CPUCAPACITY | SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING | - SD_SHARE_POWERDOMAIN); + SD_PREFER_SIBLING); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -1292,7 +1290,6 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; * SD_SHARE_CPUCAPACITY - describes SMT topologies * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain * * Odd one out, which beside describing the topology has a quirk also * prescribes the desired behaviour that goes along with it: @@ -1303,8 +1300,7 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; (SD_SHARE_CPUCAPACITY | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ - SD_ASYM_PACKING | \ - SD_SHARE_POWERDOMAIN) + SD_ASYM_PACKING) static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, -- cgit v1.2.3 From 65c5e253168dbbb52c20026b0c5b7a2f344b9197 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 17 Aug 2020 12:29:51 +0100 Subject: sched/topology: Verify SD_* flags setup when sched_debug is on Now that we have some description of what we expect the flags layout to be, we can use that to assert at runtime that the actual layout is sane. Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Reviewed-by: Dietmar Eggemann Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200817113003.20802-6-valentin.schneider@arm.com --- kernel/sched/topology.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index fe0396978fea..cbdaf084ff32 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -29,6 +29,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { struct sched_group *group = sd->groups; + unsigned long flags = sd->flags; + unsigned int idx; cpumask_clear(groupmask); @@ -43,6 +45,21 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); } + for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { + unsigned int flag = BIT(idx); + unsigned int meta_flags = sd_flag_debug[idx].meta_flags; + + if ((meta_flags & SDF_SHARED_CHILD) && sd->child && + !(sd->child->flags & flag)) + printk(KERN_ERR "ERROR: flag %s set here but not in child\n", + sd_flag_debug[idx].name); + + if ((meta_flags & SDF_SHARED_PARENT) && sd->parent && + !(sd->parent->flags & flag)) + printk(KERN_ERR "ERROR: flag %s set here but not in parent\n", + sd_flag_debug[idx].name); + } + printk(KERN_DEBUG "%*s groups:", level + 1, ""); do { if (!group) { -- cgit v1.2.3 From 5b9f8ff7b320a34af3dbcf04edb40d9b04f22f4a Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 17 Aug 2020 12:29:52 +0100 Subject: sched/debug: Output SD flag names rather than their values Decoding the output of /proc/sys/kernel/sched_domain/cpu*/domain*/flags has always been somewhat annoying, as one needs to go fetch the bit -> name mapping from the source code itself. This encoding can be saved in a script somewhere, but that isn't safe from flags being added, removed or even shuffled around. What matters for debugging purposes is to get *which* flags are set in a given domain, their associated value is pretty much meaningless. Make the sd flags debug file output flag names. Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200817113003.20802-7-valentin.schneider@arm.com --- kernel/sched/debug.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 36c54265bb2b..0655524700d2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -245,6 +245,60 @@ set_table_entry(struct ctl_table *entry, entry->proc_handler = proc_handler; } +static int sd_ctl_doflags(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned long flags = *(unsigned long *)table->data; + size_t data_size = 0; + size_t len = 0; + char *tmp; + int idx; + + if (write) + return 0; + + for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { + char *name = sd_flag_debug[idx].name; + + /* Name plus whitespace */ + data_size += strlen(name) + 1; + } + + if (*ppos > data_size) { + *lenp = 0; + return 0; + } + + tmp = kcalloc(data_size + 1, sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { + char *name = sd_flag_debug[idx].name; + + len += snprintf(tmp + len, strlen(name) + 2, "%s ", name); + } + + tmp += *ppos; + len -= *ppos; + + if (len > *lenp) + len = *lenp; + if (len) + memcpy(buffer, tmp, len); + if (len < *lenp) { + ((char *)buffer)[len] = '\n'; + len++; + } + + *lenp = len; + *ppos += len; + + kfree(tmp); + + return 0; +} + static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { @@ -258,7 +312,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax); + set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, sd_ctl_doflags); set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); /* &table[8] is terminator */ -- cgit v1.2.3 From 6f349818621dff364fc000909135d0065e9756c9 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 17 Aug 2020 12:29:54 +0100 Subject: sched/topology: Use prebuilt SD flag degeneration mask Leverage SD_DEGENERATE_GROUPS_MASK in sd_degenerate() and sd_parent_degenerate(). Suggested-by: Peter Zijlstra Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200817113003.20802-9-valentin.schneider@arm.com --- kernel/sched/topology.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index cbdaf084ff32..38e6671c3bd7 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -160,15 +160,9 @@ static int sd_degenerate(struct sched_domain *sd) return 1; /* Following flags need at least 2 groups */ - if (sd->flags & (SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_SHARE_CPUCAPACITY | - SD_ASYM_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES)) { - if (sd->groups != sd->groups->next) - return 0; - } + if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) && + (sd->groups != sd->groups->next)) + return 0; /* Following flags don't use groups */ if (sd->flags & (SD_WAKE_AFFINE)) @@ -190,13 +184,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) { - pflags &= ~(SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_ASYM_CPUCAPACITY | - SD_SHARE_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING); + pflags &= ~(SD_DEGENERATE_GROUPS_MASK | SD_PREFER_SIBLING); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } -- cgit v1.2.3 From ab65afb094c7b2e954e8d56ffbc7df843211c2c8 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 17 Aug 2020 12:29:55 +0100 Subject: sched/topology: Remove SD_SERIALIZE degeneration special case If there is only a single NUMA node in the system, the only NUMA topology level that will be generated will be NODE (identity distance), which doesn't have SD_SERIALIZE. This means we don't need this special case in sd_parent_degenerate(), as having the NODE level "naturally" covers it. Thus, remove it. Suggested-by: Peter Zijlstra Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Reviewed-by: Dietmar Eggemann Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200817113003.20802-10-valentin.schneider@arm.com --- kernel/sched/topology.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 38e6671c3bd7..c662c53736e2 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -183,11 +183,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 0; /* Flags needing groups don't count if only 1 group in parent */ - if (parent->groups == parent->groups->next) { + if (parent->groups == parent->groups->next) pflags &= ~(SD_DEGENERATE_GROUPS_MASK | SD_PREFER_SIBLING); - if (nr_node_ids == 1) - pflags &= ~SD_SERIALIZE; - } + if (~cflags & pflags) return 0; -- cgit v1.2.3 From c200191d4c2c1aa2ffb62a984b756ac1f02dc55c Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 17 Aug 2020 12:29:56 +0100 Subject: sched/topology: Propagate SD_ASYM_CPUCAPACITY upwards We currently set this flag *only* on domains whose topology level exactly match the level where we detect asymmetry (as returned by asym_cpu_capacity_level()). This is rather problematic. Say there are two clusters in the system, one with a lone big CPU and the other with a mix of big and LITTLE CPUs (as is allowed by DynamIQ): DIE [ ] MC [ ][ ] 0 1 2 3 4 L L B B B asym_cpu_capacity_level() will figure out that the MC level is the one where all CPUs can see a CPU of max capacity, and we will thus set SD_ASYM_CPUCAPACITY at MC level for all CPUs. That lone big CPU will degenerate its MC domain, since it would be alone in there, and will end up with just a DIE domain. Since the flag was only set at MC, this CPU ends up not seeing any SD with the flag set, which is broken. Rather than clearing dflags at every topology level, clear it before entering the topology level loop. This will properly propagate upwards flags that are set starting from a certain level. Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Reviewed-by: Quentin Perret Reviewed-by: Dietmar Eggemann Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200817113003.20802-11-valentin.schneider@arm.com --- kernel/sched/topology.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c662c53736e2..f36ed96f3197 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1988,11 +1988,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* Set up domains for CPUs specified by the cpu_map: */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; + int dflags = 0; sd = NULL; for_each_sd_topology(tl) { - int dflags = 0; - if (tl == tl_asym) { dflags |= SD_ASYM_CPUCAPACITY; has_asym = true; -- cgit v1.2.3 From 3a6712c7685352a5d71eaf459a4fddfc3589f018 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 17 Aug 2020 12:29:57 +0100 Subject: sched/topology: Mark SD_PREFER_SIBLING as SDF_NEEDS_GROUPS SD_PREFER_SIBLING is currently considered in sd_parent_degenerate() but not in sd_degenerate(). It too hinges on load balancing, and thus won't have any effect when set on a domain with a single group. Add it to SD_DEGENERATE_GROUPS_MASK. Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200817113003.20802-12-valentin.schneider@arm.com --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f36ed96f3197..c674aaab312c 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -184,7 +184,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) - pflags &= ~(SD_DEGENERATE_GROUPS_MASK | SD_PREFER_SIBLING); + pflags &= ~SD_DEGENERATE_GROUPS_MASK; if (~cflags & pflags) return 0; -- cgit v1.2.3 From cad6967ac10843a70842cd39c7b53412901dd21f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 19 Aug 2020 12:46:45 +0200 Subject: fork: introduce kernel_clone() The old _do_fork() helper doesn't follow naming conventions of in-kernel helpers for syscalls. The process creation cleanup in [1] didn't change the name to something more reasonable mainly because _do_fork() was used in quite a few places. So sending this as a separate series seemed the better strategy. This commit does two things: 1. renames _do_fork() to kernel_clone() but keeps _do_fork() as a simple static inline wrapper around kernel_clone(). 2. Changes the return type from long to pid_t. This aligns kernel_thread() and kernel_clone(). Also, the return value from kernel_clone that is surfaced in fork(), vfork(), clone(), and clone3() is taken from pid_vrn() which returns a pid_t too. Follow-up patches will switch each caller of _do_fork() and each place where it is referenced over to kernel_clone(). After all these changes are done, we can remove _do_fork() completely and will only be left with kernel_clone(). [1]: 9ba27414f2ec ("Merge tag 'fork-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux") Signed-off-by: Christian Brauner Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Matthew Wilcox (Oracle) Cc: "Peter Zijlstra (Intel)" Link: https://lore.kernel.org/r/20200819104655.436656-2-christian.brauner@ubuntu.com --- kernel/fork.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 4d32190861bd..d822c7a4b9c4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2384,14 +2384,14 @@ struct mm_struct *copy_init_mm(void) * * args->exit_signal is expected to be checked for sanity by the caller. */ -long _do_fork(struct kernel_clone_args *args) +pid_t kernel_clone(struct kernel_clone_args *args) { u64 clone_flags = args->flags; struct completion vfork; struct pid *pid; struct task_struct *p; int trace = 0; - long nr; + pid_t nr; /* * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument @@ -2477,7 +2477,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) .stack_size = (unsigned long)arg, }; - return _do_fork(&args); + return kernel_clone(&args); } #ifdef __ARCH_WANT_SYS_FORK @@ -2488,7 +2488,7 @@ SYSCALL_DEFINE0(fork) .exit_signal = SIGCHLD, }; - return _do_fork(&args); + return kernel_clone(&args); #else /* can not support in nommu mode */ return -EINVAL; @@ -2504,7 +2504,7 @@ SYSCALL_DEFINE0(vfork) .exit_signal = SIGCHLD, }; - return _do_fork(&args); + return kernel_clone(&args); } #endif @@ -2542,7 +2542,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, .tls = tls, }; - return _do_fork(&args); + return kernel_clone(&args); } #endif @@ -2700,7 +2700,7 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) if (!clone3_args_valid(&kargs)) return -EINVAL; - return _do_fork(&kargs); + return kernel_clone(&kargs); } #endif @@ -2863,7 +2863,7 @@ int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* - * functions used by _do_fork() cannot be used here directly + * functions used by kernel_clone() cannot be used here directly * because they modify an inactive task_struct that is being * constructed. Here we are modifying the current, active, * task_struct. -- cgit v1.2.3 From 005142b8a1f0f32d33fbe04b728464c1b7acfa0e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 18 Aug 2020 21:27:56 -0700 Subject: bpf: Factor out bpf_link_by_id() helper. Refactor the code a bit to extract bpf_link_by_id() helper. It's similar to existing bpf_prog_by_id(). Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200819042759.51280-2-alexei.starovoitov@gmail.com --- kernel/bpf/syscall.c | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 86299a292214..689d736b6904 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4014,40 +4014,50 @@ static int link_detach(union bpf_attr *attr) return ret; } -static int bpf_link_inc_not_zero(struct bpf_link *link) +static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) { - return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT; + return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT); } -#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id - -static int bpf_link_get_fd_by_id(const union bpf_attr *attr) +struct bpf_link *bpf_link_by_id(u32 id) { struct bpf_link *link; - u32 id = attr->link_id; - int fd, err; - if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + if (!id) + return ERR_PTR(-ENOENT); spin_lock_bh(&link_idr_lock); - link = idr_find(&link_idr, id); /* before link is "settled", ID is 0, pretend it doesn't exist yet */ + link = idr_find(&link_idr, id); if (link) { if (link->id) - err = bpf_link_inc_not_zero(link); + link = bpf_link_inc_not_zero(link); else - err = -EAGAIN; + link = ERR_PTR(-EAGAIN); } else { - err = -ENOENT; + link = ERR_PTR(-ENOENT); } spin_unlock_bh(&link_idr_lock); + return link; +} - if (err) - return err; +#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id + +static int bpf_link_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_link *link; + u32 id = attr->link_id; + int fd; + + if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + link = bpf_link_by_id(id); + if (IS_ERR(link)) + return PTR_ERR(link); fd = bpf_link_new_fd(link); if (fd < 0) -- cgit v1.2.3 From f0fdfefb2d4da5b76c3f810be0edb2ab90360224 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 18 Aug 2020 21:27:57 -0700 Subject: bpf: Add BPF program and map iterators as built-in BPF programs. The program and map iterators work similar to seq_file-s. Once the program is pinned in bpffs it can be read with "cat" tool to print human readable output. In this case about BPF programs and maps. For example: $ cat /sys/fs/bpf/progs.debug id name attached 5 dump_bpf_map bpf_iter_bpf_map 6 dump_bpf_prog bpf_iter_bpf_prog $ cat /sys/fs/bpf/maps.debug id name max_entries 3 iterator.rodata 1 To avoid kernel build dependency on clang 10 separate bpf skeleton generation into manual "make" step and instead check-in generated .skel.h into git. Unlike 'bpftool prog show' in-kernel BTF name is used (when available) to print full name of BPF program instead of 16-byte truncated name. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200819042759.51280-3-alexei.starovoitov@gmail.com --- kernel/bpf/preload/iterators/.gitignore | 2 + kernel/bpf/preload/iterators/Makefile | 57 ++++ kernel/bpf/preload/iterators/README | 4 + kernel/bpf/preload/iterators/iterators.bpf.c | 114 +++++++ kernel/bpf/preload/iterators/iterators.skel.h | 410 ++++++++++++++++++++++++++ 5 files changed, 587 insertions(+) create mode 100644 kernel/bpf/preload/iterators/.gitignore create mode 100644 kernel/bpf/preload/iterators/Makefile create mode 100644 kernel/bpf/preload/iterators/README create mode 100644 kernel/bpf/preload/iterators/iterators.bpf.c create mode 100644 kernel/bpf/preload/iterators/iterators.skel.h (limited to 'kernel') diff --git a/kernel/bpf/preload/iterators/.gitignore b/kernel/bpf/preload/iterators/.gitignore new file mode 100644 index 000000000000..ffdb70230c8b --- /dev/null +++ b/kernel/bpf/preload/iterators/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +/.output diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile new file mode 100644 index 000000000000..28fa8c1440f4 --- /dev/null +++ b/kernel/bpf/preload/iterators/Makefile @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: GPL-2.0 +OUTPUT := .output +CLANG ?= clang +LLC ?= llc +LLVM_STRIP ?= llvm-strip +DEFAULT_BPFTOOL := $(OUTPUT)/sbin/bpftool +BPFTOOL ?= $(DEFAULT_BPFTOOL) +LIBBPF_SRC := $(abspath ../../../../tools/lib/bpf) +BPFOBJ := $(OUTPUT)/libbpf.a +BPF_INCLUDE := $(OUTPUT) +INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../../../tools/lib) \ + -I$(abspath ../../../../tools/include/uapi) +CFLAGS := -g -Wall + +abs_out := $(abspath $(OUTPUT)) +ifeq ($(V),1) +Q = +msg = +else +Q = @ +msg = @printf ' %-8s %s%s\n' "$(1)" "$(notdir $(2))" "$(if $(3), $(3))"; +MAKEFLAGS += --no-print-directory +submake_extras := feature_display=0 +endif + +.DELETE_ON_ERROR: + +.PHONY: all clean + +all: iterators.skel.h + +clean: + $(call msg,CLEAN) + $(Q)rm -rf $(OUTPUT) iterators + +iterators.skel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL) + $(call msg,GEN-SKEL,$@) + $(Q)$(BPFTOOL) gen skeleton $< > $@ + + +$(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT) + $(call msg,BPF,$@) + $(Q)$(CLANG) -g -O2 -target bpf $(INCLUDES) \ + -c $(filter %.c,$^) -o $@ && \ + $(LLVM_STRIP) -g $@ + +$(OUTPUT): + $(call msg,MKDIR,$@) + $(Q)mkdir -p $(OUTPUT) + +$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT) + $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) \ + OUTPUT=$(abspath $(dir $@))/ $(abspath $@) + +$(DEFAULT_BPFTOOL): + $(Q)$(MAKE) $(submake_extras) -C ../../../../tools/bpf/bpftool \ + prefix= OUTPUT=$(abs_out)/ DESTDIR=$(abs_out) install diff --git a/kernel/bpf/preload/iterators/README b/kernel/bpf/preload/iterators/README new file mode 100644 index 000000000000..7fd6d39a9ad2 --- /dev/null +++ b/kernel/bpf/preload/iterators/README @@ -0,0 +1,4 @@ +WARNING: +If you change "iterators.bpf.c" do "make -j" in this directory to rebuild "iterators.skel.h". +Make sure to have clang 10 installed. +See Documentation/bpf/bpf_devel_QA.rst diff --git a/kernel/bpf/preload/iterators/iterators.bpf.c b/kernel/bpf/preload/iterators/iterators.bpf.c new file mode 100644 index 000000000000..5ded550b2ed6 --- /dev/null +++ b/kernel/bpf/preload/iterators/iterators.bpf.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include +#include +#include +#include + +#pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record) +struct seq_file; +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +}; + +struct bpf_map { + __u32 id; + char name[16]; + __u32 max_entries; +}; + +struct bpf_iter__bpf_map { + struct bpf_iter_meta *meta; + struct bpf_map *map; +}; + +struct btf_type { + __u32 name_off; +}; + +struct btf_header { + __u32 str_len; +}; + +struct btf { + const char *strings; + struct btf_type **types; + struct btf_header hdr; +}; + +struct bpf_prog_aux { + __u32 id; + char name[16]; + const char *attach_func_name; + struct bpf_prog *linked_prog; + struct bpf_func_info *func_info; + struct btf *btf; +}; + +struct bpf_prog { + struct bpf_prog_aux *aux; +}; + +struct bpf_iter__bpf_prog { + struct bpf_iter_meta *meta; + struct bpf_prog *prog; +}; +#pragma clang attribute pop + +static const char *get_name(struct btf *btf, long btf_id, const char *fallback) +{ + struct btf_type **types, *t; + unsigned int name_off; + const char *str; + + if (!btf) + return fallback; + str = btf->strings; + types = btf->types; + bpf_probe_read_kernel(&t, sizeof(t), types + btf_id); + name_off = BPF_CORE_READ(t, name_off); + if (name_off >= btf->hdr.str_len) + return fallback; + return str + name_off; +} + +SEC("iter/bpf_map") +int dump_bpf_map(struct bpf_iter__bpf_map *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + __u64 seq_num = ctx->meta->seq_num; + struct bpf_map *map = ctx->map; + + if (!map) + return 0; + + if (seq_num == 0) + BPF_SEQ_PRINTF(seq, " id name max_entries\n"); + + BPF_SEQ_PRINTF(seq, "%4u %-16s%6d\n", map->id, map->name, map->max_entries); + return 0; +} + +SEC("iter/bpf_prog") +int dump_bpf_prog(struct bpf_iter__bpf_prog *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + __u64 seq_num = ctx->meta->seq_num; + struct bpf_prog *prog = ctx->prog; + struct bpf_prog_aux *aux; + + if (!prog) + return 0; + + aux = prog->aux; + if (seq_num == 0) + BPF_SEQ_PRINTF(seq, " id name attached\n"); + + BPF_SEQ_PRINTF(seq, "%4u %-16s %s %s\n", aux->id, + get_name(aux->btf, aux->func_info[0].type_id, aux->name), + aux->attach_func_name, aux->linked_prog->aux->name); + return 0; +} +char LICENSE[] SEC("license") = "GPL"; diff --git a/kernel/bpf/preload/iterators/iterators.skel.h b/kernel/bpf/preload/iterators/iterators.skel.h new file mode 100644 index 000000000000..c3171357dc4f --- /dev/null +++ b/kernel/bpf/preload/iterators/iterators.skel.h @@ -0,0 +1,410 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* THIS FILE IS AUTOGENERATED! */ +#ifndef __ITERATORS_BPF_SKEL_H__ +#define __ITERATORS_BPF_SKEL_H__ + +#include +#include + +struct iterators_bpf { + struct bpf_object_skeleton *skeleton; + struct bpf_object *obj; + struct { + struct bpf_map *rodata; + } maps; + struct { + struct bpf_program *dump_bpf_map; + struct bpf_program *dump_bpf_prog; + } progs; + struct { + struct bpf_link *dump_bpf_map; + struct bpf_link *dump_bpf_prog; + } links; + struct iterators_bpf__rodata { + char dump_bpf_map____fmt[35]; + char dump_bpf_map____fmt_1[14]; + char dump_bpf_prog____fmt[32]; + char dump_bpf_prog____fmt_2[17]; + } *rodata; +}; + +static void +iterators_bpf__destroy(struct iterators_bpf *obj) +{ + if (!obj) + return; + if (obj->skeleton) + bpf_object__destroy_skeleton(obj->skeleton); + free(obj); +} + +static inline int +iterators_bpf__create_skeleton(struct iterators_bpf *obj); + +static inline struct iterators_bpf * +iterators_bpf__open_opts(const struct bpf_object_open_opts *opts) +{ + struct iterators_bpf *obj; + + obj = (typeof(obj))calloc(1, sizeof(*obj)); + if (!obj) + return NULL; + if (iterators_bpf__create_skeleton(obj)) + goto err; + if (bpf_object__open_skeleton(obj->skeleton, opts)) + goto err; + + return obj; +err: + iterators_bpf__destroy(obj); + return NULL; +} + +static inline struct iterators_bpf * +iterators_bpf__open(void) +{ + return iterators_bpf__open_opts(NULL); +} + +static inline int +iterators_bpf__load(struct iterators_bpf *obj) +{ + return bpf_object__load_skeleton(obj->skeleton); +} + +static inline struct iterators_bpf * +iterators_bpf__open_and_load(void) +{ + struct iterators_bpf *obj; + + obj = iterators_bpf__open(); + if (!obj) + return NULL; + if (iterators_bpf__load(obj)) { + iterators_bpf__destroy(obj); + return NULL; + } + return obj; +} + +static inline int +iterators_bpf__attach(struct iterators_bpf *obj) +{ + return bpf_object__attach_skeleton(obj->skeleton); +} + +static inline void +iterators_bpf__detach(struct iterators_bpf *obj) +{ + return bpf_object__detach_skeleton(obj->skeleton); +} + +static inline int +iterators_bpf__create_skeleton(struct iterators_bpf *obj) +{ + struct bpf_object_skeleton *s; + + s = (typeof(s))calloc(1, sizeof(*s)); + if (!s) + return -1; + obj->skeleton = s; + + s->sz = sizeof(*s); + s->name = "iterators_bpf"; + s->obj = &obj->obj; + + /* maps */ + s->map_cnt = 1; + s->map_skel_sz = sizeof(*s->maps); + s->maps = (typeof(s->maps))calloc(s->map_cnt, s->map_skel_sz); + if (!s->maps) + goto err; + + s->maps[0].name = "iterator.rodata"; + s->maps[0].map = &obj->maps.rodata; + s->maps[0].mmaped = (void **)&obj->rodata; + + /* programs */ + s->prog_cnt = 2; + s->prog_skel_sz = sizeof(*s->progs); + s->progs = (typeof(s->progs))calloc(s->prog_cnt, s->prog_skel_sz); + if (!s->progs) + goto err; + + s->progs[0].name = "dump_bpf_map"; + s->progs[0].prog = &obj->progs.dump_bpf_map; + s->progs[0].link = &obj->links.dump_bpf_map; + + s->progs[1].name = "dump_bpf_prog"; + s->progs[1].prog = &obj->progs.dump_bpf_prog; + s->progs[1].link = &obj->links.dump_bpf_prog; + + s->data_sz = 7128; + s->data = (void *)"\ +\x7f\x45\x4c\x46\x02\x01\x01\0\0\0\0\0\0\0\0\0\x01\0\xf7\0\x01\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x18\x18\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x0f\0\ +\x0e\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\ +\x1a\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\ +\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\ +\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\ +\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf8\ +\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\ +\0\x18\x02\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x0e\0\0\0\xb7\x05\0\0\x18\ +\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x79\x12\0\0\0\0\ +\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\ +\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\ +\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\x31\0\0\0\0\0\0\0\0\0\ +\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\ +\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\ +\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\ +\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\ +\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\ +\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\ +\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\ +\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\ +\0\x04\0\0\0\x85\0\0\0\x04\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\ +\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\ +\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\ +\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\ +\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\ +\0\x18\x02\0\0\x51\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\ +\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x20\x20\x69\x64\ +\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\ +\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\ +\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\ +\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\x47\x50\x4c\0\x9f\ +\xeb\x01\0\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\0\x05\0\0\0\0\0\0\0\0\0\ +\x02\x02\0\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\ +\0\0\0\x04\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\ +\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\ +\0\0\0\x20\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xa3\0\0\0\x03\0\0\x04\x18\0\ +\0\0\xb1\0\0\0\x09\0\0\0\0\0\0\0\xb5\0\0\0\x0b\0\0\0\x40\0\0\0\xc0\0\0\0\x0b\0\ +\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xc8\0\0\0\0\0\0\x07\0\0\0\0\xd1\0\0\ +\0\0\0\0\x08\x0c\0\0\0\xd7\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\x98\x01\0\0\x03\ +\0\0\x04\x18\0\0\0\xa0\x01\0\0\x0e\0\0\0\0\0\0\0\xa3\x01\0\0\x11\0\0\0\x20\0\0\ +\0\xa8\x01\0\0\x0e\0\0\0\xa0\0\0\0\xb4\x01\0\0\0\0\0\x08\x0f\0\0\0\xba\x01\0\0\ +\0\0\0\x01\x04\0\0\0\x20\0\0\0\xc7\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\ +\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xcc\x01\0\0\0\0\0\x01\x04\ +\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x30\x02\0\0\x02\0\0\x04\x10\0\0\0\ +\x13\0\0\0\x03\0\0\0\0\0\0\0\x43\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\ +\x18\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x48\x02\0\0\x01\0\ +\0\x0c\x16\0\0\0\x94\x02\0\0\x01\0\0\x04\x08\0\0\0\x9d\x02\0\0\x19\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\x02\x1a\0\0\0\xee\x02\0\0\x06\0\0\x04\x38\0\0\0\xa0\x01\0\0\ +\x0e\0\0\0\0\0\0\0\xa3\x01\0\0\x11\0\0\0\x20\0\0\0\xfb\x02\0\0\x1b\0\0\0\xc0\0\ +\0\0\x0c\x03\0\0\x15\0\0\0\0\x01\0\0\x18\x03\0\0\x1d\0\0\0\x40\x01\0\0\x22\x03\ +\0\0\x1e\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\ +\0\0\0\0\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x6c\x03\0\0\x02\0\ +\0\x04\x08\0\0\0\x7a\x03\0\0\x0e\0\0\0\0\0\0\0\x83\x03\0\0\x0e\0\0\0\x20\0\0\0\ +\x22\x03\0\0\x03\0\0\x04\x18\0\0\0\x8d\x03\0\0\x1b\0\0\0\0\0\0\0\x95\x03\0\0\ +\x21\0\0\0\x40\0\0\0\x9b\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\ +\0\0\0\0\0\0\0\0\x02\x24\0\0\0\x9f\x03\0\0\x01\0\0\x04\x04\0\0\0\xaa\x03\0\0\ +\x0e\0\0\0\0\0\0\0\x13\x04\0\0\x01\0\0\x04\x04\0\0\0\x1c\x04\0\0\x0e\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x92\x04\0\0\0\0\0\ +\x0e\x25\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\ +\xa6\x04\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\ +\x12\0\0\0\x20\0\0\0\xbc\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\ +\0\0\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xd1\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xe8\x04\0\0\0\0\0\x0e\ +\x2d\0\0\0\x01\0\0\0\xf0\x04\0\0\x04\0\0\x0f\0\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\ +\0\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\ +\0\0\x11\0\0\0\xf8\x04\0\0\x01\0\0\x0f\0\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\ +\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\ +\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\ +\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\ +\x30\x3a\x30\0\x2f\x77\x2f\x6e\x65\x74\x2d\x6e\x65\x78\x74\x2f\x6b\x65\x72\x6e\ +\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\ +\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\ +\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\ +\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\ +\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\ +\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\ +\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x6c\x6f\x6e\x67\x20\ +\x6c\x6f\x6e\x67\x20\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x30\x3a\ +\x31\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\ +\x61\x70\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\ +\x21\x6d\x61\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\ +\x5f\x6e\x75\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\ +\x65\x71\x5f\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\ +\x20\x3d\x3d\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\ +\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\ +\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\ +\x74\x72\x69\x65\x73\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\ +\x64\0\x6e\x61\x6d\x65\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\ +\x75\x33\x32\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\ +\x72\0\x5f\x5f\x41\x52\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\ +\x5f\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\ +\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\ +\x2c\x20\x6d\x61\x70\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\ +\x65\x2c\x20\x6d\x61\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\ +\x29\x3b\0\x7d\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\ +\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\ +\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\ +\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\ +\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\ +\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\ +\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\ +\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\ +\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ +\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\ +\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\ +\x63\x5f\x6e\x61\x6d\x65\0\x6c\x69\x6e\x6b\x65\x64\x5f\x70\x72\x6f\x67\0\x66\ +\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\ +\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\ +\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\ +\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\ +\x74\x66\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\ +\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\ +\x6e\x67\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\ +\x64\x65\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\ +\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\ +\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\ +\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\ +\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\ +\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\ +\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\ +\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\ +\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\ +\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\ +\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\ +\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\ +\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\ +\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\ +\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\ +\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\ +\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\ +\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x9f\xeb\x01\0\x20\0\0\0\0\0\0\0\x24\0\ +\0\0\x24\0\0\0\x44\x02\0\0\x68\x02\0\0\xa4\x01\0\0\x08\0\0\0\x31\0\0\0\x01\0\0\ +\0\0\0\0\0\x07\0\0\0\x56\x02\0\0\x01\0\0\0\0\0\0\0\x17\0\0\0\x10\0\0\0\x31\0\0\ +\0\x09\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x40\x01\0\x08\0\0\0\x42\0\0\0\x7b\ +\0\0\0\x24\x40\x01\0\x10\0\0\0\x42\0\0\0\xf2\0\0\0\x1d\x48\x01\0\x18\0\0\0\x42\ +\0\0\0\x13\x01\0\0\x06\x50\x01\0\x20\0\0\0\x42\0\0\0\x22\x01\0\0\x1d\x44\x01\0\ +\x28\0\0\0\x42\0\0\0\x47\x01\0\0\x06\x5c\x01\0\x38\0\0\0\x42\0\0\0\x5a\x01\0\0\ +\x03\x60\x01\0\x70\0\0\0\x42\0\0\0\xe0\x01\0\0\x02\x68\x01\0\xf0\0\0\0\x42\0\0\ +\0\x2e\x02\0\0\x01\x70\x01\0\x56\x02\0\0\x1a\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\ +\x1e\x84\x01\0\x08\0\0\0\x42\0\0\0\x7b\0\0\0\x24\x84\x01\0\x10\0\0\0\x42\0\0\0\ +\x64\x02\0\0\x1f\x8c\x01\0\x18\0\0\0\x42\0\0\0\x88\x02\0\0\x06\x98\x01\0\x20\0\ +\0\0\x42\0\0\0\xa1\x02\0\0\x0e\xa4\x01\0\x28\0\0\0\x42\0\0\0\x22\x01\0\0\x1d\ +\x88\x01\0\x30\0\0\0\x42\0\0\0\x47\x01\0\0\x06\xa8\x01\0\x40\0\0\0\x42\0\0\0\ +\xb3\x02\0\0\x03\xac\x01\0\x80\0\0\0\x42\0\0\0\x26\x03\0\0\x02\xb4\x01\0\xb8\0\ +\0\0\x42\0\0\0\x61\x03\0\0\x06\x08\x01\0\xd0\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\ +\xd8\0\0\0\x42\0\0\0\xb2\x03\0\0\x0f\x14\x01\0\xe0\0\0\0\x42\0\0\0\xc7\x03\0\0\ +\x2d\x18\x01\0\xf0\0\0\0\x42\0\0\0\xfe\x03\0\0\x0d\x10\x01\0\0\x01\0\0\x42\0\0\ +\0\0\0\0\0\0\0\0\0\x08\x01\0\0\x42\0\0\0\xc7\x03\0\0\x02\x18\x01\0\x20\x01\0\0\ +\x42\0\0\0\x25\x04\0\0\x0d\x1c\x01\0\x38\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x40\ +\x01\0\0\x42\0\0\0\x25\x04\0\0\x0d\x1c\x01\0\x58\x01\0\0\x42\0\0\0\x25\x04\0\0\ +\x0d\x1c\x01\0\x60\x01\0\0\x42\0\0\0\x53\x04\0\0\x1b\x20\x01\0\x68\x01\0\0\x42\ +\0\0\0\x53\x04\0\0\x06\x20\x01\0\x70\x01\0\0\x42\0\0\0\x76\x04\0\0\x0d\x28\x01\ +\0\x78\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x80\x01\0\0\x42\0\0\0\x26\x03\0\0\x02\ +\xb4\x01\0\xf8\x01\0\0\x42\0\0\0\x2e\x02\0\0\x01\xc4\x01\0\x10\0\0\0\x31\0\0\0\ +\x07\0\0\0\0\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\ +\0\0\0\x10\0\0\0\x02\0\0\0\xee\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x1e\x01\0\0\0\ +\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xee\0\0\0\0\0\ +\0\0\xa0\0\0\0\x0d\0\0\0\x1e\x01\0\0\0\0\0\0\x56\x02\0\0\x12\0\0\0\0\0\0\0\x14\ +\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x14\0\ +\0\0\xee\0\0\0\0\0\0\0\x20\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\ +\0\x1e\x01\0\0\0\0\0\0\x80\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1a\0\0\ +\0\xee\0\0\0\0\0\0\0\xa8\0\0\0\x1a\0\0\0\x59\x03\0\0\0\0\0\0\xb0\0\0\0\x1a\0\0\ +\0\x5d\x03\0\0\0\0\0\0\xc0\0\0\0\x1f\0\0\0\x8b\x03\0\0\0\0\0\0\xd8\0\0\0\x20\0\ +\0\0\xee\0\0\0\0\0\0\0\xf0\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x24\0\ +\0\0\x3e\0\0\0\0\0\0\0\x50\x01\0\0\x1a\0\0\0\xee\0\0\0\0\0\0\0\x60\x01\0\0\x20\ +\0\0\0\x4d\x04\0\0\0\0\0\0\x88\x01\0\0\x1a\0\0\0\x1e\x01\0\0\0\0\0\0\x98\x01\0\ +\0\x1a\0\0\0\x8e\x04\0\0\0\0\0\0\xa0\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd6\0\0\0\0\0\x02\0\x70\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\xc8\0\0\0\0\0\x02\0\xf0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\xcf\0\0\0\0\0\x03\0\x78\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xc1\0\0\0\0\0\x03\0\x80\ +\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xba\0\0\0\0\0\x03\0\xf8\x01\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\x14\0\0\0\x01\0\x04\0\0\0\0\0\0\0\0\0\x23\0\0\0\0\0\0\0\xf4\0\0\0\ +\x01\0\x04\0\x23\0\0\0\0\0\0\0\x0e\0\0\0\0\0\0\0\x28\0\0\0\x01\0\x04\0\x31\0\0\ +\0\0\0\0\0\x20\0\0\0\0\0\0\0\xdd\0\0\0\x01\0\x04\0\x51\0\0\0\0\0\0\0\x11\0\0\0\ +\0\0\0\0\0\0\0\0\x03\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\x03\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\xb2\0\0\0\x11\0\x05\0\0\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0\x3d\0\0\0\x12\ +\0\x02\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\x5b\0\0\0\x12\0\x03\0\0\0\0\0\0\0\0\ +\0\x08\x02\0\0\0\0\0\0\x48\0\0\0\0\0\0\0\x01\0\0\0\x0c\0\0\0\xc8\0\0\0\0\0\0\0\ +\x01\0\0\0\x0c\0\0\0\x50\0\0\0\0\0\0\0\x01\0\0\0\x0c\0\0\0\xd0\x01\0\0\0\0\0\0\ +\x01\0\0\0\x0c\0\0\0\xf0\x03\0\0\0\0\0\0\x0a\0\0\0\x0c\0\0\0\xfc\x03\0\0\0\0\0\ +\0\x0a\0\0\0\x0c\0\0\0\x08\x04\0\0\0\0\0\0\x0a\0\0\0\x0c\0\0\0\x14\x04\0\0\0\0\ +\0\0\x0a\0\0\0\x0c\0\0\0\x2c\x04\0\0\0\0\0\0\0\0\0\0\x0d\0\0\0\x2c\0\0\0\0\0\0\ +\0\0\0\0\0\x0a\0\0\0\x3c\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x50\0\0\0\0\0\0\0\0\0\ +\0\0\x0a\0\0\0\x60\0\0\0\0\0\0\0\0\0\0\0\x0a\0\0\0\x70\0\0\0\0\0\0\0\0\0\0\0\ +\x0a\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\x0a\0\0\0\x90\0\0\0\0\0\0\0\0\0\0\0\x0a\0\ +\0\0\xa0\0\0\0\0\0\0\0\0\0\0\0\x0a\0\0\0\xb0\0\0\0\0\0\0\0\0\0\0\0\x0a\0\0\0\ +\xc0\0\0\0\0\0\0\0\0\0\0\0\x0a\0\0\0\xd0\0\0\0\0\0\0\0\0\0\0\0\x0a\0\0\0\xe8\0\ +\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xf8\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x08\x01\0\0\ +\0\0\0\0\0\0\0\0\x0b\0\0\0\x18\x01\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x28\x01\0\0\0\ +\0\0\0\0\0\0\0\x0b\0\0\0\x38\x01\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x48\x01\0\0\0\0\ +\0\0\0\0\0\0\x0b\0\0\0\x58\x01\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x68\x01\0\0\0\0\0\ +\0\0\0\0\0\x0b\0\0\0\x78\x01\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x88\x01\0\0\0\0\0\0\ +\0\0\0\0\x0b\0\0\0\x98\x01\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa8\x01\0\0\0\0\0\0\0\ +\0\0\0\x0b\0\0\0\xb8\x01\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc8\x01\0\0\0\0\0\0\0\0\ +\0\0\x0b\0\0\0\xd8\x01\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xe8\x01\0\0\0\0\0\0\0\0\0\ +\0\x0b\0\0\0\xf8\x01\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x28\x02\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\x38\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x48\x02\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\x58\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x68\x02\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\x78\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x94\x02\0\0\0\0\0\0\0\0\0\0\ +\x0a\0\0\0\xa4\x02\0\0\0\0\0\0\0\0\0\0\x0a\0\0\0\xb4\x02\0\0\0\0\0\0\0\0\0\0\ +\x0a\0\0\0\xc4\x02\0\0\0\0\0\0\0\0\0\0\x0a\0\0\0\xd4\x02\0\0\0\0\0\0\0\0\0\0\ +\x0a\0\0\0\xe4\x02\0\0\0\0\0\0\0\0\0\0\x0a\0\0\0\xf4\x02\0\0\0\0\0\0\0\0\0\0\ +\x0a\0\0\0\x0c\x03\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x1c\x03\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\x2c\x03\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x3c\x03\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\x4c\x03\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x5c\x03\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\x6c\x03\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x7c\x03\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\x8c\x03\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x9c\x03\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\xac\x03\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xbc\x03\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\xcc\x03\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xdc\x03\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\xec\x03\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xfc\x03\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\x0c\x04\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x1c\x04\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\x4e\x4f\x41\x42\x43\x44\x4d\0\x2e\x74\x65\x78\x74\0\x2e\x72\x65\x6c\ +\x2e\x42\x54\x46\x2e\x65\x78\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\ +\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\ +\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ +\x61\x70\0\x2e\x72\x65\x6c\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\ +\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x2e\x72\x65\x6c\x69\x74\ +\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x2e\x6c\x6c\x76\x6d\x5f\x61\x64\ +\x64\x72\x73\x69\x67\0\x6c\x69\x63\x65\x6e\x73\x65\0\x2e\x73\x74\x72\x74\x61\ +\x62\0\x2e\x73\x79\x6d\x74\x61\x62\0\x2e\x72\x6f\x64\x61\x74\x61\0\x2e\x72\x65\ +\x6c\x2e\x42\x54\x46\0\x4c\x49\x43\x45\x4e\x53\x45\0\x4c\x42\x42\x31\x5f\x37\0\ +\x4c\x42\x42\x31\x5f\x36\0\x4c\x42\x42\x30\x5f\x34\0\x4c\x42\x42\x31\x5f\x33\0\ +\x4c\x42\x42\x30\x5f\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\ +\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ +\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x4e\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\ +\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\x6d\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\x01\0\0\0\0\0\0\x08\ +\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa1\0\0\0\ +\x01\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x48\x03\0\0\0\0\0\0\x62\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x89\0\0\0\x01\0\0\0\x03\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xaa\x03\0\0\0\0\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xad\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\xae\x03\0\0\0\0\0\0\x34\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\xe2\x0c\0\0\0\0\0\0\x2c\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\x99\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\x11\0\0\0\ +\0\0\0\x80\x01\0\0\0\0\0\0\x0e\0\0\0\x0d\0\0\0\x08\0\0\0\0\0\0\0\x18\0\0\0\0\0\ +\0\0\x4a\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x90\x12\0\0\0\0\0\0\ +\x20\0\0\0\0\0\0\0\x08\0\0\0\x02\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x69\ +\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb0\x12\0\0\0\0\0\0\x20\0\0\0\ +\0\0\0\0\x08\0\0\0\x03\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\xa9\0\0\0\x09\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd0\x12\0\0\0\0\0\0\x50\0\0\0\0\0\0\0\ +\x08\0\0\0\x06\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x07\0\0\0\x09\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x13\0\0\0\0\0\0\xe0\x03\0\0\0\0\0\0\x08\0\0\ +\0\x07\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x7b\0\0\0\x03\x4c\xff\x6f\0\0\ +\0\x80\0\0\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x91\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\x07\x17\0\0\0\0\0\0\x0a\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0"; + + return 0; +err: + bpf_object__destroy_skeleton(s); + return -1; +} + +#endif /* __ITERATORS_BPF_SKEL_H__ */ -- cgit v1.2.3 From d71fa5c9763c24dd997a2fa4feb7a13a95bab42c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 18 Aug 2020 21:27:58 -0700 Subject: bpf: Add kernel module with user mode driver that populates bpffs. Add kernel module with user mode driver that populates bpffs with BPF iterators. $ mount bpffs /my/bpffs/ -t bpf $ ls -la /my/bpffs/ total 4 drwxrwxrwt 2 root root 0 Jul 2 00:27 . drwxr-xr-x 19 root root 4096 Jul 2 00:09 .. -rw------- 1 root root 0 Jul 2 00:27 maps.debug -rw------- 1 root root 0 Jul 2 00:27 progs.debug The user mode driver will load BPF Type Formats, create BPF maps, populate BPF maps, load two BPF programs, attach them to BPF iterators, and finally send two bpf_link IDs back to the kernel. The kernel will pin two bpf_links into newly mounted bpffs instance under names "progs.debug" and "maps.debug". These two files become human readable. $ cat /my/bpffs/progs.debug id name attached 11 dump_bpf_map bpf_iter_bpf_map 12 dump_bpf_prog bpf_iter_bpf_prog 27 test_pkt_access 32 test_main test_pkt_access test_pkt_access 33 test_subprog1 test_pkt_access_subprog1 test_pkt_access 34 test_subprog2 test_pkt_access_subprog2 test_pkt_access 35 test_subprog3 test_pkt_access_subprog3 test_pkt_access 36 new_get_skb_len get_skb_len test_pkt_access 37 new_get_skb_ifindex get_skb_ifindex test_pkt_access 38 new_get_constant get_constant test_pkt_access The BPF program dump_bpf_prog() in iterators.bpf.c is printing this data about all BPF programs currently loaded in the system. This information is unstable and will change from kernel to kernel as ".debug" suffix conveys. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200819042759.51280-4-alexei.starovoitov@gmail.com --- kernel/Makefile | 2 +- kernel/bpf/Makefile | 1 + kernel/bpf/inode.c | 116 +++++++++++++++++++++- kernel/bpf/preload/Kconfig | 23 +++++ kernel/bpf/preload/Makefile | 23 +++++ kernel/bpf/preload/bpf_preload.h | 16 +++ kernel/bpf/preload/bpf_preload_kern.c | 91 +++++++++++++++++ kernel/bpf/preload/bpf_preload_umd_blob.S | 7 ++ kernel/bpf/preload/iterators/bpf_preload_common.h | 13 +++ kernel/bpf/preload/iterators/iterators.c | 94 ++++++++++++++++++ 10 files changed, 382 insertions(+), 4 deletions(-) create mode 100644 kernel/bpf/preload/Kconfig create mode 100644 kernel/bpf/preload/Makefile create mode 100644 kernel/bpf/preload/bpf_preload.h create mode 100644 kernel/bpf/preload/bpf_preload_kern.c create mode 100644 kernel/bpf/preload/bpf_preload_umd_blob.S create mode 100644 kernel/bpf/preload/iterators/bpf_preload_common.h create mode 100644 kernel/bpf/preload/iterators/iterators.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 9a20016d4900..22b0760660fc 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -12,7 +12,7 @@ obj-y = fork.o exec_domain.o panic.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o -obj-$(CONFIG_BPFILTER) += usermode_driver.o +obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o obj-$(CONFIG_MODULES) += kmod.o obj-$(CONFIG_MULTIUSER) += groups.o diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e6eb9c0402da..19e137aae40e 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -29,3 +29,4 @@ ifeq ($(CONFIG_BPF_JIT),y) obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif +obj-$(CONFIG_BPF_PRELOAD) += preload/ diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index fb878ba3f22f..b48a56f53495 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -20,6 +20,7 @@ #include #include #include +#include "preload/bpf_preload.h" enum bpf_type { BPF_TYPE_UNSPEC = 0, @@ -369,9 +370,10 @@ static struct dentry * bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future - * extensions. + * extensions. That allows popoulate_bpffs() create special files. */ - if (strchr(dentry->d_name.name, '.')) + if ((dir->i_mode & S_IALLUGO) && + strchr(dentry->d_name.name, '.')) return ERR_PTR(-EPERM); return simple_lookup(dir, dentry, flags); @@ -409,6 +411,27 @@ static const struct inode_operations bpf_dir_iops = { .unlink = simple_unlink, }; +/* pin iterator link into bpffs */ +static int bpf_iter_link_pin_kernel(struct dentry *parent, + const char *name, struct bpf_link *link) +{ + umode_t mode = S_IFREG | S_IRUSR; + struct dentry *dentry; + int ret; + + inode_lock(parent->d_inode); + dentry = lookup_one_len(name, parent, strlen(name)); + if (IS_ERR(dentry)) { + inode_unlock(parent->d_inode); + return PTR_ERR(dentry); + } + ret = bpf_mkobj_ops(dentry, mode, link, &bpf_link_iops, + &bpf_iter_fops); + dput(dentry); + inode_unlock(parent->d_inode); + return ret; +} + static int bpf_obj_do_pin(const char __user *pathname, void *raw, enum bpf_type type) { @@ -638,6 +661,91 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } +struct bpf_preload_ops *bpf_preload_ops; +EXPORT_SYMBOL_GPL(bpf_preload_ops); + +static bool bpf_preload_mod_get(void) +{ + /* If bpf_preload.ko wasn't loaded earlier then load it now. + * When bpf_preload is built into vmlinux the module's __init + * function will populate it. + */ + if (!bpf_preload_ops) { + request_module("bpf_preload"); + if (!bpf_preload_ops) + return false; + } + /* And grab the reference, so the module doesn't disappear while the + * kernel is interacting with the kernel module and its UMD. + */ + if (!try_module_get(bpf_preload_ops->owner)) { + pr_err("bpf_preload module get failed.\n"); + return false; + } + return true; +} + +static void bpf_preload_mod_put(void) +{ + if (bpf_preload_ops) + /* now user can "rmmod bpf_preload" if necessary */ + module_put(bpf_preload_ops->owner); +} + +static DEFINE_MUTEX(bpf_preload_lock); + +static int populate_bpffs(struct dentry *parent) +{ + struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {}; + struct bpf_link *links[BPF_PRELOAD_LINKS] = {}; + int err = 0, i; + + /* grab the mutex to make sure the kernel interactions with bpf_preload + * UMD are serialized + */ + mutex_lock(&bpf_preload_lock); + + /* if bpf_preload.ko wasn't built into vmlinux then load it */ + if (!bpf_preload_mod_get()) + goto out; + + if (!bpf_preload_ops->info.tgid) { + /* preload() will start UMD that will load BPF iterator programs */ + err = bpf_preload_ops->preload(objs); + if (err) + goto out_put; + for (i = 0; i < BPF_PRELOAD_LINKS; i++) { + links[i] = bpf_link_by_id(objs[i].link_id); + if (IS_ERR(links[i])) { + err = PTR_ERR(links[i]); + goto out_put; + } + } + for (i = 0; i < BPF_PRELOAD_LINKS; i++) { + err = bpf_iter_link_pin_kernel(parent, + objs[i].link_name, links[i]); + if (err) + goto out_put; + /* do not unlink successfully pinned links even + * if later link fails to pin + */ + links[i] = NULL; + } + /* finish() will tell UMD process to exit */ + err = bpf_preload_ops->finish(); + if (err) + goto out_put; + } +out_put: + bpf_preload_mod_put(); +out: + mutex_unlock(&bpf_preload_lock); + for (i = 0; i < BPF_PRELOAD_LINKS && err; i++) + if (!IS_ERR_OR_NULL(links[i])) + bpf_link_put(links[i]); + return err; +} + static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr bpf_rfiles[] = { { "" } }; @@ -654,8 +762,8 @@ static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) inode = sb->s_root->d_inode; inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; + populate_bpffs(sb->s_root); inode->i_mode |= S_ISVTX | opts->mode; - return 0; } @@ -705,6 +813,8 @@ static int __init bpf_init(void) { int ret; + mutex_init(&bpf_preload_lock); + ret = sysfs_create_mount_point(fs_kobj, "bpf"); if (ret) return ret; diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig new file mode 100644 index 000000000000..7144e2d01ee4 --- /dev/null +++ b/kernel/bpf/preload/Kconfig @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: GPL-2.0-only +config USERMODE_DRIVER + bool + default n + +menuconfig BPF_PRELOAD + bool "Preload BPF file system with kernel specific program and map iterators" + depends on BPF + select USERMODE_DRIVER + help + This builds kernel module with several embedded BPF programs that are + pinned into BPF FS mount point as human readable files that are + useful in debugging and introspection of BPF programs and maps. + +if BPF_PRELOAD +config BPF_PRELOAD_UMD + tristate "bpf_preload kernel module with user mode driver" + depends on CC_CAN_LINK + depends on m || CC_CAN_LINK_STATIC + default m + help + This builds bpf_preload kernel module with embedded user mode driver. +endif diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile new file mode 100644 index 000000000000..12c7b62b9b6e --- /dev/null +++ b/kernel/bpf/preload/Makefile @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: GPL-2.0 + +LIBBPF_SRCS = $(srctree)/tools/lib/bpf/ +LIBBPF_A = $(obj)/libbpf.a +LIBBPF_OUT = $(abspath $(obj)) + +$(LIBBPF_A): + $(Q)$(MAKE) -C $(LIBBPF_SRCS) OUTPUT=$(LIBBPF_OUT)/ $(LIBBPF_OUT)/libbpf.a + +userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \ + -I $(srctree)/tools/lib/ -Wno-unused-result + +userprogs := bpf_preload_umd + +bpf_preload_umd-objs := iterators/iterators.o +bpf_preload_umd-userldlibs := $(LIBBPF_A) -lelf -lz + +$(obj)/bpf_preload_umd: $(LIBBPF_A) + +$(obj)/bpf_preload_umd_blob.o: $(obj)/bpf_preload_umd + +obj-$(CONFIG_BPF_PRELOAD_UMD) += bpf_preload.o +bpf_preload-objs += bpf_preload_kern.o bpf_preload_umd_blob.o diff --git a/kernel/bpf/preload/bpf_preload.h b/kernel/bpf/preload/bpf_preload.h new file mode 100644 index 000000000000..2f9932276f2e --- /dev/null +++ b/kernel/bpf/preload/bpf_preload.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BPF_PRELOAD_H +#define _BPF_PRELOAD_H + +#include +#include "iterators/bpf_preload_common.h" + +struct bpf_preload_ops { + struct umd_info info; + int (*preload)(struct bpf_preload_info *); + int (*finish)(void); + struct module *owner; +}; +extern struct bpf_preload_ops *bpf_preload_ops; +#define BPF_PRELOAD_LINKS 2 +#endif diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c new file mode 100644 index 000000000000..79c5772465f1 --- /dev/null +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include "bpf_preload.h" + +extern char bpf_preload_umd_start; +extern char bpf_preload_umd_end; + +static int preload(struct bpf_preload_info *obj); +static int finish(void); + +static struct bpf_preload_ops umd_ops = { + .info.driver_name = "bpf_preload", + .preload = preload, + .finish = finish, + .owner = THIS_MODULE, +}; + +static int preload(struct bpf_preload_info *obj) +{ + int magic = BPF_PRELOAD_START; + loff_t pos = 0; + int i, err; + ssize_t n; + + err = fork_usermode_driver(&umd_ops.info); + if (err) + return err; + + /* send the start magic to let UMD proceed with loading BPF progs */ + n = kernel_write(umd_ops.info.pipe_to_umh, + &magic, sizeof(magic), &pos); + if (n != sizeof(magic)) + return -EPIPE; + + /* receive bpf_link IDs and names from UMD */ + pos = 0; + for (i = 0; i < BPF_PRELOAD_LINKS; i++) { + n = kernel_read(umd_ops.info.pipe_from_umh, + &obj[i], sizeof(*obj), &pos); + if (n != sizeof(*obj)) + return -EPIPE; + } + return 0; +} + +static int finish(void) +{ + int magic = BPF_PRELOAD_END; + struct pid *tgid; + loff_t pos = 0; + ssize_t n; + + /* send the last magic to UMD. It will do a normal exit. */ + n = kernel_write(umd_ops.info.pipe_to_umh, + &magic, sizeof(magic), &pos); + if (n != sizeof(magic)) + return -EPIPE; + tgid = umd_ops.info.tgid; + wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); + umd_ops.info.tgid = NULL; + return 0; +} + +static int __init load_umd(void) +{ + int err; + + err = umd_load_blob(&umd_ops.info, &bpf_preload_umd_start, + &bpf_preload_umd_end - &bpf_preload_umd_start); + if (err) + return err; + bpf_preload_ops = &umd_ops; + return err; +} + +static void __exit fini_umd(void) +{ + bpf_preload_ops = NULL; + /* kill UMD in case it's still there due to earlier error */ + kill_pid(umd_ops.info.tgid, SIGKILL, 1); + umd_ops.info.tgid = NULL; + umd_unload_blob(&umd_ops.info); +} +late_initcall(load_umd); +module_exit(fini_umd); +MODULE_LICENSE("GPL"); diff --git a/kernel/bpf/preload/bpf_preload_umd_blob.S b/kernel/bpf/preload/bpf_preload_umd_blob.S new file mode 100644 index 000000000000..f1f40223b5c3 --- /dev/null +++ b/kernel/bpf/preload/bpf_preload_umd_blob.S @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + .section .init.rodata, "a" + .global bpf_preload_umd_start +bpf_preload_umd_start: + .incbin "kernel/bpf/preload/bpf_preload_umd" + .global bpf_preload_umd_end +bpf_preload_umd_end: diff --git a/kernel/bpf/preload/iterators/bpf_preload_common.h b/kernel/bpf/preload/iterators/bpf_preload_common.h new file mode 100644 index 000000000000..8464d1a48c05 --- /dev/null +++ b/kernel/bpf/preload/iterators/bpf_preload_common.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BPF_PRELOAD_COMMON_H +#define _BPF_PRELOAD_COMMON_H + +#define BPF_PRELOAD_START 0x5555 +#define BPF_PRELOAD_END 0xAAAA + +struct bpf_preload_info { + char link_name[16]; + int link_id; +}; + +#endif diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c new file mode 100644 index 000000000000..b7ff87939172 --- /dev/null +++ b/kernel/bpf/preload/iterators/iterators.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "iterators.skel.h" +#include "bpf_preload_common.h" + +int to_kernel = -1; +int from_kernel = 0; + +static int send_link_to_kernel(struct bpf_link *link, const char *link_name) +{ + struct bpf_preload_info obj = {}; + struct bpf_link_info info = {}; + __u32 info_len = sizeof(info); + int err; + + err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len); + if (err) + return err; + obj.link_id = info.id; + if (strlen(link_name) >= sizeof(obj.link_name)) + return -E2BIG; + strcpy(obj.link_name, link_name); + if (write(to_kernel, &obj, sizeof(obj)) != sizeof(obj)) + return -EPIPE; + return 0; +} + +int main(int argc, char **argv) +{ + struct rlimit rlim = { RLIM_INFINITY, RLIM_INFINITY }; + struct iterators_bpf *skel; + int err, magic; + int debug_fd; + + debug_fd = open("/dev/console", O_WRONLY | O_NOCTTY | O_CLOEXEC); + if (debug_fd < 0) + return 1; + to_kernel = dup(1); + close(1); + dup(debug_fd); + /* now stdin and stderr point to /dev/console */ + + read(from_kernel, &magic, sizeof(magic)); + if (magic != BPF_PRELOAD_START) { + printf("bad start magic %d\n", magic); + return 1; + } + setrlimit(RLIMIT_MEMLOCK, &rlim); + /* libbpf opens BPF object and loads it into the kernel */ + skel = iterators_bpf__open_and_load(); + if (!skel) { + /* iterators.skel.h is little endian. + * libbpf doesn't support automatic little->big conversion + * of BPF bytecode yet. + * The program load will fail in such case. + */ + printf("Failed load could be due to wrong endianness\n"); + return 1; + } + err = iterators_bpf__attach(skel); + if (err) + goto cleanup; + + /* send two bpf_link IDs with names to the kernel */ + err = send_link_to_kernel(skel->links.dump_bpf_map, "maps.debug"); + if (err) + goto cleanup; + err = send_link_to_kernel(skel->links.dump_bpf_prog, "progs.debug"); + if (err) + goto cleanup; + + /* The kernel will proceed with pinnging the links in bpffs. + * UMD will wait on read from pipe. + */ + read(from_kernel, &magic, sizeof(magic)); + if (magic != BPF_PRELOAD_END) { + printf("bad final magic %d\n", magic); + err = -EINVAL; + } +cleanup: + iterators_bpf__destroy(skel); + + return err != 0; +} -- cgit v1.2.3 From 6b0a249a301e2af9adda84adbced3a2988248b95 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 21 Aug 2020 11:44:18 -0700 Subject: bpf: Implement link_query for bpf iterators This patch implemented bpf_link callback functions show_fdinfo and fill_link_info to support link_query interface. The general interface for show_fdinfo and fill_link_info will print/fill the target_name. Each targets can register show_fdinfo and fill_link_info callbacks to print/fill more target specific information. For example, the below is a fdinfo result for a bpf task iterator. $ cat /proc/1749/fdinfo/7 pos: 0 flags: 02000000 mnt_id: 14 link_type: iter link_id: 11 prog_tag: 990e1f8152f7e54f prog_id: 59 target_name: task Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200821184418.574122-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index b6715964b685..aeec7e174188 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -377,10 +377,68 @@ out_unlock: return ret; } +static void bpf_iter_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + bpf_iter_show_fdinfo_t show_fdinfo; + + seq_printf(seq, + "target_name:\t%s\n", + iter_link->tinfo->reg_info->target); + + show_fdinfo = iter_link->tinfo->reg_info->show_fdinfo; + if (show_fdinfo) + show_fdinfo(&iter_link->aux, seq); +} + +static int bpf_iter_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + char __user *ubuf = u64_to_user_ptr(info->iter.target_name); + bpf_iter_fill_link_info_t fill_link_info; + u32 ulen = info->iter.target_name_len; + const char *target_name; + u32 target_len; + + if (!ulen ^ !ubuf) + return -EINVAL; + + target_name = iter_link->tinfo->reg_info->target; + target_len = strlen(target_name); + info->iter.target_name_len = target_len + 1; + + if (ubuf) { + if (ulen >= target_len + 1) { + if (copy_to_user(ubuf, target_name, target_len + 1)) + return -EFAULT; + } else { + char zero = '\0'; + + if (copy_to_user(ubuf, target_name, ulen - 1)) + return -EFAULT; + if (put_user(zero, ubuf + ulen - 1)) + return -EFAULT; + return -ENOSPC; + } + } + + fill_link_info = iter_link->tinfo->reg_info->fill_link_info; + if (fill_link_info) + return fill_link_info(&iter_link->aux, info); + + return 0; +} + static const struct bpf_link_ops bpf_iter_link_lops = { .release = bpf_iter_link_release, .dealloc = bpf_iter_link_dealloc, .update_prog = bpf_iter_link_replace, + .show_fdinfo = bpf_iter_link_show_fdinfo, + .fill_link_info = bpf_iter_link_fill_link_info, }; bool bpf_link_is_iter(struct bpf_link *link) -- cgit v1.2.3 From b76f22269028fb252727a696084c70494d80a52c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 21 Aug 2020 11:44:19 -0700 Subject: bpf: Implement link_query callbacks in map element iterators For bpf_map_elem and bpf_sk_local_storage bpf iterators, additional map_id should be shown for fdinfo and userspace query. For example, the following is for a bpf_map_elem iterator. $ cat /proc/1753/fdinfo/9 pos: 0 flags: 02000000 mnt_id: 14 link_type: iter link_id: 34 prog_tag: 104be6d3fe45e6aa prog_id: 173 target_name: bpf_map_elem map_id: 127 Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200821184419.574240-1-yhs@fb.com --- kernel/bpf/map_iter.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index af86048e5afd..6a9542af4212 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -149,6 +149,19 @@ static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux) bpf_map_put_with_uref(aux->map); } +void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq) +{ + seq_printf(seq, "map_id:\t%u\n", aux->map->id); +} + +int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux, + struct bpf_link_info *info) +{ + info->iter.map.map_id = aux->map->id; + return 0; +} + DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, struct bpf_map *map, void *key, void *value) @@ -156,6 +169,8 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = { .target = "bpf_map_elem", .attach_target = bpf_iter_attach_map, .detach_target = bpf_iter_detach_map, + .show_fdinfo = bpf_iter_map_show_fdinfo, + .fill_link_info = bpf_iter_map_fill_link_info, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map_elem, key), -- cgit v1.2.3 From 13b79d3ffbb8add9e2a6d604db2b49f241b97303 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 21 Aug 2020 11:29:45 +0100 Subject: bpf: sockmap: Call sock_map_update_elem directly Don't go via map->ops to call sock_map_update_elem, since we know what function to call in bpf_map_update_value. Since we currently don't allow calling map_update_elem from BPF context, we can remove ops->map_update_elem and rename the function to sock_map_update_elem_sys. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200821102948.21918-4-lmb@cloudflare.com --- kernel/bpf/syscall.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 689d736b6904..b46e973faee9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -157,10 +157,11 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, if (bpf_map_is_dev_bound(map)) { return bpf_map_offload_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || - map->map_type == BPF_MAP_TYPE_SOCKHASH || - map->map_type == BPF_MAP_TYPE_SOCKMAP || map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { return map->ops->map_update_elem(map, key, value, flags); + } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH || + map->map_type == BPF_MAP_TYPE_SOCKMAP) { + return sock_map_update_elem_sys(map, key, value, flags); } else if (IS_FD_PROG_ARRAY(map)) { return bpf_fd_array_map_update_elem(map, f.file, key, value, flags); -- cgit v1.2.3 From 912f442cfb1fc695510e055bdae5f4a88e4de6b8 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 21 Aug 2020 11:29:46 +0100 Subject: bpf: Override the meaning of ARG_PTR_TO_MAP_VALUE for sockmap and sockhash The verifier assumes that map values are simple blobs of memory, and therefore treats ARG_PTR_TO_MAP_VALUE, etc. as such. However, there are map types where this isn't true. For example, sockmap and sockhash store sockets. In general this isn't a big problem: we can just write helpers that explicitly requests PTR_TO_SOCKET instead of ARG_PTR_TO_MAP_VALUE. The one exception are the standard map helpers like map_update_elem, map_lookup_elem, etc. Here it would be nice we could overload the function prototype for different kinds of maps. Unfortunately, this isn't entirely straight forward: We only know the type of the map once we have resolved meta->map_ptr in check_func_arg. This means we can't swap out the prototype in check_helper_call until we're half way through the function. Instead, modify check_func_arg to treat ARG_PTR_TO_MAP_VALUE to mean "the native type for the map" instead of "pointer to memory" for sockmap and sockhash. This means we don't have to modify the function prototype at all Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200821102948.21918-5-lmb@cloudflare.com --- kernel/bpf/verifier.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ef938f17b944..f8629bf848fe 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3872,6 +3872,33 @@ static int int_ptr_type_to_size(enum bpf_arg_type type) return -EINVAL; } +static int resolve_map_arg_type(struct bpf_verifier_env *env, + const struct bpf_call_arg_meta *meta, + enum bpf_arg_type *arg_type) +{ + if (!meta->map_ptr) { + /* kernel subsystem misconfigured verifier */ + verbose(env, "invalid map_ptr to access map->type\n"); + return -EACCES; + } + + switch (meta->map_ptr->map_type) { + case BPF_MAP_TYPE_SOCKMAP: + case BPF_MAP_TYPE_SOCKHASH: + if (*arg_type == ARG_PTR_TO_MAP_VALUE) { + *arg_type = ARG_PTR_TO_SOCKET; + } else { + verbose(env, "invalid arg_type for sockmap/sockhash\n"); + return -EINVAL; + } + break; + + default: + break; + } + return 0; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn) @@ -3904,6 +3931,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return -EACCES; } + if (arg_type == ARG_PTR_TO_MAP_VALUE || + arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || + arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { + err = resolve_map_arg_type(env, meta, &arg_type); + if (err) + return err; + } + if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE || arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || -- cgit v1.2.3 From 0126240f448d5bba29d0d1593aa527d3bf67b916 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 21 Aug 2020 11:29:47 +0100 Subject: bpf: sockmap: Allow update from BPF Allow calling bpf_map_update_elem on sockmap and sockhash from a BPF context. The synchronization required for this is a bit fiddly: we need to prevent the socket from changing its state while we add it to the sockmap, since we rely on getting a callback via sk_prot->unhash. However, we can't just lock_sock like in sock_map_sk_acquire because that might sleep. So instead we disable softirq processing and use bh_lock_sock to prevent further modification. Yet, this is still not enough. BPF can be called in contexts where the current CPU might have locked a socket. If the BPF can get a hold of such a socket, inserting it into a sockmap would lead to a deadlock. One straight forward example are sock_ops programs that have ctx->sk, but the same problem exists for kprobes, etc. We deal with this by allowing sockmap updates only from known safe contexts. Improper usage is rejected by the verifier. I've audited the enabled contexts to make sure they can't run in a locked context. It's possible that CGROUP_SKB and others are safe as well, but the auditing here is much more difficult. In any case, we can extend the safe contexts when the need arises. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200821102948.21918-6-lmb@cloudflare.com --- kernel/bpf/verifier.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f8629bf848fe..dd24503ab3d3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4178,6 +4178,38 @@ err_type: return -EACCES; } +static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) +{ + enum bpf_attach_type eatype = env->prog->expected_attach_type; + enum bpf_prog_type type = env->prog->type; + + if (func_id != BPF_FUNC_map_update_elem) + return false; + + /* It's not possible to get access to a locked struct sock in these + * contexts, so updating is safe. + */ + switch (type) { + case BPF_PROG_TYPE_TRACING: + if (eatype == BPF_TRACE_ITER) + return true; + break; + case BPF_PROG_TYPE_SOCKET_FILTER: + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_SK_REUSEPORT: + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_SK_LOOKUP: + return true; + default: + break; + } + + verbose(env, "cannot update sockmap in this context\n"); + return false; +} + static int check_map_func_compatibility(struct bpf_verifier_env *env, struct bpf_map *map, int func_id) { @@ -4249,7 +4281,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_map && func_id != BPF_FUNC_sk_select_reuseport && - func_id != BPF_FUNC_map_lookup_elem) + func_id != BPF_FUNC_map_lookup_elem && + !may_update_sockmap(env, func_id)) goto error; break; case BPF_MAP_TYPE_SOCKHASH: @@ -4258,7 +4291,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_hash && func_id != BPF_FUNC_sk_select_reuseport && - func_id != BPF_FUNC_map_lookup_elem) + func_id != BPF_FUNC_map_lookup_elem && + !may_update_sockmap(env, func_id)) goto error; break; case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: -- cgit v1.2.3 From 71419b30cab099f7ca37e61bf41028d8b7d4984d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 14 Aug 2020 12:19:34 +0200 Subject: timekeeping: Utilize local_clock() for NMI safe timekeeper during early boot During early boot the NMI safe timekeeper returns 0 until the first clocksource becomes available. This prevents it from being used for printk or other facilities which today use sched clock. sched clock can be available way before timekeeping is initialized. The obvious workaround for this is to utilize the early sched clock in the default dummy clock read function until a clocksource becomes available. After switching to the clocksource clock MONOTONIC and BOOTTIME will not jump because the timekeeping_init() bases clock MONOTONIC on sched clock and the offset between clock MONOTONIC and BOOTTIME is zero during boot. Clock REALTIME cannot provide useful timestamps during early boot up to the point where a persistent clock becomes available, which is either in timekeeping_init() or later when the RTC driver which might depend on I2C or other subsystems is initialized. There is a minor difference to sched_clock() vs. suspend/resume. As the timekeeper clock source might not be accessible during suspend, after timekeeping_suspend() timestamps freeze up to the point where timekeeping_resume() is invoked. OTOH this is true for some sched clock implementations as well. Signed-off-by: Thomas Gleixner Tested-by: Petr Mladek Link: https://lore.kernel.org/r/20200814115512.041422402@linutronix.de --- kernel/time/timekeeping.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4c47f388a83f..57d064d94cc2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -54,6 +54,9 @@ static struct { static struct timekeeper shadow_timekeeper; +/* flag for if timekeeping is suspended */ +int __read_mostly timekeeping_suspended; + /** * struct tk_fast - NMI safe timekeeper * @seq: Sequence counter for protecting updates. The lowest bit @@ -73,28 +76,42 @@ static u64 cycles_at_suspend; static u64 dummy_clock_read(struct clocksource *cs) { - return cycles_at_suspend; + if (timekeeping_suspended) + return cycles_at_suspend; + return local_clock(); } static struct clocksource dummy_clock = { .read = dummy_clock_read, }; +/* + * Boot time initialization which allows local_clock() to be utilized + * during early boot when clocksources are not available. local_clock() + * returns nanoseconds already so no conversion is required, hence mult=1 + * and shift=0. When the first proper clocksource is installed then + * the fast time keepers are updated with the correct values. + */ +#define FAST_TK_INIT \ + { \ + .clock = &dummy_clock, \ + .mask = CLOCKSOURCE_MASK(64), \ + .mult = 1, \ + .shift = 0, \ + } + static struct tk_fast tk_fast_mono ____cacheline_aligned = { .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_mono.seq, &timekeeper_lock), - .base[0] = { .clock = &dummy_clock, }, - .base[1] = { .clock = &dummy_clock, }, + .base[0] = FAST_TK_INIT, + .base[1] = FAST_TK_INIT, }; static struct tk_fast tk_fast_raw ____cacheline_aligned = { .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_raw.seq, &timekeeper_lock), - .base[0] = { .clock = &dummy_clock, }, - .base[1] = { .clock = &dummy_clock, }, + .base[0] = FAST_TK_INIT, + .base[1] = FAST_TK_INIT, }; -/* flag for if timekeeping is suspended */ -int __read_mostly timekeeping_suspended; - static inline void tk_normalize_xtime(struct timekeeper *tk) { while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { -- cgit v1.2.3 From e2d977c9f1abd1d199b412f8f83c1727808b794d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 14 Aug 2020 12:19:35 +0200 Subject: timekeeping: Provide multi-timestamp accessor to NMI safe timekeeper printk wants to store various timestamps (MONOTONIC, REALTIME, BOOTTIME) to make correlation of dmesg from several systems easier. Provide an interface to retrieve all three timestamps in one go. There are some caveats: 1) Boot time and late sleep time injection Boot time is a racy access on 32bit systems if the sleep time injection happens late during resume and not in timekeeping_resume(). That could be avoided by expanding struct tk_read_base with boot offset for 32bit and adding more overhead to the update. As this is a hard to observe once per resume event which can be filtered with reasonable effort using the accurate mono/real timestamps, it's probably not worth the trouble. Aside of that it might be possible on 32 and 64 bit to observe the following when the sleep time injection happens late: CPU 0 CPU 1 timekeeping_resume() ktime_get_fast_timestamps() mono, real = __ktime_get_real_fast() inject_sleep_time() update boot offset boot = mono + bootoffset; That means that boot time already has the sleep time adjustment, but real time does not. On the next readout both are in sync again. Preventing this for 64bit is not really feasible without destroying the careful cache layout of the timekeeper because the sequence count and struct tk_read_base would then need two cache lines instead of one. 2) Suspend/resume timestamps Access to the time keeper clock source is disabled accross the innermost steps of suspend/resume. The accessors still work, but the timestamps are frozen until time keeping is resumed which happens very early. For regular suspend/resume there is no observable difference vs. sched clock, but it might affect some of the nasty low level debug printks. OTOH, access to sched clock is not guaranteed accross suspend/resume on all systems either so it depends on the hardware in use. If that turns out to be a real problem then this could be mitigated by using sched clock in a similar way as during early boot. But it's not as trivial as on early boot because it needs some careful protection against the clock monotonic timestamp jumping backwards on resume. Signed-off-by: Thomas Gleixner Tested-by: Petr Mladek Link: https://lore.kernel.org/r/20200814115512.159981360@linutronix.de --- kernel/time/timekeeping.c | 76 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 65 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 57d064d94cc2..ba7657685e22 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -530,29 +530,29 @@ u64 notrace ktime_get_boot_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); - /* * See comment for __ktime_get_fast_ns() vs. timestamp ordering */ -static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf) +static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) { struct tk_read_base *tkr; + u64 basem, baser, delta; unsigned int seq; - u64 now; do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); - now = ktime_to_ns(tkr->base_real); + basem = ktime_to_ns(tkr->base); + baser = ktime_to_ns(tkr->base_real); - now += timekeeping_delta_to_ns(tkr, - clocksource_delta( - tk_clock_read(tkr), - tkr->cycle_last, - tkr->mask)); + delta = timekeeping_delta_to_ns(tkr, + clocksource_delta(tk_clock_read(tkr), + tkr->cycle_last, tkr->mask)); } while (read_seqcount_retry(&tkf->seq, seq)); - return now; + if (mono) + *mono = basem + delta; + return baser + delta; } /** @@ -560,10 +560,64 @@ static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf) */ u64 ktime_get_real_fast_ns(void) { - return __ktime_get_real_fast_ns(&tk_fast_mono); + return __ktime_get_real_fast(&tk_fast_mono, NULL); } EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); +/** + * ktime_get_fast_timestamps: - NMI safe timestamps + * @snapshot: Pointer to timestamp storage + * + * Stores clock monotonic, boottime and realtime timestamps. + * + * Boot time is a racy access on 32bit systems if the sleep time injection + * happens late during resume and not in timekeeping_resume(). That could + * be avoided by expanding struct tk_read_base with boot offset for 32bit + * and adding more overhead to the update. As this is a hard to observe + * once per resume event which can be filtered with reasonable effort using + * the accurate mono/real timestamps, it's probably not worth the trouble. + * + * Aside of that it might be possible on 32 and 64 bit to observe the + * following when the sleep time injection happens late: + * + * CPU 0 CPU 1 + * timekeeping_resume() + * ktime_get_fast_timestamps() + * mono, real = __ktime_get_real_fast() + * inject_sleep_time() + * update boot offset + * boot = mono + bootoffset; + * + * That means that boot time already has the sleep time adjustment, but + * real time does not. On the next readout both are in sync again. + * + * Preventing this for 64bit is not really feasible without destroying the + * careful cache layout of the timekeeper because the sequence count and + * struct tk_read_base would then need two cache lines instead of one. + * + * Access to the time keeper clock source is disabled accross the innermost + * steps of suspend/resume. The accessors still work, but the timestamps + * are frozen until time keeping is resumed which happens very early. + * + * For regular suspend/resume there is no observable difference vs. sched + * clock, but it might affect some of the nasty low level debug printks. + * + * OTOH, access to sched clock is not guaranteed accross suspend/resume on + * all systems either so it depends on the hardware in use. + * + * If that turns out to be a real problem then this could be mitigated by + * using sched clock in a similar way as during early boot. But it's not as + * trivial as on early boot because it needs some careful protection + * against the clock monotonic timestamp jumping backwards on resume. + */ +void ktime_get_fast_timestamps(struct ktime_timestamps *snapshot) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + snapshot->real = __ktime_get_real_fast(&tk_fast_mono, &snapshot->mono); + snapshot->boot = snapshot->mono + ktime_to_ns(data_race(tk->offs_boot)); +} + /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. -- cgit v1.2.3 From 0f8ad5f2e93425812c393c91ceb5af3d95e79b10 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 3 Jul 2020 15:40:29 +0200 Subject: kcsan: Add support for atomic builtins Some architectures (currently e.g. s390 partially) implement atomics using the compiler's atomic builtins (__atomic_*, __sync_*). To support enabling KCSAN on such architectures in future, or support experimental use of these builtins, implement support for them. We should also avoid breaking KCSAN kernels due to use (accidental or otherwise) of atomic builtins in drivers, as has happened in the past: https://lkml.kernel.org/r/5231d2c0-41d9-6721-e15f-a7eedf3ce69e@infradead.org The instrumentation is subtly different from regular reads/writes: TSAN instrumentation replaces the use of atomic builtins with a call into the runtime, and the runtime's job is to also execute the desired atomic operation. We rely on the __atomic_* compiler builtins, available with all KCSAN-supported compilers, to implement each TSAN atomic instrumentation function. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 9147ff6a12e5..682d9fd76733 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -879,3 +879,113 @@ void __tsan_init(void) { } EXPORT_SYMBOL(__tsan_init); + +/* + * Instrumentation for atomic builtins (__atomic_*, __sync_*). + * + * Normal kernel code _should not_ be using them directly, but some + * architectures may implement some or all atomics using the compilers' + * builtins. + * + * Note: If an architecture decides to fully implement atomics using the + * builtins, because they are implicitly instrumented by KCSAN (and KASAN, + * etc.), implementing the ARCH_ATOMIC interface (to get instrumentation via + * atomic-instrumented) is no longer necessary. + * + * TSAN instrumentation replaces atomic accesses with calls to any of the below + * functions, whose job is to also execute the operation itself. + */ + +#define DEFINE_TSAN_ATOMIC_LOAD_STORE(bits) \ + u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder); \ + u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder) \ + { \ + check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC); \ + return __atomic_load_n(ptr, memorder); \ + } \ + EXPORT_SYMBOL(__tsan_atomic##bits##_load); \ + void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder); \ + void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder) \ + { \ + check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + __atomic_store_n(ptr, v, memorder); \ + } \ + EXPORT_SYMBOL(__tsan_atomic##bits##_store) + +#define DEFINE_TSAN_ATOMIC_RMW(op, bits, suffix) \ + u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \ + u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \ + { \ + check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + return __atomic_##op##suffix(ptr, v, memorder); \ + } \ + EXPORT_SYMBOL(__tsan_atomic##bits##_##op) + +/* + * Note: CAS operations are always classified as write, even in case they + * fail. We cannot perform check_access() after a write, as it might lead to + * false positives, in cases such as: + * + * T0: __atomic_compare_exchange_n(&p->flag, &old, 1, ...) + * + * T1: if (__atomic_load_n(&p->flag, ...)) { + * modify *p; + * p->flag = 0; + * } + * + * The only downside is that, if there are 3 threads, with one CAS that + * succeeds, another CAS that fails, and an unmarked racing operation, we may + * point at the wrong CAS as the source of the race. However, if we assume that + * all CAS can succeed in some other execution, the data race is still valid. + */ +#define DEFINE_TSAN_ATOMIC_CMPXCHG(bits, strength, weak) \ + int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \ + u##bits val, int mo, int fail_mo); \ + int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \ + u##bits val, int mo, int fail_mo) \ + { \ + check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + return __atomic_compare_exchange_n(ptr, exp, val, weak, mo, fail_mo); \ + } \ + EXPORT_SYMBOL(__tsan_atomic##bits##_compare_exchange_##strength) + +#define DEFINE_TSAN_ATOMIC_CMPXCHG_VAL(bits) \ + u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \ + int mo, int fail_mo); \ + u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \ + int mo, int fail_mo) \ + { \ + check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + __atomic_compare_exchange_n(ptr, &exp, val, 0, mo, fail_mo); \ + return exp; \ + } \ + EXPORT_SYMBOL(__tsan_atomic##bits##_compare_exchange_val) + +#define DEFINE_TSAN_ATOMIC_OPS(bits) \ + DEFINE_TSAN_ATOMIC_LOAD_STORE(bits); \ + DEFINE_TSAN_ATOMIC_RMW(exchange, bits, _n); \ + DEFINE_TSAN_ATOMIC_RMW(fetch_add, bits, ); \ + DEFINE_TSAN_ATOMIC_RMW(fetch_sub, bits, ); \ + DEFINE_TSAN_ATOMIC_RMW(fetch_and, bits, ); \ + DEFINE_TSAN_ATOMIC_RMW(fetch_or, bits, ); \ + DEFINE_TSAN_ATOMIC_RMW(fetch_xor, bits, ); \ + DEFINE_TSAN_ATOMIC_RMW(fetch_nand, bits, ); \ + DEFINE_TSAN_ATOMIC_CMPXCHG(bits, strong, 0); \ + DEFINE_TSAN_ATOMIC_CMPXCHG(bits, weak, 1); \ + DEFINE_TSAN_ATOMIC_CMPXCHG_VAL(bits) + +DEFINE_TSAN_ATOMIC_OPS(8); +DEFINE_TSAN_ATOMIC_OPS(16); +DEFINE_TSAN_ATOMIC_OPS(32); +DEFINE_TSAN_ATOMIC_OPS(64); + +void __tsan_atomic_thread_fence(int memorder); +void __tsan_atomic_thread_fence(int memorder) +{ + __atomic_thread_fence(memorder); +} +EXPORT_SYMBOL(__tsan_atomic_thread_fence); + +void __tsan_atomic_signal_fence(int memorder); +void __tsan_atomic_signal_fence(int memorder) { } +EXPORT_SYMBOL(__tsan_atomic_signal_fence); -- cgit v1.2.3 From f9ea63193135473ed6b6ff06f016eb6248100041 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 3 Jul 2020 15:40:31 +0200 Subject: kcsan: Add atomic builtin test case Adds test case to kcsan-test module, to test atomic builtin instrumentation works. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/kcsan-test.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c index fed6fcb5768c..721180cbbab1 100644 --- a/kernel/kcsan/kcsan-test.c +++ b/kernel/kcsan/kcsan-test.c @@ -390,6 +390,15 @@ static noinline void test_kernel_seqlock_writer(void) write_sequnlock_irqrestore(&test_seqlock, flags); } +static noinline void test_kernel_atomic_builtins(void) +{ + /* + * Generate concurrent accesses, expecting no reports, ensuring KCSAN + * treats builtin atomics as actually atomic. + */ + __atomic_load_n(&test_var, __ATOMIC_RELAXED); +} + /* ===== Test cases ===== */ /* Simple test with normal data race. */ @@ -852,6 +861,59 @@ static void test_seqlock_noreport(struct kunit *test) KUNIT_EXPECT_FALSE(test, match_never); } +/* + * Test atomic builtins work and required instrumentation functions exist. We + * also test that KCSAN understands they're atomic by racing with them via + * test_kernel_atomic_builtins(), and expect no reports. + * + * The atomic builtins _SHOULD NOT_ be used in normal kernel code! + */ +static void test_atomic_builtins(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_atomic_builtins, test_kernel_atomic_builtins); + do { + long tmp; + + kcsan_enable_current(); + + __atomic_store_n(&test_var, 42L, __ATOMIC_RELAXED); + KUNIT_EXPECT_EQ(test, 42L, __atomic_load_n(&test_var, __ATOMIC_RELAXED)); + + KUNIT_EXPECT_EQ(test, 42L, __atomic_exchange_n(&test_var, 20, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 20L, test_var); + + tmp = 20L; + KUNIT_EXPECT_TRUE(test, __atomic_compare_exchange_n(&test_var, &tmp, 30L, + 0, __ATOMIC_RELAXED, + __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, tmp, 20L); + KUNIT_EXPECT_EQ(test, test_var, 30L); + KUNIT_EXPECT_FALSE(test, __atomic_compare_exchange_n(&test_var, &tmp, 40L, + 1, __ATOMIC_RELAXED, + __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, tmp, 30L); + KUNIT_EXPECT_EQ(test, test_var, 30L); + + KUNIT_EXPECT_EQ(test, 30L, __atomic_fetch_add(&test_var, 1, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 31L, __atomic_fetch_sub(&test_var, 1, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 30L, __atomic_fetch_and(&test_var, 0xf, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 14L, __atomic_fetch_xor(&test_var, 0xf, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 1L, __atomic_fetch_or(&test_var, 0xf0, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, 241L, __atomic_fetch_nand(&test_var, 0xf, __ATOMIC_RELAXED)); + KUNIT_EXPECT_EQ(test, -2L, test_var); + + __atomic_thread_fence(__ATOMIC_SEQ_CST); + __atomic_signal_fence(__ATOMIC_SEQ_CST); + + kcsan_disable_current(); + + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + /* * Each test case is run with different numbers of threads. Until KUnit supports * passing arguments for each test case, we encode #threads in the test case @@ -891,6 +953,7 @@ static struct kunit_case kcsan_test_cases[] = { KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped), KCSAN_KUNIT_CASE(test_jiffies_noreport), KCSAN_KUNIT_CASE(test_seqlock_noreport), + KCSAN_KUNIT_CASE(test_atomic_builtins), {}, }; -- cgit v1.2.3 From 14e2ac8de0f91f12122a49f09897b0cd05256460 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 24 Jul 2020 09:00:01 +0200 Subject: kcsan: Support compounded read-write instrumentation Add support for compounded read-write instrumentation if supported by the compiler. Adds the necessary instrumentation functions, and a new type which is used to generate a more descriptive report. Furthermore, such compounded memory access instrumentation is excluded from the "assume aligned writes up to word size are atomic" rule, because we cannot assume that the compiler emits code that is atomic for compound ops. LLVM/Clang added support for the feature in: https://github.com/llvm/llvm-project/commit/785d41a261d136b64ab6c15c5d35f2adc5ad53e3 The new instrumentation is emitted for sets of memory accesses in the same basic block to the same address with at least one read appearing before a write. These typically result from compound operations such as ++, --, +=, -=, |=, &=, etc. but also equivalent forms such as "var = var + 1". Where the compiler determines that it is equivalent to emit a call to a single __tsan_read_write instead of separate __tsan_read and __tsan_write, we can then benefit from improved performance and better reporting for such access patterns. The new reports now show that the ops are both reads and writes, for example: read-write to 0xffffffff90548a38 of 8 bytes by task 143 on cpu 3: test_kernel_rmw_array+0x45/0xa0 access_thread+0x71/0xb0 kthread+0x21e/0x240 ret_from_fork+0x22/0x30 read-write to 0xffffffff90548a38 of 8 bytes by task 144 on cpu 2: test_kernel_rmw_array+0x45/0xa0 access_thread+0x71/0xb0 kthread+0x21e/0x240 ret_from_fork+0x22/0x30 Acked-by: Peter Zijlstra (Intel) Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 23 ++++++++++++++++++----- kernel/kcsan/report.c | 4 ++++ 2 files changed, 22 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 682d9fd76733..4c8b40b14314 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -223,7 +223,7 @@ is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) && (type & KCSAN_ACCESS_WRITE) && size <= sizeof(long) && - IS_ALIGNED((unsigned long)ptr, size)) + !(type & KCSAN_ACCESS_COMPOUND) && IS_ALIGNED((unsigned long)ptr, size)) return true; /* Assume aligned writes up to word size are atomic. */ if (ctx->atomic_next > 0) { @@ -793,7 +793,17 @@ EXPORT_SYMBOL(__kcsan_check_access); EXPORT_SYMBOL(__tsan_write##size); \ void __tsan_unaligned_write##size(void *ptr) \ __alias(__tsan_write##size); \ - EXPORT_SYMBOL(__tsan_unaligned_write##size) + EXPORT_SYMBOL(__tsan_unaligned_write##size); \ + void __tsan_read_write##size(void *ptr); \ + void __tsan_read_write##size(void *ptr) \ + { \ + check_access(ptr, size, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE); \ + } \ + EXPORT_SYMBOL(__tsan_read_write##size); \ + void __tsan_unaligned_read_write##size(void *ptr) \ + __alias(__tsan_read_write##size); \ + EXPORT_SYMBOL(__tsan_unaligned_read_write##size) DEFINE_TSAN_READ_WRITE(1); DEFINE_TSAN_READ_WRITE(2); @@ -916,7 +926,8 @@ EXPORT_SYMBOL(__tsan_init); u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \ u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \ { \ - check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ return __atomic_##op##suffix(ptr, v, memorder); \ } \ EXPORT_SYMBOL(__tsan_atomic##bits##_##op) @@ -944,7 +955,8 @@ EXPORT_SYMBOL(__tsan_init); int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \ u##bits val, int mo, int fail_mo) \ { \ - check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ return __atomic_compare_exchange_n(ptr, exp, val, weak, mo, fail_mo); \ } \ EXPORT_SYMBOL(__tsan_atomic##bits##_compare_exchange_##strength) @@ -955,7 +967,8 @@ EXPORT_SYMBOL(__tsan_init); u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \ int mo, int fail_mo) \ { \ - check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ __atomic_compare_exchange_n(ptr, &exp, val, 0, mo, fail_mo); \ return exp; \ } \ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 9d07e175de0f..3e83a69239fa 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -228,6 +228,10 @@ static const char *get_access_type(int type) return "write"; case KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: return "write (marked)"; + case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE: + return "read-write"; + case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: + return "read-write (marked)"; case KCSAN_ACCESS_SCOPED: return "read (scoped)"; case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC: -- cgit v1.2.3 From 106a307fd0a762e2d47e1cf99e6da43763887a18 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 24 Jul 2020 09:00:03 +0200 Subject: kcsan: Skew delay to be longer for certain access types For compound instrumentation and assert accesses, skew the watchpoint delay to be longer if randomized. This is useful to improve race detection for such accesses. For compound accesses we should increase the delay as we've aggregated both read and write instrumentation. By giving up 1 call into the runtime, we're less likely to set up a watchpoint and thus less likely to detect a race. We can balance this by increasing the watchpoint delay. For assert accesses, we know these are of increased interest, and we wish to increase our chances of detecting races for such checks. Note that, kcsan_udelay_{task,interrupt} define the upper bound delays. When randomized, delays are uniformly distributed between [0, delay]. Skewing the delay does not break this promise as long as the defined upper bounds are still adhered to. The current skew results in delays uniformly distributed between [delay/2, delay]. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 4c8b40b14314..95a364e22aab 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -283,11 +283,15 @@ static __always_inline bool kcsan_is_enabled(void) return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0; } -static inline unsigned int get_delay(void) +static inline unsigned int get_delay(int type) { unsigned int delay = in_task() ? kcsan_udelay_task : kcsan_udelay_interrupt; + /* For certain access types, skew the random delay to be longer. */ + unsigned int skew_delay_order = + (type & (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_ASSERT)) ? 1 : 0; + return delay - (IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ? - prandom_u32_max(delay) : + prandom_u32_max(delay >> skew_delay_order) : 0); } @@ -470,7 +474,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * Delay this thread, to increase probability of observing a racy * conflicting access. */ - udelay(get_delay()); + udelay(get_delay(type)); /* * Re-read value, and check if it is as expected; if not, we infer a -- cgit v1.2.3 From 9d1335cc1e97cc3da0d14f640dd716e614083e8b Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 24 Jul 2020 09:00:04 +0200 Subject: kcsan: Add missing CONFIG_KCSAN_IGNORE_ATOMICS checks Add missing CONFIG_KCSAN_IGNORE_ATOMICS checks for the builtin atomics instrumentation. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 95a364e22aab..99e5044bc942 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -914,14 +914,19 @@ EXPORT_SYMBOL(__tsan_init); u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder); \ u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder) \ { \ - check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC); \ + if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ + check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC); \ + } \ return __atomic_load_n(ptr, memorder); \ } \ EXPORT_SYMBOL(__tsan_atomic##bits##_load); \ void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder); \ void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder) \ { \ - check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + } \ __atomic_store_n(ptr, v, memorder); \ } \ EXPORT_SYMBOL(__tsan_atomic##bits##_store) @@ -930,8 +935,11 @@ EXPORT_SYMBOL(__tsan_init); u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \ u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \ { \ - check_access(ptr, bits / BITS_PER_BYTE, \ - KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ + KCSAN_ACCESS_ATOMIC); \ + } \ return __atomic_##op##suffix(ptr, v, memorder); \ } \ EXPORT_SYMBOL(__tsan_atomic##bits##_##op) @@ -959,8 +967,11 @@ EXPORT_SYMBOL(__tsan_init); int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \ u##bits val, int mo, int fail_mo) \ { \ - check_access(ptr, bits / BITS_PER_BYTE, \ - KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ + KCSAN_ACCESS_ATOMIC); \ + } \ return __atomic_compare_exchange_n(ptr, exp, val, weak, mo, fail_mo); \ } \ EXPORT_SYMBOL(__tsan_atomic##bits##_compare_exchange_##strength) @@ -971,8 +982,11 @@ EXPORT_SYMBOL(__tsan_init); u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \ int mo, int fail_mo) \ { \ - check_access(ptr, bits / BITS_PER_BYTE, \ - KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \ + if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ + check_access(ptr, bits / BITS_PER_BYTE, \ + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ + KCSAN_ACCESS_ATOMIC); \ + } \ __atomic_compare_exchange_n(ptr, &exp, val, 0, mo, fail_mo); \ return exp; \ } \ -- cgit v1.2.3 From bec4a2474890a6884eb890c778ea02bccaaae6eb Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 24 Jul 2020 09:00:05 +0200 Subject: kcsan: Test support for compound instrumentation Changes kcsan-test module to support checking reports that include compound instrumentation. Since we should not fail the test if this support is unavailable, we have to add a config variable that the test can use to decide what to check for. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/kcsan-test.c | 65 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c index 721180cbbab1..ebe7fd245104 100644 --- a/kernel/kcsan/kcsan-test.c +++ b/kernel/kcsan/kcsan-test.c @@ -27,6 +27,12 @@ #include #include +#ifdef CONFIG_CC_HAS_TSAN_COMPOUND_READ_BEFORE_WRITE +#define __KCSAN_ACCESS_RW(alt) (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE) +#else +#define __KCSAN_ACCESS_RW(alt) (alt) +#endif + /* Points to current test-case memory access "kernels". */ static void (*access_kernels[2])(void); @@ -186,20 +192,21 @@ static bool report_matches(const struct expect_report *r) /* Access 1 & 2 */ for (i = 0; i < 2; ++i) { + const int ty = r->access[i].type; const char *const access_type = - (r->access[i].type & KCSAN_ACCESS_ASSERT) ? - ((r->access[i].type & KCSAN_ACCESS_WRITE) ? - "assert no accesses" : - "assert no writes") : - ((r->access[i].type & KCSAN_ACCESS_WRITE) ? - "write" : - "read"); + (ty & KCSAN_ACCESS_ASSERT) ? + ((ty & KCSAN_ACCESS_WRITE) ? + "assert no accesses" : + "assert no writes") : + ((ty & KCSAN_ACCESS_WRITE) ? + ((ty & KCSAN_ACCESS_COMPOUND) ? + "read-write" : + "write") : + "read"); const char *const access_type_aux = - (r->access[i].type & KCSAN_ACCESS_ATOMIC) ? - " (marked)" : - ((r->access[i].type & KCSAN_ACCESS_SCOPED) ? - " (scoped)" : - ""); + (ty & KCSAN_ACCESS_ATOMIC) ? + " (marked)" : + ((ty & KCSAN_ACCESS_SCOPED) ? " (scoped)" : ""); if (i == 1) { /* Access 2 */ @@ -277,6 +284,12 @@ static noinline void test_kernel_write_atomic(void) WRITE_ONCE(test_var, READ_ONCE_NOCHECK(test_sink) + 1); } +static noinline void test_kernel_atomic_rmw(void) +{ + /* Use builtin, so we can set up the "bad" atomic/non-atomic scenario. */ + __atomic_fetch_add(&test_var, 1, __ATOMIC_RELAXED); +} + __no_kcsan static noinline void test_kernel_write_uninstrumented(void) { test_var++; } @@ -439,8 +452,8 @@ static void test_concurrent_races(struct kunit *test) const struct expect_report expect = { .access = { /* NULL will match any address. */ - { test_kernel_rmw_array, NULL, 0, KCSAN_ACCESS_WRITE }, - { test_kernel_rmw_array, NULL, 0, 0 }, + { test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) }, + { test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(0) }, }, }; static const struct expect_report never = { @@ -629,6 +642,29 @@ static void test_read_plain_atomic_write(struct kunit *test) KUNIT_EXPECT_TRUE(test, match_expect); } +/* Test that atomic RMWs generate correct report. */ +__no_kcsan +static void test_read_plain_atomic_rmw(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { test_kernel_atomic_rmw, &test_var, sizeof(test_var), + KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC }, + }, + }; + bool match_expect = false; + + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) + return; + + begin_test_checks(test_kernel_read, test_kernel_atomic_rmw); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + /* Zero-sized accesses should never cause data race reports. */ __no_kcsan static void test_zero_size_access(struct kunit *test) @@ -942,6 +978,7 @@ static struct kunit_case kcsan_test_cases[] = { KCSAN_KUNIT_CASE(test_write_write_struct_part), KCSAN_KUNIT_CASE(test_read_atomic_write_atomic), KCSAN_KUNIT_CASE(test_read_plain_atomic_write), + KCSAN_KUNIT_CASE(test_read_plain_atomic_rmw), KCSAN_KUNIT_CASE(test_zero_size_access), KCSAN_KUNIT_CASE(test_data_race), KCSAN_KUNIT_CASE(test_assert_exclusive_writer), -- cgit v1.2.3 From 69b2c81bc894606670204f0ae08f406dbcce836d Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 31 Jul 2020 10:17:19 +0200 Subject: kcsan: Simplify debugfs counter to name mapping Simplify counter ID to name mapping by using an array with designated inits. This way, we can turn a run-time BUG() into a compile-time static assertion failure if a counter name is missing. No functional change intended. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/debugfs.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 023e49c58d55..3a9566addeff 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -19,6 +19,18 @@ * Statistics counters. */ static atomic_long_t counters[KCSAN_COUNTER_COUNT]; +static const char *const counter_names[] = { + [KCSAN_COUNTER_USED_WATCHPOINTS] = "used_watchpoints", + [KCSAN_COUNTER_SETUP_WATCHPOINTS] = "setup_watchpoints", + [KCSAN_COUNTER_DATA_RACES] = "data_races", + [KCSAN_COUNTER_ASSERT_FAILURES] = "assert_failures", + [KCSAN_COUNTER_NO_CAPACITY] = "no_capacity", + [KCSAN_COUNTER_REPORT_RACES] = "report_races", + [KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN] = "races_unknown_origin", + [KCSAN_COUNTER_UNENCODABLE_ACCESSES] = "unencodable_accesses", + [KCSAN_COUNTER_ENCODING_FALSE_POSITIVES] = "encoding_false_positives", +}; +static_assert(ARRAY_SIZE(counter_names) == KCSAN_COUNTER_COUNT); /* * Addresses for filtering functions from reporting. This list can be used as a @@ -39,24 +51,6 @@ static struct { }; static DEFINE_SPINLOCK(report_filterlist_lock); -static const char *counter_to_name(enum kcsan_counter_id id) -{ - switch (id) { - case KCSAN_COUNTER_USED_WATCHPOINTS: return "used_watchpoints"; - case KCSAN_COUNTER_SETUP_WATCHPOINTS: return "setup_watchpoints"; - case KCSAN_COUNTER_DATA_RACES: return "data_races"; - case KCSAN_COUNTER_ASSERT_FAILURES: return "assert_failures"; - case KCSAN_COUNTER_NO_CAPACITY: return "no_capacity"; - case KCSAN_COUNTER_REPORT_RACES: return "report_races"; - case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: return "races_unknown_origin"; - case KCSAN_COUNTER_UNENCODABLE_ACCESSES: return "unencodable_accesses"; - case KCSAN_COUNTER_ENCODING_FALSE_POSITIVES: return "encoding_false_positives"; - case KCSAN_COUNTER_COUNT: - BUG(); - } - return NULL; -} - void kcsan_counter_inc(enum kcsan_counter_id id) { atomic_long_inc(&counters[id]); @@ -271,8 +265,7 @@ static int show_info(struct seq_file *file, void *v) /* show stats */ seq_printf(file, "enabled: %i\n", READ_ONCE(kcsan_enabled)); for (i = 0; i < KCSAN_COUNTER_COUNT; ++i) - seq_printf(file, "%s: %ld\n", counter_to_name(i), - atomic_long_read(&counters[i])); + seq_printf(file, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i])); /* show filter functions, and filter type */ spin_lock_irqsave(&report_filterlist_lock, flags); -- cgit v1.2.3 From a4e74fa5f0d3e2a11f2d0deb522681d219a81426 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 31 Jul 2020 10:17:20 +0200 Subject: kcsan: Simplify constant string handling Simplify checking prefixes and length calculation of constant strings. For the former, the kernel provides str_has_prefix(), and the latter we should just use strlen("..") because GCC and Clang have optimizations that optimize these into constants. No functional change intended. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/debugfs.c | 8 ++++---- kernel/kcsan/report.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 3a9566addeff..116bdd8f050c 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -300,16 +300,16 @@ debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *o WRITE_ONCE(kcsan_enabled, true); } else if (!strcmp(arg, "off")) { WRITE_ONCE(kcsan_enabled, false); - } else if (!strncmp(arg, "microbench=", sizeof("microbench=") - 1)) { + } else if (str_has_prefix(arg, "microbench=")) { unsigned long iters; - if (kstrtoul(&arg[sizeof("microbench=") - 1], 0, &iters)) + if (kstrtoul(&arg[strlen("microbench=")], 0, &iters)) return -EINVAL; microbenchmark(iters); - } else if (!strncmp(arg, "test=", sizeof("test=") - 1)) { + } else if (str_has_prefix(arg, "test=")) { unsigned long iters; - if (kstrtoul(&arg[sizeof("test=") - 1], 0, &iters)) + if (kstrtoul(&arg[strlen("test=")], 0, &iters)) return -EINVAL; test_thread(iters); } else if (!strcmp(arg, "whitelist")) { diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 3e83a69239fa..bf1d59449805 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -279,8 +279,8 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries cur = strnstr(buf, "kcsan_", len); if (cur) { - cur += sizeof("kcsan_") - 1; - if (strncmp(cur, "test", sizeof("test") - 1)) + cur += strlen("kcsan_"); + if (!str_has_prefix(cur, "test")) continue; /* KCSAN runtime function. */ /* KCSAN related test. */ } -- cgit v1.2.3 From 4700ccdf18fa3002d66769b69cf715cc58beea37 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 31 Jul 2020 10:17:21 +0200 Subject: kcsan: Remove debugfs test command Remove the debugfs test command, as it is no longer needed now that we have the KUnit+Torture based kcsan-test module. This is to avoid confusion around how KCSAN should be tested, as only the kcsan-test module is maintained. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/debugfs.c | 66 -------------------------------------------------- 1 file changed, 66 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 116bdd8f050c..de1da1b01aa4 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -98,66 +98,6 @@ static noinline void microbenchmark(unsigned long iters) current->kcsan_ctx = ctx_save; } -/* - * Simple test to create conflicting accesses. Write 'test=' to KCSAN's - * debugfs file from multiple tasks to generate real conflicts and show reports. - */ -static long test_dummy; -static long test_flags; -static long test_scoped; -static noinline void test_thread(unsigned long iters) -{ - const long CHANGE_BITS = 0xff00ff00ff00ff00L; - const struct kcsan_ctx ctx_save = current->kcsan_ctx; - cycles_t cycles; - - /* We may have been called from an atomic region; reset context. */ - memset(¤t->kcsan_ctx, 0, sizeof(current->kcsan_ctx)); - - pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); - pr_info("test_dummy@%px, test_flags@%px, test_scoped@%px,\n", - &test_dummy, &test_flags, &test_scoped); - - cycles = get_cycles(); - while (iters--) { - /* These all should generate reports. */ - __kcsan_check_read(&test_dummy, sizeof(test_dummy)); - ASSERT_EXCLUSIVE_WRITER(test_dummy); - ASSERT_EXCLUSIVE_ACCESS(test_dummy); - - ASSERT_EXCLUSIVE_BITS(test_flags, ~CHANGE_BITS); /* no report */ - __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */ - - ASSERT_EXCLUSIVE_BITS(test_flags, CHANGE_BITS); /* report */ - __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */ - - /* not actually instrumented */ - WRITE_ONCE(test_dummy, iters); /* to observe value-change */ - __kcsan_check_write(&test_dummy, sizeof(test_dummy)); - - test_flags ^= CHANGE_BITS; /* generate value-change */ - __kcsan_check_write(&test_flags, sizeof(test_flags)); - - BUG_ON(current->kcsan_ctx.scoped_accesses.prev); - { - /* Should generate reports anywhere in this block. */ - ASSERT_EXCLUSIVE_WRITER_SCOPED(test_scoped); - ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_scoped); - BUG_ON(!current->kcsan_ctx.scoped_accesses.prev); - /* Unrelated accesses. */ - __kcsan_check_access(&cycles, sizeof(cycles), 0); - __kcsan_check_access(&cycles, sizeof(cycles), KCSAN_ACCESS_ATOMIC); - } - BUG_ON(current->kcsan_ctx.scoped_accesses.prev); - } - cycles = get_cycles() - cycles; - - pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles); - - /* restore context */ - current->kcsan_ctx = ctx_save; -} - static int cmp_filterlist_addrs(const void *rhs, const void *lhs) { const unsigned long a = *(const unsigned long *)rhs; @@ -306,12 +246,6 @@ debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *o if (kstrtoul(&arg[strlen("microbench=")], 0, &iters)) return -EINVAL; microbenchmark(iters); - } else if (str_has_prefix(arg, "test=")) { - unsigned long iters; - - if (kstrtoul(&arg[strlen("test=")], 0, &iters)) - return -EINVAL; - test_thread(iters); } else if (!strcmp(arg, "whitelist")) { set_report_filterlist_whitelist(true); } else if (!strcmp(arg, "blacklist")) { -- cgit v1.2.3 From 2778793072c31e3eb33842f3bd7da82dfc7efc6b Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 31 Jul 2020 10:17:22 +0200 Subject: kcsan: Show message if enabled early Show a message in the kernel log if KCSAN was enabled early. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 99e5044bc942..b176400a2735 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) "kcsan: " fmt + #include #include #include @@ -463,7 +465,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (IS_ENABLED(CONFIG_KCSAN_DEBUG)) { kcsan_disable_current(); - pr_err("KCSAN: watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n", + pr_err("watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n", is_write ? "write" : "read", size, ptr, watchpoint_slot((unsigned long)ptr), encode_watchpoint((unsigned long)ptr, size, is_write)); @@ -623,8 +625,10 @@ void __init kcsan_init(void) * We are in the init task, and no other tasks should be running; * WRITE_ONCE without memory barrier is sufficient. */ - if (kcsan_early_enable) + if (kcsan_early_enable) { + pr_info("enabled early\n"); WRITE_ONCE(kcsan_enabled, true); + } } /* === Exported interface =================================================== */ -- cgit v1.2.3 From 178a1877d782c034f466edd80e30a107af5469df Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 31 Jul 2020 10:17:23 +0200 Subject: kcsan: Use pr_fmt for consistency Use the same pr_fmt throughout for consistency. [ The only exception is report.c, where the format must be kept precisely as-is. ] Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/debugfs.c | 8 +++++--- kernel/kcsan/selftest.c | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index de1da1b01aa4..6c4914fa2fad 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) "kcsan: " fmt + #include #include #include @@ -80,7 +82,7 @@ static noinline void microbenchmark(unsigned long iters) */ WRITE_ONCE(kcsan_enabled, false); - pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters); + pr_info("%s begin | iters: %lu\n", __func__, iters); cycles = get_cycles(); while (iters--) { @@ -91,7 +93,7 @@ static noinline void microbenchmark(unsigned long iters) } cycles = get_cycles() - cycles; - pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles); + pr_info("%s end | cycles: %llu\n", __func__, cycles); WRITE_ONCE(kcsan_enabled, was_enabled); /* restore context */ @@ -154,7 +156,7 @@ static ssize_t insert_report_filterlist(const char *func) ssize_t ret = 0; if (!addr) { - pr_err("KCSAN: could not find function: '%s'\n", func); + pr_err("could not find function: '%s'\n", func); return -ENOENT; } diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index d26a052d3383..d98bc208d06d 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) "kcsan: " fmt + #include #include #include @@ -116,16 +118,16 @@ static int __init kcsan_selftest(void) if (do_test()) \ ++passed; \ else \ - pr_err("KCSAN selftest: " #do_test " failed"); \ + pr_err("selftest: " #do_test " failed"); \ } while (0) RUN_TEST(test_requires); RUN_TEST(test_encode_decode); RUN_TEST(test_matching_access); - pr_info("KCSAN selftest: %d/%d tests passed\n", passed, total); + pr_info("selftest: %d/%d tests passed\n", passed, total); if (passed != total) - panic("KCSAN selftests failed"); + panic("selftests failed"); return 0; } postcore_initcall(kcsan_selftest); -- cgit v1.2.3 From 2e986b81f698e73c95e6456183f27b861f47bb87 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 10 Aug 2020 10:06:25 +0200 Subject: kcsan: Optimize debugfs stats counters Remove kcsan_counter_inc/dec() functions, as they perform no other logic, and are no longer needed. This avoids several calls in kcsan_setup_watchpoint() and kcsan_found_watchpoint(), as well as lets the compiler warn us about potential out-of-bounds accesses as the array's size is known at all usage sites at compile-time. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 22 +++++++++++----------- kernel/kcsan/debugfs.c | 21 +++++---------------- kernel/kcsan/kcsan.h | 12 ++++++------ kernel/kcsan/report.c | 2 +- 4 files changed, 23 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index b176400a2735..8a1ff605ff2d 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -367,13 +367,13 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, * already removed the watchpoint, or another thread consumed * the watchpoint before this thread. */ - kcsan_counter_inc(KCSAN_COUNTER_REPORT_RACES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_REPORT_RACES]); } if ((type & KCSAN_ACCESS_ASSERT) != 0) - kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]); else - kcsan_counter_inc(KCSAN_COUNTER_DATA_RACES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_DATA_RACES]); user_access_restore(flags); } @@ -414,7 +414,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) goto out; if (!check_encodable((unsigned long)ptr, size)) { - kcsan_counter_inc(KCSAN_COUNTER_UNENCODABLE_ACCESSES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_UNENCODABLE_ACCESSES]); goto out; } @@ -434,12 +434,12 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * with which should_watch() returns true should be tweaked so * that this case happens very rarely. */ - kcsan_counter_inc(KCSAN_COUNTER_NO_CAPACITY); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_NO_CAPACITY]); goto out_unlock; } - kcsan_counter_inc(KCSAN_COUNTER_SETUP_WATCHPOINTS); - kcsan_counter_inc(KCSAN_COUNTER_USED_WATCHPOINTS); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_SETUP_WATCHPOINTS]); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]); /* * Read the current value, to later check and infer a race if the data @@ -541,16 +541,16 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * increment this counter. */ if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE) - kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]); kcsan_report(ptr, size, type, value_change, KCSAN_REPORT_RACE_SIGNAL, watchpoint - watchpoints); } else if (value_change == KCSAN_VALUE_CHANGE_TRUE) { /* Inferring a race, since the value should not have changed. */ - kcsan_counter_inc(KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN]); if (is_assert) - kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]); if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE, @@ -563,7 +563,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * reused after this point. */ remove_watchpoint(watchpoint); - kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS); + atomic_long_dec(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]); out_unlock: if (!kcsan_interrupt_watcher) local_irq_restore(irq_flags); diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 6c4914fa2fad..3c8093a371b1 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -17,10 +17,7 @@ #include "kcsan.h" -/* - * Statistics counters. - */ -static atomic_long_t counters[KCSAN_COUNTER_COUNT]; +atomic_long_t kcsan_counters[KCSAN_COUNTER_COUNT]; static const char *const counter_names[] = { [KCSAN_COUNTER_USED_WATCHPOINTS] = "used_watchpoints", [KCSAN_COUNTER_SETUP_WATCHPOINTS] = "setup_watchpoints", @@ -53,16 +50,6 @@ static struct { }; static DEFINE_SPINLOCK(report_filterlist_lock); -void kcsan_counter_inc(enum kcsan_counter_id id) -{ - atomic_long_inc(&counters[id]); -} - -void kcsan_counter_dec(enum kcsan_counter_id id) -{ - atomic_long_dec(&counters[id]); -} - /* * The microbenchmark allows benchmarking KCSAN core runtime only. To run * multiple threads, pipe 'microbench=' from multiple tasks into the @@ -206,8 +193,10 @@ static int show_info(struct seq_file *file, void *v) /* show stats */ seq_printf(file, "enabled: %i\n", READ_ONCE(kcsan_enabled)); - for (i = 0; i < KCSAN_COUNTER_COUNT; ++i) - seq_printf(file, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i])); + for (i = 0; i < KCSAN_COUNTER_COUNT; ++i) { + seq_printf(file, "%s: %ld\n", counter_names[i], + atomic_long_read(&kcsan_counters[i])); + } /* show filter functions, and filter type */ spin_lock_irqsave(&report_filterlist_lock, flags); diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 29480010dc30..8d4bf3431b3c 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -8,6 +8,7 @@ #ifndef _KERNEL_KCSAN_KCSAN_H #define _KERNEL_KCSAN_KCSAN_H +#include #include #include @@ -34,6 +35,10 @@ void kcsan_restore_irqtrace(struct task_struct *task); */ void kcsan_debugfs_init(void); +/* + * Statistics counters displayed via debugfs; should only be modified in + * slow-paths. + */ enum kcsan_counter_id { /* * Number of watchpoints currently in use. @@ -86,12 +91,7 @@ enum kcsan_counter_id { KCSAN_COUNTER_COUNT, /* number of counters */ }; - -/* - * Increment/decrement counter with given id; avoid calling these in fast-path. - */ -extern void kcsan_counter_inc(enum kcsan_counter_id id); -extern void kcsan_counter_dec(enum kcsan_counter_id id); +extern atomic_long_t kcsan_counters[KCSAN_COUNTER_COUNT]; /* * Returns true if data races in the function symbol that maps to func_addr diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index bf1d59449805..d3bf87e6007c 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -559,7 +559,7 @@ static bool prepare_report_consumer(unsigned long *flags, * If the actual accesses to not match, this was a false * positive due to watchpoint encoding. */ - kcsan_counter_inc(KCSAN_COUNTER_ENCODING_FALSE_POSITIVES); + atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ENCODING_FALSE_POSITIVES]); goto discard; } -- cgit v1.2.3 From ebc3505d507cf0aafdc31e4b2359c9b22b3927c8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Jun 2020 13:25:26 -0700 Subject: rcu: Remove KCSAN stubs KCSAN is now in mainline, so this commit removes the stubs for the data_race(), ASSERT_EXCLUSIVE_WRITER(), and ASSERT_EXCLUSIVE_ACCESS() macros. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8ce77d9ac716..eb36779697ca 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -70,19 +70,6 @@ #endif #define MODULE_PARAM_PREFIX "rcutree." -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - /* Data structures. */ /* -- cgit v1.2.3 From beb27bd649a08655b6e15b71265fccad9c00bd2c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Jun 2020 13:26:20 -0700 Subject: rcu: Remove KCSAN stubs from update.c KCSAN is now in mainline, so this commit removes the stubs for the data_race(), ASSERT_EXCLUSIVE_WRITER(), and ASSERT_EXCLUSIVE_ACCESS() macros. Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 2de49b5d8dd2..5f7713a27dbb 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -53,19 +53,6 @@ #endif #define MODULE_PARAM_PREFIX "rcupdate." -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - #ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); -- cgit v1.2.3 From d9b60741318f6f8bcb2adc4beaef724c923fcb93 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Jun 2020 13:24:04 -0700 Subject: srcu: Remove KCSAN stubs KCSAN is now in mainline, so this commit removes the stubs for the data_race(), ASSERT_EXCLUSIVE_WRITER(), and ASSERT_EXCLUSIVE_ACCESS() macros. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index c100acf332ed..c13348ee80a5 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -29,19 +29,6 @@ #include "rcu.h" #include "rcu_segcblist.h" -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - /* Holdoff in nanoseconds for auto-expediting. */ #define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000) static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; -- cgit v1.2.3 From 7487ea07dfa9bd782a13469cab18973ea0ab8c57 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 18 Jun 2020 09:51:12 -0700 Subject: rcu: Initialize at declaration time in rcu_exp_handler() This commit moves the initialization of the CONFIG_PREEMPT=n version of the rcu_exp_handler() function's rdp and rnp local variables into their respective declarations to save a couple lines of code. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1888c0eb1216..8760b6ead770 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -732,11 +732,9 @@ static void rcu_exp_need_qs(void) /* Invoked on each online non-idle CPU for expedited quiescent state. */ static void rcu_exp_handler(void *unused) { - struct rcu_data *rdp; - struct rcu_node *rnp; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rdp->mynode; - rdp = this_cpu_ptr(&rcu_data); - rnp = rdp->mynode; if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) return; -- cgit v1.2.3 From a7886e899fd8334a03d37e66ad10295d175725ea Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 18 Jun 2020 21:36:40 -0400 Subject: rcu/trace: Use gp_seq_req in acceleration's rcu_grace_period tracepoint During acceleration of CB, the rsp's gp_seq is rcu_seq_snap'd. This is the value used for acceleration - it is the value of gp_seq at which it is safe the execute all callbacks in the callback list. The rdp's gp_seq is not very useful for this scenario. Make rcu_grace_period report the gp_seq_req instead as it allows one to reason about how the acceleration works. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index eb36779697ca..896912034982 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1483,9 +1483,10 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) - trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccWaitCB")); + trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccWaitCB")); else - trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccReadyCB")); + trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB")); + return ret; } -- cgit v1.2.3 From e082c7b38185af0f59e55efff840939c35391f85 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Jun 2020 09:25:34 -0700 Subject: nocb: Clarify RCU nocb CPU error message A message of the form "rcu: !!! lDTs ." can be tracked down, but doing so is not trivial. This commit therefore eases this process by adding text so that this error message now reads as follows: "rcu: nocb GP activity on CB-only CPU!!! lDTs ." Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 982fc5be5269..bbc0c07ce56e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2417,7 +2417,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) !waslocked && !wastimer && !wassleep) return; /* Nothing untowards. */ - pr_info(" !!! %c%c%c%c %c\n", + pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c%c %c\n", "lL"[waslocked], "dD"[!!rdp->nocb_defer_wakeup], "tT"[wastimer], -- cgit v1.2.3 From 9c39245382de4d52a122641952900709d4a9950b Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Mon, 22 Jun 2020 00:07:27 +0530 Subject: rcu/tree: Force quiescent state on callback overload On callback overload, it is necessary to quickly detect idle CPUs, and rcu_gp_fqs_check_wake() checks for this condition. Unfortunately, the code following the call to this function does not repeat this check, which means that in reality no actual quiescent-state forcing, instead only a couple of quick and pointless wakeups at the beginning of the grace period. This commit therefore adds a check for the RCU_GP_FLAG_OVLD flag in the post-wakeup "if" statement in rcu_gp_fqs_loop(). Fixes: 1fca4d12f4637 ("rcu: Expedite first two FQS scans under callback-overload conditions") Reviewed-by: Joel Fernandes (Google) Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 896912034982..4770d7709dc2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1884,7 +1884,7 @@ static void rcu_gp_fqs_loop(void) break; /* If time for quiescent-state forcing, do it. */ if (!time_after(rcu_state.jiffies_force_qs, jiffies) || - (gf & RCU_GP_FLAG_FQS)) { + (gf & (RCU_GP_FLAG_FQS | RCU_GP_FLAG_OVLD))) { trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsstart")); rcu_gp_fqs(first_gp_fqs); -- cgit v1.2.3 From 9b1ce0acb5e65e9ea1e6b322562d072f9f7d1f6e Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Mon, 22 Jun 2020 23:37:03 +0530 Subject: rcu/tree: Remove CONFIG_PREMPT_RCU check in force_qs_rnp() Originally, the call to rcu_preempt_blocked_readers_cgp() from force_qs_rnp() had to be conditioned on CONFIG_PREEMPT_RCU=y, as in commit a77da14ce9af ("rcu: Yet another fix for preemption and CPU hotplug"). However, there is now a CONFIG_PREEMPT_RCU=n definition of rcu_preempt_blocked_readers_cgp() that unconditionally returns zero, so invoking it is now safe. In addition, the CONFIG_PREEMPT_RCU=n definition of rcu_initiate_boost() simply releases the rcu_node structure's ->lock, which is what happens when the "if" condition evaluates to false. This commit therefore drops the IS_ENABLED(CONFIG_PREEMPT_RCU) check, so that rcu_initiate_boost() is called only in CONFIG_PREEMPT_RCU=y kernels when there are readers blocking the current grace period. This does not change the behavior, but reduces code-reader confusion by eliminating non-CONFIG_PREEMPT_RCU=y calls to rcu_initiate_boost(). Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4770d7709dc2..acc926f07dc1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2533,8 +2533,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) raw_spin_lock_irqsave_rcu_node(rnp, flags); rcu_state.cbovldnext |= !!rnp->cbovldmask; if (rnp->qsmask == 0) { - if (!IS_ENABLED(CONFIG_PREEMPT_RCU) || - rcu_preempt_blocked_readers_cgp(rnp)) { + if (rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they * are all zero. But we might need to -- cgit v1.2.3 From 2130c6b4f610ea65e9df71dfa79ee08f2fc17743 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Jun 2020 16:46:43 -0700 Subject: nocb: Remove show_rcu_nocb_state() false positive printout The rcu_data structure's ->nocb_timer field is used to defer wakeups of the corresponding no-CBs CPU's grace-period kthread ("rcuog*"), and that structure's ->nocb_defer_wakeup field is used to track such deferral. This means that the show_rcu_nocb_state() printing an error when those fields are set for a CPU not corresponding to a no-CBs grace-period kthread is erroneous. This commit therefore switches the check from ->nocb_timer to ->nocb_bypass_timer and removes the check of ->nocb_defer_wakeup. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index bbc0c07ce56e..4d63ee3de7a9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2411,10 +2411,9 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) return; waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock); - wastimer = timer_pending(&rdp->nocb_timer); + wastimer = timer_pending(&rdp->nocb_bypass_timer); wassleep = swait_active(&rdp->nocb_gp_wq); - if (!rdp->nocb_defer_wakeup && !rdp->nocb_gp_sleep && - !waslocked && !wastimer && !wassleep) + if (!rdp->nocb_gp_sleep && !waslocked && !wastimer && !wassleep) return; /* Nothing untowards. */ pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c%c %c\n", -- cgit v1.2.3 From b5374b2df0ac1c78895b8eb8d9582a7bdc67257d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Jun 2020 17:09:27 -0700 Subject: rcu: Add READ_ONCE() to rcu_do_batch() access to rcu_divisor Given that sysfs can change the value of rcu_divisor at any time, this commit adds a READ_ONCE to the sole access to that variable. While in the area, this commit also adds bounds checking, clamping the value to a shift that makes sense for a signed long. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index acc926f07dc1..1dca14cf66f9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2362,6 +2362,7 @@ int rcutree_dead_cpu(unsigned int cpu) */ static void rcu_do_batch(struct rcu_data *rdp) { + int div; unsigned long flags; const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rcu_segcblist_is_offloaded(&rdp->cblist); @@ -2390,7 +2391,9 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_nocb_lock(rdp); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); pending = rcu_segcblist_n_cbs(&rdp->cblist); - bl = max(rdp->blimit, pending >> rcu_divisor); + div = READ_ONCE(rcu_divisor); + div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; + bl = max(rdp->blimit, pending >> div); if (unlikely(bl > 100)) tlimit = local_clock() + rcu_resched_ns; trace_rcu_batch_start(rcu_state.name, -- cgit v1.2.3 From a2b354b9950bb859d8d959f951dda26725b041fb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Jun 2020 17:49:40 -0700 Subject: rcu: Add READ_ONCE() to rcu_do_batch() access to rcu_resched_ns Given that sysfs can change the value of rcu_resched_ns at any time, this commit adds a READ_ONCE() to the sole access to that variable. While in the area, this commit also adds bounds checking, clamping the value to at least a millisecond, but no longer than a second. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1dca14cf66f9..da05afc53493 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2394,8 +2394,12 @@ static void rcu_do_batch(struct rcu_data *rdp) div = READ_ONCE(rcu_divisor); div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; bl = max(rdp->blimit, pending >> div); - if (unlikely(bl > 100)) - tlimit = local_clock() + rcu_resched_ns; + if (unlikely(bl > 100)) { + long rrn = READ_ONCE(rcu_resched_ns); + + rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn; + tlimit = local_clock() + rrn; + } trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); -- cgit v1.2.3 From fe63b723cc7ca3a91ea91274e0f2cba29452b3fa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Jun 2020 18:04:45 -0700 Subject: rcu: Add READ_ONCE() to rcu_do_batch() access to rcu_kick_kthreads Given that sysfs can change the value of rcu_kick_kthreads at any time, this commit adds a READ_ONCE() to the sole access to that variable. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index b5d3b4794db4..a1780a621b5e 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -158,7 +158,7 @@ static void rcu_stall_kick_kthreads(void) { unsigned long j; - if (!rcu_kick_kthreads) + if (!READ_ONCE(rcu_kick_kthreads)) return; j = READ_ONCE(rcu_state.jiffies_kick_kthreads); if (time_after(jiffies, j) && rcu_state.gp_kthread && @@ -580,7 +580,7 @@ static void check_cpu_stall(struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; - if ((rcu_stall_is_suppressed() && !rcu_kick_kthreads) || + if ((rcu_stall_is_suppressed() && !READ_ONCE(rcu_kick_kthreads)) || !rcu_gp_in_progress()) return; rcu_stall_kick_kthreads(); -- cgit v1.2.3 From 1ef5a442a113d140580b3b8bbd6f50c9f7746397 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Jun 2020 20:57:59 -0700 Subject: rcu: Add READ_ONCE() to rcu_do_batch() access to rcu_cpu_stall_ftrace_dump Given that sysfs can change the value of rcu_cpu_stall_ftrace_dump at any time, this commit adds a READ_ONCE() to the accesses to that variable. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index a1780a621b5e..0fde39b8daab 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -623,7 +623,7 @@ static void check_cpu_stall(struct rcu_data *rdp) /* We haven't checked in, so go dump stack. */ print_cpu_stall(gps); - if (rcu_cpu_stall_ftrace_dump) + if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); } else if (rcu_gp_in_progress() && @@ -632,7 +632,7 @@ static void check_cpu_stall(struct rcu_data *rdp) /* They had a few time units to dump stack, so complain. */ print_other_cpu_stall(gs2, gps); - if (rcu_cpu_stall_ftrace_dump) + if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); } } -- cgit v1.2.3 From c0f97f20e5d97a1358ade650fcf6a322c0c9bc72 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Jul 2020 20:22:05 -0700 Subject: rcu: Move rcu_cpu_started per-CPU variable to rcu_data When the rcu_cpu_started per-CPU variable was added by commit f64c6013a202 ("rcu/x86: Provide early rcu_cpu_starting() callback"), there were multiple sets of per-CPU rcu_data structures. Therefore, the rcu_cpu_started flag was added as a separate per-CPU variable. But now there is only one set of per-CPU rcu_data structures, so this commit moves rcu_cpu_started to a new ->cpu_started field in that structure. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 ++++------- kernel/rcu/tree.h | 1 + 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index da05afc53493..52108dd92169 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3967,8 +3967,6 @@ int rcutree_offline_cpu(unsigned int cpu) return 0; } -static DEFINE_PER_CPU(int, rcu_cpu_started); - /* * Mark the specified CPU as being online so that subsequent grace periods * (both expedited and normal) will wait on it. Note that this means that @@ -3988,12 +3986,11 @@ void rcu_cpu_starting(unsigned int cpu) struct rcu_node *rnp; bool newcpu; - if (per_cpu(rcu_cpu_started, cpu)) + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->cpu_started) return; + rdp->cpu_started = true; - per_cpu(rcu_cpu_started, cpu) = 1; - - rdp = per_cpu_ptr(&rcu_data, cpu); rnp = rdp->mynode; mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -4053,7 +4050,7 @@ void rcu_report_dead(unsigned int cpu) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); raw_spin_unlock(&rcu_state.ofl_lock); - per_cpu(rcu_cpu_started, cpu) = 0; + rdp->cpu_started = false; } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index c96ae351688b..309bc7f41d35 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -156,6 +156,7 @@ struct rcu_data { bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */ + bool cpu_started; /* RCU watching this onlining CPU. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ unsigned long ticks_this_gp; /* The number of scheduling-clock */ -- cgit v1.2.3 From 4569c5ee95d5695bfd794ae968c2d59b3e69129a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Aug 2020 10:35:16 -0700 Subject: rcu/nocb: Add a warning for non-GP kthread running GP code This commit increases RCU's ability to defend itself by emitting a warning if one of the nocb CB kthreads invokes the GP kthread's wait function. This warning augments a similar check that is carried out at the end of rcutorture testing and when RCU CPU stall warnings are emitted. The problem with those checks is that the miscreants have long since departed and disposed of any and all evidence. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 4d63ee3de7a9..cb1e8c8befb9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1926,6 +1926,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) * nearest grace period (if any) to wait for next. The CB kthreads * and the global grace-period kthread are awakened if needed. */ + WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); rcu_nocb_lock_irqsave(rdp, flags); -- cgit v1.2.3 From f37599e6f06da47e49c3408afe66c5b6e83a90bd Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 7 Aug 2020 13:07:19 -0400 Subject: rcu: Clarify comments about FQS loop reporting quiescent states Since at least v4.19, the FQS loop no longer reports quiescent states for offline CPUs except in emergency situations. This commit therefore fixes the comment in rcu_gp_init() to match the current code. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 52108dd92169..2c7afe491c45 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1706,10 +1706,13 @@ static bool rcu_gp_init(void) raw_spin_unlock_irq_rcu_node(rnp); /* - * Apply per-leaf buffered online and offline operations to the - * rcu_node tree. Note that this new grace period need not wait - * for subsequent online CPUs, and that quiescent-state forcing - * will handle subsequent offline CPUs. + * Apply per-leaf buffered online and offline operations to + * the rcu_node tree. Note that this new grace period need not + * wait for subsequent online CPUs, and that RCU hooks in the CPU + * offlining path, when combined with checks in this function, + * will handle CPUs that are currently going offline or that will + * go offline later. Please also refer to "Hotplug CPU" section + * of RCU's Requirements documentation. */ rcu_state.gp_state = RCU_GP_ONOFF; rcu_for_each_leaf_node(rnp) { -- cgit v1.2.3 From 666ca2907e6b75960ce2f0fe50afc5d8a46f296d Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 7 Aug 2020 13:07:20 -0400 Subject: rcu: Make FQS more aggressive in complaining about offline CPUs The RCU grace-period kthread's force-quiescent state (FQS) loop should never see an offline CPU that has not yet reported a quiescent state. After all, the offline CPU should have reported a quiescent state during the CPU-offline process, or, failing that, by rcu_gp_init() if it ran concurrently with either the CPU going offline or the last task on a leaf rcu_node structure exiting its RCU read-side critical section while all CPUs corresponding to that structure are offline. The FQS loop should therefore complain if it does see an offline CPU that has not yet reported a quiescent state. And it does, but only once the grace period has been in force for a full second. This commit therefore makes this warning more aggressive, so that it will trigger as soon as the condition makes its appearance. Light testing with TREE03 and hotplug shows no warnings. This commit also converts the warning to WARN_ON_ONCE() in order to stave off possible log spam. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2c7afe491c45..396abe0e0d01 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1214,13 +1214,28 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 1; } - /* If waiting too long on an offline CPU, complain. */ - if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) && - time_after(jiffies, rcu_state.gp_start + HZ)) { + /* + * Complain if a CPU that is considered to be offline from RCU's + * perspective has not yet reported a quiescent state. After all, + * the offline CPU should have reported a quiescent state during + * the CPU-offline process, or, failing that, by rcu_gp_init() + * if it ran concurrently with either the CPU going offline or the + * last task on a leaf rcu_node structure exiting its RCU read-side + * critical section while all CPUs corresponding to that structure + * are offline. This added warning detects bugs in any of these + * code paths. + * + * The rcu_node structure's ->lock is held here, which excludes + * the relevant portions the CPU-hotplug code, the grace-period + * initialization code, and the rcu_read_unlock() code paths. + * + * For more detail, please refer to the "Hotplug CPU" section + * of RCU's Requirements documentation. + */ + if (WARN_ON_ONCE(!(rdp->grpmask & rcu_rnp_online_cpus(rnp)))) { bool onl; struct rcu_node *rnp1; - WARN_ON(1); /* Offline CPUs are supposed to report QS! */ pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", __func__, rnp->grplo, rnp->grphi, rnp->level, (long)rnp->gp_seq, (long)rnp->completedqs); -- cgit v1.2.3 From 7f2a53c231fe5d9522c3b695ab454203904031ac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 17 Aug 2020 10:37:22 -0700 Subject: rcu: Remove unused __rcu_is_watching() function The x86/entry work removed all uses of __rcu_is_watching(), therefore this commit removes it entirely. Cc: Andy Lutomirski Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Signed-off-by: Paul E. McKenney --- kernel/entry/common.c | 2 +- kernel/rcu/tree.c | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 9852e0d62d95..ad794a10fa80 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -278,7 +278,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * terminate a grace period, if and only if the timer interrupt is * not nested into another interrupt. * - * Checking for __rcu_is_watching() here would prevent the nesting + * Checking for rcu_is_watching() here would prevent the nesting * interrupt to invoke rcu_irq_enter(). If that nested interrupt is * the tick then rcu_flavor_sched_clock_irq() would wrongfully * assume that it is the first interupt and eventually claim diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 396abe0e0d01..232362293678 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1077,11 +1077,6 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) } } -noinstr bool __rcu_is_watching(void) -{ - return !rcu_dynticks_curr_cpu_in_eqs(); -} - /** * rcu_is_watching - see if RCU thinks that the current CPU is not idle * -- cgit v1.2.3 From e9d338a0b1799c988b678e8ccb66a442272e6aa3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 24 Jun 2020 15:59:59 -0700 Subject: scftorture: Add smp_call_function() torture test This commit adds an smp_call_function() torture test that repeatedly invokes this function and complains if things go badly awry. Signed-off-by: Paul E. McKenney --- kernel/Makefile | 2 + kernel/scftorture.c | 350 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 352 insertions(+) create mode 100644 kernel/scftorture.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 9a20016d4900..c45f551deaaa 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -133,6 +133,8 @@ KASAN_SANITIZE_stackleak.o := n KCSAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_stackleak.o := n +obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o + $(obj)/configs.o: $(obj)/config_data.gz targets += config_data.gz diff --git a/kernel/scftorture.c b/kernel/scftorture.c new file mode 100644 index 000000000000..44f1e49ba6e9 --- /dev/null +++ b/kernel/scftorture.c @@ -0,0 +1,350 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Torture test for smp_call_function() and friends. +// +// Copyright (C) Facebook, 2020. +// +// Author: Paul E. McKenney + +#define pr_fmt(fmt) fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SCFTORT_STRING "scftorture" +#define SCFTORT_FLAG SCFTORT_STRING ": " + +#define SCFTORTOUT(s, x...) \ + pr_alert(SCFTORT_FLAG s, ## x) + +#define VERBOSE_SCFTORTOUT(s, x...) \ + do { if (verbose) pr_alert(SCFTORT_FLAG s, ## x); } while (0) + +#define VERBOSE_SCFTORTOUT_ERRSTRING(s, x...) \ + do { if (verbose) pr_alert(SCFTORT_FLAG "!!! " s, ## x); } while (0) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney "); + +// Wait until there are multiple CPUs before starting test. +torture_param(int, holdoff, IS_BUILTIN(CONFIG_SCF_TORTURE_TEST) ? 10 : 0, + "Holdoff time before test start (s)"); +torture_param(int, longwait, 0, "Include ridiculously long waits? (seconds)"); +torture_param(int, nthreads, -1, "# threads, defaults to -1 for all CPUs."); +torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); +torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shutdown_secs, 0, "Shutdown time (ms), <= zero to disable."); +torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s."); +torture_param(int, stutter_cpus, 5, "Number of jiffies to change CPUs under test, 0=disable"); +torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU hotplug."); +torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); +torture_param(int, weight_single, -1, "Testing weight for single-CPU no-wait operations."); +torture_param(int, weight_single_wait, -1, "Testing weight for single-CPU operations."); +torture_param(int, weight_mult, -1, "Testing weight for multi-CPU no-wait operations."); +torture_param(int, weight_mult_wait, -1, "Testing weight for multi-CPU operations."); +torture_param(int, weight_all, -1, "Testing weight for all-CPU no-wait operations."); +torture_param(int, weight_all_wait, -1, "Testing weight for all-CPU operations."); + +char *torture_type = ""; + +#ifdef MODULE +# define SCFTORT_SHUTDOWN 0 +#else +# define SCFTORT_SHUTDOWN 1 +#endif + +torture_param(bool, shutdown, SCFTORT_SHUTDOWN, "Shutdown at end of torture test."); + +struct scf_statistics { + struct task_struct *task; + int cpu; + long long n_single; + long long n_single_wait; + long long n_multi; + long long n_multi_wait; + long long n_all; + long long n_all_wait; +}; + +static struct scf_statistics *scf_stats_p; +static struct task_struct *scf_torture_stats_task; +static DEFINE_PER_CPU(long long, scf_invoked_count); + +// Use to wait for all threads to start. +static atomic_t n_started; +static atomic_t n_errs; +static bool scfdone; + +DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); + +// Print torture statistics. Caller must ensure serialization. +static void scf_torture_stats_print(void) +{ + int cpu; + long long invoked_count = 0; + bool isdone = READ_ONCE(scfdone); + + for_each_possible_cpu(cpu) + invoked_count += data_race(per_cpu(scf_invoked_count, cpu)); + pr_alert("%s scf_invoked_count %s: %lld ", + SCFTORT_FLAG, isdone ? "VER" : "ver", invoked_count); + torture_onoff_stats(); + pr_cont("\n"); +} + +// Periodically prints torture statistics, if periodic statistics printing +// was specified via the stat_interval module parameter. +static int +scf_torture_stats(void *arg) +{ + VERBOSE_TOROUT_STRING("scf_torture_stats task started"); + do { + schedule_timeout_interruptible(stat_interval * HZ); + scf_torture_stats_print(); + torture_shutdown_absorb("scf_torture_stats"); + } while (!torture_must_stop()); + torture_kthread_stopping("scf_torture_stats"); + return 0; +} + +// Update statistics and occasionally burn up mass quantities of CPU time, +// if told to do so via scftorture.longwait. Otherwise, occasionally burn +// a little bit. +static void scf_handler(void *unused) +{ + int i; + int j; + unsigned long r = torture_random(this_cpu_ptr(&scf_torture_rand)); + + this_cpu_inc(scf_invoked_count); + if (longwait <= 0) { + if (!(r & 0xffc0)) + udelay(r & 0x3f); + return; + } + if (r & 0xfff) + return; + r = (r >> 12); + if (longwait <= 0) { + udelay((r & 0xff) + 1); + return; + } + r = r % longwait + 1; + for (i = 0; i < r; i++) { + for (j = 0; j < 1000; j++) { + udelay(1000); + cpu_relax(); + } + } +} + +// Randomly do an smp_call_function*() invocation. +static void scftorture_invoke_one(struct scf_statistics *scfp,struct torture_random_state *trsp) +{ + if (use_cpus_read_lock) + cpus_read_lock(); + else + preempt_disable(); + scfp->n_all++; + smp_call_function(scf_handler, NULL, 0); + if (use_cpus_read_lock) + cpus_read_unlock(); + else + preempt_enable(); + if (!(torture_random(trsp) & 0xfff)) + schedule_timeout_uninterruptible(1); +} + +// SCF test kthread. Repeatedly does calls to members of the +// smp_call_function() family of functions. +static int scftorture_invoker(void *arg) +{ + DEFINE_TORTURE_RANDOM(rand); + struct scf_statistics *scfp = (struct scf_statistics *)arg; + + VERBOSE_SCFTORTOUT("scftorture_invoker %d: task started", scfp->cpu); + set_cpus_allowed_ptr(current, cpumask_of(scfp->cpu % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); + + VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, smp_processor_id()); + + // Make sure that the CPU is affinitized appropriately during testing. + WARN_ON_ONCE(smp_processor_id() != scfp->cpu); + + if (!atomic_dec_return(&n_started)) + while (atomic_read_acquire(&n_started)) { + if (torture_must_stop()) { + VERBOSE_SCFTORTOUT("scftorture_invoker %d ended before starting", scfp->cpu); + goto end; + } + schedule_timeout_uninterruptible(1); + } + + VERBOSE_SCFTORTOUT("scftorture_invoker %d started", scfp->cpu); + + do { + scftorture_invoke_one(scfp, &rand); + } while (!torture_must_stop()); + + VERBOSE_SCFTORTOUT("scftorture_invoker %d ended", scfp->cpu); +end: + torture_kthread_stopping("scftorture_invoker"); + return 0; +} + +static void +scftorture_print_module_parms(const char *tag) +{ + pr_alert(SCFTORT_FLAG + "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter_cpus=%d use_cpus_read_lock=%d, weight_single=%d, weight_single_wait=%d, weight_mult=%d, weight_mult_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, + verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter_cpus, use_cpus_read_lock, weight_single, weight_single_wait, weight_mult, weight_mult_wait, weight_all, weight_all_wait); +} + +static void scf_cleanup_handler(void *unused) +{ +} + +static void scf_torture_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + WRITE_ONCE(scfdone, true); + if (nthreads) + for (i = 0; i < nthreads; i++) + torture_stop_kthread("scftorture_invoker", scf_stats_p[i].task); + else + goto end; + kfree(scf_stats_p); + scf_stats_p = NULL; + smp_call_function(scf_cleanup_handler, NULL, 0); + torture_stop_kthread(scf_torture_stats, scf_torture_stats_task); + scf_torture_stats_print(); // -After- the stats thread is stopped! + + if (atomic_read(&n_errs)) + scftorture_print_module_parms("End of test: FAILURE"); + else if (torture_onoff_failures()) + scftorture_print_module_parms("End of test: LOCK_HOTPLUG"); + else + scftorture_print_module_parms("End of test: SUCCESS"); + +end: + torture_cleanup_end(); +} + +static int __init scf_torture_init(void) +{ + long i; + int firsterr = 0; + + if (!torture_init_begin(SCFTORT_STRING, verbose)) + return -EBUSY; + + scftorture_print_module_parms("Start of test"); + + if (weight_single == -1 && weight_single_wait == -1 && + weight_mult == -1 && weight_mult_wait == -1 && + weight_all == -1 && weight_all_wait == -1) { + weight_single = 1; + weight_single_wait = 1; + weight_mult = 1; + weight_mult_wait = 1; + weight_all = 1; + weight_all_wait = 1; + } else { + if (weight_single == -1) + weight_single = 0; + if (weight_single_wait == -1) + weight_single_wait = 0; + if (weight_mult == -1) + weight_mult = 0; + if (weight_mult_wait == -1) + weight_mult_wait = 0; + if (weight_all == -1) + weight_all = 0; + if (weight_all_wait == -1) + weight_all_wait = 0; + } + if (weight_single == 0 && weight_single_wait == 0 && + weight_mult == 0 && weight_mult_wait == 0 && + weight_all == 0 && weight_all_wait == 0) { + firsterr = -EINVAL; + goto unwind; + } + + if (onoff_interval > 0) { + firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, NULL); + if (firsterr) + goto unwind; + } + if (shutdown_secs > 0) { + firsterr = torture_shutdown_init(shutdown_secs, scf_torture_cleanup); + if (firsterr) + goto unwind; + } + + // Worker tasks invoking smp_call_function(). + if (nthreads < 0) + nthreads = num_online_cpus(); + scf_stats_p = kcalloc(nthreads, sizeof(scf_stats_p[0]), GFP_KERNEL); + if (!scf_stats_p) { + VERBOSE_SCFTORTOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + + VERBOSE_SCFTORTOUT("Starting %d smp_call_function() threads\n", nthreads); + + atomic_set(&n_started, nthreads); + for (i = 0; i < nthreads; i++) { + scf_stats_p[i].cpu = i; + firsterr = torture_create_kthread(scftorture_invoker, (void *)&scf_stats_p[i], + scf_stats_p[i].task); + if (firsterr) + goto unwind; + } + if (stat_interval > 0) { + firsterr = torture_create_kthread(scf_torture_stats, NULL, scf_torture_stats_task); + if (firsterr) + goto unwind; + } + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + scf_torture_cleanup(); + return firsterr; +} + +module_init(scf_torture_init); +module_exit(scf_torture_cleanup); -- cgit v1.2.3 From 5022b8ac608f8b80b042a8041fe2738c4b9ea8cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Jun 2020 17:05:58 -0700 Subject: scftorture: Implement weighted primitive selection This commit uses the scftorture.weight* kernel parameters to randomly chooses between smp_call_function_single(), smp_call_function_many(), and smp_call_function(). For each variant, it also randomly chooses whether to invoke it synchronously (wait=1) or asynchronously (wait=0). The percentage weighting for each option are dumped to the console log (search for "scf_sel_dump"). This accumulates statistics, which a later commit will dump out at the end of the run. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 182 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 155 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 44f1e49ba6e9..5f1984522ae4 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -64,8 +64,8 @@ torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); torture_param(int, weight_single, -1, "Testing weight for single-CPU no-wait operations."); torture_param(int, weight_single_wait, -1, "Testing weight for single-CPU operations."); -torture_param(int, weight_mult, -1, "Testing weight for multi-CPU no-wait operations."); -torture_param(int, weight_mult_wait, -1, "Testing weight for multi-CPU operations."); +torture_param(int, weight_many, -1, "Testing weight for multi-CPU no-wait operations."); +torture_param(int, weight_many_wait, -1, "Testing weight for multi-CPU operations."); torture_param(int, weight_all, -1, "Testing weight for all-CPU no-wait operations."); torture_param(int, weight_all_wait, -1, "Testing weight for all-CPU operations."); @@ -83,9 +83,11 @@ struct scf_statistics { struct task_struct *task; int cpu; long long n_single; + long long n_single_ofl; long long n_single_wait; - long long n_multi; - long long n_multi_wait; + long long n_single_wait_ofl; + long long n_many; + long long n_many_wait; long long n_all; long long n_all_wait; }; @@ -94,6 +96,27 @@ static struct scf_statistics *scf_stats_p; static struct task_struct *scf_torture_stats_task; static DEFINE_PER_CPU(long long, scf_invoked_count); +// Data for random primitive selection +#define SCF_PRIM_SINGLE 0 +#define SCF_PRIM_MANY 1 +#define SCF_PRIM_ALL 2 +#define SCF_NPRIMS (2 * 3) // Need wait and no-wait versions of each. + +static char *scf_prim_name[] = { + "smp_call_function_single", + "smp_call_function_many", + "smp_call_function", +}; + +struct scf_selector { + unsigned long scfs_weight; + int scfs_prim; + bool scfs_wait; +}; +static struct scf_selector scf_sel_array[SCF_NPRIMS]; +static int scf_sel_array_len; +static unsigned long scf_sel_totweight; + // Use to wait for all threads to start. static atomic_t n_started; static atomic_t n_errs; @@ -131,6 +154,57 @@ scf_torture_stats(void *arg) return 0; } +// Add a primitive to the scf_sel_array[]. +static void scf_sel_add(unsigned long weight, int prim, bool wait) +{ + struct scf_selector *scfsp = &scf_sel_array[scf_sel_array_len]; + + // If no weight, if array would overflow, if computing three-place + // percentages would overflow, or if the scf_prim_name[] array would + // overflow, don't bother. In the last three two cases, complain. + if (!weight || + WARN_ON_ONCE(scf_sel_array_len >= ARRAY_SIZE(scf_sel_array)) || + WARN_ON_ONCE(0 - 100000 * weight <= 100000 * scf_sel_totweight) || + WARN_ON_ONCE(prim >= ARRAY_SIZE(scf_prim_name))) + return; + scf_sel_totweight += weight; + scfsp->scfs_weight = scf_sel_totweight; + scfsp->scfs_prim = prim; + scfsp->scfs_wait = wait; + scf_sel_array_len++; +} + +// Dump out weighting percentages for scf_prim_name[] array. +static void scf_sel_dump(void) +{ + int i; + unsigned long oldw = 0; + struct scf_selector *scfsp; + unsigned long w; + + for (i = 0; i < scf_sel_array_len; i++) { + scfsp = &scf_sel_array[i]; + w = (scfsp->scfs_weight - oldw) * 100000 / scf_sel_totweight; + pr_info("%s: %3lu.%03lu %s(%s)\n", __func__, w / 1000, w % 1000, + scf_prim_name[scfsp->scfs_prim], + scfsp->scfs_wait ? "wait" : "nowait"); + oldw = scfsp->scfs_weight; + } +} + +// Randomly pick a primitive and wait/nowait, based on weightings. +static struct scf_selector *scf_sel_rand(struct torture_random_state *trsp) +{ + int i; + unsigned long w = torture_random(trsp) % (scf_sel_totweight + 1); + + for (i = 0; i < scf_sel_array_len; i++) + if (scf_sel_array[i].scfs_weight >= w) + return &scf_sel_array[i]; + WARN_ON_ONCE(1); + return &scf_sel_array[0]; +} + // Update statistics and occasionally burn up mass quantities of CPU time, // if told to do so via scftorture.longwait. Otherwise, occasionally burn // a little bit. @@ -162,15 +236,55 @@ static void scf_handler(void *unused) } } +// As above, but check for correct CPU. +static void scf_handler_1(void *me) +{ + if (WARN_ON_ONCE(smp_processor_id() != (uintptr_t)me)) + atomic_inc(&n_errs); + scf_handler(NULL); +} + // Randomly do an smp_call_function*() invocation. -static void scftorture_invoke_one(struct scf_statistics *scfp,struct torture_random_state *trsp) +static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_random_state *trsp) { + uintptr_t cpu; + int ret; + struct scf_selector *scfsp = scf_sel_rand(trsp); + if (use_cpus_read_lock) cpus_read_lock(); else preempt_disable(); - scfp->n_all++; - smp_call_function(scf_handler, NULL, 0); + switch (scfsp->scfs_prim) { + case SCF_PRIM_SINGLE: + cpu = torture_random(trsp) % nr_cpu_ids; + if (scfsp->scfs_wait) + scfp->n_single_wait++; + else + scfp->n_single++; + ret = smp_call_function_single(cpu, scf_handler_1, (void *)cpu, scfsp->scfs_wait); + if (ret) { + if (scfsp->scfs_wait) + scfp->n_single_wait_ofl++; + else + scfp->n_single_ofl++; + } + break; + case SCF_PRIM_MANY: + if (scfsp->scfs_wait) + scfp->n_many_wait++; + else + scfp->n_many++; + smp_call_function_many(cpu_online_mask, scf_handler, NULL, scfsp->scfs_wait); + break; + case SCF_PRIM_ALL: + if (scfsp->scfs_wait) + scfp->n_all_wait++; + else + scfp->n_all++; + smp_call_function(scf_handler, NULL, scfsp->scfs_wait); + break; + } if (use_cpus_read_lock) cpus_read_unlock(); else @@ -222,8 +336,8 @@ static void scftorture_print_module_parms(const char *tag) { pr_alert(SCFTORT_FLAG - "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter_cpus=%d use_cpus_read_lock=%d, weight_single=%d, weight_single_wait=%d, weight_mult=%d, weight_mult_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, - verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter_cpus, use_cpus_read_lock, weight_single, weight_single_wait, weight_mult, weight_mult_wait, weight_all, weight_all_wait); + "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter_cpus=%d use_cpus_read_lock=%d, weight_single=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, + verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter_cpus, use_cpus_read_lock, weight_single, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait); } static void scf_cleanup_handler(void *unused) @@ -264,6 +378,12 @@ static int __init scf_torture_init(void) { long i; int firsterr = 0; + unsigned long weight_single1 = weight_single; + unsigned long weight_single_wait1 = weight_single_wait; + unsigned long weight_many1 = weight_many; + unsigned long weight_many_wait1 = weight_many_wait; + unsigned long weight_all1 = weight_all; + unsigned long weight_all_wait1 = weight_all_wait; if (!torture_init_begin(SCFTORT_STRING, verbose)) return -EBUSY; @@ -271,34 +391,42 @@ static int __init scf_torture_init(void) scftorture_print_module_parms("Start of test"); if (weight_single == -1 && weight_single_wait == -1 && - weight_mult == -1 && weight_mult_wait == -1 && + weight_many == -1 && weight_many_wait == -1 && weight_all == -1 && weight_all_wait == -1) { - weight_single = 1; - weight_single_wait = 1; - weight_mult = 1; - weight_mult_wait = 1; - weight_all = 1; - weight_all_wait = 1; + weight_single1 = 2 * nr_cpu_ids; + weight_single_wait1 = 2 * nr_cpu_ids; + weight_many1 = 2; + weight_many_wait1 = 2; + weight_all1 = 1; + weight_all_wait1 = 1; } else { if (weight_single == -1) - weight_single = 0; + weight_single1 = 0; if (weight_single_wait == -1) - weight_single_wait = 0; - if (weight_mult == -1) - weight_mult = 0; - if (weight_mult_wait == -1) - weight_mult_wait = 0; + weight_single_wait1 = 0; + if (weight_many == -1) + weight_many1 = 0; + if (weight_many_wait == -1) + weight_many_wait1 = 0; if (weight_all == -1) - weight_all = 0; + weight_all1 = 0; if (weight_all_wait == -1) - weight_all_wait = 0; + weight_all_wait1 = 0; } - if (weight_single == 0 && weight_single_wait == 0 && - weight_mult == 0 && weight_mult_wait == 0 && - weight_all == 0 && weight_all_wait == 0) { + if (weight_single1 == 0 && weight_single_wait1 == 0 && + weight_many1 == 0 && weight_many_wait1 == 0 && + weight_all1 == 0 && weight_all_wait1 == 0) { + VERBOSE_SCFTORTOUT_ERRSTRING("all zero weights makes no sense"); firsterr = -EINVAL; goto unwind; } + scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false); + scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true); + scf_sel_add(weight_many1, SCF_PRIM_MANY, false); + scf_sel_add(weight_many_wait1, SCF_PRIM_MANY, true); + scf_sel_add(weight_all1, SCF_PRIM_ALL, false); + scf_sel_add(weight_all_wait1, SCF_PRIM_ALL, true); + scf_sel_dump(); if (onoff_interval > 0) { firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, NULL); -- cgit v1.2.3 From bca37119c57bdc2c68c84b313a5118005e8693cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 26 Jun 2020 13:39:41 -0700 Subject: tick-sched: Clarify "NOHZ: local_softirq_pending" warning Currently, can_stop_idle_tick() prints "NOHZ: local_softirq_pending HH" (where "HH" is the hexadecimal softirq vector number) when one or more non-RCU softirq handlers are still enabled when checking to stop the scheduler-tick interrupt. This message is not as enlightening as one might hope, so this commit changes it to "NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #HH". Reported-by: Andy Lutomirski Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f0199a4ba1ad..81632cd5e3b7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -927,7 +927,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (ratelimit < 10 && (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { - pr_warn("NOHZ: local_softirq_pending %02x\n", + pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n", (unsigned int) local_softirq_pending()); ratelimit++; } -- cgit v1.2.3 From dba3142b37f343734bf61dbce2914acb76e69fb6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Jun 2020 16:13:37 -0700 Subject: scftorture: Summarize per-thread statistics This commit summarizes the per-thread statistics, providing counts of the number of single, many, and all calls, both no-wait and wait, and, for the single case, the number where the target CPU was offline. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 5f1984522ae4..09a62424bb8c 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -128,13 +128,27 @@ DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); static void scf_torture_stats_print(void) { int cpu; + int i; long long invoked_count = 0; bool isdone = READ_ONCE(scfdone); + struct scf_statistics scfs = {}; for_each_possible_cpu(cpu) invoked_count += data_race(per_cpu(scf_invoked_count, cpu)); - pr_alert("%s scf_invoked_count %s: %lld ", - SCFTORT_FLAG, isdone ? "VER" : "ver", invoked_count); + for (i = 0; i < nthreads; i++) { + scfs.n_single += scf_stats_p[i].n_single; + scfs.n_single_ofl += scf_stats_p[i].n_single_ofl; + scfs.n_single_wait += scf_stats_p[i].n_single_wait; + scfs.n_single_wait_ofl += scf_stats_p[i].n_single_wait_ofl; + scfs.n_many += scf_stats_p[i].n_many; + scfs.n_many_wait += scf_stats_p[i].n_many_wait; + scfs.n_all += scf_stats_p[i].n_all; + scfs.n_all_wait += scf_stats_p[i].n_all_wait; + } + pr_alert("%s scf_invoked_count %s: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ", + SCFTORT_FLAG, isdone ? "VER" : "ver", invoked_count, + scfs.n_single, scfs.n_single_wait, scfs.n_single_ofl, scfs.n_single_wait_ofl, + scfs.n_many, scfs.n_many_wait, scfs.n_all, scfs.n_all_wait); torture_onoff_stats(); pr_cont("\n"); } @@ -357,11 +371,11 @@ static void scf_torture_cleanup(void) torture_stop_kthread("scftorture_invoker", scf_stats_p[i].task); else goto end; - kfree(scf_stats_p); - scf_stats_p = NULL; smp_call_function(scf_cleanup_handler, NULL, 0); torture_stop_kthread(scf_torture_stats, scf_torture_stats_task); scf_torture_stats_print(); // -After- the stats thread is stopped! + kfree(scf_stats_p); // -After- the last stats print has completed! + scf_stats_p = NULL; if (atomic_read(&n_errs)) scftorture_print_module_parms("End of test: FAILURE"); -- cgit v1.2.3 From b93e21a51e1c8ed3816da888d34f88193ad1b917 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Jun 2020 20:49:50 -0700 Subject: scftorture: Add smp_call_function_single() memory-ordering checks This commit adds checks for memory misordering across calls to smp_call_function_single() and also across returns in the case where the caller waits. Misordering results in a splat. [ paulmck: s/GFP_KERNEL/GFP_ATOMIC/ per kernel test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 56 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 09a62424bb8c..9b42271d64f1 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -117,9 +117,20 @@ static struct scf_selector scf_sel_array[SCF_NPRIMS]; static int scf_sel_array_len; static unsigned long scf_sel_totweight; +// Communicate between caller and handler. +struct scf_check { + bool scfc_in; + bool scfc_out; + int scfc_cpu; // -1 for not _single(). + bool scfc_wait; +}; + // Use to wait for all threads to start. static atomic_t n_started; static atomic_t n_errs; +static atomic_t n_mb_in_errs; +static atomic_t n_mb_out_errs; +static atomic_t n_alloc_errs; static bool scfdone; DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); @@ -222,24 +233,27 @@ static struct scf_selector *scf_sel_rand(struct torture_random_state *trsp) // Update statistics and occasionally burn up mass quantities of CPU time, // if told to do so via scftorture.longwait. Otherwise, occasionally burn // a little bit. -static void scf_handler(void *unused) +static void scf_handler(void *scfc_in) { int i; int j; unsigned long r = torture_random(this_cpu_ptr(&scf_torture_rand)); + struct scf_check *scfcp = scfc_in; + if (likely(scfcp) && WARN_ON_ONCE(unlikely(!READ_ONCE(scfcp->scfc_in)))) + atomic_inc(&n_mb_in_errs); this_cpu_inc(scf_invoked_count); if (longwait <= 0) { if (!(r & 0xffc0)) udelay(r & 0x3f); - return; + goto out; } if (r & 0xfff) - return; + goto out; r = (r >> 12); if (longwait <= 0) { udelay((r & 0xff) + 1); - return; + goto out; } r = r % longwait + 1; for (i = 0; i < r; i++) { @@ -248,14 +262,24 @@ static void scf_handler(void *unused) cpu_relax(); } } +out: + if (unlikely(!scfcp)) + return; + if (scfcp->scfc_wait) + WRITE_ONCE(scfcp->scfc_out, true); + else + kfree(scfcp); } // As above, but check for correct CPU. -static void scf_handler_1(void *me) +static void scf_handler_1(void *scfc_in) { - if (WARN_ON_ONCE(smp_processor_id() != (uintptr_t)me)) + struct scf_check *scfcp = scfc_in; + + if (likely(scfcp) && WARN_ONCE(smp_processor_id() != scfcp->scfc_cpu, "%s: Wanted CPU %d got CPU %d\n", __func__, scfcp->scfc_cpu, smp_processor_id())) { atomic_inc(&n_errs); - scf_handler(NULL); + } + scf_handler(scfcp); } // Randomly do an smp_call_function*() invocation. @@ -263,6 +287,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra { uintptr_t cpu; int ret; + struct scf_check *scfcp = NULL; struct scf_selector *scfsp = scf_sel_rand(trsp); if (use_cpus_read_lock) @@ -271,17 +296,32 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra preempt_disable(); switch (scfsp->scfs_prim) { case SCF_PRIM_SINGLE: + scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); + if (WARN_ON_ONCE(!scfcp)) + atomic_inc(&n_alloc_errs); cpu = torture_random(trsp) % nr_cpu_ids; if (scfsp->scfs_wait) scfp->n_single_wait++; else scfp->n_single++; - ret = smp_call_function_single(cpu, scf_handler_1, (void *)cpu, scfsp->scfs_wait); + if (scfcp) { + scfcp->scfc_cpu = cpu; + scfcp->scfc_wait = scfsp->scfs_wait; + scfcp->scfc_out = false; + scfcp->scfc_in = true; + } + ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, scfsp->scfs_wait); if (ret) { if (scfsp->scfs_wait) scfp->n_single_wait_ofl++; else scfp->n_single_ofl++; + kfree(scfcp); + } else if (scfcp && scfsp->scfs_wait) { + if (WARN_ON_ONCE(!scfcp->scfc_out)) + atomic_inc(&n_mb_out_errs); // Leak rather than trash! + else + kfree(scfcp); } break; case SCF_PRIM_MANY: -- cgit v1.2.3 From 980205ee8489d53c4380f7762debac87312b0fb3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Jul 2020 12:30:02 -0700 Subject: scftorture: Add smp_call_function_many() memory-ordering checks This commit adds checks for memory misordering across calls to and returns from smp_call_function_many() in the case where the caller waits. Misordering results in a splat. Note that in contrast to smp_call_function_single(), this code does not test memory ordering into the handler in the no-wait case because none of the handlers would be able to free the scf_check structure without introducing heavy synchronization to work out which was last. [ paulmck: s/GFP_KERNEL/GFP_ATOMIC/ per kernel test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 9b42271d64f1..3519ad1b3278 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -240,8 +240,11 @@ static void scf_handler(void *scfc_in) unsigned long r = torture_random(this_cpu_ptr(&scf_torture_rand)); struct scf_check *scfcp = scfc_in; - if (likely(scfcp) && WARN_ON_ONCE(unlikely(!READ_ONCE(scfcp->scfc_in)))) - atomic_inc(&n_mb_in_errs); + if (likely(scfcp)) { + WRITE_ONCE(scfcp->scfc_out, false); // For multiple receivers. + if (WARN_ON_ONCE(unlikely(!READ_ONCE(scfcp->scfc_in)))) + atomic_inc(&n_mb_in_errs); + } this_cpu_inc(scf_invoked_count); if (longwait <= 0) { if (!(r & 0xffc0)) @@ -325,11 +328,28 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra } break; case SCF_PRIM_MANY: + if (scfsp->scfs_wait) { + scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); + if (WARN_ON_ONCE(!scfcp)) + atomic_inc(&n_alloc_errs); + } if (scfsp->scfs_wait) scfp->n_many_wait++; else scfp->n_many++; - smp_call_function_many(cpu_online_mask, scf_handler, NULL, scfsp->scfs_wait); + if (scfcp) { + scfcp->scfc_cpu = -1; + scfcp->scfc_wait = true; + scfcp->scfc_out = false; + scfcp->scfc_in = true; + } + smp_call_function_many(cpu_online_mask, scf_handler, scfcp, scfsp->scfs_wait); + if (scfcp) { + if (WARN_ON_ONCE(!scfcp->scfc_out)) + atomic_inc(&n_mb_out_errs); // Leak rather than trash! + else + kfree(scfcp); + } break; case SCF_PRIM_ALL: if (scfsp->scfs_wait) -- cgit v1.2.3 From 34e8c4837adb579962e528a4f7dd1f75cb120be4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Jul 2020 13:49:06 -0700 Subject: scftorture: Add smp_call_function() memory-ordering checks This commit adds checks for memory misordering across calls to and returns from smp_call_function() in the case where the caller waits. Misordering results in a splat. Note that in contrast to smp_call_function_single(), this code does not test memory ordering into the handler in the no-wait case because none of the handlers would be able to free the scf_check structure without introducing heavy synchronization to work out which was last. [ paulmck: s/GFP_KERNEL/GFP_ATOMIC/ per kernel test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 3519ad1b3278..0d7299d32dd0 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -297,11 +297,13 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra cpus_read_lock(); else preempt_disable(); - switch (scfsp->scfs_prim) { - case SCF_PRIM_SINGLE: + if (scfsp->scfs_prim == SCF_PRIM_SINGLE || scfsp->scfs_wait) { scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); if (WARN_ON_ONCE(!scfcp)) atomic_inc(&n_alloc_errs); + } + switch (scfsp->scfs_prim) { + case SCF_PRIM_SINGLE: cpu = torture_random(trsp) % nr_cpu_ids; if (scfsp->scfs_wait) scfp->n_single_wait++; @@ -328,11 +330,6 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra } break; case SCF_PRIM_MANY: - if (scfsp->scfs_wait) { - scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); - if (WARN_ON_ONCE(!scfcp)) - atomic_inc(&n_alloc_errs); - } if (scfsp->scfs_wait) scfp->n_many_wait++; else @@ -356,7 +353,19 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_all_wait++; else scfp->n_all++; - smp_call_function(scf_handler, NULL, scfsp->scfs_wait); + if (scfcp) { + scfcp->scfc_cpu = -1; + scfcp->scfc_wait = true; + scfcp->scfc_out = false; + scfcp->scfc_in = true; + } + smp_call_function(scf_handler, scfcp, scfsp->scfs_wait); + if (scfcp) { + if (WARN_ON_ONCE(!scfcp->scfc_out)) + atomic_inc(&n_mb_out_errs); // Leak rather than trash! + else + kfree(scfcp); + } break; } if (use_cpus_read_lock) -- cgit v1.2.3 From 676e5469643e716df7f39ef77ba8f09c85b0c4f8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Jul 2020 14:13:02 -0700 Subject: scftorture: Consolidate scftorture_invoke_one() check and kfree() This commit moves checking of the ->scfc_out field and the freeing of the scf_check structure down below the end of switch statement, thus saving a few lines of code. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 0d7299d32dd0..f220cd364e23 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -289,7 +289,7 @@ static void scf_handler_1(void *scfc_in) static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_random_state *trsp) { uintptr_t cpu; - int ret; + int ret = 0; struct scf_check *scfcp = NULL; struct scf_selector *scfsp = scf_sel_rand(trsp); @@ -322,11 +322,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra else scfp->n_single_ofl++; kfree(scfcp); - } else if (scfcp && scfsp->scfs_wait) { - if (WARN_ON_ONCE(!scfcp->scfc_out)) - atomic_inc(&n_mb_out_errs); // Leak rather than trash! - else - kfree(scfcp); + scfcp = NULL; } break; case SCF_PRIM_MANY: @@ -341,12 +337,6 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfcp->scfc_in = true; } smp_call_function_many(cpu_online_mask, scf_handler, scfcp, scfsp->scfs_wait); - if (scfcp) { - if (WARN_ON_ONCE(!scfcp->scfc_out)) - atomic_inc(&n_mb_out_errs); // Leak rather than trash! - else - kfree(scfcp); - } break; case SCF_PRIM_ALL: if (scfsp->scfs_wait) @@ -360,14 +350,14 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfcp->scfc_in = true; } smp_call_function(scf_handler, scfcp, scfsp->scfs_wait); - if (scfcp) { - if (WARN_ON_ONCE(!scfcp->scfc_out)) - atomic_inc(&n_mb_out_errs); // Leak rather than trash! - else - kfree(scfcp); - } break; } + if (scfcp && scfsp->scfs_wait) { + if (WARN_ON_ONCE(!scfcp->scfc_out)) + atomic_inc(&n_mb_out_errs); // Leak rather than trash! + else + kfree(scfcp); + } if (use_cpus_read_lock) cpus_read_unlock(); else -- cgit v1.2.3 From 4df55bddc1a360e94c86e227fe417ac9422cb615 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 9 Jul 2020 13:58:32 -0700 Subject: scftorture: Consolidate scftorture_invoke_one() scf_check initialization This commit hoists much of the initialization of the scf_check structure out of the switch statement, thus saving a few lines of code. The initialization of the ->scfc_in field remains in each leg of the switch statement in order to more heavily stress memory ordering. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index f220cd364e23..8ab72e545a61 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -299,8 +299,13 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra preempt_disable(); if (scfsp->scfs_prim == SCF_PRIM_SINGLE || scfsp->scfs_wait) { scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); - if (WARN_ON_ONCE(!scfcp)) + if (WARN_ON_ONCE(!scfcp)) { atomic_inc(&n_alloc_errs); + } else { + scfcp->scfc_cpu = -1; + scfcp->scfc_wait = scfsp->scfs_wait; + scfcp->scfc_out = false; + } } switch (scfsp->scfs_prim) { case SCF_PRIM_SINGLE: @@ -311,8 +316,6 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_single++; if (scfcp) { scfcp->scfc_cpu = cpu; - scfcp->scfc_wait = scfsp->scfs_wait; - scfcp->scfc_out = false; scfcp->scfc_in = true; } ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, scfsp->scfs_wait); @@ -330,12 +333,8 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_many_wait++; else scfp->n_many++; - if (scfcp) { - scfcp->scfc_cpu = -1; - scfcp->scfc_wait = true; - scfcp->scfc_out = false; + if (scfcp) scfcp->scfc_in = true; - } smp_call_function_many(cpu_online_mask, scf_handler, scfcp, scfsp->scfs_wait); break; case SCF_PRIM_ALL: @@ -343,12 +342,8 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_all_wait++; else scfp->n_all++; - if (scfcp) { - scfcp->scfc_cpu = -1; - scfcp->scfc_wait = true; - scfcp->scfc_out = false; + if (scfcp) scfcp->scfc_in = true; - } smp_call_function(scf_handler, scfcp, scfsp->scfs_wait); break; } -- cgit v1.2.3 From dbf83b655a7853bc430af10e9a3e7eb1f4c90f86 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Jul 2020 16:06:22 -0700 Subject: scftorture: Flag errors in torture-compatible manner This commit prints error counts on the statistics line and also adds a "!!!" if any of the counters are non-zero. Allocation failures are (somewhat) forgiven, but all other errors result in a "FAILURE" print at the end of the test. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 8ab72e545a61..880c2cef13e7 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -132,6 +132,7 @@ static atomic_t n_mb_in_errs; static atomic_t n_mb_out_errs; static atomic_t n_alloc_errs; static bool scfdone; +static char *bangstr = ""; DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); @@ -156,12 +157,17 @@ static void scf_torture_stats_print(void) scfs.n_all += scf_stats_p[i].n_all; scfs.n_all_wait += scf_stats_p[i].n_all_wait; } - pr_alert("%s scf_invoked_count %s: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ", - SCFTORT_FLAG, isdone ? "VER" : "ver", invoked_count, + if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || + atomic_read(&n_mb_out_errs) || atomic_read(&n_alloc_errs)) + bangstr = "!!! "; + pr_alert("%s %sscf_invoked_count %s: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ", + SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count, scfs.n_single, scfs.n_single_wait, scfs.n_single_ofl, scfs.n_single_wait_ofl, scfs.n_many, scfs.n_many_wait, scfs.n_all, scfs.n_all_wait); torture_onoff_stats(); - pr_cont("\n"); + pr_cont("ste: %d stnmie: %d stnmoe: %d staf: %d\n", atomic_read(&n_errs), + atomic_read(&n_mb_in_errs), atomic_read(&n_mb_out_errs), + atomic_read(&n_alloc_errs)); } // Periodically prints torture statistics, if periodic statistics printing @@ -431,7 +437,7 @@ static void scf_torture_cleanup(void) kfree(scf_stats_p); // -After- the last stats print has completed! scf_stats_p = NULL; - if (atomic_read(&n_errs)) + if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || atomic_read(&n_mb_out_errs)) scftorture_print_module_parms("End of test: FAILURE"); else if (torture_onoff_failures()) scftorture_print_module_parms("End of test: LOCK_HOTPLUG"); -- cgit v1.2.3 From ee7035d29576dcb59b1191e5f609517cacab1e56 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Jul 2020 16:38:16 -0700 Subject: scftorture: Prevent compiler from reducing race probabilities Detecting smp_call_function() memory misordering requires close timing, so it is necessary to have the checks immediately before and after the call to the smp_call_function*() function under test. This commit therefore inserts barrier() calls to prevent the compiler from optimizing memory-misordering detection down into the zone of extreme improbability. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 880c2cef13e7..83496810fc48 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -322,6 +322,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_single++; if (scfcp) { scfcp->scfc_cpu = cpu; + barrier(); // Prevent race-reduction compiler optimizations. scfcp->scfc_in = true; } ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, scfsp->scfs_wait); @@ -339,8 +340,10 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_many_wait++; else scfp->n_many++; - if (scfcp) + if (scfcp) { + barrier(); // Prevent race-reduction compiler optimizations. scfcp->scfc_in = true; + } smp_call_function_many(cpu_online_mask, scf_handler, scfcp, scfsp->scfs_wait); break; case SCF_PRIM_ALL: @@ -348,8 +351,10 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_all_wait++; else scfp->n_all++; - if (scfcp) + if (scfcp) { + barrier(); // Prevent race-reduction compiler optimizations. scfcp->scfc_in = true; + } smp_call_function(scf_handler, scfcp, scfsp->scfs_wait); break; } @@ -358,6 +363,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra atomic_inc(&n_mb_out_errs); // Leak rather than trash! else kfree(scfcp); + barrier(); // Prevent race-reduction compiler optimizations. } if (use_cpus_read_lock) cpus_read_unlock(); -- cgit v1.2.3 From 9a52a574676f8d4aa55f69319231ce6c343b00bb Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 2 Jul 2020 09:56:50 -0700 Subject: scftorture: Make symbol 'scf_torture_rand' static The sparse tool complains as follows kernel/scftorture.c:124:1: warning: symbol '__pcpu_scope_scf_torture_rand' was not declared. Should it be static? And this per-CPU variable is not used outside of scftorture.c, so this commit marks it static. Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 83496810fc48..9180de73e4e8 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -134,7 +134,7 @@ static atomic_t n_alloc_errs; static bool scfdone; static char *bangstr = ""; -DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); +static DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); // Print torture statistics. Caller must ensure serialization. static void scf_torture_stats_print(void) -- cgit v1.2.3 From de77d4da54d10df97d265e7e99112bfc2fef7d4a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Jul 2020 12:15:37 -0700 Subject: scftorture: Check unexpected "switch" statement value This commit adds a "default" case to the switch statement in scftorture_invoke_one() which contains a WARN_ON_ONCE() and an assignment to ->scfc_out to suppress knock-on warnings. These knock-on warnings could otherwise cause the user to think that there was a memory-ordering problem in smp_call_function() instead of a bug in scftorture.c itself. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 9180de73e4e8..d9c01c722e2a 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -357,6 +357,10 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra } smp_call_function(scf_handler, scfcp, scfsp->scfs_wait); break; + default: + WARN_ON_ONCE(1); + if (scfcp) + scfcp->scfc_out = true; } if (scfcp && scfsp->scfs_wait) { if (WARN_ON_ONCE(!scfcp->scfc_out)) -- cgit v1.2.3 From a7c072ef26644b632241d549869f10f8d2dd3b5c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Jul 2020 14:15:33 -0700 Subject: scftorture: Block scftorture_invoker() kthreads for offline CPUs Currently, CPU-hotplug operations might result in all but two of (say) 100 CPUs being offline, which in turn might result in false-positive diagnostics due to overload. This commit therefore causes scftorture_invoker() kthreads for offline CPUs to loop blocking for 200 milliseconds at a time, thus continuously adjusting the number of threads to match the number of online CPUs. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index d9c01c722e2a..04d3a4279413 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -381,11 +381,14 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra // smp_call_function() family of functions. static int scftorture_invoker(void *arg) { + int cpu; DEFINE_TORTURE_RANDOM(rand); struct scf_statistics *scfp = (struct scf_statistics *)arg; + bool was_offline = false; VERBOSE_SCFTORTOUT("scftorture_invoker %d: task started", scfp->cpu); - set_cpus_allowed_ptr(current, cpumask_of(scfp->cpu % nr_cpu_ids)); + cpu = scfp->cpu % nr_cpu_ids; + set_cpus_allowed_ptr(current, cpumask_of(cpu)); set_user_nice(current, MAX_NICE); if (holdoff) schedule_timeout_interruptible(holdoff * HZ); @@ -408,6 +411,14 @@ static int scftorture_invoker(void *arg) do { scftorture_invoke_one(scfp, &rand); + while (cpu_is_offline(cpu) && !torture_must_stop()) { + schedule_timeout_interruptible(HZ / 5); + was_offline = true; + } + if (was_offline) { + set_cpus_allowed_ptr(current, cpumask_of(cpu)); + was_offline = false; + } } while (!torture_must_stop()); VERBOSE_SCFTORTOUT("scftorture_invoker %d ended", scfp->cpu); -- cgit v1.2.3 From 9e66bf03f9c538863e614a72c5799bcd9579630e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jul 2020 15:23:19 -0700 Subject: scftorture: Adapt memory-ordering test to UP operation On uniprocessor systems, smp_call_function() does nothing. This commit therefore avoids complaining about the lack of handler accesses in the single-CPU case where there is no handler. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 04d3a4279413..fc22bcc9a589 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -363,7 +363,8 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfcp->scfc_out = true; } if (scfcp && scfsp->scfs_wait) { - if (WARN_ON_ONCE(!scfcp->scfc_out)) + if (WARN_ON_ONCE((num_online_cpus() > 1 || scfsp->scfs_prim == SCF_PRIM_SINGLE) && + !scfcp->scfc_out)) atomic_inc(&n_mb_out_errs); // Leak rather than trash! else kfree(scfcp); -- cgit v1.2.3 From 65bd77f554336407f5fd7ced7a6df686767fba21 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 23 Jul 2020 15:53:02 -0700 Subject: scftorture: Add cond_resched() to test loop Although the test loop does randomly delay, which would provide quiescent states and so forth, it is possible for there to be a series of long smp_call_function*() handler runtimes with no delays, which results in softlockup and RCU CPU stall warning messages. This commit therefore inserts a cond_resched() into the main test loop. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index fc22bcc9a589..554a521ee235 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -420,6 +420,7 @@ static int scftorture_invoker(void *arg) set_cpus_allowed_ptr(current, cpumask_of(cpu)); was_offline = false; } + cond_resched(); } while (!torture_must_stop()); VERBOSE_SCFTORTOUT("scftorture_invoker %d ended", scfp->cpu); -- cgit v1.2.3 From 4e88ec4a9eb17527e640b063f79e5b875733eb53 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Aug 2020 21:18:12 -0700 Subject: rcuperf: Change rcuperf to rcuscale This commit further avoids conflation of rcuperf with the kernel's perf feature by renaming kernel/rcu/rcuperf.c to kernel/rcu/rcuscale.c, and also by similarly renaming the functions and variables inside this file. This has the side effect of changing the names of the kernel boot parameters, so kernel-parameters.txt and ver_functions.sh are also updated. The rcutorture --torture type was also updated from rcuperf to rcuscale. [ paulmck: Fix bugs located by Stephen Rothwell. ] Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 2 +- kernel/rcu/Makefile | 2 +- kernel/rcu/rcuperf.c | 853 ----------------------------------------------- kernel/rcu/rcuscale.c | 853 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 855 insertions(+), 855 deletions(-) delete mode 100644 kernel/rcu/rcuperf.c create mode 100644 kernel/rcu/rcuscale.c (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 3cf6132a4bb9..5cb175df6ece 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -23,7 +23,7 @@ config TORTURE_TEST tristate default n -config RCU_PERF_TEST +config RCU_SCALE_TEST tristate "performance tests for RCU" depends on DEBUG_KERNEL select TORTURE_TEST diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 95f5117ef8da..0cfb009a99b9 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -11,7 +11,7 @@ obj-y += update.o sync.o obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o -obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o +obj-$(CONFIG_RCU_SCALE_TEST) += rcuscale.o obj-$(CONFIG_RCU_REF_SCALE_TEST) += refscale.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c deleted file mode 100644 index 21448d3374e2..000000000000 --- a/kernel/rcu/rcuperf.c +++ /dev/null @@ -1,853 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Read-Copy Update module-based performance-test facility - * - * Copyright (C) IBM Corporation, 2015 - * - * Authors: Paul E. McKenney - */ - -#define pr_fmt(fmt) fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rcu.h" - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney "); - -#define PERF_FLAG "-perf:" -#define PERFOUT_STRING(s) \ - pr_alert("%s" PERF_FLAG " %s\n", perf_type, s) -#define VERBOSE_PERFOUT_STRING(s) \ - do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) -#define VERBOSE_PERFOUT_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) - -/* - * The intended use cases for the nreaders and nwriters module parameters - * are as follows: - * - * 1. Specify only the nr_cpus kernel boot parameter. This will - * set both nreaders and nwriters to the value specified by - * nr_cpus for a mixed reader/writer test. - * - * 2. Specify the nr_cpus kernel boot parameter, but set - * rcuperf.nreaders to zero. This will set nwriters to the - * value specified by nr_cpus for an update-only test. - * - * 3. Specify the nr_cpus kernel boot parameter, but set - * rcuperf.nwriters to zero. This will set nreaders to the - * value specified by nr_cpus for a read-only test. - * - * Various other use cases may of course be specified. - * - * Note that this test's readers are intended only as a test load for - * the writers. The reader performance statistics will be overly - * pessimistic due to the per-critical-section interrupt disabling, - * test-end checks, and the pair of calls through pointers. - */ - -#ifdef MODULE -# define RCUPERF_SHUTDOWN 0 -#else -# define RCUPERF_SHUTDOWN 1 -#endif - -torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); -torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); -torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); -torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); -torture_param(int, nreaders, -1, "Number of RCU reader threads"); -torture_param(int, nwriters, -1, "Number of RCU updater threads"); -torture_param(bool, shutdown, RCUPERF_SHUTDOWN, - "Shutdown at end of performance tests."); -torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); -torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); -torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?"); -torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); - -static char *perf_type = "rcu"; -module_param(perf_type, charp, 0444); -MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, srcu, ...)"); - -static int nrealreaders; -static int nrealwriters; -static struct task_struct **writer_tasks; -static struct task_struct **reader_tasks; -static struct task_struct *shutdown_task; - -static u64 **writer_durations; -static int *writer_n_durations; -static atomic_t n_rcu_perf_reader_started; -static atomic_t n_rcu_perf_writer_started; -static atomic_t n_rcu_perf_writer_finished; -static wait_queue_head_t shutdown_wq; -static u64 t_rcu_perf_writer_started; -static u64 t_rcu_perf_writer_finished; -static unsigned long b_rcu_gp_test_started; -static unsigned long b_rcu_gp_test_finished; -static DEFINE_PER_CPU(atomic_t, n_async_inflight); - -#define MAX_MEAS 10000 -#define MIN_MEAS 100 - -/* - * Operations vector for selecting different types of tests. - */ - -struct rcu_perf_ops { - int ptype; - void (*init)(void); - void (*cleanup)(void); - int (*readlock)(void); - void (*readunlock)(int idx); - unsigned long (*get_gp_seq)(void); - unsigned long (*gp_diff)(unsigned long new, unsigned long old); - unsigned long (*exp_completed)(void); - void (*async)(struct rcu_head *head, rcu_callback_t func); - void (*gp_barrier)(void); - void (*sync)(void); - void (*exp_sync)(void); - const char *name; -}; - -static struct rcu_perf_ops *cur_ops; - -/* - * Definitions for rcu perf testing. - */ - -static int rcu_perf_read_lock(void) __acquires(RCU) -{ - rcu_read_lock(); - return 0; -} - -static void rcu_perf_read_unlock(int idx) __releases(RCU) -{ - rcu_read_unlock(); -} - -static unsigned long __maybe_unused rcu_no_completed(void) -{ - return 0; -} - -static void rcu_sync_perf_init(void) -{ -} - -static struct rcu_perf_ops rcu_ops = { - .ptype = RCU_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = rcu_perf_read_lock, - .readunlock = rcu_perf_read_unlock, - .get_gp_seq = rcu_get_gp_seq, - .gp_diff = rcu_seq_diff, - .exp_completed = rcu_exp_batches_completed, - .async = call_rcu, - .gp_barrier = rcu_barrier, - .sync = synchronize_rcu, - .exp_sync = synchronize_rcu_expedited, - .name = "rcu" -}; - -/* - * Definitions for srcu perf testing. - */ - -DEFINE_STATIC_SRCU(srcu_ctl_perf); -static struct srcu_struct *srcu_ctlp = &srcu_ctl_perf; - -static int srcu_perf_read_lock(void) __acquires(srcu_ctlp) -{ - return srcu_read_lock(srcu_ctlp); -} - -static void srcu_perf_read_unlock(int idx) __releases(srcu_ctlp) -{ - srcu_read_unlock(srcu_ctlp, idx); -} - -static unsigned long srcu_perf_completed(void) -{ - return srcu_batches_completed(srcu_ctlp); -} - -static void srcu_call_rcu(struct rcu_head *head, rcu_callback_t func) -{ - call_srcu(srcu_ctlp, head, func); -} - -static void srcu_rcu_barrier(void) -{ - srcu_barrier(srcu_ctlp); -} - -static void srcu_perf_synchronize(void) -{ - synchronize_srcu(srcu_ctlp); -} - -static void srcu_perf_synchronize_expedited(void) -{ - synchronize_srcu_expedited(srcu_ctlp); -} - -static struct rcu_perf_ops srcu_ops = { - .ptype = SRCU_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = srcu_perf_read_lock, - .readunlock = srcu_perf_read_unlock, - .get_gp_seq = srcu_perf_completed, - .gp_diff = rcu_seq_diff, - .exp_completed = srcu_perf_completed, - .async = srcu_call_rcu, - .gp_barrier = srcu_rcu_barrier, - .sync = srcu_perf_synchronize, - .exp_sync = srcu_perf_synchronize_expedited, - .name = "srcu" -}; - -static struct srcu_struct srcud; - -static void srcu_sync_perf_init(void) -{ - srcu_ctlp = &srcud; - init_srcu_struct(srcu_ctlp); -} - -static void srcu_sync_perf_cleanup(void) -{ - cleanup_srcu_struct(srcu_ctlp); -} - -static struct rcu_perf_ops srcud_ops = { - .ptype = SRCU_FLAVOR, - .init = srcu_sync_perf_init, - .cleanup = srcu_sync_perf_cleanup, - .readlock = srcu_perf_read_lock, - .readunlock = srcu_perf_read_unlock, - .get_gp_seq = srcu_perf_completed, - .gp_diff = rcu_seq_diff, - .exp_completed = srcu_perf_completed, - .async = srcu_call_rcu, - .gp_barrier = srcu_rcu_barrier, - .sync = srcu_perf_synchronize, - .exp_sync = srcu_perf_synchronize_expedited, - .name = "srcud" -}; - -/* - * Definitions for RCU-tasks perf testing. - */ - -static int tasks_perf_read_lock(void) -{ - return 0; -} - -static void tasks_perf_read_unlock(int idx) -{ -} - -static struct rcu_perf_ops tasks_ops = { - .ptype = RCU_TASKS_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = tasks_perf_read_lock, - .readunlock = tasks_perf_read_unlock, - .get_gp_seq = rcu_no_completed, - .gp_diff = rcu_seq_diff, - .async = call_rcu_tasks, - .gp_barrier = rcu_barrier_tasks, - .sync = synchronize_rcu_tasks, - .exp_sync = synchronize_rcu_tasks, - .name = "tasks" -}; - -static unsigned long rcuperf_seq_diff(unsigned long new, unsigned long old) -{ - if (!cur_ops->gp_diff) - return new - old; - return cur_ops->gp_diff(new, old); -} - -/* - * If performance tests complete, wait for shutdown to commence. - */ -static void rcu_perf_wait_shutdown(void) -{ - cond_resched_tasks_rcu_qs(); - if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters) - return; - while (!torture_must_stop()) - schedule_timeout_uninterruptible(1); -} - -/* - * RCU perf reader kthread. Repeatedly does empty RCU read-side critical - * section, minimizing update-side interference. However, the point of - * this test is not to evaluate reader performance, but instead to serve - * as a test load for update-side performance testing. - */ -static int -rcu_perf_reader(void *arg) -{ - unsigned long flags; - int idx; - long me = (long)arg; - - VERBOSE_PERFOUT_STRING("rcu_perf_reader task started"); - set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); - set_user_nice(current, MAX_NICE); - atomic_inc(&n_rcu_perf_reader_started); - - do { - local_irq_save(flags); - idx = cur_ops->readlock(); - cur_ops->readunlock(idx); - local_irq_restore(flags); - rcu_perf_wait_shutdown(); - } while (!torture_must_stop()); - torture_kthread_stopping("rcu_perf_reader"); - return 0; -} - -/* - * Callback function for asynchronous grace periods from rcu_perf_writer(). - */ -static void rcu_perf_async_cb(struct rcu_head *rhp) -{ - atomic_dec(this_cpu_ptr(&n_async_inflight)); - kfree(rhp); -} - -/* - * RCU perf writer kthread. Repeatedly does a grace period. - */ -static int -rcu_perf_writer(void *arg) -{ - int i = 0; - int i_max; - long me = (long)arg; - struct rcu_head *rhp = NULL; - bool started = false, done = false, alldone = false; - u64 t; - u64 *wdp; - u64 *wdpp = writer_durations[me]; - - VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); - WARN_ON(!wdpp); - set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); - sched_set_fifo_low(current); - - if (holdoff) - schedule_timeout_uninterruptible(holdoff * HZ); - - /* - * Wait until rcu_end_inkernel_boot() is called for normal GP tests - * so that RCU is not always expedited for normal GP tests. - * The system_state test is approximate, but works well in practice. - */ - while (!gp_exp && system_state != SYSTEM_RUNNING) - schedule_timeout_uninterruptible(1); - - t = ktime_get_mono_fast_ns(); - if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { - t_rcu_perf_writer_started = t; - if (gp_exp) { - b_rcu_gp_test_started = - cur_ops->exp_completed() / 2; - } else { - b_rcu_gp_test_started = cur_ops->get_gp_seq(); - } - } - - do { - if (writer_holdoff) - udelay(writer_holdoff); - wdp = &wdpp[i]; - *wdp = ktime_get_mono_fast_ns(); - if (gp_async) { -retry: - if (!rhp) - rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); - if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { - atomic_inc(this_cpu_ptr(&n_async_inflight)); - cur_ops->async(rhp, rcu_perf_async_cb); - rhp = NULL; - } else if (!kthread_should_stop()) { - cur_ops->gp_barrier(); - goto retry; - } else { - kfree(rhp); /* Because we are stopping. */ - } - } else if (gp_exp) { - cur_ops->exp_sync(); - } else { - cur_ops->sync(); - } - t = ktime_get_mono_fast_ns(); - *wdp = t - *wdp; - i_max = i; - if (!started && - atomic_read(&n_rcu_perf_writer_started) >= nrealwriters) - started = true; - if (!done && i >= MIN_MEAS) { - done = true; - sched_set_normal(current, 0); - pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n", - perf_type, PERF_FLAG, me, MIN_MEAS); - if (atomic_inc_return(&n_rcu_perf_writer_finished) >= - nrealwriters) { - schedule_timeout_interruptible(10); - rcu_ftrace_dump(DUMP_ALL); - PERFOUT_STRING("Test complete"); - t_rcu_perf_writer_finished = t; - if (gp_exp) { - b_rcu_gp_test_finished = - cur_ops->exp_completed() / 2; - } else { - b_rcu_gp_test_finished = - cur_ops->get_gp_seq(); - } - if (shutdown) { - smp_mb(); /* Assign before wake. */ - wake_up(&shutdown_wq); - } - } - } - if (done && !alldone && - atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters) - alldone = true; - if (started && !alldone && i < MAX_MEAS - 1) - i++; - rcu_perf_wait_shutdown(); - } while (!torture_must_stop()); - if (gp_async) { - cur_ops->gp_barrier(); - } - writer_n_durations[me] = i_max; - torture_kthread_stopping("rcu_perf_writer"); - return 0; -} - -static void -rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag) -{ - pr_alert("%s" PERF_FLAG - "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n", - perf_type, tag, nrealreaders, nrealwriters, verbose, shutdown); -} - -static void -rcu_perf_cleanup(void) -{ - int i; - int j; - int ngps = 0; - u64 *wdp; - u64 *wdpp; - - /* - * Would like warning at start, but everything is expedited - * during the mid-boot phase, so have to wait till the end. - */ - if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) - VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); - if (rcu_gp_is_normal() && gp_exp) - VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); - if (gp_exp && gp_async) - VERBOSE_PERFOUT_ERRSTRING("No expedited async GPs, so went with async!"); - - if (torture_cleanup_begin()) - return; - if (!cur_ops) { - torture_cleanup_end(); - return; - } - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - torture_stop_kthread(rcu_perf_reader, - reader_tasks[i]); - kfree(reader_tasks); - } - - if (writer_tasks) { - for (i = 0; i < nrealwriters; i++) { - torture_stop_kthread(rcu_perf_writer, - writer_tasks[i]); - if (!writer_n_durations) - continue; - j = writer_n_durations[i]; - pr_alert("%s%s writer %d gps: %d\n", - perf_type, PERF_FLAG, i, j); - ngps += j; - } - pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", - perf_type, PERF_FLAG, - t_rcu_perf_writer_started, t_rcu_perf_writer_finished, - t_rcu_perf_writer_finished - - t_rcu_perf_writer_started, - ngps, - rcuperf_seq_diff(b_rcu_gp_test_finished, - b_rcu_gp_test_started)); - for (i = 0; i < nrealwriters; i++) { - if (!writer_durations) - break; - if (!writer_n_durations) - continue; - wdpp = writer_durations[i]; - if (!wdpp) - continue; - for (j = 0; j <= writer_n_durations[i]; j++) { - wdp = &wdpp[j]; - pr_alert("%s%s %4d writer-duration: %5d %llu\n", - perf_type, PERF_FLAG, - i, j, *wdp); - if (j % 100 == 0) - schedule_timeout_uninterruptible(1); - } - kfree(writer_durations[i]); - } - kfree(writer_tasks); - kfree(writer_durations); - kfree(writer_n_durations); - } - - /* Do torture-type-specific cleanup operations. */ - if (cur_ops->cleanup != NULL) - cur_ops->cleanup(); - - torture_cleanup_end(); -} - -/* - * Return the number if non-negative. If -1, the number of CPUs. - * If less than -1, that much less than the number of CPUs, but - * at least one. - */ -static int compute_real(int n) -{ - int nr; - - if (n >= 0) { - nr = n; - } else { - nr = num_online_cpus() + 1 + n; - if (nr <= 0) - nr = 1; - } - return nr; -} - -/* - * RCU perf shutdown kthread. Just waits to be awakened, then shuts - * down system. - */ -static int -rcu_perf_shutdown(void *arg) -{ - wait_event(shutdown_wq, - atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters); - smp_mb(); /* Wake before output. */ - rcu_perf_cleanup(); - kernel_power_off(); - return -EINVAL; -} - -/* - * kfree_rcu() performance tests: Start a kfree_rcu() loop on all CPUs for number - * of iterations and measure total time and number of GP for all iterations to complete. - */ - -torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu()."); -torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration."); -torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees."); - -static struct task_struct **kfree_reader_tasks; -static int kfree_nrealthreads; -static atomic_t n_kfree_perf_thread_started; -static atomic_t n_kfree_perf_thread_ended; - -struct kfree_obj { - char kfree_obj[8]; - struct rcu_head rh; -}; - -static int -kfree_perf_thread(void *arg) -{ - int i, loop = 0; - long me = (long)arg; - struct kfree_obj *alloc_ptr; - u64 start_time, end_time; - long long mem_begin, mem_during = 0; - - VERBOSE_PERFOUT_STRING("kfree_perf_thread task started"); - set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); - set_user_nice(current, MAX_NICE); - - start_time = ktime_get_mono_fast_ns(); - - if (atomic_inc_return(&n_kfree_perf_thread_started) >= kfree_nrealthreads) { - if (gp_exp) - b_rcu_gp_test_started = cur_ops->exp_completed() / 2; - else - b_rcu_gp_test_started = cur_ops->get_gp_seq(); - } - - do { - if (!mem_during) { - mem_during = mem_begin = si_mem_available(); - } else if (loop % (kfree_loops / 4) == 0) { - mem_during = (mem_during + si_mem_available()) / 2; - } - - for (i = 0; i < kfree_alloc_num; i++) { - alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL); - if (!alloc_ptr) - return -ENOMEM; - - kfree_rcu(alloc_ptr, rh); - } - - cond_resched(); - } while (!torture_must_stop() && ++loop < kfree_loops); - - if (atomic_inc_return(&n_kfree_perf_thread_ended) >= kfree_nrealthreads) { - end_time = ktime_get_mono_fast_ns(); - - if (gp_exp) - b_rcu_gp_test_finished = cur_ops->exp_completed() / 2; - else - b_rcu_gp_test_finished = cur_ops->get_gp_seq(); - - pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n", - (unsigned long long)(end_time - start_time), kfree_loops, - rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), - (mem_begin - mem_during) >> (20 - PAGE_SHIFT)); - - if (shutdown) { - smp_mb(); /* Assign before wake. */ - wake_up(&shutdown_wq); - } - } - - torture_kthread_stopping("kfree_perf_thread"); - return 0; -} - -static void -kfree_perf_cleanup(void) -{ - int i; - - if (torture_cleanup_begin()) - return; - - if (kfree_reader_tasks) { - for (i = 0; i < kfree_nrealthreads; i++) - torture_stop_kthread(kfree_perf_thread, - kfree_reader_tasks[i]); - kfree(kfree_reader_tasks); - } - - torture_cleanup_end(); -} - -/* - * shutdown kthread. Just waits to be awakened, then shuts down system. - */ -static int -kfree_perf_shutdown(void *arg) -{ - wait_event(shutdown_wq, - atomic_read(&n_kfree_perf_thread_ended) >= kfree_nrealthreads); - - smp_mb(); /* Wake before output. */ - - kfree_perf_cleanup(); - kernel_power_off(); - return -EINVAL; -} - -static int __init -kfree_perf_init(void) -{ - long i; - int firsterr = 0; - - kfree_nrealthreads = compute_real(kfree_nthreads); - /* Start up the kthreads. */ - if (shutdown) { - init_waitqueue_head(&shutdown_wq); - firsterr = torture_create_kthread(kfree_perf_shutdown, NULL, - shutdown_task); - if (firsterr) - goto unwind; - schedule_timeout_uninterruptible(1); - } - - pr_alert("kfree object size=%zu\n", kfree_mult * sizeof(struct kfree_obj)); - - kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), - GFP_KERNEL); - if (kfree_reader_tasks == NULL) { - firsterr = -ENOMEM; - goto unwind; - } - - for (i = 0; i < kfree_nrealthreads; i++) { - firsterr = torture_create_kthread(kfree_perf_thread, (void *)i, - kfree_reader_tasks[i]); - if (firsterr) - goto unwind; - } - - while (atomic_read(&n_kfree_perf_thread_started) < kfree_nrealthreads) - schedule_timeout_uninterruptible(1); - - torture_init_end(); - return 0; - -unwind: - torture_init_end(); - kfree_perf_cleanup(); - return firsterr; -} - -static int __init -rcu_perf_init(void) -{ - long i; - int firsterr = 0; - static struct rcu_perf_ops *perf_ops[] = { - &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, - }; - - if (!torture_init_begin(perf_type, verbose)) - return -EBUSY; - - /* Process args and tell the world that the perf'er is on the job. */ - for (i = 0; i < ARRAY_SIZE(perf_ops); i++) { - cur_ops = perf_ops[i]; - if (strcmp(perf_type, cur_ops->name) == 0) - break; - } - if (i == ARRAY_SIZE(perf_ops)) { - pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type); - pr_alert("rcu-perf types:"); - for (i = 0; i < ARRAY_SIZE(perf_ops); i++) - pr_cont(" %s", perf_ops[i]->name); - pr_cont("\n"); - WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST)); - firsterr = -EINVAL; - cur_ops = NULL; - goto unwind; - } - if (cur_ops->init) - cur_ops->init(); - - if (kfree_rcu_test) - return kfree_perf_init(); - - nrealwriters = compute_real(nwriters); - nrealreaders = compute_real(nreaders); - atomic_set(&n_rcu_perf_reader_started, 0); - atomic_set(&n_rcu_perf_writer_started, 0); - atomic_set(&n_rcu_perf_writer_finished, 0); - rcu_perf_print_module_parms(cur_ops, "Start of test"); - - /* Start up the kthreads. */ - - if (shutdown) { - init_waitqueue_head(&shutdown_wq); - firsterr = torture_create_kthread(rcu_perf_shutdown, NULL, - shutdown_task); - if (firsterr) - goto unwind; - schedule_timeout_uninterruptible(1); - } - reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), - GFP_KERNEL); - if (reader_tasks == NULL) { - VERBOSE_PERFOUT_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < nrealreaders; i++) { - firsterr = torture_create_kthread(rcu_perf_reader, (void *)i, - reader_tasks[i]); - if (firsterr) - goto unwind; - } - while (atomic_read(&n_rcu_perf_reader_started) < nrealreaders) - schedule_timeout_uninterruptible(1); - writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), - GFP_KERNEL); - writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), - GFP_KERNEL); - writer_n_durations = - kcalloc(nrealwriters, sizeof(*writer_n_durations), - GFP_KERNEL); - if (!writer_tasks || !writer_durations || !writer_n_durations) { - VERBOSE_PERFOUT_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < nrealwriters; i++) { - writer_durations[i] = - kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), - GFP_KERNEL); - if (!writer_durations[i]) { - firsterr = -ENOMEM; - goto unwind; - } - firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, - writer_tasks[i]); - if (firsterr) - goto unwind; - } - torture_init_end(); - return 0; - -unwind: - torture_init_end(); - rcu_perf_cleanup(); - return firsterr; -} - -module_init(rcu_perf_init); -module_exit(rcu_perf_cleanup); diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c new file mode 100644 index 000000000000..2819b95479af --- /dev/null +++ b/kernel/rcu/rcuscale.c @@ -0,0 +1,853 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Read-Copy Update module-based scalability-test facility + * + * Copyright (C) IBM Corporation, 2015 + * + * Authors: Paul E. McKenney + */ + +#define pr_fmt(fmt) fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rcu.h" + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney "); + +#define SCALE_FLAG "-scale:" +#define SCALEOUT_STRING(s) \ + pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s) +#define VERBOSE_SCALEOUT_STRING(s) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s); } while (0) +#define VERBOSE_SCALEOUT_ERRSTRING(s) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s); } while (0) + +/* + * The intended use cases for the nreaders and nwriters module parameters + * are as follows: + * + * 1. Specify only the nr_cpus kernel boot parameter. This will + * set both nreaders and nwriters to the value specified by + * nr_cpus for a mixed reader/writer test. + * + * 2. Specify the nr_cpus kernel boot parameter, but set + * rcuscale.nreaders to zero. This will set nwriters to the + * value specified by nr_cpus for an update-only test. + * + * 3. Specify the nr_cpus kernel boot parameter, but set + * rcuscale.nwriters to zero. This will set nreaders to the + * value specified by nr_cpus for a read-only test. + * + * Various other use cases may of course be specified. + * + * Note that this test's readers are intended only as a test load for + * the writers. The reader scalability statistics will be overly + * pessimistic due to the per-critical-section interrupt disabling, + * test-end checks, and the pair of calls through pointers. + */ + +#ifdef MODULE +# define RCUSCALE_SHUTDOWN 0 +#else +# define RCUSCALE_SHUTDOWN 1 +#endif + +torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); +torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); +torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); +torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); +torture_param(int, nreaders, -1, "Number of RCU reader threads"); +torture_param(int, nwriters, -1, "Number of RCU updater threads"); +torture_param(bool, shutdown, RCUSCALE_SHUTDOWN, + "Shutdown at end of scalability tests."); +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); +torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); +torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?"); +torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); + +static char *scale_type = "rcu"; +module_param(scale_type, charp, 0444); +MODULE_PARM_DESC(scale_type, "Type of RCU to scalability-test (rcu, srcu, ...)"); + +static int nrealreaders; +static int nrealwriters; +static struct task_struct **writer_tasks; +static struct task_struct **reader_tasks; +static struct task_struct *shutdown_task; + +static u64 **writer_durations; +static int *writer_n_durations; +static atomic_t n_rcu_scale_reader_started; +static atomic_t n_rcu_scale_writer_started; +static atomic_t n_rcu_scale_writer_finished; +static wait_queue_head_t shutdown_wq; +static u64 t_rcu_scale_writer_started; +static u64 t_rcu_scale_writer_finished; +static unsigned long b_rcu_gp_test_started; +static unsigned long b_rcu_gp_test_finished; +static DEFINE_PER_CPU(atomic_t, n_async_inflight); + +#define MAX_MEAS 10000 +#define MIN_MEAS 100 + +/* + * Operations vector for selecting different types of tests. + */ + +struct rcu_scale_ops { + int ptype; + void (*init)(void); + void (*cleanup)(void); + int (*readlock)(void); + void (*readunlock)(int idx); + unsigned long (*get_gp_seq)(void); + unsigned long (*gp_diff)(unsigned long new, unsigned long old); + unsigned long (*exp_completed)(void); + void (*async)(struct rcu_head *head, rcu_callback_t func); + void (*gp_barrier)(void); + void (*sync)(void); + void (*exp_sync)(void); + const char *name; +}; + +static struct rcu_scale_ops *cur_ops; + +/* + * Definitions for rcu scalability testing. + */ + +static int rcu_scale_read_lock(void) __acquires(RCU) +{ + rcu_read_lock(); + return 0; +} + +static void rcu_scale_read_unlock(int idx) __releases(RCU) +{ + rcu_read_unlock(); +} + +static unsigned long __maybe_unused rcu_no_completed(void) +{ + return 0; +} + +static void rcu_sync_scale_init(void) +{ +} + +static struct rcu_scale_ops rcu_ops = { + .ptype = RCU_FLAVOR, + .init = rcu_sync_scale_init, + .readlock = rcu_scale_read_lock, + .readunlock = rcu_scale_read_unlock, + .get_gp_seq = rcu_get_gp_seq, + .gp_diff = rcu_seq_diff, + .exp_completed = rcu_exp_batches_completed, + .async = call_rcu, + .gp_barrier = rcu_barrier, + .sync = synchronize_rcu, + .exp_sync = synchronize_rcu_expedited, + .name = "rcu" +}; + +/* + * Definitions for srcu scalability testing. + */ + +DEFINE_STATIC_SRCU(srcu_ctl_scale); +static struct srcu_struct *srcu_ctlp = &srcu_ctl_scale; + +static int srcu_scale_read_lock(void) __acquires(srcu_ctlp) +{ + return srcu_read_lock(srcu_ctlp); +} + +static void srcu_scale_read_unlock(int idx) __releases(srcu_ctlp) +{ + srcu_read_unlock(srcu_ctlp, idx); +} + +static unsigned long srcu_scale_completed(void) +{ + return srcu_batches_completed(srcu_ctlp); +} + +static void srcu_call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + call_srcu(srcu_ctlp, head, func); +} + +static void srcu_rcu_barrier(void) +{ + srcu_barrier(srcu_ctlp); +} + +static void srcu_scale_synchronize(void) +{ + synchronize_srcu(srcu_ctlp); +} + +static void srcu_scale_synchronize_expedited(void) +{ + synchronize_srcu_expedited(srcu_ctlp); +} + +static struct rcu_scale_ops srcu_ops = { + .ptype = SRCU_FLAVOR, + .init = rcu_sync_scale_init, + .readlock = srcu_scale_read_lock, + .readunlock = srcu_scale_read_unlock, + .get_gp_seq = srcu_scale_completed, + .gp_diff = rcu_seq_diff, + .exp_completed = srcu_scale_completed, + .async = srcu_call_rcu, + .gp_barrier = srcu_rcu_barrier, + .sync = srcu_scale_synchronize, + .exp_sync = srcu_scale_synchronize_expedited, + .name = "srcu" +}; + +static struct srcu_struct srcud; + +static void srcu_sync_scale_init(void) +{ + srcu_ctlp = &srcud; + init_srcu_struct(srcu_ctlp); +} + +static void srcu_sync_scale_cleanup(void) +{ + cleanup_srcu_struct(srcu_ctlp); +} + +static struct rcu_scale_ops srcud_ops = { + .ptype = SRCU_FLAVOR, + .init = srcu_sync_scale_init, + .cleanup = srcu_sync_scale_cleanup, + .readlock = srcu_scale_read_lock, + .readunlock = srcu_scale_read_unlock, + .get_gp_seq = srcu_scale_completed, + .gp_diff = rcu_seq_diff, + .exp_completed = srcu_scale_completed, + .async = srcu_call_rcu, + .gp_barrier = srcu_rcu_barrier, + .sync = srcu_scale_synchronize, + .exp_sync = srcu_scale_synchronize_expedited, + .name = "srcud" +}; + +/* + * Definitions for RCU-tasks scalability testing. + */ + +static int tasks_scale_read_lock(void) +{ + return 0; +} + +static void tasks_scale_read_unlock(int idx) +{ +} + +static struct rcu_scale_ops tasks_ops = { + .ptype = RCU_TASKS_FLAVOR, + .init = rcu_sync_scale_init, + .readlock = tasks_scale_read_lock, + .readunlock = tasks_scale_read_unlock, + .get_gp_seq = rcu_no_completed, + .gp_diff = rcu_seq_diff, + .async = call_rcu_tasks, + .gp_barrier = rcu_barrier_tasks, + .sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_tasks, + .name = "tasks" +}; + +static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old) +{ + if (!cur_ops->gp_diff) + return new - old; + return cur_ops->gp_diff(new, old); +} + +/* + * If scalability tests complete, wait for shutdown to commence. + */ +static void rcu_scale_wait_shutdown(void) +{ + cond_resched_tasks_rcu_qs(); + if (atomic_read(&n_rcu_scale_writer_finished) < nrealwriters) + return; + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); +} + +/* + * RCU scalability reader kthread. Repeatedly does empty RCU read-side + * critical section, minimizing update-side interference. However, the + * point of this test is not to evaluate reader scalability, but instead + * to serve as a test load for update-side scalability testing. + */ +static int +rcu_scale_reader(void *arg) +{ + unsigned long flags; + int idx; + long me = (long)arg; + + VERBOSE_SCALEOUT_STRING("rcu_scale_reader task started"); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + atomic_inc(&n_rcu_scale_reader_started); + + do { + local_irq_save(flags); + idx = cur_ops->readlock(); + cur_ops->readunlock(idx); + local_irq_restore(flags); + rcu_scale_wait_shutdown(); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_scale_reader"); + return 0; +} + +/* + * Callback function for asynchronous grace periods from rcu_scale_writer(). + */ +static void rcu_scale_async_cb(struct rcu_head *rhp) +{ + atomic_dec(this_cpu_ptr(&n_async_inflight)); + kfree(rhp); +} + +/* + * RCU scale writer kthread. Repeatedly does a grace period. + */ +static int +rcu_scale_writer(void *arg) +{ + int i = 0; + int i_max; + long me = (long)arg; + struct rcu_head *rhp = NULL; + bool started = false, done = false, alldone = false; + u64 t; + u64 *wdp; + u64 *wdpp = writer_durations[me]; + + VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started"); + WARN_ON(!wdpp); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + sched_set_fifo_low(current); + + if (holdoff) + schedule_timeout_uninterruptible(holdoff * HZ); + + /* + * Wait until rcu_end_inkernel_boot() is called for normal GP tests + * so that RCU is not always expedited for normal GP tests. + * The system_state test is approximate, but works well in practice. + */ + while (!gp_exp && system_state != SYSTEM_RUNNING) + schedule_timeout_uninterruptible(1); + + t = ktime_get_mono_fast_ns(); + if (atomic_inc_return(&n_rcu_scale_writer_started) >= nrealwriters) { + t_rcu_scale_writer_started = t; + if (gp_exp) { + b_rcu_gp_test_started = + cur_ops->exp_completed() / 2; + } else { + b_rcu_gp_test_started = cur_ops->get_gp_seq(); + } + } + + do { + if (writer_holdoff) + udelay(writer_holdoff); + wdp = &wdpp[i]; + *wdp = ktime_get_mono_fast_ns(); + if (gp_async) { +retry: + if (!rhp) + rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { + atomic_inc(this_cpu_ptr(&n_async_inflight)); + cur_ops->async(rhp, rcu_scale_async_cb); + rhp = NULL; + } else if (!kthread_should_stop()) { + cur_ops->gp_barrier(); + goto retry; + } else { + kfree(rhp); /* Because we are stopping. */ + } + } else if (gp_exp) { + cur_ops->exp_sync(); + } else { + cur_ops->sync(); + } + t = ktime_get_mono_fast_ns(); + *wdp = t - *wdp; + i_max = i; + if (!started && + atomic_read(&n_rcu_scale_writer_started) >= nrealwriters) + started = true; + if (!done && i >= MIN_MEAS) { + done = true; + sched_set_normal(current, 0); + pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n", + scale_type, SCALE_FLAG, me, MIN_MEAS); + if (atomic_inc_return(&n_rcu_scale_writer_finished) >= + nrealwriters) { + schedule_timeout_interruptible(10); + rcu_ftrace_dump(DUMP_ALL); + SCALEOUT_STRING("Test complete"); + t_rcu_scale_writer_finished = t; + if (gp_exp) { + b_rcu_gp_test_finished = + cur_ops->exp_completed() / 2; + } else { + b_rcu_gp_test_finished = + cur_ops->get_gp_seq(); + } + if (shutdown) { + smp_mb(); /* Assign before wake. */ + wake_up(&shutdown_wq); + } + } + } + if (done && !alldone && + atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters) + alldone = true; + if (started && !alldone && i < MAX_MEAS - 1) + i++; + rcu_scale_wait_shutdown(); + } while (!torture_must_stop()); + if (gp_async) { + cur_ops->gp_barrier(); + } + writer_n_durations[me] = i_max; + torture_kthread_stopping("rcu_scale_writer"); + return 0; +} + +static void +rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag) +{ + pr_alert("%s" SCALE_FLAG + "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n", + scale_type, tag, nrealreaders, nrealwriters, verbose, shutdown); +} + +static void +rcu_scale_cleanup(void) +{ + int i; + int j; + int ngps = 0; + u64 *wdp; + u64 *wdpp; + + /* + * Would like warning at start, but everything is expedited + * during the mid-boot phase, so have to wait till the end. + */ + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) + VERBOSE_SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + if (rcu_gp_is_normal() && gp_exp) + VERBOSE_SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + if (gp_exp && gp_async) + VERBOSE_SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); + + if (torture_cleanup_begin()) + return; + if (!cur_ops) { + torture_cleanup_end(); + return; + } + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) + torture_stop_kthread(rcu_scale_reader, + reader_tasks[i]); + kfree(reader_tasks); + } + + if (writer_tasks) { + for (i = 0; i < nrealwriters; i++) { + torture_stop_kthread(rcu_scale_writer, + writer_tasks[i]); + if (!writer_n_durations) + continue; + j = writer_n_durations[i]; + pr_alert("%s%s writer %d gps: %d\n", + scale_type, SCALE_FLAG, i, j); + ngps += j; + } + pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", + scale_type, SCALE_FLAG, + t_rcu_scale_writer_started, t_rcu_scale_writer_finished, + t_rcu_scale_writer_finished - + t_rcu_scale_writer_started, + ngps, + rcuscale_seq_diff(b_rcu_gp_test_finished, + b_rcu_gp_test_started)); + for (i = 0; i < nrealwriters; i++) { + if (!writer_durations) + break; + if (!writer_n_durations) + continue; + wdpp = writer_durations[i]; + if (!wdpp) + continue; + for (j = 0; j <= writer_n_durations[i]; j++) { + wdp = &wdpp[j]; + pr_alert("%s%s %4d writer-duration: %5d %llu\n", + scale_type, SCALE_FLAG, + i, j, *wdp); + if (j % 100 == 0) + schedule_timeout_uninterruptible(1); + } + kfree(writer_durations[i]); + } + kfree(writer_tasks); + kfree(writer_durations); + kfree(writer_n_durations); + } + + /* Do torture-type-specific cleanup operations. */ + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +/* + * Return the number if non-negative. If -1, the number of CPUs. + * If less than -1, that much less than the number of CPUs, but + * at least one. + */ +static int compute_real(int n) +{ + int nr; + + if (n >= 0) { + nr = n; + } else { + nr = num_online_cpus() + 1 + n; + if (nr <= 0) + nr = 1; + } + return nr; +} + +/* + * RCU scalability shutdown kthread. Just waits to be awakened, then shuts + * down system. + */ +static int +rcu_scale_shutdown(void *arg) +{ + wait_event(shutdown_wq, + atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); + smp_mb(); /* Wake before output. */ + rcu_scale_cleanup(); + kernel_power_off(); + return -EINVAL; +} + +/* + * kfree_rcu() scalability tests: Start a kfree_rcu() loop on all CPUs for number + * of iterations and measure total time and number of GP for all iterations to complete. + */ + +torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu()."); +torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration."); +torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees."); + +static struct task_struct **kfree_reader_tasks; +static int kfree_nrealthreads; +static atomic_t n_kfree_scale_thread_started; +static atomic_t n_kfree_scale_thread_ended; + +struct kfree_obj { + char kfree_obj[8]; + struct rcu_head rh; +}; + +static int +kfree_scale_thread(void *arg) +{ + int i, loop = 0; + long me = (long)arg; + struct kfree_obj *alloc_ptr; + u64 start_time, end_time; + long long mem_begin, mem_during = 0; + + VERBOSE_SCALEOUT_STRING("kfree_scale_thread task started"); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + + start_time = ktime_get_mono_fast_ns(); + + if (atomic_inc_return(&n_kfree_scale_thread_started) >= kfree_nrealthreads) { + if (gp_exp) + b_rcu_gp_test_started = cur_ops->exp_completed() / 2; + else + b_rcu_gp_test_started = cur_ops->get_gp_seq(); + } + + do { + if (!mem_during) { + mem_during = mem_begin = si_mem_available(); + } else if (loop % (kfree_loops / 4) == 0) { + mem_during = (mem_during + si_mem_available()) / 2; + } + + for (i = 0; i < kfree_alloc_num; i++) { + alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL); + if (!alloc_ptr) + return -ENOMEM; + + kfree_rcu(alloc_ptr, rh); + } + + cond_resched(); + } while (!torture_must_stop() && ++loop < kfree_loops); + + if (atomic_inc_return(&n_kfree_scale_thread_ended) >= kfree_nrealthreads) { + end_time = ktime_get_mono_fast_ns(); + + if (gp_exp) + b_rcu_gp_test_finished = cur_ops->exp_completed() / 2; + else + b_rcu_gp_test_finished = cur_ops->get_gp_seq(); + + pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n", + (unsigned long long)(end_time - start_time), kfree_loops, + rcuscale_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), + (mem_begin - mem_during) >> (20 - PAGE_SHIFT)); + + if (shutdown) { + smp_mb(); /* Assign before wake. */ + wake_up(&shutdown_wq); + } + } + + torture_kthread_stopping("kfree_scale_thread"); + return 0; +} + +static void +kfree_scale_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + if (kfree_reader_tasks) { + for (i = 0; i < kfree_nrealthreads; i++) + torture_stop_kthread(kfree_scale_thread, + kfree_reader_tasks[i]); + kfree(kfree_reader_tasks); + } + + torture_cleanup_end(); +} + +/* + * shutdown kthread. Just waits to be awakened, then shuts down system. + */ +static int +kfree_scale_shutdown(void *arg) +{ + wait_event(shutdown_wq, + atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads); + + smp_mb(); /* Wake before output. */ + + kfree_scale_cleanup(); + kernel_power_off(); + return -EINVAL; +} + +static int __init +kfree_scale_init(void) +{ + long i; + int firsterr = 0; + + kfree_nrealthreads = compute_real(kfree_nthreads); + /* Start up the kthreads. */ + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(kfree_scale_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + + pr_alert("kfree object size=%zu\n", kfree_mult * sizeof(struct kfree_obj)); + + kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), + GFP_KERNEL); + if (kfree_reader_tasks == NULL) { + firsterr = -ENOMEM; + goto unwind; + } + + for (i = 0; i < kfree_nrealthreads; i++) { + firsterr = torture_create_kthread(kfree_scale_thread, (void *)i, + kfree_reader_tasks[i]); + if (firsterr) + goto unwind; + } + + while (atomic_read(&n_kfree_scale_thread_started) < kfree_nrealthreads) + schedule_timeout_uninterruptible(1); + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + kfree_scale_cleanup(); + return firsterr; +} + +static int __init +rcu_scale_init(void) +{ + long i; + int firsterr = 0; + static struct rcu_scale_ops *scale_ops[] = { + &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, + }; + + if (!torture_init_begin(scale_type, verbose)) + return -EBUSY; + + /* Process args and announce that the scalability'er is on the job. */ + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) { + cur_ops = scale_ops[i]; + if (strcmp(scale_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(scale_ops)) { + pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type); + pr_alert("rcu-scale types:"); + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) + pr_cont(" %s", scale_ops[i]->name); + pr_cont("\n"); + WARN_ON(!IS_MODULE(CONFIG_RCU_SCALE_TEST)); + firsterr = -EINVAL; + cur_ops = NULL; + goto unwind; + } + if (cur_ops->init) + cur_ops->init(); + + if (kfree_rcu_test) + return kfree_scale_init(); + + nrealwriters = compute_real(nwriters); + nrealreaders = compute_real(nreaders); + atomic_set(&n_rcu_scale_reader_started, 0); + atomic_set(&n_rcu_scale_writer_started, 0); + atomic_set(&n_rcu_scale_writer_finished, 0); + rcu_scale_print_module_parms(cur_ops, "Start of test"); + + /* Start up the kthreads. */ + + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(rcu_scale_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealreaders; i++) { + firsterr = torture_create_kthread(rcu_scale_reader, (void *)i, + reader_tasks[i]); + if (firsterr) + goto unwind; + } + while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders) + schedule_timeout_uninterruptible(1); + writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), + GFP_KERNEL); + writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), + GFP_KERNEL); + writer_n_durations = + kcalloc(nrealwriters, sizeof(*writer_n_durations), + GFP_KERNEL); + if (!writer_tasks || !writer_durations || !writer_n_durations) { + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealwriters; i++) { + writer_durations[i] = + kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), + GFP_KERNEL); + if (!writer_durations[i]) { + firsterr = -ENOMEM; + goto unwind; + } + firsterr = torture_create_kthread(rcu_scale_writer, (void *)i, + writer_tasks[i]); + if (firsterr) + goto unwind; + } + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + rcu_scale_cleanup(); + return firsterr; +} + +module_init(rcu_scale_init); +module_exit(rcu_scale_cleanup); -- cgit v1.2.3 From 8cbd0e38a9f2de38e8991c5c1c6f9024b2731d17 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Aug 2020 15:51:20 -0700 Subject: rcu: Add Kconfig option for strict RCU grace periods People running automated tests have asked for a way to make RCU minimize grace-period duration in order to increase the probability of KASAN detecting a pointer being improperly leaked from an RCU read-side critical section, for example, like this: rcu_read_lock(); p = rcu_dereference(gp); do_something_with(p); // OK rcu_read_unlock(); do_something_else_with(p); // BUG!!! The rcupdate.rcu_expedited boot parameter is a start in this direction, given that it makes calls to synchronize_rcu() instead invoke the faster (and more wasteful) synchronize_rcu_expedited(). However, this does nothing to shorten RCU grace periods that are instead initiated by call_rcu(), and RCU pointer-leak bugs can involve call_rcu() just as surely as they can synchronize_rcu(). This commit therefore adds a RCU_STRICT_GRACE_PERIOD Kconfig option that will be used to shorten normal (non-expedited) RCU grace periods. This commit also dumps out a message when this option is in effect. Later commits will actually shorten grace periods. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 15 +++++++++++++++ kernel/rcu/tree_plugin.h | 2 ++ 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 3cf6132a4bb9..cab5a4bebe9c 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -114,4 +114,19 @@ config RCU_EQS_DEBUG Say N here if you need ultimate kernel/user switch latencies Say Y if you are unsure +config RCU_STRICT_GRACE_PERIOD + bool "Provide debug RCU implementation with short grace periods" + depends on DEBUG_KERNEL && RCU_EXPERT + default n + select PREEMPT_COUNT if PREEMPT=n + help + Select this option to build an RCU variant that is strict about + grace periods, making them as short as it can. This limits + scalability, destroys real-time response, degrades battery + lifetime and kills performance. Don't try this on large + machines, as in systems with more than about 10 or 20 CPUs. + But in conjunction with tools like KASAN, it can be helpful + when looking for certain types of RCU usage bugs, for example, + too-short RCU read-side critical sections. + endmenu # "RCU Debugging" diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 982fc5be5269..44cf77db7cae 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -36,6 +36,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + pr_info("\tRCU strict (and thus non-scalable) grace periods enabled.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) -- cgit v1.2.3 From dc1269186bed3afc5a2018527516be84fe55d3e0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Aug 2020 16:52:17 -0700 Subject: rcu: Reduce leaf fanout for strict RCU grace periods Because strict RCU grace periods will complete more quickly, they will experience greater lock contention on each leaf rcu_node structure's ->lock. This commit therefore reduces the leaf fanout in order to reduce this lock contention. Note that this also has the effect of reducing the number of CPUs supported to 16 in the case of CONFIG_RCU_FANOUT_LEAF=2 or 81 in the case of CONFIG_RCU_FANOUT_LEAF=3. However, greater numbers of CPUs are probably a bad idea when using CONFIG_RCU_STRICT_GRACE_PERIOD=y. Those wishing to live dangerously are free to edit their kernel/rcu/Kconfig files accordingly. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 0ebe15a84985..b71e21f73c40 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -135,10 +135,12 @@ config RCU_FANOUT config RCU_FANOUT_LEAF int "Tree-based hierarchical RCU leaf-level fanout value" - range 2 64 if 64BIT - range 2 32 if !64BIT + range 2 64 if 64BIT && !RCU_STRICT_GRACE_PERIOD + range 2 32 if !64BIT && !RCU_STRICT_GRACE_PERIOD + range 2 3 if RCU_STRICT_GRACE_PERIOD depends on TREE_RCU && RCU_EXPERT - default 16 + default 16 if !RCU_STRICT_GRACE_PERIOD + default 2 if RCU_STRICT_GRACE_PERIOD help This option controls the leaf-level fanout of hierarchical implementations of RCU, and allows trading off cache misses -- cgit v1.2.3 From aecd34b9765de3b58c98a1d75b982fc64becd1e9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Aug 2020 17:25:23 -0700 Subject: rcu: Restrict default jiffies_till_first_fqs for strict RCU GPs If there are idle CPUs, RCU's grace-period kthread will wait several jiffies before even thinking about polling them. This promotes efficiency, which is normally a good thing, but when the kernel has been built with CONFIG_RCU_STRICT_GRACE_PERIOD=y, we care more about short grace periods. This commit therefore restricts the default jiffies_till_first_fqs value to zero in kernels built with CONFIG_RCU_STRICT_GRACE_PERIOD=y, which causes RCU's grace-period kthread to poll for idle CPUs immediately after starting a grace period. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8ce77d9ac716..85511590fc38 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -485,7 +485,7 @@ module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); module_param(qovld, long, 0444); -static ulong jiffies_till_first_fqs = ULONG_MAX; +static ulong jiffies_till_first_fqs = IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 0 : ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; static bool rcu_kick_kthreads; static int rcu_divisor = 7; -- cgit v1.2.3 From 29fc5f93320cb447f83baedfe103ed784cadb073 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2020 06:39:30 -0700 Subject: rcu: Force DEFAULT_RCU_BLIMIT to 1000 for strict RCU GPs The value of DEFAULT_RCU_BLIMIT is normally set to 10, the idea being to avoid needless response-time degradation due to RCU callback invocation. However, when CONFIG_RCU_STRICT_GRACE_PERIOD=y it is better to avoid throttling callback execution in order to better detect pointer leaks from RCU read-side critical sections. This commit therefore sets the value of DEFAULT_RCU_BLIMIT to 1000 in kernels built with CONFIG_RCU_STRICT_GRACE_PERIOD=y. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 85511590fc38..443685704f5e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -468,17 +468,18 @@ static int rcu_is_cpu_rrupt_from_idle(void) return __this_cpu_read(rcu_data.dynticks_nesting) == 0; } -#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */ -#define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */ +#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10) + // Maximum callbacks per rcu_do_batch ... +#define DEFAULT_MAX_RCU_BLIMIT 10000 // ... even during callback flood. static long blimit = DEFAULT_RCU_BLIMIT; -#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ +#define DEFAULT_RCU_QHIMARK 10000 // If this many pending, ignore blimit. static long qhimark = DEFAULT_RCU_QHIMARK; -#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */ +#define DEFAULT_RCU_QLOMARK 100 // Once only this many pending, use blimit. static long qlowmark = DEFAULT_RCU_QLOMARK; #define DEFAULT_RCU_QOVLD_MULT 2 #define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK) -static long qovld = DEFAULT_RCU_QOVLD; /* If this many pending, hammer QS. */ -static long qovld_calc = -1; /* No pre-initialization lock acquisitions! */ +static long qovld = DEFAULT_RCU_QOVLD; // If this many pending, hammer QS. +static long qovld_calc = -1; // No pre-initialization lock acquisitions! module_param(blimit, long, 0444); module_param(qhimark, long, 0444); -- cgit v1.2.3 From f19920e412fdeed1e15691bcee5b40e18b8e96ff Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2020 09:40:18 -0700 Subject: rcu: Always set .need_qs from __rcu_read_lock() for strict GPs The ->rcu_read_unlock_special.b.need_qs field in the task_struct structure indicates that the RCU core needs a quiscent state from the corresponding task. The __rcu_read_unlock() function checks this (via an eventual call to rcu_preempt_deferred_qs_irqrestore()), and if set reports a quiscent state immediately upon exit from the outermost RCU read-side critical section. Currently, this flag is only set when the scheduling-clock interrupt decides that the current RCU grace period is too old, as in about one full second too old. But if the kernel has been built with CONFIG_RCU_STRICT_GRACE_PERIOD=y, we clearly do not want to wait that long. This commit therefore sets the .need_qs field immediately at the start of the RCU read-side critical section from within __rcu_read_lock() in order to unconditionally enlist help from __rcu_read_unlock(). But note the additional check for rcu_state.gp_kthread, which prevents attempts to awaken RCU's grace-period kthread during early boot before there is a scheduler. Leaving off this check results in early boot hangs. So early that there is no console output. Thus, this additional check fails until such time as RCU's grace-period kthread has been created, avoiding these empty-console hangs. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 44cf77db7cae..668bbd2be807 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -376,6 +376,8 @@ void __rcu_read_lock(void) rcu_preempt_read_enter(); if (IS_ENABLED(CONFIG_PROVE_LOCKING)) WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && rcu_state.gp_kthread) + WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, true); barrier(); /* critical section after entry code. */ } EXPORT_SYMBOL_GPL(__rcu_read_lock); -- cgit v1.2.3 From 44bad5b3cca2d452d17ef82841b20b42a2cf11a0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2020 15:12:50 -0700 Subject: rcu: Do full report for .need_qs for strict GPs The rcu_preempt_deferred_qs_irqrestore() function is invoked at the end of an RCU read-side critical section (for example, directly from rcu_read_unlock()) and, if .need_qs is set, invokes rcu_qs() to report the new quiescent state. This works, except that rcu_qs() only updates per-CPU state, leaving reporting of the actual quiescent state to a later call to rcu_report_qs_rdp(), for example from within a later RCU_SOFTIRQ instance. Although this approach is exactly what you want if you are more concerned about efficiency than about short grace periods, in CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, short grace periods are the name of the game. This commit therefore makes rcu_preempt_deferred_qs_irqrestore() directly invoke rcu_report_qs_rdp() in CONFIG_RCU_STRICT_GRACE_PERIOD=y, thus shortening grace periods. Historical note: To the best of my knowledge, causing rcu_read_unlock() to directly report a quiescent state first appeared in Jim Houston's and Joe Korty's JRCU. This is the second instance of a Linux-kernel RCU feature being inspired by JRCU, the first being RCU callback offloading (as in the RCU_NOCB_CPU Kconfig option). Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 668bbd2be807..dfdb9020f136 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -459,8 +459,12 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) return; } t->rcu_read_unlock_special.s = 0; - if (special.b.need_qs) - rcu_qs(); + if (special.b.need_qs) { + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + rcu_report_qs_rdp(rdp->cpu, rdp); + else + rcu_qs(); + } /* * Respond to a request by an expedited grace period for a -- cgit v1.2.3 From 1a2f5d57a33f7b9189b6b3e997eb858301482d79 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2020 16:35:08 -0700 Subject: rcu: Attempt QS when CPU discovers GP for strict GPs A given CPU normally notes a new grace period during one RCU_SOFTIRQ, but avoids reporting the corresponding quiescent state until some later RCU_SOFTIRQ. This leisurly approach improves efficiency by increasing the number of update requests served by each grace period, but is not what is needed for kernels built with CONFIG_RCU_STRICT_GRACE_PERIOD=y. This commit therefore adds a new rcu_strict_gp_check_qs() function which, in CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, simply enters and immediately exist an RCU read-side critical section. If the CPU is in a quiescent state, the rcu_read_unlock() will attempt to report an immediate quiescent state. This rcu_strict_gp_check_qs() function is invoked from note_gp_changes(), so that a CPU just noticing a new grace period might immediately report a quiescent state for that grace period. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 443685704f5e..36a860c4648b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1574,6 +1574,19 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, raw_spin_unlock_rcu_node(rnp); } +/* + * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, attempt to generate a + * quiescent state. This is intended to be invoked when the CPU notices + * a new grace period. + */ +static void rcu_strict_gp_check_qs(void) +{ + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { + rcu_read_lock(); + rcu_read_unlock(); + } +} + /* * Update CPU-local rcu_data state to record the beginnings and ends of * grace periods. The caller must hold the ->lock of the leaf rcu_node @@ -1644,6 +1657,7 @@ static void note_gp_changes(struct rcu_data *rdp) } needwake = __note_gp_changes(rnp, rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + rcu_strict_gp_check_qs(); if (needwake) rcu_gp_kthread_wake(); } -- cgit v1.2.3 From 933ada2c3310aa88807e65c8d498b74a2159a9a2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2020 19:21:48 -0700 Subject: rcu: IPI all CPUs at GP start for strict GPs Currently, each CPU discovers the beginning of a given grace period on its own time, which is again good for efficiency but bad for fast grace periods. This commit therefore uses on_each_cpu() to IPI each CPU after grace-period initialization in order to inform each CPU of the new grace period in a timely manner, but only in kernels build with CONFIG_RCU_STRICT_GRACE_PERIOD=y. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 36a860c4648b..88f4fa639964 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1695,6 +1695,15 @@ static void rcu_gp_torture_wait(void) } } +/* + * Handler for on_each_cpu() to invoke the target CPU's RCU core + * processing. + */ +static void rcu_strict_gp_boundary(void *unused) +{ + invoke_rcu_core(); +} + /* * Initialize a new grace period. Return false if no grace period required. */ @@ -1823,6 +1832,10 @@ static bool rcu_gp_init(void) WRITE_ONCE(rcu_state.gp_activity, jiffies); } + // If strict, make all CPUs aware of new grace period. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + on_each_cpu(rcu_strict_gp_boundary, NULL, 0); + return true; } -- cgit v1.2.3 From 4e025f52a1e0e8ff4e303fa0a80e2061ccfa27d6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2020 19:42:47 -0700 Subject: rcu: IPI all CPUs at GP end for strict GPs Currently, each CPU discovers the end of a given grace period on its own time, which is again good for efficiency but bad for fast grace periods, given that it is things like kfree() within the RCU callbacks that will cause trouble for pointers leaked from RCU read-side critical sections. This commit therefore uses on_each_cpu() to IPI each CPU after grace-period cleanup in order to inform each CPU of the end of the old grace period in a timely manner, but only in kernels build with CONFIG_RCU_STRICT_GRACE_PERIOD=y. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 88f4fa639964..4bbedfc0f79b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2052,6 +2052,10 @@ static void rcu_gp_cleanup(void) rcu_state.gp_flags & RCU_GP_FLAG_INIT); } raw_spin_unlock_irq_rcu_node(rnp); + + // If strict, make all CPUs aware of the end of the old grace period. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + on_each_cpu(rcu_strict_gp_boundary, NULL, 0); } /* -- cgit v1.2.3 From 3d29aaf1ef992b5b4612fe32b9e6f517f7bba904 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Aug 2020 13:44:10 -0700 Subject: rcu: Provide optional RCU-reader exit delay for strict GPs The goal of this series is to increase the probability of tools like KASAN detecting that an RCU-protected pointer was used outside of its RCU read-side critical section. Thus far, the approach has been to make grace periods and callback processing happen faster. Another approach is to delay the pointer leaker. This commit therefore allows a delay to be applied to exit from RCU read-side critical sections. This slowdown is specified by a new rcutree.rcu_unlock_delay kernel boot parameter that specifies this delay in microseconds, defaulting to zero. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index dfdb9020f136..3f3a4ffd4df2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -430,6 +430,12 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) return !list_empty(&rnp->blkd_tasks); } +// Add delay to rcu_read_unlock() for strict grace periods. +static int rcu_unlock_delay; +#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD +module_param(rcu_unlock_delay, int, 0444); +#endif + /* * Report deferred quiescent states. The deferral time can * be quite short, for example, in the case of the call from @@ -460,10 +466,12 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) } t->rcu_read_unlock_special.s = 0; if (special.b.need_qs) { - if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { rcu_report_qs_rdp(rdp->cpu, rdp); - else + udelay(rcu_unlock_delay); + } else { rcu_qs(); + } } /* -- cgit v1.2.3 From a657f2617010ae237db5693f875968c28e8f732f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 8 Aug 2020 07:56:31 -0700 Subject: rcu: Execute RCU reader shortly after rcu_core for strict GPs A kernel built with CONFIG_RCU_STRICT_GRACE_PERIOD=y needs a quiescent state to appear very shortly after a CPU has noticed a new grace period. Placing an RCU reader immediately after this point is ineffective because this normally happens in softirq context, which acts as a big RCU reader. This commit therefore introduces a new per-CPU work_struct, which is used at the end of rcu_core() processing to schedule an RCU read-side critical section from within a clean environment. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 +++++++++++++ kernel/rcu/tree.h | 1 + 2 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4bbedfc0f79b..31995b3f0ed9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2646,6 +2646,14 @@ void rcu_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); +// Workqueue handler for an RCU reader for kernels enforcing struct RCU +// grace periods. +static void strict_work_handler(struct work_struct *work) +{ + rcu_read_lock(); + rcu_read_unlock(); +} + /* Perform RCU core processing work for the current CPU. */ static __latent_entropy void rcu_core(void) { @@ -2690,6 +2698,10 @@ static __latent_entropy void rcu_core(void) /* Do any needed deferred wakeups of rcuo kthreads. */ do_nocb_deferred_wakeup(rdp); trace_rcu_utilization(TPS("End RCU core")); + + // If strict GPs, schedule an RCU reader in a clean environment. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work); } static void rcu_core_si(struct softirq_action *h) @@ -3887,6 +3899,7 @@ rcu_boot_init_percpu_data(int cpu) /* Set up local state, ensuring consistent view of global state. */ rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); + INIT_WORK(&rdp->strict_work, strict_work_handler); WARN_ON_ONCE(rdp->dynticks_nesting != 1); WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp))); rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index c96ae351688b..5831ac0b254f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -164,6 +164,7 @@ struct rcu_data { /* period it is aware of. */ struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ bool defer_qs_iw_pending; /* Scheduler attention pending? */ + struct work_struct strict_work; /* Schedule readers for strict GPs. */ /* 2) batch handling */ struct rcu_segcblist cblist; /* Segmented callback list, with */ -- cgit v1.2.3 From aa40c138cc8f36e2f5c721fd1bdb823a1ef1a237 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Aug 2020 09:58:03 -0700 Subject: rcu: Report QS for outermost PREEMPT=n rcu_read_unlock() for strict GPs The CONFIG_PREEMPT=n instance of rcu_read_unlock is even more aggressively than that of CONFIG_PREEMPT=y in deferring reporting quiescent states to the RCU core. This is just what is wanted in normal use because it reduces overhead, but the resulting delay is not what is wanted for kernels built with CONFIG_RCU_STRICT_GRACE_PERIOD=y. This commit therefore adds an rcu_read_unlock_strict() function that checks for exceptional conditions, and reports the newly started quiescent state if it is safe to do so, also doing a spin-delay if requested via rcutree.rcu_unlock_delay. This commit also adds a call to rcu_read_unlock_strict() from the CONFIG_PREEMPT=n instance of __rcu_read_unlock(). [ paulmck: Fixed bug located by kernel test robot ] Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 ++++++ kernel/rcu/tree_plugin.h | 24 ++++++++++++++++++------ 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 31995b3f0ed9..a295cadf7c2f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -178,6 +178,12 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +// Add delay to rcu_read_unlock() for strict grace periods. +static int rcu_unlock_delay; +#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD +module_param(rcu_unlock_delay, int, 0444); +#endif + /* * This rcu parameter is runtime-read-only. It reflects * a minimum allowed number of objects which can be cached diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3f3a4ffd4df2..25a676dff5de 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -430,12 +430,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) return !list_empty(&rnp->blkd_tasks); } -// Add delay to rcu_read_unlock() for strict grace periods. -static int rcu_unlock_delay; -#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD -module_param(rcu_unlock_delay, int, 0444); -#endif - /* * Report deferred quiescent states. The deferral time can * be quite short, for example, in the case of the call from @@ -784,6 +778,24 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) #else /* #ifdef CONFIG_PREEMPT_RCU */ +/* + * If strict grace periods are enabled, and if the calling + * __rcu_read_unlock() marks the beginning of a quiescent state, immediately + * report that quiescent state and, if requested, spin for a bit. + */ +void rcu_read_unlock_strict(void) +{ + struct rcu_data *rdp; + + if (!IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) || + irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) + return; + rdp = this_cpu_ptr(&rcu_data); + rcu_report_qs_rdp(rdp->cpu, rdp); + udelay(rcu_unlock_delay); +} +EXPORT_SYMBOL_GPL(rcu_read_unlock_strict); + /* * Tell them what RCU they are running. */ -- cgit v1.2.3 From cfeac3977ab4b6222a01f79997739d2367a8cc94 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 Aug 2020 11:26:14 -0700 Subject: rcu: Remove unused "cpu" parameter from rcu_report_qs_rdp() The "cpu" parameter to rcu_report_qs_rdp() is not used, with rdp->cpu being used instead. Furtheremore, every call to rcu_report_qs_rdp() invokes it on rdp->cpu. This commit therefore removes this unused "cpu" parameter and converts a check of rdp->cpu against smp_processor_id() to a WARN_ON_ONCE(). Reported-by: Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 ++++---- kernel/rcu/tree_plugin.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a295cadf7c2f..c6127651efc6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2240,7 +2240,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) * structure. This must be called from the specified CPU. */ static void -rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) +rcu_report_qs_rdp(struct rcu_data *rdp) { unsigned long flags; unsigned long mask; @@ -2249,6 +2249,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_node *rnp; + WARN_ON_ONCE(rdp->cpu != smp_processor_id()); rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq || @@ -2265,8 +2266,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) return; } mask = rdp->grpmask; - if (rdp->cpu == smp_processor_id()) - rdp->core_needs_qs = false; + rdp->core_needs_qs = false; if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { @@ -2315,7 +2315,7 @@ rcu_check_quiescent_state(struct rcu_data *rdp) * Tell RCU we are done (but rcu_report_qs_rdp() will be the * judge of that). */ - rcu_report_qs_rdp(rdp->cpu, rdp); + rcu_report_qs_rdp(rdp); } /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 25a676dff5de..ca31be019f55 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -461,7 +461,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) t->rcu_read_unlock_special.s = 0; if (special.b.need_qs) { if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { - rcu_report_qs_rdp(rdp->cpu, rdp); + rcu_report_qs_rdp(rdp); udelay(rcu_unlock_delay); } else { rcu_qs(); @@ -791,7 +791,7 @@ void rcu_read_unlock_strict(void) irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) return; rdp = this_cpu_ptr(&rcu_data); - rcu_report_qs_rdp(rdp->cpu, rdp); + rcu_report_qs_rdp(rdp); udelay(rcu_unlock_delay); } EXPORT_SYMBOL_GPL(rcu_read_unlock_strict); -- cgit v1.2.3 From 83224afd11d71e0d6effb86fe1ab5725d5415251 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Jun 2020 13:22:17 -0700 Subject: rcutorture: Remove KCSAN stubs KCSAN is now in mainline, so this commit removes the stubs for the data_race(), ASSERT_EXCLUSIVE_WRITER(), and ASSERT_EXCLUSIVE_ACCESS() macros. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f453bf8d2f1e..db3786133644 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -52,19 +52,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - /* Bits for ->extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ #define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1) -- cgit v1.2.3 From 959954df0ca7da2111c3fb67a666681798d15b9d Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 18 Jun 2020 16:29:55 -0400 Subject: rcutorture: Output number of elapsed grace periods This commit adds code to print the grace-period number at the start of the test along with both the grace-period number and the number of elapsed grace periods at the end of the test. Note that variants of RCU)without the notion of a grace-period number (for example, Tiny RCU) just print zeroes. [ paulmck: Adjust commit log. ] Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index db3786133644..c8206ff6007f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -172,6 +172,7 @@ static long n_barrier_successes; /* did rcu_barrier test succeed? */ static unsigned long n_read_exits; static struct list_head rcu_torture_removed; static unsigned long shutdown_jiffies; +static unsigned long start_gp_seq; static int rcu_torture_writer_state; #define RTWS_FIXED_DELAY 0 @@ -2469,8 +2470,9 @@ rcu_torture_cleanup(void) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); - pr_alert("%s: End-test grace-period state: g%lu f%#x\n", - cur_ops->name, gp_seq, flags); + pr_alert("%s: End-test grace-period state: g%ld f%#x total-gps=%ld\n", + cur_ops->name, (long)gp_seq, flags, + rcutorture_seq_diff(gp_seq, start_gp_seq)); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); if (rcu_torture_can_boost()) @@ -2594,6 +2596,8 @@ rcu_torture_init(void) long i; int cpu; int firsterr = 0; + int flags = 0; + unsigned long gp_seq = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, @@ -2636,6 +2640,11 @@ rcu_torture_init(void) nrealreaders = 1; } rcu_torture_print_module_parms(cur_ops, "Start of test"); + rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); + srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); + start_gp_seq = gp_seq; + pr_alert("%s: Start-test grace-period state: g%ld f%#x\n", + cur_ops->name, (long)gp_seq, flags); /* Set up the freelist. */ -- cgit v1.2.3 From d49bed9abc3454bd123cbe974ecbeae119701b92 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 3 Jul 2020 13:05:27 +0800 Subject: locktorture: Make function torture_percpu_rwsem_init() static The sparse tool complains as follows: kernel/locking/locktorture.c:569:6: warning: symbol 'torture_percpu_rwsem_init' was not declared. Should it be static? And this function is not used outside of locktorture.c, so this commit marks it static. Signed-off-by: Wei Yongjun Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 9cfa5e89cff7..62d215b2e39f 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -566,7 +566,7 @@ static struct lock_torture_ops rwsem_lock_ops = { #include static struct percpu_rw_semaphore pcpu_rwsem; -void torture_percpu_rwsem_init(void) +static void torture_percpu_rwsem_init(void) { BUG_ON(percpu_init_rwsem(&pcpu_rwsem)); } -- cgit v1.2.3 From c8fa63714763b7795a3f5fb7ed6d000763e6dccc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 19 Jul 2020 14:40:31 -0700 Subject: rcutorture: Properly set rcu_fwds for OOM handling The conversion of rcu_fwds to dynamic allocation failed to actually allocate the required structure. This commit therefore allocates it, frees it, and updates rcu_fwds accordingly. While in the area, it abstracts the cleanup actions into rcu_torture_fwd_prog_cleanup(). Fixes: 5155be9994e5 ("rcutorture: Dynamically allocate rcu_fwds structure") Reported-by: kernel test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index c8206ff6007f..7942be453a14 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2148,9 +2148,20 @@ static int __init rcu_torture_fwd_prog_init(void) return -ENOMEM; spin_lock_init(&rfp->rcu_fwd_lock); rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; + rcu_fwds = rfp; return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); } +static void rcu_torture_fwd_prog_cleanup(void) +{ + struct rcu_fwd *rfp; + + torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); + rfp = rcu_fwds; + rcu_fwds = NULL; + kfree(rfp); +} + /* Callback function for RCU barrier testing. */ static void rcu_torture_barrier_cbf(struct rcu_head *rcu) { @@ -2448,7 +2459,7 @@ rcu_torture_cleanup(void) show_rcu_gp_kthreads(); rcu_torture_read_exit_cleanup(); rcu_torture_barrier_cleanup(); - torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); + rcu_torture_fwd_prog_cleanup(); torture_stop_kthread(rcu_torture_stall, stall_task); torture_stop_kthread(rcu_torture_writer, writer_task); -- cgit v1.2.3 From 57f602022e82ee8fa6476d0e16ddbaf3eb86b245 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 20 Jul 2020 08:34:07 -0700 Subject: rcutorture: Properly synchronize with OOM notifier The current rcutorture forward-progress code assumes that it is the only cause of out-of-memory (OOM) events. For script-based rcutorture testing, this assumption is in fact correct. However, testing based on modprobe/rmmod might well encounter external OOM events, which could happen at any time. This commit therefore properly synchronizes the interaction between rcutorture's forward-progress testing and its OOM notifier by adding a global mutex. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7942be453a14..2b3f04e0af03 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1796,6 +1796,7 @@ struct rcu_fwd { unsigned long rcu_launder_gp_seq_start; }; +static DEFINE_MUTEX(rcu_fwd_mutex); static struct rcu_fwd *rcu_fwds; static bool rcu_fwd_emergency_stop; @@ -2062,8 +2063,14 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { - struct rcu_fwd *rfp = rcu_fwds; + struct rcu_fwd *rfp; + mutex_lock(&rcu_fwd_mutex); + rfp = rcu_fwds; + if (!rfp) { + mutex_unlock(&rcu_fwd_mutex); + return NOTIFY_OK; + } WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); rcu_torture_fwd_cb_hist(rfp); @@ -2081,6 +2088,7 @@ static int rcutorture_oom_notify(struct notifier_block *self, smp_mb(); /* Frees before return to avoid redoing OOM. */ (*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */ pr_info("%s returning after OOM processing.\n", __func__); + mutex_unlock(&rcu_fwd_mutex); return NOTIFY_OK; } @@ -2148,7 +2156,9 @@ static int __init rcu_torture_fwd_prog_init(void) return -ENOMEM; spin_lock_init(&rfp->rcu_fwd_lock); rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; + mutex_lock(&rcu_fwd_mutex); rcu_fwds = rfp; + mutex_unlock(&rcu_fwd_mutex); return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); } @@ -2158,7 +2168,9 @@ static void rcu_torture_fwd_prog_cleanup(void) torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); rfp = rcu_fwds; + mutex_lock(&rcu_fwd_mutex); rcu_fwds = NULL; + mutex_unlock(&rcu_fwd_mutex); kfree(rfp); } -- cgit v1.2.3 From 58db5785b0d76be4582a32a7900acce88e691d36 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 16 Jul 2020 15:38:56 +0100 Subject: refperf: Avoid null pointer dereference when buf fails to allocate Currently in the unlikely event that buf fails to be allocated it is dereferenced a few times. Use the errexit flag to determine if buf should be written to to avoid the null pointer dereferences. Addresses-Coverity: ("Dereference after null check") Fixes: f518f154ecef ("refperf: Dynamically allocate experiment-summary output buffer") Signed-off-by: Colin Ian King Signed-off-by: Paul E. McKenney --- kernel/rcu/refscale.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index d9291f883b54..952595c678b3 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -546,9 +546,11 @@ static int main_func(void *arg) // Print the average of all experiments SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); - buf[0] = 0; - strcat(buf, "\n"); - strcat(buf, "Runs\tTime(ns)\n"); + if (!errexit) { + buf[0] = 0; + strcat(buf, "\n"); + strcat(buf, "Runs\tTime(ns)\n"); + } for (exp = 0; exp < nruns; exp++) { u64 avg; -- cgit v1.2.3 From 299c7d94f635ab93ffb0468aec6b6e2176ec5cbf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 22 Jul 2020 10:45:12 -0700 Subject: rcutorture: Hoist OOM registry up one level Currently, registering and unregistering the OOM notifier is done right before and after the test, respectively. This will not work well for multi-threaded tests, so this commit hoists this registering and unregistering up into the rcu_torture_fwd_prog_init() and rcu_torture_fwd_prog_cleanup() functions. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2b3f04e0af03..983f82fccb18 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2110,13 +2110,11 @@ static int rcu_torture_fwd_prog(void *args) do { schedule_timeout_interruptible(fwd_progress_holdoff * HZ); WRITE_ONCE(rcu_fwd_emergency_stop, false); - register_oom_notifier(&rcutorture_oom_nb); if (!IS_ENABLED(CONFIG_TINY_RCU) || rcu_inkernel_boot_has_ended()) rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); if (rcu_inkernel_boot_has_ended()) rcu_torture_fwd_prog_cr(rfp); - unregister_oom_notifier(&rcutorture_oom_nb); /* Avoid slow periods, better to test when busy. */ stutter_wait("rcu_torture_fwd_prog"); @@ -2159,6 +2157,7 @@ static int __init rcu_torture_fwd_prog_init(void) mutex_lock(&rcu_fwd_mutex); rcu_fwds = rfp; mutex_unlock(&rcu_fwd_mutex); + register_oom_notifier(&rcutorture_oom_nb); return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); } @@ -2171,6 +2170,7 @@ static void rcu_torture_fwd_prog_cleanup(void) mutex_lock(&rcu_fwd_mutex); rcu_fwds = NULL; mutex_unlock(&rcu_fwd_mutex); + unregister_oom_notifier(&rcutorture_oom_nb); kfree(rfp); } -- cgit v1.2.3 From d685514260e21aabd65a9aa8be045766bdaa0549 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Aug 2020 10:33:39 -0700 Subject: rcutorture: Allow pointer leaks to test diagnostic code This commit adds an rcutorture.leakpointer module parameter that intentionally leaks an RCU-protected pointer out of the RCU read-side critical section and checks to see if the corresponding grace period has elapsed, emitting a WARN_ON_ONCE() if so. This module parameter can be used to test facilities like CONFIG_RCU_STRICT_GRACE_PERIOD that end grace periods quickly. While in the area, also document rcutorture.irqreader, which was previously left out. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 983f82fccb18..916ea4f66e4b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -87,6 +87,7 @@ torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); +torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing"); torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads"); @@ -1401,6 +1402,9 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) preempt_enable(); rcutorture_one_extend(&readstate, 0, trsp, rtrsp); WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); + // This next splat is expected behavior if leakpointer, especially + // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. + WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1); /* If error or close call, record the sequence of reader protections. */ if ((pipe_count > 1 || completed > 1) && !xchg(&err_segs_recorded, 1)) { -- cgit v1.2.3 From ec02821c1d35f93b821bc9fdfa83a5f3e9d7275d Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Tue, 18 Aug 2020 06:26:51 +0000 Subject: alarmtimer: Convert comma to semicolon Replace a comma between expression statements by a semicolon. Signed-off-by: Xu Wang Signed-off-by: Thomas Gleixner Reviewed-by: Stephen Boyd Link: https://lore.kernel.org/r/20200818062651.21680-1-vulab@iscas.ac.cn --- kernel/time/alarmtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index ca223a89530a..f4ace1bf8382 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -908,7 +908,7 @@ static int __init alarmtimer_init(void) /* Initialize alarm bases */ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; alarm_bases[ALARM_REALTIME].get_ktime = &ktime_get_real; - alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64, + alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64; alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime; alarm_bases[ALARM_BOOTTIME].get_timespec = get_boottime_timespec; -- cgit v1.2.3 From 450af8d0f6be2e7dd2a528a3fb054bb726bf1747 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 25 Aug 2020 20:29:16 +0200 Subject: bpf: Split bpf_local_storage to bpf_sk_storage A purely mechanical change: bpf_sk_storage.c = bpf_sk_storage.c + bpf_local_storage.c bpf_sk_storage.h = bpf_sk_storage.h + bpf_local_storage.h Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200825182919.1118197-5-kpsingh@chromium.org --- kernel/bpf/Makefile | 1 + kernel/bpf/bpf_local_storage.c | 600 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 601 insertions(+) create mode 100644 kernel/bpf/bpf_local_storage.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 19e137aae40e..6961ff400cba 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o +obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o obj-$(CONFIG_BPF_SYSCALL) += offload.o obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o endif diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c new file mode 100644 index 000000000000..ffa7d11fc2bd --- /dev/null +++ b/kernel/bpf/bpf_local_storage.c @@ -0,0 +1,600 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE) + +static struct bpf_local_storage_map_bucket * +select_bucket(struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem) +{ + return &smap->buckets[hash_ptr(selem, smap->bucket_log)]; +} + +static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) +{ + struct bpf_map *map = &smap->map; + + if (!map->ops->map_local_storage_charge) + return 0; + + return map->ops->map_local_storage_charge(smap, owner, size); +} + +static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner, + u32 size) +{ + struct bpf_map *map = &smap->map; + + if (map->ops->map_local_storage_uncharge) + map->ops->map_local_storage_uncharge(smap, owner, size); +} + +static struct bpf_local_storage __rcu ** +owner_storage(struct bpf_local_storage_map *smap, void *owner) +{ + struct bpf_map *map = &smap->map; + + return map->ops->map_owner_storage_ptr(owner); +} + +static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem) +{ + return !hlist_unhashed(&selem->snode); +} + +static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) +{ + return !hlist_unhashed(&selem->map_node); +} + +struct bpf_local_storage_elem * +bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, + void *value, bool charge_mem) +{ + struct bpf_local_storage_elem *selem; + + if (charge_mem && mem_charge(smap, owner, smap->elem_size)) + return NULL; + + selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (selem) { + if (value) + memcpy(SDATA(selem)->data, value, smap->map.value_size); + return selem; + } + + if (charge_mem) + mem_uncharge(smap, owner, smap->elem_size); + + return NULL; +} + +/* local_storage->lock must be held and selem->local_storage == local_storage. + * The caller must ensure selem->smap is still valid to be + * dereferenced for its smap->elem_size and smap->cache_idx. + */ +bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem, + bool uncharge_mem) +{ + struct bpf_local_storage_map *smap; + bool free_local_storage; + void *owner; + + smap = rcu_dereference(SDATA(selem)->smap); + owner = local_storage->owner; + + /* All uncharging on the owner must be done first. + * The owner may be freed once the last selem is unlinked + * from local_storage. + */ + if (uncharge_mem) + mem_uncharge(smap, owner, smap->elem_size); + + free_local_storage = hlist_is_singular_node(&selem->snode, + &local_storage->list); + if (free_local_storage) { + mem_uncharge(smap, owner, sizeof(struct bpf_local_storage)); + local_storage->owner = NULL; + + /* After this RCU_INIT, owner may be freed and cannot be used */ + RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); + + /* local_storage is not freed now. local_storage->lock is + * still held and raw_spin_unlock_bh(&local_storage->lock) + * will be done by the caller. + * + * Although the unlock will be done under + * rcu_read_lock(), it is more intutivie to + * read if kfree_rcu(local_storage, rcu) is done + * after the raw_spin_unlock_bh(&local_storage->lock). + * + * Hence, a "bool free_local_storage" is returned + * to the caller which then calls the kfree_rcu() + * after unlock. + */ + } + hlist_del_init_rcu(&selem->snode); + if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) == + SDATA(selem)) + RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); + + kfree_rcu(selem, rcu); + + return free_local_storage; +} + +static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) +{ + struct bpf_local_storage *local_storage; + bool free_local_storage = false; + + if (unlikely(!selem_linked_to_storage(selem))) + /* selem has already been unlinked from sk */ + return; + + local_storage = rcu_dereference(selem->local_storage); + raw_spin_lock_bh(&local_storage->lock); + if (likely(selem_linked_to_storage(selem))) + free_local_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, true); + raw_spin_unlock_bh(&local_storage->lock); + + if (free_local_storage) + kfree_rcu(local_storage, rcu); +} + +void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem) +{ + RCU_INIT_POINTER(selem->local_storage, local_storage); + hlist_add_head(&selem->snode, &local_storage->list); +} + +void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) +{ + struct bpf_local_storage_map *smap; + struct bpf_local_storage_map_bucket *b; + + if (unlikely(!selem_linked_to_map(selem))) + /* selem has already be unlinked from smap */ + return; + + smap = rcu_dereference(SDATA(selem)->smap); + b = select_bucket(smap, selem); + raw_spin_lock_bh(&b->lock); + if (likely(selem_linked_to_map(selem))) + hlist_del_init_rcu(&selem->map_node); + raw_spin_unlock_bh(&b->lock); +} + +void bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem) +{ + struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem); + + raw_spin_lock_bh(&b->lock); + RCU_INIT_POINTER(SDATA(selem)->smap, smap); + hlist_add_head_rcu(&selem->map_node, &b->list); + raw_spin_unlock_bh(&b->lock); +} + +void bpf_selem_unlink(struct bpf_local_storage_elem *selem) +{ + /* Always unlink from map before unlinking from local_storage + * because selem will be freed after successfully unlinked from + * the local_storage. + */ + bpf_selem_unlink_map(selem); + __bpf_selem_unlink_storage(selem); +} + +struct bpf_local_storage_data * +bpf_local_storage_lookup(struct bpf_local_storage *local_storage, + struct bpf_local_storage_map *smap, + bool cacheit_lockit) +{ + struct bpf_local_storage_data *sdata; + struct bpf_local_storage_elem *selem; + + /* Fast path (cache hit) */ + sdata = rcu_dereference(local_storage->cache[smap->cache_idx]); + if (sdata && rcu_access_pointer(sdata->smap) == smap) + return sdata; + + /* Slow path (cache miss) */ + hlist_for_each_entry_rcu(selem, &local_storage->list, snode) + if (rcu_access_pointer(SDATA(selem)->smap) == smap) + break; + + if (!selem) + return NULL; + + sdata = SDATA(selem); + if (cacheit_lockit) { + /* spinlock is needed to avoid racing with the + * parallel delete. Otherwise, publishing an already + * deleted sdata to the cache will become a use-after-free + * problem in the next bpf_local_storage_lookup(). + */ + raw_spin_lock_bh(&local_storage->lock); + if (selem_linked_to_storage(selem)) + rcu_assign_pointer(local_storage->cache[smap->cache_idx], + sdata); + raw_spin_unlock_bh(&local_storage->lock); + } + + return sdata; +} + +static int check_flags(const struct bpf_local_storage_data *old_sdata, + u64 map_flags) +{ + if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) + /* elem already exists */ + return -EEXIST; + + if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) + /* elem doesn't exist, cannot update it */ + return -ENOENT; + + return 0; +} + +int bpf_local_storage_alloc(void *owner, + struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *first_selem) +{ + struct bpf_local_storage *prev_storage, *storage; + struct bpf_local_storage **owner_storage_ptr; + int err; + + err = mem_charge(smap, owner, sizeof(*storage)); + if (err) + return err; + + storage = kzalloc(sizeof(*storage), GFP_ATOMIC | __GFP_NOWARN); + if (!storage) { + err = -ENOMEM; + goto uncharge; + } + + INIT_HLIST_HEAD(&storage->list); + raw_spin_lock_init(&storage->lock); + storage->owner = owner; + + bpf_selem_link_storage_nolock(storage, first_selem); + bpf_selem_link_map(smap, first_selem); + + owner_storage_ptr = + (struct bpf_local_storage **)owner_storage(smap, owner); + /* Publish storage to the owner. + * Instead of using any lock of the kernel object (i.e. owner), + * cmpxchg will work with any kernel object regardless what + * the running context is, bh, irq...etc. + * + * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage) + * is protected by the storage->lock. Hence, when freeing + * the owner->storage, the storage->lock must be held before + * setting owner->storage ptr to NULL. + */ + prev_storage = cmpxchg(owner_storage_ptr, NULL, storage); + if (unlikely(prev_storage)) { + bpf_selem_unlink_map(first_selem); + err = -EAGAIN; + goto uncharge; + + /* Note that even first_selem was linked to smap's + * bucket->list, first_selem can be freed immediately + * (instead of kfree_rcu) because + * bpf_local_storage_map_free() does a + * synchronize_rcu() before walking the bucket->list. + * Hence, no one is accessing selem from the + * bucket->list under rcu_read_lock(). + */ + } + + return 0; + +uncharge: + kfree(storage); + mem_uncharge(smap, owner, sizeof(*storage)); + return err; +} + +/* sk cannot be going away because it is linking new elem + * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0). + * Otherwise, it will become a leak (and other memory issues + * during map destruction). + */ +struct bpf_local_storage_data * +bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *old_sdata = NULL; + struct bpf_local_storage_elem *selem; + struct bpf_local_storage *local_storage; + int err; + + /* BPF_EXIST and BPF_NOEXIST cannot be both set */ + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || + /* BPF_F_LOCK can only be used in a value with spin_lock */ + unlikely((map_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(&smap->map))) + return ERR_PTR(-EINVAL); + + local_storage = rcu_dereference(*owner_storage(smap, owner)); + if (!local_storage || hlist_empty(&local_storage->list)) { + /* Very first elem for the owner */ + err = check_flags(NULL, map_flags); + if (err) + return ERR_PTR(err); + + selem = bpf_selem_alloc(smap, owner, value, true); + if (!selem) + return ERR_PTR(-ENOMEM); + + err = bpf_local_storage_alloc(owner, smap, selem); + if (err) { + kfree(selem); + mem_uncharge(smap, owner, smap->elem_size); + return ERR_PTR(err); + } + + return SDATA(selem); + } + + if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) { + /* Hoping to find an old_sdata to do inline update + * such that it can avoid taking the local_storage->lock + * and changing the lists. + */ + old_sdata = + bpf_local_storage_lookup(local_storage, smap, false); + err = check_flags(old_sdata, map_flags); + if (err) + return ERR_PTR(err); + if (old_sdata && selem_linked_to_storage(SELEM(old_sdata))) { + copy_map_value_locked(&smap->map, old_sdata->data, + value, false); + return old_sdata; + } + } + + raw_spin_lock_bh(&local_storage->lock); + + /* Recheck local_storage->list under local_storage->lock */ + if (unlikely(hlist_empty(&local_storage->list))) { + /* A parallel del is happening and local_storage is going + * away. It has just been checked before, so very + * unlikely. Return instead of retry to keep things + * simple. + */ + err = -EAGAIN; + goto unlock_err; + } + + old_sdata = bpf_local_storage_lookup(local_storage, smap, false); + err = check_flags(old_sdata, map_flags); + if (err) + goto unlock_err; + + if (old_sdata && (map_flags & BPF_F_LOCK)) { + copy_map_value_locked(&smap->map, old_sdata->data, value, + false); + selem = SELEM(old_sdata); + goto unlock; + } + + /* local_storage->lock is held. Hence, we are sure + * we can unlink and uncharge the old_sdata successfully + * later. Hence, instead of charging the new selem now + * and then uncharge the old selem later (which may cause + * a potential but unnecessary charge failure), avoid taking + * a charge at all here (the "!old_sdata" check) and the + * old_sdata will not be uncharged later during + * bpf_selem_unlink_storage_nolock(). + */ + selem = bpf_selem_alloc(smap, owner, value, !old_sdata); + if (!selem) { + err = -ENOMEM; + goto unlock_err; + } + + /* First, link the new selem to the map */ + bpf_selem_link_map(smap, selem); + + /* Second, link (and publish) the new selem to local_storage */ + bpf_selem_link_storage_nolock(local_storage, selem); + + /* Third, remove old selem, SELEM(old_sdata) */ + if (old_sdata) { + bpf_selem_unlink_map(SELEM(old_sdata)); + bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), + false); + } + +unlock: + raw_spin_unlock_bh(&local_storage->lock); + return SDATA(selem); + +unlock_err: + raw_spin_unlock_bh(&local_storage->lock); + return ERR_PTR(err); +} + +u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) +{ + u64 min_usage = U64_MAX; + u16 i, res = 0; + + spin_lock(&cache->idx_lock); + + for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) { + if (cache->idx_usage_counts[i] < min_usage) { + min_usage = cache->idx_usage_counts[i]; + res = i; + + /* Found a free cache_idx */ + if (!min_usage) + break; + } + } + cache->idx_usage_counts[res]++; + + spin_unlock(&cache->idx_lock); + + return res; +} + +void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, + u16 idx) +{ + spin_lock(&cache->idx_lock); + cache->idx_usage_counts[idx]--; + spin_unlock(&cache->idx_lock); +} + +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map_bucket *b; + unsigned int i; + + /* Note that this map might be concurrently cloned from + * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone + * RCU read section to finish before proceeding. New RCU + * read sections should be prevented via bpf_map_inc_not_zero. + */ + synchronize_rcu(); + + /* bpf prog and the userspace can no longer access this map + * now. No new selem (of this map) can be added + * to the owner->storage or to the map bucket's list. + * + * The elem of this map can be cleaned up here + * or when the storage is freed e.g. + * by bpf_sk_storage_free() during __sk_destruct(). + */ + for (i = 0; i < (1U << smap->bucket_log); i++) { + b = &smap->buckets[i]; + + rcu_read_lock(); + /* No one is adding to b->list now */ + while ((selem = hlist_entry_safe( + rcu_dereference_raw(hlist_first_rcu(&b->list)), + struct bpf_local_storage_elem, map_node))) { + bpf_selem_unlink(selem); + cond_resched_rcu(); + } + rcu_read_unlock(); + } + + /* While freeing the storage we may still need to access the map. + * + * e.g. when bpf_sk_storage_free() has unlinked selem from the map + * which then made the above while((selem = ...)) loop + * exit immediately. + * + * However, while freeing the storage one still needs to access the + * smap->elem_size to do the uncharging in + * bpf_selem_unlink_storage_nolock(). + * + * Hence, wait another rcu grace period for the storage to be freed. + */ + synchronize_rcu(); + + kvfree(smap->buckets); + kfree(smap); +} + +int bpf_local_storage_map_alloc_check(union bpf_attr *attr) +{ + if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK || + !(attr->map_flags & BPF_F_NO_PREALLOC) || + attr->max_entries || + attr->key_size != sizeof(int) || !attr->value_size || + /* Enforce BTF for userspace sk dumping */ + !attr->btf_key_type_id || !attr->btf_value_type_id) + return -EINVAL; + + if (!bpf_capable()) + return -EPERM; + + if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE) + return -E2BIG; + + return 0; +} + +struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_local_storage_map *smap; + unsigned int i; + u32 nbuckets; + u64 cost; + int ret; + + smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN); + if (!smap) + return ERR_PTR(-ENOMEM); + bpf_map_init_from_attr(&smap->map, attr); + + nbuckets = roundup_pow_of_two(num_possible_cpus()); + /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ + nbuckets = max_t(u32, 2, nbuckets); + smap->bucket_log = ilog2(nbuckets); + cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); + + ret = bpf_map_charge_init(&smap->map.memory, cost); + if (ret < 0) { + kfree(smap); + return ERR_PTR(ret); + } + + smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, + GFP_USER | __GFP_NOWARN); + if (!smap->buckets) { + bpf_map_charge_finish(&smap->map.memory); + kfree(smap); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < nbuckets; i++) { + INIT_HLIST_HEAD(&smap->buckets[i].list); + raw_spin_lock_init(&smap->buckets[i].lock); + } + + smap->elem_size = + sizeof(struct bpf_local_storage_elem) + attr->value_size; + + return smap; +} + +int bpf_local_storage_map_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + u32 int_data; + + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + return -EINVAL; + + int_data = *(u32 *)(key_type + 1); + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) + return -EINVAL; + + return 0; +} -- cgit v1.2.3 From 8ea636848aca35b9f97c5b5dee30225cf2dd0fe6 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 25 Aug 2020 20:29:17 +0200 Subject: bpf: Implement bpf_local_storage for inodes Similar to bpf_local_storage for sockets, add local storage for inodes. The life-cycle of storage is managed with the life-cycle of the inode. i.e. the storage is destroyed along with the owning inode. The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the security blob which are now stackable and can co-exist with other LSMs. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200825182919.1118197-6-kpsingh@chromium.org --- kernel/bpf/Makefile | 1 + kernel/bpf/bpf_inode_storage.c | 273 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 3 +- kernel/bpf/verifier.c | 10 ++ 4 files changed, 286 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/bpf_inode_storage.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 6961ff400cba..bdc8cd1b6767 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -5,6 +5,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o +obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c new file mode 100644 index 000000000000..f3a44e929447 --- /dev/null +++ b/kernel/bpf/bpf_inode_storage.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Facebook + * Copyright 2020 Google LLC. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_BPF_STORAGE_CACHE(inode_cache); + +static struct bpf_local_storage __rcu ** +inode_storage_ptr(void *owner) +{ + struct inode *inode = owner; + struct bpf_storage_blob *bsb; + + bsb = bpf_inode(inode); + if (!bsb) + return NULL; + return &bsb->storage; +} + +static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode, + struct bpf_map *map, + bool cacheit_lockit) +{ + struct bpf_local_storage *inode_storage; + struct bpf_local_storage_map *smap; + struct bpf_storage_blob *bsb; + + bsb = bpf_inode(inode); + if (!bsb) + return NULL; + + inode_storage = rcu_dereference(bsb->storage); + if (!inode_storage) + return NULL; + + smap = (struct bpf_local_storage_map *)map; + return bpf_local_storage_lookup(inode_storage, smap, cacheit_lockit); +} + +void bpf_inode_storage_free(struct inode *inode) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage *local_storage; + bool free_inode_storage = false; + struct bpf_storage_blob *bsb; + struct hlist_node *n; + + bsb = bpf_inode(inode); + if (!bsb) + return; + + rcu_read_lock(); + + local_storage = rcu_dereference(bsb->storage); + if (!local_storage) { + rcu_read_unlock(); + return; + } + + /* Netiher the bpf_prog nor the bpf-map's syscall + * could be modifying the local_storage->list now. + * Thus, no elem can be added-to or deleted-from the + * local_storage->list by the bpf_prog or by the bpf-map's syscall. + * + * It is racing with bpf_local_storage_map_free() alone + * when unlinking elem from the local_storage->list and + * the map's bucket->list. + */ + raw_spin_lock_bh(&local_storage->lock); + hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { + /* Always unlink from map before unlinking from + * local_storage. + */ + bpf_selem_unlink_map(selem); + free_inode_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, false); + } + raw_spin_unlock_bh(&local_storage->lock); + rcu_read_unlock(); + + /* free_inoode_storage should always be true as long as + * local_storage->list was non-empty. + */ + if (free_inode_storage) + kfree_rcu(local_storage, rcu); +} + +static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_local_storage_data *sdata; + struct file *f; + int fd; + + fd = *(int *)key; + f = fget_raw(fd); + if (!f) + return NULL; + + sdata = inode_storage_lookup(f->f_inode, map, true); + fput(f); + return sdata ? sdata->data : NULL; +} + +static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *sdata; + struct file *f; + int fd; + + fd = *(int *)key; + f = fget_raw(fd); + if (!f || !inode_storage_ptr(f->f_inode)) + return -EBADF; + + sdata = bpf_local_storage_update(f->f_inode, + (struct bpf_local_storage_map *)map, + value, map_flags); + fput(f); + return PTR_ERR_OR_ZERO(sdata); +} + +static int inode_storage_delete(struct inode *inode, struct bpf_map *map) +{ + struct bpf_local_storage_data *sdata; + + sdata = inode_storage_lookup(inode, map, false); + if (!sdata) + return -ENOENT; + + bpf_selem_unlink(SELEM(sdata)); + + return 0; +} + +static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct file *f; + int fd, err; + + fd = *(int *)key; + f = fget_raw(fd); + if (!f) + return -EBADF; + + err = inode_storage_delete(f->f_inode, map); + fput(f); + return err; +} + +BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, + void *, value, u64, flags) +{ + struct bpf_local_storage_data *sdata; + + if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) + return (unsigned long)NULL; + + /* explicitly check that the inode_storage_ptr is not + * NULL as inode_storage_lookup returns NULL in this case and + * bpf_local_storage_update expects the owner to have a + * valid storage pointer. + */ + if (!inode_storage_ptr(inode)) + return (unsigned long)NULL; + + sdata = inode_storage_lookup(inode, map, true); + if (sdata) + return (unsigned long)sdata->data; + + /* This helper must only called from where the inode is gurranteed + * to have a refcount and cannot be freed. + */ + if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + sdata = bpf_local_storage_update( + inode, (struct bpf_local_storage_map *)map, value, + BPF_NOEXIST); + return IS_ERR(sdata) ? (unsigned long)NULL : + (unsigned long)sdata->data; + } + + return (unsigned long)NULL; +} + +BPF_CALL_2(bpf_inode_storage_delete, + struct bpf_map *, map, struct inode *, inode) +{ + /* This helper must only called from where the inode is gurranteed + * to have a refcount and cannot be freed. + */ + return inode_storage_delete(inode, map); +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -ENOTSUPP; +} + +static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_local_storage_map *smap; + + smap = bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + + smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache); + return &smap->map; +} + +static void inode_storage_map_free(struct bpf_map *map) +{ + struct bpf_local_storage_map *smap; + + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx); + bpf_local_storage_map_free(smap); +} + +static int inode_storage_map_btf_id; +const struct bpf_map_ops inode_storage_map_ops = { + .map_alloc_check = bpf_local_storage_map_alloc_check, + .map_alloc = inode_storage_map_alloc, + .map_free = inode_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_fd_inode_storage_lookup_elem, + .map_update_elem = bpf_fd_inode_storage_update_elem, + .map_delete_elem = bpf_fd_inode_storage_delete_elem, + .map_check_btf = bpf_local_storage_map_check_btf, + .map_btf_name = "bpf_local_storage_map", + .map_btf_id = &inode_storage_map_btf_id, + .map_owner_storage_ptr = inode_storage_ptr, +}; + +BTF_ID_LIST(bpf_inode_storage_btf_ids) +BTF_ID_UNUSED +BTF_ID(struct, inode) + +const struct bpf_func_proto bpf_inode_storage_get_proto = { + .func = bpf_inode_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, + .btf_id = bpf_inode_storage_btf_ids, +}; + +const struct bpf_func_proto bpf_inode_storage_delete_proto = { + .func = bpf_inode_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .btf_id = bpf_inode_storage_btf_ids, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b46e973faee9..5443cea86cef 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -769,7 +769,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && - map->map_type != BPF_MAP_TYPE_SK_STORAGE) + map->map_type != BPF_MAP_TYPE_SK_STORAGE && + map->map_type != BPF_MAP_TYPE_INODE_STORAGE) return -ENOTSUPP; if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > map->value_size) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dd24503ab3d3..38748794518e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4311,6 +4311,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sk_storage_delete) goto error; break; + case BPF_MAP_TYPE_INODE_STORAGE: + if (func_id != BPF_FUNC_inode_storage_get && + func_id != BPF_FUNC_inode_storage_delete) + goto error; + break; default: break; } @@ -4384,6 +4389,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) goto error; break; + case BPF_FUNC_inode_storage_get: + case BPF_FUNC_inode_storage_delete: + if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE) + goto error; + break; default: break; } -- cgit v1.2.3 From 30897832d8b97e93833fb52c0a02951db3692ed2 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 25 Aug 2020 20:29:18 +0200 Subject: bpf: Allow local storage to be used from LSM programs Adds support for both bpf_{sk, inode}_storage_{get, delete} to be used in LSM programs. These helpers are not used for tracing programs (currently) as their usage is tied to the life-cycle of the object and should only be used where the owning object won't be freed (when the owning object is passed as an argument to the LSM hook). Thus, they are safer to use in LSM hooks than tracing. Usage of local storage in tracing programs will probably follow a per function based whitelist approach. Since the UAPI helper signature for bpf_sk_storage expect a bpf_sock, it, leads to a compilation warning for LSM programs, it's also updated to accept a void * pointer instead. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200825182919.1118197-7-kpsingh@chromium.org --- kernel/bpf/bpf_lsm.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index fb278144e9fd..9cd1428c7199 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. @@ -45,10 +47,27 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, return 0; } +static const struct bpf_func_proto * +bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_inode_storage_get: + return &bpf_inode_storage_get_proto; + case BPF_FUNC_inode_storage_delete: + return &bpf_inode_storage_delete_proto; + case BPF_FUNC_sk_storage_get: + return &sk_storage_get_btf_proto; + case BPF_FUNC_sk_storage_delete: + return &sk_storage_delete_btf_proto; + default: + return tracing_prog_func_proto(func_id, prog); + } +} + const struct bpf_prog_ops lsm_prog_ops = { }; const struct bpf_verifier_ops lsm_verifier_ops = { - .get_func_proto = tracing_prog_func_proto, + .get_func_proto = bpf_lsm_func_proto, .is_valid_access = btf_ctx_access, }; -- cgit v1.2.3 From 2532f849b5134c4c62a20e5aaca33d9fb08af528 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 25 Aug 2020 15:23:46 -0700 Subject: bpf: Disallow BPF_PRELOAD in allmodconfig builds The CC_CAN_LINK checks that the host compiler can link, but bpf_preload relies on libbpf which in turn needs libelf to be present during linking. allmodconfig runs in odd setups with cross compilers and missing host libraries like libelf. Instead of extending kconfig with every possible library that bpf_preload might need disallow building BPF_PRELOAD in such build-only configurations. Signed-off-by: Alexei Starovoitov --- kernel/bpf/preload/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig index 7144e2d01ee4..ace49111d3a3 100644 --- a/kernel/bpf/preload/Kconfig +++ b/kernel/bpf/preload/Kconfig @@ -6,6 +6,9 @@ config USERMODE_DRIVER menuconfig BPF_PRELOAD bool "Preload BPF file system with kernel specific program and map iterators" depends on BPF + # The dependency on !COMPILE_TEST prevents it from being enabled + # in allmodconfig or allyesconfig configurations + depends on !COMPILE_TEST select USERMODE_DRIVER help This builds kernel module with several embedded BPF programs that are -- cgit v1.2.3 From 6298399bfc101f8e8cf35a916f26aa32bdf04278 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Aug 2020 21:21:13 +0200 Subject: bpf: Move btf_resolve_size into __btf_resolve_size Moving btf_resolve_size into __btf_resolve_size and keeping btf_resolve_size public with just first 3 arguments, because the rest of the arguments are not used by outside callers. Following changes are adding more arguments, which are not useful to outside callers. They will be added to the __btf_resolve_size function. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200825192124.710397-4-jolsa@kernel.org --- kernel/bpf/bpf_struct_ops.c | 6 ++---- kernel/bpf/btf.c | 21 ++++++++++++++------- 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 969c5d47f81f..4c3b543bb33b 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -298,8 +298,7 @@ static int check_zero_holes(const struct btf_type *t, void *data) return -EINVAL; mtype = btf_type_by_id(btf_vmlinux, member->type); - mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, - NULL, NULL); + mtype = btf_resolve_size(btf_vmlinux, mtype, &msize); if (IS_ERR(mtype)) return PTR_ERR(mtype); prev_mend = moff + msize; @@ -396,8 +395,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, u32 msize; mtype = btf_type_by_id(btf_vmlinux, member->type); - mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, - NULL, NULL); + mtype = btf_resolve_size(btf_vmlinux, mtype, &msize); if (IS_ERR(mtype)) { err = PTR_ERR(mtype); goto reset_unlock; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 91afdd4c82e3..6ed4ecc60381 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1088,10 +1088,10 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) * *elem_type: same as return type ("struct X") * *total_nelems: 1 */ -const struct btf_type * -btf_resolve_size(const struct btf *btf, const struct btf_type *type, - u32 *type_size, const struct btf_type **elem_type, - u32 *total_nelems) +static const struct btf_type * +__btf_resolve_size(const struct btf *btf, const struct btf_type *type, + u32 *type_size, const struct btf_type **elem_type, + u32 *total_nelems) { const struct btf_type *array_type = NULL; const struct btf_array *array; @@ -1150,6 +1150,13 @@ resolved: return array_type ? : type; } +const struct btf_type * +btf_resolve_size(const struct btf *btf, const struct btf_type *type, + u32 *type_size) +{ + return __btf_resolve_size(btf, type, type_size, NULL, NULL); +} + /* The input param "type_id" must point to a needs_resolve type */ static const struct btf_type *btf_type_id_resolve(const struct btf *btf, u32 *type_id) @@ -3976,8 +3983,8 @@ error: mtype = btf_type_by_id(btf_vmlinux, member->type); mname = __btf_name_by_offset(btf_vmlinux, member->name_off); - mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, - &elem_type, &total_nelems); + mtype = __btf_resolve_size(btf_vmlinux, mtype, &msize, + &elem_type, &total_nelems); if (IS_ERR(mtype)) { bpf_log(log, "field %s doesn't have size\n", mname); return -EFAULT; @@ -3991,7 +3998,7 @@ error: if (btf_type_is_array(mtype)) { u32 elem_idx; - /* btf_resolve_size() above helps to + /* __btf_resolve_size() above helps to * linearize a multi-dimensional array. * * The logic here is treating an array -- cgit v1.2.3 From 69ff304792709dcdc5c8918b236cf06fe7db448f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Aug 2020 21:21:14 +0200 Subject: bpf: Add elem_id pointer as argument to __btf_resolve_size If the resolved type is array, make btf_resolve_size return also ID of the elem type. It will be needed in following changes. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200825192124.710397-5-jolsa@kernel.org --- kernel/bpf/btf.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6ed4ecc60381..dbc70fedfb44 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1079,6 +1079,7 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) * *type_size: (x * y * sizeof(u32)). Hence, *type_size always * corresponds to the return type. * *elem_type: u32 + * *elem_id: id of u32 * *total_nelems: (x * y). Hence, individual elem size is * (*type_size / *total_nelems) * @@ -1086,15 +1087,16 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) * return type: type "struct X" * *type_size: sizeof(struct X) * *elem_type: same as return type ("struct X") + * *elem_id: 0 * *total_nelems: 1 */ static const struct btf_type * __btf_resolve_size(const struct btf *btf, const struct btf_type *type, u32 *type_size, const struct btf_type **elem_type, - u32 *total_nelems) + u32 *elem_id, u32 *total_nelems) { const struct btf_type *array_type = NULL; - const struct btf_array *array; + const struct btf_array *array = NULL; u32 i, size, nelems = 1; for (i = 0; i < MAX_RESOLVE_DEPTH; i++) { @@ -1146,6 +1148,8 @@ resolved: *total_nelems = nelems; if (elem_type) *elem_type = type; + if (elem_id) + *elem_id = array ? array->type : 0; return array_type ? : type; } @@ -3984,7 +3988,7 @@ error: mname = __btf_name_by_offset(btf_vmlinux, member->name_off); mtype = __btf_resolve_size(btf_vmlinux, mtype, &msize, - &elem_type, &total_nelems); + &elem_type, NULL, &total_nelems); if (IS_ERR(mtype)) { bpf_log(log, "field %s doesn't have size\n", mname); return -EFAULT; -- cgit v1.2.3 From 887c31a39c49e261581a3d108607c9dea55b12d9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Aug 2020 21:21:15 +0200 Subject: bpf: Add type_id pointer as argument to __btf_resolve_size Adding type_id pointer as argument to __btf_resolve_size to return also BTF ID of the resolved type. It will be used in following changes. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200825192124.710397-6-jolsa@kernel.org --- kernel/bpf/btf.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dbc70fedfb44..ee0e2a5e6c88 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1082,6 +1082,7 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) * *elem_id: id of u32 * *total_nelems: (x * y). Hence, individual elem size is * (*type_size / *total_nelems) + * *type_id: id of type if it's changed within the function, 0 if not * * type: is not an array (e.g. const struct X) * return type: type "struct X" @@ -1089,15 +1090,16 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) * *elem_type: same as return type ("struct X") * *elem_id: 0 * *total_nelems: 1 + * *type_id: id of type if it's changed within the function, 0 if not */ static const struct btf_type * __btf_resolve_size(const struct btf *btf, const struct btf_type *type, u32 *type_size, const struct btf_type **elem_type, - u32 *elem_id, u32 *total_nelems) + u32 *elem_id, u32 *total_nelems, u32 *type_id) { const struct btf_type *array_type = NULL; const struct btf_array *array = NULL; - u32 i, size, nelems = 1; + u32 i, size, nelems = 1, id = 0; for (i = 0; i < MAX_RESOLVE_DEPTH; i++) { switch (BTF_INFO_KIND(type->info)) { @@ -1118,6 +1120,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type, case BTF_KIND_VOLATILE: case BTF_KIND_CONST: case BTF_KIND_RESTRICT: + id = type->type; type = btf_type_by_id(btf, type->type); break; @@ -1150,6 +1153,8 @@ resolved: *elem_type = type; if (elem_id) *elem_id = array ? array->type : 0; + if (type_id && id) + *type_id = id; return array_type ? : type; } @@ -1158,7 +1163,7 @@ const struct btf_type * btf_resolve_size(const struct btf *btf, const struct btf_type *type, u32 *type_size) { - return __btf_resolve_size(btf, type, type_size, NULL, NULL); + return __btf_resolve_size(btf, type, type_size, NULL, NULL, NULL, NULL); } /* The input param "type_id" must point to a needs_resolve type */ @@ -3988,7 +3993,7 @@ error: mname = __btf_name_by_offset(btf_vmlinux, member->name_off); mtype = __btf_resolve_size(btf_vmlinux, mtype, &msize, - &elem_type, NULL, &total_nelems); + &elem_type, NULL, &total_nelems, NULL); if (IS_ERR(mtype)) { bpf_log(log, "field %s doesn't have size\n", mname); return -EFAULT; -- cgit v1.2.3 From dafe58fc1917a96c657a0f56a12f262e5d9fb324 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Aug 2020 21:21:16 +0200 Subject: bpf: Remove recursion call in btf_struct_access Andrii suggested we can simply jump to again label instead of making recursion call. Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200825192124.710397-7-jolsa@kernel.org --- kernel/bpf/btf.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ee0e2a5e6c88..4488c5b03941 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3931,14 +3931,13 @@ again: /* Only allow structure for now, can be relaxed for * other types later. */ - elem_type = btf_type_skip_modifiers(btf_vmlinux, - array_elem->type, NULL); - if (!btf_type_is_struct(elem_type)) + t = btf_type_skip_modifiers(btf_vmlinux, array_elem->type, + NULL); + if (!btf_type_is_struct(t)) goto error; - off = (off - moff) % elem_type->size; - return btf_struct_access(log, elem_type, off, size, atype, - next_btf_id); + off = (off - moff) % t->size; + goto again; error: bpf_log(log, "access beyond struct %s at off %u size %u\n", -- cgit v1.2.3 From 1c6d28a6ac56de5b0af9239a1b02aea4c3011ea3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Aug 2020 21:21:17 +0200 Subject: bpf: Factor btf_struct_access function Adding btf_struct_walk function that walks through the struct type + given offset and returns following values: enum bpf_struct_walk_result { /* < 0 error */ WALK_SCALAR = 0, WALK_PTR, WALK_STRUCT, }; WALK_SCALAR - when SCALAR_VALUE is found WALK_PTR - when pointer value is found, its ID is stored in 'next_btf_id' output param WALK_STRUCT - when nested struct object is found, its ID is stored in 'next_btf_id' output param It will be used in following patches to get all nested struct objects for given type and offset. The btf_struct_access now calls btf_struct_walk function, as long as it gets nested structs as return value. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200825192124.710397-8-jolsa@kernel.org --- kernel/bpf/btf.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 65 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4488c5b03941..d8d64201c4e0 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3886,16 +3886,22 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; } -int btf_struct_access(struct bpf_verifier_log *log, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, - u32 *next_btf_id) +enum bpf_struct_walk_result { + /* < 0 error */ + WALK_SCALAR = 0, + WALK_PTR, + WALK_STRUCT, +}; + +static int btf_struct_walk(struct bpf_verifier_log *log, + const struct btf_type *t, int off, int size, + u32 *next_btf_id) { u32 i, moff, mtrue_end, msize = 0, total_nelems = 0; const struct btf_type *mtype, *elem_type = NULL; const struct btf_member *member; const char *tname, *mname; - u32 vlen; + u32 vlen, elem_id, mid; again: tname = __btf_name_by_offset(btf_vmlinux, t->name_off); @@ -3966,7 +3972,7 @@ error: */ if (off <= moff && BITS_ROUNDUP_BYTES(end_bit) <= off + size) - return SCALAR_VALUE; + return WALK_SCALAR; /* off may be accessing a following member * @@ -3988,11 +3994,13 @@ error: break; /* type of the field */ + mid = member->type; mtype = btf_type_by_id(btf_vmlinux, member->type); mname = __btf_name_by_offset(btf_vmlinux, member->name_off); mtype = __btf_resolve_size(btf_vmlinux, mtype, &msize, - &elem_type, NULL, &total_nelems, NULL); + &elem_type, &elem_id, &total_nelems, + &mid); if (IS_ERR(mtype)) { bpf_log(log, "field %s doesn't have size\n", mname); return -EFAULT; @@ -4054,6 +4062,7 @@ error: elem_idx = (off - moff) / msize; moff += elem_idx * msize; mtype = elem_type; + mid = elem_id; } /* the 'off' we're looking for is either equal to start @@ -4063,6 +4072,12 @@ error: /* our field must be inside that union or struct */ t = mtype; + /* return if the offset matches the member offset */ + if (off == moff) { + *next_btf_id = mid; + return WALK_STRUCT; + } + /* adjust offset we're looking for */ off -= moff; goto again; @@ -4078,11 +4093,10 @@ error: mname, moff, tname, off, size); return -EACCES; } - stype = btf_type_skip_modifiers(btf_vmlinux, mtype->type, &id); if (btf_type_is_struct(stype)) { *next_btf_id = id; - return PTR_TO_BTF_ID; + return WALK_PTR; } } @@ -4099,12 +4113,53 @@ error: return -EACCES; } - return SCALAR_VALUE; + return WALK_SCALAR; } bpf_log(log, "struct %s doesn't have field at offset %d\n", tname, off); return -EINVAL; } +int btf_struct_access(struct bpf_verifier_log *log, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype __maybe_unused, + u32 *next_btf_id) +{ + int err; + u32 id; + + do { + err = btf_struct_walk(log, t, off, size, &id); + + switch (err) { + case WALK_PTR: + /* If we found the pointer or scalar on t+off, + * we're done. + */ + *next_btf_id = id; + return PTR_TO_BTF_ID; + case WALK_SCALAR: + return SCALAR_VALUE; + case WALK_STRUCT: + /* We found nested struct, so continue the search + * by diving in it. At this point the offset is + * aligned with the new type, so set it to 0. + */ + t = btf_type_by_id(btf_vmlinux, id); + off = 0; + break; + default: + /* It's either error or unknown return value.. + * scream and leave. + */ + if (WARN_ONCE(err > 0, "unknown btf_struct_walk return value")) + return -EINVAL; + return err; + } + } while (t); + + return -EINVAL; +} + int btf_resolve_helper_id(struct bpf_verifier_log *log, const struct bpf_func_proto *fn, int arg) { -- cgit v1.2.3 From faaf4a790d93794b46d67e2fd69b8e5c8cae2d41 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Aug 2020 21:21:18 +0200 Subject: bpf: Add btf_struct_ids_match function Adding btf_struct_ids_match function to check if given address provided by BTF object + offset is also address of another nested BTF object. This allows to pass an argument to helper, which is defined via parent BTF object + offset, like for bpf_d_path (added in following changes): SEC("fentry/filp_close") int BPF_PROG(prog_close, struct file *file, void *id) { ... ret = bpf_d_path(&file->f_path, ... The first bpf_d_path argument is hold by verifier as BTF file object plus offset of f_path member. The btf_struct_ids_match function will walk the struct file object and check if there's nested struct path object on the given offset. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200825192124.710397-9-jolsa@kernel.org --- kernel/bpf/btf.c | 31 +++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 17 +++++++++++------ 2 files changed, 42 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d8d64201c4e0..df966acaaeb1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4160,6 +4160,37 @@ int btf_struct_access(struct bpf_verifier_log *log, return -EINVAL; } +bool btf_struct_ids_match(struct bpf_verifier_log *log, + int off, u32 id, u32 need_type_id) +{ + const struct btf_type *type; + int err; + + /* Are we already done? */ + if (need_type_id == id && off == 0) + return true; + +again: + type = btf_type_by_id(btf_vmlinux, id); + if (!type) + return false; + err = btf_struct_walk(log, type, off, 1, &id); + if (err != WALK_STRUCT) + return false; + + /* We found nested struct object. If it matches + * the requested ID, we're done. Otherwise let's + * continue the search with offset 0 in the new + * type. + */ + if (need_type_id != id) { + off = 0; + goto again; + } + + return true; +} + int btf_resolve_helper_id(struct bpf_verifier_log *log, const struct bpf_func_proto *fn, int arg) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 38748794518e..f003cee75d22 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3995,16 +3995,21 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, goto err_type; } } else if (arg_type == ARG_PTR_TO_BTF_ID) { + bool ids_match = false; + expected_type = PTR_TO_BTF_ID; if (type != expected_type) goto err_type; if (!fn->check_btf_id) { if (reg->btf_id != meta->btf_id) { - verbose(env, "Helper has type %s got %s in R%d\n", - kernel_type_name(meta->btf_id), - kernel_type_name(reg->btf_id), regno); - - return -EACCES; + ids_match = btf_struct_ids_match(&env->log, reg->off, reg->btf_id, + meta->btf_id); + if (!ids_match) { + verbose(env, "Helper has type %s got %s in R%d\n", + kernel_type_name(meta->btf_id), + kernel_type_name(reg->btf_id), regno); + return -EACCES; + } } } else if (!fn->check_btf_id(reg->btf_id, arg)) { verbose(env, "Helper does not support %s in R%d\n", @@ -4012,7 +4017,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return -EACCES; } - if (!tnum_is_const(reg->var_off) || reg->var_off.value || reg->off) { + if ((reg->off && !ids_match) || !tnum_is_const(reg->var_off) || reg->var_off.value) { verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", regno); return -EACCES; -- cgit v1.2.3 From eae2e83e62633a2659e3bc690facba1c2fc9c45b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Aug 2020 21:21:19 +0200 Subject: bpf: Add BTF_SET_START/END macros Adding support to define sorted set of BTF ID values. Following defines sorted set of BTF ID values: BTF_SET_START(btf_allowlist_d_path) BTF_ID(func, vfs_truncate) BTF_ID(func, vfs_fallocate) BTF_ID(func, dentry_open) BTF_ID(func, vfs_getattr) BTF_ID(func, filp_close) BTF_SET_END(btf_allowlist_d_path) It defines following 'struct btf_id_set' variable to access values and count: struct btf_id_set btf_allowlist_d_path; Adding 'allowed' callback to struct bpf_func_proto, to allow verifier the check on allowed callers. Adding btf_id_set_contains function, which will be used by allowed callbacks to verify the caller's BTF ID value is within allowed set. Also removing extra '\' in __BTF_ID_LIST macro. Added BTF_SET_START_GLOBAL macro for global sets. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200825192124.710397-10-jolsa@kernel.org --- kernel/bpf/btf.c | 14 ++++++++++++++ kernel/bpf/verifier.c | 5 +++++ 2 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index df966acaaeb1..f9ac6935ab3c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include /* BTF (BPF Type Format) is the meta data format which describes @@ -4762,3 +4764,15 @@ u32 btf_id(const struct btf *btf) { return btf->id; } + +static int btf_id_cmp_func(const void *a, const void *b) +{ + const int *pa = a, *pb = b; + + return *pa - *pb; +} + +bool btf_id_set_contains(struct btf_id_set *set, u32 id) +{ + return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f003cee75d22..7e5908b83ec7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4859,6 +4859,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } + if (fn->allowed && !fn->allowed(env->prog)) { + verbose(env, "helper call is not allowed in probe\n"); + return -EINVAL; + } + /* With LD_ABS/IND some JITs save/restore skb from r1. */ changes_data = bpf_helper_changes_pkt_data(fn->func); if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { -- cgit v1.2.3 From 6e22ab9da79343532cd3cde39df25e5a5478c692 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Aug 2020 21:21:20 +0200 Subject: bpf: Add d_path helper Adding d_path helper function that returns full path for given 'struct path' object, which needs to be the kernel BTF 'path' object. The path is returned in buffer provided 'buf' of size 'sz' and is zero terminated. bpf_d_path(&file->f_path, buf, size); The helper calls directly d_path function, so there's only limited set of function it can be called from. Adding just very modest set for the start. Updating also bpf.h tools uapi header and adding 'path' to bpf_helpers_doc.py script. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200825192124.710397-11-jolsa@kernel.org --- kernel/trace/bpf_trace.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a8d4f253ed77..d973d891f2e2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1098,6 +1098,52 @@ static const struct bpf_func_proto bpf_send_signal_thread_proto = { .arg1_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz) +{ + long len; + char *p; + + if (!sz) + return 0; + + p = d_path(path, buf, sz); + if (IS_ERR(p)) { + len = PTR_ERR(p); + } else { + len = buf + sz - p; + memmove(buf, p, len); + } + + return len; +} + +BTF_SET_START(btf_allowlist_d_path) +BTF_ID(func, vfs_truncate) +BTF_ID(func, vfs_fallocate) +BTF_ID(func, dentry_open) +BTF_ID(func, vfs_getattr) +BTF_ID(func, filp_close) +BTF_SET_END(btf_allowlist_d_path) + +static bool bpf_d_path_allowed(const struct bpf_prog *prog) +{ + return btf_id_set_contains(&btf_allowlist_d_path, prog->aux->attach_btf_id); +} + +BTF_ID_LIST(bpf_d_path_btf_ids) +BTF_ID(struct, path) + +static const struct bpf_func_proto bpf_d_path_proto = { + .func = bpf_d_path, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_d_path_btf_ids, + .allowed = bpf_d_path_allowed, +}; + const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1579,6 +1625,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->expected_attach_type == BPF_TRACE_ITER ? &bpf_seq_write_proto : NULL; + case BPF_FUNC_d_path: + return &bpf_d_path_proto; default: return raw_tp_prog_func_proto(func_id, prog); } -- cgit v1.2.3 From ec73240b1627cddfd7cef018c7fa1c32e64a721e Mon Sep 17 00:00:00 2001 From: Josh Don Date: Tue, 4 Aug 2020 12:34:13 -0700 Subject: sched/fair: Ignore cache hotness for SMT migration SMT siblings share caches, so cache hotness should be irrelevant for cross-sibling migration. Signed-off-by: Josh Don Proposed-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200804193413.510651-1-joshdon@google.com --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1a68a0536add..abdb54e2339f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7402,6 +7402,10 @@ static int task_hot(struct task_struct *p, struct lb_env *env) if (unlikely(task_has_idle_policy(p))) return 0; + /* SMT siblings share cache */ + if (env->sd->flags & SD_SHARE_CPUCAPACITY) + return 0; + /* * Buddy candidates are cache hot: */ -- cgit v1.2.3 From da0777d35f47892f359c3f73ea155870bb595700 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Mon, 10 Aug 2020 09:30:04 +0100 Subject: sched/fair: Fix wrong negative conversion in find_energy_efficient_cpu() In find_energy_efficient_cpu() 'cpu_cap' could be less that 'util'. It might be because of RT, DL (so higher sched class than CFS), irq or thermal pressure signal, which reduce the capacity value. In such situation the result of 'cpu_cap - util' might be negative but stored in the unsigned long. Then it might be compared with other unsigned long when uclamp_rq_util_with() reduced the 'util' such that is passes the fits_capacity() check. Prevent this situation and make the arithmetic more safe. Fixes: 1d42509e475cd ("sched/fair: Make EAS wakeup placement consider uclamp restrictions") Signed-off-by: Lukasz Luba Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20200810083004.26420-1-lukasz.luba@arm.com --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index abdb54e2339f..90ebaa4ae16e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6594,7 +6594,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) util = cpu_util_next(cpu, p, cpu); cpu_cap = capacity_of(cpu); - spare_cap = cpu_cap - util; + spare_cap = cpu_cap; + lsub_positive(&spare_cap, util); /* * Skip CPUs that cannot satisfy the capacity request. -- cgit v1.2.3 From 1724b95b92979a8ea8e55a4817d05b3bb7750958 Mon Sep 17 00:00:00 2001 From: Jiang Biao Date: Tue, 11 Aug 2020 19:32:09 +0800 Subject: sched/fair: Simplify the work when reweighting entity The code in reweight_entity() can be simplified. For a sched entity on the rq, the entity accounting can be replaced by cfs_rq instantaneous load updates currently called from within the entity accounting. Even though an entity on the rq can't represent a task in reweight_entity() (a task is always dequeued before calling this function) and so the numa task accounting and the rq->cfs_tasks list management of the entity accounting are never called, the redundant cfs_rq->nr_running decrement/increment will be avoided. Signed-off-by: Jiang Biao Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20200811113209.34057-1-benbjiang@tencent.com --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 90ebaa4ae16e..33699db27ed5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3084,7 +3084,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, /* commit outstanding execution time */ if (cfs_rq->curr == se) update_curr(cfs_rq); - account_entity_dequeue(cfs_rq, se); + update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); @@ -3100,7 +3100,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, enqueue_load_avg(cfs_rq, se); if (se->on_rq) - account_entity_enqueue(cfs_rq, se); + update_load_add(&cfs_rq->load, se->load.weight); } -- cgit v1.2.3 From c1cecf884ad748f63f9139d5a18ee265ee2f70fb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 19 Aug 2020 22:00:25 +0200 Subject: sched: Cache task_struct::flags in sched_submit_work() sched_submit_work() is considered to be a hot path. The preempt_disable() instruction is a compiler barrier and forces the compiler to load task_struct::flags for the second comparison. By using a local variable, the compiler can load the value once and keep it in a register for the second comparison. Verified on x86-64 with gcc-10. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200819200025.lqvmyefqnbok5i4f@linutronix.de --- kernel/sched/core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8471a0f7eb32..c36dc1ae58be 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4551,9 +4551,12 @@ void __noreturn do_task_dead(void) static inline void sched_submit_work(struct task_struct *tsk) { + unsigned int task_flags; + if (!tsk->state) return; + task_flags = tsk->flags; /* * If a worker went to sleep, notify and ask workqueue whether * it wants to wake up a task to maintain concurrency. @@ -4562,9 +4565,9 @@ static inline void sched_submit_work(struct task_struct *tsk) * in the possible wakeup of a kworker and because wq_worker_sleeping() * requires it. */ - if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { preempt_disable(); - if (tsk->flags & PF_WQ_WORKER) + if (task_flags & PF_WQ_WORKER) wq_worker_sleeping(tsk); else io_wq_worker_sleeping(tsk); -- cgit v1.2.3 From 8fca9494d4b4d6b57b1398cd473feb308df656db Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 25 Aug 2020 14:32:15 +0100 Subject: sched/topology: Move sd_flag_debug out of linux/sched/topology.h Defining an array in a header imported all over the place clearly is a daft idea, that still didn't stop me from doing it. Leave a declaration of sd_flag_debug in topology.h and move its definition to sched/debug.c. Fixes: b6e862f38672 ("sched/topology: Define and assign sched_domain flag metadata") Reported-by: Andy Shevchenko Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200825133216.9163-1-valentin.schneider@arm.com --- kernel/sched/debug.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0655524700d2..0d7896d2a0b2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -245,6 +245,12 @@ set_table_entry(struct ctl_table *entry, entry->proc_handler = proc_handler; } +#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name }, +const struct sd_flag_debug sd_flag_debug[] = { +#include +}; +#undef SD_FLAG + static int sd_ctl_doflags(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { -- cgit v1.2.3 From 4fc472f1214ef75e5450f207e23ff13af6eecad4 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 25 Aug 2020 14:32:16 +0100 Subject: sched/topology: Move SD_DEGENERATE_GROUPS_MASK out of linux/sched/topology.h SD_DEGENERATE_GROUPS_MASK is only useful for sched/topology.c, but still gets defined for anyone who imports topology.h, leading to a flurry of unused variable warnings. Move it out of the header and place it next to the SD degeneration functions in sched/topology.c. Fixes: 4ee4ea443a5d ("sched/topology: Introduce SD metaflag for flags needing > 1 groups") Reported-by: Andy Shevchenko Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200825133216.9163-2-valentin.schneider@arm.com --- kernel/sched/topology.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c674aaab312c..aa1676abc544 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -154,6 +154,13 @@ static inline bool sched_debug(void) } #endif /* CONFIG_SCHED_DEBUG */ +/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */ +#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) | +static const unsigned int SD_DEGENERATE_GROUPS_MASK = +#include +0; +#undef SD_FLAG + static int sd_degenerate(struct sched_domain *sd) { if (cpumask_weight(sched_domain_span(sd)) == 1) -- cgit v1.2.3 From e918188611f073063415f40fae568fa4d86d9044 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:20 +0800 Subject: locking: More accurate annotations for read_lock() On the archs using QUEUED_RWLOCKS, read_lock() is not always a recursive read lock, actually it's only recursive if in_interrupt() is true. So change the annotation accordingly to catch more deadlocks. Note we used to treat read_lock() as pure recursive read locks in lib/locking-seftest.c, and this is useful, especially for the lockdep development selftest, so we keep this via a variable to force switching lock annotation for read_lock(). Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-2-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 54b74fabf40c..77cd9e6520c4 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4967,6 +4967,20 @@ static bool lockdep_nmi(void) return true; } +/* + * read_lock() is recursive if: + * 1. We force lockdep think this way in selftests or + * 2. The implementation is not queued read/write lock or + * 3. The locker is at an in_interrupt() context. + */ +bool read_lock_is_recursive(void) +{ + return force_read_lock_recursive || + !IS_ENABLED(CONFIG_QUEUED_RWLOCKS) || + in_interrupt(); +} +EXPORT_SYMBOL_GPL(read_lock_is_recursive); + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: -- cgit v1.2.3 From b11be024de164213f6338973d76ab9ab139120cd Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:22 +0800 Subject: lockdep: Demagic the return value of BFS __bfs() could return four magic numbers: 1: search succeeds, but none match. 0: search succeeds, find one match. -1: search fails because of the cq is full. -2: search fails because a invalid node is found. This patch cleans things up by using a enum type for the return value of __bfs() and its friends, this improves the code readability of the code, and further, could help if we want to extend the BFS. Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-4-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 155 +++++++++++++++++++++++++++-------------------- 1 file changed, 89 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 77cd9e6520c4..462c68cfb378 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1471,28 +1471,58 @@ static inline struct list_head *get_dep_list(struct lock_list *lock, int offset) return lock_class + offset; } +/* + * Return values of a bfs search: + * + * BFS_E* indicates an error + * BFS_R* indicates a result (match or not) + * + * BFS_EINVALIDNODE: Find a invalid node in the graph. + * + * BFS_EQUEUEFULL: The queue is full while doing the bfs. + * + * BFS_RMATCH: Find the matched node in the graph, and put that node into + * *@target_entry. + * + * BFS_RNOMATCH: Haven't found the matched node and keep *@target_entry + * _unchanged_. + */ +enum bfs_result { + BFS_EINVALIDNODE = -2, + BFS_EQUEUEFULL = -1, + BFS_RMATCH = 0, + BFS_RNOMATCH = 1, +}; + +/* + * bfs_result < 0 means error + */ +static inline bool bfs_error(enum bfs_result res) +{ + return res < 0; +} /* * Forward- or backward-dependency search, used for both circular dependency * checking and hardirq-unsafe/softirq-unsafe checking. */ -static int __bfs(struct lock_list *source_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry, - int offset) +static enum bfs_result __bfs(struct lock_list *source_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry, + int offset) { struct lock_list *entry; struct lock_list *lock; struct list_head *head; struct circular_queue *cq = &lock_cq; - int ret = 1; + enum bfs_result ret = BFS_RNOMATCH; lockdep_assert_locked(); if (match(source_entry, data)) { *target_entry = source_entry; - ret = 0; + ret = BFS_RMATCH; goto exit; } @@ -1506,7 +1536,7 @@ static int __bfs(struct lock_list *source_entry, while ((lock = __cq_dequeue(cq))) { if (!lock->class) { - ret = -2; + ret = BFS_EINVALIDNODE; goto exit; } @@ -1518,12 +1548,12 @@ static int __bfs(struct lock_list *source_entry, mark_lock_accessed(entry, lock); if (match(entry, data)) { *target_entry = entry; - ret = 0; + ret = BFS_RMATCH; goto exit; } if (__cq_enqueue(cq, entry)) { - ret = -1; + ret = BFS_EQUEUEFULL; goto exit; } cq_depth = __cq_get_elem_count(cq); @@ -1536,20 +1566,22 @@ exit: return ret; } -static inline int __bfs_forwards(struct lock_list *src_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry) +static inline enum bfs_result +__bfs_forwards(struct lock_list *src_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry) { return __bfs(src_entry, data, match, target_entry, offsetof(struct lock_class, locks_after)); } -static inline int __bfs_backwards(struct lock_list *src_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry) +static inline enum bfs_result +__bfs_backwards(struct lock_list *src_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry) { return __bfs(src_entry, data, match, target_entry, offsetof(struct lock_class, locks_before)); @@ -1775,18 +1807,18 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) /* * Check that the dependency graph starting at can lead to - * or not. Print an error and return 0 if it does. + * or not. */ -static noinline int +static noinline enum bfs_result check_path(struct lock_class *target, struct lock_list *src_entry, struct lock_list **target_entry) { - int ret; + enum bfs_result ret; ret = __bfs_forwards(src_entry, (void *)target, class_equal, target_entry); - if (unlikely(ret < 0)) + if (unlikely(bfs_error(ret))) print_bfs_bug(ret); return ret; @@ -1797,13 +1829,13 @@ check_path(struct lock_class *target, struct lock_list *src_entry, * lead to . If it can, there is a circle when adding * -> dependency. * - * Print an error and return 0 if it does. + * Print an error and return BFS_RMATCH if it does. */ -static noinline int +static noinline enum bfs_result check_noncircular(struct held_lock *src, struct held_lock *target, struct lock_trace **const trace) { - int ret; + enum bfs_result ret; struct lock_list *target_entry; struct lock_list src_entry = { .class = hlock_class(src), @@ -1814,7 +1846,7 @@ check_noncircular(struct held_lock *src, struct held_lock *target, ret = check_path(hlock_class(target), &src_entry, &target_entry); - if (unlikely(!ret)) { + if (unlikely(ret == BFS_RMATCH)) { if (!*trace) { /* * If save_trace fails here, the printing might @@ -1836,12 +1868,13 @@ check_noncircular(struct held_lock *src, struct held_lock *target, * or not. If it can, -> dependency is already * in the graph. * - * Print an error and return 2 if it does or 1 if it does not. + * Return BFS_RMATCH if it does, or BFS_RMATCH if it does not, return BFS_E* if + * any error appears in the bfs search. */ -static noinline int +static noinline enum bfs_result check_redundant(struct held_lock *src, struct held_lock *target) { - int ret; + enum bfs_result ret; struct lock_list *target_entry; struct lock_list src_entry = { .class = hlock_class(src), @@ -1852,11 +1885,8 @@ check_redundant(struct held_lock *src, struct held_lock *target) ret = check_path(hlock_class(target), &src_entry, &target_entry); - if (!ret) { + if (ret == BFS_RMATCH) debug_atomic_inc(nr_redundant); - ret = 2; - } else if (ret < 0) - ret = 0; return ret; } @@ -1886,17 +1916,14 @@ static inline int usage_match(struct lock_list *entry, void *mask) * Find a node in the forwards-direction dependency sub-graph starting * at @root->class that matches @bit. * - * Return 0 if such a node exists in the subgraph, and put that node + * Return BFS_MATCH if such a node exists in the subgraph, and put that node * into *@target_entry. - * - * Return 1 otherwise and keep *@target_entry unchanged. - * Return <0 on error. */ -static int +static enum bfs_result find_usage_forwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { - int result; + enum bfs_result result; debug_atomic_inc(nr_find_usage_forwards_checks); @@ -1908,18 +1935,12 @@ find_usage_forwards(struct lock_list *root, unsigned long usage_mask, /* * Find a node in the backwards-direction dependency sub-graph starting * at @root->class that matches @bit. - * - * Return 0 if such a node exists in the subgraph, and put that node - * into *@target_entry. - * - * Return 1 otherwise and keep *@target_entry unchanged. - * Return <0 on error. */ -static int +static enum bfs_result find_usage_backwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { - int result; + enum bfs_result result; debug_atomic_inc(nr_find_usage_backwards_checks); @@ -2247,7 +2268,7 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, struct lock_list *target_entry1; struct lock_list *target_entry; struct lock_list this, that; - int ret; + enum bfs_result ret; /* * Step 1: gather all hard/soft IRQs usages backward in an @@ -2257,7 +2278,7 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, this.class = hlock_class(prev); ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); - if (ret < 0) { + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } @@ -2276,12 +2297,12 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, that.class = hlock_class(next); ret = find_usage_forwards(&that, forward_mask, &target_entry1); - if (ret < 0) { + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } - if (ret == 1) - return ret; + if (ret == BFS_RNOMATCH) + return 1; /* * Step 3: we found a bad match! Now retrieve a lock from the backward @@ -2291,11 +2312,11 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, backward_mask = original_mask(target_entry1->class->usage_mask); ret = find_usage_backwards(&this, backward_mask, &target_entry); - if (ret < 0) { + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } - if (DEBUG_LOCKS_WARN_ON(ret == 1)) + if (DEBUG_LOCKS_WARN_ON(ret == BFS_RNOMATCH)) return 1; /* @@ -2463,7 +2484,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, struct lock_trace **const trace) { struct lock_list *entry; - int ret; + enum bfs_result ret; if (!hlock_class(prev)->key || !hlock_class(next)->key) { /* @@ -2494,7 +2515,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * in the graph whose neighbours are to be checked. */ ret = check_noncircular(next, prev, trace); - if (unlikely(ret <= 0)) + if (unlikely(bfs_error(ret) || ret == BFS_RMATCH)) return 0; if (!check_irq_usage(curr, prev, next)) @@ -2531,8 +2552,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Is the -> link redundant? */ ret = check_redundant(prev, next); - if (ret != 1) - return ret; + if (bfs_error(ret)) + return 0; + else if (ret == BFS_RMATCH) + return 2; #endif if (!*trace) { @@ -3436,19 +3459,19 @@ static int check_usage_forwards(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit bit, const char *irqclass) { - int ret; + enum bfs_result ret; struct lock_list root; struct lock_list *target_entry; root.parent = NULL; root.class = hlock_class(this); ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); - if (ret < 0) { + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } - if (ret == 1) - return ret; + if (ret == BFS_RNOMATCH) + return 1; print_irq_inversion_bug(curr, &root, target_entry, this, 1, irqclass); @@ -3463,19 +3486,19 @@ static int check_usage_backwards(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit bit, const char *irqclass) { - int ret; + enum bfs_result ret; struct lock_list root; struct lock_list *target_entry; root.parent = NULL; root.class = hlock_class(this); ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); - if (ret < 0) { + if (bfs_error(ret)) { print_bfs_bug(ret); return 0; } - if (ret == 1) - return ret; + if (ret == BFS_RNOMATCH) + return 1; print_irq_inversion_bug(curr, &root, target_entry, this, 0, irqclass); -- cgit v1.2.3 From d563bc6ead9e79be37067d58509a605b67378184 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:23 +0800 Subject: lockdep: Make __bfs() visit every dependency until a match Currently, __bfs() will do a breadth-first search in the dependency graph and visit each lock class in the graph exactly once, so for example, in the following graph: A ---------> B | ^ | | +----------> C a __bfs() call starts at A, will visit B through dependency A -> B and visit C through dependency A -> C and that's it, IOW, __bfs() will not visit dependency C -> B. This is OK for now, as we only have strong dependencies in the dependency graph, so whenever there is a traverse path from A to B in __bfs(), it means A has strong dependencies to B (IOW, B depends on A strongly). So no need to visit all dependencies in the graph. However, as we are going to add recursive-read lock into the dependency graph, as a result, not all the paths mean strong dependencies, in the same example above, dependency A -> B may be a weak dependency and traverse A -> C -> B may be a strong dependency path. And with the old way of __bfs() (i.e. visiting every lock class exactly once), we will miss the strong dependency path, which will result into failing to find a deadlock. To cure this for the future, we need to find a way for __bfs() to visit each dependency, rather than each class, exactly once in the search until we find a match. The solution is simple: We used to mark lock_class::lockdep_dependency_gen_id to indicate a class has been visited in __bfs(), now we change the semantics a little bit: we now mark lock_class::lockdep_dependency_gen_id to indicate _all the dependencies_ in its lock_{after,before} have been visited in the __bfs() (note we only take one direction in a __bfs() search). In this way, every dependency is guaranteed to be visited until we find a match. Note: the checks in mark_lock_accessed() and lock_accessed() are removed, because after this modification, we may call these two functions on @source_entry of __bfs(), which may not be the entry in "list_entries" Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-5-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 61 +++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 462c68cfb378..150686a71be0 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1421,23 +1421,19 @@ static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) return (cq->rear - cq->front) & CQ_MASK; } -static inline void mark_lock_accessed(struct lock_list *lock, - struct lock_list *parent) +static inline void mark_lock_accessed(struct lock_list *lock) { - unsigned long nr; + lock->class->dep_gen_id = lockdep_dependency_gen_id; +} - nr = lock - list_entries; - WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */ +static inline void visit_lock_entry(struct lock_list *lock, + struct lock_list *parent) +{ lock->parent = parent; - lock->class->dep_gen_id = lockdep_dependency_gen_id; } static inline unsigned long lock_accessed(struct lock_list *lock) { - unsigned long nr; - - nr = lock - list_entries; - WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */ return lock->class->dep_gen_id == lockdep_dependency_gen_id; } @@ -1540,26 +1536,39 @@ static enum bfs_result __bfs(struct lock_list *source_entry, goto exit; } + /* + * If we have visited all the dependencies from this @lock to + * others (iow, if we have visited all lock_list entries in + * @lock->class->locks_{after,before}) we skip, otherwise go + * and visit all the dependencies in the list and mark this + * list accessed. + */ + if (lock_accessed(lock)) + continue; + else + mark_lock_accessed(lock); + head = get_dep_list(lock, offset); + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + list_for_each_entry_rcu(entry, head, entry) { - if (!lock_accessed(entry)) { - unsigned int cq_depth; - mark_lock_accessed(entry, lock); - if (match(entry, data)) { - *target_entry = entry; - ret = BFS_RMATCH; - goto exit; - } - - if (__cq_enqueue(cq, entry)) { - ret = BFS_EQUEUEFULL; - goto exit; - } - cq_depth = __cq_get_elem_count(cq); - if (max_bfs_queue_depth < cq_depth) - max_bfs_queue_depth = cq_depth; + unsigned int cq_depth; + + visit_lock_entry(entry, lock); + if (match(entry, data)) { + *target_entry = entry; + ret = BFS_RMATCH; + goto exit; + } + + if (__cq_enqueue(cq, entry)) { + ret = BFS_EQUEUEFULL; + goto exit; } + cq_depth = __cq_get_elem_count(cq); + if (max_bfs_queue_depth < cq_depth) + max_bfs_queue_depth = cq_depth; } } exit: -- cgit v1.2.3 From bd76eca10de2eb9998d5125b08e8997cbf5508d5 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:24 +0800 Subject: lockdep: Reduce the size of lock_list::distance lock_list::distance is always not greater than MAX_LOCK_DEPTH (which is 48 right now), so a u16 will fit. This patch reduces the size of lock_list::distance to save space, so that we can introduce other fields to help detect recursive read lock deadlocks without increasing the size of lock_list structure. Suggested-by: Peter Zijlstra Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-6-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 150686a71be0..668a983d6405 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1320,7 +1320,7 @@ static struct lock_list *alloc_list_entry(void) */ static int add_lock_to_list(struct lock_class *this, struct lock_class *links_to, struct list_head *head, - unsigned long ip, int distance, + unsigned long ip, u16 distance, const struct lock_trace *trace) { struct lock_list *entry; @@ -2489,7 +2489,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, + struct held_lock *next, u16 distance, struct lock_trace **const trace) { struct lock_list *entry; @@ -2622,7 +2622,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) goto out_bug; for (;;) { - int distance = curr->lockdep_depth - depth + 1; + u16 distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth - 1; /* -- cgit v1.2.3 From 3454a36d6a39186de508dd43df590a6363364176 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:25 +0800 Subject: lockdep: Introduce lock_list::dep To add recursive read locks into the dependency graph, we need to store the types of dependencies for the BFS later. There are four types of dependencies: * Exclusive -> Non-recursive dependencies: EN e.g. write_lock(prev) held and try to acquire write_lock(next) or non-recursive read_lock(next), which can be represented as "prev -(EN)-> next" * Shared -> Non-recursive dependencies: SN e.g. read_lock(prev) held and try to acquire write_lock(next) or non-recursive read_lock(next), which can be represented as "prev -(SN)-> next" * Exclusive -> Recursive dependencies: ER e.g. write_lock(prev) held and try to acquire recursive read_lock(next), which can be represented as "prev -(ER)-> next" * Shared -> Recursive dependencies: SR e.g. read_lock(prev) held and try to acquire recursive read_lock(next), which can be represented as "prev -(SR)-> next" So we use 4 bits for the presence of each type in lock_list::dep. Helper functions and macros are also introduced to convert a pair of locks into lock_list::dep bit and maintain the addition of different types of dependencies. Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-7-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 92 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 88 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 668a983d6405..16ad1b74aa30 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1320,7 +1320,7 @@ static struct lock_list *alloc_list_entry(void) */ static int add_lock_to_list(struct lock_class *this, struct lock_class *links_to, struct list_head *head, - unsigned long ip, u16 distance, + unsigned long ip, u16 distance, u8 dep, const struct lock_trace *trace) { struct lock_list *entry; @@ -1334,6 +1334,7 @@ static int add_lock_to_list(struct lock_class *this, entry->class = this; entry->links_to = links_to; + entry->dep = dep; entry->distance = distance; entry->trace = trace; /* @@ -1498,6 +1499,57 @@ static inline bool bfs_error(enum bfs_result res) return res < 0; } +/* + * DEP_*_BIT in lock_list::dep + * + * For dependency @prev -> @next: + * + * SR: @prev is shared reader (->read != 0) and @next is recursive reader + * (->read == 2) + * ER: @prev is exclusive locker (->read == 0) and @next is recursive reader + * SN: @prev is shared reader and @next is non-recursive locker (->read != 2) + * EN: @prev is exclusive locker and @next is non-recursive locker + * + * Note that we define the value of DEP_*_BITs so that: + * bit0 is prev->read == 0 + * bit1 is next->read != 2 + */ +#define DEP_SR_BIT (0 + (0 << 1)) /* 0 */ +#define DEP_ER_BIT (1 + (0 << 1)) /* 1 */ +#define DEP_SN_BIT (0 + (1 << 1)) /* 2 */ +#define DEP_EN_BIT (1 + (1 << 1)) /* 3 */ + +#define DEP_SR_MASK (1U << (DEP_SR_BIT)) +#define DEP_ER_MASK (1U << (DEP_ER_BIT)) +#define DEP_SN_MASK (1U << (DEP_SN_BIT)) +#define DEP_EN_MASK (1U << (DEP_EN_BIT)) + +static inline unsigned int +__calc_dep_bit(struct held_lock *prev, struct held_lock *next) +{ + return (prev->read == 0) + ((next->read != 2) << 1); +} + +static inline u8 calc_dep(struct held_lock *prev, struct held_lock *next) +{ + return 1U << __calc_dep_bit(prev, next); +} + +/* + * calculate the dep_bit for backwards edges. We care about whether @prev is + * shared and whether @next is recursive. + */ +static inline unsigned int +__calc_dep_bitb(struct held_lock *prev, struct held_lock *next) +{ + return (next->read != 2) + ((prev->read == 0) << 1); +} + +static inline u8 calc_depb(struct held_lock *prev, struct held_lock *next) +{ + return 1U << __calc_dep_bitb(prev, next); +} + /* * Forward- or backward-dependency search, used for both circular dependency * checking and hardirq-unsafe/softirq-unsafe checking. @@ -2552,7 +2604,35 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, if (entry->class == hlock_class(next)) { if (distance == 1) entry->distance = 1; - return 1; + entry->dep |= calc_dep(prev, next); + + /* + * Also, update the reverse dependency in @next's + * ->locks_before list. + * + * Here we reuse @entry as the cursor, which is fine + * because we won't go to the next iteration of the + * outer loop: + * + * For normal cases, we return in the inner loop. + * + * If we fail to return, we have inconsistency, i.e. + * ::locks_after contains while + * ::locks_before doesn't contain . In + * that case, we return after the inner and indicate + * something is wrong. + */ + list_for_each_entry(entry, &hlock_class(next)->locks_before, entry) { + if (entry->class == hlock_class(prev)) { + if (distance == 1) + entry->distance = 1; + entry->dep |= calc_depb(prev, next); + return 1; + } + } + + /* is not found in ::locks_before */ + return 0; } } @@ -2579,14 +2659,18 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ ret = add_lock_to_list(hlock_class(next), hlock_class(prev), &hlock_class(prev)->locks_after, - next->acquire_ip, distance, *trace); + next->acquire_ip, distance, + calc_dep(prev, next), + *trace); if (!ret) return 0; ret = add_lock_to_list(hlock_class(prev), hlock_class(next), &hlock_class(next)->locks_before, - next->acquire_ip, distance, *trace); + next->acquire_ip, distance, + calc_depb(prev, next), + *trace); if (!ret) return 0; -- cgit v1.2.3 From 6971c0f345620aae5e6172207a57b7524603a34e Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:26 +0800 Subject: lockdep: Extend __bfs() to work with multiple types of dependencies Now we have four types of dependencies in the dependency graph, and not all the pathes carry real dependencies (the dependencies that may cause a deadlock), for example: Given lock A and B, if we have: CPU1 CPU2 ============= ============== write_lock(A); read_lock(B); read_lock(B); write_lock(A); (assuming read_lock(B) is a recursive reader) then we have dependencies A -(ER)-> B, and B -(SN)-> A, and a dependency path A -(ER)-> B -(SN)-> A. In lockdep w/o recursive locks, a dependency path from A to A means a deadlock. However, the above case is obviously not a deadlock, because no one holds B exclusively, therefore no one waits for the other to release B, so who get A first in CPU1 and CPU2 will run non-blockingly. As a result, dependency path A -(ER)-> B -(SN)-> A is not a real/strong dependency that could cause a deadlock. From the observation above, we know that for a dependency path to be real/strong, no two adjacent dependencies can be as -(*R)-> -(S*)->. Now our mission is to make __bfs() traverse only the strong dependency paths, which is simple: we record whether we only have -(*R)-> for the previous lock_list of the path in lock_list::only_xr, and when we pick a dependency in the traverse, we 1) filter out -(S*)-> dependency if the previous lock_list only has -(*R)-> dependency (i.e. ->only_xr is true) and 2) set the next lock_list::only_xr to true if we only have -(*R)-> left after we filter out dependencies based on 1), otherwise, set it to false. With this extension for __bfs(), we now need to initialize the root of __bfs() properly (with a correct ->only_xr), to do so, we introduce some helper functions, which also cleans up a little bit for the __bfs() root initialization code. Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-8-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 113 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 94 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 16ad1b74aa30..5abc227db0e9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1551,8 +1551,72 @@ static inline u8 calc_depb(struct held_lock *prev, struct held_lock *next) } /* - * Forward- or backward-dependency search, used for both circular dependency - * checking and hardirq-unsafe/softirq-unsafe checking. + * Initialize a lock_list entry @lock belonging to @class as the root for a BFS + * search. + */ +static inline void __bfs_init_root(struct lock_list *lock, + struct lock_class *class) +{ + lock->class = class; + lock->parent = NULL; + lock->only_xr = 0; +} + +/* + * Initialize a lock_list entry @lock based on a lock acquisition @hlock as the + * root for a BFS search. + * + * ->only_xr of the initial lock node is set to @hlock->read == 2, to make sure + * that -> @hlock and @hlock -> is not -(*R)-> + * and -(S*)->. + */ +static inline void bfs_init_root(struct lock_list *lock, + struct held_lock *hlock) +{ + __bfs_init_root(lock, hlock_class(hlock)); + lock->only_xr = (hlock->read == 2); +} + +/* + * Similar to bfs_init_root() but initialize the root for backwards BFS. + * + * ->only_xr of the initial lock node is set to @hlock->read != 0, to make sure + * that -> @hlock and @hlock -> is not + * -(*S)-> and -(R*)-> (reverse order of -(*R)-> and -(S*)->). + */ +static inline void bfs_init_rootb(struct lock_list *lock, + struct held_lock *hlock) +{ + __bfs_init_root(lock, hlock_class(hlock)); + lock->only_xr = (hlock->read != 0); +} + +/* + * Breadth-First Search to find a strong path in the dependency graph. + * + * @source_entry: the source of the path we are searching for. + * @data: data used for the second parameter of @match function + * @match: match function for the search + * @target_entry: pointer to the target of a matched path + * @offset: the offset to struct lock_class to determine whether it is + * locks_after or locks_before + * + * We may have multiple edges (considering different kinds of dependencies, + * e.g. ER and SN) between two nodes in the dependency graph. But + * only the strong dependency path in the graph is relevant to deadlocks. A + * strong dependency path is a dependency path that doesn't have two adjacent + * dependencies as -(*R)-> -(S*)->, please see: + * + * Documentation/locking/lockdep-design.rst + * + * for more explanation of the definition of strong dependency paths + * + * In __bfs(), we only traverse in the strong dependency path: + * + * In lock_list::only_xr, we record whether the previous dependency only + * has -(*R)-> in the search, and if it does (prev only has -(*R)->), we + * filter out any -(S*)-> in the current dependency and after that, the + * ->only_xr is set according to whether we only have -(*R)-> left. */ static enum bfs_result __bfs(struct lock_list *source_entry, void *data, @@ -1582,6 +1646,7 @@ static enum bfs_result __bfs(struct lock_list *source_entry, __cq_enqueue(cq, source_entry); while ((lock = __cq_dequeue(cq))) { + bool prev_only_xr; if (!lock->class) { ret = BFS_EINVALIDNODE; @@ -1602,10 +1667,26 @@ static enum bfs_result __bfs(struct lock_list *source_entry, head = get_dep_list(lock, offset); - DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + prev_only_xr = lock->only_xr; list_for_each_entry_rcu(entry, head, entry) { unsigned int cq_depth; + u8 dep = entry->dep; + + /* + * Mask out all -(S*)-> if we only have *R in previous + * step, because -(*R)-> -(S*)-> don't make up a strong + * dependency. + */ + if (prev_only_xr) + dep &= ~(DEP_SR_MASK | DEP_SN_MASK); + + /* If nothing left, we skip */ + if (!dep) + continue; + + /* If there are only -(*R)-> left, set that for the next step */ + entry->only_xr = !(dep & (DEP_SN_MASK | DEP_EN_MASK)); visit_lock_entry(entry, lock); if (match(entry, data)) { @@ -1827,8 +1908,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) unsigned long ret, flags; struct lock_list this; - this.parent = NULL; - this.class = class; + __bfs_init_root(&this, class); raw_local_irq_save(flags); lockdep_lock(); @@ -1854,8 +1934,7 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) unsigned long ret, flags; struct lock_list this; - this.parent = NULL; - this.class = class; + __bfs_init_root(&this, class); raw_local_irq_save(flags); lockdep_lock(); @@ -1898,10 +1977,9 @@ check_noncircular(struct held_lock *src, struct held_lock *target, { enum bfs_result ret; struct lock_list *target_entry; - struct lock_list src_entry = { - .class = hlock_class(src), - .parent = NULL, - }; + struct lock_list src_entry; + + bfs_init_root(&src_entry, src); debug_atomic_inc(nr_cyclic_checks); @@ -1937,10 +2015,9 @@ check_redundant(struct held_lock *src, struct held_lock *target) { enum bfs_result ret; struct lock_list *target_entry; - struct lock_list src_entry = { - .class = hlock_class(src), - .parent = NULL, - }; + struct lock_list src_entry; + + bfs_init_root(&src_entry, src); debug_atomic_inc(nr_redundant_checks); @@ -3556,8 +3633,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, struct lock_list root; struct lock_list *target_entry; - root.parent = NULL; - root.class = hlock_class(this); + bfs_init_root(&root, this); ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); if (bfs_error(ret)) { print_bfs_bug(ret); @@ -3583,8 +3659,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, struct lock_list root; struct lock_list *target_entry; - root.parent = NULL; - root.class = hlock_class(this); + bfs_init_rootb(&root, this); ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); if (bfs_error(ret)) { print_bfs_bug(ret); -- cgit v1.2.3 From 61775ed243433ff0556c4f76905929fe01e92922 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:27 +0800 Subject: lockdep: Make __bfs(.match) return bool The "match" parameter of __bfs() is used for checking whether we hit a match in the search, therefore it should return a boolean value rather than an integer for better readability. This patch then changes the return type of the function parameter and the match functions to bool. Suggested-by: Peter Zijlstra Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-9-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 5abc227db0e9..78cd74d77be9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1620,7 +1620,7 @@ static inline void bfs_init_rootb(struct lock_list *lock, */ static enum bfs_result __bfs(struct lock_list *source_entry, void *data, - int (*match)(struct lock_list *entry, void *data), + bool (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry, int offset) { @@ -1711,7 +1711,7 @@ exit: static inline enum bfs_result __bfs_forwards(struct lock_list *src_entry, void *data, - int (*match)(struct lock_list *entry, void *data), + bool (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry) { return __bfs(src_entry, data, match, target_entry, @@ -1722,7 +1722,7 @@ __bfs_forwards(struct lock_list *src_entry, static inline enum bfs_result __bfs_backwards(struct lock_list *src_entry, void *data, - int (*match)(struct lock_list *entry, void *data), + bool (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry) { return __bfs(src_entry, data, match, target_entry, @@ -1833,7 +1833,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, print_circular_bug_entry(entry, depth); } -static inline int class_equal(struct lock_list *entry, void *data) +static inline bool class_equal(struct lock_list *entry, void *data) { return entry->class == data; } @@ -1888,10 +1888,10 @@ static noinline void print_bfs_bug(int ret) WARN(1, "lockdep bfs error:%d\n", ret); } -static int noop_count(struct lock_list *entry, void *data) +static bool noop_count(struct lock_list *entry, void *data) { (*(unsigned long *)data)++; - return 0; + return false; } static unsigned long __lockdep_count_forward_deps(struct lock_list *this) @@ -2032,11 +2032,11 @@ check_redundant(struct held_lock *src, struct held_lock *target) #ifdef CONFIG_TRACE_IRQFLAGS -static inline int usage_accumulate(struct lock_list *entry, void *mask) +static inline bool usage_accumulate(struct lock_list *entry, void *mask) { *(unsigned long *)mask |= entry->class->usage_mask; - return 0; + return false; } /* @@ -2045,9 +2045,9 @@ static inline int usage_accumulate(struct lock_list *entry, void *mask) * without creating any illegal irq-safe -> irq-unsafe lock dependency. */ -static inline int usage_match(struct lock_list *entry, void *mask) +static inline bool usage_match(struct lock_list *entry, void *mask) { - return entry->class->usage_mask & *(unsigned long *)mask; + return !!(entry->class->usage_mask & *(unsigned long *)mask); } /* -- cgit v1.2.3 From 9de0c9bbcedf752e762c67f105bff342e30f9105 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:28 +0800 Subject: lockdep: Support deadlock detection for recursive read locks in check_noncircular() Currently, lockdep only has limit support for deadlock detection for recursive read locks. This patch support deadlock detection for recursive read locks. The basic idea is: We are about to add dependency B -> A in to the dependency graph, we use check_noncircular() to find whether we have a strong dependency path A -> .. -> B so that we have a strong dependency circle (a closed strong dependency path): A -> .. -> B -> A , which doesn't have two adjacent dependencies as -(*R)-> L -(S*)->. Since A -> .. -> B is already a strong dependency path, so if either B -> A is -(E*)-> or A -> .. -> B is -(*N)->, the circle A -> .. -> B -> A is strong, otherwise not. So we introduce a new match function hlock_conflict() to replace the class_equal() for the deadlock check in check_noncircular(). Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-10-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 78cd74d77be9..9160f1ddc435 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1838,10 +1838,37 @@ static inline bool class_equal(struct lock_list *entry, void *data) return entry->class == data; } +/* + * We are about to add B -> A into the dependency graph, and in __bfs() a + * strong dependency path A -> .. -> B is found: hlock_class equals + * entry->class. + * + * We will have a deadlock case (conflict) if A -> .. -> B -> A is a strong + * dependency cycle, that means: + * + * Either + * + * a) B -> A is -(E*)-> + * + * or + * + * b) A -> .. -> B is -(*N)-> (i.e. A -> .. -(*N)-> B) + * + * as then we don't have -(*R)-> -(S*)-> in the cycle. + */ +static inline bool hlock_conflict(struct lock_list *entry, void *data) +{ + struct held_lock *hlock = (struct held_lock *)data; + + return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ + (hlock->read == 0 || /* B -> A is -(E*)-> */ + !entry->only_xr); /* A -> .. -> B is -(*N)-> */ +} + static noinline void print_circular_bug(struct lock_list *this, - struct lock_list *target, - struct held_lock *check_src, - struct held_lock *check_tgt) + struct lock_list *target, + struct held_lock *check_src, + struct held_lock *check_tgt) { struct task_struct *curr = current; struct lock_list *parent; @@ -1950,13 +1977,13 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) * or not. */ static noinline enum bfs_result -check_path(struct lock_class *target, struct lock_list *src_entry, +check_path(struct held_lock *target, struct lock_list *src_entry, + bool (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry) { enum bfs_result ret; - ret = __bfs_forwards(src_entry, (void *)target, class_equal, - target_entry); + ret = __bfs_forwards(src_entry, target, match, target_entry); if (unlikely(bfs_error(ret))) print_bfs_bug(ret); @@ -1983,7 +2010,7 @@ check_noncircular(struct held_lock *src, struct held_lock *target, debug_atomic_inc(nr_cyclic_checks); - ret = check_path(hlock_class(target), &src_entry, &target_entry); + ret = check_path(target, &src_entry, hlock_conflict, &target_entry); if (unlikely(ret == BFS_RMATCH)) { if (!*trace) { @@ -2021,7 +2048,7 @@ check_redundant(struct held_lock *src, struct held_lock *target) debug_atomic_inc(nr_redundant_checks); - ret = check_path(hlock_class(target), &src_entry, &target_entry); + ret = check_path(target, &src_entry, class_equal, &target_entry); if (ret == BFS_RMATCH) debug_atomic_inc(nr_redundant); -- cgit v1.2.3 From 68e305678583f13a67e2ce22088c2520bd4f97b4 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:29 +0800 Subject: lockdep: Adjust check_redundant() for recursive read change check_redundant() will report redundancy if it finds a path could replace the about-to-add dependency in the BFS search. With recursive read lock changes, we certainly need to change the match function for the check_redundant(), because the path needs to match not only the lock class but also the dependency kinds. For example, if the about-to-add dependency @prev -> @next is A -(SN)-> B, and we find a path A -(S*)-> .. -(*R)->B in the dependency graph with __bfs() (for simplicity, we can also say we find an -(SR)-> path from A to B), we can not replace the dependency with that path in the BFS search. Because the -(SN)-> dependency can make a strong path with a following -(S*)-> dependency, however an -(SR)-> path cannot. Further, we can replace an -(SN)-> dependency with a -(EN)-> path, that means if we find a path which is stronger than or equal to the about-to-add dependency, we can report the redundancy. By "stronger", it means both the start and the end of the path are not weaker than the start and the end of the dependency (E is "stronger" than S and N is "stronger" than R), so that we can replace the dependency with that path. To make sure we find a path whose start point is not weaker than the about-to-add dependency, we use a trick: the ->only_xr of the root (start point) of __bfs() is initialized as @prev-> == 0, therefore if @prev is E, __bfs() will pick only -(E*)-> for the first dependency, otherwise, __bfs() can pick -(E*)-> or -(S*)-> for the first dependency. To make sure we find a path whose end point is not weaker than the about-to-add dependency, we replace the match function for __bfs() check_redundant(), we check for the case that either @next is R (anything is not weaker than it) or the end point of the path is N (which is not weaker than anything). Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-11-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 47 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 9160f1ddc435..42e2f1fbbd5f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1833,9 +1833,39 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, print_circular_bug_entry(entry, depth); } -static inline bool class_equal(struct lock_list *entry, void *data) +/* + * We are about to add A -> B into the dependency graph, and in __bfs() a + * strong dependency path A -> .. -> B is found: hlock_class equals + * entry->class. + * + * If A -> .. -> B can replace A -> B in any __bfs() search (means the former + * is _stronger_ than or equal to the latter), we consider A -> B as redundant. + * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A + * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the + * dependency graph, as any strong path ..-> A -> B ->.. we can get with + * having dependency A -> B, we could already get a equivalent path ..-> A -> + * .. -> B -> .. with A -> .. -> B. Therefore A -> B is reduntant. + * + * We need to make sure both the start and the end of A -> .. -> B is not + * weaker than A -> B. For the start part, please see the comment in + * check_redundant(). For the end part, we need: + * + * Either + * + * a) A -> B is -(*R)-> (everything is not weaker than that) + * + * or + * + * b) A -> .. -> B is -(*N)-> (nothing is stronger than this) + * + */ +static inline bool hlock_equal(struct lock_list *entry, void *data) { - return entry->class == data; + struct held_lock *hlock = (struct held_lock *)data; + + return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ + (hlock->read == 2 || /* A -> B is -(*R)-> */ + !entry->only_xr); /* A -> .. -> B is -(*N)-> */ } /* @@ -2045,10 +2075,21 @@ check_redundant(struct held_lock *src, struct held_lock *target) struct lock_list src_entry; bfs_init_root(&src_entry, src); + /* + * Special setup for check_redundant(). + * + * To report redundant, we need to find a strong dependency path that + * is equal to or stronger than -> . So if is E, + * we need to let __bfs() only search for a path starting at a -(E*)->, + * we achieve this by setting the initial node's ->only_xr to true in + * that case. And if is S, we set initial ->only_xr to false + * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant. + */ + src_entry.only_xr = src->read == 0; debug_atomic_inc(nr_redundant_checks); - ret = check_path(target, &src_entry, class_equal, &target_entry); + ret = check_path(target, &src_entry, hlock_equal, &target_entry); if (ret == BFS_RMATCH) debug_atomic_inc(nr_redundant); -- cgit v1.2.3 From f08e3888574d490b31481eef6d84c61bedba7a47 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:30 +0800 Subject: lockdep: Fix recursive read lock related safe->unsafe detection Currently, in safe->unsafe detection, lockdep misses the fact that a LOCK_ENABLED_IRQ_*_READ usage and a LOCK_USED_IN_IRQ_*_READ usage may cause deadlock too, for example: P1 P2 write_lock(l1); read_lock(l2); write_lock(l2); read_lock(l1); Actually, all of the following cases may cause deadlocks: LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_* LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_* LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_*_READ LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_*_READ To fix this, we need to 1) change the calculation of exclusive_mask() so that READ bits are not dropped and 2) always call usage() in mark_lock_irq() to check usage deadlocks, even when the new usage of the lock is READ. Besides, adjust usage_match() and usage_acculumate() to recursive read lock changes. Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-12-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 188 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 141 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 42e2f1fbbd5f..6644974a4f5a 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2100,22 +2100,72 @@ check_redundant(struct held_lock *src, struct held_lock *target) #ifdef CONFIG_TRACE_IRQFLAGS +/* + * Forwards and backwards subgraph searching, for the purposes of + * proving that two subgraphs can be connected by a new dependency + * without creating any illegal irq-safe -> irq-unsafe lock dependency. + * + * A irq safe->unsafe deadlock happens with the following conditions: + * + * 1) We have a strong dependency path A -> ... -> B + * + * 2) and we have ENABLED_IRQ usage of B and USED_IN_IRQ usage of A, therefore + * irq can create a new dependency B -> A (consider the case that a holder + * of B gets interrupted by an irq whose handler will try to acquire A). + * + * 3) the dependency circle A -> ... -> B -> A we get from 1) and 2) is a + * strong circle: + * + * For the usage bits of B: + * a) if A -> B is -(*N)->, then B -> A could be any type, so any + * ENABLED_IRQ usage suffices. + * b) if A -> B is -(*R)->, then B -> A must be -(E*)->, so only + * ENABLED_IRQ_*_READ usage suffices. + * + * For the usage bits of A: + * c) if A -> B is -(E*)->, then B -> A could be any type, so any + * USED_IN_IRQ usage suffices. + * d) if A -> B is -(S*)->, then B -> A must be -(*N)->, so only + * USED_IN_IRQ_*_READ usage suffices. + */ + +/* + * There is a strong dependency path in the dependency graph: A -> B, and now + * we need to decide which usage bit of A should be accumulated to detect + * safe->unsafe bugs. + * + * Note that usage_accumulate() is used in backwards search, so ->only_xr + * stands for whether A -> B only has -(S*)-> (in this case ->only_xr is true). + * + * As above, if only_xr is false, which means A -> B has -(E*)-> dependency + * path, any usage of A should be considered. Otherwise, we should only + * consider _READ usage. + */ static inline bool usage_accumulate(struct lock_list *entry, void *mask) { - *(unsigned long *)mask |= entry->class->usage_mask; + if (!entry->only_xr) + *(unsigned long *)mask |= entry->class->usage_mask; + else /* Mask out _READ usage bits */ + *(unsigned long *)mask |= (entry->class->usage_mask & LOCKF_IRQ); return false; } /* - * Forwards and backwards subgraph searching, for the purposes of - * proving that two subgraphs can be connected by a new dependency - * without creating any illegal irq-safe -> irq-unsafe lock dependency. + * There is a strong dependency path in the dependency graph: A -> B, and now + * we need to decide which usage bit of B conflicts with the usage bits of A, + * i.e. which usage bit of B may introduce safe->unsafe deadlocks. + * + * As above, if only_xr is false, which means A -> B has -(*N)-> dependency + * path, any usage of B should be considered. Otherwise, we should only + * consider _READ usage. */ - static inline bool usage_match(struct lock_list *entry, void *mask) { - return !!(entry->class->usage_mask & *(unsigned long *)mask); + if (!entry->only_xr) + return !!(entry->class->usage_mask & *(unsigned long *)mask); + else /* Mask out _READ usage bits */ + return !!((entry->class->usage_mask & LOCKF_IRQ) & *(unsigned long *)mask); } /* @@ -2406,17 +2456,39 @@ static unsigned long invert_dir_mask(unsigned long mask) } /* - * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all - * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). - * And then mask out all bitnr0. + * Note that a LOCK_ENABLED_IRQ_*_READ usage and a LOCK_USED_IN_IRQ_*_READ + * usage may cause deadlock too, for example: + * + * P1 P2 + * + * write_lock(l1); + * read_lock(l2); + * write_lock(l2); + * + * read_lock(l1); + * + * , in above case, l1 will be marked as LOCK_USED_IN_IRQ_HARDIRQ_READ and l2 + * will marked as LOCK_ENABLE_IRQ_HARDIRQ_READ, and this is a possible + * deadlock. + * + * In fact, all of the following cases may cause deadlocks: + * + * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_* + * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_* + * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_*_READ + * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_*_READ + * + * As a result, to calculate the "exclusive mask", first we invert the + * direction (USED_IN/ENABLED) of the original mask, and 1) for all bits with + * bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). 2) for all + * bits with bitnr0 cleared (LOCK_*_READ), add those with bitnr0 set (LOCK_*). */ static unsigned long exclusive_mask(unsigned long mask) { unsigned long excl = invert_dir_mask(mask); - /* Strip read */ excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK; - excl &= ~LOCKF_IRQ_READ; + excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK; return excl; } @@ -2433,6 +2505,7 @@ static unsigned long original_mask(unsigned long mask) unsigned long excl = invert_dir_mask(mask); /* Include read in existing usages */ + excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK; excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK; return excl; @@ -2447,14 +2520,24 @@ static int find_exclusive_match(unsigned long mask, enum lock_usage_bit *bitp, enum lock_usage_bit *excl_bitp) { - int bit, excl; + int bit, excl, excl_read; for_each_set_bit(bit, &mask, LOCK_USED) { + /* + * exclusive_bit() strips the read bit, however, + * LOCK_ENABLED_IRQ_*_READ may cause deadlocks too, so we need + * to search excl | LOCK_USAGE_READ_MASK as well. + */ excl = exclusive_bit(bit); + excl_read = excl | LOCK_USAGE_READ_MASK; if (excl_mask & lock_flag(excl)) { *bitp = bit; *excl_bitp = excl; return 0; + } else if (excl_mask & lock_flag(excl_read)) { + *bitp = bit; + *excl_bitp = excl_read; + return 0; } } return -1; @@ -2480,8 +2563,7 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, * Step 1: gather all hard/soft IRQs usages backward in an * accumulated usage mask. */ - this.parent = NULL; - this.class = hlock_class(prev); + bfs_init_rootb(&this, prev); ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); if (bfs_error(ret)) { @@ -2499,8 +2581,7 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, */ forward_mask = exclusive_mask(usage_mask); - that.parent = NULL; - that.class = hlock_class(next); + bfs_init_root(&that, next); ret = find_usage_forwards(&that, forward_mask, &target_entry1); if (bfs_error(ret)) { @@ -3695,14 +3776,16 @@ print_irq_inversion_bug(struct task_struct *curr, */ static int check_usage_forwards(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit bit, const char *irqclass) + enum lock_usage_bit bit) { enum bfs_result ret; struct lock_list root; struct lock_list *target_entry; + enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK; + unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit); bfs_init_root(&root, this); - ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); + ret = find_usage_forwards(&root, usage_mask, &target_entry); if (bfs_error(ret)) { print_bfs_bug(ret); return 0; @@ -3710,8 +3793,15 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, if (ret == BFS_RNOMATCH) return 1; - print_irq_inversion_bug(curr, &root, target_entry, - this, 1, irqclass); + /* Check whether write or read usage is the match */ + if (target_entry->class->usage_mask & lock_flag(bit)) { + print_irq_inversion_bug(curr, &root, target_entry, + this, 1, state_name(bit)); + } else { + print_irq_inversion_bug(curr, &root, target_entry, + this, 1, state_name(read_bit)); + } + return 0; } @@ -3721,14 +3811,16 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, */ static int check_usage_backwards(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit bit, const char *irqclass) + enum lock_usage_bit bit) { enum bfs_result ret; struct lock_list root; struct lock_list *target_entry; + enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK; + unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit); bfs_init_rootb(&root, this); - ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); + ret = find_usage_backwards(&root, usage_mask, &target_entry); if (bfs_error(ret)) { print_bfs_bug(ret); return 0; @@ -3736,8 +3828,15 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, if (ret == BFS_RNOMATCH) return 1; - print_irq_inversion_bug(curr, &root, target_entry, - this, 0, irqclass); + /* Check whether write or read usage is the match */ + if (target_entry->class->usage_mask & lock_flag(bit)) { + print_irq_inversion_bug(curr, &root, target_entry, + this, 0, state_name(bit)); + } else { + print_irq_inversion_bug(curr, &root, target_entry, + this, 0, state_name(read_bit)); + } + return 0; } @@ -3776,8 +3875,6 @@ static int SOFTIRQ_verbose(struct lock_class *class) return 0; } -#define STRICT_READ_CHECKS 1 - static int (*state_verbose_f[])(struct lock_class *class) = { #define LOCKDEP_STATE(__STATE) \ __STATE##_verbose, @@ -3802,16 +3899,6 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, int read = new_bit & LOCK_USAGE_READ_MASK; int dir = new_bit & LOCK_USAGE_DIR_MASK; - /* - * mark USED_IN has to look forwards -- to ensure no dependency - * has ENABLED state, which would allow recursion deadlocks. - * - * mark ENABLED has to look backwards -- to ensure no dependee - * has USED_IN state, which, again, would allow recursion deadlocks. - */ - check_usage_f usage = dir ? - check_usage_backwards : check_usage_forwards; - /* * Validate that this particular lock does not have conflicting * usage states. @@ -3820,23 +3907,30 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, return 0; /* - * Validate that the lock dependencies don't have conflicting usage - * states. + * Check for read in write conflicts */ - if ((!read || STRICT_READ_CHECKS) && - !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK))) + if (!read && !valid_state(curr, this, new_bit, + excl_bit + LOCK_USAGE_READ_MASK)) return 0; + /* - * Check for read in write conflicts + * Validate that the lock dependencies don't have conflicting usage + * states. */ - if (!read) { - if (!valid_state(curr, this, new_bit, excl_bit + LOCK_USAGE_READ_MASK)) + if (dir) { + /* + * mark ENABLED has to look backwards -- to ensure no dependee + * has USED_IN state, which, again, would allow recursion deadlocks. + */ + if (!check_usage_backwards(curr, this, excl_bit)) return 0; - - if (STRICT_READ_CHECKS && - !usage(curr, this, excl_bit + LOCK_USAGE_READ_MASK, - state_name(new_bit + LOCK_USAGE_READ_MASK))) + } else { + /* + * mark USED_IN has to look forwards -- to ensure no dependency + * has ENABLED state, which would allow recursion deadlocks. + */ + if (!check_usage_forwards(curr, this, excl_bit)) return 0; } -- cgit v1.2.3 From 621c9dac0eea7607cb9a57cc9ba47fbcd4e644c9 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:31 +0800 Subject: lockdep: Add recursive read locks into dependency graph Since we have all the fundamental to handle recursive read locks, we now add them into the dependency graph. Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-13-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 6644974a4f5a..b87766e080f5 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2808,16 +2808,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, if (!check_irq_usage(curr, prev, next)) return 0; - /* - * For recursive read-locks we do all the dependency checks, - * but we dont store read-triggered dependencies (only - * write-triggered dependencies). This ensures that only the - * write-side dependencies matter, and that if for example a - * write-lock never takes any other locks, then the reads are - * equivalent to a NOP. - */ - if (next->read == 2 || prev->read == 2) - return 1; /* * Is the -> dependency already present? * @@ -2935,13 +2925,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) u16 distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth - 1; - /* - * Only non-recursive-read entries get new dependencies - * added: - */ - if (hlock->read != 2 && hlock->check) { - int ret = check_prev_add(curr, hlock, next, distance, - &trace); + if (hlock->check) { + int ret = check_prev_add(curr, hlock, next, distance, &trace); if (!ret) return 0; -- cgit v1.2.3 From f611e8cf98ec908b9c2c0da6064a660fc6022487 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 7 Aug 2020 15:42:33 +0800 Subject: lockdep: Take read/write status in consideration when generate chainkey Currently, the chainkey of a lock chain is a hash sum of the class_idx of all the held locks, the read/write status are not taken in to consideration while generating the chainkey. This could result into a problem, if we have: P1() { read_lock(B); lock(A); } P2() { lock(A); read_lock(B); } P3() { lock(A); write_lock(B); } , and P1(), P2(), P3() run one by one. And when running P2(), lockdep detects such a lock chain A -> B is not a deadlock, then it's added in the chain cache, and then when running P3(), even if it's a deadlock, we could miss it because of the hit of chain cache. This could be confirmed by self testcase "chain cached mixed R-L/L-W ". To resolve this, we use concept "hlock_id" to generate the chainkey, the hlock_id is a tuple (hlock->class_idx, hlock->read), which fits in a u16 type. With this, the chainkeys are different is the lock sequences have the same locks but different read/write status. Besides, since we use "hlock_id" to generate chainkeys, the chain_hlocks array now store the "hlock_id"s rather than lock_class indexes. Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200807074238.1632519-15-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 53 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b87766e080f5..cccf4bc759c6 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -371,6 +371,21 @@ static struct hlist_head classhash_table[CLASSHASH_SIZE]; static struct hlist_head chainhash_table[CHAINHASH_SIZE]; +/* + * the id of held_lock + */ +static inline u16 hlock_id(struct held_lock *hlock) +{ + BUILD_BUG_ON(MAX_LOCKDEP_KEYS_BITS + 2 > 16); + + return (hlock->class_idx | (hlock->read << MAX_LOCKDEP_KEYS_BITS)); +} + +static inline unsigned int chain_hlock_class_idx(u16 hlock_id) +{ + return hlock_id & (MAX_LOCKDEP_KEYS - 1); +} + /* * The hash key of the lock dependency chains is a hash itself too: * it's a hash of all locks taken up to that lock, including that lock. @@ -3202,7 +3217,10 @@ static inline void free_chain_hlocks(int base, int size) struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) { - return lock_classes + chain_hlocks[chain->base + i]; + u16 chain_hlock = chain_hlocks[chain->base + i]; + unsigned int class_idx = chain_hlock_class_idx(chain_hlock); + + return lock_classes + class_idx - 1; } /* @@ -3228,12 +3246,12 @@ static inline int get_first_held_lock(struct task_struct *curr, /* * Returns the next chain_key iteration */ -static u64 print_chain_key_iteration(int class_idx, u64 chain_key) +static u64 print_chain_key_iteration(u16 hlock_id, u64 chain_key) { - u64 new_chain_key = iterate_chain_key(chain_key, class_idx); + u64 new_chain_key = iterate_chain_key(chain_key, hlock_id); - printk(" class_idx:%d -> chain_key:%016Lx", - class_idx, + printk(" hlock_id:%d -> chain_key:%016Lx", + (unsigned int)hlock_id, (unsigned long long)new_chain_key); return new_chain_key; } @@ -3250,12 +3268,12 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne hlock_next->irq_context); for (; i < depth; i++) { hlock = curr->held_locks + i; - chain_key = print_chain_key_iteration(hlock->class_idx, chain_key); + chain_key = print_chain_key_iteration(hlock_id(hlock), chain_key); print_lock(hlock); } - print_chain_key_iteration(hlock_next->class_idx, chain_key); + print_chain_key_iteration(hlock_id(hlock_next), chain_key); print_lock(hlock_next); } @@ -3263,14 +3281,14 @@ static void print_chain_keys_chain(struct lock_chain *chain) { int i; u64 chain_key = INITIAL_CHAIN_KEY; - int class_id; + u16 hlock_id; printk("depth: %u\n", chain->depth); for (i = 0; i < chain->depth; i++) { - class_id = chain_hlocks[chain->base + i]; - chain_key = print_chain_key_iteration(class_id, chain_key); + hlock_id = chain_hlocks[chain->base + i]; + chain_key = print_chain_key_iteration(hlock_id, chain_key); - print_lock_name(lock_classes + class_id); + print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id) - 1); printk("\n"); } } @@ -3319,7 +3337,7 @@ static int check_no_collision(struct task_struct *curr, } for (j = 0; j < chain->depth - 1; j++, i++) { - id = curr->held_locks[i].class_idx; + id = hlock_id(&curr->held_locks[i]); if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) { print_collision(curr, hlock, chain); @@ -3368,7 +3386,6 @@ static inline int add_chain_cache(struct task_struct *curr, struct held_lock *hlock, u64 chain_key) { - struct lock_class *class = hlock_class(hlock); struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; int i, j; @@ -3411,11 +3428,11 @@ static inline int add_chain_cache(struct task_struct *curr, chain->base = j; for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class_idx; + int lock_id = hlock_id(curr->held_locks + i); chain_hlocks[chain->base + j] = lock_id; } - chain_hlocks[chain->base + j] = class - lock_classes; + chain_hlocks[chain->base + j] = hlock_id(hlock); hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); inc_chains(chain->irq_context); @@ -3602,7 +3619,7 @@ static void check_chain_key(struct task_struct *curr) if (prev_hlock && (prev_hlock->irq_context != hlock->irq_context)) chain_key = INITIAL_CHAIN_KEY; - chain_key = iterate_chain_key(chain_key, hlock->class_idx); + chain_key = iterate_chain_key(chain_key, hlock_id(hlock)); prev_hlock = hlock; } if (chain_key != curr->curr_chain_key) { @@ -4749,7 +4766,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, chain_key = INITIAL_CHAIN_KEY; chain_head = 1; } - chain_key = iterate_chain_key(chain_key, class_idx); + chain_key = iterate_chain_key(chain_key, hlock_id(hlock)); if (nest_lock && !__lock_is_held(nest_lock, -1)) { print_lock_nested_lock_not_held(curr, hlock, ip); @@ -5648,7 +5665,7 @@ static void remove_class_from_lock_chain(struct pending_free *pf, int i; for (i = chain->base; i < chain->base + chain->depth; i++) { - if (chain_hlocks[i] != class - lock_classes) + if (chain_hlock_class_idx(chain_hlocks[i]) != class - lock_classes) continue; /* * Each lock class occurs at most once in a lock chain so once -- cgit v1.2.3 From c07203516439b9cd9f7b3cbed82a77164de5af40 Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Wed, 26 Aug 2020 04:00:22 +0000 Subject: audit: Remove redundant null check Because kfree_skb already checked NULL skb parameter, so the additional check is unnecessary, just remove it. Signed-off-by: Xu Wang Signed-off-by: Paul Moore --- kernel/audit.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 86f2b76e3d4e..68cee3bc8cfe 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -934,8 +934,7 @@ static void audit_free_reply(struct audit_reply *reply) if (!reply) return; - if (reply->skb) - kfree_skb(reply->skb); + kfree_skb(reply->skb); if (reply->net) put_net(reply->net); kfree(reply); -- cgit v1.2.3 From 7e40781cc8b767dd6530910ae12d75978d7d68e1 Mon Sep 17 00:00:00 2001 From: Udip Pant Date: Tue, 25 Aug 2020 16:20:00 -0700 Subject: bpf: verifier: Use target program's type for access verifications This patch adds changes in verifier to make decisions such as granting of read / write access or enforcement of return code status based on the program type of the target program while using dynamic program extension (of type BPF_PROG_TYPE_EXT). The BPF_PROG_TYPE_EXT type can be used to extend types such as XDP, SKB and others. Since the BPF_PROG_TYPE_EXT program type on itself is just a placeholder for those, we need this extended check for those extended programs to actually work with proper access, while using this option. Specifically, it introduces following changes: - may_access_direct_pkt_data: allow access to packet data based on the target prog - check_return_code: enforce return code based on the target prog (currently, this check is skipped for EXT program) - check_ld_abs: check for 'may_access_skb' based on the target prog - check_map_prog_compatibility: enforce the map compatibility check based on the target prog - may_update_sockmap: allow sockmap update based on the target prog Some other occurrences of prog->type is left as it without replacing with the 'resolved' type: - do_check_common() and check_attach_btf_id(): already have specific logic to handle the EXT prog type - jit_subprogs() and bpf_check(): Not changed for jit compilation or while inferring env->ops Next few patches in this series include selftests for some of these cases. Signed-off-by: Udip Pant Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200825232003.2877030-2-udippant@fb.com --- kernel/bpf/verifier.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7e5908b83ec7..8a097a85d01b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2625,11 +2625,19 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, #define MAX_PACKET_OFF 0xffff +static enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog) +{ + return prog->aux->linked_prog ? prog->aux->linked_prog->type + : prog->type; +} + static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, const struct bpf_call_arg_meta *meta, enum bpf_access_type t) { - switch (env->prog->type) { + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); + + switch (prog_type) { /* Program types only with direct read access go here! */ case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: @@ -4186,7 +4194,7 @@ err_type: static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) { enum bpf_attach_type eatype = env->prog->expected_attach_type; - enum bpf_prog_type type = env->prog->type; + enum bpf_prog_type type = resolve_prog_type(env->prog); if (func_id != BPF_FUNC_map_update_elem) return false; @@ -7376,7 +7384,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) u8 mode = BPF_MODE(insn->code); int i, err; - if (!may_access_skb(env->prog->type)) { + if (!may_access_skb(resolve_prog_type(env->prog))) { verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); return -EINVAL; } @@ -7464,11 +7472,12 @@ static int check_return_code(struct bpf_verifier_env *env) const struct bpf_prog *prog = env->prog; struct bpf_reg_state *reg; struct tnum range = tnum_range(0, 1); + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); int err; /* LSM and struct_ops func-ptr's return type could be "void" */ - if ((env->prog->type == BPF_PROG_TYPE_STRUCT_OPS || - env->prog->type == BPF_PROG_TYPE_LSM) && + if ((prog_type == BPF_PROG_TYPE_STRUCT_OPS || + prog_type == BPF_PROG_TYPE_LSM) && !prog->aux->attach_func_proto->type) return 0; @@ -7487,7 +7496,7 @@ static int check_return_code(struct bpf_verifier_env *env) return -EACCES; } - switch (env->prog->type) { + switch (prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || @@ -9243,6 +9252,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, struct bpf_prog *prog) { + enum bpf_prog_type prog_type = resolve_prog_type(prog); /* * Validate that trace type programs use preallocated hash maps. * @@ -9260,8 +9270,8 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, * now, but warnings are emitted so developers are made aware of * the unsafety and can fix their programs before this is enforced. */ - if (is_tracing_prog_type(prog->type) && !is_preallocated_map(map)) { - if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { + if (is_tracing_prog_type(prog_type) && !is_preallocated_map(map)) { + if (prog_type == BPF_PROG_TYPE_PERF_EVENT) { verbose(env, "perf_event programs can only use preallocated hash map\n"); return -EINVAL; } @@ -9273,8 +9283,8 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n"); } - if ((is_tracing_prog_type(prog->type) || - prog->type == BPF_PROG_TYPE_SOCKET_FILTER) && + if ((is_tracing_prog_type(prog_type) || + prog_type == BPF_PROG_TYPE_SOCKET_FILTER) && map_value_has_spin_lock(map)) { verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); return -EINVAL; @@ -9986,7 +9996,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); env->prog->aux->num_exentries++; - } else if (env->prog->type != BPF_PROG_TYPE_STRUCT_OPS) { + } else if (resolve_prog_type(env->prog) != BPF_PROG_TYPE_STRUCT_OPS) { verbose(env, "Writes through BTF pointers are not allowed\n"); return -EINVAL; } -- cgit v1.2.3 From 2921c90d471889242c24cff529043afb378937fa Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 24 Aug 2020 23:46:08 -0700 Subject: bpf: Fix a verifier failure with xor bpf selftest test_progs/test_sk_assign failed with llvm 11 and llvm 12. Compared to llvm 10, llvm 11 and 12 generates xor instruction which is not handled properly in verifier. The following illustrates the problem: 16: (b4) w5 = 0 17: ... R5_w=inv0 ... ... 132: (a4) w5 ^= 1 133: ... R5_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) ... ... 37: (bc) w8 = w5 38: ... R5=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R8_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) ... ... 41: (bc) w3 = w8 42: ... R3_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) ... 45: (56) if w3 != 0x0 goto pc+1 ... R3_w=inv0 ... 46: (b7) r1 = 34 47: R1_w=inv34 R7=pkt(id=0,off=26,r=38,imm=0) 47: (0f) r7 += r1 48: R1_w=invP34 R3_w=inv0 R7_w=pkt(id=0,off=60,r=38,imm=0) 48: (b4) w9 = 0 49: R1_w=invP34 R3_w=inv0 R7_w=pkt(id=0,off=60,r=38,imm=0) 49: (69) r1 = *(u16 *)(r7 +0) invalid access to packet, off=60 size=2, R7(id=0,off=60,r=38) R7 offset is outside of the packet At above insn 132, w5 = 0, but after w5 ^= 1, we give a really conservative value of w5. At insn 45, in reality the condition should be always false. But due to conservative value for w3, the verifier evaluates it could be true and this later leads to verifier failure complaining potential packet out-of-bound access. This patch implemented proper XOR support in verifier. In the above example, we have: 132: R5=invP0 132: (a4) w5 ^= 1 133: R5_w=invP1 ... 37: (bc) w8 = w5 ... 41: (bc) w3 = w8 42: R3_w=invP1 ... 45: (56) if w3 != 0x0 goto pc+1 47: R3_w=invP1 ... processed 353 insns ... and the verifier can verify the program successfully. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200825064608.2017937-1-yhs@fb.com --- kernel/bpf/verifier.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8a097a85d01b..6f5a9f51cc03 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5829,6 +5829,67 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, __update_reg_bounds(dst_reg); } +static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + bool src_known = tnum_subreg_is_const(src_reg->var_off); + bool dst_known = tnum_subreg_is_const(dst_reg->var_off); + struct tnum var32_off = tnum_subreg(dst_reg->var_off); + s32 smin_val = src_reg->s32_min_value; + + /* Assuming scalar64_min_max_xor will be called so it is safe + * to skip updating register for known case. + */ + if (src_known && dst_known) + return; + + /* We get both minimum and maximum from the var32_off. */ + dst_reg->u32_min_value = var32_off.value; + dst_reg->u32_max_value = var32_off.value | var32_off.mask; + + if (dst_reg->s32_min_value >= 0 && smin_val >= 0) { + /* XORing two positive sign numbers gives a positive, + * so safe to cast u32 result into s32. + */ + dst_reg->s32_min_value = dst_reg->u32_min_value; + dst_reg->s32_max_value = dst_reg->u32_max_value; + } else { + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + } +} + +static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + bool src_known = tnum_is_const(src_reg->var_off); + bool dst_known = tnum_is_const(dst_reg->var_off); + s64 smin_val = src_reg->smin_value; + + if (src_known && dst_known) { + /* dst_reg->var_off.value has been updated earlier */ + __mark_reg_known(dst_reg, dst_reg->var_off.value); + return; + } + + /* We get both minimum and maximum from the var_off. */ + dst_reg->umin_value = dst_reg->var_off.value; + dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; + + if (dst_reg->smin_value >= 0 && smin_val >= 0) { + /* XORing two positive sign numbers gives a positive, + * so safe to cast u64 result into s64. + */ + dst_reg->smin_value = dst_reg->umin_value; + dst_reg->smax_value = dst_reg->umax_value; + } else { + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } + + __update_reg_bounds(dst_reg); +} + static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, u64 umin_val, u64 umax_val) { @@ -6137,6 +6198,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar32_min_max_or(dst_reg, &src_reg); scalar_min_max_or(dst_reg, &src_reg); break; + case BPF_XOR: + dst_reg->var_off = tnum_xor(dst_reg->var_off, src_reg.var_off); + scalar32_min_max_xor(dst_reg, &src_reg); + scalar_min_max_xor(dst_reg, &src_reg); + break; case BPF_LSH: if (umax_val >= insn_bitness) { /* Shifts greater than 31 or 63 are undefined. -- cgit v1.2.3 From f4d05259213ff1e91f767c91dcab455f68308fac Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 27 Aug 2020 18:18:06 -0700 Subject: bpf: Add map_meta_equal map ops Some properties of the inner map is used in the verification time. When an inner map is inserted to an outer map at runtime, bpf_map_meta_equal() is currently used to ensure those properties of the inserting inner map stays the same as the verification time. In particular, the current bpf_map_meta_equal() checks max_entries which turns out to be too restrictive for most of the maps which do not use max_entries during the verification time. It limits the use case that wants to replace a smaller inner map with a larger inner map. There are some maps do use max_entries during verification though. For example, the map_gen_lookup in array_map_ops uses the max_entries to generate the inline lookup code. To accommodate differences between maps, the map_meta_equal is added to bpf_map_ops. Each map-type can decide what to check when its map is used as an inner map during runtime. Also, some map types cannot be used as an inner map and they are currently black listed in bpf_map_meta_alloc() in map_in_map.c. It is not unusual that the new map types may not aware that such blacklist exists. This patch enforces an explicit opt-in and only allows a map to be used as an inner map if it has implemented the map_meta_equal ops. It is based on the discussion in [1]. All maps that support inner map has its map_meta_equal points to bpf_map_meta_equal in this patch. A later patch will relax the max_entries check for most maps. bpf_types.h counts 28 map types. This patch adds 23 ".map_meta_equal" by using coccinelle. -5 for BPF_MAP_TYPE_PROG_ARRAY BPF_MAP_TYPE_(PERCPU)_CGROUP_STORAGE BPF_MAP_TYPE_STRUCT_OPS BPF_MAP_TYPE_ARRAY_OF_MAPS BPF_MAP_TYPE_HASH_OF_MAPS The "if (inner_map->inner_map_meta)" check in bpf_map_meta_alloc() is moved such that the same error is returned. [1]: https://lore.kernel.org/bpf/20200522022342.899756-1-kafai@fb.com/ Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200828011806.1970400-1-kafai@fb.com --- kernel/bpf/arraymap.c | 9 +++++++++ kernel/bpf/bpf_inode_storage.c | 1 + kernel/bpf/cpumap.c | 1 + kernel/bpf/devmap.c | 2 ++ kernel/bpf/hashtab.c | 4 ++++ kernel/bpf/lpm_trie.c | 1 + kernel/bpf/map_in_map.c | 21 ++++++++------------- kernel/bpf/map_in_map.h | 2 -- kernel/bpf/queue_stack_maps.c | 2 ++ kernel/bpf/reuseport_array.c | 1 + kernel/bpf/ringbuf.c | 1 + kernel/bpf/stackmap.c | 1 + kernel/bpf/syscall.c | 1 + 13 files changed, 32 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 8ff419b632a6..40d1f7f94307 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -625,6 +625,7 @@ static const struct bpf_iter_seq_info iter_seq_info = { static int array_map_btf_id; const struct bpf_map_ops array_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, @@ -647,6 +648,7 @@ const struct bpf_map_ops array_map_ops = { static int percpu_array_map_btf_id; const struct bpf_map_ops percpu_array_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, @@ -1003,6 +1005,11 @@ static void prog_array_map_free(struct bpf_map *map) fd_array_map_free(map); } +/* prog_array->aux->{type,jited} is a runtime binding. + * Doing static check alone in the verifier is not enough. + * Thus, prog_array_map cannot be used as an inner_map + * and map_meta_equal is not implemented. + */ static int prog_array_map_btf_id; const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, @@ -1101,6 +1108,7 @@ static void perf_event_fd_array_release(struct bpf_map *map, static int perf_event_array_map_btf_id; const struct bpf_map_ops perf_event_array_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = fd_array_map_free, @@ -1137,6 +1145,7 @@ static void cgroup_fd_array_free(struct bpf_map *map) static int cgroup_array_map_btf_id; const struct bpf_map_ops cgroup_array_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = cgroup_fd_array_free, diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index f3a44e929447..75be02799c0f 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -235,6 +235,7 @@ static void inode_storage_map_free(struct bpf_map *map) static int inode_storage_map_btf_id; const struct bpf_map_ops inode_storage_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, .map_alloc = inode_storage_map_alloc, .map_free = inode_storage_map_free, diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index f1c46529929b..8d2a8623d2a7 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -658,6 +658,7 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) static int cpu_map_btf_id; const struct bpf_map_ops cpu_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = cpu_map_alloc, .map_free = cpu_map_free, .map_delete_elem = cpu_map_delete_elem, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 10abb06065bb..a42052b85c35 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -751,6 +751,7 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, static int dev_map_btf_id; const struct bpf_map_ops dev_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_get_next_key, @@ -764,6 +765,7 @@ const struct bpf_map_ops dev_map_ops = { static int dev_map_hash_map_btf_id; const struct bpf_map_ops dev_map_hash_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_hash_get_next_key, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 78dfff6a501b..ad80f45774e7 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1810,6 +1810,7 @@ static const struct bpf_iter_seq_info iter_seq_info = { static int htab_map_btf_id; const struct bpf_map_ops htab_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, @@ -1827,6 +1828,7 @@ const struct bpf_map_ops htab_map_ops = { static int htab_lru_map_btf_id; const struct bpf_map_ops htab_lru_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, @@ -1947,6 +1949,7 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, static int htab_percpu_map_btf_id; const struct bpf_map_ops htab_percpu_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, @@ -1963,6 +1966,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { static int htab_lru_percpu_map_btf_id; const struct bpf_map_ops htab_lru_percpu_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 44474bf3ab7a..00e32f2ec3e6 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -732,6 +732,7 @@ static int trie_check_btf(const struct bpf_map *map, static int trie_map_btf_id; const struct bpf_map_ops trie_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = trie_alloc, .map_free = trie_free, .map_get_next_key = trie_get_next_key, diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 17738c93bec8..e97a22dd3232 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -17,23 +17,17 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) if (IS_ERR(inner_map)) return inner_map; - /* prog_array->aux->{type,jited} is a runtime binding. - * Doing static check alone in the verifier is not enough. - */ - if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || - inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE || - inner_map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { - fdput(f); - return ERR_PTR(-ENOTSUPP); - } - /* Does not support >1 level map-in-map */ if (inner_map->inner_map_meta) { fdput(f); return ERR_PTR(-EINVAL); } + if (!inner_map->ops->map_meta_equal) { + fdput(f); + return ERR_PTR(-ENOTSUPP); + } + if (map_value_has_spin_lock(inner_map)) { fdput(f); return ERR_PTR(-ENOTSUPP); @@ -89,7 +83,7 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, int ufd) { - struct bpf_map *inner_map; + struct bpf_map *inner_map, *inner_map_meta; struct fd f; f = fdget(ufd); @@ -97,7 +91,8 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, if (IS_ERR(inner_map)) return inner_map; - if (bpf_map_meta_equal(map->inner_map_meta, inner_map)) + inner_map_meta = map->inner_map_meta; + if (inner_map_meta->ops->map_meta_equal(inner_map_meta, inner_map)) bpf_map_inc(inner_map); else inner_map = ERR_PTR(-EINVAL); diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index a507bf6ef8b9..bcb7534afb3c 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -11,8 +11,6 @@ struct bpf_map; struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); void bpf_map_meta_free(struct bpf_map *map_meta); -bool bpf_map_meta_equal(const struct bpf_map *meta0, - const struct bpf_map *meta1); void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd); void bpf_map_fd_put_ptr(void *ptr); diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 44184f82916a..0ee2347ba510 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -257,6 +257,7 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, static int queue_map_btf_id; const struct bpf_map_ops queue_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = queue_stack_map_alloc_check, .map_alloc = queue_stack_map_alloc, .map_free = queue_stack_map_free, @@ -273,6 +274,7 @@ const struct bpf_map_ops queue_map_ops = { static int stack_map_btf_id; const struct bpf_map_ops stack_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = queue_stack_map_alloc_check, .map_alloc = queue_stack_map_alloc, .map_free = queue_stack_map_free, diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 90b29c5b1da7..5a2ba1182493 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -351,6 +351,7 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key, static int reuseport_array_map_btf_id; const struct bpf_map_ops reuseport_array_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = reuseport_array_alloc_check, .map_alloc = reuseport_array_alloc, .map_free = reuseport_array_free, diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 002f8a5c9e51..31cb04a4dd2d 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -287,6 +287,7 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, static int ringbuf_map_btf_id; const struct bpf_map_ops ringbuf_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, .map_mmap = ringbuf_map_mmap, diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index cfed0ac44d38..a2fa006f430e 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -839,6 +839,7 @@ static void stack_map_free(struct bpf_map *map) static int stack_trace_map_btf_id; const struct bpf_map_ops stack_trace_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = stack_map_alloc, .map_free = stack_map_free, .map_get_next_key = stack_map_get_next_key, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5443cea86cef..b86b1155b748 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -90,6 +90,7 @@ int bpf_check_uarg_tail_zero(void __user *uaddr, } const struct bpf_map_ops bpf_map_offload_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = bpf_map_offload_map_alloc, .map_free = bpf_map_offload_map_free, .map_check_btf = map_check_no_btf, -- cgit v1.2.3 From 134fede4eecfcbe7900e789f625fa6f9c3a8cd0e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 27 Aug 2020 18:18:13 -0700 Subject: bpf: Relax max_entries check for most of the inner map types Most of the maps do not use max_entries during verification time. Thus, those map_meta_equal() do not need to enforce max_entries when it is inserted as an inner map during runtime. The max_entries check is removed from the default implementation bpf_map_meta_equal(). The prog_array_map and xsk_map are exception. Its map_gen_lookup uses max_entries to generate inline lookup code. Thus, they will implement its own map_meta_equal() to enforce max_entries. Since there are only two cases now, the max_entries check is not refactored and stays in its own .c file. Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200828011813.1970516-1-kafai@fb.com --- kernel/bpf/arraymap.c | 9 ++++++++- kernel/bpf/map_in_map.c | 3 +-- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 40d1f7f94307..d851ebbcf302 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -487,6 +487,13 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) vma->vm_pgoff + pgoff); } +static bool array_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1) +{ + return meta0->max_entries == meta1->max_entries && + bpf_map_meta_equal(meta0, meta1); +} + struct bpf_iter_seq_array_map_info { struct bpf_map *map; void *percpu_value_buf; @@ -625,7 +632,7 @@ static const struct bpf_iter_seq_info iter_seq_info = { static int array_map_btf_id; const struct bpf_map_ops array_map_ops = { - .map_meta_equal = bpf_map_meta_equal, + .map_meta_equal = array_map_meta_equal, .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index e97a22dd3232..39ab0b68cade 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -75,8 +75,7 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, return meta0->map_type == meta1->map_type && meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && - meta0->map_flags == meta1->map_flags && - meta0->max_entries == meta1->max_entries; + meta0->map_flags == meta1->map_flags; } void *bpf_map_fd_get_ptr(struct bpf_map *map, -- cgit v1.2.3 From 1e6c62a8821557720a9b2ea9617359b264f2f67c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 27 Aug 2020 15:01:11 -0700 Subject: bpf: Introduce sleepable BPF programs Introduce sleepable BPF programs that can request such property for themselves via BPF_F_SLEEPABLE flag at program load time. In such case they will be able to use helpers like bpf_copy_from_user() that might sleep. At present only fentry/fexit/fmod_ret and lsm programs can request to be sleepable and only when they are attached to kernel functions that are known to allow sleeping. The non-sleepable programs are relying on implicit rcu_read_lock() and migrate_disable() to protect life time of programs, maps that they use and per-cpu kernel structures used to pass info between bpf programs and the kernel. The sleepable programs cannot be enclosed into rcu_read_lock(). migrate_disable() maps to preempt_disable() in non-RT kernels, so the progs should not be enclosed in migrate_disable() as well. Therefore rcu_read_lock_trace is used to protect the life time of sleepable progs. There are many networking and tracing program types. In many cases the 'struct bpf_prog *' pointer itself is rcu protected within some other kernel data structure and the kernel code is using rcu_dereference() to load that program pointer and call BPF_PROG_RUN() on it. All these cases are not touched. Instead sleepable bpf programs are allowed with bpf trampoline only. The program pointers are hard-coded into generated assembly of bpf trampoline and synchronize_rcu_tasks_trace() is used to protect the life time of the program. The same trampoline can hold both sleepable and non-sleepable progs. When rcu_read_lock_trace is held it means that some sleepable bpf program is running from bpf trampoline. Those programs can use bpf arrays and preallocated hash/lru maps. These map types are waiting on programs to complete via synchronize_rcu_tasks_trace(); Updates to trampoline now has to do synchronize_rcu_tasks_trace() and synchronize_rcu_tasks() to wait for sleepable progs to finish and for trampoline assembly to finish. This is the first step of introducing sleepable progs. Eventually dynamically allocated hash maps can be allowed and networking program types can become sleepable too. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Reviewed-by: Josef Bacik Acked-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200827220114.69225-3-alexei.starovoitov@gmail.com --- kernel/bpf/arraymap.c | 1 + kernel/bpf/hashtab.c | 12 ++++---- kernel/bpf/syscall.c | 13 ++++++-- kernel/bpf/trampoline.c | 28 +++++++++++++++-- kernel/bpf/verifier.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 121 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index d851ebbcf302..e046fb7d17cd 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "map_in_map.h" diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index ad80f45774e7..fe0e06284d33 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" @@ -577,8 +578,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) struct htab_elem *l; u32 hash, key_size; - /* Must be called with rcu_read_lock. */ - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -941,7 +941,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1032,7 +1032,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1220,7 +1220,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret = -ENOENT; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1252,7 +1252,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret = -ENOENT; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b86b1155b748..4108ef3b828b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -29,6 +29,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -1731,10 +1732,14 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) btf_put(prog->aux->btf); bpf_prog_free_linfo(prog); - if (deferred) - call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); - else + if (deferred) { + if (prog->aux->sleepable) + call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu); + else + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); + } else { __bpf_prog_put_rcu(&prog->aux->rcu); + } } static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) @@ -2104,6 +2109,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT | BPF_F_TEST_STATE_FREQ | + BPF_F_SLEEPABLE | BPF_F_TEST_RND_HI32)) return -EINVAL; @@ -2159,6 +2165,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) } prog->aux->offload_requested = !!attr->prog_ifindex; + prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; err = security_bpf_prog_alloc(prog->aux); if (err) diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 9be85aa4ec5f..c2b76545153c 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -210,9 +212,12 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) * updates to trampoline would change the code from underneath the * preempted task. Hence wait for tasks to voluntarily schedule or go * to userspace. + * The same trampoline can hold both sleepable and non-sleepable progs. + * synchronize_rcu_tasks_trace() is needed to make sure all sleepable + * programs finish executing. + * Wait for these two grace periods together. */ - - synchronize_rcu_tasks(); + synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace); err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, &tr->func.model, flags, tprogs, @@ -344,7 +349,14 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; bpf_image_ksym_del(&tr->ksym); - /* wait for tasks to get out of trampoline before freeing it */ + /* This code will be executed when all bpf progs (both sleepable and + * non-sleepable) went through + * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred(). + * Hence no need for another synchronize_rcu_tasks_trace() here, + * but synchronize_rcu_tasks() is still needed, since trampoline + * may not have had any sleepable programs and we need to wait + * for tasks to get out of trampoline code before freeing it. + */ synchronize_rcu_tasks(); bpf_jit_free_exec(tr->image); hlist_del(&tr->hlist); @@ -394,6 +406,16 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) rcu_read_unlock(); } +void notrace __bpf_prog_enter_sleepable(void) +{ + rcu_read_lock_trace(); +} + +void notrace __bpf_prog_exit_sleepable(void) +{ + rcu_read_unlock_trace(); +} + int __weak arch_prepare_bpf_trampoline(void *image, void *image_end, const struct btf_func_model *m, u32 flags, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6f5a9f51cc03..3ebfdb7bd427 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "disasm.h" @@ -9367,6 +9368,23 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } + if (prog->aux->sleepable) + switch (map->map_type) { + case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_LRU_HASH: + case BPF_MAP_TYPE_ARRAY: + if (!is_preallocated_map(map)) { + verbose(env, + "Sleepable programs can only use preallocated hash maps\n"); + return -EINVAL; + } + break; + default: + verbose(env, + "Sleepable programs can only use array and hash maps\n"); + return -EINVAL; + } + return 0; } @@ -10985,6 +11003,36 @@ static int check_attach_modify_return(struct bpf_prog *prog, unsigned long addr) return -EINVAL; } +/* non exhaustive list of sleepable bpf_lsm_*() functions */ +BTF_SET_START(btf_sleepable_lsm_hooks) +#ifdef CONFIG_BPF_LSM +BTF_ID(func, bpf_lsm_file_mprotect) +BTF_ID(func, bpf_lsm_bprm_committed_creds) +#endif +BTF_SET_END(btf_sleepable_lsm_hooks) + +static int check_sleepable_lsm_hook(u32 btf_id) +{ + return btf_id_set_contains(&btf_sleepable_lsm_hooks, btf_id); +} + +/* list of non-sleepable functions that are otherwise on + * ALLOW_ERROR_INJECTION list + */ +BTF_SET_START(btf_non_sleepable_error_inject) +/* Three functions below can be called from sleepable and non-sleepable context. + * Assume non-sleepable from bpf safety point of view. + */ +BTF_ID(func, __add_to_page_cache_locked) +BTF_ID(func, should_fail_alloc_page) +BTF_ID(func, should_failslab) +BTF_SET_END(btf_non_sleepable_error_inject) + +static int check_non_sleepable_error_inject(u32 btf_id) +{ + return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id); +} + static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; @@ -11002,6 +11050,12 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) long addr; u64 key; + if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING && + prog->type != BPF_PROG_TYPE_LSM) { + verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n"); + return -EINVAL; + } + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) return check_struct_ops_btf_id(env); @@ -11210,13 +11264,36 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } } - if (prog->expected_attach_type == BPF_MODIFY_RETURN) { + if (prog->aux->sleepable) { + ret = -EINVAL; + switch (prog->type) { + case BPF_PROG_TYPE_TRACING: + /* fentry/fexit/fmod_ret progs can be sleepable only if they are + * attached to ALLOW_ERROR_INJECTION and are not in denylist. + */ + if (!check_non_sleepable_error_inject(btf_id) && + within_error_injection_list(addr)) + ret = 0; + break; + case BPF_PROG_TYPE_LSM: + /* LSM progs check that they are attached to bpf_lsm_*() funcs. + * Only some of them are sleepable. + */ + if (check_sleepable_lsm_hook(btf_id)) + ret = 0; + break; + default: + break; + } + if (ret) + verbose(env, "%s is not sleepable\n", + prog->aux->attach_func_name); + } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) { ret = check_attach_modify_return(prog, addr); if (ret) verbose(env, "%s() is not modifiable\n", prog->aux->attach_func_name); } - if (ret) goto out; tr->func.addr = (void *)addr; -- cgit v1.2.3 From 07be4c4a3e7a0db148e44b16c5190e753d1c8569 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 27 Aug 2020 15:01:12 -0700 Subject: bpf: Add bpf_copy_from_user() helper. Sleepable BPF programs can now use copy_from_user() to access user memory. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200827220114.69225-4-alexei.starovoitov@gmail.com --- kernel/bpf/helpers.c | 22 ++++++++++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ 2 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index be43ab3e619f..5cc7425ee476 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -601,6 +601,28 @@ const struct bpf_func_proto bpf_event_output_data_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size, + const void __user *, user_ptr) +{ + int ret = copy_from_user(dst, user_ptr, size); + + if (unlikely(ret)) { + memset(dst, 0, size); + ret = -EFAULT; + } + + return ret; +} + +const struct bpf_func_proto bpf_copy_from_user_proto = { + .func = bpf_copy_from_user, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d973d891f2e2..b2a5380eb187 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1228,6 +1228,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_jiffies64_proto; case BPF_FUNC_get_task_stack: return &bpf_get_task_stack_proto; + case BPF_FUNC_copy_from_user: + return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; default: return NULL; } -- cgit v1.2.3 From cd290ec24633f51029dab0d25505fae7da0e1eda Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 21 Aug 2020 14:31:26 +0200 Subject: kcsan: Use tracing-safe version of prandom In the core runtime, we must minimize any calls to external library functions to avoid any kind of recursion. This can happen even though instrumentation is disabled for called functions, but tracing is enabled. Most recently, prandom_u32() added a tracepoint, which can cause problems for KCSAN even if the rcuidle variant is used. For example: kcsan -> prandom_u32() -> trace_prandom_u32_rcuidle -> srcu_read_lock_notrace -> __srcu_read_lock -> kcsan ... While we could disable KCSAN in kcsan_setup_watchpoint(), this does not solve other unexpected behaviour we may get due recursing into functions that may not be tolerant to such recursion: __srcu_read_lock -> kcsan -> ... -> __srcu_read_lock Therefore, switch to using prandom_u32_state(), which is uninstrumented, and does not have a tracepoint. Link: https://lkml.kernel.org/r/20200821063043.1949509-1-elver@google.com Link: https://lkml.kernel.org/r/20200820172046.GA177701@elver.google.com Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 8a1ff605ff2d..3994a217bde7 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -100,6 +100,9 @@ static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1]; */ static DEFINE_PER_CPU(long, kcsan_skip); +/* For kcsan_prandom_u32_max(). */ +static DEFINE_PER_CPU(struct rnd_state, kcsan_rand_state); + static __always_inline atomic_long_t *find_watchpoint(unsigned long addr, size_t size, bool expect_write, @@ -271,11 +274,28 @@ should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx * return true; } +/* + * Returns a pseudo-random number in interval [0, ep_ro). See prandom_u32_max() + * for more details. + * + * The open-coded version here is using only safe primitives for all contexts + * where we can have KCSAN instrumentation. In particular, we cannot use + * prandom_u32() directly, as its tracepoint could cause recursion. + */ +static u32 kcsan_prandom_u32_max(u32 ep_ro) +{ + struct rnd_state *state = &get_cpu_var(kcsan_rand_state); + const u32 res = prandom_u32_state(state); + + put_cpu_var(kcsan_rand_state); + return (u32)(((u64) res * ep_ro) >> 32); +} + static inline void reset_kcsan_skip(void) { long skip_count = kcsan_skip_watch - (IS_ENABLED(CONFIG_KCSAN_SKIP_WATCH_RANDOMIZE) ? - prandom_u32_max(kcsan_skip_watch) : + kcsan_prandom_u32_max(kcsan_skip_watch) : 0); this_cpu_write(kcsan_skip, skip_count); } @@ -285,16 +305,18 @@ static __always_inline bool kcsan_is_enabled(void) return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0; } -static inline unsigned int get_delay(int type) +/* Introduce delay depending on context and configuration. */ +static void delay_access(int type) { unsigned int delay = in_task() ? kcsan_udelay_task : kcsan_udelay_interrupt; /* For certain access types, skew the random delay to be longer. */ unsigned int skew_delay_order = (type & (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_ASSERT)) ? 1 : 0; - return delay - (IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ? - prandom_u32_max(delay >> skew_delay_order) : - 0); + delay -= IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ? + kcsan_prandom_u32_max(delay >> skew_delay_order) : + 0; + udelay(delay); } void kcsan_save_irqtrace(struct task_struct *task) @@ -476,7 +498,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * Delay this thread, to increase probability of observing a racy * conflicting access. */ - udelay(get_delay(type)); + delay_access(type); /* * Re-read value, and check if it is as expected; if not, we infer a @@ -620,6 +642,7 @@ void __init kcsan_init(void) BUG_ON(!in_task()); kcsan_debugfs_init(); + prandom_seed_full_state(&kcsan_rand_state); /* * We are in the init task, and no other tasks should be running; -- cgit v1.2.3 From 29523c5e6716521f6e2fb59d7785e2bc0b1a993a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 31 Aug 2020 09:31:32 -0700 Subject: bpf: Fix build without BPF_LSM. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit resolve_btfids doesn't like empty set. Add unused ID when BPF_LSM is off. Fixes: 1e6c62a88215 ("bpf: Introduce sleepable BPF programs") Reported-by: Björn Töpel Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Tested-by: Song Liu Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200831163132.66521-1-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3ebfdb7bd427..b4c22b5ce5a2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11008,6 +11008,8 @@ BTF_SET_START(btf_sleepable_lsm_hooks) #ifdef CONFIG_BPF_LSM BTF_ID(func, bpf_lsm_file_mprotect) BTF_ID(func, bpf_lsm_bprm_committed_creds) +#else +BTF_ID_UNUSED #endif BTF_SET_END(btf_sleepable_lsm_hooks) -- cgit v1.2.3 From f56407fa6e69499a06bf1e0543fa93be6922acba Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 31 Aug 2020 13:16:51 -0700 Subject: bpf: Remove bpf_lsm_file_mprotect from sleepable list. Technically the bpf programs can sleep while attached to bpf_lsm_file_mprotect, but such programs need to access user memory. So they're in might_fault() category. Which means they cannot be called from file_mprotect lsm hook that takes write lock on mm->mmap_lock. Adjust the test accordingly. Also add might_fault() to __bpf_prog_enter_sleepable() to catch such deadlocks early. Fixes: 1e6c62a88215 ("bpf: Introduce sleepable BPF programs") Fixes: e68a144547fc ("selftests/bpf: Add sleepable tests") Reported-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200831201651.82447-1-alexei.starovoitov@gmail.com --- kernel/bpf/trampoline.c | 1 + kernel/bpf/verifier.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index c2b76545153c..7dd523a7e32d 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -409,6 +409,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) void notrace __bpf_prog_enter_sleepable(void) { rcu_read_lock_trace(); + might_fault(); } void notrace __bpf_prog_exit_sleepable(void) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b4c22b5ce5a2..b4e9c56b8b32 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11006,7 +11006,6 @@ static int check_attach_modify_return(struct bpf_prog *prog, unsigned long addr) /* non exhaustive list of sleepable bpf_lsm_*() functions */ BTF_SET_START(btf_sleepable_lsm_hooks) #ifdef CONFIG_BPF_LSM -BTF_ID(func, bpf_lsm_file_mprotect) BTF_ID(func, bpf_lsm_bprm_committed_creds) #else BTF_ID_UNUSED -- cgit v1.2.3 From b7176c261cdbced87bed9562577333150ed05b01 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 24 Aug 2020 11:03:07 +1200 Subject: dma-contiguous: provide the ability to reserve per-numa CMA Right now, drivers like ARM SMMU are using dma_alloc_coherent() to get coherent DMA buffers to save their command queues and page tables. As there is only one default CMA in the whole system, SMMUs on nodes other than node0 will get remote memory. This leads to significant latency. This patch provides per-numa CMA so that drivers like SMMU can get local memory. Tests show localizing CMA can decrease dma_unmap latency much. For instance, before this patch, SMMU on node2 has to wait for more than 560ns for the completion of CMD_SYNC in an empty command queue; with this patch, it needs 240ns only. A positive side effect of this patch would be improving performance even further for those users who are worried about performance more than DMA security and use iommu.passthrough=1 to skip IOMMU. With local CMA, all drivers can get local coherent DMA buffers. Also, this patch changes the default CONFIG_CMA_AREAS to 19 in NUMA. As 1+CONFIG_CMA_AREAS should be quite enough for most servers on the market even they enable both hugetlb_cma and pernuma_cma. 2 numa nodes: 2(hugetlb) + 2(pernuma) + 1(default global cma) = 5 4 numa nodes: 4(hugetlb) + 4(pernuma) + 1(default global cma) = 9 8 numa nodes: 8(hugetlb) + 8(pernuma) + 1(default global cma) = 17 Signed-off-by: Barry Song Signed-off-by: Christoph Hellwig --- kernel/dma/Kconfig | 11 ++++++ kernel/dma/contiguous.c | 100 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 101 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 847a9d1fa634..0ddfb5510fe4 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -118,6 +118,17 @@ config DMA_CMA If unsure, say "n". if DMA_CMA + +config DMA_PERNUMA_CMA + bool "Enable separate DMA Contiguous Memory Area for each NUMA Node" + default NUMA && ARM64 + help + Enable this option to get pernuma CMA areas so that devices like + ARM64 SMMU can get local memory by DMA coherent APIs. + + You can set the size of pernuma CMA by specifying "cma_pernuma=size" + on the kernel's command line. + comment "Default contiguous memory area size:" config CMA_SIZE_MBYTES diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index cff7e60968b9..aa53384fd7dc 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -69,6 +69,19 @@ static int __init early_cma(char *p) } early_param("cma", early_cma); +#ifdef CONFIG_DMA_PERNUMA_CMA + +static struct cma *dma_contiguous_pernuma_area[MAX_NUMNODES]; +static phys_addr_t pernuma_size_bytes __initdata; + +static int __init early_cma_pernuma(char *p) +{ + pernuma_size_bytes = memparse(p, &p); + return 0; +} +early_param("cma_pernuma", early_cma_pernuma); +#endif + #ifdef CONFIG_CMA_SIZE_PERCENTAGE static phys_addr_t __init __maybe_unused cma_early_percent_memory(void) @@ -96,6 +109,34 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void) #endif +#ifdef CONFIG_DMA_PERNUMA_CMA +void __init dma_pernuma_cma_reserve(void) +{ + int nid; + + if (!pernuma_size_bytes) + return; + + for_each_online_node(nid) { + int ret; + char name[20]; + struct cma **cma = &dma_contiguous_pernuma_area[nid]; + + snprintf(name, sizeof(name), "pernuma%d", nid); + ret = cma_declare_contiguous_nid(0, pernuma_size_bytes, 0, 0, + 0, false, name, cma, nid); + if (ret) { + pr_warn("%s: reservation failed: err %d, node %d", __func__, + ret, nid); + continue; + } + + pr_debug("%s: reserved %llu MiB on node %d\n", __func__, + (unsigned long long)pernuma_size_bytes / SZ_1M, nid); + } +} +#endif + /** * dma_contiguous_reserve() - reserve area(s) for contiguous memory handling * @limit: End address of the reserved memory (optional, 0 for any). @@ -228,23 +269,44 @@ static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp) * @size: Requested allocation size. * @gfp: Allocation flags. * - * This function allocates contiguous memory buffer for specified device. It - * tries to use device specific contiguous memory area if available, or the - * default global one. + * tries to use device specific contiguous memory area if available, or it + * tries to use per-numa cma, if the allocation fails, it will fallback to + * try default global one. * - * Note that it byapss one-page size of allocations from the global area as - * the addresses within one page are always contiguous, so there is no need - * to waste CMA pages for that kind; it also helps reduce fragmentations. + * Note that it bypass one-page size of allocations from the per-numa and + * global area as the addresses within one page are always contiguous, so + * there is no need to waste CMA pages for that kind; it also helps reduce + * fragmentations. */ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) { +#ifdef CONFIG_DMA_PERNUMA_CMA + int nid = dev_to_node(dev); +#endif + /* CMA can be used only in the context which permits sleeping */ if (!gfpflags_allow_blocking(gfp)) return NULL; if (dev->cma_area) return cma_alloc_aligned(dev->cma_area, size, gfp); - if (size <= PAGE_SIZE || !dma_contiguous_default_area) + if (size <= PAGE_SIZE) + return NULL; + +#ifdef CONFIG_DMA_PERNUMA_CMA + if (nid != NUMA_NO_NODE && !(gfp & (GFP_DMA | GFP_DMA32))) { + struct cma *cma = dma_contiguous_pernuma_area[nid]; + struct page *page; + + if (cma) { + page = cma_alloc_aligned(cma, size, gfp); + if (page) + return page; + } + } +#endif + if (!dma_contiguous_default_area) return NULL; + return cma_alloc_aligned(dma_contiguous_default_area, size, gfp); } @@ -261,9 +323,27 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) */ void dma_free_contiguous(struct device *dev, struct page *page, size_t size) { - if (!cma_release(dev_get_cma_area(dev), page, - PAGE_ALIGN(size) >> PAGE_SHIFT)) - __free_pages(page, get_order(size)); + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + /* if dev has its own cma, free page from there */ + if (dev->cma_area) { + if (cma_release(dev->cma_area, page, count)) + return; + } else { + /* + * otherwise, page is from either per-numa cma or default cma + */ +#ifdef CONFIG_DMA_PERNUMA_CMA + if (cma_release(dma_contiguous_pernuma_area[page_to_nid(page)], + page, count)) + return; +#endif + if (cma_release(dma_contiguous_default_area, page, count)) + return; + } + + /* not in any cma, free from buddy */ + __free_pages(page, get_order(size)); } /* -- cgit v1.2.3 From 2281f797f5524abb8fff66bf8540b4f4687332a2 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 24 Aug 2020 11:03:09 +1200 Subject: mm: cma: use CMA_MAX_NAME to define the length of cma name array CMA_MAX_NAME should be visible to CMA's users as they might need it to set the name of CMA areas and avoid hardcoding the size locally. So this patch moves CMA_MAX_NAME from local header file to include/linux header file and removes the hardcode in both hugetlb.c and contiguous.c. Signed-off-by: Barry Song Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index aa53384fd7dc..f4c150810fd2 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -119,7 +119,7 @@ void __init dma_pernuma_cma_reserve(void) for_each_online_node(nid) { int ret; - char name[20]; + char name[CMA_MAX_NAME]; struct cma **cma = &dma_contiguous_pernuma_area[nid]; snprintf(name, sizeof(name), "pernuma%d", nid); -- cgit v1.2.3 From 70d932985757fbe978024db313001218e9f8fe5c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 18 Aug 2020 15:57:36 +0200 Subject: notifier: Fix broken error handling pattern The current notifiers have the following error handling pattern all over the place: int err, nr; err = __foo_notifier_call_chain(&chain, val_up, v, -1, &nr); if (err & NOTIFIER_STOP_MASK) __foo_notifier_call_chain(&chain, val_down, v, nr-1, NULL) And aside from the endless repetition thereof, it is broken. Consider blocking notifiers; both calls take and drop the rwsem, this means that the notifier list can change in between the two calls, making @nr meaningless. Fix this by replacing all the __foo_notifier_call_chain() functions with foo_notifier_call_chain_robust() that embeds the above pattern, but ensures it is inside a single lock region. Note: I switched atomic_notifier_call_chain_robust() to use the spinlock, since RCU cannot provide the guarantee required for the recovery. Note: software_resume() error handling was broken afaict. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20200818135804.325626653@infradead.org --- kernel/cpu_pm.c | 48 ++++++---------- kernel/notifier.c | 144 +++++++++++++++++++++++++++++------------------ kernel/power/hibernate.c | 39 ++++++------- kernel/power/main.c | 8 +-- kernel/power/power.h | 3 +- kernel/power/suspend.c | 14 ++--- kernel/power/user.c | 14 ++--- 7 files changed, 139 insertions(+), 131 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 44a259338e33..f7e1d0eccdbc 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -15,18 +15,28 @@ static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain); -static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls) +static int cpu_pm_notify(enum cpu_pm_event event) { int ret; /* - * __atomic_notifier_call_chain has a RCU read critical section, which + * atomic_notifier_call_chain has a RCU read critical section, which * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let * RCU know this. */ rcu_irq_enter_irqson(); - ret = __atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL, - nr_to_call, nr_calls); + ret = atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL); + rcu_irq_exit_irqson(); + + return notifier_to_errno(ret); +} + +static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event event_down) +{ + int ret; + + rcu_irq_enter_irqson(); + ret = atomic_notifier_call_chain_robust(&cpu_pm_notifier_chain, event_up, event_down, NULL); rcu_irq_exit_irqson(); return notifier_to_errno(ret); @@ -80,18 +90,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); */ int cpu_pm_enter(void) { - int nr_calls = 0; - int ret = 0; - - ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls); - if (ret) - /* - * Inform listeners (nr_calls - 1) about failure of CPU PM - * PM entry who are notified earlier to prepare for it. - */ - cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL); - - return ret; + return cpu_pm_notify_robust(CPU_PM_ENTER, CPU_PM_ENTER_FAILED); } EXPORT_SYMBOL_GPL(cpu_pm_enter); @@ -109,7 +108,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_enter); */ int cpu_pm_exit(void) { - return cpu_pm_notify(CPU_PM_EXIT, -1, NULL); + return cpu_pm_notify(CPU_PM_EXIT); } EXPORT_SYMBOL_GPL(cpu_pm_exit); @@ -131,18 +130,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit); */ int cpu_cluster_pm_enter(void) { - int nr_calls = 0; - int ret = 0; - - ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls); - if (ret) - /* - * Inform listeners (nr_calls - 1) about failure of CPU cluster - * PM entry who are notified earlier to prepare for it. - */ - cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL); - - return ret; + return cpu_pm_notify_robust(CPU_CLUSTER_PM_ENTER, CPU_CLUSTER_PM_ENTER_FAILED); } EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); @@ -163,7 +151,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); */ int cpu_cluster_pm_exit(void) { - return cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL); + return cpu_pm_notify(CPU_CLUSTER_PM_EXIT); } EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit); diff --git a/kernel/notifier.c b/kernel/notifier.c index 84c987dfbe03..1b019cbca594 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -94,6 +94,34 @@ static int notifier_call_chain(struct notifier_block **nl, } NOKPROBE_SYMBOL(notifier_call_chain); +/** + * notifier_call_chain_robust - Inform the registered notifiers about an event + * and rollback on error. + * @nl: Pointer to head of the blocking notifier chain + * @val_up: Value passed unmodified to the notifier function + * @val_down: Value passed unmodified to the notifier function when recovering + * from an error on @val_up + * @v Pointer passed unmodified to the notifier function + * + * NOTE: It is important the @nl chain doesn't change between the two + * invocations of notifier_call_chain() such that we visit the + * exact same notifier callbacks; this rules out any RCU usage. + * + * Returns: the return value of the @val_up call. + */ +static int notifier_call_chain_robust(struct notifier_block **nl, + unsigned long val_up, unsigned long val_down, + void *v) +{ + int ret, nr = 0; + + ret = notifier_call_chain(nl, val_up, v, -1, &nr); + if (ret & NOTIFY_STOP_MASK) + notifier_call_chain(nl, val_down, v, nr-1, NULL); + + return ret; +} + /* * Atomic notifier chain routines. Registration and unregistration * use a spinlock, and call_chain is synchronized by RCU (no locks). @@ -144,13 +172,30 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, } EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); +int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh, + unsigned long val_up, unsigned long val_down, void *v) +{ + unsigned long flags; + int ret; + + /* + * Musn't use RCU; because then the notifier list can + * change between the up and down traversal. + */ + spin_lock_irqsave(&nh->lock, flags); + ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v); + spin_unlock_irqrestore(&nh->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(atomic_notifier_call_chain_robust); +NOKPROBE_SYMBOL(atomic_notifier_call_chain_robust); + /** - * __atomic_notifier_call_chain - Call functions in an atomic notifier chain + * atomic_notifier_call_chain - Call functions in an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See the comment for notifier_call_chain. - * @nr_calls: See the comment for notifier_call_chain. * * Calls each function in a notifier chain in turn. The functions * run in an atomic context, so they must not block. @@ -163,24 +208,16 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); * Otherwise the return value is the return value * of the last notifier function called. */ -int __atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) +int atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v) { int ret; rcu_read_lock(); - ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); + ret = notifier_call_chain(&nh->head, val, v, -1, NULL); rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); -NOKPROBE_SYMBOL(__atomic_notifier_call_chain); -int atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v) -{ - return __atomic_notifier_call_chain(nh, val, v, -1, NULL); + return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); NOKPROBE_SYMBOL(atomic_notifier_call_chain); @@ -250,13 +287,30 @@ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, } EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); +int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh, + unsigned long val_up, unsigned long val_down, void *v) +{ + int ret = NOTIFY_DONE; + + /* + * We check the head outside the lock, but if this access is + * racy then it does not matter what the result of the test + * is, we re-check the list after having taken the lock anyway: + */ + if (rcu_access_pointer(nh->head)) { + down_read(&nh->rwsem); + ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v); + up_read(&nh->rwsem); + } + return ret; +} +EXPORT_SYMBOL_GPL(blocking_notifier_call_chain_robust); + /** - * __blocking_notifier_call_chain - Call functions in a blocking notifier chain + * blocking_notifier_call_chain - Call functions in a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See comment for notifier_call_chain. - * @nr_calls: See comment for notifier_call_chain. * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. @@ -268,9 +322,8 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); * Otherwise the return value is the return value * of the last notifier function called. */ -int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) +int blocking_notifier_call_chain(struct blocking_notifier_head *nh, + unsigned long val, void *v) { int ret = NOTIFY_DONE; @@ -281,19 +334,11 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, */ if (rcu_access_pointer(nh->head)) { down_read(&nh->rwsem); - ret = notifier_call_chain(&nh->head, val, v, nr_to_call, - nr_calls); + ret = notifier_call_chain(&nh->head, val, v, -1, NULL); up_read(&nh->rwsem); } return ret; } -EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain); - -int blocking_notifier_call_chain(struct blocking_notifier_head *nh, - unsigned long val, void *v) -{ - return __blocking_notifier_call_chain(nh, val, v, -1, NULL); -} EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); /* @@ -335,13 +380,18 @@ int raw_notifier_chain_unregister(struct raw_notifier_head *nh, } EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); +int raw_notifier_call_chain_robust(struct raw_notifier_head *nh, + unsigned long val_up, unsigned long val_down, void *v) +{ + return notifier_call_chain_robust(&nh->head, val_up, val_down, v); +} +EXPORT_SYMBOL_GPL(raw_notifier_call_chain_robust); + /** - * __raw_notifier_call_chain - Call functions in a raw notifier chain + * raw_notifier_call_chain - Call functions in a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See comment for notifier_call_chain. - * @nr_calls: See comment for notifier_call_chain * * Calls each function in a notifier chain in turn. The functions * run in an undefined context. @@ -354,18 +404,10 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); * Otherwise the return value is the return value * of the last notifier function called. */ -int __raw_notifier_call_chain(struct raw_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) -{ - return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); -} -EXPORT_SYMBOL_GPL(__raw_notifier_call_chain); - int raw_notifier_call_chain(struct raw_notifier_head *nh, unsigned long val, void *v) { - return __raw_notifier_call_chain(nh, val, v, -1, NULL); + return notifier_call_chain(&nh->head, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(raw_notifier_call_chain); @@ -437,12 +479,10 @@ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); /** - * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain + * srcu_notifier_call_chain - Call functions in an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See comment for notifier_call_chain. - * @nr_calls: See comment for notifier_call_chain * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. @@ -454,25 +494,17 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); * Otherwise the return value is the return value * of the last notifier function called. */ -int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) +int srcu_notifier_call_chain(struct srcu_notifier_head *nh, + unsigned long val, void *v) { int ret; int idx; idx = srcu_read_lock(&nh->srcu); - ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); + ret = notifier_call_chain(&nh->head, val, v, -1, NULL); srcu_read_unlock(&nh->srcu, idx); return ret; } -EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain); - -int srcu_notifier_call_chain(struct srcu_notifier_head *nh, - unsigned long val, void *v) -{ - return __srcu_notifier_call_chain(nh, val, v, -1, NULL); -} EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); /** diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e7aa57fb2fdc..1dee70815f3c 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -706,8 +706,8 @@ static int load_image_and_restore(void) */ int hibernate(void) { - int error, nr_calls = 0; bool snapshot_test = false; + int error; if (!hibernation_available()) { pm_pr_dbg("Hibernation not available.\n"); @@ -723,11 +723,9 @@ int hibernate(void) pr_info("hibernation entry\n"); pm_prepare_console(); - error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); - if (error) { - nr_calls--; - goto Exit; - } + error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); + if (error) + goto Restore; ksys_sync_helper(); @@ -785,7 +783,8 @@ int hibernate(void) /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; Exit: - __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); + pm_notifier_call_chain(PM_POST_HIBERNATION); + Restore: pm_restore_console(); hibernate_release(); Unlock: @@ -804,7 +803,7 @@ int hibernate(void) */ int hibernate_quiet_exec(int (*func)(void *data), void *data) { - int error, nr_calls = 0; + int error; lock_system_sleep(); @@ -815,11 +814,9 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) pm_prepare_console(); - error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); - if (error) { - nr_calls--; - goto exit; - } + error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); + if (error) + goto restore; error = freeze_processes(); if (error) @@ -880,8 +877,9 @@ thaw: thaw_processes(); exit: - __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); + pm_notifier_call_chain(PM_POST_HIBERNATION); +restore: pm_restore_console(); hibernate_release(); @@ -910,7 +908,7 @@ EXPORT_SYMBOL_GPL(hibernate_quiet_exec); */ static int software_resume(void) { - int error, nr_calls = 0; + int error; /* * If the user said "noresume".. bail out early. @@ -997,11 +995,9 @@ static int software_resume(void) pr_info("resume from hibernation\n"); pm_prepare_console(); - error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); - if (error) { - nr_calls--; - goto Close_Finish; - } + error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE); + if (error) + goto Restore; pm_pr_dbg("Preparing processes for hibernation restore.\n"); error = freeze_processes(); @@ -1017,7 +1013,8 @@ static int software_resume(void) error = load_image_and_restore(); thaw_processes(); Finish: - __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); + pm_notifier_call_chain(PM_POST_RESTORE); + Restore: pm_restore_console(); pr_info("resume failed (%d)\n", error); hibernate_release(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 40f86ec4ab30..0aefd6f57e0a 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -80,18 +80,18 @@ int unregister_pm_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_pm_notifier); -int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls) +int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down) { int ret; - ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL, - nr_to_call, nr_calls); + ret = blocking_notifier_call_chain_robust(&pm_chain_head, val_up, val_down, NULL); return notifier_to_errno(ret); } + int pm_notifier_call_chain(unsigned long val) { - return __pm_notifier_call_chain(val, -1, NULL); + return blocking_notifier_call_chain(&pm_chain_head, val, NULL); } /* If set, devices may be suspended and resumed asynchronously. */ diff --git a/kernel/power/power.h b/kernel/power/power.h index 32fc89ac96c3..24f12d534515 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -210,8 +210,7 @@ static inline void suspend_test_finish(const char *label) {} #ifdef CONFIG_PM_SLEEP /* kernel/power/main.c */ -extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call, - int *nr_calls); +extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down); extern int pm_notifier_call_chain(unsigned long val); #endif diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8b1bb5ee7e5d..32391acc806b 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -342,18 +342,16 @@ static int suspend_test(int level) */ static int suspend_prepare(suspend_state_t state) { - int error, nr_calls = 0; + int error; if (!sleep_state_supported(state)) return -EPERM; pm_prepare_console(); - error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls); - if (error) { - nr_calls--; - goto Finish; - } + error = pm_notifier_call_chain_robust(PM_SUSPEND_PREPARE, PM_POST_SUSPEND); + if (error) + goto Restore; trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); @@ -363,8 +361,8 @@ static int suspend_prepare(suspend_state_t state) suspend_stats.failed_freeze++; dpm_save_failed_step(SUSPEND_FREEZE); - Finish: - __pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL); + pm_notifier_call_chain(PM_POST_SUSPEND); + Restore: pm_restore_console(); return error; } diff --git a/kernel/power/user.c b/kernel/power/user.c index d5eedc2baa2a..047f598f89a5 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -46,7 +46,7 @@ int is_hibernate_resume_dev(const struct inode *bd_inode) static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; - int error, nr_calls = 0; + int error; if (!hibernation_available()) return -EPERM; @@ -73,9 +73,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) swap_type_of(swsusp_resume_device, 0, NULL) : -1; data->mode = O_RDONLY; data->free_bitmaps = false; - error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); - if (error) - __pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL); + error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); } else { /* * Resuming. We may need to wait for the image device to @@ -85,15 +83,11 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->swap = -1; data->mode = O_WRONLY; - error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); + error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE); if (!error) { error = create_basic_memory_bitmaps(); data->free_bitmaps = !error; - } else - nr_calls--; - - if (error) - __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); + } } if (error) hibernate_release(); -- cgit v1.2.3 From 0340a6b7fb767f7f296b9bacc9a215920519a644 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 18 Aug 2020 15:57:37 +0200 Subject: module: Fix up module_notifier return values While auditing all module notifiers I noticed a whole bunch of fail wrt the return value. Notifiers have a 'special' return semantics. As is; NOTIFY_DONE vs NOTIFY_OK is a bit vague; but notifier_from_errno(0) results in NOTIFY_OK and NOTIFY_DONE has a comment that says "Don't care". From this I've used NOTIFY_DONE when the function completely ignores the callback and notifier_to_error() isn't used. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Reviewed-by: Joel Fernandes (Google) Reviewed-by: Robert Richter Acked-by: Steven Rostedt (VMware) Link: https://lore.kernel.org/r/20200818135804.385360407@infradead.org --- kernel/trace/bpf_trace.c | 8 ++++++-- kernel/trace/trace.c | 2 +- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_printk.c | 4 ++-- kernel/tracepoint.c | 2 +- 5 files changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a8d4f253ed77..2ecf7892a31b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2027,10 +2027,11 @@ static int bpf_event_notify(struct notifier_block *nb, unsigned long op, { struct bpf_trace_module *btm, *tmp; struct module *mod = module; + int ret = 0; if (mod->num_bpf_raw_events == 0 || (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) - return 0; + goto out; mutex_lock(&bpf_module_mutex); @@ -2040,6 +2041,8 @@ static int bpf_event_notify(struct notifier_block *nb, unsigned long op, if (btm) { btm->module = module; list_add(&btm->list, &bpf_trace_modules); + } else { + ret = -ENOMEM; } break; case MODULE_STATE_GOING: @@ -2055,7 +2058,8 @@ static int bpf_event_notify(struct notifier_block *nb, unsigned long op, mutex_unlock(&bpf_module_mutex); - return 0; +out: + return notifier_from_errno(ret); } static struct notifier_block bpf_module_nb = { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f40d850ebabc..df499922b6a4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9072,7 +9072,7 @@ static int trace_module_notify(struct notifier_block *self, break; } - return 0; + return NOTIFY_OK; } static struct notifier_block trace_module_nb = { diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a85effb2373b..beebf2cd364b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2646,7 +2646,7 @@ static int trace_module_notify(struct notifier_block *self, mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); - return 0; + return NOTIFY_OK; } static struct notifier_block trace_module_nb = { diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index d4e31e969206..bb7783b90361 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -96,7 +96,7 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self, if (val == MODULE_STATE_COMING) hold_module_trace_bprintk_format(start, end); } - return 0; + return NOTIFY_OK; } /* @@ -174,7 +174,7 @@ __init static int module_trace_bprintk_format_notify(struct notifier_block *self, unsigned long val, void *data) { - return 0; + return NOTIFY_OK; } static inline const char ** find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 73956eaff8a9..8e05ed2cd39e 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -521,7 +521,7 @@ static int tracepoint_module_notify(struct notifier_block *self, case MODULE_STATE_UNFORMED: break; } - return ret; + return notifier_from_errno(ret); } static struct notifier_block tracepoint_module_nb = { -- cgit v1.2.3 From 59cc8e0a906ea23190922e5e0252e5b5a60d70c2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 18 Aug 2020 15:57:38 +0200 Subject: module: Properly propagate MODULE_STATE_COMING failure Now that notifiers got unbroken; use the proper interface to handle notifier errors and propagate them. There were already MODULE_STATE_COMING notifiers that failed; notably: - jump_label_module_notifier() - tracepoint_module_notify() - bpf_event_notify() By propagating this error, we fix those users. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Miroslav Benes Acked-by: Jessica Yu Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20200818135804.444372853@infradead.org --- kernel/module.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 1c5cff34d9f2..3c465cf31d08 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3792,9 +3792,13 @@ static int prepare_coming_module(struct module *mod) if (err) return err; - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_COMING, mod); - return 0; + err = blocking_notifier_call_chain_robust(&module_notify_list, + MODULE_STATE_COMING, MODULE_STATE_GOING, mod); + err = notifier_to_errno(err); + if (err) + klp_module_going(mod); + + return err; } static int unknown_module_param_cb(char *param, char *val, const char *modname, -- cgit v1.2.3 From 0db6e3734b130207026df1e78455fa98ca1d6f50 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 18 Aug 2020 15:57:39 +0200 Subject: jump_label,module: Fix module lifetime for __jump_label_mod_text_reserved() Nothing ensures the module exists while we're iterating mod->jump_entries in __jump_label_mod_text_reserved(), take a module reference to ensure the module sticks around. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Steven Rostedt (VMware) Link: https://lore.kernel.org/r/20200818135804.504501338@infradead.org --- kernel/jump_label.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index cdb3ffab128b..e661c61b3d6b 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -539,19 +539,25 @@ static void static_key_set_mod(struct static_key *key, static int __jump_label_mod_text_reserved(void *start, void *end) { struct module *mod; + int ret; preempt_disable(); mod = __module_text_address((unsigned long)start); WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + if (!try_module_get(mod)) + mod = NULL; preempt_enable(); if (!mod) return 0; - - return __jump_label_text_reserved(mod->jump_entries, + ret = __jump_label_text_reserved(mod->jump_entries, mod->jump_entries + mod->num_jump_entries, start, end); + + module_put(mod); + + return ret; } static void __jump_label_mod_update(struct static_key *key) -- cgit v1.2.3 From 9183c3f9ed710a8edf1a61e8a96d497258d26e08 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 18 Aug 2020 15:57:42 +0200 Subject: static_call: Add inline static call infrastructure Add infrastructure for an arch-specific CONFIG_HAVE_STATIC_CALL_INLINE option, which is a faster version of CONFIG_HAVE_STATIC_CALL. At runtime, the static call sites are patched directly, rather than using the out-of-line trampolines. Compared to out-of-line static calls, the performance benefits are more modest, but still measurable. Steven Rostedt did some tracepoint measurements: https://lkml.kernel.org/r/20181126155405.72b4f718@gandalf.local.home This code is heavily inspired by the jump label code (aka "static jumps"), as some of the concepts are very similar. For more details, see the comments in include/linux/static_call.h. [peterz: simplified interface; merged trampolines] Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Steven Rostedt (VMware) Cc: Linus Torvalds Link: https://lore.kernel.org/r/20200818135804.684334440@infradead.org --- kernel/Makefile | 1 + kernel/module.c | 5 + kernel/static_call.c | 303 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 309 insertions(+) create mode 100644 kernel/static_call.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 9a20016d4900..b74820d8b264 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -111,6 +111,7 @@ obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_BPF) += bpf/ obj-$(CONFIG_KCSAN) += kcsan/ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o +obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/module.c b/kernel/module.c index 3c465cf31d08..c075a18103fb 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3274,6 +3274,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) mod->kprobe_blacklist = section_objs(info, "_kprobe_blacklist", sizeof(unsigned long), &mod->num_kprobe_blacklist); +#endif +#ifdef CONFIG_HAVE_STATIC_CALL_INLINE + mod->static_call_sites = section_objs(info, ".static_call_sites", + sizeof(*mod->static_call_sites), + &mod->num_static_call_sites); #endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); diff --git a/kernel/static_call.c b/kernel/static_call.c new file mode 100644 index 000000000000..d24349244675 --- /dev/null +++ b/kernel/static_call.c @@ -0,0 +1,303 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern struct static_call_site __start_static_call_sites[], + __stop_static_call_sites[]; + +static bool static_call_initialized; + +#define STATIC_CALL_INIT 1UL + +/* mutex to protect key modules/sites */ +static DEFINE_MUTEX(static_call_mutex); + +static void static_call_lock(void) +{ + mutex_lock(&static_call_mutex); +} + +static void static_call_unlock(void) +{ + mutex_unlock(&static_call_mutex); +} + +static inline void *static_call_addr(struct static_call_site *site) +{ + return (void *)((long)site->addr + (long)&site->addr); +} + + +static inline struct static_call_key *static_call_key(const struct static_call_site *site) +{ + return (struct static_call_key *) + (((long)site->key + (long)&site->key) & ~STATIC_CALL_INIT); +} + +/* These assume the key is word-aligned. */ +static inline bool static_call_is_init(struct static_call_site *site) +{ + return ((long)site->key + (long)&site->key) & STATIC_CALL_INIT; +} + +static inline void static_call_set_init(struct static_call_site *site) +{ + site->key = ((long)static_call_key(site) | STATIC_CALL_INIT) - + (long)&site->key; +} + +static int static_call_site_cmp(const void *_a, const void *_b) +{ + const struct static_call_site *a = _a; + const struct static_call_site *b = _b; + const struct static_call_key *key_a = static_call_key(a); + const struct static_call_key *key_b = static_call_key(b); + + if (key_a < key_b) + return -1; + + if (key_a > key_b) + return 1; + + return 0; +} + +static void static_call_site_swap(void *_a, void *_b, int size) +{ + long delta = (unsigned long)_a - (unsigned long)_b; + struct static_call_site *a = _a; + struct static_call_site *b = _b; + struct static_call_site tmp = *a; + + a->addr = b->addr - delta; + a->key = b->key - delta; + + b->addr = tmp.addr + delta; + b->key = tmp.key + delta; +} + +static inline void static_call_sort_entries(struct static_call_site *start, + struct static_call_site *stop) +{ + sort(start, stop - start, sizeof(struct static_call_site), + static_call_site_cmp, static_call_site_swap); +} + +void __static_call_update(struct static_call_key *key, void *tramp, void *func) +{ + struct static_call_site *site, *stop; + struct static_call_mod *site_mod; + + cpus_read_lock(); + static_call_lock(); + + if (key->func == func) + goto done; + + key->func = func; + + arch_static_call_transform(NULL, tramp, func); + + /* + * If uninitialized, we'll not update the callsites, but they still + * point to the trampoline and we just patched that. + */ + if (WARN_ON_ONCE(!static_call_initialized)) + goto done; + + for (site_mod = key->mods; site_mod; site_mod = site_mod->next) { + struct module *mod = site_mod->mod; + + if (!site_mod->sites) { + /* + * This can happen if the static call key is defined in + * a module which doesn't use it. + */ + continue; + } + + stop = __stop_static_call_sites; + +#ifdef CONFIG_MODULES + if (mod) { + stop = mod->static_call_sites + + mod->num_static_call_sites; + } +#endif + + for (site = site_mod->sites; + site < stop && static_call_key(site) == key; site++) { + void *site_addr = static_call_addr(site); + + if (static_call_is_init(site)) { + /* + * Don't write to call sites which were in + * initmem and have since been freed. + */ + if (!mod && system_state >= SYSTEM_RUNNING) + continue; + if (mod && !within_module_init((unsigned long)site_addr, mod)) + continue; + } + + if (!kernel_text_address((unsigned long)site_addr)) { + WARN_ONCE(1, "can't patch static call site at %pS", + site_addr); + continue; + } + + arch_static_call_transform(site_addr, NULL, func); + } + } + +done: + static_call_unlock(); + cpus_read_unlock(); +} +EXPORT_SYMBOL_GPL(__static_call_update); + +static int __static_call_init(struct module *mod, + struct static_call_site *start, + struct static_call_site *stop) +{ + struct static_call_site *site; + struct static_call_key *key, *prev_key = NULL; + struct static_call_mod *site_mod; + + if (start == stop) + return 0; + + static_call_sort_entries(start, stop); + + for (site = start; site < stop; site++) { + void *site_addr = static_call_addr(site); + + if ((mod && within_module_init((unsigned long)site_addr, mod)) || + (!mod && init_section_contains(site_addr, 1))) + static_call_set_init(site); + + key = static_call_key(site); + if (key != prev_key) { + prev_key = key; + + site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); + if (!site_mod) + return -ENOMEM; + + site_mod->mod = mod; + site_mod->sites = site; + site_mod->next = key->mods; + key->mods = site_mod; + } + + arch_static_call_transform(site_addr, NULL, key->func); + } + + return 0; +} + +#ifdef CONFIG_MODULES + +static int static_call_add_module(struct module *mod) +{ + return __static_call_init(mod, mod->static_call_sites, + mod->static_call_sites + mod->num_static_call_sites); +} + +static void static_call_del_module(struct module *mod) +{ + struct static_call_site *start = mod->static_call_sites; + struct static_call_site *stop = mod->static_call_sites + + mod->num_static_call_sites; + struct static_call_key *key, *prev_key = NULL; + struct static_call_mod *site_mod, **prev; + struct static_call_site *site; + + for (site = start; site < stop; site++) { + key = static_call_key(site); + if (key == prev_key) + continue; + + prev_key = key; + + for (prev = &key->mods, site_mod = key->mods; + site_mod && site_mod->mod != mod; + prev = &site_mod->next, site_mod = site_mod->next) + ; + + if (!site_mod) + continue; + + *prev = site_mod->next; + kfree(site_mod); + } +} + +static int static_call_module_notify(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct module *mod = data; + int ret = 0; + + cpus_read_lock(); + static_call_lock(); + + switch (val) { + case MODULE_STATE_COMING: + ret = static_call_add_module(mod); + if (ret) { + WARN(1, "Failed to allocate memory for static calls"); + static_call_del_module(mod); + } + break; + case MODULE_STATE_GOING: + static_call_del_module(mod); + break; + } + + static_call_unlock(); + cpus_read_unlock(); + + return notifier_from_errno(ret); +} + +static struct notifier_block static_call_module_nb = { + .notifier_call = static_call_module_notify, +}; + +#endif /* CONFIG_MODULES */ + +static void __init static_call_init(void) +{ + int ret; + + if (static_call_initialized) + return; + + cpus_read_lock(); + static_call_lock(); + ret = __static_call_init(NULL, __start_static_call_sites, + __stop_static_call_sites); + static_call_unlock(); + cpus_read_unlock(); + + if (ret) { + pr_err("Failed to allocate memory for static_call!\n"); + BUG(); + } + + static_call_initialized = true; + +#ifdef CONFIG_MODULES + register_module_notifier(&static_call_module_nb); +#endif +} +early_initcall(static_call_init); -- cgit v1.2.3 From 6333e8f73b834f54e395a056e6002403f0862c51 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 18 Aug 2020 15:57:43 +0200 Subject: static_call: Avoid kprobes on inline static_call()s Similar to how we disallow kprobes on any other dynamic text (ftrace/jump_label) also disallow kprobes on inline static_call()s. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200818135804.744920586@infradead.org --- kernel/kprobes.c | 2 ++ kernel/static_call.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 287b263c9cb9..67e6a8c18007 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -1634,6 +1635,7 @@ static int check_kprobe_address_safe(struct kprobe *p, if (!kernel_text_address((unsigned long) p->addr) || within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr) || + static_call_text_reserved(p->addr, p->addr) || find_bug((unsigned long)p->addr)) { ret = -EINVAL; goto out; diff --git a/kernel/static_call.c b/kernel/static_call.c index d24349244675..753b2f1b4fb8 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -204,8 +204,58 @@ static int __static_call_init(struct module *mod, return 0; } +static int addr_conflict(struct static_call_site *site, void *start, void *end) +{ + unsigned long addr = (unsigned long)static_call_addr(site); + + if (addr <= (unsigned long)end && + addr + CALL_INSN_SIZE > (unsigned long)start) + return 1; + + return 0; +} + +static int __static_call_text_reserved(struct static_call_site *iter_start, + struct static_call_site *iter_stop, + void *start, void *end) +{ + struct static_call_site *iter = iter_start; + + while (iter < iter_stop) { + if (addr_conflict(iter, start, end)) + return 1; + iter++; + } + + return 0; +} + #ifdef CONFIG_MODULES +static int __static_call_mod_text_reserved(void *start, void *end) +{ + struct module *mod; + int ret; + + preempt_disable(); + mod = __module_text_address((unsigned long)start); + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + if (!try_module_get(mod)) + mod = NULL; + preempt_enable(); + + if (!mod) + return 0; + + ret = __static_call_text_reserved(mod->static_call_sites, + mod->static_call_sites + mod->num_static_call_sites, + start, end); + + module_put(mod); + + return ret; +} + static int static_call_add_module(struct module *mod) { return __static_call_init(mod, mod->static_call_sites, @@ -273,8 +323,26 @@ static struct notifier_block static_call_module_nb = { .notifier_call = static_call_module_notify, }; +#else + +static inline int __static_call_mod_text_reserved(void *start, void *end) +{ + return 0; +} + #endif /* CONFIG_MODULES */ +int static_call_text_reserved(void *start, void *end) +{ + int ret = __static_call_text_reserved(__start_static_call_sites, + __stop_static_call_sites, start, end); + + if (ret) + return ret; + + return __static_call_mod_text_reserved(start, end); +} + static void __init static_call_init(void) { int ret; -- cgit v1.2.3 From f03c412915f5f69f2d17bcd20ecdd69320bcbf7b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 18 Aug 2020 15:57:46 +0200 Subject: static_call: Add simple self-test for static calls Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200818135804.922581202@infradead.org --- kernel/static_call.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/static_call.c b/kernel/static_call.c index 753b2f1b4fb8..97142cb6bfa6 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -369,3 +369,46 @@ static void __init static_call_init(void) #endif } early_initcall(static_call_init); + +#ifdef CONFIG_STATIC_CALL_SELFTEST + +static int func_a(int x) +{ + return x+1; +} + +static int func_b(int x) +{ + return x+2; +} + +DEFINE_STATIC_CALL(sc_selftest, func_a); + +static struct static_call_data { + int (*func)(int); + int val; + int expect; +} static_call_data [] __initdata = { + { NULL, 2, 3 }, + { func_b, 2, 4 }, + { func_a, 2, 3 } +}; + +static int __init test_static_call_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(static_call_data); i++ ) { + struct static_call_data *scd = &static_call_data[i]; + + if (scd->func) + static_call_update(sc_selftest, scd->func); + + WARN_ON(static_call(sc_selftest)(scd->val) != scd->expect); + } + + return 0; +} +early_initcall(test_static_call_init); + +#endif /* CONFIG_STATIC_CALL_SELFTEST */ -- cgit v1.2.3 From 5b06fd3bb9cdce4f3e731c48eb5b74c4acc47997 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 18 Aug 2020 15:57:49 +0200 Subject: static_call: Handle tail-calls GCC can turn our static_call(name)(args...) into a tail call, in which case we get a JMP.d32 into the trampoline (which then does a further tail-call). Teach objtool to recognise and mark these in .static_call_sites and adjust the code patching to deal with this. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20200818135805.101186767@infradead.org --- kernel/static_call.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/static_call.c b/kernel/static_call.c index 97142cb6bfa6..d98e0e4272c1 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -15,8 +15,6 @@ extern struct static_call_site __start_static_call_sites[], static bool static_call_initialized; -#define STATIC_CALL_INIT 1UL - /* mutex to protect key modules/sites */ static DEFINE_MUTEX(static_call_mutex); @@ -39,18 +37,23 @@ static inline void *static_call_addr(struct static_call_site *site) static inline struct static_call_key *static_call_key(const struct static_call_site *site) { return (struct static_call_key *) - (((long)site->key + (long)&site->key) & ~STATIC_CALL_INIT); + (((long)site->key + (long)&site->key) & ~STATIC_CALL_SITE_FLAGS); } /* These assume the key is word-aligned. */ static inline bool static_call_is_init(struct static_call_site *site) { - return ((long)site->key + (long)&site->key) & STATIC_CALL_INIT; + return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_INIT; +} + +static inline bool static_call_is_tail(struct static_call_site *site) +{ + return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_TAIL; } static inline void static_call_set_init(struct static_call_site *site) { - site->key = ((long)static_call_key(site) | STATIC_CALL_INIT) - + site->key = ((long)static_call_key(site) | STATIC_CALL_SITE_INIT) - (long)&site->key; } @@ -104,7 +107,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func) key->func = func; - arch_static_call_transform(NULL, tramp, func); + arch_static_call_transform(NULL, tramp, func, false); /* * If uninitialized, we'll not update the callsites, but they still @@ -154,7 +157,8 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func) continue; } - arch_static_call_transform(site_addr, NULL, func); + arch_static_call_transform(site_addr, NULL, func, + static_call_is_tail(site)); } } @@ -198,7 +202,8 @@ static int __static_call_init(struct module *mod, key->mods = site_mod; } - arch_static_call_transform(site_addr, NULL, key->func); + arch_static_call_transform(site_addr, NULL, key->func, + static_call_is_tail(site)); } return 0; -- cgit v1.2.3 From a945c8345ec0decb2f1a7f19a8c5e60bcb1dd1eb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 18 Aug 2020 15:57:51 +0200 Subject: static_call: Allow early init In order to use static_call() to wire up x86_pmu, we need to initialize earlier, specifically before memory allocation works; copy some of the tricks from jump_label to enable this. Primarily we overload key->next to store a sites pointer when there are no modules, this avoids having to use kmalloc() to initialize the sites and allows us to run much earlier. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Steven Rostedt (VMware) Link: https://lore.kernel.org/r/20200818135805.220737930@infradead.org --- kernel/static_call.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 66 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/static_call.c b/kernel/static_call.c index d98e0e4272c1..f8362b3f8fd5 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -94,10 +94,31 @@ static inline void static_call_sort_entries(struct static_call_site *start, static_call_site_cmp, static_call_site_swap); } +static inline bool static_call_key_has_mods(struct static_call_key *key) +{ + return !(key->type & 1); +} + +static inline struct static_call_mod *static_call_key_next(struct static_call_key *key) +{ + if (!static_call_key_has_mods(key)) + return NULL; + + return key->mods; +} + +static inline struct static_call_site *static_call_key_sites(struct static_call_key *key) +{ + if (static_call_key_has_mods(key)) + return NULL; + + return (struct static_call_site *)(key->type & ~1); +} + void __static_call_update(struct static_call_key *key, void *tramp, void *func) { struct static_call_site *site, *stop; - struct static_call_mod *site_mod; + struct static_call_mod *site_mod, first; cpus_read_lock(); static_call_lock(); @@ -116,13 +137,22 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func) if (WARN_ON_ONCE(!static_call_initialized)) goto done; - for (site_mod = key->mods; site_mod; site_mod = site_mod->next) { + first = (struct static_call_mod){ + .next = static_call_key_next(key), + .mod = NULL, + .sites = static_call_key_sites(key), + }; + + for (site_mod = &first; site_mod; site_mod = site_mod->next) { struct module *mod = site_mod->mod; if (!site_mod->sites) { /* * This can happen if the static call key is defined in * a module which doesn't use it. + * + * It also happens in the has_mods case, where the + * 'first' entry has no sites associated with it. */ continue; } @@ -192,16 +222,48 @@ static int __static_call_init(struct module *mod, if (key != prev_key) { prev_key = key; + /* + * For vmlinux (!mod) avoid the allocation by storing + * the sites pointer in the key itself. Also see + * __static_call_update()'s @first. + * + * This allows architectures (eg. x86) to call + * static_call_init() before memory allocation works. + */ + if (!mod) { + key->sites = site; + key->type |= 1; + goto do_transform; + } + site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); if (!site_mod) return -ENOMEM; + /* + * When the key has a direct sites pointer, extract + * that into an explicit struct static_call_mod, so we + * can have a list of modules. + */ + if (static_call_key_sites(key)) { + site_mod->mod = NULL; + site_mod->next = NULL; + site_mod->sites = static_call_key_sites(key); + + key->mods = site_mod; + + site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); + if (!site_mod) + return -ENOMEM; + } + site_mod->mod = mod; site_mod->sites = site; - site_mod->next = key->mods; + site_mod->next = static_call_key_next(key); key->mods = site_mod; } +do_transform: arch_static_call_transform(site_addr, NULL, key->func, static_call_is_tail(site)); } @@ -348,7 +410,7 @@ int static_call_text_reserved(void *start, void *end) return __static_call_mod_text_reserved(start, end); } -static void __init static_call_init(void) +void __init static_call_init(void) { int ret; -- cgit v1.2.3 From d25e37d89dd2f41d7acae0429039d2f0ae8b4a07 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 18 Aug 2020 15:57:52 +0200 Subject: tracepoint: Optimize using static_call() Currently the tracepoint site will iterate a vector and issue indirect calls to however many handlers are registered (ie. the vector is long). Using static_call() it is possible to optimize this for the common case of only having a single handler registered. In this case the static_call() can directly call this handler. Otherwise, if the vector is longer than 1, call a function that iterates the whole vector like the current code. [peterz: updated to new interface] Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20200818135805.279421092@infradead.org --- kernel/tracepoint.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 8e05ed2cd39e..e92f3fb8887a 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -221,6 +221,20 @@ static void *func_remove(struct tracepoint_func **funcs, return old; } +static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func *tp_funcs) +{ + void *func = tp->iterator; + + /* Synthetic events do not have static call sites */ + if (!tp->static_call_key) + return; + + if (!tp_funcs[1].func) + func = tp_funcs[0].func; + + __static_call_update(tp->static_call_key, tp->static_call_tramp, func); +} + /* * Add the probe function to a tracepoint. */ @@ -251,8 +265,9 @@ static int tracepoint_add_func(struct tracepoint *tp, * include/linux/tracepoint.h using rcu_dereference_sched(). */ rcu_assign_pointer(tp->funcs, tp_funcs); - if (!static_key_enabled(&tp->key)) - static_key_slow_inc(&tp->key); + tracepoint_update_call(tp, tp_funcs); + static_key_enable(&tp->key); + release_probes(old); return 0; } @@ -281,9 +296,11 @@ static int tracepoint_remove_func(struct tracepoint *tp, if (tp->unregfunc && static_key_enabled(&tp->key)) tp->unregfunc(); - if (static_key_enabled(&tp->key)) - static_key_slow_dec(&tp->key); + static_key_disable(&tp->key); + } else { + tracepoint_update_call(tp, tp_funcs); } + rcu_assign_pointer(tp->funcs, tp_funcs); release_probes(old); return 0; -- cgit v1.2.3 From ebc4ecd48ca6552b223047839f66e9a9c09aea4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 1 Sep 2020 10:39:28 +0200 Subject: bpf: {cpu,dev}map: Change various functions return type from int to void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The functions bq_enqueue(), bq_flush_to_queue(), and bq_xmit_all() in {cpu,dev}map.c always return zero. Changing the return type from int to void makes the code easier to follow. Suggested-by: David Ahern Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200901083928.6199-1-bjorn.topel@gmail.com --- kernel/bpf/cpumap.c | 11 +++-------- kernel/bpf/devmap.c | 15 +++++++-------- 2 files changed, 10 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 8d2a8623d2a7..cf548fc88780 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -79,8 +79,6 @@ struct bpf_cpu_map { static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list); -static int bq_flush_to_queue(struct xdp_bulk_queue *bq); - static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; @@ -670,7 +668,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_btf_id = &cpu_map_btf_id, }; -static int bq_flush_to_queue(struct xdp_bulk_queue *bq) +static void bq_flush_to_queue(struct xdp_bulk_queue *bq) { struct bpf_cpu_map_entry *rcpu = bq->obj; unsigned int processed = 0, drops = 0; @@ -679,7 +677,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq) int i; if (unlikely(!bq->count)) - return 0; + return; q = rcpu->queue; spin_lock(&q->producer_lock); @@ -702,13 +700,12 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq) /* Feedback loop via tracepoints */ trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); - return 0; } /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) +static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); @@ -729,8 +726,6 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) if (!bq->flush_node.prev) list_add(&bq->flush_node, flush_list); - - return 0; } int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a42052b85c35..2b5ca93c17de 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -341,14 +341,14 @@ bool dev_map_can_have_prog(struct bpf_map *map) return false; } -static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) +static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; int sent = 0, drops = 0, err = 0; int i; if (unlikely(!bq->count)) - return 0; + return; for (i = 0; i < bq->count; i++) { struct xdp_frame *xdpf = bq->q[i]; @@ -369,7 +369,7 @@ out: trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err); bq->dev_rx = NULL; __list_del_clearprev(&bq->flush_node); - return 0; + return; error: /* If ndo_xdp_xmit fails with an errno, no frames have been * xmit'ed and it's our responsibility to them free all. @@ -421,8 +421,8 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, - struct net_device *dev_rx) +static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, + struct net_device *dev_rx) { struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); @@ -441,8 +441,6 @@ static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, if (!bq->flush_node.prev) list_add(&bq->flush_node, flush_list); - - return 0; } static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, @@ -462,7 +460,8 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, if (unlikely(!xdpf)) return -EOVERFLOW; - return bq_enqueue(dev, xdpf, dev_rx); + bq_enqueue(dev, xdpf, dev_rx); + return 0; } static struct xdp_buff *dev_map_run_prog(struct net_device *dev, -- cgit v1.2.3 From 14721add58ef267344bee254bc276c9139b7b665 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 2 Sep 2020 14:46:19 +0800 Subject: module: Add more error message for failed kernel module loading When kernel module loading failed, user space only get one of the following error messages: - ENOEXEC This is the most confusing one. From corrupted ELF header to bad WRITE|EXEC flags check introduced by in module_enforce_rwx_sections() all returns this error number. - EPERM This is for blacklisted modules. But mod doesn't do extra explain on this error either. - ENOMEM The only error which needs no explain. This means, if a user got "Exec format error" from modprobe, it provides no meaningful way for the user to debug, and will take extra time communicating to get extra info. So this patch will add extra error messages for -ENOEXEC and -EPERM errors, allowing user to do better debugging and reporting. Reviewed-by: Lucas De Marchi Signed-off-by: Qu Wenruo Signed-off-by: Jessica Yu --- kernel/module.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 1c5cff34d9f2..2c00059ac1c9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2096,8 +2096,11 @@ static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, int i; for (i = 0; i < hdr->e_shnum; i++) { - if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) + if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) { + pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n", + mod->name, secstrings + sechdrs[i].sh_name, i); return -ENOEXEC; + } } return 0; @@ -3825,8 +3828,10 @@ static int load_module(struct load_info *info, const char __user *uargs, char *after_dashes; err = elf_header_check(info); - if (err) + if (err) { + pr_err("Module has invalid ELF header\n"); goto free_copy; + } err = setup_load_info(info, flags); if (err) @@ -3834,6 +3839,7 @@ static int load_module(struct load_info *info, const char __user *uargs, if (blacklisted(info->name)) { err = -EPERM; + pr_err("Module %s is blacklisted\n", info->name); goto free_copy; } -- cgit v1.2.3 From 203d7b054fc719561fe258e46e280930890dceaf Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 1 Sep 2020 19:31:12 -0700 Subject: bpf: Avoid iterating duplicated files for task_file iterator Currently, task_file iterator iterates all files from all tasks. This may potentially visit a lot of duplicated files if there are many tasks sharing the same files, e.g., typical pthreads where these pthreads and the main thread are sharing the same files. This patch changed task_file iterator to skip a particular task if that task shares the same files as its group_leader (the task having the same tgid and also task->tgid == task->pid). This will preserve the same result, visiting all files from all tasks, and will reduce runtime cost significantl, e.g., if there are a lot of pthreads and the process has a lot of open files. Suggested-by: Andrii Nakryiko Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Reviewed-by: Josef Bacik Link: https://lore.kernel.org/bpf/20200902023112.1672792-1-yhs@fb.com --- kernel/bpf/task_iter.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 99af4cea1102..5b6af30bfbcd 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -22,7 +22,8 @@ struct bpf_iter_seq_task_info { }; static struct task_struct *task_seq_get_next(struct pid_namespace *ns, - u32 *tid) + u32 *tid, + bool skip_if_dup_files) { struct task_struct *task = NULL; struct pid *pid; @@ -36,6 +37,12 @@ retry: if (!task) { ++*tid; goto retry; + } else if (skip_if_dup_files && task->tgid != task->pid && + task->files == task->group_leader->files) { + put_task_struct(task); + task = NULL; + ++*tid; + goto retry; } } rcu_read_unlock(); @@ -48,7 +55,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) struct bpf_iter_seq_task_info *info = seq->private; struct task_struct *task; - task = task_seq_get_next(info->common.ns, &info->tid); + task = task_seq_get_next(info->common.ns, &info->tid, false); if (!task) return NULL; @@ -65,7 +72,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; ++info->tid; put_task_struct((struct task_struct *)v); - task = task_seq_get_next(info->common.ns, &info->tid); + task = task_seq_get_next(info->common.ns, &info->tid, false); if (!task) return NULL; @@ -148,7 +155,7 @@ again: curr_files = *fstruct; curr_fd = info->fd; } else { - curr_task = task_seq_get_next(ns, &curr_tid); + curr_task = task_seq_get_next(ns, &curr_tid, true); if (!curr_task) return NULL; -- cgit v1.2.3 From 23870f1227680d2aacff6f79c3ab2222bd04e86e Mon Sep 17 00:00:00 2001 From: "peterz@infradead.org" Date: Wed, 2 Sep 2020 18:03:23 +0200 Subject: locking/lockdep: Fix "USED" <- "IN-NMI" inversions During the LPC RCU BoF Paul asked how come the "USED" <- "IN-NMI" detector doesn't trip over rcu_read_lock()'s lockdep annotation. Looking into this I found a very embarrasing typo in verify_lock_unused(): - if (!(class->usage_mask & LOCK_USED)) + if (!(class->usage_mask & LOCKF_USED)) fixing that will indeed cause rcu_read_lock() to insta-splat :/ The above typo means that instead of testing for: 0x100 (1 << LOCK_USED), we test for 8 (LOCK_USED), which corresponds to (1 << LOCK_ENABLED_HARDIRQ). So instead of testing for _any_ used lock, it will only match any lock used with interrupts enabled. The rcu_read_lock() annotation uses .check=0, which means it will not set any of the interrupt bits and will thus never match. In order to properly fix the situation and allow rcu_read_lock() to correctly work, split LOCK_USED into LOCK_USED and LOCK_USED_READ and by having .read users set USED_READ and test USED, pure read-recursive locks are permitted. Fixes: f6f48e180404 ("lockdep: Teach lockdep about "USED" <- "IN-NMI" inversions") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Masami Hiramatsu Acked-by: Paul E. McKenney Link: https://lore.kernel.org/r/20200902160323.GK1362448@hirez.programming.kicks-ass.net --- kernel/locking/lockdep.c | 35 +++++++++++++++++++++++++++++------ kernel/locking/lockdep_internals.h | 2 ++ 2 files changed, 31 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 54b74fabf40c..2facbbd146ec 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3969,13 +3969,18 @@ static int separate_irq_context(struct task_struct *curr, static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { - unsigned int new_mask = 1 << new_bit, ret = 1; + unsigned int old_mask, new_mask, ret = 1; if (new_bit >= LOCK_USAGE_STATES) { DEBUG_LOCKS_WARN_ON(1); return 0; } + if (new_bit == LOCK_USED && this->read) + new_bit = LOCK_USED_READ; + + new_mask = 1 << new_bit; + /* * If already set then do not dirty the cacheline, * nor do any checks: @@ -3988,13 +3993,22 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, /* * Make sure we didn't race: */ - if (unlikely(hlock_class(this)->usage_mask & new_mask)) { - graph_unlock(); - return 1; - } + if (unlikely(hlock_class(this)->usage_mask & new_mask)) + goto unlock; + old_mask = hlock_class(this)->usage_mask; hlock_class(this)->usage_mask |= new_mask; + /* + * Save one usage_traces[] entry and map both LOCK_USED and + * LOCK_USED_READ onto the same entry. + */ + if (new_bit == LOCK_USED || new_bit == LOCK_USED_READ) { + if (old_mask & (LOCKF_USED | LOCKF_USED_READ)) + goto unlock; + new_bit = LOCK_USED; + } + if (!(hlock_class(this)->usage_traces[new_bit] = save_trace())) return 0; @@ -4008,6 +4022,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return 0; } +unlock: graph_unlock(); /* @@ -4942,12 +4957,20 @@ static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock { #ifdef CONFIG_PROVE_LOCKING struct lock_class *class = look_up_lock_class(lock, subclass); + unsigned long mask = LOCKF_USED; /* if it doesn't have a class (yet), it certainly hasn't been used yet */ if (!class) return; - if (!(class->usage_mask & LOCK_USED)) + /* + * READ locks only conflict with USED, such that if we only ever use + * READ locks, there is no deadlock possible -- RCU. + */ + if (!hlock->read) + mask |= LOCKF_USED_READ; + + if (!(class->usage_mask & mask)) return; hlock->class_idx = class - lock_classes; diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index baca699b94e9..b0be1560ed17 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -19,6 +19,7 @@ enum lock_usage_bit { #include "lockdep_states.h" #undef LOCKDEP_STATE LOCK_USED, + LOCK_USED_READ, LOCK_USAGE_STATES }; @@ -40,6 +41,7 @@ enum { #include "lockdep_states.h" #undef LOCKDEP_STATE __LOCKF(USED) + __LOCKF(USED_READ) }; #define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE | -- cgit v1.2.3 From 53922270d21de707a1a0ffaf1e07644e77fcb8db Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 18 Jun 2020 16:29:49 -0400 Subject: rcu/segcblist: Prevent useless GP start if no CBs to accelerate The rcu_segcblist_accelerate() function returns true iff it is necessary to request another grace period. A tracing session showed that this function unnecessarily requests grace periods. For example, consider the following sequence of events: 1. Callbacks are queued only on the NEXT segment of CPU A's callback list. 2. CPU A runs RCU_SOFTIRQ, accelerating these callbacks from NEXT to WAIT. 3. Thus rcu_segcblist_accelerate() returns true, requesting grace period N. 4. RCU's grace-period kthread wakes up on CPU B and starts grace period N. 4. CPU A notices the new grace period and invokes RCU_SOFTIRQ. 5. CPU A's RCU_SOFTIRQ again invokes rcu_segcblist_accelerate(), but there are no new callbacks. However, rcu_segcblist_accelerate() nevertheless (uselessly) requests a new grace period N+1. This extra grace period results in additional lock contention and also additional wakeups, all for no good reason. This commit therefore adds a check to rcu_segcblist_accelerate() that prevents the return of true when there are no new callbacks. This change reduces the number of grace periods (GPs) and wakeups in each of eleven five-second rcutorture runs as follows: +----+-------------------+-------------------+ | # | Number of GPs | Number of Wakeups | +====+=========+=========+=========+=========+ | 1 | With | Without | With | Without | +----+---------+---------+---------+---------+ | 2 | 75 | 89 | 113 | 119 | +----+---------+---------+---------+---------+ | 3 | 62 | 91 | 105 | 123 | +----+---------+---------+---------+---------+ | 4 | 60 | 79 | 98 | 110 | +----+---------+---------+---------+---------+ | 5 | 63 | 79 | 99 | 112 | +----+---------+---------+---------+---------+ | 6 | 57 | 89 | 96 | 123 | +----+---------+---------+---------+---------+ | 7 | 64 | 85 | 97 | 118 | +----+---------+---------+---------+---------+ | 8 | 58 | 83 | 98 | 113 | +----+---------+---------+---------+---------+ | 9 | 57 | 77 | 89 | 104 | +----+---------+---------+---------+---------+ | 10 | 66 | 82 | 98 | 119 | +----+---------+---------+---------+---------+ | 11 | 52 | 82 | 83 | 117 | +----+---------+---------+---------+---------+ The reduction in the number of wakeups ranges from 5% to 40%. Cc: urezki@gmail.com [ paulmck: Rework commit log and comment. ] Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 9a0f66133b4b..2d2a6b6b9dfb 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -475,8 +475,16 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) * Also advance to the oldest segment of callbacks whose * ->gp_seq[] completion is at or after that passed in via "seq", * skipping any empty segments. + * + * Note that segment "i" (and any lower-numbered segments + * containing older callbacks) will be unaffected, and their + * grace-period numbers remain unchanged. For example, if i == + * WAIT_TAIL, then neither WAIT_TAIL nor DONE_TAIL will be touched. + * Instead, the CBs in NEXT_TAIL will be merged with those in + * NEXT_READY_TAIL and the grace-period number of NEXT_READY_TAIL + * would be updated. NEXT_TAIL would then be empty. */ - if (++i >= RCU_NEXT_TAIL) + if (rcu_segcblist_restempty(rsclp, i) || ++i >= RCU_NEXT_TAIL) return false; /* -- cgit v1.2.3 From 70060b8770d34f83e9fa4c3526db013dd2773611 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 14 Aug 2020 14:45:57 +0800 Subject: rcu: Shrink each possible cpu krcp CPUs can go offline shortly after kfree_call_rcu() has been invoked, which can leave memory stranded until those CPUs come back online. This commit therefore drains the kcrp of each CPU, not just the ones that happen to be online. Acked-by: Joel Fernandes Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 232362293678..92450642c4d8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3450,7 +3450,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) unsigned long count = 0; /* Snapshot count of all CPUs */ - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); count += READ_ONCE(krcp->count); @@ -3465,7 +3465,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) int cpu, freed = 0; unsigned long flags; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { int count; struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); @@ -3498,7 +3498,7 @@ void __init kfree_rcu_scheduler_running(void) int cpu; unsigned long flags; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); raw_spin_lock_irqsave(&krcp->lock, flags); -- cgit v1.2.3 From dc0988bbe1bd41e2fa555e4a6f890b819a34b49b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 2 Sep 2020 16:53:40 -0700 Subject: bpf: Do not use bucket_lock for hashmap iterator Currently, for hashmap, the bpf iterator will grab a bucket lock, a spinlock, before traversing the elements in the bucket. This can ensure all bpf visted elements are valid. But this mechanism may cause deadlock if update/deletion happens to the same bucket of the visited map in the program. For example, if we added bpf_map_update_elem() call to the same visited element in selftests bpf_iter_bpf_hash_map.c, we will have the following deadlock: ============================================ WARNING: possible recursive locking detected 5.9.0-rc1+ #841 Not tainted -------------------------------------------- test_progs/1750 is trying to acquire lock: ffff9a5bb73c5e70 (&htab->buckets[i].raw_lock){....}-{2:2}, at: htab_map_update_elem+0x1cf/0x410 but task is already holding lock: ffff9a5bb73c5e20 (&htab->buckets[i].raw_lock){....}-{2:2}, at: bpf_hash_map_seq_find_next+0x94/0x120 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&htab->buckets[i].raw_lock); lock(&htab->buckets[i].raw_lock); *** DEADLOCK *** ... Call Trace: dump_stack+0x78/0xa0 __lock_acquire.cold.74+0x209/0x2e3 lock_acquire+0xba/0x380 ? htab_map_update_elem+0x1cf/0x410 ? __lock_acquire+0x639/0x20c0 _raw_spin_lock_irqsave+0x3b/0x80 ? htab_map_update_elem+0x1cf/0x410 htab_map_update_elem+0x1cf/0x410 ? lock_acquire+0xba/0x380 bpf_prog_ad6dab10433b135d_dump_bpf_hash_map+0x88/0xa9c ? find_held_lock+0x34/0xa0 bpf_iter_run_prog+0x81/0x16e __bpf_hash_map_seq_show+0x145/0x180 bpf_seq_read+0xff/0x3d0 vfs_read+0xad/0x1c0 ksys_read+0x5f/0xe0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 ... The bucket_lock first grabbed in seq_ops->next() called by bpf_seq_read(), and then grabbed again in htab_map_update_elem() in the bpf program, causing deadlocks. Actually, we do not need bucket_lock here, we can just use rcu_read_lock() similar to netlink iterator where the rcu_read_{lock,unlock} likes below: seq_ops->start(): rcu_read_lock(); seq_ops->next(): rcu_read_unlock(); /* next element */ rcu_read_lock(); seq_ops->stop(); rcu_read_unlock(); Compared to old bucket_lock mechanism, if concurrent updata/delete happens, we may visit stale elements, miss some elements, or repeat some elements. I think this is a reasonable compromise. For users wanting to avoid stale, missing/repeated accesses, bpf_map batch access syscall interface can be used. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200902235340.2001375-1-yhs@fb.com --- kernel/bpf/hashtab.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 78dfff6a501b..7df28a45c66b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1622,7 +1622,6 @@ struct bpf_iter_seq_hash_map_info { struct bpf_map *map; struct bpf_htab *htab; void *percpu_value_buf; // non-zero means percpu hash - unsigned long flags; u32 bucket_id; u32 skip_elems; }; @@ -1632,7 +1631,6 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, struct htab_elem *prev_elem) { const struct bpf_htab *htab = info->htab; - unsigned long flags = info->flags; u32 skip_elems = info->skip_elems; u32 bucket_id = info->bucket_id; struct hlist_nulls_head *head; @@ -1656,19 +1654,18 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, /* not found, unlock and go to the next bucket */ b = &htab->buckets[bucket_id++]; - htab_unlock_bucket(htab, b, flags); + rcu_read_unlock(); skip_elems = 0; } for (i = bucket_id; i < htab->n_buckets; i++) { b = &htab->buckets[i]; - flags = htab_lock_bucket(htab, b); + rcu_read_lock(); count = 0; head = &b->head; hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { if (count >= skip_elems) { - info->flags = flags; info->bucket_id = i; info->skip_elems = count; return elem; @@ -1676,7 +1673,7 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, count++; } - htab_unlock_bucket(htab, b, flags); + rcu_read_unlock(); skip_elems = 0; } @@ -1754,14 +1751,10 @@ static int bpf_hash_map_seq_show(struct seq_file *seq, void *v) static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v) { - struct bpf_iter_seq_hash_map_info *info = seq->private; - if (!v) (void)__bpf_hash_map_seq_show(seq, NULL); else - htab_unlock_bucket(info->htab, - &info->htab->buckets[info->bucket_id], - info->flags); + rcu_read_unlock(); } static int bpf_iter_init_hash_map(void *priv_data, -- cgit v1.2.3 From 1b0df11fde0f14a269a181b3b7f5122415bc5ed7 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Wed, 2 Sep 2020 13:07:56 -0400 Subject: padata: fix possible padata_works_lock deadlock syzbot reports, WARNING: inconsistent lock state 5.9.0-rc2-syzkaller #0 Not tainted -------------------------------- inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. syz-executor.0/26715 takes: (padata_works_lock){+.?.}-{2:2}, at: padata_do_parallel kernel/padata.c:220 {IN-SOFTIRQ-W} state was registered at: spin_lock include/linux/spinlock.h:354 [inline] padata_do_parallel kernel/padata.c:220 ... __do_softirq kernel/softirq.c:298 ... sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1091 asm_sysvec_apic_timer_interrupt arch/x86/include/asm/idtentry.h:581 Possible unsafe locking scenario: CPU0 ---- lock(padata_works_lock); lock(padata_works_lock); padata_do_parallel() takes padata_works_lock with softirqs enabled, so a deadlock is possible if, on the same CPU, the lock is acquired in process context and then softirq handling done in an interrupt leads to the same path. Fix by leaving softirqs disabled while do_parallel holds padata_works_lock. Reported-by: syzbot+f4b9f49e38e25eb4ef52@syzkaller.appspotmail.com Fixes: 4611ce2246889 ("padata: allocate work structures for parallel jobs from a pool") Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 16cb894dc272..d4d3ba6e1728 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -215,12 +215,13 @@ int padata_do_parallel(struct padata_shell *ps, padata->pd = pd; padata->cb_cpu = *cb_cpu; - rcu_read_unlock_bh(); - spin_lock(&padata_works_lock); padata->seq_nr = ++pd->seq_nr; pw = padata_work_alloc(); spin_unlock(&padata_works_lock); + + rcu_read_unlock_bh(); + if (pw) { padata_work_init(pw, padata_parallel_worker, padata, 0); queue_work(pinst->parallel_wq, &pw->pw_work); -- cgit v1.2.3 From ba7d25f3dff6adc03b669fea87e356b8dc04575e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 2 Sep 2020 12:21:28 +0200 Subject: exit: support non-blocking pidfds Passing a non-blocking pidfd to waitid() currently has no effect, i.e. is not supported. There are users which would like to use waitid() on pidfds that are O_NONBLOCK and mix it with pidfds that are blocking and both pass them to waitid(). The expected behavior is to have waitid() return -EAGAIN for non-blocking pidfds and to block for blocking pidfds without needing to perform any additional checks for flags set on the pidfd before passing it to waitid(). Non-blocking pidfds will return EAGAIN from waitid() when no child process is ready yet. Returning -EAGAIN for non-blocking pidfds makes it easier for event loops that handle EAGAIN specially. It also makes the API more consistent and uniform. In essence, waitid() is treated like a read on a non-blocking pidfd or a recvmsg() on a non-blocking socket. With the addition of support for non-blocking pidfds we support the same functionality that sockets do. For sockets() recvmsg() supports MSG_DONTWAIT for pidfds waitid() supports WNOHANG. Both flags are per-call options. In contrast non-blocking pidfds and non-blocking sockets are a setting on an open file description affecting all threads in the calling process as well as other processes that hold file descriptors referring to the same open file description. Both behaviors, per call and per open file description, have genuine use-cases. The implementation should be straightforward: - If a non-blocking pidfd is passed and WNOHANG is not raised we simply raise the WNOHANG flag internally. When do_wait() returns indicating that there are eligible child processes but none have exited yet we set EAGAIN. If no child process exists we continue returning ECHILD. - If a non-blocking pidfd is passed and WNOHANG is raised waitid() will continue returning 0, i.e. it will not set EAGAIN. This ensure backwards compatibility with applications passing WNOHANG explicitly with pidfds. A concrete use-case that was brought on-list was Josh's async pidfd library. Ever since the introduction of pidfds and more advanced async io various programming languages such as Rust have grown support for async event libraries. These libraries are created to help build epoll-based event loops around file descriptors. A common pattern is to automatically make all file descriptors they manage to O_NONBLOCK. For such libraries the EAGAIN error code is treated specially. When a function is called that returns EAGAIN the function isn't called again until the event loop indicates the the file descriptor is ready. Supporting EAGAIN when waiting on pidfds makes such libraries just work with little effort. Suggested-by: Josh Triplett Signed-off-by: Christian Brauner Reviewed-by: Josh Triplett Reviewed-by: Oleg Nesterov Cc: Kees Cook Cc: Sargun Dhillon Cc: Jann Horn Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Oleg Nesterov Cc: "Peter Zijlstra (Intel)" Link: https://lore.kernel.org/lkml/20200811181236.GA18763@localhost/ Link: https://github.com/joshtriplett/async-pidfd Link: https://lore.kernel.org/r/20200902102130.147672-3-christian.brauner@ubuntu.com --- kernel/exit.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 733e80f334e7..1f51c27bae59 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1474,7 +1474,7 @@ end: return retval; } -static struct pid *pidfd_get_pid(unsigned int fd) +static struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) { struct fd f; struct pid *pid; @@ -1484,8 +1484,10 @@ static struct pid *pidfd_get_pid(unsigned int fd) return ERR_PTR(-EBADF); pid = pidfd_pid(f.file); - if (!IS_ERR(pid)) + if (!IS_ERR(pid)) { get_pid(pid); + *flags = f.file->f_flags; + } fdput(f); return pid; @@ -1498,6 +1500,7 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, struct pid *pid = NULL; enum pid_type type; long ret; + unsigned int f_flags = 0; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) @@ -1531,9 +1534,10 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, if (upid < 0) return -EINVAL; - pid = pidfd_get_pid(upid); + pid = pidfd_get_pid(upid, &f_flags); if (IS_ERR(pid)) return PTR_ERR(pid); + break; default: return -EINVAL; @@ -1544,7 +1548,12 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, wo.wo_flags = options; wo.wo_info = infop; wo.wo_rusage = ru; + if (f_flags & O_NONBLOCK) + wo.wo_flags |= WNOHANG; + ret = do_wait(&wo); + if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK)) + ret = -EAGAIN; put_pid(pid); return ret; -- cgit v1.2.3 From 6da73d15258a1e5e86d03d4ffba8776d17a8a287 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 2 Sep 2020 12:21:27 +0200 Subject: pidfd: support PIDFD_NONBLOCK in pidfd_open() Introduce PIDFD_NONBLOCK to support non-blocking pidfd file descriptors. Ever since the introduction of pidfds and more advanced async io various programming languages such as Rust have grown support for async event libraries. These libraries are created to help build epoll-based event loops around file descriptors. A common pattern is to automatically make all file descriptors they manage to O_NONBLOCK. For such libraries the EAGAIN error code is treated specially. When a function is called that returns EAGAIN the function isn't called again until the event loop indicates the the file descriptor is ready. Supporting EAGAIN when waiting on pidfds makes such libraries just work with little effort. In the following patch we will extend waitid() internally to support non-blocking pidfds. This introduces a new flag PIDFD_NONBLOCK that is equivalent to O_NONBLOCK. This follows the same patterns we have for other (anon inode) file descriptors such as EFD_NONBLOCK, IN_NONBLOCK, SFD_NONBLOCK, TFD_NONBLOCK and the same for close-on-exec flags. Suggested-by: Josh Triplett Signed-off-by: Christian Brauner Reviewed-by: Josh Triplett Reviewed-by: Oleg Nesterov Cc: Kees Cook Cc: Sargun Dhillon Cc: Oleg Nesterov Link: https://lore.kernel.org/lkml/20200811181236.GA18763@localhost/ Link: https://github.com/joshtriplett/async-pidfd Link: https://lore.kernel.org/r/20200902102130.147672-2-christian.brauner@ubuntu.com --- kernel/pid.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index b2562a7ce525..74ddbff1a6ba 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -43,6 +43,7 @@ #include #include #include +#include struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), @@ -522,7 +523,8 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) /** * pidfd_create() - Create a new pid file descriptor. * - * @pid: struct pid that the pidfd will reference + * @pid: struct pid that the pidfd will reference + * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set. * @@ -532,12 +534,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ -static int pidfd_create(struct pid *pid) +static int pidfd_create(struct pid *pid, unsigned int flags) { int fd; fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), - O_RDWR | O_CLOEXEC); + flags | O_RDWR | O_CLOEXEC); if (fd < 0) put_pid(pid); @@ -565,7 +567,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) int fd; struct pid *p; - if (flags) + if (flags & ~PIDFD_NONBLOCK) return -EINVAL; if (pid <= 0) @@ -576,7 +578,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) return -ESRCH; if (pid_has_task(p, PIDTYPE_TGID)) - fd = pidfd_create(p); + fd = pidfd_create(p, flags); else fd = -EINVAL; -- cgit v1.2.3 From e48c15b796d412ede883bb2ef7779b2a142f7962 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Jun 2020 17:21:32 -0700 Subject: smp: Add source and destination CPUs to __call_single_data This commit adds a destination CPU to __call_single_data, and is inspired by an earlier commit by Peter Zijlstra. This version adds #ifdef to permit use by 32-bit systems and supplying the destination CPU for all smp_call_function*() requests, not just smp_call_function_single(). If need be, 32-bit systems could be accommodated by shrinking the flags field to 16 bits (the atomic_t variant is currently unused) and by providing only eight bits for CPU on such systems. It is not clear that the addition of the fields to __call_single_node are really needed. [ paulmck: Apply Boqun Feng feedback on 32-bit builds. ] Link: https://lore.kernel.org/lkml/20200615164048.GC2531@hirez.programming.kicks-ass.net/ Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/smp.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index d0ae8eb6bf8b..865a876f83ce 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -375,6 +375,9 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, csd->func = func; csd->info = info; +#ifdef CONFIG_64BIT + csd->dst = cpu; +#endif err = generic_exec_single(cpu, csd); @@ -540,6 +543,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd->flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; +#ifdef CONFIG_64BIT + csd->dst = cpu; +#endif if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) __cpumask_set_cpu(cpu, cfd->cpumask_ipi); } -- cgit v1.2.3 From 35feb60474bf4f7fa7840e14fc7fd344996b919d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Jun 2020 13:22:54 -0700 Subject: kernel/smp: Provide CSD lock timeout diagnostics This commit causes csd_lock_wait() to emit diagnostics when a CPU fails to respond quickly enough to one of the smp_call_function() family of function calls. These diagnostics are enabled by a new CSD_LOCK_WAIT_DEBUG Kconfig option that depends on DEBUG_KERNEL. This commit was inspired by an earlier patch by Josef Bacik. [ paulmck: Fix for syzbot+0f719294463916a3fc0e@syzkaller.appspotmail.com ] [ paulmck: Fix KASAN use-after-free issue reported by Qian Cai. ] [ paulmck: Fix botched nr_cpu_ids comparison per Dan Carpenter. ] [ paulmck: Apply Peter Zijlstra feedback. ] Link: https://lore.kernel.org/lkml/00000000000042f21905a991ecea@google.com Link: https://lore.kernel.org/lkml/0000000000002ef21705a9933cf3@google.com Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/smp.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 130 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 865a876f83ce..c5d31885bd30 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -20,6 +20,9 @@ #include #include #include +#include +#include +#include #include "smpboot.h" #include "sched/smp.h" @@ -96,6 +99,103 @@ void __init call_function_init(void) smpcfd_prepare_cpu(smp_processor_id()); } +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG + +static DEFINE_PER_CPU(call_single_data_t *, cur_csd); +static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); +static DEFINE_PER_CPU(void *, cur_csd_info); + +#define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC) +atomic_t csd_bug_count = ATOMIC_INIT(0); + +/* Record current CSD work for current CPU, NULL to erase. */ +static void csd_lock_record(call_single_data_t *csd) +{ + if (!csd) { + smp_mb(); /* NULL cur_csd after unlock. */ + __this_cpu_write(cur_csd, NULL); + return; + } + __this_cpu_write(cur_csd_func, csd->func); + __this_cpu_write(cur_csd_info, csd->info); + smp_wmb(); /* func and info before csd. */ + __this_cpu_write(cur_csd, csd); + smp_mb(); /* Update cur_csd before function call. */ + /* Or before unlock, as the case may be. */ +} + +static __always_inline int csd_lock_wait_getcpu(call_single_data_t *csd) +{ + unsigned int csd_type; + + csd_type = CSD_TYPE(csd); + if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC) + return csd->dst; /* Other CSD_TYPE_ values might not have ->dst. */ + return -1; +} + +/* + * Complain if too much time spent waiting. Note that only + * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, + * so waiting on other types gets much less information. + */ +static __always_inline bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id) +{ + int cpu = -1; + int cpux; + bool firsttime; + u64 ts2, ts_delta; + call_single_data_t *cpu_cur_csd; + unsigned int flags = READ_ONCE(csd->flags); + + if (!(flags & CSD_FLAG_LOCK)) { + if (!unlikely(*bug_id)) + return true; + cpu = csd_lock_wait_getcpu(csd); + pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n", + *bug_id, raw_smp_processor_id(), cpu); + return true; + } + + ts2 = sched_clock(); + ts_delta = ts2 - *ts1; + if (likely(ts_delta <= CSD_LOCK_TIMEOUT)) + return false; + + firsttime = !*bug_id; + if (firsttime) + *bug_id = atomic_inc_return(&csd_bug_count); + cpu = csd_lock_wait_getcpu(csd); + if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu)) + cpux = 0; + else + cpux = cpu; + cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */ + pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n", + firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0, + cpu, csd->func, csd->info); + if (cpu_cur_csd && csd != cpu_cur_csd) { + pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n", + *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)), + READ_ONCE(per_cpu(cur_csd_info, cpux))); + } else { + pr_alert("\tcsd: CSD lock (#%d) %s.\n", + *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request"); + } + if (cpu >= 0) { + if (!trigger_single_cpu_backtrace(cpu)) + dump_cpu_task(cpu); + if (!cpu_cur_csd) { + pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); + arch_send_call_function_single_ipi(cpu); + } + } + dump_stack(); + *ts1 = ts2; + + return false; +} + /* * csd_lock/csd_unlock used to serialize access to per-cpu csd resources * @@ -103,10 +203,30 @@ void __init call_function_init(void) * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ +static __always_inline void csd_lock_wait(call_single_data_t *csd) +{ + int bug_id = 0; + u64 ts0, ts1; + + ts1 = ts0 = sched_clock(); + for (;;) { + if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id)) + break; + cpu_relax(); + } + smp_acquire__after_ctrl_dep(); +} + +#else +static void csd_lock_record(call_single_data_t *csd) +{ +} + static __always_inline void csd_lock_wait(call_single_data_t *csd) { smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); } +#endif static __always_inline void csd_lock(call_single_data_t *csd) { @@ -166,9 +286,11 @@ static int generic_exec_single(int cpu, call_single_data_t *csd) * We can unlock early even for the synchronous on-stack case, * since we're doing this from the same CPU.. */ + csd_lock_record(csd); csd_unlock(csd); local_irq_save(flags); func(info); + csd_lock_record(NULL); local_irq_restore(flags); return 0; } @@ -268,8 +390,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) entry = &csd_next->llist; } + csd_lock_record(csd); func(info); csd_unlock(csd); + csd_lock_record(NULL); } else { prev = &csd->llist; } @@ -296,8 +420,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) smp_call_func_t func = csd->func; void *info = csd->info; + csd_lock_record(csd); csd_unlock(csd); func(info); + csd_lock_record(NULL); } else if (type == CSD_TYPE_IRQ_WORK) { irq_work_single(csd); } @@ -375,7 +501,8 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, csd->func = func; csd->info = info; -#ifdef CONFIG_64BIT +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG + csd->src = smp_processor_id(); csd->dst = cpu; #endif @@ -543,7 +670,8 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd->flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; -#ifdef CONFIG_64BIT +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG + csd->src = smp_processor_id(); csd->dst = cpu; #endif if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) -- cgit v1.2.3 From 2b722160f1a7929f38dfb648c7bbb45f96e65a5b Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 6 Jul 2020 21:49:41 +0800 Subject: smp: Make symbol 'csd_bug_count' static The sparse tool complains as follows: kernel/smp.c:107:10: warning: symbol 'csd_bug_count' was not declared. Should it be static? Because variable is not used outside of smp.c, this commit marks it static. Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index c5d31885bd30..b25383d16e8e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -106,7 +106,7 @@ static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); static DEFINE_PER_CPU(void *, cur_csd_info); #define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC) -atomic_t csd_bug_count = ATOMIC_INIT(0); +static atomic_t csd_bug_count = ATOMIC_INIT(0); /* Record current CSD work for current CPU, NULL to erase. */ static void csd_lock_record(call_single_data_t *csd) -- cgit v1.2.3 From cd1752d34ef33d68d82ef9dcc699b4eaa17c07fc Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 26 Aug 2020 18:37:50 +0100 Subject: genirq: Walk the irq_data hierarchy when resending an interrupt On resending an interrupt, we only check the outermost irqchip for a irq_retrigger callback. However, this callback could be implemented at an inner level. Use irq_chip_retrigger_hierarchy() in this case. Reviewed-by: Valentin Schneider Signed-off-by: Marc Zyngier --- kernel/irq/resend.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index c48ce19a257f..8ccd32a0cc80 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -86,6 +86,18 @@ static int irq_sw_resend(struct irq_desc *desc) } #endif +static int try_retrigger(struct irq_desc *desc) +{ + if (desc->irq_data.chip->irq_retrigger) + return desc->irq_data.chip->irq_retrigger(&desc->irq_data); + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + return irq_chip_retrigger_hierarchy(&desc->irq_data); +#else + return 0; +#endif +} + /* * IRQ resend * @@ -113,8 +125,7 @@ int check_irq_resend(struct irq_desc *desc, bool inject) desc->istate &= ~IRQS_PENDING; - if (!desc->irq_data.chip->irq_retrigger || - !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) + if (!try_retrigger(desc)) err = irq_sw_resend(desc); /* If the retrigger was successfull, mark it with the REPLAY bit */ -- cgit v1.2.3 From e75ad2cc41832bae3218e3d6f38b3f03ae0a41df Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Mon, 7 Sep 2020 22:06:52 +0800 Subject: blktrace: make function blk_trace_bio_get_cgid() static The sparse tool complains as follows: kernel/trace/blktrace.c:796:5: warning: symbol 'blk_trace_bio_get_cgid' was not declared. Should it be static? This function is not used outside of blktrace.c, so this commit marks it static. Reported-by: Hulk Robot Signed-off-by: Wang Hai Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 4b3a42fc3b24..c0887f32f647 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -793,7 +793,7 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) return cgroup_id(bio_blkcg(bio)->css.cgroup); } #else -u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { return 0; } -- cgit v1.2.3 From d397820f36ffe4701343b6ee12687d60db0ed8db Mon Sep 17 00:00:00 2001 From: John Ogness Date: Tue, 21 Jul 2020 15:31:28 +0206 Subject: printk: ringbuffer: support dataless records With commit 896fbe20b4e2333fb55 ("printk: use the lockless ringbuffer"), printk() started silently dropping messages without text because such records are not supported by the new printk ringbuffer. Add support for such records. Currently dataless records are denoted by INVALID_LPOS in order to recognize failed prb_reserve() calls. Change the ringbuffer to instead use two different identifiers (FAILED_LPOS and NO_LPOS) to distinguish between failed prb_reserve() records and successful dataless records, respectively. Fixes: 896fbe20b4e2333fb55 ("printk: use the lockless ringbuffer") Fixes: https://lkml.kernel.org/r/20200718121053.GA691245@elver.google.com Reported-by: Marco Elver Signed-off-by: John Ogness Cc: Petr Mladek Cc: Steven Rostedt Cc: Marco Elver Signed-off-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200721132528.9661-1-john.ogness@linutronix.de --- kernel/printk/printk_ringbuffer.c | 72 +++++++++++++++++++-------------------- kernel/printk/printk_ringbuffer.h | 15 ++++---- 2 files changed, 43 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 7355ca99e852..0659b50872b5 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -264,6 +264,9 @@ /* Determine how many times the data array has wrapped. */ #define DATA_WRAPS(data_ring, lpos) ((lpos) >> (data_ring)->size_bits) +/* Determine if a logical position refers to a data-less block. */ +#define LPOS_DATALESS(lpos) ((lpos) & 1UL) + /* Get the logical position at index 0 of the current wrap. */ #define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \ ((lpos) & ~DATA_SIZE_MASK(data_ring)) @@ -320,21 +323,13 @@ static unsigned int to_blk_size(unsigned int size) * block does not exceed the maximum possible size that could fit within the * ringbuffer. This function provides that basic size check so that the * assumption is safe. - * - * Writers are also not allowed to write 0-sized (data-less) records. Such - * records are used only internally by the ringbuffer. */ static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size) { struct prb_data_block *db = NULL; - /* - * Writers are not allowed to write data-less records. Such records - * are used only internally by the ringbuffer to denote records where - * their data failed to allocate or have been lost. - */ if (size == 0) - return false; + return true; /* * Ensure the alignment padded size could possibly fit in the data @@ -568,8 +563,8 @@ static bool data_push_tail(struct printk_ringbuffer *rb, unsigned long tail_lpos; unsigned long next_lpos; - /* If @lpos is not valid, there is nothing to do. */ - if (lpos == INVALID_LPOS) + /* If @lpos is from a data-less block, there is nothing to do. */ + if (LPOS_DATALESS(lpos)) return true; /* @@ -962,8 +957,8 @@ static char *data_alloc(struct printk_ringbuffer *rb, if (size == 0) { /* Specify a data-less block. */ - blk_lpos->begin = INVALID_LPOS; - blk_lpos->next = INVALID_LPOS; + blk_lpos->begin = NO_LPOS; + blk_lpos->next = NO_LPOS; return NULL; } @@ -976,8 +971,8 @@ static char *data_alloc(struct printk_ringbuffer *rb, if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring))) { /* Failed to allocate, specify a data-less block. */ - blk_lpos->begin = INVALID_LPOS; - blk_lpos->next = INVALID_LPOS; + blk_lpos->begin = FAILED_LPOS; + blk_lpos->next = FAILED_LPOS; return NULL; } @@ -1025,6 +1020,10 @@ static char *data_alloc(struct printk_ringbuffer *rb, static unsigned int space_used(struct prb_data_ring *data_ring, struct prb_data_blk_lpos *blk_lpos) { + /* Data-less blocks take no space. */ + if (LPOS_DATALESS(blk_lpos->begin)) + return 0; + if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) { /* Data block does not wrap. */ return (DATA_INDEX(data_ring, blk_lpos->next) - @@ -1080,11 +1079,8 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; - /* Records are allowed to not have dictionaries. */ - if (r->dict_buf_size) { - if (!data_check_size(&rb->dict_data_ring, r->dict_buf_size)) - goto fail; - } + if (!data_check_size(&rb->dict_data_ring, r->dict_buf_size)) + goto fail; /* * Descriptors in the reserved state act as blockers to all further @@ -1205,15 +1201,18 @@ void prb_commit(struct prb_reserved_entry *e) * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is * triggered if an internal error is detected. */ -static char *get_data(struct prb_data_ring *data_ring, - struct prb_data_blk_lpos *blk_lpos, - unsigned int *data_size) +static const char *get_data(struct prb_data_ring *data_ring, + struct prb_data_blk_lpos *blk_lpos, + unsigned int *data_size) { struct prb_data_block *db; /* Data-less data block description. */ - if (blk_lpos->begin == INVALID_LPOS && - blk_lpos->next == INVALID_LPOS) { + if (LPOS_DATALESS(blk_lpos->begin) && LPOS_DATALESS(blk_lpos->next)) { + if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) { + *data_size = 0; + return ""; + } return NULL; } @@ -1256,11 +1255,11 @@ static char *get_data(struct prb_data_ring *data_ring, * (even if @text_size is 0). Each '\n' processed is counted as an additional * line. */ -static unsigned int count_lines(char *text, unsigned int text_size) +static unsigned int count_lines(const char *text, unsigned int text_size) { unsigned int next_size = text_size; unsigned int line_count = 1; - char *next = text; + const char *next = text; while (next_size) { next = memchr(next, '\n', next_size); @@ -1287,7 +1286,7 @@ static bool copy_data(struct prb_data_ring *data_ring, unsigned int buf_size, unsigned int *line_count) { unsigned int data_size; - char *data; + const char *data; /* Caller might not want any data. */ if ((!buf || !buf_size) && !line_count) @@ -1317,8 +1316,7 @@ static bool copy_data(struct prb_data_ring *data_ring, data_size = min_t(u16, buf_size, len); - if (!WARN_ON_ONCE(!data_size)) - memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */ + memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */ return true; } @@ -1355,11 +1353,11 @@ static int desc_read_committed_seq(struct prb_desc_ring *desc_ring, /* * A descriptor in the reusable state may no longer have its data - * available; report it as a data-less record. Or the record may - * actually be a data-less record. + * available; report it as existing but with lost data. Or the record + * may actually be a record with lost data. */ if (d_state == desc_reusable || - (blk_lpos->begin == INVALID_LPOS && blk_lpos->next == INVALID_LPOS)) { + (blk_lpos->begin == FAILED_LPOS && blk_lpos->next == FAILED_LPOS)) { return -ENOENT; } @@ -1659,10 +1657,10 @@ void prb_init(struct printk_ringbuffer *rb, descs[_DESCS_COUNT(descbits) - 1].info.seq = 0; atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits)); - descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = INVALID_LPOS; - descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = INVALID_LPOS; - descs[_DESCS_COUNT(descbits) - 1].dict_blk_lpos.begin = INVALID_LPOS; - descs[_DESCS_COUNT(descbits) - 1].dict_blk_lpos.next = INVALID_LPOS; + descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS; + descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS; + descs[_DESCS_COUNT(descbits) - 1].dict_blk_lpos.begin = FAILED_LPOS; + descs[_DESCS_COUNT(descbits) - 1].dict_blk_lpos.next = FAILED_LPOS; } /** diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 3e46a7423c13..e6302da041f9 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -120,12 +120,13 @@ struct prb_reserved_entry { #define DESC_FLAGS_MASK (DESC_COMMITTED_MASK | DESC_REUSE_MASK) #define DESC_ID_MASK (~DESC_FLAGS_MASK) #define DESC_ID(sv) ((sv) & DESC_ID_MASK) -#define INVALID_LPOS 1 +#define FAILED_LPOS 0x1 +#define NO_LPOS 0x3 -#define INVALID_BLK_LPOS \ +#define FAILED_BLK_LPOS \ { \ - .begin = INVALID_LPOS, \ - .next = INVALID_LPOS, \ + .begin = FAILED_LPOS, \ + .next = FAILED_LPOS, \ } /* @@ -147,7 +148,7 @@ struct prb_reserved_entry { * * To satisfy Req1, the tail initially points to a descriptor that is * minimally initialized (having no data block, i.e. data-less with the - * data block's lpos @begin and @next values set to INVALID_LPOS). + * data block's lpos @begin and @next values set to FAILED_LPOS). * * To satisfy Req2, the initial tail descriptor is initialized to the * reusable state. Readers recognize reusable descriptors as existing @@ -242,8 +243,8 @@ static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \ /* reusable */ \ .state_var = ATOMIC_INIT(DESC0_SV(descbits)), \ /* no associated data block */ \ - .text_blk_lpos = INVALID_BLK_LPOS, \ - .dict_blk_lpos = INVALID_BLK_LPOS, \ + .text_blk_lpos = FAILED_BLK_LPOS, \ + .dict_blk_lpos = FAILED_BLK_LPOS, \ }, \ }; \ static struct printk_ringbuffer name = { \ -- cgit v1.2.3 From 66ada2ccae4ed4dd07ba91df3b5fdb4c11335bd1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 29 Aug 2020 22:00:01 +0900 Subject: kprobes: Add generic kretprobe trampoline handler Add a generic kretprobe trampoline handler for unifying the all cloned /arch/* kretprobe trampoline handlers. The generic kretprobe trampoline handler is based on the x86 implementation, because it is the latest implementation. It has frame pointer checking, kprobe_busy_begin/end and return address fixup for user handlers. [ mingo: Minor edits. ] Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/159870600138.1229682.3424065380448088833.stgit@devnote2 --- kernel/kprobes.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 287b263c9cb9..a0afaa79024e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1927,6 +1927,104 @@ unsigned long __weak arch_deref_entry_point(void *entry) } #ifdef CONFIG_KRETPROBES + +unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, + void *trampoline_address, + void *frame_pointer) +{ + struct kretprobe_instance *ri = NULL, *last = NULL; + struct hlist_head *head, empty_rp; + struct hlist_node *tmp; + unsigned long flags; + kprobe_opcode_t *correct_ret_addr = NULL; + bool skipped = false; + + INIT_HLIST_HEAD(&empty_rp); + kretprobe_hash_lock(current, &head, &flags); + + /* + * It is possible to have multiple instances associated with a given + * task either because multiple functions in the call path have + * return probes installed on them, and/or more than one + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always pushed into the head of the list + * - when multiple return probes are registered for the same + * function, the (chronologically) first instance's ret_addr + * will be the real return address, and all the rest will + * point to kretprobe_trampoline. + */ + hlist_for_each_entry(ri, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + /* + * Return probes must be pushed on this hash list correct + * order (same as return order) so that it can be popped + * correctly. However, if we find it is pushed it incorrect + * order, this means we find a function which should not be + * probed, because the wrong order entry is pushed on the + * path of processing other kretprobe itself. + */ + if (ri->fp != frame_pointer) { + if (!skipped) + pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n"); + skipped = true; + continue; + } + + correct_ret_addr = ri->ret_addr; + if (skipped) + pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n", + ri->rp->kp.addr); + + if (correct_ret_addr != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + kretprobe_assert(ri, (unsigned long)correct_ret_addr, + (unsigned long)trampoline_address); + last = ri; + + hlist_for_each_entry_safe(ri, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + if (ri->fp != frame_pointer) + continue; + + if (ri->rp && ri->rp->handler) { + struct kprobe *prev = kprobe_running(); + + __this_cpu_write(current_kprobe, &ri->rp->kp); + ri->ret_addr = correct_ret_addr; + ri->rp->handler(ri, regs); + __this_cpu_write(current_kprobe, prev); + } + + recycle_rp_inst(ri, &empty_rp); + + if (ri == last) + break; + } + + kretprobe_hash_unlock(current, &flags); + + hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } + + return (unsigned long)correct_ret_addr; +} +NOKPROBE_SYMBOL(__kretprobe_trampoline_handler) + /* * This kprobe pre_handler is registered with every kretprobe. When probe * hits it will set up the return probe. -- cgit v1.2.3 From e03b4a084ea6b0a18b0e874baec439e69090c168 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 29 Aug 2020 22:02:36 +0900 Subject: kprobes: Remove NMI context check The in_nmi() check in pre_handler_kretprobe() is meant to avoid recursion, and blindly assumes that anything NMI is recursive. However, since commit: 9b38cc704e84 ("kretprobe: Prevent triggering kretprobe from within kprobe_flush_task") there is a better way to detect and avoid actual recursion. By setting a dummy kprobe, any actual exceptions will terminate early (by trying to handle the dummy kprobe), and recursion will not happen. Employ this to avoid the kretprobe_table_lock() recursion, replacing the over-eager in_nmi() check. Signed-off-by: Masami Hiramatsu Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/159870615628.1229682.6087311596892125907.stgit@devnote2 --- kernel/kprobes.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a0afaa79024e..211138225fa5 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1359,7 +1359,8 @@ static void cleanup_rp_inst(struct kretprobe *rp) struct hlist_node *next; struct hlist_head *head; - /* No race here */ + /* To avoid recursive kretprobe by NMI, set kprobe busy here */ + kprobe_busy_begin(); for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { kretprobe_table_lock(hash, &flags); head = &kretprobe_inst_table[hash]; @@ -1369,6 +1370,8 @@ static void cleanup_rp_inst(struct kretprobe *rp) } kretprobe_table_unlock(hash, &flags); } + kprobe_busy_end(); + free_rp_inst(rp); } NOKPROBE_SYMBOL(cleanup_rp_inst); @@ -2035,17 +2038,6 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) unsigned long hash, flags = 0; struct kretprobe_instance *ri; - /* - * To avoid deadlocks, prohibit return probing in NMI contexts, - * just skip the probe and increase the (inexact) 'nmissed' - * statistical counter, so that the user is informed that - * something happened: - */ - if (unlikely(in_nmi())) { - rp->nmissed++; - return 0; - } - /* TODO: consider to only swap the RA after the last pre_handler fired */ hash = hash_ptr(current, KPROBE_HASH_BITS); raw_spin_lock_irqsave(&rp->lock, flags); -- cgit v1.2.3 From b338817807538c893540e393856b79cbbdf777ea Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 29 Aug 2020 22:02:47 +0900 Subject: kprobes: Free kretprobe_instance with RCU callback Free kretprobe_instance with RCU callback instead of directly freeing the object in the kretprobe handler context. This will make kretprobe run safer in NMI context. Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/159870616685.1229682.11978742048709542226.stgit@devnote2 --- kernel/kprobes.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 211138225fa5..0676868f1ac2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1223,8 +1223,7 @@ void kprobes_inc_nmissed_count(struct kprobe *p) } NOKPROBE_SYMBOL(kprobes_inc_nmissed_count); -void recycle_rp_inst(struct kretprobe_instance *ri, - struct hlist_head *head) +static void recycle_rp_inst(struct kretprobe_instance *ri) { struct kretprobe *rp = ri->rp; @@ -1236,8 +1235,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri, hlist_add_head(&ri->hlist, &rp->free_instances); raw_spin_unlock(&rp->lock); } else - /* Unregistering */ - hlist_add_head(&ri->hlist, head); + kfree_rcu(ri, rcu); } NOKPROBE_SYMBOL(recycle_rp_inst); @@ -1313,7 +1311,7 @@ void kprobe_busy_end(void) void kprobe_flush_task(struct task_struct *tk) { struct kretprobe_instance *ri; - struct hlist_head *head, empty_rp; + struct hlist_head *head; struct hlist_node *tmp; unsigned long hash, flags = 0; @@ -1323,19 +1321,14 @@ void kprobe_flush_task(struct task_struct *tk) kprobe_busy_begin(); - INIT_HLIST_HEAD(&empty_rp); hash = hash_ptr(tk, KPROBE_HASH_BITS); head = &kretprobe_inst_table[hash]; kretprobe_table_lock(hash, &flags); hlist_for_each_entry_safe(ri, tmp, head, hlist) { if (ri->task == tk) - recycle_rp_inst(ri, &empty_rp); + recycle_rp_inst(ri); } kretprobe_table_unlock(hash, &flags); - hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } kprobe_busy_end(); } @@ -1936,13 +1929,12 @@ unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, void *frame_pointer) { struct kretprobe_instance *ri = NULL, *last = NULL; - struct hlist_head *head, empty_rp; + struct hlist_head *head; struct hlist_node *tmp; unsigned long flags; kprobe_opcode_t *correct_ret_addr = NULL; bool skipped = false; - INIT_HLIST_HEAD(&empty_rp); kretprobe_hash_lock(current, &head, &flags); /* @@ -2011,7 +2003,7 @@ unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, __this_cpu_write(current_kprobe, prev); } - recycle_rp_inst(ri, &empty_rp); + recycle_rp_inst(ri); if (ri == last) break; @@ -2019,11 +2011,6 @@ unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, kretprobe_hash_unlock(current, &flags); - hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } - return (unsigned long)correct_ret_addr; } NOKPROBE_SYMBOL(__kretprobe_trampoline_handler) -- cgit v1.2.3 From 319f0ce284fff8e4f95167cb144acc905d0584c7 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 29 Aug 2020 22:03:02 +0900 Subject: kprobes: Make local functions static Since we unified the kretprobe trampoline handler from arch/* code, some functions and objects do not need to be exported anymore. Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/159870618256.1229682.8692046612635810882.stgit@devnote2 --- kernel/kprobes.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 0676868f1ac2..732a70163584 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1239,7 +1239,7 @@ static void recycle_rp_inst(struct kretprobe_instance *ri) } NOKPROBE_SYMBOL(recycle_rp_inst); -void kretprobe_hash_lock(struct task_struct *tsk, +static void kretprobe_hash_lock(struct task_struct *tsk, struct hlist_head **head, unsigned long *flags) __acquires(hlist_lock) { @@ -1261,7 +1261,7 @@ __acquires(hlist_lock) } NOKPROBE_SYMBOL(kretprobe_table_lock); -void kretprobe_hash_unlock(struct task_struct *tsk, +static void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) __releases(hlist_lock) { @@ -1282,7 +1282,7 @@ __releases(hlist_lock) } NOKPROBE_SYMBOL(kretprobe_table_unlock); -struct kprobe kprobe_busy = { +static struct kprobe kprobe_busy = { .addr = (void *) get_kprobe, }; @@ -1983,8 +1983,7 @@ unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, break; } - kretprobe_assert(ri, (unsigned long)correct_ret_addr, - (unsigned long)trampoline_address); + BUG_ON(!correct_ret_addr || (correct_ret_addr == trampoline_address)); last = ri; hlist_for_each_entry_safe(ri, tmp, head, hlist) { -- cgit v1.2.3 From fcdb84cc5b31c2f4e333051f467264f5821bc61a Mon Sep 17 00:00:00 2001 From: Cengiz Can Date: Tue, 30 Jun 2020 11:29:23 +0300 Subject: kdb: remove unnecessary null check of dbg_io_ops `kdb_msg_write` operates on a global `struct kgdb_io *` called `dbg_io_ops`. It's initialized in `debug_core.c` and checked throughout the debug flow. There's a null check in `kdb_msg_write` which triggers static analyzers and gives the (almost entirely wrong) impression that it can be null. Coverity scanner caught this as CID 1465042. I have removed the unnecessary null check and eliminated false-positive forward null dereference warning. Signed-off-by: Cengiz Can Link: https://lore.kernel.org/r/20200630082922.28672-1-cengiz@kernel.wtf Reviewed-by: Sumit Garg Reviewed-by: Douglas Anderson Tested-by: Douglas Anderson Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_io.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 9d847ab851db..e7835ca88e16 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -545,18 +545,18 @@ static int kdb_search_string(char *searched, char *searchfor) static void kdb_msg_write(const char *msg, int msg_len) { struct console *c; + const char *cp; + int len; if (msg_len == 0) return; - if (dbg_io_ops) { - const char *cp = msg; - int len = msg_len; + cp = msg; + len = msg_len; - while (len--) { - dbg_io_ops->write_char(*cp); - cp++; - } + while (len--) { + dbg_io_ops->write_char(*cp); + cp++; } for_each_console(c) { -- cgit v1.2.3 From b18b099e04f450cdc77bec72acefcde7042bd1f3 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 30 Jun 2020 15:14:38 -0700 Subject: kgdb: Make "kgdbcon" work properly with "kgdb_earlycon" On my system the kernel processes the "kgdb_earlycon" parameter before the "kgdbcon" parameter. When we setup "kgdb_earlycon" we'll end up in kgdb_register_callbacks() and "kgdb_use_con" won't have been set yet so we'll never get around to starting "kgdbcon". Let's remedy this by detecting that the IO module was already registered when setting "kgdb_use_con" and registering the console then. As part of this, to avoid pre-declaring things, move the handling of the "kgdbcon" further down in the file. Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20200630151422.1.I4aa062751ff5e281f5116655c976dff545c09a46@changeid Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index b16dbc1bf056..404d6d47a11d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -94,14 +94,6 @@ int dbg_switch_cpu; /* Use kdb or gdbserver mode */ int dbg_kdb_mode = 1; -static int __init opt_kgdb_con(char *str) -{ - kgdb_use_con = 1; - return 0; -} - -early_param("kgdbcon", opt_kgdb_con); - module_param(kgdb_use_con, int, 0644); module_param(kgdbreboot, int, 0644); @@ -920,6 +912,20 @@ static struct console kgdbcons = { .index = -1, }; +static int __init opt_kgdb_con(char *str) +{ + kgdb_use_con = 1; + + if (kgdb_io_module_registered && !kgdb_con_registered) { + register_console(&kgdbcons); + kgdb_con_registered = 1; + } + + return 0; +} + +early_param("kgdbcon", opt_kgdb_con); + #ifdef CONFIG_MAGIC_SYSRQ static void sysrq_handle_dbg(int key) { -- cgit v1.2.3 From ece4ceaf2eba1c0da9d6b62bc59a43be6b456548 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 7 Sep 2020 13:32:06 -0700 Subject: kdb: Use newer api for tasklist scanning This kills using the do_each_thread/while_each_thread combo to iterate all threads and uses for_each_process_thread() instead, maintaining semantics. while_each_thread() is ultimately racy and deprecated; although in this particular case there is no concurrency so it doesn't matter. Still lets trivially get rid of two more users. Acked-by: Oleg Nesterov Signed-off-by: Davidlohr Bueso Link: https://lore.kernel.org/r/20200907203206.21293-1-dave@stgolabs.net Signed-off-by: Daniel Thompson --- kernel/debug/gdbstub.c | 4 ++-- kernel/debug/kdb/kdb_bt.c | 4 ++-- kernel/debug/kdb/kdb_main.c | 8 ++++---- kernel/debug/kdb/kdb_private.h | 4 ---- 4 files changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index cc3c43dfec44..b52ebff09ac8 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -725,7 +725,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) } } - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (i >= ks->thr_query && !finished) { int_to_threadref(thref, p->pid); ptr = pack_threadid(ptr, thref); @@ -735,7 +735,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) finished = 1; } i++; - } while_each_thread(g, p); + } *(--ptr) = '\0'; break; diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 18e03aba2cfc..1f9f0e47aeda 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -149,14 +149,14 @@ kdb_bt(int argc, const char **argv) return 0; } /* Now the inactive tasks */ - kdb_do_each_thread(g, p) { + for_each_process_thread(g, p) { if (KDB_FLAG(CMD_INTERRUPT)) return 0; if (task_curr(p)) continue; if (kdb_bt1(p, mask, btaprompt)) return 0; - } kdb_while_each_thread(g, p); + } } else if (strcmp(argv[0], "btp") == 0) { struct task_struct *p; unsigned long pid; diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 5c7949061671..930ac1b25ec7 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2299,10 +2299,10 @@ void kdb_ps_suppressed(void) if (kdb_task_state(p, mask_I)) ++idle; } - kdb_do_each_thread(g, p) { + for_each_process_thread(g, p) { if (kdb_task_state(p, mask_M)) ++daemon; - } kdb_while_each_thread(g, p); + } if (idle || daemon) { if (idle) kdb_printf("%d idle process%s (state I)%s\n", @@ -2370,12 +2370,12 @@ static int kdb_ps(int argc, const char **argv) } kdb_printf("\n"); /* Now the real tasks */ - kdb_do_each_thread(g, p) { + for_each_process_thread(g, p) { if (KDB_FLAG(CMD_INTERRUPT)) return 0; if (kdb_task_state(p, mask)) kdb_ps1(p); - } kdb_while_each_thread(g, p); + } return 0; } diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 2e296e4a234c..a4281fb99299 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -230,10 +230,6 @@ extern struct task_struct *kdb_curr_task(int); #define kdb_task_has_cpu(p) (task_curr(p)) -/* Simplify coexistence with NPTL */ -#define kdb_do_each_thread(g, p) do_each_thread(g, p) -#define kdb_while_each_thread(g, p) while_each_thread(g, p) - #define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL) extern void *debug_kmalloc(size_t size, gfp_t flags); -- cgit v1.2.3 From a566a9012acd7c9a4be7e30dc7acb7a811ec2260 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Tue, 1 Sep 2020 19:40:16 -0600 Subject: seccomp: don't leak memory when filter install races In seccomp_set_mode_filter() with TSYNC | NEW_LISTENER, we first initialize the listener fd, then check to see if we can actually use it later in seccomp_may_assign_mode(), which can fail if anyone else in our thread group has installed a filter and caused some divergence. If we can't, we partially clean up the newly allocated file: we put the fd, put the file, but don't actually clean up the *memory* that was allocated at filter->notif. Let's clean that up too. To accomplish this, let's hoist the actual "detach a notifier from a filter" code to its own helper out of seccomp_notify_release(), so that in case anyone adds stuff to init_listener(), they only have to add the cleanup code in one spot. This does a bit of extra locking and such on the failure path when the filter is not attached, but it's a slow failure path anyway. Fixes: 51891498f2da ("seccomp: allow TSYNC and USER_NOTIF together") Reported-by: syzbot+3ad9614a12f80994c32e@syzkaller.appspotmail.com Signed-off-by: Tycho Andersen Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20200902014017.934315-1-tycho@tycho.pizza Signed-off-by: Kees Cook --- kernel/seccomp.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 3ee59ce0a323..bb0dd9ae699a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1109,13 +1109,12 @@ out: } #ifdef CONFIG_SECCOMP_FILTER -static int seccomp_notify_release(struct inode *inode, struct file *file) +static void seccomp_notify_detach(struct seccomp_filter *filter) { - struct seccomp_filter *filter = file->private_data; struct seccomp_knotif *knotif; if (!filter) - return 0; + return; mutex_lock(&filter->notify_lock); @@ -1142,6 +1141,13 @@ static int seccomp_notify_release(struct inode *inode, struct file *file) kfree(filter->notif); filter->notif = NULL; mutex_unlock(&filter->notify_lock); +} + +static int seccomp_notify_release(struct inode *inode, struct file *file) +{ + struct seccomp_filter *filter = file->private_data; + + seccomp_notify_detach(filter); __put_seccomp_filter(filter); return 0; } @@ -1581,6 +1587,7 @@ out_put_fd: listener_f->private_data = NULL; fput(listener_f); put_unused_fd(listener); + seccomp_notify_detach(prepared); } else { fd_install(listener, listener_f); ret = listener; -- cgit v1.2.3 From e839317900e9f13c83d8711d684de88c625b307a Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Wed, 2 Sep 2020 08:09:53 -0600 Subject: seccomp: don't leave dangling ->notif if file allocation fails Christian and Kees both pointed out that this is a bit sloppy to open-code both places, and Christian points out that we leave a dangling pointer to ->notif if file allocation fails. Since we check ->notif for null in order to determine if it's ok to install a filter, this means people won't be able to install a filter if the file allocation fails for some reason, even if they subsequently should be able to. To fix this, let's hoist this free+null into its own little helper and use it. Reported-by: Kees Cook Reported-by: Christian Brauner Signed-off-by: Tycho Andersen Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20200902140953.1201956-1-tycho@tycho.pizza Signed-off-by: Kees Cook --- kernel/seccomp.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index bb0dd9ae699a..676d4af62103 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1109,6 +1109,12 @@ out: } #ifdef CONFIG_SECCOMP_FILTER +static void seccomp_notify_free(struct seccomp_filter *filter) +{ + kfree(filter->notif); + filter->notif = NULL; +} + static void seccomp_notify_detach(struct seccomp_filter *filter) { struct seccomp_knotif *knotif; @@ -1138,8 +1144,7 @@ static void seccomp_notify_detach(struct seccomp_filter *filter) complete(&knotif->ready); } - kfree(filter->notif); - filter->notif = NULL; + seccomp_notify_free(filter); mutex_unlock(&filter->notify_lock); } @@ -1494,7 +1499,7 @@ static struct file *init_listener(struct seccomp_filter *filter) out_notif: if (IS_ERR(ret)) - kfree(filter->notif); + seccomp_notify_free(filter); out: return ret; } -- cgit v1.2.3 From 4d671d922d51907bc41f1f7f2dc737c928ae78fd Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Fri, 28 Aug 2020 21:56:13 -0400 Subject: seccomp: kill process instead of thread for unknown actions Asynchronous termination of a thread outside of the userspace thread library's knowledge is an unsafe operation that leaves the process in an inconsistent, corrupt, and possibly unrecoverable state. In order to make new actions that may be added in the future safe on kernels not aware of them, change the default action from SECCOMP_RET_KILL_THREAD to SECCOMP_RET_KILL_PROCESS. Signed-off-by: Rich Felker Link: https://lore.kernel.org/r/20200829015609.GA32566@brightrain.aerifal.cx [kees: Fixed up coredump selection logic to match] Signed-off-by: Kees Cook --- kernel/seccomp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 676d4af62103..f754c1087e41 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1020,7 +1020,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, default: seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ - if (action == SECCOMP_RET_KILL_PROCESS || + if (action != SECCOMP_RET_KILL_THREAD || get_nr_threads(current) == 1) { kernel_siginfo_t info; @@ -1030,10 +1030,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, seccomp_init_siginfo(&info, this_syscall, data); do_coredump(&info); } - if (action == SECCOMP_RET_KILL_PROCESS) - do_group_exit(SIGSYS); - else + if (action == SECCOMP_RET_KILL_THREAD) do_exit(SIGSYS); + else + do_group_exit(SIGSYS); } unreachable(); -- cgit v1.2.3 From 2d9ca267a944c2b6ed5b4d750b1cf0407b6262b4 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Mon, 24 Aug 2020 15:59:21 +0300 Subject: seccomp: Use current_pt_regs() instead of task_pt_regs(current) As described in commit a3460a59747c ("new helper: current_pt_regs()"): - arch versions are "optimized versions". - some architectures have task_pt_regs() working only for traced tasks blocked on signal delivery. current_pt_regs() needs to work for *all* processes. In preparation for adding a coccinelle rule for using current_*(), instead of raw accesses to current members, modify seccomp_do_user_notification(), __seccomp_filter(), __secure_computing() to use current_pt_regs(). Signed-off-by: Denis Efremov Link: https://lore.kernel.org/r/20200824125921.488311-1-efremov@linux.com [kees: Reworded commit log, add comment to populate_seccomp_data()] Signed-off-by: Kees Cook --- kernel/seccomp.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index f754c1087e41..ae6b40cc39f4 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -196,6 +196,10 @@ struct seccomp_filter { */ static void populate_seccomp_data(struct seccomp_data *sd) { + /* + * Instead of using current_pt_reg(), we're already doing the work + * to safely fetch "current", so just use "task" everywhere below. + */ struct task_struct *task = current; struct pt_regs *regs = task_pt_regs(task); unsigned long args[6]; @@ -910,7 +914,7 @@ out: if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) return 0; - syscall_set_return_value(current, task_pt_regs(current), + syscall_set_return_value(current, current_pt_regs(), err, ret); return -1; } @@ -943,13 +947,13 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, /* Set low-order bits as an errno, capped at MAX_ERRNO. */ if (data > MAX_ERRNO) data = MAX_ERRNO; - syscall_set_return_value(current, task_pt_regs(current), + syscall_set_return_value(current, current_pt_regs(), -data, 0); goto skip; case SECCOMP_RET_TRAP: /* Show the handler the original registers. */ - syscall_rollback(current, task_pt_regs(current)); + syscall_rollback(current, current_pt_regs()); /* Let the filter pass back 16 bits of data. */ seccomp_send_sigsys(this_syscall, data); goto skip; @@ -962,7 +966,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, /* ENOSYS these calls if there is no tracer attached. */ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { syscall_set_return_value(current, - task_pt_regs(current), + current_pt_regs(), -ENOSYS, 0); goto skip; } @@ -982,7 +986,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, if (fatal_signal_pending(current)) goto skip; /* Check if the tracer forced the syscall to be skipped. */ - this_syscall = syscall_get_nr(current, task_pt_regs(current)); + this_syscall = syscall_get_nr(current, current_pt_regs()); if (this_syscall < 0) goto skip; @@ -1025,7 +1029,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, kernel_siginfo_t info; /* Show the original registers in the dump. */ - syscall_rollback(current, task_pt_regs(current)); + syscall_rollback(current, current_pt_regs()); /* Trigger a manual coredump since do_exit skips it. */ seccomp_init_siginfo(&info, this_syscall, data); do_coredump(&info); @@ -1060,7 +1064,7 @@ int __secure_computing(const struct seccomp_data *sd) return 0; this_syscall = sd ? sd->nr : - syscall_get_nr(current, task_pt_regs(current)); + syscall_get_nr(current, current_pt_regs()); switch (mode) { case SECCOMP_MODE_STRICT: -- cgit v1.2.3 From 7c6967326267bd5c0dded0a99541357d70dd11ac Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 8 Sep 2020 10:57:02 -0700 Subject: bpf: Permit map_ptr arithmetic with opcode add and offset 0 Commit 41c48f3a98231 ("bpf: Support access to bpf map fields") added support to access map fields with CORE support. For example, struct bpf_map { __u32 max_entries; } __attribute__((preserve_access_index)); struct bpf_array { struct bpf_map map; __u32 elem_size; } __attribute__((preserve_access_index)); struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 4); __type(key, __u32); __type(value, __u32); } m_array SEC(".maps"); SEC("cgroup_skb/egress") int cg_skb(void *ctx) { struct bpf_array *array = (struct bpf_array *)&m_array; /* .. array->map.max_entries .. */ } In kernel, bpf_htab has similar structure, struct bpf_htab { struct bpf_map map; ... } In the above cg_skb(), to access array->map.max_entries, with CORE, the clang will generate two builtin's. base = &m_array; /* access array.map */ map_addr = __builtin_preserve_struct_access_info(base, 0, 0); /* access array.map.max_entries */ max_entries_addr = __builtin_preserve_struct_access_info(map_addr, 0, 0); max_entries = *max_entries_addr; In the current llvm, if two builtin's are in the same function or in the same function after inlining, the compiler is smart enough to chain them together and generates like below: base = &m_array; max_entries = *(base + reloc_offset); /* reloc_offset = 0 in this case */ and we are fine. But if we force no inlining for one of functions in test_map_ptr() selftest, e.g., check_default(), the above two __builtin_preserve_* will be in two different functions. In this case, we will have code like: func check_hash(): reloc_offset_map = 0; base = &m_array; map_base = base + reloc_offset_map; check_default(map_base, ...) func check_default(map_base, ...): max_entries = *(map_base + reloc_offset_max_entries); In kernel, map_ptr (CONST_PTR_TO_MAP) does not allow any arithmetic. The above "map_base = base + reloc_offset_map" will trigger a verifier failure. ; VERIFY(check_default(&hash->map, map)); 0: (18) r7 = 0xffffb4fe8018a004 2: (b4) w1 = 110 3: (63) *(u32 *)(r7 +0) = r1 R1_w=invP110 R7_w=map_value(id=0,off=4,ks=4,vs=8,imm=0) R10=fp0 ; VERIFY_TYPE(BPF_MAP_TYPE_HASH, check_hash); 4: (18) r1 = 0xffffb4fe8018a000 6: (b4) w2 = 1 7: (63) *(u32 *)(r1 +0) = r2 R1_w=map_value(id=0,off=0,ks=4,vs=8,imm=0) R2_w=invP1 R7_w=map_value(id=0,off=4,ks=4,vs=8,imm=0) R10=fp0 8: (b7) r2 = 0 9: (18) r8 = 0xffff90bcb500c000 11: (18) r1 = 0xffff90bcb500c000 13: (0f) r1 += r2 R1 pointer arithmetic on map_ptr prohibited To fix the issue, let us permit map_ptr + 0 arithmetic which will result in exactly the same map_ptr. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200908175702.2463625-1-yhs@fb.com --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b4e9c56b8b32..814bc6c1ad16 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5317,6 +5317,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst, reg_type_str[ptr_reg->type]); return -EACCES; case CONST_PTR_TO_MAP: + /* smin_val represents the known value */ + if (known && smin_val == 0 && opcode == BPF_ADD) + break; + /* fall-through */ case PTR_TO_PACKET_END: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: -- cgit v1.2.3 From 4bd6a7353ee14697fea645e941354976d2c4a452 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 3 Sep 2020 16:22:32 +0200 Subject: sysctl: Convert to iter interfaces Using the read_iter/write_iter interfaces allows for in-kernel users to set sysctls without using set_fs(). Also, the buffer is a string, so give it the real type of 'char *', not void *. [AV: Christoph's fixup folded in] Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 83ff127ef7ae..226299352a76 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1226,7 +1226,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - void **buf, size_t *pcount, loff_t *ppos, + char **buf, size_t *pcount, loff_t *ppos, enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { -- cgit v1.2.3 From 848785df48835eefebe0c4eb5da7690690b0a8b7 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 8 Sep 2020 19:49:56 +0100 Subject: sched/topology: Move sd_flag_debug out of #ifdef CONFIG_SYSCTL The last sd_flag_debug shuffle inadvertently moved its definition within an #ifdef CONFIG_SYSCTL region. While CONFIG_SYSCTL is indeed required to produce the sched domain ctl interface (which uses sd_flag_debug to output flag names), it isn't required to run any assertion on the sched_domain hierarchy itself. Move the definition of sd_flag_debug to a CONFIG_SCHED_DEBUG region of topology.c. Now at long last we have: - sd_flag_debug declared in include/linux/sched/topology.h iff CONFIG_SCHED_DEBUG=y - sd_flag_debug defined in kernel/sched/topology.c, conditioned by: - CONFIG_SCHED_DEBUG, with an explicit #ifdef block - CONFIG_SMP, as a requirement to compile topology.c With this change, all symbols pertaining to SD flag metadata (with the exception of __SD_FLAG_CNT) are now defined exclusively within topology.c Fixes: 8fca9494d4b4 ("sched/topology: Move sd_flag_debug out of linux/sched/topology.h") Reported-by: Randy Dunlap Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200908184956.23369-1-valentin.schneider@arm.com --- kernel/sched/debug.c | 6 ------ kernel/sched/topology.c | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0d7896d2a0b2..0655524700d2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -245,12 +245,6 @@ set_table_entry(struct ctl_table *entry, entry->proc_handler = proc_handler; } -#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name }, -const struct sd_flag_debug sd_flag_debug[] = { -#include -}; -#undef SD_FLAG - static int sd_ctl_doflags(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index aa1676abc544..249bec7b0a4c 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -25,6 +25,12 @@ static inline bool sched_debug(void) return sched_debug_enabled; } +#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name }, +const struct sd_flag_debug sd_flag_debug[] = { +#include +}; +#undef SD_FLAG + static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { -- cgit v1.2.3 From 58faf20a086bd34f91983609e26eac3d5fe76be3 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 27 Aug 2020 13:40:37 +0200 Subject: time/sched_clock: Use raw_read_seqcount_latch() during suspend sched_clock uses seqcount_t latching to switch between two storage places protected by the sequence counter. This allows it to have interruptible, NMI-safe, seqcount_t write side critical sections. Since 7fc26327b756 ("seqlock: Introduce raw_read_seqcount_latch()"), raw_read_seqcount_latch() became the standardized way for seqcount_t latch read paths. Due to the dependent load, it has one read memory barrier less than the currently used raw_read_seqcount() API. Use raw_read_seqcount_latch() for the suspend path. Commit aadd6e5caaac ("time/sched_clock: Use raw_read_seqcount_latch()") missed changing that instance of raw_read_seqcount(). References: 1809bfa44e10 ("timers, sched/clock: Avoid deadlock during read from NMI") Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200715092345.GA231464@debian-buster-darwi.lab.linutronix.de --- kernel/time/sched_clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 1c03eec6ca9b..8c6b5febd7a0 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -258,7 +258,7 @@ void __init generic_sched_clock_init(void) */ static u64 notrace suspended_sched_clock_read(void) { - unsigned int seq = raw_read_seqcount(&cd.seq); + unsigned int seq = raw_read_seqcount_latch(&cd.seq); return cd.read_data[seq & 1].epoch_cyc; } -- cgit v1.2.3 From a690ed07353ec45f056b0a6f87c23a12a59c030d Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 27 Aug 2020 13:40:40 +0200 Subject: time/sched_clock: Use seqcount_latch_t Latch sequence counters have unique read and write APIs, and thus seqcount_latch_t was recently introduced at seqlock.h. Use that new data type instead of plain seqcount_t. This adds the necessary type-safety and ensures only latching-safe seqcount APIs are to be used. Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200827114044.11173-5-a.darwish@linutronix.de --- kernel/time/sched_clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 8c6b5febd7a0..0642013dace4 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -35,7 +35,7 @@ * into a single 64-byte cache line. */ struct clock_data { - seqcount_t seq; + seqcount_latch_t seq; struct clock_read_data read_data[2]; ktime_t wrap_kt; unsigned long rate; @@ -76,7 +76,7 @@ struct clock_read_data *sched_clock_read_begin(unsigned int *seq) int sched_clock_read_retry(unsigned int seq) { - return read_seqcount_retry(&cd.seq, seq); + return read_seqcount_latch_retry(&cd.seq, seq); } unsigned long long notrace sched_clock(void) -- cgit v1.2.3 From 249d053835320cb3e7c00066cf085a6ba9b1f126 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 27 Aug 2020 13:40:41 +0200 Subject: timekeeping: Use seqcount_latch_t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Latch sequence counters are a multiversion concurrency control mechanism where the seqcount_t counter even/odd value is used to switch between two data storage copies. This allows the seqcount_t read path to safely interrupt its write side critical section (e.g. from NMIs). Initially, latch sequence counters were implemented as a single write function, raw_write_seqcount_latch(), above plain seqcount_t. The read path was expected to use plain seqcount_t raw_read_seqcount(). A specialized read function was later added, raw_read_seqcount_latch(), and became the standardized way for latch read paths. Having unique read and write APIs meant that latch sequence counters are basically a data type of their own -- just inappropriately overloading plain seqcount_t. The seqcount_latch_t data type was thus introduced at seqlock.h. Use that new data type instead of seqcount_raw_spinlock_t. This ensures that only latch-safe APIs are to be used with the sequence counter. Note that the use of seqcount_raw_spinlock_t was not very useful in the first place. Only the "raw_" subset of seqcount_t APIs were used at timekeeping.c. This subset was created for contexts where lockdep cannot be used. seqcount_LOCKTYPE_t's raison d'être -- verifying that the seqcount_t writer serialization lock is held -- cannot thus be done. References: 0c3351d451ae ("seqlock: Use raw_ prefix instead of _no_lockdep") References: 55f3560df975 ("seqlock: Extend seqcount API with associated locks") Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200827114044.11173-6-a.darwish@linutronix.de --- kernel/time/timekeeping.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4c47f388a83f..999c981ae766 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -64,7 +64,7 @@ static struct timekeeper shadow_timekeeper; * See @update_fast_timekeeper() below. */ struct tk_fast { - seqcount_raw_spinlock_t seq; + seqcount_latch_t seq; struct tk_read_base base[2]; }; @@ -81,13 +81,13 @@ static struct clocksource dummy_clock = { }; static struct tk_fast tk_fast_mono ____cacheline_aligned = { - .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_mono.seq, &timekeeper_lock), + .seq = SEQCNT_LATCH_ZERO(tk_fast_mono.seq), .base[0] = { .clock = &dummy_clock, }, .base[1] = { .clock = &dummy_clock, }, }; static struct tk_fast tk_fast_raw ____cacheline_aligned = { - .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_raw.seq, &timekeeper_lock), + .seq = SEQCNT_LATCH_ZERO(tk_fast_raw.seq), .base[0] = { .clock = &dummy_clock, }, .base[1] = { .clock = &dummy_clock, }, }; @@ -467,7 +467,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) tk_clock_read(tkr), tkr->cycle_last, tkr->mask)); - } while (read_seqcount_retry(&tkf->seq, seq)); + } while (read_seqcount_latch_retry(&tkf->seq, seq)); return now; } @@ -533,7 +533,7 @@ static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf) tk_clock_read(tkr), tkr->cycle_last, tkr->mask)); - } while (read_seqcount_retry(&tkf->seq, seq)); + } while (read_seqcount_latch_retry(&tkf->seq, seq)); return now; } -- cgit v1.2.3 From 556cccad389717d6eb4f5a24b45ff41cad3aaabf Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 21 Aug 2020 12:57:52 -0700 Subject: perf/core: Pull pmu::sched_task() into perf_event_context_sched_in() The pmu::sched_task() is a context switch callback. It passes the cpuctx->task_ctx as a parameter to the lower code. To find the cpuctx->task_ctx, the current code iterates a cpuctx list. The same context was just iterated in perf_event_context_sched_in(), which is invoked right before the pmu::sched_task(). Reuse the cpuctx->task_ctx from perf_event_context_sched_in() can avoid the unnecessary iteration of the cpuctx list. Both pmu::sched_task and perf_event_context_sched_in() have to disable PMU. Pull the pmu::sched_task into perf_event_context_sched_in() can also save the overhead from the PMU disable and reenable. The new and old tasks may have equivalent contexts. The current code optimize this case by swapping the context, which avoids the scheduling. For this case, pmu::sched_task() is still required, e.g., restore the LBR content. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200821195754.20159-1-kan.liang@linux.intel.com --- kernel/events/core.c | 51 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 57efe3b21e29..3f5fec4dcf01 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3491,30 +3491,36 @@ void perf_sched_cb_inc(struct pmu *pmu) * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. */ +static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in) +{ + struct pmu *pmu; + + pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ + + if (WARN_ON_ONCE(!pmu->sched_task)) + return; + + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_pmu_disable(pmu); + + pmu->sched_task(cpuctx->task_ctx, sched_in); + + perf_pmu_enable(pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); +} + static void perf_pmu_sched_task(struct task_struct *prev, struct task_struct *next, bool sched_in) { struct perf_cpu_context *cpuctx; - struct pmu *pmu; if (prev == next) return; - list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { - pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ - - if (WARN_ON_ONCE(!pmu->sched_task)) - continue; - - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(pmu); - - pmu->sched_task(cpuctx->task_ctx, sched_in); + list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) + __perf_pmu_sched_task(cpuctx, sched_in); - perf_pmu_enable(pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } } static void perf_event_switch(struct task_struct *task, @@ -3773,10 +3779,14 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, struct task_struct *task) { struct perf_cpu_context *cpuctx; + struct pmu *pmu = ctx->pmu; cpuctx = __get_cpu_context(ctx); - if (cpuctx->task_ctx == ctx) + if (cpuctx->task_ctx == ctx) { + if (cpuctx->sched_cb_usage) + __perf_pmu_sched_task(cpuctx, true); return; + } perf_ctx_lock(cpuctx, ctx); /* @@ -3786,7 +3796,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, if (!ctx->nr_events) goto unlock; - perf_pmu_disable(ctx->pmu); + perf_pmu_disable(pmu); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -3798,7 +3808,11 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); - perf_pmu_enable(ctx->pmu); + + if (cpuctx->sched_cb_usage && pmu->sched_task) + pmu->sched_task(cpuctx->task_ctx, true); + + perf_pmu_enable(pmu); unlock: perf_ctx_unlock(cpuctx, ctx); @@ -3841,9 +3855,6 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); - - if (__this_cpu_read(perf_sched_cb_usages)) - perf_pmu_sched_task(prev, task, true); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) -- cgit v1.2.3 From 44fae179ce73a26733d9e2d346da4e1a1cb94647 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 21 Aug 2020 12:57:53 -0700 Subject: perf/core: Pull pmu::sched_task() into perf_event_context_sched_out() The pmu::sched_task() is a context switch callback. It passes the cpuctx->task_ctx as a parameter to the lower code. To find the cpuctx->task_ctx, the current code iterates a cpuctx list. The same context will iterated in perf_event_context_sched_out() soon. Share the cpuctx->task_ctx can avoid the unnecessary iteration of the cpuctx list. The pmu::sched_task() is also required for the optimization case for equivalent contexts. The task_ctx_sched_out() will eventually disable and reenable the PMU when schedule out events. Add perf_pmu_disable() and perf_pmu_enable() around task_ctx_sched_out() don't break anything. Drop the cpuctx->ctx.lock for the pmu::sched_task(). The lock is for per-CPU context, which is not necessary for the per-task context schedule. No one uses sched_cb_entry, perf_sched_cb_usages, sched_cb_list, and perf_pmu_sched_task() any more. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200821195754.20159-2-kan.liang@linux.intel.com --- kernel/events/core.c | 47 +++++++++++++++++------------------------------ 1 file changed, 17 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3f5fec4dcf01..45edb85344a1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -382,7 +382,6 @@ static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); -static DEFINE_PER_CPU(int, perf_sched_cb_usages); static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; @@ -3384,10 +3383,12 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, struct perf_event_context *parent, *next_parent; struct perf_cpu_context *cpuctx; int do_switch = 1; + struct pmu *pmu; if (likely(!ctx)) return; + pmu = ctx->pmu; cpuctx = __get_cpu_context(ctx); if (!cpuctx->task_ctx) return; @@ -3417,11 +3418,15 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - struct pmu *pmu = ctx->pmu; WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); + perf_pmu_disable(pmu); + + if (cpuctx->sched_cb_usage && pmu->sched_task) + pmu->sched_task(ctx, false); + /* * PMU specific parts of task perf context can require * additional synchronization. As an example of such @@ -3433,6 +3438,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, else swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + perf_pmu_enable(pmu); + /* * RCU_INIT_POINTER here is safe because we've not * modified the ctx and the above modification of @@ -3455,21 +3462,22 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); + perf_pmu_disable(pmu); + + if (cpuctx->sched_cb_usage && pmu->sched_task) + pmu->sched_task(ctx, false); task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); + + perf_pmu_enable(pmu); raw_spin_unlock(&ctx->lock); } } -static DEFINE_PER_CPU(struct list_head, sched_cb_list); - void perf_sched_cb_dec(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - this_cpu_dec(perf_sched_cb_usages); - - if (!--cpuctx->sched_cb_usage) - list_del(&cpuctx->sched_cb_entry); + --cpuctx->sched_cb_usage; } @@ -3477,10 +3485,7 @@ void perf_sched_cb_inc(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - if (!cpuctx->sched_cb_usage++) - list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); - - this_cpu_inc(perf_sched_cb_usages); + cpuctx->sched_cb_usage++; } /* @@ -3509,20 +3514,6 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } -static void perf_pmu_sched_task(struct task_struct *prev, - struct task_struct *next, - bool sched_in) -{ - struct perf_cpu_context *cpuctx; - - if (prev == next) - return; - - list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) - __perf_pmu_sched_task(cpuctx, sched_in); - -} - static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in); @@ -3545,9 +3536,6 @@ void __perf_event_task_sched_out(struct task_struct *task, { int ctxn; - if (__this_cpu_read(perf_sched_cb_usages)) - perf_pmu_sched_task(task, next, false); - if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); @@ -12867,7 +12855,6 @@ static void __init perf_event_init_all_cpus(void) #ifdef CONFIG_CGROUP_PERF INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu)); #endif - INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); } } -- cgit v1.2.3 From 4db7b6aacc573b89227a1b5960af9d2a17f17762 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 2 Sep 2020 10:31:03 -0700 Subject: swiotlb: Use %pa to print phys_addr_t variables There is an extension to a %p to print phys_addr_t type of variables. Use it here. Signed-off-by: Andy Shevchenko Reviewed-by: Fabio Estevam Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index c19379fabd20..6499bda8f0b8 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -172,9 +172,7 @@ void swiotlb_print_info(void) return; } - pr_info("mapped [mem %#010llx-%#010llx] (%luMB)\n", - (unsigned long long)io_tlb_start, - (unsigned long long)io_tlb_end, + pr_info("mapped [mem %pa-%pa] (%luMB)\n", &io_tlb_start, &io_tlb_end, bytes >> 20); } -- cgit v1.2.3 From b51e627158cb0f67569aa46b5a08b568d20a81a4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 2 Sep 2020 20:31:05 +0300 Subject: swiotlb: Mark max_segment with static keyword Sparse is not happy about max_segment declaration: CHECK kernel/dma/swiotlb.c kernel/dma/swiotlb.c:96:14: warning: symbol 'max_segment' was not declared. Should it be static? Mark it static as suggested. Signed-off-by: Andy Shevchenko Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 6499bda8f0b8..465a567678d9 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -93,7 +93,7 @@ static unsigned int io_tlb_index; * Max segment that we can provide which (if pages are contingous) will * not be bounced (unless SWIOTLB_FORCE is set). */ -unsigned int max_segment; +static unsigned int max_segment; /* * We need to save away the original address corresponding to a mapped entry -- cgit v1.2.3 From 00089c048eb4a8250325efb32a2724fd0da68cce Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Fri, 4 Sep 2020 16:30:25 +0100 Subject: objtool: Rename frame.h -> objtool.h Header frame.h is getting more code annotations to help objtool analyze object files. Rename the file to objtool.h. [ jpoimboe: add objtool.h to MAINTAINERS ] Signed-off-by: Julien Thierry Signed-off-by: Josh Poimboeuf --- kernel/bpf/core.c | 2 +- kernel/kexec_core.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ed0b3578867c..03e284873644 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c19c0dad1ebe..c5e5e5a11535 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include -- cgit v1.2.3 From 3d842b51a04808fc1494006b97e0d33402d28507 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Sep 2020 18:34:27 +0200 Subject: dma-mapping: remove the dma_dummy_ops export dma_dummy_ops is only used by the ACPI code, which can't be modular. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/dummy.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c index 05607642c888..6974b1bd7d0b 100644 --- a/kernel/dma/dummy.c +++ b/kernel/dma/dummy.c @@ -36,4 +36,3 @@ const struct dma_map_ops dma_dummy_ops = { .map_sg = dma_dummy_map_sg, .dma_supported = dma_dummy_supported, }; -EXPORT_SYMBOL(dma_dummy_ops); -- cgit v1.2.3 From ec91ccb27408abd842faf6137b3a0df8d6ebbdbb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Sep 2020 18:35:24 +0200 Subject: dma-debug: remove most exports Now that the main dma mapping entry points are out of line most of the symbols in dma-debug.c can only be called from built-in code. Remove the unused exports. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/debug.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 8e9f7b301c6d..6f53c2e03aa4 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -1235,7 +1235,6 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, add_dma_entry(entry); } -EXPORT_SYMBOL(debug_dma_map_page); void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { @@ -1290,7 +1289,6 @@ void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, return; check_unmap(&ref); } -EXPORT_SYMBOL(debug_dma_unmap_page); void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, int mapped_ents, int direction) @@ -1328,7 +1326,6 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, add_dma_entry(entry); } } -EXPORT_SYMBOL(debug_dma_map_sg); static int get_nr_mapped_entries(struct device *dev, struct dma_debug_entry *ref) @@ -1380,7 +1377,6 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, check_unmap(&ref); } } -EXPORT_SYMBOL(debug_dma_unmap_sg); void debug_dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t dma_addr, void *virt) @@ -1466,7 +1462,6 @@ void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, add_dma_entry(entry); } -EXPORT_SYMBOL(debug_dma_map_resource); void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, size_t size, int direction) @@ -1484,7 +1479,6 @@ void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, check_unmap(&ref); } -EXPORT_SYMBOL(debug_dma_unmap_resource); void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) @@ -1503,7 +1497,6 @@ void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, check_sync(dev, &ref, true); } -EXPORT_SYMBOL(debug_dma_sync_single_for_cpu); void debug_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, @@ -1523,7 +1516,6 @@ void debug_dma_sync_single_for_device(struct device *dev, check_sync(dev, &ref, false); } -EXPORT_SYMBOL(debug_dma_sync_single_for_device); void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, int direction) @@ -1556,7 +1548,6 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, check_sync(dev, &ref, true); } } -EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu); void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, int direction) @@ -1588,7 +1579,6 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, check_sync(dev, &ref, false); } } -EXPORT_SYMBOL(debug_dma_sync_sg_for_device); static int __init dma_debug_driver_setup(char *str) { -- cgit v1.2.3 From ef1a85b6ca093abb00fbc38ea55fd9ae08ab05ef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 29 Aug 2020 10:40:28 +0200 Subject: dma-mapping: fix DMA_OPS dependencies Driver that select DMA_OPS need to depend on HAS_DMA support to work. The vop driver was missing that dependency, so add it, and also add a another depends in DMA_OPS itself. That won't fix the issue due to how the Kconfig dependencies work, but at least produce a warning about unmet dependencies. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 0ddfb5510fe4..e7b801649f65 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -9,6 +9,7 @@ config HAS_DMA default y config DMA_OPS + depends on HAS_DMA bool # -- cgit v1.2.3 From abdaf11ac18925ce8cc229e62e35b342d548ece2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Aug 2020 16:41:50 +0200 Subject: dma-mapping: add (back) arch_dma_mark_clean for ia64 Add back a hook to optimize dcache flushing after reading executable code using DMA. This gets ia64 out of the business of pretending to be dma incoherent just for this optimization. Signed-off-by: Christoph Hellwig --- kernel/dma/Kconfig | 6 ++++++ kernel/dma/direct.c | 3 +++ 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index e7b801649f65..281785feb874 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -44,6 +44,12 @@ config ARCH_HAS_DMA_SET_MASK config ARCH_HAS_DMA_WRITE_COMBINE bool +# +# Select if the architectures provides the arch_dma_mark_clean hook +# +config ARCH_HAS_DMA_MARK_CLEAN + bool + config DMA_DECLARE_COHERENT bool diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index db6ef07aec3b..949c1cbf08b2 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -345,6 +345,9 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, if (unlikely(is_swiotlb_buffer(paddr))) swiotlb_tbl_sync_single(dev, paddr, sg->length, dir, SYNC_FOR_CPU); + + if (dir == DMA_FROM_DEVICE) + arch_dma_mark_clean(paddr, sg->length); } if (!dev_is_dma_coherent(dev)) -- cgit v1.2.3 From 2f5388a29be82a62529d146499e70987e856f6f7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Aug 2020 17:06:40 +0200 Subject: dma-direct: remove dma_direct_{alloc,free}_pages Just merge these helpers into the main dma_direct_{alloc,free} routines, as the additional checks are always false for the two callers. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/direct.c | 39 +++++++++++++++------------------------ kernel/dma/pool.c | 2 +- 2 files changed, 16 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 949c1cbf08b2..1d564bea5857 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -151,13 +151,18 @@ again: return page; } -void *dma_direct_alloc_pages(struct device *dev, size_t size, +void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { struct page *page; void *ret; int err; + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + dma_alloc_need_uncached(dev, attrs)) + return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); + size = PAGE_ALIGN(size); if (dma_should_alloc_from_pool(dev, gfp, attrs)) { @@ -256,11 +261,18 @@ out_free_pages: return NULL; } -void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_addr, unsigned long attrs) +void dma_direct_free(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { unsigned int page_order = get_order(size); + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + dma_alloc_need_uncached(dev, attrs)) { + arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); + return; + } + /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ if (dma_should_free_from_pool(dev, attrs) && dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) @@ -284,27 +296,6 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size); } -void *dma_direct_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) -{ - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs)) - return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); - return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); -} - -void dma_direct_free(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) -{ - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs)) - arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); - else - dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); -} - #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_SWIOTLB) void dma_direct_sync_sg_for_device(struct device *dev, diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 1281c0f0442b..fe11643ff9cc 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -115,7 +115,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, #endif /* * Memory in the atomic DMA pools must be unencrypted, the pools do not - * shrink so no re-encryption occurs in dma_direct_free_pages(). + * shrink so no re-encryption occurs in dma_direct_free(). */ ret = set_memory_decrypted((unsigned long)page_to_virt(page), 1 << order); -- cgit v1.2.3 From 3773dfe6ea4d220dcccb03827fca94ec3d39bc17 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Aug 2020 17:14:28 +0200 Subject: dma-direct: lift gfp_t manipulation out of__dma_direct_alloc_pages Move the detailed gfp_t setup from __dma_direct_alloc_pages into the caller to clean things up a little. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/direct.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 1d564bea5857..12e9f5f75dfe 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -108,7 +108,7 @@ static inline bool dma_should_free_from_pool(struct device *dev, } static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, - gfp_t gfp, unsigned long attrs) + gfp_t gfp) { int node = dev_to_node(dev); struct page *page = NULL; @@ -116,11 +116,6 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, WARN_ON_ONCE(!PAGE_ALIGNED(size)); - if (attrs & DMA_ATTR_NO_WARN) - gfp |= __GFP_NOWARN; - - /* we always manually zero the memory once we are done: */ - gfp &= ~__GFP_ZERO; gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_limit); page = dma_alloc_contiguous(dev, size, gfp); @@ -164,6 +159,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); size = PAGE_ALIGN(size); + if (attrs & DMA_ATTR_NO_WARN) + gfp |= __GFP_NOWARN; if (dma_should_alloc_from_pool(dev, gfp, attrs)) { u64 phys_mask; @@ -177,7 +174,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, goto done; } - page = __dma_direct_alloc_pages(dev, size, gfp, attrs); + /* we always manually zero the memory once we are done */ + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); if (!page) return NULL; -- cgit v1.2.3 From 96eb89caf753fd0c26d239d8483d92632fb5be15 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Aug 2020 17:20:52 +0200 Subject: dma-direct: use phys_to_dma_direct in dma_direct_alloc Replace the currently open code copy. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/direct.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 12e9f5f75dfe..57a6e7d7cf8f 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -240,10 +240,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, goto out_encrypt_pages; } done: - if (force_dma_unencrypted(dev)) - *dma_handle = __phys_to_dma(dev, page_to_phys(page)); - else - *dma_handle = phys_to_dma(dev, page_to_phys(page)); + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return ret; out_encrypt_pages: -- cgit v1.2.3 From 7bc5c428a660d4d1bc95ba54bf4cb6bccf8c3029 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Sep 2020 17:56:22 +0200 Subject: dma-direct: remove __dma_to_phys There is no harm in just always clearing the SME encryption bit, while significantly simplifying the interface. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/direct.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 57a6e7d7cf8f..bfb479c8a370 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -48,11 +48,6 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, { u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); - if (force_dma_unencrypted(dev)) - *phys_limit = __dma_to_phys(dev, dma_limit); - else - *phys_limit = dma_to_phys(dev, dma_limit); - /* * Optimistically try the zone that the physical address mask falls * into first. If that returns memory that isn't actually addressable @@ -61,6 +56,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding * zones. */ + *phys_limit = dma_to_phys(dev, dma_limit); if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits)) return GFP_DMA; if (*phys_limit <= DMA_BIT_MASK(32)) -- cgit v1.2.3 From 5ceda74093a5c1c3f42a02b894df031f3bbc9af1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Aug 2020 17:34:03 +0200 Subject: dma-direct: rename and cleanup __phys_to_dma The __phys_to_dma vs phys_to_dma distinction isn't exactly obvious. Try to improve the situation by renaming __phys_to_dma to phys_to_dma_unencryped, and not forcing architectures that want to override phys_to_dma to actually provide __phys_to_dma. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/direct.c | 8 ++++---- kernel/dma/swiotlb.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index bfb479c8a370..54db9cfdaecc 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -25,7 +25,7 @@ static inline dma_addr_t phys_to_dma_direct(struct device *dev, phys_addr_t phys) { if (force_dma_unencrypted(dev)) - return __phys_to_dma(dev, phys); + return phys_to_dma_unencrypted(dev, phys); return phys_to_dma(dev, phys); } @@ -438,13 +438,13 @@ int dma_direct_supported(struct device *dev, u64 mask) return 1; /* - * This check needs to be against the actual bit mask value, so - * use __phys_to_dma() here so that the SME encryption mask isn't + * This check needs to be against the actual bit mask value, so use + * phys_to_dma_unencrypted() here so that the SME encryption mask isn't * part of the check. */ if (IS_ENABLED(CONFIG_ZONE_DMA)) min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits)); - return mask >= __phys_to_dma(dev, min_mask); + return mask >= phys_to_dma_unencrypted(dev, min_mask); } size_t dma_direct_max_mapping_size(struct device *dev) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index c19379fabd20..4ea72d145cd2 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -670,13 +670,13 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, swiotlb_force); swiotlb_addr = swiotlb_tbl_map_single(dev, - __phys_to_dma(dev, io_tlb_start), + phys_to_dma_unencrypted(dev, io_tlb_start), paddr, size, size, dir, attrs); if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; /* Ensure that the address returned is DMA'ble */ - dma_addr = __phys_to_dma(dev, swiotlb_addr); + dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr); if (unlikely(!dma_capable(dev, dma_addr, size, true))) { swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); -- cgit v1.2.3 From 545d29272f38ba4791cca2a5a86fe6766f462f18 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Aug 2020 09:30:44 +0200 Subject: dma-mapping: move dma_common_{mmap,get_sgtable} out of mapping.c Add a new file that contains helpers for misc DMA ops, which is only built when CONFIG_DMA_OPS is set. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/Makefile | 1 + kernel/dma/mapping.c | 47 +------------------------------------------- kernel/dma/ops_helpers.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 46 deletions(-) create mode 100644 kernel/dma/ops_helpers.c (limited to 'kernel') diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index 32c7c1942bbd..dc755ab68aab 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_HAS_DMA) += mapping.o direct.o +obj-$(CONFIG_DMA_OPS) += ops_helpers.o obj-$(CONFIG_DMA_OPS) += dummy.o obj-$(CONFIG_DMA_CMA) += contiguous.o obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 0d129421e75f..848c95c27d79 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -8,7 +8,7 @@ #include /* for max_pfn */ #include #include -#include +#include #include #include #include @@ -295,22 +295,6 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, } EXPORT_SYMBOL(dma_sync_sg_for_device); -/* - * Create scatter-list for the already allocated DMA buffer. - */ -int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t dma_addr, size_t size, - unsigned long attrs) -{ - struct page *page = virt_to_page(cpu_addr); - int ret; - - ret = sg_alloc_table(sgt, 1, GFP_KERNEL); - if (!ret) - sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); - return ret; -} - /* * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems * that the intention is to allow exporting memory allocated via the @@ -358,35 +342,6 @@ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) } #endif /* CONFIG_MMU */ -/* - * Create userspace mapping for the DMA-coherent memory. - */ -int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size, - unsigned long attrs) -{ -#ifdef CONFIG_MMU - unsigned long user_count = vma_pages(vma); - unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; - unsigned long off = vma->vm_pgoff; - int ret = -ENXIO; - - vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); - - if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) - return ret; - - if (off >= count || user_count > count - off) - return -ENXIO; - - return remap_pfn_range(vma, vma->vm_start, - page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff, - user_count << PAGE_SHIFT, vma->vm_page_prot); -#else - return -ENXIO; -#endif /* CONFIG_MMU */ -} - /** * dma_can_mmap - check if a given device supports dma_mmap_* * @dev: device to check diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c new file mode 100644 index 000000000000..e443c69be429 --- /dev/null +++ b/kernel/dma/ops_helpers.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Helpers for DMA ops implementations. These generally rely on the fact that + * the allocated memory contains normal pages in the direct kernel mapping. + */ +#include + +/* + * Create scatter-list for the already allocated DMA buffer. + */ +int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + struct page *page = virt_to_page(cpu_addr); + int ret; + + ret = sg_alloc_table(sgt, 1, GFP_KERNEL); + if (!ret) + sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); + return ret; +} + +/* + * Create userspace mapping for the DMA-coherent memory. + */ +int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ +#ifdef CONFIG_MMU + unsigned long user_count = vma_pages(vma); + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long off = vma->vm_pgoff; + int ret = -ENXIO; + + vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); + + if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) + return ret; + + if (off >= count || user_count > count - off) + return -ENXIO; + + return remap_pfn_range(vma, vma->vm_start, + page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff, + user_count << PAGE_SHIFT, vma->vm_page_prot); +#else + return -ENXIO; +#endif /* CONFIG_MMU */ +} -- cgit v1.2.3 From a92df4f62fda02e6b141e2b0bb52ccc486264b1c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Aug 2020 19:32:42 +0200 Subject: dma-mapping: move the dma_declare_coherent_memory documentation dma_declare_coherent_memory should not be in a DMA API guide aimed at driver writers (that is consumers of the API). Move it to a comment near the function instead. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/coherent.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 2a0c4985f38e..f85d14bbfcbe 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -107,6 +107,23 @@ static int dma_assign_coherent_memory(struct device *dev, return 0; } +/* + * Declare a region of memory to be handed out by dma_alloc_coherent() when it + * is asked for coherent memory for this device. This shall only be used + * from platform code, usually based on the device tree description. + * + * phys_addr is the CPU physical address to which the memory is currently + * assigned (this will be ioremapped so the CPU can access the region). + * + * device_addr is the DMA address the device needs to be programmed with to + * actually address this memory (this will be handed out as the dma_addr_t in + * dma_alloc_coherent()). + * + * size is the size of the area (must be a multiple of PAGE_SIZE). + * + * As a simplification for the platforms, only *one* such region of memory may + * be declared per device. + */ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size) { -- cgit v1.2.3 From e16c33e290792c9b71b952dc915e5f7dfc9d4409 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Fri, 7 Aug 2020 17:44:40 +0800 Subject: kernel/debug: Fix spelling mistake in debug_core.c Fix typo: "notifiter" --> "notifier" "overriden" --> "overridden" Signed-off-by: Youling Tang Link: https://lore.kernel.org/r/1596793480-22559-1-git-send-email-tangyouling@loongson.cn Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 404d6d47a11d..165e5b0c2083 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -80,7 +80,7 @@ static int exception_level; struct kgdb_io *dbg_io_ops; static DEFINE_SPINLOCK(kgdb_registration_lock); -/* Action for the reboot notifiter, a global allow kdb to change it */ +/* Action for the reboot notifier, a global allow kdb to change it */ static int kgdbreboot; /* kgdb console driver is loaded */ static int kgdb_con_registered; @@ -155,7 +155,7 @@ early_param("nokgdbroundup", opt_nokgdbroundup); /* * Weak aliases for breakpoint management, - * can be overriden by architectures when needed: + * can be overridden by architectures when needed: */ int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) { -- cgit v1.2.3 From 40249c6962075c040fd071339acae524f18bfac9 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Thu, 10 Sep 2020 14:52:01 +0200 Subject: gcov: add support for GCC 10.1 Using gcov to collect coverage data for kernels compiled with GCC 10.1 causes random malfunctions and kernel crashes. This is the result of a changed GCOV_COUNTERS value in GCC 10.1 that causes a mismatch between the layout of the gcov_info structure created by GCC profiling code and the related structure used by the kernel. Fix this by updating the in-kernel GCOV_COUNTERS value. Also re-enable config GCOV_KERNEL for use with GCC 10. Reported-by: Colin Ian King Reported-by: Leon Romanovsky Signed-off-by: Peter Oberparleiter Tested-by: Leon Romanovsky Tested-and-Acked-by: Colin Ian King Signed-off-by: Linus Torvalds --- kernel/gcov/Kconfig | 1 - kernel/gcov/gcc_4_7.c | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index bb4b680e8455..3110c77230c7 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -4,7 +4,6 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS - depends on !CC_IS_GCC || GCC_VERSION < 100000 select CONSTRUCTORS if !UML default n help diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 908fdf5098c3..53c67c87f141 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -19,7 +19,9 @@ #include #include "gcov.h" -#if (__GNUC__ >= 7) +#if (__GNUC__ >= 10) +#define GCOV_COUNTERS 8 +#elif (__GNUC__ >= 7) #define GCOV_COUNTERS 9 #elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) #define GCOV_COUNTERS 10 -- cgit v1.2.3 From c5e5ec033c4ab25c53f1fd217849e75deb0bf7bf Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 19 May 2020 10:41:00 +0100 Subject: genirq: Add fasteoi IPI flow For irqchips using the fasteoi flow, IPIs are a bit special. They need to be EOI'd early (before calling the handler), as funny things may happen in the handler (they do not necessarily behave like a normal interrupt). Reviewed-by: Valentin Schneider Signed-off-by: Marc Zyngier --- kernel/irq/chip.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 857f5f4c8098..bed517d5aa4d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -944,6 +944,33 @@ void handle_percpu_devid_irq(struct irq_desc *desc) chip->irq_eoi(&desc->irq_data); } +/** + * handle_percpu_devid_fasteoi_ipi - Per CPU local IPI handler with per cpu + * dev ids + * @desc: the interrupt description structure for this irq + * + * The biggest difference with the IRQ version is that the interrupt is + * EOIed early, as the IPI could result in a context switch, and we need to + * make sure the IPI can fire again. We also assume that the arch code has + * registered an action. If not, we are positively doomed. + */ +void handle_percpu_devid_fasteoi_ipi(struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + struct irqaction *action = desc->action; + unsigned int irq = irq_desc_get_irq(desc); + irqreturn_t res; + + __kstat_incr_irqs_this_cpu(desc); + + if (chip->irq_eoi) + chip->irq_eoi(&desc->irq_data); + + trace_irq_handler_entry(irq, action); + res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); + trace_irq_handler_exit(irq, action, res); +} + /** * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu * dev ids -- cgit v1.2.3 From 83cfac95c01817819c2a51f0931d798d851f8a08 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 19 May 2020 14:58:13 +0100 Subject: genirq: Allow interrupts to be excluded from /proc/interrupts A number of architectures implement IPI statistics directly, duplicating the core kstat_irqs accounting. As we move IPIs to being actual IRQs, we would end-up with a confusing display in /proc/interrupts (where the IPIs would appear twice). In order to solve this, allow interrupts to be flagged as "hidden", which excludes them from /proc/interrupts. Reviewed-by: Valentin Schneider Signed-off-by: Marc Zyngier --- kernel/irq/debugfs.c | 1 + kernel/irq/proc.c | 2 +- kernel/irq/settings.h | 7 +++++++ 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index b95ff5d5f4bd..acabc0c0e46b 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -136,6 +136,7 @@ static const struct irq_bit_descr irqdesc_states[] = { BIT_MASK_DESCR(_IRQ_PER_CPU_DEVID), BIT_MASK_DESCR(_IRQ_IS_POLLED), BIT_MASK_DESCR(_IRQ_DISABLE_UNLAZY), + BIT_MASK_DESCR(_IRQ_HIDDEN), }; static const struct irq_bit_descr irqdesc_istates[] = { diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 32c071d7bc03..72513ed2a5fc 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -485,7 +485,7 @@ int show_interrupts(struct seq_file *p, void *v) rcu_read_lock(); desc = irq_to_desc(i); - if (!desc) + if (!desc || irq_settings_is_hidden(desc)) goto outsparse; if (desc->kstat_irqs) diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index e43795cd2ccf..403378b9947b 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -17,6 +17,7 @@ enum { _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, _IRQ_IS_POLLED = IRQ_IS_POLLED, _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY, + _IRQ_HIDDEN = IRQ_HIDDEN, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -31,6 +32,7 @@ enum { #define IRQ_PER_CPU_DEVID GOT_YOU_MORON #define IRQ_IS_POLLED GOT_YOU_MORON #define IRQ_DISABLE_UNLAZY GOT_YOU_MORON +#define IRQ_HIDDEN GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -167,3 +169,8 @@ static inline void irq_settings_clr_disable_unlazy(struct irq_desc *desc) { desc->status_use_accessors &= ~_IRQ_DISABLE_UNLAZY; } + +static inline bool irq_settings_is_hidden(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_HIDDEN; +} -- cgit v1.2.3 From 73ac74c7d489756d2313219a108809921dbfaea1 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 10 Sep 2020 12:24:53 +0200 Subject: lockdep: fix order in trace_hardirqs_off_caller() Switch order so that locking state is consistent even if the IRQ tracer calls into lockdep again. Acked-by: Peter Zijlstra Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- kernel/trace/trace_preemptirq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index f10073e62603..f4938040c228 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -102,14 +102,14 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { + lockdep_hardirqs_off(CALLER_ADDR0); + if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, caller_addr); if (!in_nmi()) trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); } - - lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off_caller); NOKPROBE_SYMBOL(trace_hardirqs_off_caller); -- cgit v1.2.3 From bcb53209be5cb32d485507452edda19b78f31d84 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 1 Sep 2020 00:12:07 +0900 Subject: kprobes: Fix to check probe enabled before disarm_kprobe_ftrace() Commit: 0cb2f1372baa ("kprobes: Fix NULL pointer dereference at kprobe_ftrace_handler") fixed one bug but the underlying bugs are not completely fixed yet. If we run a kprobe_module.tc of ftracetest, a warning triggers: # ./ftracetest test.d/kprobe/kprobe_module.tc === Ftrace unit tests === [1] Kprobe dynamic event - probing module ... ------------[ cut here ]------------ Failed to disarm kprobe-ftrace at trace_printk_irq_work+0x0/0x7e [trace_printk] (-2) WARNING: CPU: 7 PID: 200 at kernel/kprobes.c:1091 __disarm_kprobe_ftrace.isra.0+0x7e/0xa0 This is because the kill_kprobe() calls disarm_kprobe_ftrace() even if the given probe is not enabled. In that case, ftrace_set_filter_ip() fails because the given probe point is not registered to ftrace. Fix to check the given (going) probe is enabled before invoking disarm_kprobe_ftrace(). Fixes: 0cb2f1372baa ("kprobes: Fix NULL pointer dereference at kprobe_ftrace_handler") Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/159888672694.1411785.5987998076694782591.stgit@devnote2 --- kernel/kprobes.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 732a70163584..3b61ae8ff5da 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2235,9 +2235,10 @@ static void kill_kprobe(struct kprobe *p) /* * The module is going away. We should disarm the kprobe which - * is using ftrace. + * is using ftrace, because ftrace framework is still available at + * MODULE_STATE_GOING notification. */ - if (kprobe_ftrace(p)) + if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed) disarm_kprobe_ftrace(p); } -- cgit v1.2.3 From b6ec413461034d49f9e586845825adb35ba308f6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Sep 2020 17:58:26 -0700 Subject: core/entry: Report syscall correctly for trace and audit On v5.8 when doing seccomp syscall rewrites (e.g. getpid into getppid as seen in the seccomp selftests), trace (and audit) correctly see the rewritten syscall on entry and exit: seccomp_bpf-1307 [000] .... 22974.874393: sys_enter: NR 110 (... seccomp_bpf-1307 [000] .N.. 22974.874401: sys_exit: NR 110 = 1304 With mainline we see a mismatched enter and exit (the original syscall is incorrectly visible on entry): seccomp_bpf-1030 [000] .... 21.806766: sys_enter: NR 39 (... seccomp_bpf-1030 [000] .... 21.806767: sys_exit: NR 110 = 1027 When ptrace or seccomp change the syscall, this needs to be visible to trace and audit at that time as well. Update the syscall earlier so they see the correct value. Fixes: d88d59b64ca3 ("core/entry: Respect syscall number rewrites") Reported-by: Michael Ellerman Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200912005826.586171-1-keescook@chromium.org --- kernel/entry/common.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 18683598edbc..6fdb6105e6d6 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -60,13 +60,15 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, return ret; } + /* Either of the above might have changed the syscall number */ + syscall = syscall_get_nr(current, regs); + if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT)) trace_sys_enter(regs, syscall); syscall_enter_audit(regs, syscall); - /* The above might have changed the syscall number */ - return ret ? : syscall_get_nr(current, regs); + return ret ? : syscall; } static __always_inline long -- cgit v1.2.3 From ce003d67ad521d950687bce7e42e5361022c7d3d Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 14 Sep 2020 11:54:02 +0206 Subject: printk: ringbuffer: fix setting state in desc_read() It is expected that desc_read() will always set at least the @state_var field. However, if the descriptor is in an inconsistent state, no fields are set. Also, the second load of @state_var is not stored in @desc_out and so might not match the state value that is returned. Always set the last loaded @state_var into @desc_out, regardless of the descriptor consistency. Fixes: b6cf8b3f3312 ("printk: add lockless ringbuffer") Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200914094803.27365-1-john.ogness@linutronix.de --- kernel/printk/printk_ringbuffer.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 0659b50872b5..88f7dd4cb0c1 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -368,9 +368,9 @@ static enum desc_state get_desc_state(unsigned long id, } /* - * Get a copy of a specified descriptor and its queried state. A descriptor - * that is not in the committed or reusable state must be considered garbage - * by the reader. + * Get a copy of a specified descriptor and return its queried state. If the + * descriptor is in an inconsistent state (miss or reserved), the caller can + * only expect the descriptor's @state_var field to be valid. */ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, unsigned long id, struct prb_desc *desc_out) @@ -383,8 +383,14 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, /* Check the descriptor state. */ state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */ d_state = get_desc_state(id, state_val); - if (d_state != desc_committed && d_state != desc_reusable) - return d_state; + if (d_state == desc_miss || d_state == desc_reserved) { + /* + * The descriptor is in an inconsistent state. Set at least + * @state_var so that the caller can see the details of + * the inconsistent state. + */ + goto out; + } /* * Guarantee the state is loaded before copying the descriptor @@ -449,9 +455,15 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, */ smp_rmb(); /* LMM(desc_read:D) */ - /* Re-check the descriptor state. */ + /* + * The data has been copied. Return the current descriptor state, + * which may have changed since the load above. + */ state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */ - return get_desc_state(id, state_val); + d_state = get_desc_state(id, state_val); +out: + atomic_long_set(&desc_out->state_var, state_val); + return d_state; } /* -- cgit v1.2.3 From e7c1fe21046a024a5a9ef46e3752521afc1ada7b Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 14 Sep 2020 11:54:03 +0206 Subject: printk: ringbuffer: avoid memcpy() on state_var @state_var is copied as part of the descriptor copying via memcpy(). This is not allowed because @state_var is an atomic type, which in some implementations may contain a spinlock. Avoid using memcpy() with @state_var by explicitly copying the other fields of the descriptor. @state_var is set using atomic set operator before returning. Fixes: b6cf8b3f3312 ("printk: add lockless ringbuffer") Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200914094803.27365-2-john.ogness@linutronix.de --- kernel/printk/printk_ringbuffer.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 88f7dd4cb0c1..11b860ad5264 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -412,9 +412,14 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, /* * Copy the descriptor data. The data is not valid until the - * state has been re-checked. + * state has been re-checked. A memcpy() for all of @desc + * cannot be used because of the atomic_t @state_var field. */ - memcpy(desc_out, desc, sizeof(*desc_out)); /* LMM(desc_read:C) */ + memcpy(&desc_out->info, &desc->info, sizeof(desc_out->info)); /* LMM(desc_read:C) */ + memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, + sizeof(desc_out->text_blk_lpos)); /* also part of desc_read:C */ + memcpy(&desc_out->dict_blk_lpos, &desc->dict_blk_lpos, + sizeof(desc_out->dict_blk_lpos)); /* also part of desc_read:C */ /* * 1. Guarantee the descriptor content is loaded before re-checking -- cgit v1.2.3 From 2a7f87ed05b9035ef818d0f46fe6aef5640224e3 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 14 Sep 2020 14:39:49 +0206 Subject: printk: ringbuffer: relocate get_data() Move the internal get_data() function as-is above prb_reserve() so that a later change can make use of the static function. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200914123354.832-2-john.ogness@linutronix.de --- kernel/printk/printk_ringbuffer.c | 116 +++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 11b860ad5264..664552cb931c 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -1055,6 +1055,64 @@ static unsigned int space_used(struct prb_data_ring *data_ring, DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin)); } +/* + * Given @blk_lpos, return a pointer to the writer data from the data block + * and calculate the size of the data part. A NULL pointer is returned if + * @blk_lpos specifies values that could never be legal. + * + * This function (used by readers) performs strict validation on the lpos + * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is + * triggered if an internal error is detected. + */ +static const char *get_data(struct prb_data_ring *data_ring, + struct prb_data_blk_lpos *blk_lpos, + unsigned int *data_size) +{ + struct prb_data_block *db; + + /* Data-less data block description. */ + if (LPOS_DATALESS(blk_lpos->begin) && LPOS_DATALESS(blk_lpos->next)) { + if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) { + *data_size = 0; + return ""; + } + return NULL; + } + + /* Regular data block: @begin less than @next and in same wrap. */ + if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) && + blk_lpos->begin < blk_lpos->next) { + db = to_block(data_ring, blk_lpos->begin); + *data_size = blk_lpos->next - blk_lpos->begin; + + /* Wrapping data block: @begin is one wrap behind @next. */ + } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) == + DATA_WRAPS(data_ring, blk_lpos->next)) { + db = to_block(data_ring, 0); + *data_size = DATA_INDEX(data_ring, blk_lpos->next); + + /* Illegal block description. */ + } else { + WARN_ON_ONCE(1); + return NULL; + } + + /* A valid data block will always be aligned to the ID size. */ + if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) || + WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) { + return NULL; + } + + /* A valid data block will always have at least an ID. */ + if (WARN_ON_ONCE(*data_size < sizeof(db->id))) + return NULL; + + /* Subtract block ID space from size to reflect data size. */ + *data_size -= sizeof(db->id); + + return &db->data[0]; +} + /** * prb_reserve() - Reserve space in the ringbuffer. * @@ -1209,64 +1267,6 @@ void prb_commit(struct prb_reserved_entry *e) local_irq_restore(e->irqflags); } -/* - * Given @blk_lpos, return a pointer to the writer data from the data block - * and calculate the size of the data part. A NULL pointer is returned if - * @blk_lpos specifies values that could never be legal. - * - * This function (used by readers) performs strict validation on the lpos - * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is - * triggered if an internal error is detected. - */ -static const char *get_data(struct prb_data_ring *data_ring, - struct prb_data_blk_lpos *blk_lpos, - unsigned int *data_size) -{ - struct prb_data_block *db; - - /* Data-less data block description. */ - if (LPOS_DATALESS(blk_lpos->begin) && LPOS_DATALESS(blk_lpos->next)) { - if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) { - *data_size = 0; - return ""; - } - return NULL; - } - - /* Regular data block: @begin less than @next and in same wrap. */ - if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) && - blk_lpos->begin < blk_lpos->next) { - db = to_block(data_ring, blk_lpos->begin); - *data_size = blk_lpos->next - blk_lpos->begin; - - /* Wrapping data block: @begin is one wrap behind @next. */ - } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) == - DATA_WRAPS(data_ring, blk_lpos->next)) { - db = to_block(data_ring, 0); - *data_size = DATA_INDEX(data_ring, blk_lpos->next); - - /* Illegal block description. */ - } else { - WARN_ON_ONCE(1); - return NULL; - } - - /* A valid data block will always be aligned to the ID size. */ - if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) || - WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) { - return NULL; - } - - /* A valid data block will always have at least an ID. */ - if (WARN_ON_ONCE(*data_size < sizeof(db->id))) - return NULL; - - /* Subtract block ID space from size to reflect data size. */ - *data_size -= sizeof(db->id); - - return &db->data[0]; -} - /* * Count the number of lines in provided text. All text has at least 1 line * (even if @text_size is 0). Each '\n' processed is counted as an additional -- cgit v1.2.3 From e3bc0401c1de2feb15601685b85b4dea45130386 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 14 Sep 2020 14:39:50 +0206 Subject: printk: ringbuffer: add BLK_DATALESS() macro Rather than continually needing to explicitly check @begin and @next to identify a dataless block, introduce and use a BLK_DATALESS() macro. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200914123354.832-3-john.ogness@linutronix.de --- kernel/printk/printk_ringbuffer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 664552cb931c..195e6f4d4df6 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -266,6 +266,8 @@ /* Determine if a logical position refers to a data-less block. */ #define LPOS_DATALESS(lpos) ((lpos) & 1UL) +#define BLK_DATALESS(blk) (LPOS_DATALESS((blk)->begin) && \ + LPOS_DATALESS((blk)->next)) /* Get the logical position at index 0 of the current wrap. */ #define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \ @@ -1038,7 +1040,7 @@ static unsigned int space_used(struct prb_data_ring *data_ring, struct prb_data_blk_lpos *blk_lpos) { /* Data-less blocks take no space. */ - if (LPOS_DATALESS(blk_lpos->begin)) + if (BLK_DATALESS(blk_lpos)) return 0; if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) { @@ -1071,7 +1073,7 @@ static const char *get_data(struct prb_data_ring *data_ring, struct prb_data_block *db; /* Data-less data block description. */ - if (LPOS_DATALESS(blk_lpos->begin) && LPOS_DATALESS(blk_lpos->next)) { + if (BLK_DATALESS(blk_lpos)) { if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) { *data_size = 0; return ""; -- cgit v1.2.3 From cc5c7041c6e1fe8c02fe9e16f28a5e52f7a6957c Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 14 Sep 2020 14:39:51 +0206 Subject: printk: ringbuffer: clear initial reserved fields prb_reserve() will set some meta data values and leave others uninitialized (or rather, containing the values of the previous wrap). Simplify the API by always clearing out all the fields. Only the sequence number is filled in. The caller is now responsible for filling in the rest of the meta data fields. In particular, for correctly filling in text and dict lengths. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200914123354.832-4-john.ogness@linutronix.de --- kernel/printk/printk.c | 12 ++++++++---- kernel/printk/printk_ringbuffer.c | 30 ++++++++++++++++++------------ 2 files changed, 26 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fec71229169e..964b5701688f 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -520,8 +520,11 @@ static int log_store(u32 caller_id, int facility, int level, memcpy(&r.text_buf[0], text, text_len); if (trunc_msg_len) memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); - if (r.dict_buf) + r.info->text_len = text_len + trunc_msg_len; + if (r.dict_buf) { memcpy(&r.dict_buf[0], dict, dict_len); + r.info->dict_len = dict_len; + } r.info->facility = facility; r.info->level = level & 7; r.info->flags = flags & 0x1f; @@ -1069,10 +1072,11 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, if (!prb_reserve(&e, rb, &dest_r)) return 0; - memcpy(&dest_r.text_buf[0], &r->text_buf[0], dest_r.text_buf_size); + memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len); + dest_r.info->text_len = r->info->text_len; if (dest_r.dict_buf) { - memcpy(&dest_r.dict_buf[0], &r->dict_buf[0], - dest_r.dict_buf_size); + memcpy(&dest_r.dict_buf[0], &r->dict_buf[0], r->info->dict_len); + dest_r.info->dict_len = r->info->dict_len; } dest_r.info->facility = r->info->facility; dest_r.info->level = r->info->level; diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 195e6f4d4df6..5a9c7c8cff7b 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -146,10 +146,13 @@ * * if (prb_reserve(&e, &test_rb, &r)) { * snprintf(r.text_buf, r.text_buf_size, "%s", textstr); + * r.info->text_len = strlen(textstr); * * // dictionary allocation may have failed - * if (r.dict_buf) + * if (r.dict_buf) { * snprintf(r.dict_buf, r.dict_buf_size, "%s", dictstr); + * r.info->dict_len = strlen(dictstr); + * } * * r.info->ts_nsec = local_clock(); * @@ -1142,9 +1145,9 @@ static const char *get_data(struct prb_data_ring *data_ring, * @dict_buf_size is set to 0. Writers must check this before writing to * dictionary space. * - * @info->text_len and @info->dict_len will already be set to @text_buf_size - * and @dict_buf_size, respectively. If dictionary space reservation fails, - * @info->dict_len is set to 0. + * Important: @info->text_len and @info->dict_len need to be set correctly by + * the writer in order for data to be readable and/or extended. + * Their values are initialized to 0. */ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r) @@ -1152,6 +1155,7 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct prb_desc_ring *desc_ring = &rb->desc_ring; struct prb_desc *d; unsigned long id; + u64 seq; if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; @@ -1176,6 +1180,14 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, d = to_desc(desc_ring, id); + /* + * All @info fields (except @seq) are cleared and must be filled in + * by the writer. Save @seq before clearing because it is used to + * determine the new sequence number. + */ + seq = d->info.seq; + memset(&d->info, 0, sizeof(d->info)); + /* * Set the @e fields here so that prb_commit() can be used if * text data allocation fails. @@ -1194,17 +1206,15 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, * See the "Bootstrap" comment block in printk_ringbuffer.h for * details about how the initializer bootstraps the descriptors. */ - if (d->info.seq == 0 && DESC_INDEX(desc_ring, id) != 0) + if (seq == 0 && DESC_INDEX(desc_ring, id) != 0) d->info.seq = DESC_INDEX(desc_ring, id); else - d->info.seq += DESCS_COUNT(desc_ring); + d->info.seq = seq + DESCS_COUNT(desc_ring); r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, &d->text_blk_lpos, id); /* If text data allocation fails, a data-less record is committed. */ if (r->text_buf_size && !r->text_buf) { - d->info.text_len = 0; - d->info.dict_len = 0; prb_commit(e); /* prb_commit() re-enabled interrupts. */ goto fail; @@ -1221,10 +1231,6 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, r->info = &d->info; - /* Set default values for the sizes. */ - d->info.text_len = r->text_buf_size; - d->info.dict_len = r->dict_buf_size; - /* Record full text space used by record. */ e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); -- cgit v1.2.3 From 10dcb06d40411a73e1ae111717e9a987bb760313 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 14 Sep 2020 14:39:52 +0206 Subject: printk: ringbuffer: change representation of states Rather than deriving the state by evaluating bits within the flags area of the state variable, assign the states explicit values and set those values in the flags area. Introduce macros to make it simple to read and write state values for the state variable. Although the functionality is preserved, the binary representation for the states is changed. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200914123354.832-5-john.ogness@linutronix.de --- kernel/printk/printk_ringbuffer.c | 28 +++++++--------------------- kernel/printk/printk_ringbuffer.h | 31 ++++++++++++++++++++----------- 2 files changed, 27 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 5a9c7c8cff7b..c0d31185ccbf 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -348,14 +348,6 @@ static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size) return true; } -/* The possible responses of a descriptor state-query. */ -enum desc_state { - desc_miss, /* ID mismatch */ - desc_reserved, /* reserved, in use by writer */ - desc_committed, /* committed, writer is done */ - desc_reusable, /* free, not yet used by any writer */ -}; - /* Query the state of a descriptor. */ static enum desc_state get_desc_state(unsigned long id, unsigned long state_val) @@ -363,13 +355,7 @@ static enum desc_state get_desc_state(unsigned long id, if (id != DESC_ID(state_val)) return desc_miss; - if (state_val & DESC_REUSE_MASK) - return desc_reusable; - - if (state_val & DESC_COMMITTED_MASK) - return desc_committed; - - return desc_reserved; + return DESC_STATE(state_val); } /* @@ -484,8 +470,8 @@ out: static void desc_make_reusable(struct prb_desc_ring *desc_ring, unsigned long id) { - unsigned long val_committed = id | DESC_COMMITTED_MASK; - unsigned long val_reusable = val_committed | DESC_REUSE_MASK; + unsigned long val_committed = DESC_SV(id, desc_committed); + unsigned long val_reusable = DESC_SV(id, desc_reusable); struct prb_desc *desc = to_desc(desc_ring, id); atomic_long_t *state_var = &desc->state_var; @@ -921,7 +907,7 @@ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) */ prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */ if (prev_state_val && - prev_state_val != (id_prev_wrap | DESC_COMMITTED_MASK | DESC_REUSE_MASK)) { + get_desc_state(id_prev_wrap, prev_state_val) != desc_reusable) { WARN_ON_ONCE(1); return false; } @@ -935,7 +921,7 @@ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) * This pairs with desc_read:D. */ if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val, - id | 0)) { /* LMM(desc_reserve:F) */ + DESC_SV(id, desc_reserved))) { /* LMM(desc_reserve:F) */ WARN_ON_ONCE(1); return false; } @@ -1254,7 +1240,7 @@ void prb_commit(struct prb_reserved_entry *e) { struct prb_desc_ring *desc_ring = &e->rb->desc_ring; struct prb_desc *d = to_desc(desc_ring, e->id); - unsigned long prev_state_val = e->id | 0; + unsigned long prev_state_val = DESC_SV(e->id, desc_reserved); /* Now the writer has finished all writing: LMM(prb_commit:A) */ @@ -1267,7 +1253,7 @@ void prb_commit(struct prb_reserved_entry *e) * this. This pairs with desc_read:B. */ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, - e->id | DESC_COMMITTED_MASK)) { /* LMM(prb_commit:B) */ + DESC_SV(e->id, desc_committed))) { /* LMM(prb_commit:B) */ WARN_ON_ONCE(1); } diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index e6302da041f9..a9d85a6727b1 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -112,16 +112,25 @@ struct prb_reserved_entry { unsigned int text_space; }; -#define _DATA_SIZE(sz_bits) (1UL << (sz_bits)) -#define _DESCS_COUNT(ct_bits) (1U << (ct_bits)) -#define DESC_SV_BITS (sizeof(unsigned long) * 8) -#define DESC_COMMITTED_MASK (1UL << (DESC_SV_BITS - 1)) -#define DESC_REUSE_MASK (1UL << (DESC_SV_BITS - 2)) -#define DESC_FLAGS_MASK (DESC_COMMITTED_MASK | DESC_REUSE_MASK) -#define DESC_ID_MASK (~DESC_FLAGS_MASK) -#define DESC_ID(sv) ((sv) & DESC_ID_MASK) -#define FAILED_LPOS 0x1 -#define NO_LPOS 0x3 +/* The possible responses of a descriptor state-query. */ +enum desc_state { + desc_miss = -1, /* ID mismatch (pseudo state) */ + desc_reserved = 0x0, /* reserved, in use by writer */ + desc_committed = 0x1, /* committed by writer */ + desc_reusable = 0x3, /* free, not yet used by any writer */ +}; + +#define _DATA_SIZE(sz_bits) (1UL << (sz_bits)) +#define _DESCS_COUNT(ct_bits) (1U << (ct_bits)) +#define DESC_SV_BITS (sizeof(unsigned long) * 8) +#define DESC_FLAGS_SHIFT (DESC_SV_BITS - 2) +#define DESC_FLAGS_MASK (3UL << DESC_FLAGS_SHIFT) +#define DESC_STATE(sv) (3UL & (sv >> DESC_FLAGS_SHIFT)) +#define DESC_SV(id, state) (((unsigned long)state << DESC_FLAGS_SHIFT) | id) +#define DESC_ID_MASK (~DESC_FLAGS_MASK) +#define DESC_ID(sv) ((sv) & DESC_ID_MASK) +#define FAILED_LPOS 0x1 +#define NO_LPOS 0x3 #define FAILED_BLK_LPOS \ { \ @@ -213,7 +222,7 @@ struct prb_reserved_entry { */ #define BLK0_LPOS(sz_bits) (-(_DATA_SIZE(sz_bits))) #define DESC0_ID(ct_bits) DESC_ID(-(_DESCS_COUNT(ct_bits) + 1)) -#define DESC0_SV(ct_bits) (DESC_COMMITTED_MASK | DESC_REUSE_MASK | DESC0_ID(ct_bits)) +#define DESC0_SV(ct_bits) DESC_SV(DESC0_ID(ct_bits), desc_reusable) /* * Define a ringbuffer with an external text data buffer. The same as -- cgit v1.2.3 From 4cfc7258f876a7feba673ac6d050f525b39cc84c Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 14 Sep 2020 14:39:53 +0206 Subject: printk: ringbuffer: add finalization/extension support Add support for extending the newest data block. For this, introduce a new finalization state (desc_finalized) denoting a committed descriptor that cannot be extended. Until a record is finalized, a writer can reopen that record to append new data. Reopening a record means transitioning from the desc_committed state back to the desc_reserved state. A writer can explicitly finalize a record if there is no intention of extending it. Also, records are automatically finalized when a new record is reserved. This relieves writers of needing to explicitly finalize while also making such records available to readers sooner. (Readers can only traverse finalized records.) Four new memory barrier pairs are introduced. Two of them are insignificant additions (data_realloc:A/desc_read:D and data_realloc:A/data_push_tail:B) because they are alternate path memory barriers that exactly match the purpose, pairing, and context of the two existing memory barrier pairs they provide an alternate path for. The other two new memory barrier pairs are significant additions: desc_reopen_last:A / _prb_commit:B - When reopening a descriptor, ensure the state transitions back to desc_reserved before fully trusting the descriptor data. _prb_commit:B / desc_reserve:D - When committing a descriptor, ensure the state transitions to desc_committed before checking the head ID to see if the descriptor needs to be finalized. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200914123354.832-6-john.ogness@linutronix.de --- kernel/printk/printk_ringbuffer.c | 525 ++++++++++++++++++++++++++++++++++---- kernel/printk/printk_ringbuffer.h | 6 +- 2 files changed, 476 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index c0d31185ccbf..f4e2e9890e0f 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -46,20 +46,26 @@ * into a single descriptor field named @state_var, allowing ID and state to * be synchronously and atomically updated. * - * Descriptors have three states: + * Descriptors have four states: * * reserved * A writer is modifying the record. * * committed - * The record and all its data are complete and available for reading. + * The record and all its data are written. A writer can reopen the + * descriptor (transitioning it back to reserved), but in the committed + * state the data is consistent. + * + * finalized + * The record and all its data are complete and available for reading. A + * writer cannot reopen the descriptor. * * reusable * The record exists, but its text and/or dictionary data may no longer * be available. * * Querying the @state_var of a record requires providing the ID of the - * descriptor to query. This can yield a possible fourth (pseudo) state: + * descriptor to query. This can yield a possible fifth (pseudo) state: * * miss * The descriptor being queried has an unexpected ID. @@ -79,6 +85,28 @@ * committed or reusable queried state. This makes it possible that a valid * sequence number of the tail is always available. * + * Descriptor Finalization + * ~~~~~~~~~~~~~~~~~~~~~~~ + * When a writer calls the commit function prb_commit(), record data is + * fully stored and is consistent within the ringbuffer. However, a writer can + * reopen that record, claiming exclusive access (as with prb_reserve()), and + * modify that record. When finished, the writer must again commit the record. + * + * In order for a record to be made available to readers (and also become + * recyclable for writers), it must be finalized. A finalized record cannot be + * reopened and can never become "unfinalized". Record finalization can occur + * in three different scenarios: + * + * 1) A writer can simultaneously commit and finalize its record by calling + * prb_final_commit() instead of prb_commit(). + * + * 2) When a new record is reserved and the previous record has been + * committed via prb_commit(), that previous record is automatically + * finalized. + * + * 3) When a record is committed via prb_commit() and a newer record + * already exists, the record being committed is automatically finalized. + * * Data Rings * ~~~~~~~~~~ * The two data rings (text and dictionary) function identically. They exist @@ -97,7 +125,7 @@ * are met: * * 1) The descriptor associated with the data block is in the committed - * queried state. + * or finalized queried state. * * 2) The blk_lpos struct within the descriptor associated with the data * block references back to the same data block. @@ -156,9 +184,38 @@ * * r.info->ts_nsec = local_clock(); * + * prb_final_commit(&e); + * } + * + * Note that additional writer functions are available to extend a record + * after it has been committed but not yet finalized. This can be done as + * long as no new records have been reserved and the caller is the same. + * + * Sample writer code (record extending):: + * + * // alternate rest of previous example + * r.info->ts_nsec = local_clock(); + * r.info->text_len = strlen(textstr); + * r.info->caller_id = printk_caller_id(); + * + * // commit the record (but do not finalize yet) * prb_commit(&e); * } * + * ... + * + * // specify additional 5 bytes text space to extend + * prb_rec_init_wr(&r, 5, 0); + * + * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id())) { + * snprintf(&r.text_buf[r.info->text_len], + * r.text_buf_size - r.info->text_len, "hello"); + * + * r.info->text_len += 5; + * + * prb_final_commit(&e); + * } + * * Sample reader code:: * * struct printk_info info; @@ -236,15 +293,21 @@ * desc_reserve:F / desc_read:D * set new descriptor id and reserved (state), then allow writer changes * - * data_alloc:A / desc_read:D + * data_alloc:A (or data_realloc:A) / desc_read:D * set old descriptor reusable (state), then modify new data block area * - * data_alloc:A / data_push_tail:B + * data_alloc:A (or data_realloc:A) / data_push_tail:B * push data tail (lpos), then modify new data block area * - * prb_commit:B / desc_read:B + * _prb_commit:B / desc_read:B * store writer changes, then set new descriptor committed (state) * + * desc_reopen_last:A / _prb_commit:B + * set descriptor reserved (state), then read descriptor data + * + * _prb_commit:B / desc_reserve:D + * set new descriptor committed (state), then check descriptor head (id) + * * data_push_tail:D / data_push_tail:A * set descriptor reusable (state), then push data tail (lpos) * @@ -386,16 +449,16 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, /* * Guarantee the state is loaded before copying the descriptor * content. This avoids copying obsolete descriptor content that might - * not apply to the descriptor state. This pairs with prb_commit:B. + * not apply to the descriptor state. This pairs with _prb_commit:B. * * Memory barrier involvement: * - * If desc_read:A reads from prb_commit:B, then desc_read:C reads - * from prb_commit:A. + * If desc_read:A reads from _prb_commit:B, then desc_read:C reads + * from _prb_commit:A. * * Relies on: * - * WMB from prb_commit:A to prb_commit:B + * WMB from _prb_commit:A to _prb_commit:B * matching * RMB from desc_read:A to desc_read:C */ @@ -431,7 +494,8 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, * * 2. Guarantee the record data is loaded before re-checking the * state. This avoids reading an obsolete descriptor state that may - * not apply to the copied data. This pairs with data_alloc:A. + * not apply to the copied data. This pairs with data_alloc:A and + * data_realloc:A. * * Memory barrier involvement: * @@ -463,19 +527,19 @@ out: } /* - * Take a specified descriptor out of the committed state by attempting - * the transition from committed to reusable. Either this context or some + * Take a specified descriptor out of the finalized state by attempting + * the transition from finalized to reusable. Either this context or some * other context will have been successful. */ static void desc_make_reusable(struct prb_desc_ring *desc_ring, unsigned long id) { - unsigned long val_committed = DESC_SV(id, desc_committed); + unsigned long val_finalized = DESC_SV(id, desc_finalized); unsigned long val_reusable = DESC_SV(id, desc_reusable); struct prb_desc *desc = to_desc(desc_ring, id); atomic_long_t *state_var = &desc->state_var; - atomic_long_cmpxchg_relaxed(state_var, val_committed, + atomic_long_cmpxchg_relaxed(state_var, val_finalized, val_reusable); /* LMM(desc_make_reusable:A) */ } @@ -484,7 +548,7 @@ static void desc_make_reusable(struct prb_desc_ring *desc_ring, * data block from @lpos_begin until @lpos_end into the reusable state. * * If there is any problem making the associated descriptor reusable, either - * the descriptor has not yet been committed or another writer context has + * the descriptor has not yet been finalized or another writer context has * already pushed the tail lpos past the problematic data block. Regardless, * on error the caller can re-load the tail lpos to determine the situation. */ @@ -528,10 +592,10 @@ static bool data_make_reusable(struct printk_ringbuffer *rb, switch (d_state) { case desc_miss: - return false; case desc_reserved: - return false; case desc_committed: + return false; + case desc_finalized: /* * This data block is invalid if the descriptor * does not point back to it. @@ -616,7 +680,7 @@ static bool data_push_tail(struct printk_ringbuffer *rb, * data_make_reusable() may be due to a newly * recycled data area causing the tail lpos to * have been previously pushed. This pairs with - * data_alloc:A. + * data_alloc:A and data_realloc:A. * * Memory barrier involvement: * @@ -729,8 +793,9 @@ static bool desc_push_tail(struct printk_ringbuffer *rb, */ return true; case desc_reserved: - return false; case desc_committed: + return false; + case desc_finalized: desc_make_reusable(desc_ring, tail_id); break; case desc_reusable: @@ -751,7 +816,7 @@ static bool desc_push_tail(struct printk_ringbuffer *rb, /* * Check the next descriptor after @tail_id before pushing the tail - * to it because the tail must always be in a committed or reusable + * to it because the tail must always be in a finalized or reusable * state. The implementation of prb_first_seq() relies on this. * * A successful read implies that the next descriptor is less than or @@ -760,7 +825,7 @@ static bool desc_push_tail(struct printk_ringbuffer *rb, */ d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc); /* LMM(desc_push_tail:A) */ - if (d_state == desc_committed || d_state == desc_reusable) { + if (d_state == desc_finalized || d_state == desc_reusable) { /* * Guarantee any descriptor states that have transitioned to * reusable are stored before pushing the tail ID. This allows @@ -895,6 +960,10 @@ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) * another CPU may have pushed the tail ID. This pairs * with desc_push_tail:C and this also pairs with * prb_first_seq:C. + * + * 5. Guarantee the head ID is stored before trying to + * finalize the previous descriptor. This pairs with + * _prb_commit:B. */ } while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id, id)); /* LMM(desc_reserve:D) */ @@ -1024,6 +1093,84 @@ static char *data_alloc(struct printk_ringbuffer *rb, return &blk->data[0]; } +/* + * Try to resize an existing data block associated with the descriptor + * specified by @id. If the resized data block should become wrapped, it + * copies the old data to the new data block. If @size yields a data block + * with the same or less size, the data block is left as is. + * + * Fail if this is not the last allocated data block or if there is not + * enough space or it is not possible make enough space. + * + * Return a pointer to the beginning of the entire data buffer or NULL on + * failure. + */ +static char *data_realloc(struct printk_ringbuffer *rb, + struct prb_data_ring *data_ring, unsigned int size, + struct prb_data_blk_lpos *blk_lpos, unsigned long id) +{ + struct prb_data_block *blk; + unsigned long head_lpos; + unsigned long next_lpos; + bool wrapped; + + /* Reallocation only works if @blk_lpos is the newest data block. */ + head_lpos = atomic_long_read(&data_ring->head_lpos); + if (head_lpos != blk_lpos->next) + return NULL; + + /* Keep track if @blk_lpos was a wrapping data block. */ + wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next)); + + size = to_blk_size(size); + + next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size); + + /* If the data block does not increase, there is nothing to do. */ + if (head_lpos - next_lpos < DATA_SIZE(data_ring)) { + blk = to_block(data_ring, blk_lpos->begin); + return &blk->data[0]; + } + + if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring))) + return NULL; + + /* The memory barrier involvement is the same as data_alloc:A. */ + if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos, + next_lpos)) { /* LMM(data_realloc:A) */ + return NULL; + } + + blk = to_block(data_ring, blk_lpos->begin); + + if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) { + struct prb_data_block *old_blk = blk; + + /* Wrapping data blocks store their data at the beginning. */ + blk = to_block(data_ring, 0); + + /* + * Store the ID on the wrapped block for consistency. + * The printk_ringbuffer does not actually use it. + */ + blk->id = id; + + if (!wrapped) { + /* + * Since the allocated space is now in the newly + * created wrapping data block, copy the content + * from the old data block. + */ + memcpy(&blk->data[0], &old_blk->data[0], + (blk_lpos->next - blk_lpos->begin) - sizeof(blk->id)); + } + } + + blk_lpos->next = next_lpos; + + return &blk->data[0]; +} + /* Return the number of bytes used by a data block. */ static unsigned int space_used(struct prb_data_ring *data_ring, struct prb_data_blk_lpos *blk_lpos) @@ -1104,6 +1251,206 @@ static const char *get_data(struct prb_data_ring *data_ring, return &db->data[0]; } +/* + * Attempt to transition the newest descriptor from committed back to reserved + * so that the record can be modified by a writer again. This is only possible + * if the descriptor is not yet finalized and the provided @caller_id matches. + */ +static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, + u32 caller_id, unsigned long *id_out) +{ + unsigned long prev_state_val; + enum desc_state d_state; + struct prb_desc desc; + struct prb_desc *d; + unsigned long id; + + id = atomic_long_read(&desc_ring->head_id); + + /* + * To reduce unnecessarily reopening, first check if the descriptor + * state and caller ID are correct. + */ + d_state = desc_read(desc_ring, id, &desc); + if (d_state != desc_committed || desc.info.caller_id != caller_id) + return NULL; + + d = to_desc(desc_ring, id); + + prev_state_val = DESC_SV(id, desc_committed); + + /* + * Guarantee the reserved state is stored before reading any + * record data. A full memory barrier is needed because @state_var + * modification is followed by reading. This pairs with _prb_commit:B. + * + * Memory barrier involvement: + * + * If desc_reopen_last:A reads from _prb_commit:B, then + * prb_reserve_in_last:A reads from _prb_commit:A. + * + * Relies on: + * + * WMB from _prb_commit:A to _prb_commit:B + * matching + * MB If desc_reopen_last:A to prb_reserve_in_last:A + */ + if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, + DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */ + return NULL; + } + + *id_out = id; + return d; +} + +/** + * prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer + * used by the newest record. + * + * @e: The entry structure to setup. + * @rb: The ringbuffer to re-reserve and extend data in. + * @r: The record structure to allocate buffers for. + * @caller_id: The caller ID of the caller (reserving writer). + * + * This is the public function available to writers to re-reserve and extend + * data. + * + * The writer specifies the text size to extend (not the new total size) by + * setting the @text_buf_size field of @r. Extending dictionaries is not + * supported, so @dict_buf_size of @r should be set to 0. To ensure proper + * initialization of @r, prb_rec_init_wr() should be used. + * + * This function will fail if @caller_id does not match the caller ID of the + * newest record. In that case the caller must reserve new data using + * prb_reserve(). + * + * Context: Any context. Disables local interrupts on success. + * Return: true if text data could be extended, otherwise false. + * + * On success: + * + * - @r->text_buf points to the beginning of the entire text buffer. + * + * - @r->text_buf_size is set to the new total size of the buffer. + * + * - @r->dict_buf and @r->dict_buf_size are cleared because extending + * the dict buffer is not supported. + * + * - @r->info is not touched so that @r->info->text_len could be used + * to append the text. + * + * - prb_record_text_space() can be used on @e to query the new + * actually used space. + * + * Important: All @r->info fields will already be set with the current values + * for the record. I.e. @r->info->text_len will be less than + * @text_buf_size and @r->info->dict_len may be set, even though + * @dict_buf_size is 0. Writers can use @r->info->text_len to know + * where concatenation begins and writers should update + * @r->info->text_len after concatenating. + */ +bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, + struct printk_record *r, u32 caller_id) +{ + unsigned int data_size; + struct prb_desc *d; + unsigned long id; + + local_irq_save(e->irqflags); + + /* Transition the newest descriptor back to the reserved state. */ + d = desc_reopen_last(&rb->desc_ring, caller_id, &id); + if (!d) { + local_irq_restore(e->irqflags); + goto fail_reopen; + } + + /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */ + + /* + * Set the @e fields here so that prb_commit() can be used if + * anything fails from now on. + */ + e->rb = rb; + e->id = id; + + /* + * desc_reopen_last() checked the caller_id, but there was no + * exclusive access at that point. The descriptor may have + * changed since then. + */ + if (caller_id != d->info.caller_id) + goto fail; + + if (BLK_DATALESS(&d->text_blk_lpos)) { + if (WARN_ON_ONCE(d->info.text_len != 0)) { + pr_warn_once("wrong text_len value (%hu, expecting 0)\n", + d->info.text_len); + d->info.text_len = 0; + } + + if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) + goto fail; + + r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, + &d->text_blk_lpos, id); + } else { + if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size)) + goto fail; + + /* + * Increase the buffer size to include the original size. If + * the meta data (@text_len) is not sane, use the full data + * block size. + */ + if (WARN_ON_ONCE(d->info.text_len > data_size)) { + pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n", + d->info.text_len, data_size); + d->info.text_len = data_size; + } + r->text_buf_size += d->info.text_len; + + if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) + goto fail; + + r->text_buf = data_realloc(rb, &rb->text_data_ring, r->text_buf_size, + &d->text_blk_lpos, id); + } + if (r->text_buf_size && !r->text_buf) + goto fail; + + /* Although dictionary data may be in use, it cannot be extended. */ + r->dict_buf = NULL; + r->dict_buf_size = 0; + + r->info = &d->info; + + e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); + + return true; +fail: + prb_commit(e); + /* prb_commit() re-enabled interrupts. */ +fail_reopen: + /* Make it clear to the caller that the re-reserve failed. */ + memset(r, 0, sizeof(*r)); + return false; +} + +/* + * Attempt to finalize a specified descriptor. If this fails, the descriptor + * is either already final or it will finalize itself when the writer commits. + */ +static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id) +{ + unsigned long prev_state_val = DESC_SV(id, desc_committed); + struct prb_desc *d = to_desc(desc_ring, id); + + atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val, + DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */ +} + /** * prb_reserve() - Reserve space in the ringbuffer. * @@ -1197,6 +1544,15 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, else d->info.seq = seq + DESCS_COUNT(desc_ring); + /* + * New data is about to be reserved. Once that happens, previous + * descriptors are no longer able to be extended. Finalize the + * previous descriptor now so that it can be made available to + * readers. (For seq==0 there is no previous descriptor.) + */ + if (d->info.seq > 0) + desc_make_final(desc_ring, DESC_ID(id - 1)); + r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, &d->text_blk_lpos, id); /* If text data allocation fails, a data-less record is committed. */ @@ -1227,33 +1583,40 @@ fail: return false; } -/** - * prb_commit() - Commit (previously reserved) data to the ringbuffer. - * - * @e: The entry containing the reserved data information. - * - * This is the public function available to writers to commit data. - * - * Context: Any context. Enables local interrupts. - */ -void prb_commit(struct prb_reserved_entry *e) +/* Commit the data (possibly finalizing it) and restore interrupts. */ +static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val) { struct prb_desc_ring *desc_ring = &e->rb->desc_ring; struct prb_desc *d = to_desc(desc_ring, e->id); unsigned long prev_state_val = DESC_SV(e->id, desc_reserved); - /* Now the writer has finished all writing: LMM(prb_commit:A) */ + /* Now the writer has finished all writing: LMM(_prb_commit:A) */ /* * Set the descriptor as committed. See "ABA Issues" about why * cmpxchg() instead of set() is used. * - * Guarantee all record data is stored before the descriptor state - * is stored as committed. A write memory barrier is sufficient for - * this. This pairs with desc_read:B. + * 1 Guarantee all record data is stored before the descriptor state + * is stored as committed. A write memory barrier is sufficient + * for this. This pairs with desc_read:B and desc_reopen_last:A. + * + * 2. Guarantee the descriptor state is stored as committed before + * re-checking the head ID in order to possibly finalize this + * descriptor. This pairs with desc_reserve:D. + * + * Memory barrier involvement: + * + * If prb_commit:A reads from desc_reserve:D, then + * desc_make_final:A reads from _prb_commit:B. + * + * Relies on: + * + * MB _prb_commit:B to prb_commit:A + * matching + * MB desc_reserve:D to desc_make_final:A */ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, - DESC_SV(e->id, desc_committed))) { /* LMM(prb_commit:B) */ + DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */ WARN_ON_ONCE(1); } @@ -1261,6 +1624,59 @@ void prb_commit(struct prb_reserved_entry *e) local_irq_restore(e->irqflags); } +/** + * prb_commit() - Commit (previously reserved) data to the ringbuffer. + * + * @e: The entry containing the reserved data information. + * + * This is the public function available to writers to commit data. + * + * Note that the data is not yet available to readers until it is finalized. + * Finalizing happens automatically when space for the next record is + * reserved. + * + * See prb_final_commit() for a version of this function that finalizes + * immediately. + * + * Context: Any context. Enables local interrupts. + */ +void prb_commit(struct prb_reserved_entry *e) +{ + struct prb_desc_ring *desc_ring = &e->rb->desc_ring; + unsigned long head_id; + + _prb_commit(e, desc_committed); + + /* + * If this descriptor is no longer the head (i.e. a new record has + * been allocated), extending the data for this record is no longer + * allowed and therefore it must be finalized. + */ + head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */ + if (head_id != e->id) + desc_make_final(desc_ring, e->id); +} + +/** + * prb_final_commit() - Commit and finalize (previously reserved) data to + * the ringbuffer. + * + * @e: The entry containing the reserved data information. + * + * This is the public function available to writers to commit+finalize data. + * + * By finalizing, the data is made immediately available to readers. + * + * This function should only be used if there are no intentions of extending + * this data using prb_reserve_in_last(). + * + * Context: Any context. Enables local interrupts. + */ +void prb_final_commit(struct prb_reserved_entry *e) +{ + _prb_commit(e, desc_finalized); +} + /* * Count the number of lines in provided text. All text has at least 1 line * (even if @text_size is 0). Each '\n' processed is counted as an additional @@ -1312,7 +1728,7 @@ static bool copy_data(struct prb_data_ring *data_ring, * because of the trailing alignment padding. */ if (WARN_ON_ONCE(data_size < (unsigned int)len)) { - pr_warn_once("wrong data size (%u, expecting %hu) for data: %.*s\n", + pr_warn_once("wrong data size (%u, expecting >=%hu) for data: %.*s\n", data_size, len, data_size, data); return false; } @@ -1333,16 +1749,16 @@ static bool copy_data(struct prb_data_ring *data_ring, /* * This is an extended version of desc_read(). It gets a copy of a specified - * descriptor. However, it also verifies that the record is committed and has + * descriptor. However, it also verifies that the record is finalized and has * the sequence number @seq. On success, 0 is returned. * * Error return values: - * -EINVAL: A committed record with sequence number @seq does not exist. - * -ENOENT: A committed record with sequence number @seq exists, but its data + * -EINVAL: A finalized record with sequence number @seq does not exist. + * -ENOENT: A finalized record with sequence number @seq exists, but its data * is not available. This is a valid record, so readers should * continue with the next record. */ -static int desc_read_committed_seq(struct prb_desc_ring *desc_ring, +static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring, unsigned long id, u64 seq, struct prb_desc *desc_out) { @@ -1353,11 +1769,12 @@ static int desc_read_committed_seq(struct prb_desc_ring *desc_ring, /* * An unexpected @id (desc_miss) or @seq mismatch means the record - * does not exist. A descriptor in the reserved state means the - * record does not yet exist for the reader. + * does not exist. A descriptor in the reserved or committed state + * means the record does not yet exist for the reader. */ if (d_state == desc_miss || d_state == desc_reserved || + d_state == desc_committed || desc_out->info.seq != seq) { return -EINVAL; } @@ -1379,7 +1796,7 @@ static int desc_read_committed_seq(struct prb_desc_ring *desc_ring, * Copy the ringbuffer data from the record with @seq to the provided * @r buffer. On success, 0 is returned. * - * See desc_read_committed_seq() for error return values. + * See desc_read_finalized_seq() for error return values. */ static int prb_read(struct printk_ringbuffer *rb, u64 seq, struct printk_record *r, unsigned int *line_count) @@ -1395,7 +1812,7 @@ static int prb_read(struct printk_ringbuffer *rb, u64 seq, id = DESC_ID(atomic_long_read(state_var)); /* Get a local copy of the correct descriptor (if available). */ - err = desc_read_committed_seq(desc_ring, id, seq, &desc); + err = desc_read_finalized_seq(desc_ring, id, seq, &desc); /* * If @r is NULL, the caller is only interested in the availability @@ -1425,8 +1842,8 @@ static int prb_read(struct printk_ringbuffer *rb, u64 seq, r->info->dict_len = 0; } - /* Ensure the record is still committed and has the same @seq. */ - return desc_read_committed_seq(desc_ring, id, seq, &desc); + /* Ensure the record is still finalized and has the same @seq. */ + return desc_read_finalized_seq(desc_ring, id, seq, &desc); } /* Get the sequence number of the tail descriptor. */ @@ -1444,9 +1861,9 @@ static u64 prb_first_seq(struct printk_ringbuffer *rb) /* * This loop will not be infinite because the tail is - * _always_ in the committed or reusable state. + * _always_ in the finalized or reusable state. */ - if (d_state == desc_committed || d_state == desc_reusable) + if (d_state == desc_finalized || d_state == desc_reusable) break; /* @@ -1473,8 +1890,8 @@ static u64 prb_first_seq(struct printk_ringbuffer *rb) } /* - * Non-blocking read of a record. Updates @seq to the last committed record - * (which may have no data). + * Non-blocking read of a record. Updates @seq to the last finalized record + * (which may have no data available). * * See the description of prb_read_valid() and prb_read_valid_info() * for details. @@ -1500,7 +1917,7 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, (*seq)++; } else { - /* Non-existent/non-committed record. Must stop. */ + /* Non-existent/non-finalized record. Must stop. */ return false; } } diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index a9d85a6727b1..853ea62dc5f2 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -116,7 +116,8 @@ struct prb_reserved_entry { enum desc_state { desc_miss = -1, /* ID mismatch (pseudo state) */ desc_reserved = 0x0, /* reserved, in use by writer */ - desc_committed = 0x1, /* committed by writer */ + desc_committed = 0x1, /* committed by writer, could get reopened */ + desc_finalized = 0x2, /* committed, no further modification allowed */ desc_reusable = 0x3, /* free, not yet used by any writer */ }; @@ -327,7 +328,10 @@ static inline void prb_rec_init_wr(struct printk_record *r, bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r); +bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, + struct printk_record *r, u32 caller_id); void prb_commit(struct prb_reserved_entry *e); +void prb_final_commit(struct prb_reserved_entry *e); void prb_init(struct printk_ringbuffer *rb, char *text_buf, unsigned int text_buf_size, -- cgit v1.2.3 From f5f022e53b874f978dda23847173cbf2589b07f5 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 14 Sep 2020 14:39:54 +0206 Subject: printk: reimplement log_cont using record extension Use the record extending feature of the ringbuffer to implement continuous messages. This preserves the existing continuous message behavior. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200914123354.832-7-john.ogness@linutronix.de --- kernel/printk/printk.c | 98 +++++++++++--------------------------------------- 1 file changed, 20 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 964b5701688f..9a2e23191576 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -535,7 +535,10 @@ static int log_store(u32 caller_id, int facility, int level, r.info->caller_id = caller_id; /* insert message */ - prb_commit(&e); + if ((flags & LOG_CONT) || !(flags & LOG_NEWLINE)) + prb_commit(&e); + else + prb_final_commit(&e); return (text_len + trunc_msg_len); } @@ -1084,7 +1087,7 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, dest_r.info->ts_nsec = r->info->ts_nsec; dest_r.info->caller_id = r->info->caller_id; - prb_commit(&e); + prb_final_commit(&e); return prb_record_text_space(&e); } @@ -1884,87 +1887,26 @@ static inline u32 printk_caller_id(void) 0x80000000 + raw_smp_processor_id(); } -/* - * Continuation lines are buffered, and not committed to the record buffer - * until the line is complete, or a race forces it. The line fragments - * though, are printed immediately to the consoles to ensure everything has - * reached the console in case of a kernel crash. - */ -static struct cont { - char buf[LOG_LINE_MAX]; - size_t len; /* length == 0 means unused buffer */ - u32 caller_id; /* printk_caller_id() of first print */ - u64 ts_nsec; /* time of first print */ - u8 level; /* log level of first message */ - u8 facility; /* log facility of first message */ - enum log_flags flags; /* prefix, newline flags */ -} cont; - -static void cont_flush(void) -{ - if (cont.len == 0) - return; - - log_store(cont.caller_id, cont.facility, cont.level, cont.flags, - cont.ts_nsec, NULL, 0, cont.buf, cont.len); - cont.len = 0; -} - -static bool cont_add(u32 caller_id, int facility, int level, - enum log_flags flags, const char *text, size_t len) -{ - /* If the line gets too long, split it up in separate records. */ - if (cont.len + len > sizeof(cont.buf)) { - cont_flush(); - return false; - } - - if (!cont.len) { - cont.facility = facility; - cont.level = level; - cont.caller_id = caller_id; - cont.ts_nsec = local_clock(); - cont.flags = flags; - } - - memcpy(cont.buf + cont.len, text, len); - cont.len += len; - - // The original flags come from the first line, - // but later continuations can add a newline. - if (flags & LOG_NEWLINE) { - cont.flags |= LOG_NEWLINE; - cont_flush(); - } - - return true; -} - static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) { const u32 caller_id = printk_caller_id(); - /* - * If an earlier line was buffered, and we're a continuation - * write from the same context, try to add it to the buffer. - */ - if (cont.len) { - if (cont.caller_id == caller_id && (lflags & LOG_CONT)) { - if (cont_add(caller_id, facility, level, lflags, text, text_len)) - return text_len; - } - /* Otherwise, make sure it's flushed */ - cont_flush(); - } - - /* Skip empty continuation lines that couldn't be added - they just flush */ - if (!text_len && (lflags & LOG_CONT)) - return 0; - - /* If it doesn't end in a newline, try to buffer the current line */ - if (!(lflags & LOG_NEWLINE)) { - if (cont_add(caller_id, facility, level, lflags, text, text_len)) + if (lflags & LOG_CONT) { + struct prb_reserved_entry e; + struct printk_record r; + + prb_rec_init_wr(&r, text_len, 0); + if (prb_reserve_in_last(&e, prb, &r, caller_id)) { + memcpy(&r.text_buf[r.info->text_len], text, text_len); + r.info->text_len += text_len; + if (lflags & LOG_NEWLINE) { + r.info->flags |= LOG_NEWLINE; + prb_final_commit(&e); + } else { + prb_commit(&e); + } return text_len; + } } /* Store it in the record log */ -- cgit v1.2.3 From ce880cb825fcc22d4e39046a6c3a3a7f6603883d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 15 Sep 2020 17:44:01 -0700 Subject: bpf: Fix a rcu warning for bpffs map pretty-print Running selftest ./btf_btf -p the kernel had the following warning: [ 51.528185] WARNING: CPU: 3 PID: 1756 at kernel/bpf/hashtab.c:717 htab_map_get_next_key+0x2eb/0x300 [ 51.529217] Modules linked in: [ 51.529583] CPU: 3 PID: 1756 Comm: test_btf Not tainted 5.9.0-rc1+ #878 [ 51.530346] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.el7.centos 04/01/2014 [ 51.531410] RIP: 0010:htab_map_get_next_key+0x2eb/0x300 ... [ 51.542826] Call Trace: [ 51.543119] map_seq_next+0x53/0x80 [ 51.543528] seq_read+0x263/0x400 [ 51.543932] vfs_read+0xad/0x1c0 [ 51.544311] ksys_read+0x5f/0xe0 [ 51.544689] do_syscall_64+0x33/0x40 [ 51.545116] entry_SYSCALL_64_after_hwframe+0x44/0xa9 The related source code in kernel/bpf/hashtab.c: 709 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) 710 { 711 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 712 struct hlist_nulls_head *head; 713 struct htab_elem *l, *next_l; 714 u32 hash, key_size; 715 int i = 0; 716 717 WARN_ON_ONCE(!rcu_read_lock_held()); In kernel/bpf/inode.c, bpffs map pretty print calls map->ops->map_get_next_key() without holding a rcu_read_lock(), hence causing the above warning. To fix the issue, just surrounding map->ops->map_get_next_key() with rcu read lock. Fixes: a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap") Reported-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Cc: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200916004401.146277-1-yhs@fb.com --- kernel/bpf/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index fb878ba3f22f..18f4969552ac 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -226,10 +226,12 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) else prev_key = key; + rcu_read_lock(); if (map->ops->map_get_next_key(map, prev_key, key)) { map_iter(m)->done = true; - return NULL; + key = NULL; } + rcu_read_unlock(); return key; } -- cgit v1.2.3 From 984fe94f94756dacb3c8cc52904a23adf9e04da1 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Tue, 15 Sep 2020 16:45:39 -0700 Subject: bpf: Mutex protect used_maps array and count To support modifying the used_maps array, we use a mutex to protect the use of the counter and the array. The mutex is initialized right after the prog aux is allocated, and destroyed right before prog aux is freed. This way we guarantee it's initialized for both cBPF and eBPF. Signed-off-by: YiFei Zhu Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Cc: YiFei Zhu Link: https://lore.kernel.org/bpf/20200915234543.3220146-2-sdf@google.com --- kernel/bpf/core.c | 15 +++++++++++---- kernel/bpf/syscall.c | 16 ++++++++++++---- 2 files changed, 23 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ed0b3578867c..2a20c2833996 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -98,6 +98,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->jit_requested = ebpf_jit_enabled(); INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); + mutex_init(&fp->aux->used_maps_mutex); return fp; } @@ -253,6 +254,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, void __bpf_prog_free(struct bpf_prog *fp) { if (fp->aux) { + mutex_destroy(&fp->aux->used_maps_mutex); free_percpu(fp->aux->stats); kfree(fp->aux->poke_tab); kfree(fp->aux); @@ -1747,8 +1749,9 @@ bool bpf_prog_array_compatible(struct bpf_array *array, static int bpf_check_tail_call(const struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; - int i; + int i, ret = 0; + mutex_lock(&aux->used_maps_mutex); for (i = 0; i < aux->used_map_cnt; i++) { struct bpf_map *map = aux->used_maps[i]; struct bpf_array *array; @@ -1757,11 +1760,15 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) continue; array = container_of(map, struct bpf_array, map); - if (!bpf_prog_array_compatible(array, fp)) - return -EINVAL; + if (!bpf_prog_array_compatible(array, fp)) { + ret = -EINVAL; + goto out; + } } - return 0; +out: + mutex_unlock(&aux->used_maps_mutex); + return ret; } static void bpf_prog_select_func(struct bpf_prog *fp) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4108ef3b828b..a67b8c6746be 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3162,21 +3162,25 @@ static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, const struct bpf_map *map; int i; + mutex_lock(&prog->aux->used_maps_mutex); for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { map = prog->aux->used_maps[i]; if (map == (void *)addr) { *type = BPF_PSEUDO_MAP_FD; - return map; + goto out; } if (!map->ops->map_direct_value_meta) continue; if (!map->ops->map_direct_value_meta(map, addr, off)) { *type = BPF_PSEUDO_MAP_VALUE; - return map; + goto out; } } + map = NULL; - return NULL; +out: + mutex_unlock(&prog->aux->used_maps_mutex); + return map; } static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, @@ -3294,6 +3298,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, memcpy(info.tag, prog->tag, sizeof(prog->tag)); memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); + mutex_lock(&prog->aux->used_maps_mutex); ulen = info.nr_map_ids; info.nr_map_ids = prog->aux->used_map_cnt; ulen = min_t(u32, info.nr_map_ids, ulen); @@ -3303,9 +3308,12 @@ static int bpf_prog_get_info_by_fd(struct file *file, for (i = 0; i < ulen; i++) if (put_user(prog->aux->used_maps[i]->id, - &user_map_ids[i])) + &user_map_ids[i])) { + mutex_unlock(&prog->aux->used_maps_mutex); return -EFAULT; + } } + mutex_unlock(&prog->aux->used_maps_mutex); err = set_info_rec_size(&info); if (err) -- cgit v1.2.3 From ef15314aa5de955c6afd87d512e8b00f5ac08d06 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Tue, 15 Sep 2020 16:45:40 -0700 Subject: bpf: Add BPF_PROG_BIND_MAP syscall This syscall binds a map to a program. Returns success if the map is already bound to the program. Signed-off-by: YiFei Zhu Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Cc: YiFei Zhu Link: https://lore.kernel.org/bpf/20200915234543.3220146-3-sdf@google.com --- kernel/bpf/syscall.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a67b8c6746be..2ce32cad5c8e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4161,6 +4161,66 @@ static int bpf_iter_create(union bpf_attr *attr) return err; } +#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags + +static int bpf_prog_bind_map(union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct bpf_map *map; + struct bpf_map **used_maps_old, **used_maps_new; + int i, ret = 0; + + if (CHECK_ATTR(BPF_PROG_BIND_MAP)) + return -EINVAL; + + if (attr->prog_bind_map.flags) + return -EINVAL; + + prog = bpf_prog_get(attr->prog_bind_map.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + map = bpf_map_get(attr->prog_bind_map.map_fd); + if (IS_ERR(map)) { + ret = PTR_ERR(map); + goto out_prog_put; + } + + mutex_lock(&prog->aux->used_maps_mutex); + + used_maps_old = prog->aux->used_maps; + + for (i = 0; i < prog->aux->used_map_cnt; i++) + if (used_maps_old[i] == map) + goto out_unlock; + + used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1, + sizeof(used_maps_new[0]), + GFP_KERNEL); + if (!used_maps_new) { + ret = -ENOMEM; + goto out_unlock; + } + + memcpy(used_maps_new, used_maps_old, + sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); + used_maps_new[prog->aux->used_map_cnt] = map; + + prog->aux->used_map_cnt++; + prog->aux->used_maps = used_maps_new; + + kfree(used_maps_old); + +out_unlock: + mutex_unlock(&prog->aux->used_maps_mutex); + + if (ret) + bpf_map_put(map); +out_prog_put: + bpf_prog_put(prog); + return ret; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -4294,6 +4354,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_LINK_DETACH: err = link_detach(&attr); break; + case BPF_PROG_BIND_MAP: + err = bpf_prog_bind_map(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From cdabce2e3dff7e4bcef73473987618569d178af3 Mon Sep 17 00:00:00 2001 From: Jiafei Pan Date: Fri, 14 Aug 2020 12:55:22 +0800 Subject: softirq: Add debug check to __raise_softirq_irqoff() __raise_softirq_irqoff() must be called with interrupts disabled to protect the per CPU softirq pending state update against an interrupt and soft interrupt handling on return from interrupt. Add a lockdep assertion to validate the calling convention. [ tglx: Massaged changelog ] Signed-off-by: Jiafei Pan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200814045522.45719-1-Jiafei.Pan@nxp.com --- kernel/softirq.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index bf88d7f62433..09229ad82209 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -481,6 +481,7 @@ void raise_softirq(unsigned int nr) void __raise_softirq_irqoff(unsigned int nr) { + lockdep_assert_irqs_disabled(); trace_softirq_raise(nr); or_softirq_pending(1UL << nr); } -- cgit v1.2.3 From e6b1a44eccfcab5e5e280be376f65478c3b2c7a2 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 15 Sep 2020 22:07:50 +0800 Subject: locking/percpu-rwsem: Use this_cpu_{inc,dec}() for read_count The __this_cpu*() accessors are (in general) IRQ-unsafe which, given that percpu-rwsem is a blocking primitive, should be just fine. However, file_end_write() is used from IRQ context and will cause load-store issues on architectures where the per-cpu accessors are not natively irq-safe. Fix it by using the IRQ-safe this_cpu_*() for operations on read_count. This will generate more expensive code on a number of platforms, which might cause a performance regression for some of the other percpu-rwsem users. If any such is reported, we can consider alternative solutions. Fixes: 70fe2f48152e ("aio: fix freeze protection of aio writes") Signed-off-by: Hou Tao Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Acked-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20200915140750.137881-1-houtao1@huawei.com --- kernel/locking/percpu-rwsem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 8bbafe3e5203..70a32a576f3f 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -45,7 +45,7 @@ EXPORT_SYMBOL_GPL(percpu_free_rwsem); static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { - __this_cpu_inc(*sem->read_count); + this_cpu_inc(*sem->read_count); /* * Due to having preemption disabled the decrement happens on @@ -71,7 +71,7 @@ static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem) if (likely(!atomic_read_acquire(&sem->block))) return true; - __this_cpu_dec(*sem->read_count); + this_cpu_dec(*sem->read_count); /* Prod writer to re-evaluate readers_active_check() */ rcuwait_wake_up(&sem->writer); -- cgit v1.2.3 From 13b90cadfc294718dd5a89e1fcf103477b01eb50 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:32 +0200 Subject: genirq/chip: Use the first chip in irq_chip_compose_msi_msg() The documentation of irq_chip_compose_msi_msg() claims that with hierarchical irq domains the first chip in the hierarchy which has an irq_compose_msi_msg() callback is chosen. But the code just keeps iterating after it finds a chip with a compose callback. The x86 HPET MSI implementation relies on that behaviour, but that does not make it more correct. The message should always be composed at the domain which manages the underlying resource (e.g. APIC or remap table) because that domain knows about the required layout of the message. On X86 the following hierarchies exist: 1) vector -------- PCI/MSI 2) vector -- IR -- PCI/MSI The vector domain has a different message format than the IR (remapping) domain. So obviously the PCI/MSI domain can't compose the message without having knowledge about the parent domain, which is exactly the opposite of what hierarchical domains want to achieve. X86 actually has two different PCI/MSI chips where #1 has a compose callback and #2 does not. #2 delegates the composition to the remap domain where it belongs, but #1 does it at the PCI/MSI level. For the upcoming device MSI support it's necessary to change this and just let the first domain which can compose the message take care of it. That way the top level chip does not have to worry about it and the device MSI code does not need special knowledge about topologies. It just sets the compose callback to NULL and lets the hierarchy pick the first chip which has one. Due to that the attempt to move the compose callback from the direct delivery PCI/MSI domain to the vector domain made the system fail to boot with interrupt remapping enabled because in the remapping case irq_chip_compose_msi_msg() keeps iterating and choses the compose callback of the vector domain which obviously creates the wrong format for the remap table. Break out of the loop when the first irq chip with a compose callback is found and fixup the HPET code temporarily. That workaround will be removed once the direct delivery compose callback is moved to the place where it belongs in the vector domain. Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20200826112331.047917603@linutronix.de --- kernel/irq/chip.c | 9 ++++----- kernel/irq/internals.h | 9 +++++++++ 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 857f5f4c8098..0ae308efa604 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1541,18 +1541,17 @@ EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent); */ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { - struct irq_data *pos = NULL; + struct irq_data *pos; -#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY - for (; data; data = data->parent_data) -#endif + for (pos = NULL; !pos && data; data = irqd_get_parent_data(data)) { if (data->chip && data->chip->irq_compose_msi_msg) pos = data; + } + if (!pos) return -ENOSYS; pos->chip->irq_compose_msi_msg(pos, msg); - return 0; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 7db284b10ac9..54363527feea 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -473,6 +473,15 @@ static inline void irq_domain_deactivate_irq(struct irq_data *data) } #endif +static inline struct irq_data *irqd_get_parent_data(struct irq_data *irqd) +{ +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + return irqd->parent_data; +#else + return NULL; +#endif +} + #ifdef CONFIG_GENERIC_IRQ_DEBUGFS #include -- cgit v1.2.3 From 9006c133a422f474d7d8e10a8baae179f70c22f5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:47 +0200 Subject: x86/msi: Use generic MSI domain ops pci_msi_get_hwirq() and pci_msi_set_desc are not longer special. Enable the generic MSI domain ops in the core and PCI MSI code unconditionally and get rid of the x86 specific implementations in the X86 MSI code and in the hyperv PCI driver. Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20200826112332.564274859@linutronix.de --- kernel/irq/msi.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index eb95f6106a1e..640668e3f870 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -187,7 +187,6 @@ static const struct irq_domain_ops msi_domain_ops = { .deactivate = msi_domain_deactivate, }; -#ifdef GENERIC_MSI_DOMAIN_OPS static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info, msi_alloc_info_t *arg) { @@ -206,11 +205,6 @@ static void msi_domain_ops_set_desc(msi_alloc_info_t *arg, { arg->desc = desc; } -#else -#define msi_domain_ops_get_hwirq NULL -#define msi_domain_ops_prepare NULL -#define msi_domain_ops_set_desc NULL -#endif /* !GENERIC_MSI_DOMAIN_OPS */ static int msi_domain_ops_init(struct irq_domain *domain, struct msi_domain_info *info, -- cgit v1.2.3 From c6c9e2838c5f0b94773511586123bcb125757f2a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:51 +0200 Subject: irqdomain/msi: Provide DOMAIN_BUS_VMD_MSI PCI devices behind a VMD bus are not subject to interrupt remapping, but the irq domain for VMD MSI cannot be distinguished from a regular PCI/MSI irq domain. Add a new domain bus token and allow it in the bus token check in msi_check_reservation_mode() to keep the functionality the same once VMD uses this token. Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Acked-by: Jon Derrick Link: https://lore.kernel.org/r/20200826112332.954409970@linutronix.de --- kernel/irq/msi.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 640668e3f870..616b9583cfbb 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -364,8 +364,13 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, { struct msi_desc *desc; - if (domain->bus_token != DOMAIN_BUS_PCI_MSI) + switch(domain->bus_token) { + case DOMAIN_BUS_PCI_MSI: + case DOMAIN_BUS_VMD_MSI: + break; + default: return false; + } if (!(info->flags & MSI_FLAG_MUST_REACTIVATE)) return false; -- cgit v1.2.3 From 43e9e705dd57c466c4bfe32ab8c17db537b89297 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 13:16:57 +0200 Subject: irqdomain/msi: Allow to override msi_domain_alloc/free_irqs() To support MSI irq domains which do not fit at all into the regular MSI irqdomain scheme, like the XEN MSI interrupt management for PV/HVM/DOM0, it's necessary to allow to override the alloc/free implementation. This is a preperatory step to switch X86 away from arch_*_msi_irqs() and store the irq domain pointer right in struct device. No functional change for existing MSI irq domain users. Aside of the evil XEN wrapper this is also useful for special MSI domains which need to do extra alloc/free work before/after calling the generic core function. Work like allocating/freeing MSI descriptors, MSI storage space etc. Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20200826112333.526797548@linutronix.de --- kernel/irq/msi.c | 70 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 616b9583cfbb..2c0c4d6d0f83 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -229,11 +229,13 @@ static int msi_domain_ops_check(struct irq_domain *domain, } static struct msi_domain_ops msi_domain_ops_default = { - .get_hwirq = msi_domain_ops_get_hwirq, - .msi_init = msi_domain_ops_init, - .msi_check = msi_domain_ops_check, - .msi_prepare = msi_domain_ops_prepare, - .set_desc = msi_domain_ops_set_desc, + .get_hwirq = msi_domain_ops_get_hwirq, + .msi_init = msi_domain_ops_init, + .msi_check = msi_domain_ops_check, + .msi_prepare = msi_domain_ops_prepare, + .set_desc = msi_domain_ops_set_desc, + .domain_alloc_irqs = __msi_domain_alloc_irqs, + .domain_free_irqs = __msi_domain_free_irqs, }; static void msi_domain_update_dom_ops(struct msi_domain_info *info) @@ -245,6 +247,14 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info) return; } + if (ops->domain_alloc_irqs == NULL) + ops->domain_alloc_irqs = msi_domain_ops_default.domain_alloc_irqs; + if (ops->domain_free_irqs == NULL) + ops->domain_free_irqs = msi_domain_ops_default.domain_free_irqs; + + if (!(info->flags & MSI_FLAG_USE_DEF_DOM_OPS)) + return; + if (ops->get_hwirq == NULL) ops->get_hwirq = msi_domain_ops_default.get_hwirq; if (ops->msi_init == NULL) @@ -278,8 +288,7 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, { struct irq_domain *domain; - if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) - msi_domain_update_dom_ops(info); + msi_domain_update_dom_ops(info); if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) msi_domain_update_chip_ops(info); @@ -386,17 +395,8 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit; } -/** - * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain - * @domain: The domain to allocate from - * @dev: Pointer to device struct of the device for which the interrupts - * are allocated - * @nvec: The number of interrupts to allocate - * - * Returns 0 on success or an error code. - */ -int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, - int nvec) +int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, + int nvec) { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; @@ -490,12 +490,24 @@ cleanup: } /** - * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev - * @domain: The domain to managing the interrupts + * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain + * @domain: The domain to allocate from * @dev: Pointer to device struct of the device for which the interrupts - * are free + * are allocated + * @nvec: The number of interrupts to allocate + * + * Returns 0 on success or an error code. */ -void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, + int nvec) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + + return ops->domain_alloc_irqs(domain, dev, nvec); +} + +void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) { struct msi_desc *desc; @@ -512,6 +524,20 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) } } +/** + * __msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev + * @domain: The domain to managing the interrupts + * @dev: Pointer to device struct of the device for which the interrupts + * are free + */ +void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + + return ops->domain_free_irqs(domain, dev); +} + /** * msi_get_domain_info - Get the MSI interrupt domain info for @domain * @domain: The interrupt domain to retrieve data from -- cgit v1.2.3 From 6731da9e0ff33d8c5f340705d118a27d3b817d1e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 Sep 2020 14:14:34 -0700 Subject: rcu-tasks: Mark variables static The n_heavy_reader_attempts, n_heavy_reader_updates, and n_heavy_reader_ofl_updates variables are not used outside of their translation unit, so this commit marks them static. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 05d3e1375e4c..978508ec39c1 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -745,9 +745,9 @@ static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); // The number of detections of task quiescent state relying on // heavyweight readers executing explicit memory barriers. -unsigned long n_heavy_reader_attempts; -unsigned long n_heavy_reader_updates; -unsigned long n_heavy_reader_ofl_updates; +static unsigned long n_heavy_reader_attempts; +static unsigned long n_heavy_reader_updates; +static unsigned long n_heavy_reader_ofl_updates; void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, -- cgit v1.2.3 From 2393a613d2e3da35bd73ee55d9dca0fb04810955 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 Sep 2020 21:36:34 -0700 Subject: rcu-tasks: Use more aggressive polling for RCU Tasks Trace The RCU Tasks Trace grace periods are too slow, as in 40x slower than those of RCU Tasks. This is due to my having assumed a one-second grace period was OK, and thus not having optimized any further. This commit provides the first step in this optimization process, namely by allowing the task_list scan backoff interval to be specified on a per-flavor basis, and then speeding up the scans for RCU Tasks Trace. However, kernels built with CONFIG_TASKS_TRACE_RCU_READ_MB=y continue to use the old slower backoff, consistent with that Kconfig option's goal of reducing IPIs. Link: https://lore.kernel.org/bpf/CAADnVQK_AiX+S_L_A4CQWT11XyveppBbQSQgH_qWGyzu_E8Yeg@mail.gmail.com/ Reported-by: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jiri Olsa Cc: Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 978508ec39c1..ad8c4f3f44d2 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -28,6 +28,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. * @gp_state: Grace period's most recent state transition (debugging). + * @init_fract: Initial backoff sleep interval. * @gp_jiffies: Time of last @gp_state transition. * @gp_start: Most recent grace-period start in jiffies. * @n_gps: Number of grace periods completed since boot. @@ -48,6 +49,7 @@ struct rcu_tasks { struct wait_queue_head cbs_wq; raw_spinlock_t cbs_lock; int gp_state; + int init_fract; unsigned long gp_jiffies; unsigned long gp_start; unsigned long n_gps; @@ -329,8 +331,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) */ lastreport = jiffies; - /* Start off with HZ/10 wait and slowly back off to 1 HZ wait. */ - fract = 10; + // Start off with initial wait and slowly back off to 1 HZ wait. + fract = rtp->init_fract; + if (fract > HZ) + fract = HZ; for (;;) { bool firstreport; @@ -553,6 +557,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_tasks); static int __init rcu_spawn_tasks_kthread(void) { + rcu_tasks.init_fract = 10; rcu_tasks.pregp_func = rcu_tasks_pregp_step; rcu_tasks.pertask_func = rcu_tasks_pertask; rcu_tasks.postscan_func = rcu_tasks_postscan; @@ -1163,6 +1168,13 @@ EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); static int __init rcu_spawn_tasks_trace_kthread(void) { + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) { + rcu_tasks_trace.init_fract = 10; + } else { + rcu_tasks_trace.init_fract = HZ / 5; + if (rcu_tasks_trace.init_fract <= 0) + rcu_tasks_trace.init_fract = 1; + } rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask; rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; -- cgit v1.2.3 From 78edc005f477a4987ee0a5d96bfe117295c231fd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 Aug 2020 08:09:40 -0700 Subject: rcu-tasks: Prevent complaints of unused show_rcu_tasks_classic_gp_kthread() Commit 8344496e8b49 ("rcu-tasks: Conditionally compile show_rcu_tasks_gp_kthreads()") introduced conditional compilation of several functions, but forgot one occurrence of show_rcu_tasks_classic_gp_kthread() that causes the compiler to warn of an unused static function. This commit uses "static inline" to avoid these complaints and possibly also to avoid emitting an actual definition of this function. Fixes: 8344496e8b49 ("rcu-tasks: Conditionally compile show_rcu_tasks_gp_kthreads()") Cc: # 5.8.x Reported-by: Laurent Pinchart Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 835e2df8590a..05d3e1375e4c 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -590,7 +590,7 @@ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) } #else /* #ifdef CONFIG_TASKS_RCU */ -static void show_rcu_tasks_classic_gp_kthread(void) { } +static inline void show_rcu_tasks_classic_gp_kthread(void) { } void exit_tasks_rcu_start(void) { } void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } #endif /* #else #ifdef CONFIG_TASKS_RCU */ -- cgit v1.2.3 From 574de8766f6efa0757f3c7ac15c9eb29a4636861 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 Sep 2020 21:51:09 -0700 Subject: rcu-tasks: Selectively enable more RCU Tasks Trace IPIs Many workloads are quite sensitive to IPIs, and such workloads should build kernels with CONFIG_TASKS_TRACE_RCU_READ_MB=y to prevent RCU Tasks Trace from using them under normal conditions. However, other workloads are quite happy to permit more IPIs if doing so makes BPF program updates go faster. This commit therefore sets the default value for the rcupdate.rcu_task_ipi_delay kernel parameter to zero for kernels that have been built with CONFIG_TASKS_TRACE_RCU_READ_MB=n, while retaining the old default of (HZ / 10) for kernels that have indicated an aversion to IPIs via CONFIG_TASKS_TRACE_RCU_READ_MB=y. Link: https://lore.kernel.org/bpf/CAADnVQK_AiX+S_L_A4CQWT11XyveppBbQSQgH_qWGyzu_E8Yeg@mail.gmail.com/ Reported-by: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jiri Olsa Cc: Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ad8c4f3f44d2..2b4df237b598 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -83,7 +83,7 @@ static struct rcu_tasks rt_name = \ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); /* Avoid IPIing CPUs early in the grace period. */ -#define RCU_TASK_IPI_DELAY (HZ / 2) +#define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0) static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY; module_param(rcu_task_ipi_delay, int, 0644); @@ -916,7 +916,8 @@ static void trc_wait_for_one_reader(struct task_struct *t, // If currently running, send an IPI, either way, add to list. trc_add_holdout(t, bhp); - if (task_curr(t) && time_after(jiffies, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) { + if (task_curr(t) && + time_after(jiffies + 1, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) { // The task is currently running, so try IPIing it. cpu = task_cpu(t); -- cgit v1.2.3 From 4fe192dfbe5ba9780df699d411aa4f25ba24cf61 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 Sep 2020 22:05:41 -0700 Subject: rcu-tasks: Shorten per-grace-period sleep for RCU Tasks Trace The various RCU tasks flavors currently wait 100 milliseconds between each grace period in order to prevent CPU-bound loops and to favor efficiency over latency. However, RCU Tasks Trace needs to have a grace-period latency of roughly 25 milliseconds, which is completely infeasible given the 100-millisecond per-grace-period sleep. This commit therefore reduces this sleep duration to 5 milliseconds (or one jiffy, whichever is longer) in kernels built with CONFIG_TASKS_TRACE_RCU_READ_MB=y. Link: https://lore.kernel.org/bpf/CAADnVQK_AiX+S_L_A4CQWT11XyveppBbQSQgH_qWGyzu_E8Yeg@mail.gmail.com/ Reported-by: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jiri Olsa Cc: Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 2b4df237b598..a0eaed522277 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -28,6 +28,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. * @gp_state: Grace period's most recent state transition (debugging). + * @gp_sleep: Per-grace-period sleep to prevent CPU-bound looping. * @init_fract: Initial backoff sleep interval. * @gp_jiffies: Time of last @gp_state transition. * @gp_start: Most recent grace-period start in jiffies. @@ -49,6 +50,7 @@ struct rcu_tasks { struct wait_queue_head cbs_wq; raw_spinlock_t cbs_lock; int gp_state; + int gp_sleep; int init_fract; unsigned long gp_jiffies; unsigned long gp_start; @@ -233,7 +235,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) cond_resched(); } /* Paranoid sleep to keep this from entering a tight loop */ - schedule_timeout_idle(HZ/10); + schedule_timeout_idle(rtp->gp_sleep); set_tasks_gp_state(rtp, RTGS_WAIT_CBS); } @@ -557,6 +559,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_tasks); static int __init rcu_spawn_tasks_kthread(void) { + rcu_tasks.gp_sleep = HZ / 10; rcu_tasks.init_fract = 10; rcu_tasks.pregp_func = rcu_tasks_pregp_step; rcu_tasks.pertask_func = rcu_tasks_pertask; @@ -690,6 +693,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); static int __init rcu_spawn_tasks_rude_kthread(void) { + rcu_tasks_rude.gp_sleep = HZ / 10; rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); return 0; } @@ -1170,8 +1174,12 @@ EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); static int __init rcu_spawn_tasks_trace_kthread(void) { if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) { + rcu_tasks_trace.gp_sleep = HZ / 10; rcu_tasks_trace.init_fract = 10; } else { + rcu_tasks_trace.gp_sleep = HZ / 200; + if (rcu_tasks_trace.gp_sleep <= 0) + rcu_tasks_trace.gp_sleep = 1; rcu_tasks_trace.init_fract = HZ / 5; if (rcu_tasks_trace.init_fract <= 0) rcu_tasks_trace.init_fract = 1; -- cgit v1.2.3 From ba3a86e47232ad9f76160929f33ac9c64e4d0567 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 14 Sep 2020 15:44:37 -0700 Subject: rcu-tasks: Fix grace-period/unlock race in RCU Tasks Trace The more intense grace-period processing resulting from the 50x RCU Tasks Trace grace-period speedups exposed the following race condition: o Task A running on CPU 0 executes rcu_read_lock_trace(), entering a read-side critical section. o When Task A eventually invokes rcu_read_unlock_trace() to exit its read-side critical section, this function notes that the ->trc_reader_special.s flag is zero and and therefore invoke wil set ->trc_reader_nesting to zero using WRITE_ONCE(). But before that happens... o The RCU Tasks Trace grace-period kthread running on some other CPU interrogates Task A, but this fails because this task is currently running. This kthread therefore sends an IPI to CPU 0. o CPU 0 receives the IPI, and thus invokes trc_read_check_handler(). Because Task A has not yet cleared its ->trc_reader_nesting counter, this function sees that Task A is still within its read-side critical section. This function therefore sets the ->trc_reader_nesting.b.need_qs flag, AKA the .need_qs flag. Except that Task A has already checked the .need_qs flag, which is part of the ->trc_reader_special.s flag. The .need_qs flag therefore remains set until Task A's next rcu_read_unlock_trace(). o Task A now invokes synchronize_rcu_tasks_trace(), which cannot start a new grace period until the current grace period completes. And thus cannot return until after that time. But Task A's .need_qs flag is still set, which prevents the current grace period from completing. And because Task A is blocked, it will never execute rcu_read_unlock_trace() until its call to synchronize_rcu_tasks_trace() returns. We are therefore deadlocked. This race is improbable, but 80 hours of rcutorture made it happen twice. The race was possible before the grace-period speedup, but roughly 50x less probable. Several thousand hours of rcutorture would have been necessary to have a reasonable chance of making this happen before this 50x speedup. This commit therefore eliminates this deadlock by setting ->trc_reader_nesting to a large negative number before checking the .need_qs and zeroing (or decrementing with respect to its initial value) ->trc_reader_nesting. For its part, the IPI handler's trc_read_check_handler() function adds a check for negative values, deferring evaluation of the task in this case. Taken together, these changes avoid this deadlock scenario. Fixes: 276c410448db ("rcu-tasks: Split ->trc_reader_need_end") Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jiri Olsa Cc: Cc: # 5.7.x Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index a0eaed522277..e583a2d47374 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -830,6 +830,12 @@ static void trc_read_check_handler(void *t_in) WRITE_ONCE(t->trc_reader_checked, true); goto reset_ipi; } + // If we are racing with an rcu_read_unlock_trace(), try again later. + if (unlikely(t->trc_reader_nesting < 0)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + goto reset_ipi; + } WRITE_ONCE(t->trc_reader_checked, true); // Get here if the task is in a read-side critical section. Set -- cgit v1.2.3 From 592031cc10858be4adb10f6c0f2608f6f21824aa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Sep 2020 14:03:34 -0700 Subject: rcu-tasks: Fix low-probability task_struct leak When rcu_tasks_trace_postgp() function detects an RCU Tasks Trace CPU stall, it adds all tasks blocking the current grace period to a list, invoking get_task_struct() on each to prevent them from being freed while on the list. It then traverses that list, printing stall-warning messages for each one that is still blocking the current grace period and removing it from the list. The list removal invokes the matching put_task_struct(). This of course means that in the admittedly unlikely event that some task executes its outermost rcu_read_unlock_trace() in the meantime, it won't be removed from the list and put_task_struct() won't be executing, resulting in a task_struct leak. This commit therefore makes the list removal and put_task_struct() unconditional, stopping the leak. Note further that this bug can occur only after an RCU Tasks Trace CPU stall warning, which by default only happens after a grace period has extended for ten minutes (yes, not a typo, minutes). Fixes: 4593e772b502 ("rcu-tasks: Add stall warnings for RCU Tasks Trace") Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jiri Olsa Cc: Cc: # 5.7.x Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index e583a2d47374..fcd9c25f1140 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1092,11 +1092,11 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) if (READ_ONCE(t->trc_reader_special.b.need_qs)) trc_add_holdout(t, &holdouts); firstreport = true; - list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) - if (READ_ONCE(t->trc_reader_special.b.need_qs)) { + list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) { + if (READ_ONCE(t->trc_reader_special.b.need_qs)) show_stalled_task_trace(t, &firstreport); - trc_del_holdout(t); - } + trc_del_holdout(t); // Release task_struct reference. + } if (firstreport) pr_err("INFO: rcu_tasks_trace detected stalls? (Counter/taskslist mismatch?)\n"); show_stalled_ipi_trace(); -- cgit v1.2.3 From f747c7e15d7bc71a967a94ceda686cf2460b69e8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Sep 2020 14:27:38 -0700 Subject: rcu-tasks: Enclose task-list scan in rcu_read_lock() The rcu_tasks_trace_postgp() function uses for_each_process_thread() to scan the task list without the benefit of RCU read-side protection, which can result in use-after-free errors on task_struct structures. This error was missed because the TRACE01 rcutorture scenario enables lockdep, but also builds with CONFIG_PREEMPT_NONE=y. In this situation, preemption is disabled everywhere, so lockdep thinks everywhere can be a legitimate RCU reader. This commit therefore adds the needed rcu_read_lock() and rcu_read_unlock(). Note that this bug can occur only after an RCU Tasks Trace CPU stall warning, which by default only happens after a grace period has extended for ten minutes (yes, not a typo, minutes). Fixes: 4593e772b502 ("rcu-tasks: Add stall warnings for RCU Tasks Trace") Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jiri Olsa Cc: Cc: # 5.7.x Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index fcd9c25f1140..d5d9f2d03e8a 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1088,9 +1088,11 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) if (ret) break; // Count reached zero. // Stall warning time, so make a list of the offenders. + rcu_read_lock(); for_each_process_thread(g, t) if (READ_ONCE(t->trc_reader_special.b.need_qs)) trc_add_holdout(t, &holdouts); + rcu_read_unlock(); firstreport = true; list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) { if (READ_ONCE(t->trc_reader_special.b.need_qs)) -- cgit v1.2.3 From f959dcd6ddfd29235030e8026471ac1b022ad2b0 Mon Sep 17 00:00:00 2001 From: Thomas Tai Date: Thu, 17 Sep 2020 18:43:03 +0200 Subject: dma-direct: Fix potential NULL pointer dereference When booting the kernel v5.9-rc4 on a VM, the kernel would panic when printing a warning message in swiotlb_map(). The dev->dma_mask must not be a NULL pointer when calling the dma mapping layer. A NULL pointer check can potentially avoid the panic. Signed-off-by: Thomas Tai Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 848c95c27d79..9a045e51df17 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -144,6 +144,10 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); + + if (WARN_ON_ONCE(!dev->dma_mask)) + return DMA_MAPPING_ERROR; + if (dma_map_direct(dev, ops)) addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else @@ -179,6 +183,10 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, int ents; BUG_ON(!valid_dma_direction(dir)); + + if (WARN_ON_ONCE(!dev->dma_mask)) + return 0; + if (dma_map_direct(dev, ops)) ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); else @@ -213,6 +221,9 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, BUG_ON(!valid_dma_direction(dir)); + if (WARN_ON_ONCE(!dev->dma_mask)) + return DMA_MAPPING_ERROR; + /* Don't allow RAM to be mapped */ if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) return DMA_MAPPING_ERROR; -- cgit v1.2.3 From e0d072782c734d27f5af062c62266f2598f68542 Mon Sep 17 00:00:00 2001 From: Jim Quinlan Date: Thu, 17 Sep 2020 18:43:40 +0200 Subject: dma-mapping: introduce DMA range map, supplanting dma_pfn_offset The new field 'dma_range_map' in struct device is used to facilitate the use of single or multiple offsets between mapping regions of cpu addrs and dma addrs. It subsumes the role of "dev->dma_pfn_offset" which was only capable of holding a single uniform offset and had no region bounds checking. The function of_dma_get_range() has been modified so that it takes a single argument -- the device node -- and returns a map, NULL, or an error code. The map is an array that holds the information regarding the DMA regions. Each range entry contains the address offset, the cpu_start address, the dma_start address, and the size of the region. of_dma_configure() is the typical manner to set range offsets but there are a number of ad hoc assignments to "dev->dma_pfn_offset" in the kernel driver code. These cases now invoke the function dma_direct_set_offset(dev, cpu_addr, dma_addr, size). Signed-off-by: Jim Quinlan [hch: various interface cleanups] Signed-off-by: Christoph Hellwig Reviewed-by: Mathieu Poirier Tested-by: Mathieu Poirier Tested-by: Nathan Chancellor --- kernel/dma/coherent.c | 7 +++---- kernel/dma/direct.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index f85d14bbfcbe..c0685196fb6d 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include struct dma_coherent_mem { void *virt_base; @@ -32,9 +32,8 @@ static inline dma_addr_t dma_get_device_base(struct device *dev, struct dma_coherent_mem * mem) { if (mem->use_dev_dma_pfn_offset) - return (mem->pfn_base - dev->dma_pfn_offset) << PAGE_SHIFT; - else - return mem->device_base; + return phys_to_dma(dev, PFN_PHYS(mem->pfn_base)); + return mem->device_base; } static int dma_init_coherent_memory(phys_addr_t phys_addr, diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 54db9cfdaecc..750659f7447c 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -13,6 +13,7 @@ #include #include #include +#include /* * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it @@ -66,8 +67,12 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { - return phys_to_dma_direct(dev, phys) + size - 1 <= - min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); + dma_addr_t dma_addr = phys_to_dma_direct(dev, phys); + + if (dma_addr == DMA_MAPPING_ERROR) + return false; + return dma_addr + size - 1 <= + min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } /* @@ -461,3 +466,45 @@ bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr) return !dev_is_dma_coherent(dev) || is_swiotlb_buffer(dma_to_phys(dev, dma_addr)); } + +/** + * dma_direct_set_offset - Assign scalar offset for a single DMA range. + * @dev: device pointer; needed to "own" the alloced memory. + * @cpu_start: beginning of memory region covered by this offset. + * @dma_start: beginning of DMA/PCI region covered by this offset. + * @size: size of the region. + * + * This is for the simple case of a uniform offset which cannot + * be discovered by "dma-ranges". + * + * It returns -ENOMEM if out of memory, -EINVAL if a map + * already exists, 0 otherwise. + * + * Note: any call to this from a driver is a bug. The mapping needs + * to be described by the device tree or other firmware interfaces. + */ +int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start, + dma_addr_t dma_start, u64 size) +{ + struct bus_dma_region *map; + u64 offset = (u64)cpu_start - (u64)dma_start; + + if (dev->dma_range_map) { + dev_err(dev, "attempt to add DMA range to existing map\n"); + return -EINVAL; + } + + if (!offset) + return 0; + + map = kcalloc(2, sizeof(*map), GFP_KERNEL); + if (!map) + return -ENOMEM; + map[0].cpu_start = cpu_start; + map[0].dma_start = dma_start; + map[0].offset = offset; + map[0].size = size; + dev->dma_range_map = map; + return 0; +} +EXPORT_SYMBOL_GPL(dma_direct_set_offset); -- cgit v1.2.3 From 80bdad3d7e3ec03f812471d9309f5f682e10f52b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Sep 2020 09:41:59 +0200 Subject: quota: simplify the quotactl compat handling Fold the misaligned u64 workarounds into the main quotactl flow instead of implementing a separate compat syscall handler. Signed-off-by: Christoph Hellwig Acked-by: Jan Kara Signed-off-by: Al Viro --- kernel/sys_ni.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 4d59775ea79c..c925d1e1777e 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -369,7 +369,6 @@ COND_SYSCALL_COMPAT(fanotify_mark); /* x86 */ COND_SYSCALL(vm86old); COND_SYSCALL(modify_ldt); -COND_SYSCALL_COMPAT(quotactl32); COND_SYSCALL(vm86); COND_SYSCALL(kexec_file_load); -- cgit v1.2.3 From 5ef64cc8987a9211d3f3667331ba3411a94ddc79 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 13 Sep 2020 14:05:35 -0700 Subject: mm: allow a controlled amount of unfairness in the page lock Commit 2a9127fcf229 ("mm: rewrite wait_on_page_bit_common() logic") made the page locking entirely fair, in that if a waiter came in while the lock was held, the lock would be transferred to the lockers strictly in order. That was intended to finally get rid of the long-reported watchdog failures that involved the page lock under extreme load, where a process could end up waiting essentially forever, as other page lockers stole the lock from under it. It also improved some benchmarks, but it ended up causing huge performance regressions on others, simply because fair lock behavior doesn't end up giving out the lock as aggressively, causing better worst-case latency, but potentially much worse average latencies and throughput. Instead of reverting that change entirely, this introduces a controlled amount of unfairness, with a sysctl knob to tune it if somebody needs to. But the default value should hopefully be good for any normal load, allowing a few rounds of lock stealing, but enforcing the strict ordering before the lock has been stolen too many times. There is also a hint from Matthieu Baerts that the fair page coloring may end up exposing an ABBA deadlock that is hidden by the usual optimistic lock stealing, and while the unfairness doesn't fix the fundamental issue (and I'm still looking at that), it avoids it in practice. The amount of unfairness can be modified by writing a new value to the 'sysctl_page_lock_unfairness' variable (default value of 5, exposed through /proc/sys/vm/page_lock_unfairness), but that is hopefully something we'd use mainly for debugging rather than being necessary for any deep system tuning. This whole issue has exposed just how critical the page lock can be, and how contended it gets under certain locks. And the main contention doesn't really seem to be anything related to IO (which was the origin of this lock), but for things like just verifying that the page file mapping is stable while faulting in the page into a page table. Link: https://lore.kernel.org/linux-fsdevel/ed8442fd-6f54-dd84-cd4a-941e8b7ee603@MichaelLarabel.com/ Link: https://www.phoronix.com/scan.php?page=article&item=linux-50-59&num=1 Link: https://lore.kernel.org/linux-fsdevel/c560a38d-8313-51fb-b1ec-e904bd8836bc@tessares.net/ Reported-and-tested-by: Michael Larabel Tested-by: Matthieu Baerts Cc: Dave Chinner Cc: Matthew Wilcox Cc: Chris Mason Cc: Jan Kara Cc: Amir Goldstein Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 09e70ee2332e..afad085960b8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2912,6 +2912,14 @@ static struct ctl_table vm_table[] = { .proc_handler = percpu_pagelist_fraction_sysctl_handler, .extra1 = SYSCTL_ZERO, }, + { + .procname = "page_lock_unfairness", + .data = &sysctl_page_lock_unfairness, + .maxlen = sizeof(sysctl_page_lock_unfairness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, #ifdef CONFIG_MMU { .procname = "max_map_count", -- cgit v1.2.3 From a748c6975dea325da540610c2ba9b5f332c603e6 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:05 +0200 Subject: bpf: propagate poke descriptors to subprograms Previously, there was no need for poke descriptors being present in subprogram's bpf_prog_aux struct since tailcalls were simply not allowed in them. Each subprog is JITed independently so in order to enable JITing subprograms that use tailcalls, do the following: - in fixup_bpf_calls() store the index of tailcall insn onto the generated poke descriptor, - in case when insn patching occurs, adjust the tailcall insn idx from bpf_patch_insn_data, - then in jit_subprogs() check whether the given poke descriptor belongs to the current subprog by checking if that previously stored absolute index of tail call insn is in the scope of the insns of given subprog, - update the insn->imm with new poke descriptor slot so that while JITing the proper poke descriptor will be grabbed This way each of the main program's poke descriptors are distributed across the subprograms poke descriptor array, so main program's descriptors can be untracked out of the prog array map. Add also subprog's aux struct to the BPF map poke_progs list by calling on it map_poke_track(). In case of any error, call the map_poke_untrack() on subprog's aux structs that have already been registered to prog array map. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 814bc6c1ad16..8a18756953de 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9623,6 +9623,18 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len } } +static void adjust_poke_descs(struct bpf_prog *prog, u32 len) +{ + struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; + int i, sz = prog->aux->size_poke_tab; + struct bpf_jit_poke_descriptor *desc; + + for (i = 0; i < sz; i++) { + desc = &tab[i]; + desc->insn_idx += len - 1; + } +} + static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, const struct bpf_insn *patch, u32 len) { @@ -9639,6 +9651,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of if (adjust_insn_aux_data(env, new_prog, off, len)) return NULL; adjust_subprog_starts(env, off, len); + adjust_poke_descs(new_prog, len); return new_prog; } @@ -10169,6 +10182,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog, **func, *tmp; int i, j, subprog_start, subprog_end = 0, len, subprog; + struct bpf_map *map_ptr; struct bpf_insn *insn; void *old_bpf_func; int err, num_exentries; @@ -10236,6 +10250,31 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->btf = prog->aux->btf; func[i]->aux->func_info = prog->aux->func_info; + for (j = 0; j < prog->aux->size_poke_tab; j++) { + u32 insn_idx = prog->aux->poke_tab[j].insn_idx; + int ret; + + if (!(insn_idx >= subprog_start && + insn_idx <= subprog_end)) + continue; + + ret = bpf_jit_add_poke_descriptor(func[i], + &prog->aux->poke_tab[j]); + if (ret < 0) { + verbose(env, "adding tail call poke descriptor failed\n"); + goto out_free; + } + + func[i]->insnsi[insn_idx - subprog_start].imm = ret + 1; + + map_ptr = func[i]->aux->poke_tab[ret].tail_call.map; + ret = map_ptr->ops->map_poke_track(map_ptr, func[i]->aux); + if (ret < 0) { + verbose(env, "tracking tail call prog failed\n"); + goto out_free; + } + } + /* Use bpf_prog_F_tag to indicate functions in stack traces. * Long term would need debug info to populate names */ @@ -10261,6 +10300,19 @@ static int jit_subprogs(struct bpf_verifier_env *env) } cond_resched(); } + + /* Untrack main program's aux structs so that during map_poke_run() + * we will not stumble upon the unfilled poke descriptors; each + * of the main program's poke descs got distributed across subprogs + * and got tracked onto map, so we are sure that none of them will + * be missed after the operation below + */ + for (i = 0; i < prog->aux->size_poke_tab; i++) { + map_ptr = prog->aux->poke_tab[i].tail_call.map; + + map_ptr->ops->map_poke_untrack(map_ptr, prog->aux); + } + /* at this point all bpf functions were successfully JITed * now populate all bpf_calls with correct addresses and * run last pass of JIT @@ -10329,9 +10381,16 @@ static int jit_subprogs(struct bpf_verifier_env *env) bpf_prog_free_unused_jited_linfo(prog); return 0; out_free: - for (i = 0; i < env->subprog_cnt; i++) - if (func[i]) - bpf_jit_free(func[i]); + for (i = 0; i < env->subprog_cnt; i++) { + if (!func[i]) + continue; + + for (j = 0; j < func[i]->aux->size_poke_tab; j++) { + map_ptr = func[i]->aux->poke_tab[j].tail_call.map; + map_ptr->ops->map_poke_untrack(map_ptr, func[i]->aux); + } + bpf_jit_free(func[i]); + } kfree(func); out_undo_insn: /* cleanup main prog to be interpreted */ @@ -10549,6 +10608,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) .reason = BPF_POKE_REASON_TAIL_CALL, .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state), .tail_call.key = bpf_map_key_immediate(aux), + .insn_idx = i + delta, }; ret = bpf_jit_add_poke_descriptor(prog, &desc); -- cgit v1.2.3 From cf71b174d3464c7dc22f86f25d629a8d9d5c3519 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:06 +0200 Subject: bpf: rename poke descriptor's 'ip' member to 'tailcall_target' Reflect the actual purpose of poke->ip and rename it to poke->tailcall_target so that it will not the be confused with another poke target that will be introduced in next commit. While at it, do the same thing with poke->ip_stable - rename it to poke->tailcall_target_stable. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 17 +++++++++-------- kernel/bpf/core.c | 3 ++- 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index e046fb7d17cd..60abf7fe12de 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -918,12 +918,13 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, * there could be danger of use after free otherwise. * 2) Initially when we start tracking aux, the program * is not JITed yet and also does not have a kallsyms - * entry. We skip these as poke->ip_stable is not - * active yet. The JIT will do the final fixup before - * setting it stable. The various poke->ip_stable are - * successively activated, so tail call updates can - * arrive from here while JIT is still finishing its - * final fixup for non-activated poke entries. + * entry. We skip these as poke->tailcall_target_stable + * is not active yet. The JIT will do the final fixup + * before setting it stable. The various + * poke->tailcall_target_stable are successively + * activated, so tail call updates can arrive from here + * while JIT is still finishing its final fixup for + * non-activated poke entries. * 3) On program teardown, the program's kallsym entry gets * removed out of RCU callback, but we can only untrack * from sleepable context, therefore bpf_arch_text_poke() @@ -940,7 +941,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, * 5) Any other error happening below from bpf_arch_text_poke() * is a unexpected bug. */ - if (!READ_ONCE(poke->ip_stable)) + if (!READ_ONCE(poke->tailcall_target_stable)) continue; if (poke->reason != BPF_POKE_REASON_TAIL_CALL) continue; @@ -948,7 +949,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, poke->tail_call.key != key) continue; - ret = bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP, + ret = bpf_arch_text_poke(poke->tailcall_target, BPF_MOD_JUMP, old ? (u8 *)old->bpf_func + poke->adj_off : NULL, new ? (u8 *)new->bpf_func + diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2a20c2833996..2e00ac028d38 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -775,7 +775,8 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, if (size > poke_tab_max) return -ENOSPC; - if (poke->ip || poke->ip_stable || poke->adj_off) + if (poke->tailcall_target || poke->tailcall_target_stable || + poke->adj_off) return -EINVAL; switch (poke->reason) { -- cgit v1.2.3 From 7f6e4312e15a5c370e84eaa685879b6bdcc717e4 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:07 +0200 Subject: bpf: Limit caller's stack depth 256 for subprogs with tailcalls Protect against potential stack overflow that might happen when bpf2bpf calls get combined with tailcalls. Limit the caller's stack depth for such case down to 256 so that the worst case scenario would result in 8k stack size (32 which is tailcall limit * 256 = 8k). Suggested-by: Alexei Starovoitov Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8a18756953de..0958fba48d59 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1490,6 +1490,10 @@ static int check_subprogs(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++) { u8 code = insn[i].code; + if (code == (BPF_JMP | BPF_CALL) && + insn[i].imm == BPF_FUNC_tail_call && + insn[i].src_reg != BPF_PSEUDO_CALL) + subprog[cur_subprog].has_tail_call = true; if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) @@ -2983,6 +2987,31 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) int ret_prog[MAX_CALL_FRAMES]; process_func: + /* protect against potential stack overflow that might happen when + * bpf2bpf calls get combined with tailcalls. Limit the caller's stack + * depth for such case down to 256 so that the worst case scenario + * would result in 8k stack size (32 which is tailcall limit * 256 = + * 8k). + * + * To get the idea what might happen, see an example: + * func1 -> sub rsp, 128 + * subfunc1 -> sub rsp, 256 + * tailcall1 -> add rsp, 256 + * func2 -> sub rsp, 192 (total stack size = 128 + 192 = 320) + * subfunc2 -> sub rsp, 64 + * subfunc22 -> sub rsp, 128 + * tailcall2 -> add rsp, 128 + * func3 -> sub rsp, 32 (total stack size 128 + 192 + 64 + 32 = 416) + * + * tailcall will unwind the current stack frame but it will not get rid + * of caller's stack as shown on the example above. + */ + if (idx && subprog[idx].has_tail_call && depth >= 256) { + verbose(env, + "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n", + depth); + return -EACCES; + } /* round up to 32-bytes, since this is granularity * of interpreter stack size */ -- cgit v1.2.3 From ebf7d1f508a73871acf3b2bfbfa1323a477acdb3 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:08 +0200 Subject: bpf, x64: rework pro/epilogue and tailcall handling in JIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit serves two things: 1) it optimizes BPF prologue/epilogue generation 2) it makes possible to have tailcalls within BPF subprogram Both points are related to each other since without 1), 2) could not be achieved. In [1], Alexei says: "The prologue will look like: nop5 xor eax,eax  // two new bytes if bpf_tail_call() is used in this // function push rbp mov rbp, rsp sub rsp, rounded_stack_depth push rax // zero init tail_call counter variable number of push rbx,r13,r14,r15 Then bpf_tail_call will pop variable number rbx,.. and final 'pop rax' Then 'add rsp, size_of_current_stack_frame' jmp to next function and skip over 'nop5; xor eax,eax; push rpb; mov rbp, rsp' This way new function will set its own stack size and will init tail call counter with whatever value the parent had. If next function doesn't use bpf_tail_call it won't have 'xor eax,eax'. Instead it would need to have 'nop2' in there." Implement that suggestion. Since the layout of stack is changed, tail call counter handling can not rely anymore on popping it to rbx just like it have been handled for constant prologue case and later overwrite of rbx with actual value of rbx pushed to stack. Therefore, let's use one of the register (%rcx) that is considered to be volatile/caller-saved and pop the value of tail call counter in there in the epilogue. Drop the BUILD_BUG_ON in emit_prologue and in emit_bpf_tail_call_indirect where instruction layout is not constant anymore. Introduce new poke target, 'tailcall_bypass' to poke descriptor that is dedicated for skipping the register pops and stack unwind that are generated right before the actual jump to target program. For case when the target program is not present, BPF program will skip the pop instructions and nop5 dedicated for jmpq $target. An example of such state when only R6 of callee saved registers is used by program: ffffffffc0513aa1: e9 0e 00 00 00 jmpq 0xffffffffc0513ab4 ffffffffc0513aa6: 5b pop %rbx ffffffffc0513aa7: 58 pop %rax ffffffffc0513aa8: 48 81 c4 00 00 00 00 add $0x0,%rsp ffffffffc0513aaf: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) ffffffffc0513ab4: 48 89 df mov %rbx,%rdi When target program is inserted, the jump that was there to skip pops/nop5 will become the nop5, so CPU will go over pops and do the actual tailcall. One might ask why there simply can not be pushes after the nop5? In the following example snippet: ffffffffc037030c: 48 89 fb mov %rdi,%rbx (...) ffffffffc0370332: 5b pop %rbx ffffffffc0370333: 58 pop %rax ffffffffc0370334: 48 81 c4 00 00 00 00 add $0x0,%rsp ffffffffc037033b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) ffffffffc0370340: 48 81 ec 00 00 00 00 sub $0x0,%rsp ffffffffc0370347: 50 push %rax ffffffffc0370348: 53 push %rbx ffffffffc0370349: 48 89 df mov %rbx,%rdi ffffffffc037034c: e8 f7 21 00 00 callq 0xffffffffc0372548 There is the bpf2bpf call (at ffffffffc037034c) right after the tailcall and jump target is not present. ctx is in %rbx register and BPF subprogram that we will call into on ffffffffc037034c is relying on it, e.g. it will pick ctx from there. Such code layout is therefore broken as we would overwrite the content of %rbx with the value that was pushed on the prologue. That is the reason for the 'bypass' approach. Special care needs to be taken during the install/update/remove of tailcall target. In case when target program is not present, the CPU must not execute the pop instructions that precede the tailcall. To address that, the following states can be defined: A nop, unwind, nop B nop, unwind, tail C skip, unwind, nop D skip, unwind, tail A is forbidden (lead to incorrectness). The state transitions between tailcall install/update/remove will work as follows: First install tail call f: C->D->B(f) * poke the tailcall, after that get rid of the skip Update tail call f to f': B(f)->B(f') * poke the tailcall (poke->tailcall_target) and do NOT touch the poke->tailcall_bypass Remove tail call: B(f')->C(f') * poke->tailcall_bypass is poked back to jump, then we wait the RCU grace period so that other programs will finish its execution and after that we are safe to remove the poke->tailcall_target Install new tail call (f''): C(f')->D(f'')->B(f''). * same as first step This way CPU can never be exposed to "unwind, tail" state. Last but not least, when tailcalls get mixed with bpf2bpf calls, it would be possible to encounter the endless loop due to clearing the tailcall counter if for example we would use the tailcall3-like from BPF selftests program that would be subprogram-based, meaning the tailcall would be present within the BPF subprogram. This test, broken down to particular steps, would do: entry -> set tailcall counter to 0, bump it by 1, tailcall to func0 func0 -> call subprog_tail (we are NOT skipping the first 11 bytes of prologue and this subprogram has a tailcall, therefore we clear the counter...) subprog -> do the same thing as entry and then loop forever. To address this, the idea is to go through the call chain of bpf2bpf progs and look for a tailcall presence throughout whole chain. If we saw a single tail call then each node in this call chain needs to be marked as a subprog that can reach the tailcall. We would later feed the JIT with this info and: - set eax to 0 only when tailcall is reachable and this is the entry prog - if tailcall is reachable but there's no tailcall in insns of currently JITed prog then push rax anyway, so that it will be possible to propagate further down the call chain - finally if tailcall is reachable, then we need to precede the 'call' insn with mov rax, [rbp - (stack_depth + 8)] Tail call related cases from test_verifier kselftest are also working fine. Sample BPF programs that utilize tail calls (sockex3, tracex5) work properly as well. [1]: https://lore.kernel.org/bpf/20200517043227.2gpq22ifoq37ogst@ast-mbp.dhcp.thefacebook.com/ Suggested-by: Alexei Starovoitov Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 40 ++++++++++++++++++++++++++++++++++------ kernel/bpf/core.c | 2 +- kernel/bpf/verifier.c | 16 ++++++++++++++++ 3 files changed, 51 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 60abf7fe12de..e5fd31268ae0 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -898,6 +898,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new) { + u8 *old_addr, *new_addr, *old_bypass_addr; struct prog_poke_elem *elem; struct bpf_array_aux *aux; @@ -949,12 +950,39 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, poke->tail_call.key != key) continue; - ret = bpf_arch_text_poke(poke->tailcall_target, BPF_MOD_JUMP, - old ? (u8 *)old->bpf_func + - poke->adj_off : NULL, - new ? (u8 *)new->bpf_func + - poke->adj_off : NULL); - BUG_ON(ret < 0 && ret != -EINVAL); + old_bypass_addr = old ? NULL : poke->bypass_addr; + old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; + new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; + + if (new) { + ret = bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, new_addr); + BUG_ON(ret < 0 && ret != -EINVAL); + if (!old) { + ret = bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + poke->bypass_addr, + NULL); + BUG_ON(ret < 0 && ret != -EINVAL); + } + } else { + ret = bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + old_bypass_addr, + poke->bypass_addr); + BUG_ON(ret < 0 && ret != -EINVAL); + /* let other CPUs finish the execution of program + * so that it will not possible to expose them + * to invalid nop, stack unwind, nop state + */ + if (!ret) + synchronize_rcu(); + ret = bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, NULL); + BUG_ON(ret < 0 && ret != -EINVAL); + } } } } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2e00ac028d38..c4811b139caa 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -776,7 +776,7 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, if (size > poke_tab_max) return -ENOSPC; if (poke->tailcall_target || poke->tailcall_target_stable || - poke->adj_off) + poke->tailcall_bypass || poke->adj_off || poke->bypass_addr) return -EINVAL; switch (poke->reason) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0958fba48d59..172e12df9eaa 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2983,8 +2983,10 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) int depth = 0, frame = 0, idx = 0, i = 0, subprog_end; struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; + bool tail_call_reachable = false; int ret_insn[MAX_CALL_FRAMES]; int ret_prog[MAX_CALL_FRAMES]; + int j; process_func: /* protect against potential stack overflow that might happen when @@ -3040,6 +3042,10 @@ continue_func: i); return -EFAULT; } + + if (subprog[idx].has_tail_call) + tail_call_reachable = true; + frame++; if (frame >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep !\n", @@ -3048,6 +3054,15 @@ continue_func: } goto process_func; } + /* if tail call got detected across bpf2bpf calls then mark each of the + * currently present subprog frames as tail call reachable subprogs; + * this info will be utilized by JIT so that we will be preserving the + * tail call counter throughout bpf2bpf calls combined with tailcalls + */ + if (tail_call_reachable) + for (j = 0; j < frame; j++) + subprog[ret_prog[j]].tail_call_reachable = true; + /* end of for() loop means the last insn of the 'subprog' * was reached. Doesn't matter whether it was JA or EXIT */ @@ -10322,6 +10337,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) num_exentries++; } func[i]->aux->num_exentries = num_exentries; + func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; -- cgit v1.2.3 From e411901c0b775a3ae7f3e2505f8d2d90ac696178 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:09 +0200 Subject: bpf: allow for tailcalls in BPF subprograms for x64 JIT Relax verifier's restriction that was meant to forbid tailcall usage when subprog count was higher than 1. Also, do not max out the stack depth of program that utilizes tailcalls. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 172e12df9eaa..d1c009e8c57f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4268,6 +4268,11 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) return false; } +static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env) +{ + return env->prog->jit_requested && IS_ENABLED(CONFIG_X86_64); +} + static int check_map_func_compatibility(struct bpf_verifier_env *env, struct bpf_map *map, int func_id) { @@ -4383,8 +4388,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_tail_call: if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; - if (env->subprog_cnt > 1) { - verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n"); + if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) { + verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n"); return -EINVAL; } break; @@ -10469,6 +10474,13 @@ static int fixup_call_args(struct bpf_verifier_env *env) return err; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON + if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) { + /* When JIT fails the progs with bpf2bpf calls and tail_calls + * have to be rejected, since interpreter doesn't support them yet. + */ + verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n"); + return -EINVAL; + } for (i = 0; i < prog->len; i++, insn++) { if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) @@ -10632,8 +10644,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) * the program array. */ prog->cb_access = 1; - env->prog->aux->stack_depth = MAX_BPF_STACK; - env->prog->aux->max_pkt_offset = MAX_PACKET_OFF; + if (!allow_tail_call_in_subprogs(env)) + prog->aux->stack_depth = MAX_BPF_STACK; + prog->aux->max_pkt_offset = MAX_PACKET_OFF; /* mark bpf_tail_call as different opcode to avoid * conditional branch in the interpeter for every normal -- cgit v1.2.3 From 09b28d76eac48e922dc293da1aa2b2b85c32aeee Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 17 Sep 2020 19:09:18 -0700 Subject: bpf: Add abnormal return checks. LD_[ABS|IND] instructions may return from the function early. bpf_tail_call pseudo instruction is either fallthrough or return. Allow them in the subprograms only when subprograms are BTF annotated and have scalar return types. Allow ld_abs and tail_call in the main program even if it calls into subprograms. In the past that was not ok to do for ld_abs, since it was JITed with special exit sequence. Since bpf_gen_ld_abs() was introduced the ld_abs looks like normal exit insn from JIT point of view, so it's safe to allow them in the main program. Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 67 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d1c009e8c57f..4161b6c406bc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1494,6 +1494,9 @@ static int check_subprogs(struct bpf_verifier_env *env) insn[i].imm == BPF_FUNC_tail_call && insn[i].src_reg != BPF_PSEUDO_CALL) subprog[cur_subprog].has_tail_call = true; + if (BPF_CLASS(code) == BPF_LD && + (BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND)) + subprog[cur_subprog].has_ld_abs = true; if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) @@ -7514,18 +7517,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } - if (env->subprog_cnt > 1) { - /* when program has LD_ABS insn JITs and interpreter assume - * that r1 == ctx == skb which is not the case for callees - * that can have arbitrary arguments. It's problematic - * for main prog as well since JITs would need to analyze - * all functions in order to make proper register save/restore - * decisions in the main prog. Hence disallow LD_ABS with calls - */ - verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n"); - return -EINVAL; - } - if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { @@ -7936,6 +7927,23 @@ err_free: return ret; } +static int check_abnormal_return(struct bpf_verifier_env *env) +{ + int i; + + for (i = 1; i < env->subprog_cnt; i++) { + if (env->subprog_info[i].has_ld_abs) { + verbose(env, "LD_ABS is not allowed in subprogs without BTF\n"); + return -EINVAL; + } + if (env->subprog_info[i].has_tail_call) { + verbose(env, "tail_call is not allowed in subprogs without BTF\n"); + return -EINVAL; + } + } + return 0; +} + /* The minimum supported BTF func info size */ #define MIN_BPF_FUNCINFO_SIZE 8 #define MAX_FUNCINFO_REC_SIZE 252 @@ -7944,20 +7952,24 @@ static int check_btf_func(struct bpf_verifier_env *env, const union bpf_attr *attr, union bpf_attr __user *uattr) { + const struct btf_type *type, *func_proto, *ret_type; u32 i, nfuncs, urec_size, min_size; u32 krec_size = sizeof(struct bpf_func_info); struct bpf_func_info *krecord; struct bpf_func_info_aux *info_aux = NULL; - const struct btf_type *type; struct bpf_prog *prog; const struct btf *btf; void __user *urecord; u32 prev_offset = 0; + bool scalar_return; int ret = -ENOMEM; nfuncs = attr->func_info_cnt; - if (!nfuncs) + if (!nfuncs) { + if (check_abnormal_return(env)) + return -EINVAL; return 0; + } if (nfuncs != env->subprog_cnt) { verbose(env, "number of funcs in func_info doesn't match number of subprogs\n"); @@ -8005,25 +8017,23 @@ static int check_btf_func(struct bpf_verifier_env *env, } /* check insn_off */ + ret = -EINVAL; if (i == 0) { if (krecord[i].insn_off) { verbose(env, "nonzero insn_off %u for the first func info record", krecord[i].insn_off); - ret = -EINVAL; goto err_free; } } else if (krecord[i].insn_off <= prev_offset) { verbose(env, "same or smaller insn offset (%u) than previous func info record (%u)", krecord[i].insn_off, prev_offset); - ret = -EINVAL; goto err_free; } if (env->subprog_info[i].start != krecord[i].insn_off) { verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); - ret = -EINVAL; goto err_free; } @@ -8032,10 +8042,26 @@ static int check_btf_func(struct bpf_verifier_env *env, if (!type || !btf_type_is_func(type)) { verbose(env, "invalid type id %d in func info", krecord[i].type_id); - ret = -EINVAL; goto err_free; } info_aux[i].linkage = BTF_INFO_VLEN(type->info); + + func_proto = btf_type_by_id(btf, type->type); + if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto))) + /* btf_func_check() already verified it during BTF load */ + goto err_free; + ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL); + scalar_return = + btf_type_is_small_int(ret_type) || btf_type_is_enum(ret_type); + if (i && !scalar_return && env->subprog_info[i].has_ld_abs) { + verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n"); + goto err_free; + } + if (i && !scalar_return && env->subprog_info[i].has_tail_call) { + verbose(env, "tail_call is only allowed in functions that return 'int'.\n"); + goto err_free; + } + prev_offset = krecord[i].insn_off; urecord += urec_size; } @@ -8196,8 +8222,11 @@ static int check_btf_info(struct bpf_verifier_env *env, struct btf *btf; int err; - if (!attr->func_info_cnt && !attr->line_info_cnt) + if (!attr->func_info_cnt && !attr->line_info_cnt) { + if (check_abnormal_return(env)) + return -EINVAL; return 0; + } btf = btf_get_by_fd(attr->prog_btf_fd); if (IS_ERR(btf)) -- cgit v1.2.3 From 264c03a245de7c5b1cc3836db45de6b991f877ca Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 14 Sep 2020 16:34:07 +0100 Subject: stacktrace: Remove reliable argument from arch_stack_walk() callback Currently the callback passed to arch_stack_walk() has an argument called reliable passed to it to indicate if the stack entry is reliable, a comment says that this is used by some printk() consumers. However in the current kernel none of the arch_stack_walk() implementations ever set this flag to true and the only callback implementation we have is in the generic stacktrace code which ignores the flag. It therefore appears that this flag is redundant so we can simplify and clarify things by removing it. Signed-off-by: Mark Brown Reviewed-by: Miroslav Benes Link: https://lore.kernel.org/r/20200914153409.25097-2-broonie@kernel.org Signed-off-by: Will Deacon --- kernel/stacktrace.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 946f44a9e86a..9f8117c7cfdd 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -78,8 +78,7 @@ struct stacktrace_cookie { unsigned int len; }; -static bool stack_trace_consume_entry(void *cookie, unsigned long addr, - bool reliable) +static bool stack_trace_consume_entry(void *cookie, unsigned long addr) { struct stacktrace_cookie *c = cookie; @@ -94,12 +93,11 @@ static bool stack_trace_consume_entry(void *cookie, unsigned long addr, return c->len < c->size; } -static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr, - bool reliable) +static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr) { if (in_sched_functions(addr)) return true; - return stack_trace_consume_entry(cookie, addr, reliable); + return stack_trace_consume_entry(cookie, addr); } /** -- cgit v1.2.3 From 3031313eb3d549b7ad6f9fbcc52ba04412e3eb9e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 1 Sep 2020 00:12:07 +0900 Subject: kprobes: Fix to check probe enabled before disarm_kprobe_ftrace() Commit 0cb2f1372baa ("kprobes: Fix NULL pointer dereference at kprobe_ftrace_handler") fixed one bug but not completely fixed yet. If we run a kprobe_module.tc of ftracetest, kernel showed a warning as below. # ./ftracetest test.d/kprobe/kprobe_module.tc === Ftrace unit tests === [1] Kprobe dynamic event - probing module ... [ 22.400215] ------------[ cut here ]------------ [ 22.400962] Failed to disarm kprobe-ftrace at trace_printk_irq_work+0x0/0x7e [trace_printk] (-2) [ 22.402139] WARNING: CPU: 7 PID: 200 at kernel/kprobes.c:1091 __disarm_kprobe_ftrace.isra.0+0x7e/0xa0 [ 22.403358] Modules linked in: trace_printk(-) [ 22.404028] CPU: 7 PID: 200 Comm: rmmod Not tainted 5.9.0-rc2+ #66 [ 22.404870] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1 04/01/2014 [ 22.406139] RIP: 0010:__disarm_kprobe_ftrace.isra.0+0x7e/0xa0 [ 22.406947] Code: 30 8b 03 eb c9 80 3d e5 09 1f 01 00 75 dc 49 8b 34 24 89 c2 48 c7 c7 a0 c2 05 82 89 45 e4 c6 05 cc 09 1f 01 01 e8 a9 c7 f0 ff <0f> 0b 8b 45 e4 eb b9 89 c6 48 c7 c7 70 c2 05 82 89 45 e4 e8 91 c7 [ 22.409544] RSP: 0018:ffffc90000237df0 EFLAGS: 00010286 [ 22.410385] RAX: 0000000000000000 RBX: ffffffff83066024 RCX: 0000000000000000 [ 22.411434] RDX: 0000000000000001 RSI: ffffffff810de8d3 RDI: ffffffff810de8d3 [ 22.412687] RBP: ffffc90000237e10 R08: 0000000000000001 R09: 0000000000000001 [ 22.413762] R10: 0000000000000000 R11: 0000000000000001 R12: ffff88807c478640 [ 22.414852] R13: ffffffff8235ebc0 R14: ffffffffa00060c0 R15: 0000000000000000 [ 22.415941] FS: 00000000019d48c0(0000) GS:ffff88807d7c0000(0000) knlGS:0000000000000000 [ 22.417264] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 22.418176] CR2: 00000000005bb7e3 CR3: 0000000078f7a000 CR4: 00000000000006a0 [ 22.419309] Call Trace: [ 22.419990] kill_kprobe+0x94/0x160 [ 22.420652] kprobes_module_callback+0x64/0x230 [ 22.421470] notifier_call_chain+0x4f/0x70 [ 22.422184] blocking_notifier_call_chain+0x49/0x70 [ 22.422979] __x64_sys_delete_module+0x1ac/0x240 [ 22.423733] do_syscall_64+0x38/0x50 [ 22.424366] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 22.425176] RIP: 0033:0x4bb81d [ 22.425741] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e0 ff ff ff f7 d8 64 89 01 48 [ 22.428726] RSP: 002b:00007ffc70fef008 EFLAGS: 00000246 ORIG_RAX: 00000000000000b0 [ 22.430169] RAX: ffffffffffffffda RBX: 00000000019d48a0 RCX: 00000000004bb81d [ 22.431375] RDX: 0000000000000000 RSI: 0000000000000880 RDI: 00007ffc70fef028 [ 22.432543] RBP: 0000000000000880 R08: 00000000ffffffff R09: 00007ffc70fef320 [ 22.433692] R10: 0000000000656300 R11: 0000000000000246 R12: 00007ffc70fef028 [ 22.434635] R13: 0000000000000000 R14: 0000000000000002 R15: 0000000000000000 [ 22.435682] irq event stamp: 1169 [ 22.436240] hardirqs last enabled at (1179): [] console_unlock+0x422/0x580 [ 22.437466] hardirqs last disabled at (1188): [] console_unlock+0x7b/0x580 [ 22.438608] softirqs last enabled at (866): [] __do_softirq+0x38e/0x490 [ 22.439637] softirqs last disabled at (859): [] asm_call_on_stack+0x12/0x20 [ 22.440690] ---[ end trace 1e7ce7e1e4567276 ]--- [ 22.472832] trace_kprobe: This probe might be able to register after target module is loaded. Continue. This is because the kill_kprobe() calls disarm_kprobe_ftrace() even if the given probe is not enabled. In that case, ftrace_set_filter_ip() fails because the given probe point is not registered to ftrace. Fix to check the given (going) probe is enabled before invoking disarm_kprobe_ftrace(). Link: https://lkml.kernel.org/r/159888672694.1411785.5987998076694782591.stgit@devnote2 Fixes: 0cb2f1372baa ("kprobes: Fix NULL pointer dereference at kprobe_ftrace_handler") Cc: Ingo Molnar Cc: "Naveen N . Rao" Cc: Anil S Keshavamurthy Cc: David Miller Cc: Muchun Song Cc: Chengming Zhou Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 287b263c9cb9..d43b48ecdb4f 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2159,9 +2159,10 @@ static void kill_kprobe(struct kprobe *p) /* * The module is going away. We should disarm the kprobe which - * is using ftrace. + * is using ftrace, because ftrace framework is still available at + * MODULE_STATE_GOING notification. */ - if (kprobe_ftrace(p)) + if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed) disarm_kprobe_ftrace(p); } -- cgit v1.2.3 From d5e47505e02666d5b3fe99212e0100791847dc8d Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Mon, 31 Aug 2020 14:26:31 +0200 Subject: ftrace: Free the trampoline when ftrace_startup() fails Commit fc0ea795f53c ("ftrace: Add symbols for ftrace trampolines") missed to remove ops from new ftrace_ops_trampoline_list in ftrace_startup() if ftrace_hash_ipmodify_enable() fails there. It may lead to BUG if such ops come from a module which may be removed. Moreover, the trampoline itself is not freed in this case. Fix it by calling ftrace_trampoline_free() during the rollback. Link: https://lkml.kernel.org/r/20200831122631.28057-1-mbenes@suse.cz Fixes: fc0ea795f53c ("ftrace: Add symbols for ftrace trampolines") Fixes: f8b8be8a310a ("ftrace, kprobes: Support IPMODIFY flag to find IP modify conflict") Signed-off-by: Miroslav Benes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 275441254bb5..656d7cb5a78c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2862,6 +2862,8 @@ int ftrace_startup(struct ftrace_ops *ops, int command) __unregister_ftrace_function(ops); ftrace_start_up--; ops->flags &= ~FTRACE_OPS_FL_ENABLED; + if (ops->flags & FTRACE_OPS_FL_DYNAMIC) + ftrace_trampoline_free(ops); return ret; } -- cgit v1.2.3 From 478ece9573f04aed834e05e0bbf034320d240f9f Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 1 Sep 2020 12:16:17 +0300 Subject: ftrace: Fix missing synchronize_rcu() removing trampoline from kallsyms Add synchronize_rcu() after list_del_rcu() in ftrace_remove_trampoline_from_kallsyms() to protect readers of ftrace_ops_trampoline_list (in ftrace_get_trampoline_kallsym) which is used when kallsyms is read. Link: https://lkml.kernel.org/r/20200901091617.31837-1-adrian.hunter@intel.com Fixes: fc0ea795f53c8d ("ftrace: Add symbols for ftrace trampolines") Signed-off-by: Adrian Hunter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 656d7cb5a78c..e9f893388245 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2782,6 +2782,7 @@ static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops) { lockdep_assert_held(&ftrace_lock); list_del_rcu(&ops->list); + synchronize_rcu(); } /* -- cgit v1.2.3 From 795d6379a47bcbb88bd95a69920e4acc52849f88 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 4 Sep 2020 10:23:31 +0200 Subject: tracing: Make the space reserved for the pid wider For 64bit CONFIG_BASE_SMALL=0 systems PID_MAX_LIMIT is set by default to 4194304. During boot the kernel sets a new value based on number of CPUs but no lower than 32768. It is 1024 per CPU so with 128 CPUs the default becomes 131072 which needs six digits. This value can be increased during run time but must not exceed the initial upper limit. Systemd sometime after v241 sets it to the upper limit during boot. The result is that when the pid exceeds five digits, the trace output is a little hard to read because it is no longer properly padded (same like on big iron with 98+ CPUs). Increase the pid padding to seven digits. Link: https://lkml.kernel.org/r/20200904082331.dcdkrr3bkn3e4qlg@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 38 +++++++++++++++++++------------------- kernel/trace/trace_output.c | 12 ++++++------ 2 files changed, 25 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f40d850ebabc..2a7c26345e83 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3782,14 +3782,14 @@ unsigned long trace_total_entries(struct trace_array *tr) static void print_lat_help_header(struct seq_file *m) { - seq_puts(m, "# _------=> CPU# \n" - "# / _-----=> irqs-off \n" - "# | / _----=> need-resched \n" - "# || / _---=> hardirq/softirq \n" - "# ||| / _--=> preempt-depth \n" - "# |||| / delay \n" - "# cmd pid ||||| time | caller \n" - "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# _------=> CPU# \n" + "# / _-----=> irqs-off \n" + "# | / _----=> need-resched \n" + "# || / _---=> hardirq/softirq \n" + "# ||| / _--=> preempt-depth \n" + "# |||| / delay \n" + "# cmd pid ||||| time | caller \n" + "# \\ / ||||| \\ | / \n"); } static void print_event_info(struct array_buffer *buf, struct seq_file *m) @@ -3810,26 +3810,26 @@ static void print_func_help_header(struct array_buffer *buf, struct seq_file *m, print_event_info(buf, m); - seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); - seq_printf(m, "# | | %s | | |\n", tgid ? " | " : ""); + seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? " TGID " : ""); + seq_printf(m, "# | | %s | | |\n", tgid ? " | " : ""); } static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m, unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; - const char *space = " "; - int prec = tgid ? 10 : 2; + const char *space = " "; + int prec = tgid ? 12 : 2; print_event_info(buf, m); - seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); - seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); - seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); - seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); - seq_printf(m, "# %.*s||| / delay\n", prec, space); - seq_printf(m, "# TASK-PID %.*sCPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); - seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); + seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); + seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); + seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); + seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); + seq_printf(m, "# %.*s||| / delay\n", prec, space); + seq_printf(m, "# TASK-PID %.*s CPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); + seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); } void diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 4d1893564912..000e9dc224c6 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -497,7 +497,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) trace_find_cmdline(entry->pid, comm); - trace_seq_printf(s, "%8.8s-%-5d %3d", + trace_seq_printf(s, "%8.8s-%-7d %3d", comm, entry->pid, cpu); return trace_print_lat_fmt(s, entry); @@ -588,15 +588,15 @@ int trace_print_context(struct trace_iterator *iter) trace_find_cmdline(entry->pid, comm); - trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); + trace_seq_printf(s, "%16s-%-7d ", comm, entry->pid); if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { unsigned int tgid = trace_find_tgid(entry->pid); if (!tgid) - trace_seq_printf(s, "(-----) "); + trace_seq_printf(s, "(-------) "); else - trace_seq_printf(s, "(%5d) ", tgid); + trace_seq_printf(s, "(%7d) ", tgid); } trace_seq_printf(s, "[%03d] ", iter->cpu); @@ -636,7 +636,7 @@ int trace_print_lat_context(struct trace_iterator *iter) trace_find_cmdline(entry->pid, comm); trace_seq_printf( - s, "%16s %5d %3d %d %08x %08lx ", + s, "%16s %7d %3d %d %08x %08lx ", comm, entry->pid, iter->cpu, entry->flags, entry->preempt_count, iter->idx); } else { @@ -917,7 +917,7 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, S = task_index_to_char(field->prev_state); trace_find_cmdline(field->next_pid, comm); trace_seq_printf(&iter->seq, - " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", + " %7d:%3d:%c %s [%03d] %7d:%3d:%c %s\n", field->prev_pid, field->prev_prio, S, delim, -- cgit v1.2.3 From 54fa9ba564b717fc2cf689427e195c360315999d Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 7 Sep 2020 11:32:07 +0200 Subject: ftrace: Let ftrace_enable_sysctl take a kernel pointer buffer Commit 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") changed ctl_table.proc_handler to take a kernel pointer. Adjust the signature of ftrace_enable_sysctl to match ctl_table.proc_handler which fixes the following sparse warning: kernel/trace/ftrace.c:7544:43: warning: incorrect type in argument 3 (different address spaces) kernel/trace/ftrace.c:7544:43: expected void * kernel/trace/ftrace.c:7544:43: got void [noderef] __user *buffer Link: https://lkml.kernel.org/r/20200907093207.13540-1-tklauser@distanz.ch Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Cc: Andrew Morton Cc: Ingo Molnar Cc: Christoph Hellwig Cc: Al Viro Signed-off-by: Tobias Klauser Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e9f893388245..603255f5f085 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7534,8 +7534,7 @@ static bool is_permanent_ops_registered(void) int ftrace_enable_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = -ENODEV; -- cgit v1.2.3 From 46bbe5c671e06f070428b9be142cc4ee5cedebac Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 7 Sep 2020 06:58:45 -0700 Subject: tracing: fix double free clang static analyzer reports this problem trace_events_hist.c:3824:3: warning: Attempt to free released memory kfree(hist_data->attrs->var_defs.name[i]); In parse_var_defs() if there is a problem allocating var_defs.expr, the earlier var_defs.name is freed. This free is duplicated by free_var_defs() which frees the rest of the list. Because free_var_defs() has to run anyway, remove the second free fom parse_var_defs(). Link: https://lkml.kernel.org/r/20200907135845.15804-1-trix@redhat.com Cc: stable@vger.kernel.org Fixes: 30350d65ac56 ("tracing: Add variable support to hist triggers") Reviewed-by: Tom Zanussi Signed-off-by: Tom Rix Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0b933546142e..1b2ef6490229 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3865,7 +3865,6 @@ static int parse_var_defs(struct hist_trigger_data *hist_data) s = kstrdup(field_str, GFP_KERNEL); if (!s) { - kfree(hist_data->attrs->var_defs.name[n_vars]); ret = -ENOMEM; goto free; } -- cgit v1.2.3 From 82d083ab60c3693201c6f5c7a5f23a6ed422098d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2020 17:55:05 +0900 Subject: kprobes: tracing/kprobes: Fix to kill kprobes on initmem after boot Since kprobe_event= cmdline option allows user to put kprobes on the functions in initmem, kprobe has to make such probes gone after boot. Currently the probes on the init functions in modules will be handled by module callback, but the kernel init text isn't handled. Without this, kprobes may access non-exist text area to disable or remove it. Link: https://lkml.kernel.org/r/159972810544.428528.1839307531600646955.stgit@devnote2 Fixes: 970988e19eb0 ("tracing/kprobe: Add kprobe_event= boot parameter") Cc: Jonathan Corbet Cc: Shuah Khan Cc: Randy Dunlap Cc: Ingo Molnar Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d43b48ecdb4f..d750e025d1ff 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2453,6 +2453,28 @@ static struct notifier_block kprobe_module_nb = { extern unsigned long __start_kprobe_blacklist[]; extern unsigned long __stop_kprobe_blacklist[]; +void kprobe_free_init_mem(void) +{ + void *start = (void *)(&__init_begin); + void *end = (void *)(&__init_end); + struct hlist_head *head; + struct kprobe *p; + int i; + + mutex_lock(&kprobe_mutex); + + /* Kill all kprobes on initmem */ + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry(p, head, hlist) { + if (start <= (void *)p->addr && (void *)p->addr < end) + kill_kprobe(p); + } + } + + mutex_unlock(&kprobe_mutex); +} + static int __init init_kprobes(void) { int i, err = 0; -- cgit v1.2.3 From 70b971118e074d5042715587953f27929e99117a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 16 Sep 2020 13:44:53 -0700 Subject: bpf: Use hlist_add_head_rcu when linking to local_storage The local_storage->list will be traversed by rcu reader in parallel. Thus, hlist_add_head_rcu() is needed in bpf_selem_link_storage_nolock(). This patch fixes it. This part of the code has recently been refactored in bpf-next and this patch makes changes to the new file "bpf_local_storage.c". Instead of using the original offending commit in the Fixes tag, the commit that created the file "bpf_local_storage.c" is used. A separate fix has been provided to the bpf tree. Fixes: 450af8d0f6be ("bpf: Split bpf_local_storage to bpf_sk_storage") Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200916204453.2003915-1-kafai@fb.com --- kernel/bpf/bpf_local_storage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index ffa7d11fc2bd..5d3a7af9ba9b 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -159,7 +159,7 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { RCU_INIT_POINTER(selem->local_storage, local_storage); - hlist_add_head(&selem->snode, &local_storage->list); + hlist_add_head_rcu(&selem->snode, &local_storage->list); } void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) -- cgit v1.2.3 From dc300d77b86a122d3fd099206e1adf699ed80bd7 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sun, 12 Jul 2020 09:10:35 +0800 Subject: tracing: toplevel d_entry already initialized Currently we have following call flow: tracer_init_tracefs() tracing_init_dentry() event_trace_init() tracing_init_dentry() This shows tracing_init_dentry() is called twice in this flow and this is not necessary. Let's remove the second one when it is for sure be properly initialized. Link: https://lkml.kernel.org/r/20200712011036.70948-4-richard.weiyang@linux.alibaba.com Signed-off-by: Wei Yang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a85effb2373b..ee25d849ebba 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3434,7 +3434,6 @@ early_initcall(event_trace_enable_again); __init int event_trace_init(void) { struct trace_array *tr; - struct dentry *d_tracer; struct dentry *entry; int ret; @@ -3442,11 +3441,7 @@ __init int event_trace_init(void) if (!tr) return -ENODEV; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) - return 0; - - entry = tracefs_create_file("available_events", 0444, d_tracer, + entry = tracefs_create_file("available_events", 0444, NULL, tr, &ftrace_avail_fops); if (!entry) pr_warn("Could not create tracefs 'available_events' entry\n"); @@ -3457,7 +3452,7 @@ __init int event_trace_init(void) if (trace_define_common_fields()) pr_warn("tracing: Failed to allocate common fields"); - ret = early_event_add_tracer(d_tracer, tr); + ret = early_event_add_tracer(NULL, tr); if (ret) return ret; -- cgit v1.2.3 From 22c36b18263426bdd97ef5e04c0e92224c612ee1 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sun, 12 Jul 2020 09:10:36 +0800 Subject: tracing: make tracing_init_dentry() returns an integer instead of a d_entry pointer Current tracing_init_dentry() return a d_entry pointer, while is not necessary. This function returns NULL on success or error on failure, which means there is no valid d_entry pointer return. Let's return 0 on success and negative value for error. Link: https://lkml.kernel.org/r/20200712011036.70948-5-richard.weiyang@linux.alibaba.com Signed-off-by: Wei Yang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 36 ++++++++++++++++++------------------ kernel/trace/trace.h | 2 +- kernel/trace/trace_dynevent.c | 8 ++++---- kernel/trace/trace_events_synth.c | 9 +++------ kernel/trace/trace_functions_graph.c | 8 ++++---- kernel/trace/trace_hwlat.c | 8 ++++---- kernel/trace/trace_kprobe.c | 10 +++++----- kernel/trace/trace_printk.c | 8 ++++---- kernel/trace/trace_stack.c | 12 ++++++------ kernel/trace/trace_stat.c | 8 ++++---- kernel/trace/trace_uprobe.c | 9 ++++----- 11 files changed, 57 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2a7c26345e83..29a9034b38d0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8971,21 +8971,21 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) * directory. It is called via fs_initcall() by any of the boot up code * and expects to return the dentry of the top level tracing directory. */ -struct dentry *tracing_init_dentry(void) +int tracing_init_dentry(void) { struct trace_array *tr = &global_trace; if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Tracing disabled due to lockdown\n"); - return ERR_PTR(-EPERM); + return -EPERM; } /* The top level trace array uses NULL as parent */ if (tr->dir) - return NULL; + return 0; if (WARN_ON(!tracefs_initialized())) - return ERR_PTR(-ENODEV); + return -ENODEV; /* * As there may still be users that expect the tracing @@ -8996,7 +8996,7 @@ struct dentry *tracing_init_dentry(void) tr->dir = debugfs_create_automount("tracing", NULL, trace_automount, NULL); - return NULL; + return 0; } extern struct trace_eval_map *__start_ftrace_eval_maps[]; @@ -9083,48 +9083,48 @@ static struct notifier_block trace_module_nb = { static __init int tracer_init_tracefs(void) { - struct dentry *d_tracer; + int ret; trace_access_lock_init(); - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; event_trace_init(); - init_tracer_tracefs(&global_trace, d_tracer); - ftrace_init_tracefs_toplevel(&global_trace, d_tracer); + init_tracer_tracefs(&global_trace, NULL); + ftrace_init_tracefs_toplevel(&global_trace, NULL); - trace_create_file("tracing_thresh", 0644, d_tracer, + trace_create_file("tracing_thresh", 0644, NULL, &global_trace, &tracing_thresh_fops); - trace_create_file("README", 0444, d_tracer, + trace_create_file("README", 0444, NULL, NULL, &tracing_readme_fops); - trace_create_file("saved_cmdlines", 0444, d_tracer, + trace_create_file("saved_cmdlines", 0444, NULL, NULL, &tracing_saved_cmdlines_fops); - trace_create_file("saved_cmdlines_size", 0644, d_tracer, + trace_create_file("saved_cmdlines_size", 0644, NULL, NULL, &tracing_saved_cmdlines_size_fops); - trace_create_file("saved_tgids", 0444, d_tracer, + trace_create_file("saved_tgids", 0444, NULL, NULL, &tracing_saved_tgids_fops); trace_eval_init(); - trace_create_eval_file(d_tracer); + trace_create_eval_file(NULL); #ifdef CONFIG_MODULES register_module_notifier(&trace_module_nb); #endif #ifdef CONFIG_DYNAMIC_FTRACE - trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, + trace_create_file("dyn_ftrace_total_info", 0444, NULL, NULL, &tracing_dyn_info_fops); #endif - create_trace_instances(d_tracer); + create_trace_instances(NULL); update_tracer_options(&global_trace); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 610d21355526..0d3a405fe446 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -737,7 +737,7 @@ struct dentry *trace_create_file(const char *name, void *data, const struct file_operations *fops); -struct dentry *tracing_init_dentry(void); +int tracing_init_dentry(void); struct ring_buffer_event; diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 9f2e8520b748..9442a9bb080e 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -206,14 +206,14 @@ static const struct file_operations dynamic_events_ops = { /* Make a tracefs interface for controlling dynamic events */ static __init int init_dynamic_event(void) { - struct dentry *d_tracer; struct dentry *entry; + int ret; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; - entry = tracefs_create_file("dynamic_events", 0644, d_tracer, + entry = tracefs_create_file("dynamic_events", 0644, NULL, NULL, &dynamic_events_ops); /* Event list interface */ diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index c6cca0d1d584..f86a2aa0bccd 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1757,7 +1757,6 @@ static const struct file_operations synth_events_fops = { static __init int trace_events_synth_init(void) { struct dentry *entry = NULL; - struct dentry *d_tracer; int err = 0; err = dyn_event_register(&synth_event_ops); @@ -1766,13 +1765,11 @@ static __init int trace_events_synth_init(void) return err; } - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) { - err = PTR_ERR(d_tracer); + err = tracing_init_dentry(); + if (err) goto err; - } - entry = tracefs_create_file("synthetic_events", 0644, d_tracer, + entry = tracefs_create_file("synthetic_events", 0644, NULL, NULL, &synth_events_fops); if (!entry) { err = -ENODEV; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4a9c49c08ec9..60d66278aa0d 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1336,13 +1336,13 @@ static const struct file_operations graph_depth_fops = { static __init int init_graph_tracefs(void) { - struct dentry *d_tracer; + int ret; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; - trace_create_file("max_graph_depth", 0644, d_tracer, + trace_create_file("max_graph_depth", 0644, NULL, NULL, &graph_depth_fops); return 0; diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 17873e5d0353..c9ad5c6fbaad 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -538,14 +538,14 @@ static const struct file_operations window_fops = { */ static int init_tracefs(void) { - struct dentry *d_tracer; + int ret; struct dentry *top_dir; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return -ENOMEM; - top_dir = tracefs_create_dir("hwlat_detector", d_tracer); + top_dir = tracefs_create_dir("hwlat_detector", NULL); if (!top_dir) return -ENOMEM; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index aefb6065b508..feca9b19cd74 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1901,14 +1901,14 @@ subsys_initcall(init_kprobe_trace_early); /* Make a tracefs interface for controlling probe points */ static __init int init_kprobe_trace(void) { - struct dentry *d_tracer; + int ret; struct dentry *entry; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; - entry = tracefs_create_file("kprobe_events", 0644, d_tracer, + entry = tracefs_create_file("kprobe_events", 0644, NULL, NULL, &kprobe_events_ops); /* Event list interface */ @@ -1916,7 +1916,7 @@ static __init int init_kprobe_trace(void) pr_warn("Could not create tracefs 'kprobe_events' entry\n"); /* Profile interface */ - entry = tracefs_create_file("kprobe_profile", 0444, d_tracer, + entry = tracefs_create_file("kprobe_profile", 0444, NULL, NULL, &kprobe_profile_ops); if (!entry) diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index d4e31e969206..71b2e0fdc3e0 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -367,13 +367,13 @@ static const struct file_operations ftrace_formats_fops = { static __init int init_trace_printk_function_export(void) { - struct dentry *d_tracer; + int ret; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; - trace_create_file("printk_formats", 0444, d_tracer, + trace_create_file("printk_formats", 0444, NULL, NULL, &ftrace_formats_fops); return 0; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 98bba4764c52..c408423e5d65 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -554,20 +554,20 @@ __setup("stacktrace", enable_stacktrace); static __init int stack_trace_init(void) { - struct dentry *d_tracer; + int ret; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; - trace_create_file("stack_max_size", 0644, d_tracer, + trace_create_file("stack_max_size", 0644, NULL, &stack_trace_max_size, &stack_max_size_fops); - trace_create_file("stack_trace", 0444, d_tracer, + trace_create_file("stack_trace", 0444, NULL, NULL, &stack_trace_fops); #ifdef CONFIG_DYNAMIC_FTRACE - trace_create_file("stack_trace_filter", 0644, d_tracer, + trace_create_file("stack_trace_filter", 0644, NULL, &trace_ops, &stack_trace_filter_fops); #endif diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index d1fa19773cc8..8d141c3825a9 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -276,13 +276,13 @@ static const struct file_operations tracing_stat_fops = { static int tracing_stat_init(void) { - struct dentry *d_tracing; + int ret; - d_tracing = tracing_init_dentry(); - if (IS_ERR(d_tracing)) + ret = tracing_init_dentry(); + if (ret) return -ENODEV; - stat_dir = tracefs_create_dir("trace_stat", d_tracing); + stat_dir = tracefs_create_dir("trace_stat", NULL); if (!stat_dir) { pr_warn("Could not create tracefs 'trace_stat' entry\n"); return -ENOMEM; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f4286c9bdeb4..56729c6b6614 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1625,21 +1625,20 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call) /* Make a trace interface for controling probe points */ static __init int init_uprobe_trace(void) { - struct dentry *d_tracer; int ret; ret = dyn_event_register(&trace_uprobe_ops); if (ret) return ret; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; - trace_create_file("uprobe_events", 0644, d_tracer, + trace_create_file("uprobe_events", 0644, NULL, NULL, &uprobe_events_ops); /* Profile interface */ - trace_create_file("uprobe_profile", 0444, d_tracer, + trace_create_file("uprobe_profile", 0444, NULL, NULL, &uprobe_profile_ops); return 0; } -- cgit v1.2.3 From b0399092ccebd9feef68d4ceb8d6219a8c0caa05 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 18 Sep 2020 21:20:21 -0700 Subject: kprobes: fix kill kprobe which has been marked as gone If a kprobe is marked as gone, we should not kill it again. Otherwise, we can disarm the kprobe more than once. In that case, the statistics of kprobe_ftrace_enabled can unbalance which can lead to that kprobe do not work. Fixes: e8386a0cb22f ("kprobes: support probing module __exit function") Co-developed-by: Chengming Zhou Signed-off-by: Muchun Song Signed-off-by: Chengming Zhou Signed-off-by: Andrew Morton Acked-by: Masami Hiramatsu Cc: "Naveen N . Rao" Cc: Anil S Keshavamurthy Cc: David S. Miller Cc: Song Liu Cc: Steven Rostedt Cc: Link: https://lkml.kernel.org/r/20200822030055.32383-1-songmuchun@bytedance.com Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 287b263c9cb9..049da84e1952 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2140,6 +2140,9 @@ static void kill_kprobe(struct kprobe *p) lockdep_assert_held(&kprobe_mutex); + if (WARN_ON_ONCE(kprobe_gone(p))) + return; + p->flags |= KPROBE_FLAG_GONE; if (kprobe_aggrprobe(p)) { /* @@ -2419,7 +2422,10 @@ static int kprobes_module_callback(struct notifier_block *nb, mutex_lock(&kprobe_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry(p, head, hlist) + hlist_for_each_entry(p, head, hlist) { + if (kprobe_gone(p)) + continue; + if (within_module_init((unsigned long)p->addr, mod) || (checkcore && within_module_core((unsigned long)p->addr, mod))) { @@ -2436,6 +2442,7 @@ static int kprobes_module_callback(struct notifier_block *nb, */ kill_kprobe(p); } + } } if (val == MODULE_STATE_GOING) remove_module_kprobe_blacklist(mod); -- cgit v1.2.3 From 7bb82ac30c3dd4ecf1485685cbe84d2ba10dddf4 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Fri, 18 Sep 2020 21:20:34 -0700 Subject: ftrace: let ftrace_enable_sysctl take a kernel pointer buffer Commit 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") changed ctl_table.proc_handler to take a kernel pointer. Adjust the signature of ftrace_enable_sysctl to match ctl_table.proc_handler which fixes the following sparse warning: kernel/trace/ftrace.c:7544:43: warning: incorrect type in argument 3 (different address spaces) kernel/trace/ftrace.c:7544:43: expected void * kernel/trace/ftrace.c:7544:43: got void [noderef] __user *buffer Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Signed-off-by: Tobias Klauser Signed-off-by: Andrew Morton Cc: Christoph Hellwig Cc: Al Viro Link: https://lkml.kernel.org/r/20200907093207.13540-1-tklauser@distanz.ch Signed-off-by: Linus Torvalds --- kernel/trace/ftrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 275441254bb5..e9fa580f3083 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7531,8 +7531,7 @@ static bool is_permanent_ops_registered(void) int ftrace_enable_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = -ENODEV; -- cgit v1.2.3 From 4773ef33fc6e59bad2e5d19e334de2fa79c27b74 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Fri, 18 Sep 2020 21:20:37 -0700 Subject: stackleak: let stack_erasing_sysctl take a kernel pointer buffer Commit 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") changed ctl_table.proc_handler to take a kernel pointer. Adjust the signature of stack_erasing_sysctl to match ctl_table.proc_handler which fixes the following sparse warning: kernel/stackleak.c:31:50: warning: incorrect type in argument 3 (different address spaces) kernel/stackleak.c:31:50: expected void * kernel/stackleak.c:31:50: got void [noderef] __user *buffer Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Signed-off-by: Tobias Klauser Signed-off-by: Andrew Morton Cc: Christoph Hellwig Cc: Al Viro Link: https://lkml.kernel.org/r/20200907093253.13656-1-tklauser@distanz.ch Signed-off-by: Linus Torvalds --- kernel/stackleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stackleak.c b/kernel/stackleak.c index a8fc9ae1d03d..ce161a8e8d97 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -20,7 +20,7 @@ static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass); int stack_erasing_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; int state = !static_branch_unlikely(&stack_erasing_bypass); -- cgit v1.2.3 From 3ad1c8ef083bef96ec922688966484be1039e6b5 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 21 Sep 2020 12:31:36 +0200 Subject: rcu/tree: Export rcu_idle_{enter,exit} to modules Fix this link error: ERROR: modpost: "rcu_idle_enter" [drivers/acpi/processor.ko] undefined! ERROR: modpost: "rcu_idle_exit" [drivers/acpi/processor.ko] undefined! when CONFIG_ACPI_PROCESSOR is built as module. PeterZ says that in light of ARM needing those soon too, they should simply be exported. Fixes: 1fecfdbb7acc ("ACPI: processor: Take over RCU-idle for C3-BM idle") Reported-by: Sven Joachim Suggested-by: Peter Zijlstra Signed-off-by: Borislav Petkov Reviewed-by: Paul E. McKenney Signed-off-by: Rafael J. Wysocki --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8ce77d9ac716..f78ee759af9c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -673,6 +673,7 @@ void rcu_idle_enter(void) lockdep_assert_irqs_disabled(); rcu_eqs_enter(false); } +EXPORT_SYMBOL_GPL(rcu_idle_enter); #ifdef CONFIG_NO_HZ_FULL /** @@ -886,6 +887,7 @@ void rcu_idle_exit(void) rcu_eqs_exit(false); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(rcu_idle_exit); #ifdef CONFIG_NO_HZ_FULL /** -- cgit v1.2.3 From a97740f81874c8063c12c24f34d25f10c4f5e9aa Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Mon, 21 Sep 2020 07:09:26 +0000 Subject: dma-debug: convert comma to semicolon Replace a comma between expression statements by a semicolon. Signed-off-by: Xu Wang Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 6f53c2e03aa4..4211800d9f3e 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -1219,7 +1219,7 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, entry->dev = dev; entry->type = dma_debug_single; entry->pfn = page_to_pfn(page); - entry->offset = offset, + entry->offset = offset; entry->dev_addr = dma_addr; entry->size = size; entry->direction = direction; @@ -1308,7 +1308,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, entry->type = dma_debug_sg; entry->dev = dev; entry->pfn = page_to_pfn(sg_page(s)); - entry->offset = s->offset, + entry->offset = s->offset; entry->size = sg_dma_len(s); entry->dev_addr = sg_dma_address(s); entry->direction = direction; -- cgit v1.2.3 From 31f23a6a181c81543b10a1a9056b0e6c7ef1c747 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 17 Sep 2020 15:44:53 +0800 Subject: bpf: Fix potential call bpf_link_free() in atomic context The in_atomic() macro cannot always detect atomic context, in particular, it cannot know about held spinlocks in non-preemptible kernels. Although, there is no user call bpf_link_put() with holding spinlock now, be on the safe side, so we can avoid this in the future. Signed-off-by: Muchun Song Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200917074453.20621-1-songmuchun@bytedance.com --- kernel/bpf/syscall.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2ce32cad5c8e..ec68d3a23a2b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2345,12 +2345,8 @@ void bpf_link_put(struct bpf_link *link) if (!atomic64_dec_and_test(&link->refcnt)) return; - if (in_atomic()) { - INIT_WORK(&link->work, bpf_link_put_deferred); - schedule_work(&link->work); - } else { - bpf_link_free(link); - } + INIT_WORK(&link->work, bpf_link_put_deferred); + schedule_work(&link->work); } static int bpf_link_release(struct inode *inode, struct file *filp) -- cgit v1.2.3 From e23bb04b0c938588eae41b7f4712b722290ed2b8 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Sat, 19 Sep 2020 22:01:33 -0700 Subject: bpf: Fix sysfs export of empty BTF section If BTF data is missing or removed from the ELF section it is still exported via sysfs as a zero-length file: root@OpenWrt:/# ls -l /sys/kernel/btf/vmlinux -r--r--r-- 1 root root 0 Jul 18 02:59 /sys/kernel/btf/vmlinux Moreover, reads from this file succeed and leak kernel data: root@OpenWrt:/# hexdump -C /sys/kernel/btf/vmlinux|head -10 000000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| * 000cc0 00 00 00 00 00 00 00 00 00 00 00 00 80 83 b0 80 |................| 000cd0 00 10 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| 000ce0 00 00 00 00 00 00 00 00 00 00 00 00 57 ac 6e 9d |............W.n.| 000cf0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| * 002650 00 00 00 00 00 00 00 10 00 00 00 01 00 00 00 01 |................| 002660 80 82 9a c4 80 85 97 80 81 a9 51 68 00 00 00 02 |..........Qh....| 002670 80 25 44 dc 80 85 97 80 81 a9 50 24 81 ab c4 60 |.%D.......P$...`| This situation was first observed with kernel 5.4.x, cross-compiled for a MIPS target system. Fix by adding a sanity-check for export of zero-length data sections. Fixes: 341dfcf8d78e ("btf: expose BTF info through sysfs") Signed-off-by: Tony Ambardar Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/b38db205a66238f70823039a8c531535864eaac5.1600417359.git.Tony.Ambardar@gmail.com --- kernel/bpf/sysfs_btf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 3b495773de5a..11b3380887fa 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -30,15 +30,15 @@ static struct kobject *btf_kobj; static int __init btf_vmlinux_init(void) { - if (!__start_BTF) + bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; + + if (!__start_BTF || bin_attr_btf_vmlinux.size == 0) return 0; btf_kobj = kobject_create_and_add("btf", kernel_kobj); if (!btf_kobj) return -ENOMEM; - bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; - return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux); } -- cgit v1.2.3 From 2af30f115d6957f372ce3096c7198763ff253d97 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:17 +0100 Subject: btf: Make btf_set_contains take a const pointer bsearch doesn't modify the contents of the array, so we can take a const pointer. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200921121227.255763-2-lmb@cloudflare.com --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f9ac6935ab3c..a2330f6fe2e6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4772,7 +4772,7 @@ static int btf_id_cmp_func(const void *a, const void *b) return *pa - *pb; } -bool btf_id_set_contains(struct btf_id_set *set, u32 id) +bool btf_id_set_contains(const struct btf_id_set *set, u32 id) { return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; } -- cgit v1.2.3 From 0d004c020b5574e51f7a525e57d2a11958b334b5 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:18 +0100 Subject: bpf: Check scalar or invalid register in check_helper_mem_access Move the check for a NULL or zero register to check_helper_mem_access. This makes check_stack_boundary easier to understand. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200921121227.255763-3-lmb@cloudflare.com --- kernel/bpf/verifier.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4161b6c406bc..06238395f244 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3641,18 +3641,6 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, struct bpf_func_state *state = func(env, reg); int err, min_off, max_off, i, j, slot, spi; - if (reg->type != PTR_TO_STACK) { - /* Allow zero-byte read from NULL, regardless of pointer type */ - if (zero_size_allowed && access_size == 0 && - register_is_null(reg)) - return 0; - - verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[reg->type], - reg_type_str[PTR_TO_STACK]); - return -EACCES; - } - if (tnum_is_const(reg->var_off)) { min_off = max_off = reg->var_off.value + reg->off; err = __check_stack_boundary(env, regno, min_off, access_size, @@ -3797,9 +3785,19 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, access_size, zero_size_allowed, "rdwr", &env->prog->aux->max_rdwr_access); - default: /* scalar_value|ptr_to_stack or invalid ptr */ + case PTR_TO_STACK: return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); + default: /* scalar_value or invalid ptr */ + /* Allow zero-byte read from NULL, regardless of pointer type */ + if (zero_size_allowed && access_size == 0 && + register_is_null(reg)) + return 0; + + verbose(env, "R%d type=%s expected=%s\n", regno, + reg_type_str[reg->type], + reg_type_str[PTR_TO_STACK]); + return -EACCES; } } -- cgit v1.2.3 From 9436ef6e862b9ca22e5b12f87b106e07d5af4cae Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:20 +0100 Subject: bpf: Allow specifying a BTF ID per argument in function protos Function prototypes using ARG_PTR_TO_BTF_ID currently use two ways to signal which BTF IDs are acceptable. First, bpf_func_proto.btf_id is an array of IDs, one for each argument. This array is only accessed up to the highest numbered argument that uses ARG_PTR_TO_BTF_ID and may therefore be less than five arguments long. It usually points at a BTF_ID_LIST. Second, check_btf_id is a function pointer that is called by the verifier if present. It gets the actual BTF ID of the register, and the argument number we're currently checking. It turns out that the only user check_arg_btf_id ignores the argument, and is simply used to check whether the BTF ID has a struct sock_common at it's start. Replace both of these mechanisms with an explicit BTF ID for each argument in a function proto. Thanks to btf_struct_ids_match this is very flexible: check_arg_btf_id can be replaced by requiring struct sock_common. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-5-lmb@cloudflare.com --- kernel/bpf/bpf_inode_storage.c | 8 +++----- kernel/bpf/btf.c | 13 ------------- kernel/bpf/stackmap.c | 5 ++--- kernel/bpf/verifier.c | 44 +++++++++++++++++++++--------------------- kernel/trace/bpf_trace.c | 15 +++++--------- 5 files changed, 32 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 75be02799c0f..6edff97ad594 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -249,9 +249,7 @@ const struct bpf_map_ops inode_storage_map_ops = { .map_owner_storage_ptr = inode_storage_ptr, }; -BTF_ID_LIST(bpf_inode_storage_btf_ids) -BTF_ID_UNUSED -BTF_ID(struct, inode) +BTF_ID_LIST_SINGLE(bpf_inode_storage_btf_ids, struct, inode) const struct bpf_func_proto bpf_inode_storage_get_proto = { .func = bpf_inode_storage_get, @@ -259,9 +257,9 @@ const struct bpf_func_proto bpf_inode_storage_get_proto = { .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_inode_storage_btf_ids[0], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, - .btf_id = bpf_inode_storage_btf_ids, }; const struct bpf_func_proto bpf_inode_storage_delete_proto = { @@ -270,5 +268,5 @@ const struct bpf_func_proto bpf_inode_storage_delete_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, - .btf_id = bpf_inode_storage_btf_ids, + .arg2_btf_id = &bpf_inode_storage_btf_ids[0], }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a2330f6fe2e6..5d3c36e13139 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4193,19 +4193,6 @@ again: return true; } -int btf_resolve_helper_id(struct bpf_verifier_log *log, - const struct bpf_func_proto *fn, int arg) -{ - int id; - - if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID || !btf_vmlinux) - return -EINVAL; - id = fn->btf_id[arg]; - if (!id || id > btf_vmlinux->nr_types) - return -EINVAL; - return id; -} - static int __get_type_size(struct btf *btf, u32 btf_id, const struct btf_type **bad_type) { diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index a2fa006f430e..06065fa27124 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -665,18 +665,17 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, return __bpf_get_stack(regs, task, NULL, buf, size, flags); } -BTF_ID_LIST(bpf_get_task_stack_btf_ids) -BTF_ID(struct, task_struct) +BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct) const struct bpf_func_proto bpf_get_task_stack_proto = { .func = bpf_get_task_stack, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_get_task_stack_btf_ids[0], .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, - .btf_id = bpf_get_task_stack_btf_ids, }; BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 06238395f244..30da34d9b93b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -238,7 +238,6 @@ struct bpf_call_arg_meta { u64 msize_max_value; int ref_obj_id; int func_id; - u32 btf_id; }; struct btf *btf_vmlinux; @@ -4049,29 +4048,23 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, goto err_type; } } else if (arg_type == ARG_PTR_TO_BTF_ID) { - bool ids_match = false; + const u32 *btf_id = fn->arg_btf_id[arg]; expected_type = PTR_TO_BTF_ID; if (type != expected_type) goto err_type; - if (!fn->check_btf_id) { - if (reg->btf_id != meta->btf_id) { - ids_match = btf_struct_ids_match(&env->log, reg->off, reg->btf_id, - meta->btf_id); - if (!ids_match) { - verbose(env, "Helper has type %s got %s in R%d\n", - kernel_type_name(meta->btf_id), - kernel_type_name(reg->btf_id), regno); - return -EACCES; - } - } - } else if (!fn->check_btf_id(reg->btf_id, arg)) { - verbose(env, "Helper does not support %s in R%d\n", - kernel_type_name(reg->btf_id), regno); + if (!btf_id) { + verbose(env, "verifier internal error: missing BTF ID\n"); + return -EFAULT; + } + + if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id, *btf_id)) { + verbose(env, "R%d is of type %s but %s is expected\n", + regno, kernel_type_name(reg->btf_id), kernel_type_name(*btf_id)); return -EACCES; } - if ((reg->off && !ids_match) || !tnum_is_const(reg->var_off) || reg->var_off.value) { + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", regno); return -EACCES; @@ -4545,10 +4538,22 @@ static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id) return count <= 1; } +static bool check_btf_id_ok(const struct bpf_func_proto *fn) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) + if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) + return false; + + return true; +} + static int check_func_proto(const struct bpf_func_proto *fn, int func_id) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && + check_btf_id_ok(fn) && check_refcount_ok(fn, func_id) ? 0 : -EINVAL; } @@ -4944,11 +4949,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ for (i = 0; i < 5; i++) { - if (!fn->check_btf_id) { - err = btf_resolve_helper_id(&env->log, fn, i); - if (err > 0) - meta.btf_id = err; - } err = check_func_arg(env, i, &meta, fn); if (err) return err; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b2a5380eb187..ebf9be4d0d6a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -743,19 +743,18 @@ out: return err; } -BTF_ID_LIST(bpf_seq_printf_btf_ids) -BTF_ID(struct, seq_file) +BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file) static const struct bpf_func_proto bpf_seq_printf_proto = { .func = bpf_seq_printf, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM_OR_NULL, .arg5_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_seq_printf_btf_ids, }; BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) @@ -763,17 +762,14 @@ BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) return seq_write(m, data, len) ? -EOVERFLOW : 0; } -BTF_ID_LIST(bpf_seq_write_btf_ids) -BTF_ID(struct, seq_file) - static const struct bpf_func_proto bpf_seq_write_proto = { .func = bpf_seq_write, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_seq_write_btf_ids, }; static __always_inline int @@ -1130,17 +1126,16 @@ static bool bpf_d_path_allowed(const struct bpf_prog *prog) return btf_id_set_contains(&btf_allowlist_d_path, prog->aux->attach_btf_id); } -BTF_ID_LIST(bpf_d_path_btf_ids) -BTF_ID(struct, path) +BTF_ID_LIST_SINGLE(bpf_d_path_btf_ids, struct, path) static const struct bpf_func_proto bpf_d_path_proto = { .func = bpf_d_path, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_d_path_btf_ids[0], .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_d_path_btf_ids, .allowed = bpf_d_path_allowed, }; -- cgit v1.2.3 From d7b9454a4f6333bf145189b8e769011d15bdd50e Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:21 +0100 Subject: bpf: Make BTF pointer type checking generic Perform BTF type checks if the register we're working on contains a BTF pointer, rather than if the argument is for a BTF pointer. This is easier to understand, and allows removing the code from the arg_type checking section of the function. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-6-lmb@cloudflare.com --- kernel/bpf/verifier.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 30da34d9b93b..a0e919232968 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4048,27 +4048,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, goto err_type; } } else if (arg_type == ARG_PTR_TO_BTF_ID) { - const u32 *btf_id = fn->arg_btf_id[arg]; - expected_type = PTR_TO_BTF_ID; if (type != expected_type) goto err_type; - - if (!btf_id) { - verbose(env, "verifier internal error: missing BTF ID\n"); - return -EFAULT; - } - - if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id, *btf_id)) { - verbose(env, "R%d is of type %s but %s is expected\n", - regno, kernel_type_name(reg->btf_id), kernel_type_name(*btf_id)); - return -EACCES; - } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", - regno); - return -EACCES; - } } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) @@ -4123,6 +4105,26 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return -EFAULT; } + if (type == PTR_TO_BTF_ID) { + const u32 *btf_id = fn->arg_btf_id[arg]; + + if (!btf_id) { + verbose(env, "verifier internal error: missing BTF ID\n"); + return -EFAULT; + } + + if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id, *btf_id)) { + verbose(env, "R%d is of type %s but %s is expected\n", + regno, kernel_type_name(reg->btf_id), kernel_type_name(*btf_id)); + return -EACCES; + } + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", + regno); + return -EACCES; + } + } + if (arg_type == ARG_CONST_MAP_PTR) { /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ meta->map_ptr = reg->map_ptr; -- cgit v1.2.3 From 02f7c9585d1e2d5d76cac497bd5ced8ecf9d6f56 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:22 +0100 Subject: bpf: Make reference tracking generic Instead of dealing with reg->ref_obj_id individually for every arg type that needs it, rely on the fact that ref_obj_id is zero if the register is not reference tracked. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-7-lmb@cloudflare.com --- kernel/bpf/verifier.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a0e919232968..a4549b2656ad 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4030,15 +4030,6 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ if (!type_is_sk_pointer(type)) goto err_type; - if (reg->ref_obj_id) { - if (meta->ref_obj_id) { - verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", - regno, reg->ref_obj_id, - meta->ref_obj_id); - return -EFAULT; - } - meta->ref_obj_id = reg->ref_obj_id; - } } else if (arg_type == ARG_PTR_TO_SOCKET || arg_type == ARG_PTR_TO_SOCKET_OR_NULL) { expected_type = PTR_TO_SOCKET; @@ -4087,13 +4078,6 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, /* final test in check_stack_boundary() */; else if (type != expected_type) goto err_type; - if (meta->ref_obj_id) { - verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", - regno, reg->ref_obj_id, - meta->ref_obj_id); - return -EFAULT; - } - meta->ref_obj_id = reg->ref_obj_id; } else if (arg_type_is_int_ptr(arg_type)) { expected_type = PTR_TO_STACK; if (!type_is_pkt_pointer(type) && @@ -4125,6 +4109,16 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, } } + if (reg->ref_obj_id) { + if (meta->ref_obj_id) { + verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, + meta->ref_obj_id); + return -EFAULT; + } + meta->ref_obj_id = reg->ref_obj_id; + } + if (arg_type == ARG_CONST_MAP_PTR) { /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ meta->map_ptr = reg->map_ptr; -- cgit v1.2.3 From feec70401672bd9b0268ae59ec5efd15d86ae138 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:23 +0100 Subject: bpf: Make context access check generic Always check context access if the register we're operating on is PTR_TO_CTX, rather than relying on ARG_PTR_TO_CTX. This allows simplifying the arg_type checking section of the function. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-8-lmb@cloudflare.com --- kernel/bpf/verifier.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a4549b2656ad..fc795bac42ed 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4021,9 +4021,6 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, arg_type == ARG_PTR_TO_CTX_OR_NULL)) { if (type != expected_type) goto err_type; - err = check_ctx_reg(env, reg, regno); - if (err < 0) - return err; } } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { expected_type = PTR_TO_SOCK_COMMON; @@ -4107,6 +4104,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, regno); return -EACCES; } + } else if (type == PTR_TO_CTX) { + err = check_ctx_reg(env, reg, regno); + if (err < 0) + return err; } if (reg->ref_obj_id) { -- cgit v1.2.3 From a2bbe7cc90755283f1db719eb757616cefd2a9fd Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:24 +0100 Subject: bpf: Set meta->raw_mode for pointers close to use If we encounter a pointer to memory, we set meta->raw_mode depending on the type of memory we point at. What isn't obvious is that this information is only used when the next memory size argument is encountered. Move the assignment closer to where it's used, and add a comment that explains what is going on. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-9-lmb@cloudflare.com --- kernel/bpf/verifier.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fc795bac42ed..446fbe7f6b49 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4067,7 +4067,6 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, type != PTR_TO_RDWR_BUF && type != expected_type) goto err_type; - meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; } else if (arg_type_is_alloc_mem_ptr(arg_type)) { expected_type = PTR_TO_MEM; if (register_is_null(reg) && @@ -4156,6 +4155,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, meta); + } else if (arg_type_is_mem_ptr(arg_type)) { + /* The access to this pointer is only checked when we hit the + * next is_mem_size argument below. + */ + meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MEM); } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); -- cgit v1.2.3 From c18f0b6aee2aaa6ab2aefd4b9aa1d89142a48824 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:25 +0100 Subject: bpf: Check ARG_PTR_TO_SPINLOCK register type in check_func_arg Move the check for PTR_TO_MAP_VALUE to check_func_arg, where all other checking is done as well. Move the invocation of process_spin_lock away from the register type checking, to allow a future refactoring. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-10-lmb@cloudflare.com --- kernel/bpf/verifier.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 446fbe7f6b49..779091cb5bdd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3828,10 +3828,6 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, struct bpf_map *map = reg->map_ptr; u64 val = reg->var_off.value; - if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "R%d is not a pointer to map_value\n", regno); - return -EINVAL; - } if (!is_const) { verbose(env, "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n", @@ -4040,16 +4036,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, if (type != expected_type) goto err_type; } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { - if (meta->func_id == BPF_FUNC_spin_lock) { - if (process_spin_lock(env, regno, true)) - return -EACCES; - } else if (meta->func_id == BPF_FUNC_spin_unlock) { - if (process_spin_lock(env, regno, false)) - return -EACCES; - } else { - verbose(env, "verifier internal error\n"); - return -EFAULT; - } + expected_type = PTR_TO_MAP_VALUE; + if (type != expected_type) + goto err_type; } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be @@ -4155,6 +4144,17 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, meta); + } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { + if (meta->func_id == BPF_FUNC_spin_lock) { + if (process_spin_lock(env, regno, true)) + return -EACCES; + } else if (meta->func_id == BPF_FUNC_spin_unlock) { + if (process_spin_lock(env, regno, false)) + return -EACCES; + } else { + verbose(env, "verifier internal error\n"); + return -EFAULT; + } } else if (arg_type_is_mem_ptr(arg_type)) { /* The access to this pointer is only checked when we hit the * next is_mem_size argument below. -- cgit v1.2.3 From fd1b0d604c56e0d9f143b39a92132a2ea9625e6d Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:26 +0100 Subject: bpf: Hoist type checking for nullable arg types check_func_arg has a plethora of weird if statements with empty branches. They work around the fact that *_OR_NULL argument types should accept a SCALAR_VALUE register, as long as it's value is 0. These statements make it difficult to reason about the type checking logic. Instead, skip more detailed type checking logic iff the register is 0, and the function expects a nullable type. This allows simplifying the type checking itself. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-11-lmb@cloudflare.com --- kernel/bpf/verifier.c | 64 ++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 779091cb5bdd..129416ea0256 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -435,6 +435,15 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) return type == ARG_PTR_TO_SOCK_COMMON; } +static bool arg_type_may_be_null(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_MAP_VALUE_OR_NULL || + type == ARG_PTR_TO_MEM_OR_NULL || + type == ARG_PTR_TO_CTX_OR_NULL || + type == ARG_PTR_TO_SOCKET_OR_NULL || + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; +} + /* Determine whether the function releases some resources allocated by another * function call. The first reference type argument will be assumed to be * released by release_reference(). @@ -3988,17 +3997,20 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return err; } + if (register_is_null(reg) && arg_type_may_be_null(arg_type)) + /* A NULL register has a SCALAR_VALUE type, so skip + * type checking. + */ + goto skip_type_check; + if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE || arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { expected_type = PTR_TO_STACK; - if (register_is_null(reg) && - arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) - /* final test in check_stack_boundary() */; - else if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != expected_type) + if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && + type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO || @@ -4013,11 +4025,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, } else if (arg_type == ARG_PTR_TO_CTX || arg_type == ARG_PTR_TO_CTX_OR_NULL) { expected_type = PTR_TO_CTX; - if (!(register_is_null(reg) && - arg_type == ARG_PTR_TO_CTX_OR_NULL)) { - if (type != expected_type) - goto err_type; - } + if (type != expected_type) + goto err_type; } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { expected_type = PTR_TO_SOCK_COMMON; /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ @@ -4026,11 +4035,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, } else if (arg_type == ARG_PTR_TO_SOCKET || arg_type == ARG_PTR_TO_SOCKET_OR_NULL) { expected_type = PTR_TO_SOCKET; - if (!(register_is_null(reg) && - arg_type == ARG_PTR_TO_SOCKET_OR_NULL)) { - if (type != expected_type) - goto err_type; - } + if (type != expected_type) + goto err_type; } else if (arg_type == ARG_PTR_TO_BTF_ID) { expected_type = PTR_TO_BTF_ID; if (type != expected_type) @@ -4041,27 +4047,16 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, goto err_type; } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; - /* One exception here. In case function allows for NULL to be - * passed in as argument, it's a SCALAR_VALUE type. Final test - * happens during stack boundary checking. - */ - if (register_is_null(reg) && - (arg_type == ARG_PTR_TO_MEM_OR_NULL || - arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL)) - /* final test in check_stack_boundary() */; - else if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != PTR_TO_MEM && - type != PTR_TO_RDONLY_BUF && - type != PTR_TO_RDWR_BUF && - type != expected_type) + if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && + type != PTR_TO_MEM && + type != PTR_TO_RDONLY_BUF && + type != PTR_TO_RDWR_BUF && + type != expected_type) goto err_type; } else if (arg_type_is_alloc_mem_ptr(arg_type)) { expected_type = PTR_TO_MEM; - if (register_is_null(reg) && - arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL) - /* final test in check_stack_boundary() */; - else if (type != expected_type) + if (type != expected_type) goto err_type; } else if (arg_type_is_int_ptr(arg_type)) { expected_type = PTR_TO_STACK; @@ -4098,6 +4093,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return err; } +skip_type_check: if (reg->ref_obj_id) { if (meta->ref_obj_id) { verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", -- cgit v1.2.3 From f79e7ea571732a6e16f15c6e2f000c347e2d7431 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:27 +0100 Subject: bpf: Use a table to drive helper arg type checks The mapping between bpf_arg_type and bpf_reg_type is encoded in a big hairy if statement that is hard to follow. The debug output also leaves to be desired: if a reg_type doesn't match we only print one of the options, instead printing all the valid ones. Convert the if statement into a table which is then used to drive type checking. If none of the reg_types match we print all options, e.g.: R2 type=rdonly_buf expected=fp, pkt, pkt_meta, map_value Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-12-lmb@cloudflare.com --- kernel/bpf/verifier.c | 183 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 109 insertions(+), 74 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 129416ea0256..15ab889b0a3f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3903,12 +3903,6 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } -static bool arg_type_is_alloc_mem_ptr(enum bpf_arg_type type) -{ - return type == ARG_PTR_TO_ALLOC_MEM || - type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; -} - static bool arg_type_is_alloc_size(enum bpf_arg_type type) { return type == ARG_CONST_ALLOC_SIZE_OR_ZERO; @@ -3957,14 +3951,115 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env, return 0; } +struct bpf_reg_types { + const enum bpf_reg_type types[10]; +}; + +static const struct bpf_reg_types map_key_value_types = { + .types = { + PTR_TO_STACK, + PTR_TO_PACKET, + PTR_TO_PACKET_META, + PTR_TO_MAP_VALUE, + }, +}; + +static const struct bpf_reg_types sock_types = { + .types = { + PTR_TO_SOCK_COMMON, + PTR_TO_SOCKET, + PTR_TO_TCP_SOCK, + PTR_TO_XDP_SOCK, + }, +}; + +static const struct bpf_reg_types mem_types = { + .types = { + PTR_TO_STACK, + PTR_TO_PACKET, + PTR_TO_PACKET_META, + PTR_TO_MAP_VALUE, + PTR_TO_MEM, + PTR_TO_RDONLY_BUF, + PTR_TO_RDWR_BUF, + }, +}; + +static const struct bpf_reg_types int_ptr_types = { + .types = { + PTR_TO_STACK, + PTR_TO_PACKET, + PTR_TO_PACKET_META, + PTR_TO_MAP_VALUE, + }, +}; + +static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } }; +static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; +static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; +static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } }; +static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; +static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; +static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; + +static const struct bpf_reg_types *compatible_reg_types[] = { + [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, + [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, + [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types, + [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types, + [ARG_CONST_SIZE] = &scalar_types, + [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, + [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, + [ARG_CONST_MAP_PTR] = &const_map_ptr_types, + [ARG_PTR_TO_CTX] = &context_types, + [ARG_PTR_TO_CTX_OR_NULL] = &context_types, + [ARG_PTR_TO_SOCK_COMMON] = &sock_types, + [ARG_PTR_TO_SOCKET] = &fullsock_types, + [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types, + [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, + [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, + [ARG_PTR_TO_MEM] = &mem_types, + [ARG_PTR_TO_MEM_OR_NULL] = &mem_types, + [ARG_PTR_TO_UNINIT_MEM] = &mem_types, + [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, + [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, + [ARG_PTR_TO_INT] = &int_ptr_types, + [ARG_PTR_TO_LONG] = &int_ptr_types, + [__BPF_ARG_TYPE_MAX] = NULL, +}; + +static int check_reg_type(struct bpf_verifier_env *env, u32 regno, + const struct bpf_reg_types *compatible) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + enum bpf_reg_type expected, type = reg->type; + int i, j; + + for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { + expected = compatible->types[i]; + if (expected == NOT_INIT) + break; + + if (type == expected) + return 0; + } + + verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]); + for (j = 0; j + 1 < i; j++) + verbose(env, "%s, ", reg_type_str[compatible->types[j]]); + verbose(env, "%s\n", reg_type_str[compatible->types[j]]); + return -EACCES; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn) { u32 regno = BPF_REG_1 + arg; struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - enum bpf_reg_type expected_type, type = reg->type; enum bpf_arg_type arg_type = fn->arg_type[arg]; + const struct bpf_reg_types *compatible; + enum bpf_reg_type type = reg->type; int err = 0; if (arg_type == ARG_DONTCARE) @@ -4003,72 +4098,16 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, */ goto skip_type_check; - if (arg_type == ARG_PTR_TO_MAP_KEY || - arg_type == ARG_PTR_TO_MAP_VALUE || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || - arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { - expected_type = PTR_TO_STACK; - if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != expected_type) - goto err_type; - } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO || - arg_type == ARG_CONST_ALLOC_SIZE_OR_ZERO) { - expected_type = SCALAR_VALUE; - if (type != expected_type) - goto err_type; - } else if (arg_type == ARG_CONST_MAP_PTR) { - expected_type = CONST_PTR_TO_MAP; - if (type != expected_type) - goto err_type; - } else if (arg_type == ARG_PTR_TO_CTX || - arg_type == ARG_PTR_TO_CTX_OR_NULL) { - expected_type = PTR_TO_CTX; - if (type != expected_type) - goto err_type; - } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { - expected_type = PTR_TO_SOCK_COMMON; - /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ - if (!type_is_sk_pointer(type)) - goto err_type; - } else if (arg_type == ARG_PTR_TO_SOCKET || - arg_type == ARG_PTR_TO_SOCKET_OR_NULL) { - expected_type = PTR_TO_SOCKET; - if (type != expected_type) - goto err_type; - } else if (arg_type == ARG_PTR_TO_BTF_ID) { - expected_type = PTR_TO_BTF_ID; - if (type != expected_type) - goto err_type; - } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { - expected_type = PTR_TO_MAP_VALUE; - if (type != expected_type) - goto err_type; - } else if (arg_type_is_mem_ptr(arg_type)) { - expected_type = PTR_TO_STACK; - if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != PTR_TO_MEM && - type != PTR_TO_RDONLY_BUF && - type != PTR_TO_RDWR_BUF && - type != expected_type) - goto err_type; - } else if (arg_type_is_alloc_mem_ptr(arg_type)) { - expected_type = PTR_TO_MEM; - if (type != expected_type) - goto err_type; - } else if (arg_type_is_int_ptr(arg_type)) { - expected_type = PTR_TO_STACK; - if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != expected_type) - goto err_type; - } else { - verbose(env, "unsupported arg_type %d\n", arg_type); + compatible = compatible_reg_types[arg_type]; + if (!compatible) { + verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); return -EFAULT; } + err = check_reg_type(env, regno, compatible); + if (err) + return err; + if (type == PTR_TO_BTF_ID) { const u32 *btf_id = fn->arg_btf_id[arg]; @@ -4221,10 +4260,6 @@ skip_type_check: } return err; -err_type: - verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[type], reg_type_str[expected_type]); - return -EACCES; } static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) -- cgit v1.2.3 From a8a717963fe5ecfd274eb93dd1285ee9428ffca7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 18 Sep 2020 13:23:38 +0200 Subject: selftests/bpf: Fix stat probe in d_path test Some kernels builds might inline vfs_getattr call within fstat syscall code path, so fentry/vfs_getattr trampoline is not called. Add security_inode_getattr to allowlist and switch the d_path test stat trampoline to security_inode_getattr. Keeping dentry_open and filp_close, because they are in their own files, so unlikely to be inlined, but in case they are, adding security_file_open. Adding flags that indicate trampolines were called and failing the test if any of them got missed, so it's easier to identify the issue next time. Fixes: e4d1af4b16f8 ("selftests/bpf: Add test for d_path helper") Suggested-by: Alexei Starovoitov Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200918112338.2618444-1-jolsa@kernel.org --- kernel/trace/bpf_trace.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ebf9be4d0d6a..36508f46a8db 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1114,6 +1114,14 @@ BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz) } BTF_SET_START(btf_allowlist_d_path) +#ifdef CONFIG_SECURITY +BTF_ID(func, security_file_permission) +BTF_ID(func, security_inode_getattr) +BTF_ID(func, security_file_open) +#endif +#ifdef CONFIG_SECURITY_PATH +BTF_ID(func, security_path_truncate) +#endif BTF_ID(func, vfs_truncate) BTF_ID(func, vfs_fallocate) BTF_ID(func, dentry_open) -- cgit v1.2.3 From 5c8c206e4308ee33dea7c60b0cfcbed48a6438b4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 6 Aug 2020 20:32:59 -0700 Subject: tracing: Delete repeated words in comments Drop repeated words in kernel/trace/. {and, the, not} Link: https://lkml.kernel.org/r/20200807033259.13778-1-rdunlap@infradead.org Cc: Ingo Molnar Signed-off-by: Randy Dunlap Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- kernel/trace/trace.c | 2 +- kernel/trace/trace_dynevent.c | 2 +- kernel/trace/trace_events_synth.c | 2 +- kernel/trace/tracing_map.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 603255f5f085..84f32dbc7be8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2402,7 +2402,7 @@ struct ftrace_ops direct_ops = { * * If the record has the FTRACE_FL_REGS set, that means that it * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS - * is not not set, then it wants to convert to the normal callback. + * is not set, then it wants to convert to the normal callback. * * Returns the address of the trampoline to set to */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 29a9034b38d0..8fac7d6db222 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9287,7 +9287,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) } /* - * We need to stop all tracing on all CPUS to read the + * We need to stop all tracing on all CPUS to read * the next buffer. This is a bit expensive, but is * not done often. We fill all what we can read, * and then release the locks again. diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 9442a9bb080e..5fa49cfd2bb6 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -402,7 +402,7 @@ void dynevent_arg_init(struct dynevent_arg *arg, * whitespace, all followed by a separator, if applicable. After the * first arg string is successfully appended to the command string, * the optional @operator is appended, followed by the second arg and - * and optional @separator. If no separator was specified when + * optional @separator. If no separator was specified when * initializing the arg, a space will be appended. */ void dynevent_arg_pair_init(struct dynevent_arg_pair *arg_pair, diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index f86a2aa0bccd..7c765e80e974 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1211,7 +1211,7 @@ __synth_event_trace_start(struct trace_event_file *file, * ENABLED bit is set (which attaches the probe thus allowing * this code to be called, etc). Because this is called * directly by the user, we don't have that but we still need - * to honor not logging when disabled. For the the iterated + * to honor not logging when disabled. For the iterated * trace case, we save the enabed state upon start and just * ignore the following data calls. */ diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 74738c9856f1..4b50fc0cb12c 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -260,7 +260,7 @@ int tracing_map_add_var(struct tracing_map *map) * to use cmp_fn. * * A key can be a subset of a compound key; for that purpose, the - * offset param is used to describe where within the the compound key + * offset param is used to describe where within the compound key * the key referenced by this key field resides. * * Return: The index identifying the field in the map and associated -- cgit v1.2.3 From b427e765bdffcc18911ace199a17b09332a47d55 Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Thu, 13 Aug 2020 19:28:03 +0800 Subject: tracing: Use __this_cpu_read() in trace_buffered_event_enable() The code is executed with preemption disabled, so it's safe to use __this_cpu_read(). Link: https://lkml.kernel.org/r/20200813112803.12256-1-tian.xianting@h3c.com Signed-off-by: Xianting Tian Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8fac7d6db222..1c4ca25944ba 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2511,7 +2511,7 @@ void trace_buffered_event_enable(void) preempt_disable(); if (cpu == smp_processor_id() && - this_cpu_read(trace_buffered_event) != + __this_cpu_read(trace_buffered_event) != per_cpu(trace_buffered_event, cpu)) WARN_ON_ONCE(1); preempt_enable(); -- cgit v1.2.3 From f3d36426618ee2b2d1fa99aefb5fe4d2dc33807e Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 18 Aug 2020 08:08:57 +0300 Subject: kprobes: Use module_name() macro It is advised to use module_name() macro instead of dereferencing mod->name directly. This makes sense for consistencys sake and also it prevents a hard dependency to CONFIG_MODULES. Link: https://lkml.kernel.org/r/20200818050857.117998-1-jarkko.sakkinen@linux.intel.com Cc: Mark Rutland Cc: Ingo Molnar Cc: linux-mm@kvack.org Cc: Andi Kleen Cc: Ard Biesheuvel Cc: Jessica Yu Cc: Mark Rutland , Cc: Masami Hiramatsu Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Russell King Cc: Will Deacon Acked-by: Masami Hiramatsu Signed-off-by: Jarkko Sakkinen Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index feca9b19cd74..f8e46929ceba 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -106,9 +106,10 @@ static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk) static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, struct module *mod) { - int len = strlen(mod->name); + int len = strlen(module_name(mod)); const char *name = trace_kprobe_symbol(tk); - return strncmp(mod->name, name, len) == 0 && name[len] == ':'; + + return strncmp(module_name(mod), name, len) == 0 && name[len] == ':'; } static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) @@ -688,7 +689,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, if (ret) pr_warn("Failed to re-register probe %s on %s: %d\n", trace_probe_name(&tk->tp), - mod->name, ret); + module_name(mod), ret); } } mutex_unlock(&event_mutex); -- cgit v1.2.3 From eb8d8b4c9848b200586aa98e105b39f159656ba6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 5 Sep 2020 15:50:20 +0300 Subject: tracing: remove a pointless assignment The "tr" is a stack variable so setting it to NULL before a return is a no-op. Delete the assignment. Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1c4ca25944ba..55b829863127 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8799,7 +8799,6 @@ static int __remove_instance(struct trace_array *tr) free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); - tr = NULL; return 0; } -- cgit v1.2.3 From 40d14da383670db21a09e63d52db8dee9b77741e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sun, 6 Sep 2020 18:33:26 -0700 Subject: fgraph: Convert ret_stack tasklist scanning to rcu It seems that alloc_retstack_tasklist() can also take a lockless approach for scanning the tasklist, instead of using the big global tasklist_lock. For this we also kill another deprecated and rcu-unsafe tsk->thread_group user replacing it with for_each_process_thread(), maintaining semantics. Here tasklist_lock does not protect anything other than the list against concurrent fork/exit. And considering that the whole thing is capped by FTRACE_RETSTACK_ALLOC_SIZE (32), it should not be a problem to have a pontentially stale, yet stable, list. The task cannot go away either, so we don't risk racing with ftrace_graph_exit_task() which clears the retstack. The tsk->ret_stack management is not protected by tasklist_lock, being serialized with the corresponding publish/subscribe barriers against concurrent ftrace_push_return_trace(). In addition this plays nicer with cachelines by avoiding two atomic ops in the uncontended case. Link: https://lkml.kernel.org/r/20200907013326.9870-1-dave@stgolabs.net Acked-by: Oleg Nesterov Signed-off-by: Davidlohr Bueso Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 1af321dec0f1..5658f13037b3 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -387,8 +387,8 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) } } - read_lock(&tasklist_lock); - do_each_thread(g, t) { + rcu_read_lock(); + for_each_process_thread(g, t) { if (start == end) { ret = -EAGAIN; goto unlock; @@ -403,10 +403,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) smp_wmb(); t->ret_stack = ret_stack_list[start++]; } - } while_each_thread(g, t); + } unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); free: for (i = start; i < end; i++) kfree(ret_stack_list[i]); -- cgit v1.2.3 From 8490db06f914100fc8a5110481cbd37d8968be90 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2020 17:55:15 +0900 Subject: tracing/boot: Add per-instance tracing_on option support Add per-instance tracing_on option, which will be useful with traceon/traceoff event trigger actions. For example, if we disable tracing_on by default and set traceon and traceoff on a pair of events, we can trace functions between the pair of events. Link: https://lkml.kernel.org/r/159972811538.428528.2561315102284268611.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index fa0fc08c6ef8..d52d441a17e8 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -40,6 +40,16 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node) pr_err("Failed to set option: %s\n", buf); } + p = xbc_node_find_value(node, "tracing_on", NULL); + if (p && *p != '\0') { + if (kstrtoul(p, 10, &v)) + pr_err("Failed to set tracing on: %s\n", p); + if (v) + tracer_tracing_on(tr); + else + tracer_tracing_off(tr); + } + p = xbc_node_find_value(node, "trace_clock", NULL); if (p && *p != '\0') { if (tracing_set_clock(tr, p) < 0) -- cgit v1.2.3 From 4725cd89978c26405a20414f3a0fa6cbd2bf9aad Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2020 17:55:35 +0900 Subject: tracing/kprobes: Support perf-style return probe Support perf-style return probe ("SYMBOL%return") for kprobe events. This will allow boot-time tracing user to define a return probe event. Link: https://lkml.kernel.org/r/159972813535.428528.4437029657208468954.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- kernel/trace/trace_kprobe.c | 18 +++++++++++++++++- kernel/trace/trace_probe.h | 1 + 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 55b829863127..ca6da462326d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5122,7 +5122,7 @@ static const char readme_msg[] = "\t -:[/]\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [:][+]|\n" - "place (kretprobe): [:][+]|\n" + "place (kretprobe): [:][+]%return|\n" #endif #ifdef CONFIG_UPROBE_EVENTS " place (uprobe): :[(ref_ctr_offset)]\n" diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f8e46929ceba..9d46415296eb 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -718,6 +718,9 @@ static int trace_kprobe_create(int argc, const char *argv[]) * p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] * - Add kretprobe: * r[MAXACTIVE][:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] + * Or + * p:[GRP/]EVENT] [MOD:]KSYM[+0]%return [FETCHARGS] + * * Fetch args: * $retval : fetch return value * $stack : fetch stack address @@ -747,7 +750,6 @@ static int trace_kprobe_create(int argc, const char *argv[]) switch (argv[0][0]) { case 'r': is_return = true; - flags |= TPARG_FL_RETURN; break; case 'p': break; @@ -805,12 +807,26 @@ static int trace_kprobe_create(int argc, const char *argv[]) symbol = kstrdup(argv[1], GFP_KERNEL); if (!symbol) return -ENOMEM; + + tmp = strchr(symbol, '%'); + if (tmp) { + if (!strcmp(tmp, "%return")) { + *tmp = '\0'; + is_return = true; + } else { + trace_probe_log_err(tmp - symbol, BAD_ADDR_SUFFIX); + goto parse_error; + } + } + /* TODO: support .init module functions */ ret = traceprobe_split_symbol_offset(symbol, &offset); if (ret || offset < 0 || offset > UINT_MAX) { trace_probe_log_err(0, BAD_PROBE_ADDR); goto parse_error; } + if (is_return) + flags |= TPARG_FL_RETURN; if (kprobe_on_func_entry(NULL, symbol, offset)) flags |= TPARG_FL_FENTRY; if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index a22b62813f8c..04d00987da69 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -404,6 +404,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(MAXACT_TOO_BIG, "Maxactive is too big"), \ C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \ C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ + C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \ C(NO_GROUP_NAME, "Group name is not specified"), \ C(GROUP_TOO_LONG, "Group name is too long"), \ C(BAD_GROUP_NAME, "Group name must follow the same rules as C identifiers"), \ -- cgit v1.2.3 From 3dd3aae32dc91efab916b28cf95986186c6e8d6b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2020 17:55:46 +0900 Subject: tracing/uprobes: Support perf-style return probe Support perf-style return probe ("SYMBOL%return") for uprobe events as same as kprobe events does. Link: https://lkml.kernel.org/r/159972814601.428528.7641183316212425445.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- kernel/trace/trace_uprobe.c | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ca6da462326d..c35fcd2f2529 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5125,7 +5125,7 @@ static const char readme_msg[] = "place (kretprobe): [:][+]%return|\n" #endif #ifdef CONFIG_UPROBE_EVENTS - " place (uprobe): :[(ref_ctr_offset)]\n" + " place (uprobe): :[%return][(ref_ctr_offset)]\n" #endif "\t args: =fetcharg[:type]\n" "\t fetcharg: %, @
, @[+|-],\n" diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 56729c6b6614..3cf7128e1ad3 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -528,7 +528,7 @@ end: /* * Argument syntax: - * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] + * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET[%return][(REF)] [FETCHARGS] */ static int trace_uprobe_create(int argc, const char **argv) { @@ -617,6 +617,19 @@ static int trace_uprobe_create(int argc, const char **argv) } } + /* Check if there is %return suffix */ + tmp = strchr(arg, '%'); + if (tmp) { + if (!strcmp(tmp, "%return")) { + *tmp = '\0'; + is_return = true; + } else { + trace_probe_log_err(tmp - filename, BAD_ADDR_SUFFIX); + ret = -EINVAL; + goto fail_address_parse; + } + } + /* Parse uprobe offset. */ ret = kstrtoul(arg, 0, &offset); if (ret) { -- cgit v1.2.3 From 36dadef23fcca55bb6531dc12822d3b165319ccc Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2020 21:38:39 +0900 Subject: kprobes: Init kprobes in early_initcall Init kprobes feature in early_initcall as same as jump_label and dynamic_debug does, so that we can use kprobes events in earlier boot stage. Link: https://lkml.kernel.org/r/159974151897.478751.8342374158615496628.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d750e025d1ff..9303881aac84 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2529,7 +2529,7 @@ static int __init init_kprobes(void) init_test_probes(); return err; } -subsys_initcall(init_kprobes); +early_initcall(init_kprobes); #ifdef CONFIG_DEBUG_FS static void report_probe(struct seq_file *pi, struct kprobe *p, -- cgit v1.2.3 From ac343da7bc9048629f9d12d98e8f0573df88836b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2020 21:38:48 +0900 Subject: tracing: Define event fields early stage Define event fields at early stage so that boot-time tracing can access the event fields (like per-event filter setting). Link: https://lkml.kernel.org/r/159974152862.478751.2023768466808361350.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 92 +++++++++++++++++++++++++++------------------ 1 file changed, 55 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ee25d849ebba..8e87fbab6930 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2123,12 +2123,48 @@ event_subsystem_dir(struct trace_array *tr, const char *name, return NULL; } +static int +event_define_fields(struct trace_event_call *call) +{ + struct list_head *head; + int ret = 0; + + /* + * Other events may have the same class. Only update + * the fields if they are not already defined. + */ + head = trace_get_fields(call); + if (list_empty(head)) { + struct trace_event_fields *field = call->class->fields_array; + unsigned int offset = sizeof(struct trace_entry); + + for (; field->type; field++) { + if (field->type == TRACE_FUNCTION_TYPE) { + field->define_fields(call); + break; + } + + offset = ALIGN(offset, field->align); + ret = trace_define_field(call, field->type, field->name, + offset, field->size, + field->is_signed, field->filter_type); + if (WARN_ON_ONCE(ret)) { + pr_err("error code is %d\n", ret); + break; + } + + offset += field->size; + } + } + + return ret; +} + static int event_create_dir(struct dentry *parent, struct trace_event_file *file) { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; - struct list_head *head; struct dentry *d_events; const char *name; int ret; @@ -2162,35 +2198,10 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) &ftrace_event_id_fops); #endif - /* - * Other events may have the same class. Only update - * the fields if they are not already defined. - */ - head = trace_get_fields(call); - if (list_empty(head)) { - struct trace_event_fields *field = call->class->fields_array; - unsigned int offset = sizeof(struct trace_entry); - - for (; field->type; field++) { - if (field->type == TRACE_FUNCTION_TYPE) { - ret = field->define_fields(call); - break; - } - - offset = ALIGN(offset, field->align); - ret = trace_define_field(call, field->type, field->name, - offset, field->size, - field->is_signed, field->filter_type); - if (ret) - break; - - offset += field->size; - } - if (ret < 0) { - pr_warn("Could not initialize trace point events/%s\n", - name); - return -1; - } + ret = event_define_fields(call); + if (ret < 0) { + pr_warn("Could not initialize trace point events/%s\n", name); + return ret; } /* @@ -2493,7 +2504,7 @@ __trace_early_add_new_event(struct trace_event_call *call, if (!file) return -ENOMEM; - return 0; + return event_define_fields(call); } struct ftrace_module_file_ops; @@ -3431,6 +3442,18 @@ static __init int event_trace_enable_again(void) early_initcall(event_trace_enable_again); +/* Init fields which doesn't related to the tracefs */ +static __init int event_trace_init_fields(void) +{ + if (trace_define_generic_fields()) + pr_warn("tracing: Failed to allocated generic fields"); + + if (trace_define_common_fields()) + pr_warn("tracing: Failed to allocate common fields"); + + return 0; +} + __init int event_trace_init(void) { struct trace_array *tr; @@ -3446,12 +3469,6 @@ __init int event_trace_init(void) if (!entry) pr_warn("Could not create tracefs 'available_events' entry\n"); - if (trace_define_generic_fields()) - pr_warn("tracing: Failed to allocated generic fields"); - - if (trace_define_common_fields()) - pr_warn("tracing: Failed to allocate common fields"); - ret = early_event_add_tracer(NULL, tr); if (ret) return ret; @@ -3469,6 +3486,7 @@ void __init trace_event_init(void) event_trace_memsetup(); init_ftrace_syscalls(); event_trace_enable(); + event_trace_init_fields(); } #ifdef CONFIG_EVENT_TRACE_STARTUP_TEST -- cgit v1.2.3 From a838deab4e635994476bfc5b254bdf461e168752 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2020 21:38:58 +0900 Subject: tracing: Enable adding dynamic events early stage Split the event fields initialization from creating new event directory. This allows the boot-time tracing to define dynamic events before initializing events directory on tracefs. Link: https://lkml.kernel.org/r/159974153790.478751.3475515065034825374.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8e87fbab6930..42c0e7df6e70 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -38,6 +38,7 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); static LIST_HEAD(ftrace_generic_fields); static LIST_HEAD(ftrace_common_fields); +static bool eventdir_initialized; #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) @@ -2486,7 +2487,10 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) if (!file) return -ENOMEM; - return event_create_dir(tr->event_dir, file); + if (eventdir_initialized) + return event_create_dir(tr->event_dir, file); + else + return event_define_fields(call); } /* @@ -3478,6 +3482,9 @@ __init int event_trace_init(void) if (ret) pr_warn("Failed to register trace events module notifier\n"); #endif + + eventdir_initialized = true; + return 0; } -- cgit v1.2.3 From 4114fbfd02f12d7a58cc4bd6fc36e0925266f9f7 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2020 21:39:07 +0900 Subject: tracing: Enable creating new instance early boot Enable creating new trace_array instance in early boot stage. If the instances directory is not created, postpone it until the tracefs is initialized. Link: https://lkml.kernel.org/r/159974154763.478751.6289753509587233103.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 53 +++++++++++++++++++++++++++++++++--------- kernel/trace/trace.h | 7 ++++++ kernel/trace/trace_functions.c | 22 +++++++++++------- 3 files changed, 63 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c35fcd2f2529..6211a13b3327 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8636,6 +8636,24 @@ struct trace_array *trace_array_find_get(const char *instance) return tr; } +static int trace_array_create_dir(struct trace_array *tr) +{ + int ret; + + tr->dir = tracefs_create_dir(tr->name, trace_instance_dir); + if (!tr->dir) + return -EINVAL; + + ret = event_trace_add_tracer(tr->dir, tr); + if (ret) + tracefs_remove(tr->dir); + + init_tracer_tracefs(tr, tr->dir); + __update_tracer_options(tr); + + return ret; +} + static struct trace_array *trace_array_create(const char *name) { struct trace_array *tr; @@ -8671,30 +8689,27 @@ static struct trace_array *trace_array_create(const char *name) if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; - tr->dir = tracefs_create_dir(name, trace_instance_dir); - if (!tr->dir) - goto out_free_tr; - - ret = event_trace_add_tracer(tr->dir, tr); - if (ret) { - tracefs_remove(tr->dir); + if (ftrace_allocate_ftrace_ops(tr) < 0) goto out_free_tr; - } ftrace_init_trace_array(tr); - init_tracer_tracefs(tr, tr->dir); init_trace_flags_index(tr); - __update_tracer_options(tr); + + if (trace_instance_dir) { + ret = trace_array_create_dir(tr); + if (ret) + goto out_free_tr; + } list_add(&tr->list, &ftrace_trace_arrays); tr->ref++; - return tr; out_free_tr: + ftrace_free_ftrace_ops(tr); free_trace_buffers(tr); free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); @@ -8852,11 +8867,27 @@ static int instance_rmdir(const char *name) static __init void create_trace_instances(struct dentry *d_tracer) { + struct trace_array *tr; + trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer, instance_mkdir, instance_rmdir); if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n")) return; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->name) + continue; + if (MEM_FAIL(trace_array_create_dir(tr) < 0, + "Failed to create instance directory\n")) + break; + } + + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); } static void diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0d3a405fe446..525434145eea 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1125,6 +1125,8 @@ extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, struct dentry *parent); void ftrace_destroy_function_files(struct trace_array *tr); +int ftrace_allocate_ftrace_ops(struct trace_array *tr); +void ftrace_free_ftrace_ops(struct trace_array *tr); void ftrace_init_global_array_ops(struct trace_array *tr); void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); @@ -1146,6 +1148,11 @@ ftrace_create_function_files(struct trace_array *tr, { return 0; } +static inline int ftrace_allocate_ftrace_ops(struct trace_array *tr) +{ + return 0; +} +static inline void ftrace_free_ftrace_ops(struct trace_array *tr) { } static inline void ftrace_destroy_function_files(struct trace_array *tr) { } static inline __init void ftrace_init_global_array_ops(struct trace_array *tr) { } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index dd4dff71d89a..2c2126e1871d 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -34,10 +34,14 @@ enum { TRACE_FUNC_OPT_STACK = 0x1, }; -static int allocate_ftrace_ops(struct trace_array *tr) +int ftrace_allocate_ftrace_ops(struct trace_array *tr) { struct ftrace_ops *ops; + /* The top level array uses the "global_ops" */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return 0; + ops = kzalloc(sizeof(*ops), GFP_KERNEL); if (!ops) return -ENOMEM; @@ -48,15 +52,19 @@ static int allocate_ftrace_ops(struct trace_array *tr) tr->ops = ops; ops->private = tr; + return 0; } +void ftrace_free_ftrace_ops(struct trace_array *tr) +{ + kfree(tr->ops); + tr->ops = NULL; +} int ftrace_create_function_files(struct trace_array *tr, struct dentry *parent) { - int ret; - /* * The top level array uses the "global_ops", and the files are * created on boot up. @@ -64,9 +72,8 @@ int ftrace_create_function_files(struct trace_array *tr, if (tr->flags & TRACE_ARRAY_FL_GLOBAL) return 0; - ret = allocate_ftrace_ops(tr); - if (ret) - return ret; + if (!tr->ops) + return -EINVAL; ftrace_create_filter_files(tr->ops, parent); @@ -76,8 +83,7 @@ int ftrace_create_function_files(struct trace_array *tr, void ftrace_destroy_function_files(struct trace_array *tr) { ftrace_destroy_filter_files(tr->ops); - kfree(tr->ops); - tr->ops = NULL; + ftrace_free_ftrace_ops(tr); } static int function_trace_init(struct trace_array *tr) -- cgit v1.2.3 From ba0fbfbb21cd90d51e4f6668ee8397e810818028 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2020 21:39:17 +0900 Subject: tracing/boot, kprobe, synth: Initialize boot-time tracing earlier Initialize boot-time tracing in core_initcall_sync instead of fs_initcall, and initialize required tracers (kprobes and synth) in core_initcall. This will allow the boot-time tracing to trace __init code from the beginning of postcore_initcall stage. Link: https://lkml.kernel.org/r/159974155727.478751.7486926132902849578.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 7 +++++-- kernel/trace/trace_events_synth.c | 19 ++++++++++++++----- kernel/trace/trace_kprobe.c | 6 +++--- 3 files changed, 22 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index d52d441a17e8..754e3cf2df3a 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -340,5 +340,8 @@ static int __init trace_boot_init(void) return 0; } - -fs_initcall(trace_boot_init); +/* + * Start tracing at the end of core-initcall, so that it starts tracing + * from the beginning of postcore_initcall. + */ +core_initcall_sync(trace_boot_init); diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 7c765e80e974..a9cd7793f7ea 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1754,17 +1754,26 @@ static const struct file_operations synth_events_fops = { .release = seq_release, }; -static __init int trace_events_synth_init(void) +/* + * Register dynevent at core_initcall. This allows kernel to setup kprobe + * events in postcore_initcall without tracefs. + */ +static __init int trace_events_synth_init_early(void) { - struct dentry *entry = NULL; int err = 0; err = dyn_event_register(&synth_event_ops); - if (err) { + if (err) pr_warn("Could not register synth_event_ops\n"); - return err; - } + return err; +} +core_initcall(trace_events_synth_init_early); + +static __init int trace_events_synth_init(void) +{ + struct dentry *entry = NULL; + int err = 0; err = tracing_init_dentry(); if (err) goto err; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9d46415296eb..b911e9f6d9f5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1897,8 +1897,8 @@ static __init void setup_boot_kprobe_events(void) } /* - * Register dynevent at subsys_initcall. This allows kernel to setup kprobe - * events in fs_initcall without tracefs. + * Register dynevent at core_initcall. This allows kernel to setup kprobe + * events in postcore_initcall without tracefs. */ static __init int init_kprobe_trace_early(void) { @@ -1913,7 +1913,7 @@ static __init int init_kprobe_trace_early(void) return 0; } -subsys_initcall(init_kprobe_trace_early); +core_initcall(init_kprobe_trace_early); /* Make a tracefs interface for controlling probe points */ static __init int init_kprobe_trace(void) -- cgit v1.2.3 From cfe2790b163acdc9c058a63bff310923e84a16b4 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Sat, 19 Sep 2020 00:40:19 +0206 Subject: printk: move printk_info into separate array The majority of the size of a descriptor is taken up by meta data, which is often not of interest to the ringbuffer (for example, when performing state checks). Since descriptors are often temporarily stored on the stack, keeping their size minimal will help reduce stack pressure. Rather than embedding the printk_info into the descriptor, create a separate printk_info array. The index of a descriptor in the descriptor array corresponds to the printk_info with the same index in the printk_info array. The rules for validity of a printk_info match the existing rules for the data blocks: the descriptor must be in a consistent state. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200918223421.21621-2-john.ogness@linutronix.de --- kernel/printk/printk.c | 30 ++++++-- kernel/printk/printk_ringbuffer.c | 145 ++++++++++++++++++++++++-------------- kernel/printk/printk_ringbuffer.h | 29 ++++---- 3 files changed, 133 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9a2e23191576..25cfe4fe48af 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -959,11 +959,11 @@ void log_buf_vmcoreinfo_setup(void) VMCOREINFO_STRUCT_SIZE(prb_desc_ring); VMCOREINFO_OFFSET(prb_desc_ring, count_bits); VMCOREINFO_OFFSET(prb_desc_ring, descs); + VMCOREINFO_OFFSET(prb_desc_ring, infos); VMCOREINFO_OFFSET(prb_desc_ring, head_id); VMCOREINFO_OFFSET(prb_desc_ring, tail_id); VMCOREINFO_STRUCT_SIZE(prb_desc); - VMCOREINFO_OFFSET(prb_desc, info); VMCOREINFO_OFFSET(prb_desc, state_var); VMCOREINFO_OFFSET(prb_desc, text_blk_lpos); VMCOREINFO_OFFSET(prb_desc, dict_blk_lpos); @@ -1097,11 +1097,13 @@ static char setup_dict_buf[CONSOLE_EXT_LOG_MAX] __initdata; void __init setup_log_buf(int early) { + struct printk_info *new_infos; unsigned int new_descs_count; struct prb_desc *new_descs; struct printk_info info; struct printk_record r; size_t new_descs_size; + size_t new_infos_size; unsigned long flags; char *new_dict_buf; char *new_log_buf; @@ -1142,8 +1144,7 @@ void __init setup_log_buf(int early) if (unlikely(!new_dict_buf)) { pr_err("log_buf_len: %lu dict bytes not available\n", new_log_buf_len); - memblock_free(__pa(new_log_buf), new_log_buf_len); - return; + goto err_free_log_buf; } new_descs_size = new_descs_count * sizeof(struct prb_desc); @@ -1151,9 +1152,15 @@ void __init setup_log_buf(int early) if (unlikely(!new_descs)) { pr_err("log_buf_len: %zu desc bytes not available\n", new_descs_size); - memblock_free(__pa(new_dict_buf), new_log_buf_len); - memblock_free(__pa(new_log_buf), new_log_buf_len); - return; + goto err_free_dict_buf; + } + + new_infos_size = new_descs_count * sizeof(struct printk_info); + new_infos = memblock_alloc(new_infos_size, LOG_ALIGN); + if (unlikely(!new_infos)) { + pr_err("log_buf_len: %zu info bytes not available\n", + new_infos_size); + goto err_free_descs; } prb_rec_init_rd(&r, &info, @@ -1163,7 +1170,8 @@ void __init setup_log_buf(int early) prb_init(&printk_rb_dynamic, new_log_buf, ilog2(new_log_buf_len), new_dict_buf, ilog2(new_log_buf_len), - new_descs, ilog2(new_descs_count)); + new_descs, ilog2(new_descs_count), + new_infos); logbuf_lock_irqsave(flags); @@ -1192,6 +1200,14 @@ void __init setup_log_buf(int early) pr_info("log_buf_len: %u bytes\n", log_buf_len); pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); + return; + +err_free_descs: + memblock_free(__pa(new_descs), new_descs_size); +err_free_dict_buf: + memblock_free(__pa(new_dict_buf), new_log_buf_len); +err_free_log_buf: + memblock_free(__pa(new_log_buf), new_log_buf_len); } static bool __read_mostly ignore_loglevel; diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index f4e2e9890e0f..de4b10a98623 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -15,10 +15,10 @@ * The printk_ringbuffer is made up of 3 internal ringbuffers: * * desc_ring - * A ring of descriptors. A descriptor contains all record meta data - * (sequence number, timestamp, loglevel, etc.) as well as internal state - * information about the record and logical positions specifying where in - * the other ringbuffers the text and dictionary strings are located. + * A ring of descriptors and their meta data (such as sequence number, + * timestamp, loglevel, etc.) as well as internal state information about + * the record and logical positions specifying where in the other + * ringbuffers the text and dictionary strings are located. * * text_data_ring * A ring of data blocks. A data block consists of an unsigned long @@ -38,13 +38,14 @@ * * Descriptor Ring * ~~~~~~~~~~~~~~~ - * The descriptor ring is an array of descriptors. A descriptor contains all - * the meta data of a printk record as well as blk_lpos structs pointing to - * associated text and dictionary data blocks (see "Data Rings" below). Each - * descriptor is assigned an ID that maps directly to index values of the - * descriptor array and has a state. The ID and the state are bitwise combined - * into a single descriptor field named @state_var, allowing ID and state to - * be synchronously and atomically updated. + * The descriptor ring is an array of descriptors. A descriptor contains + * essential meta data to track the data of a printk record using + * blk_lpos structs pointing to associated text and dictionary data blocks + * (see "Data Rings" below). Each descriptor is assigned an ID that maps + * directly to index values of the descriptor array and has a state. The ID + * and the state are bitwise combined into a single descriptor field named + * @state_var, allowing ID and state to be synchronously and atomically + * updated. * * Descriptors have four states: * @@ -150,6 +151,14 @@ * descriptor. If a data block is not valid, the @tail_lpos cannot be * advanced beyond it. * + * Info Array + * ~~~~~~~~~~ + * The general meta data of printk records are stored in printk_info structs, + * stored in an array with the same number of elements as the descriptor ring. + * Each info corresponds to the descriptor of the same index in the + * descriptor ring. Info validity is confirmed by evaluating the corresponding + * descriptor before and after loading the info. + * * Usage * ----- * Here are some simple examples demonstrating writers and readers. For the @@ -367,6 +376,15 @@ static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n) return &desc_ring->descs[DESC_INDEX(desc_ring, n)]; } +/* + * Return the printk_info associated with @n. @n can be either a + * descriptor ID or a sequence number. + */ +static struct printk_info *to_info(struct prb_desc_ring *desc_ring, u64 n) +{ + return &desc_ring->infos[DESC_INDEX(desc_ring, n)]; +} + static struct prb_data_block *to_block(struct prb_data_ring *data_ring, unsigned long begin_lpos) { @@ -425,10 +443,16 @@ static enum desc_state get_desc_state(unsigned long id, * Get a copy of a specified descriptor and return its queried state. If the * descriptor is in an inconsistent state (miss or reserved), the caller can * only expect the descriptor's @state_var field to be valid. + * + * The sequence number and caller_id can be optionally retrieved. Like all + * non-state_var data, they are only valid if the descriptor is in a + * consistent state. */ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, - unsigned long id, struct prb_desc *desc_out) + unsigned long id, struct prb_desc *desc_out, + u64 *seq_out, u32 *caller_id_out) { + struct printk_info *info = to_info(desc_ring, id); struct prb_desc *desc = to_desc(desc_ring, id); atomic_long_t *state_var = &desc->state_var; enum desc_state d_state; @@ -469,11 +493,14 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, * state has been re-checked. A memcpy() for all of @desc * cannot be used because of the atomic_t @state_var field. */ - memcpy(&desc_out->info, &desc->info, sizeof(desc_out->info)); /* LMM(desc_read:C) */ memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, - sizeof(desc_out->text_blk_lpos)); /* also part of desc_read:C */ + sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ memcpy(&desc_out->dict_blk_lpos, &desc->dict_blk_lpos, sizeof(desc_out->dict_blk_lpos)); /* also part of desc_read:C */ + if (seq_out) + *seq_out = info->seq; /* also part of desc_read:C */ + if (caller_id_out) + *caller_id_out = info->caller_id; /* also part of desc_read:C */ /* * 1. Guarantee the descriptor content is loaded before re-checking @@ -588,7 +615,8 @@ static bool data_make_reusable(struct printk_ringbuffer *rb, */ id = blk->id; /* LMM(data_make_reusable:A) */ - d_state = desc_read(desc_ring, id, &desc); /* LMM(data_make_reusable:B) */ + d_state = desc_read(desc_ring, id, &desc, + NULL, NULL); /* LMM(data_make_reusable:B) */ switch (d_state) { case desc_miss: @@ -771,7 +799,7 @@ static bool desc_push_tail(struct printk_ringbuffer *rb, enum desc_state d_state; struct prb_desc desc; - d_state = desc_read(desc_ring, tail_id, &desc); + d_state = desc_read(desc_ring, tail_id, &desc, NULL, NULL); switch (d_state) { case desc_miss: @@ -823,7 +851,8 @@ static bool desc_push_tail(struct printk_ringbuffer *rb, * equal to @head_id so there is no risk of pushing the tail past the * head. */ - d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc); /* LMM(desc_push_tail:A) */ + d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc, + NULL, NULL); /* LMM(desc_push_tail:A) */ if (d_state == desc_finalized || d_state == desc_reusable) { /* @@ -1264,6 +1293,7 @@ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, struct prb_desc desc; struct prb_desc *d; unsigned long id; + u32 cid; id = atomic_long_read(&desc_ring->head_id); @@ -1271,8 +1301,8 @@ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, * To reduce unnecessarily reopening, first check if the descriptor * state and caller ID are correct. */ - d_state = desc_read(desc_ring, id, &desc); - if (d_state != desc_committed || desc.info.caller_id != caller_id) + d_state = desc_read(desc_ring, id, &desc, NULL, &cid); + if (d_state != desc_committed || cid != caller_id) return NULL; d = to_desc(desc_ring, id); @@ -1353,6 +1383,8 @@ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r, u32 caller_id) { + struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct printk_info *info; unsigned int data_size; struct prb_desc *d; unsigned long id; @@ -1360,7 +1392,7 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer local_irq_save(e->irqflags); /* Transition the newest descriptor back to the reserved state. */ - d = desc_reopen_last(&rb->desc_ring, caller_id, &id); + d = desc_reopen_last(desc_ring, caller_id, &id); if (!d) { local_irq_restore(e->irqflags); goto fail_reopen; @@ -1368,6 +1400,8 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */ + info = to_info(desc_ring, id); + /* * Set the @e fields here so that prb_commit() can be used if * anything fails from now on. @@ -1380,14 +1414,14 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer * exclusive access at that point. The descriptor may have * changed since then. */ - if (caller_id != d->info.caller_id) + if (caller_id != info->caller_id) goto fail; if (BLK_DATALESS(&d->text_blk_lpos)) { - if (WARN_ON_ONCE(d->info.text_len != 0)) { + if (WARN_ON_ONCE(info->text_len != 0)) { pr_warn_once("wrong text_len value (%hu, expecting 0)\n", - d->info.text_len); - d->info.text_len = 0; + info->text_len); + info->text_len = 0; } if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) @@ -1404,12 +1438,12 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer * the meta data (@text_len) is not sane, use the full data * block size. */ - if (WARN_ON_ONCE(d->info.text_len > data_size)) { + if (WARN_ON_ONCE(info->text_len > data_size)) { pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n", - d->info.text_len, data_size); - d->info.text_len = data_size; + info->text_len, data_size); + info->text_len = data_size; } - r->text_buf_size += d->info.text_len; + r->text_buf_size += info->text_len; if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; @@ -1424,7 +1458,7 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer r->dict_buf = NULL; r->dict_buf_size = 0; - r->info = &d->info; + r->info = info; e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); @@ -1486,6 +1520,7 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r) { struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct printk_info *info; struct prb_desc *d; unsigned long id; u64 seq; @@ -1512,14 +1547,15 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, } d = to_desc(desc_ring, id); + info = to_info(desc_ring, id); /* * All @info fields (except @seq) are cleared and must be filled in * by the writer. Save @seq before clearing because it is used to * determine the new sequence number. */ - seq = d->info.seq; - memset(&d->info, 0, sizeof(d->info)); + seq = info->seq; + memset(info, 0, sizeof(*info)); /* * Set the @e fields here so that prb_commit() can be used if @@ -1533,16 +1569,16 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, * Otherwise just increment it by a full wrap. * * @seq is considered "never been set" if it has a value of 0, - * _except_ for @descs[0], which was specially setup by the ringbuffer + * _except_ for @infos[0], which was specially setup by the ringbuffer * initializer and therefore is always considered as set. * * See the "Bootstrap" comment block in printk_ringbuffer.h for * details about how the initializer bootstraps the descriptors. */ if (seq == 0 && DESC_INDEX(desc_ring, id) != 0) - d->info.seq = DESC_INDEX(desc_ring, id); + info->seq = DESC_INDEX(desc_ring, id); else - d->info.seq = seq + DESCS_COUNT(desc_ring); + info->seq = seq + DESCS_COUNT(desc_ring); /* * New data is about to be reserved. Once that happens, previous @@ -1550,7 +1586,7 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, * previous descriptor now so that it can be made available to * readers. (For seq==0 there is no previous descriptor.) */ - if (d->info.seq > 0) + if (info->seq > 0) desc_make_final(desc_ring, DESC_ID(id - 1)); r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, @@ -1571,7 +1607,7 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, if (r->dict_buf_size && !r->dict_buf) r->dict_buf_size = 0; - r->info = &d->info; + r->info = info; /* Record full text space used by record. */ e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); @@ -1726,12 +1762,12 @@ static bool copy_data(struct prb_data_ring *data_ring, /* * Actual cannot be less than expected. It can be more than expected * because of the trailing alignment padding. + * + * Note that invalid @len values can occur because the caller loads + * the value during an allowed data race. */ - if (WARN_ON_ONCE(data_size < (unsigned int)len)) { - pr_warn_once("wrong data size (%u, expecting >=%hu) for data: %.*s\n", - data_size, len, data_size, data); + if (data_size < (unsigned int)len) return false; - } /* Caller interested in the line count? */ if (line_count) @@ -1764,8 +1800,9 @@ static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring, { struct prb_data_blk_lpos *blk_lpos = &desc_out->text_blk_lpos; enum desc_state d_state; + u64 s; - d_state = desc_read(desc_ring, id, desc_out); + d_state = desc_read(desc_ring, id, desc_out, &s, NULL); /* * An unexpected @id (desc_miss) or @seq mismatch means the record @@ -1775,7 +1812,7 @@ static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring, if (d_state == desc_miss || d_state == desc_reserved || d_state == desc_committed || - desc_out->info.seq != seq) { + s != seq) { return -EINVAL; } @@ -1802,6 +1839,7 @@ static int prb_read(struct printk_ringbuffer *rb, u64 seq, struct printk_record *r, unsigned int *line_count) { struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct printk_info *info = to_info(desc_ring, seq); struct prb_desc *rdesc = to_desc(desc_ring, seq); atomic_long_t *state_var = &rdesc->state_var; struct prb_desc desc; @@ -1823,10 +1861,10 @@ static int prb_read(struct printk_ringbuffer *rb, u64 seq, /* If requested, copy meta data. */ if (r->info) - memcpy(r->info, &desc.info, sizeof(*(r->info))); + memcpy(r->info, info, sizeof(*(r->info))); /* Copy text data. If it fails, this is a data-less record. */ - if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, desc.info.text_len, + if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, info->text_len, r->text_buf, r->text_buf_size, line_count)) { return -ENOENT; } @@ -1836,7 +1874,7 @@ static int prb_read(struct printk_ringbuffer *rb, u64 seq, * important. So if it fails, modify the copied meta data to report * that there is no dict data, thus silently dropping the dict data. */ - if (!copy_data(&rb->dict_data_ring, &desc.dict_blk_lpos, desc.info.dict_len, + if (!copy_data(&rb->dict_data_ring, &desc.dict_blk_lpos, info->dict_len, r->dict_buf, r->dict_buf_size, NULL)) { if (r->info) r->info->dict_len = 0; @@ -1853,11 +1891,12 @@ static u64 prb_first_seq(struct printk_ringbuffer *rb) enum desc_state d_state; struct prb_desc desc; unsigned long id; + u64 seq; for (;;) { id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */ - d_state = desc_read(desc_ring, id, &desc); /* LMM(prb_first_seq:B) */ + d_state = desc_read(desc_ring, id, &desc, &seq, NULL); /* LMM(prb_first_seq:B) */ /* * This loop will not be infinite because the tail is @@ -1886,7 +1925,7 @@ static u64 prb_first_seq(struct printk_ringbuffer *rb) smp_rmb(); /* LMM(prb_first_seq:C) */ } - return desc.info.seq; + return seq; } /* @@ -2049,6 +2088,7 @@ u64 prb_next_seq(struct printk_ringbuffer *rb) * @dictbits: The size of @dict_buf as a power-of-2 value. * @descs: The descriptor buffer for ringbuffer records. * @descbits: The count of @descs items as a power-of-2 value. + * @infos: The printk_info buffer for ringbuffer records. * * This is the public function available to writers to setup a ringbuffer * during runtime using provided buffers. @@ -2060,12 +2100,15 @@ u64 prb_next_seq(struct printk_ringbuffer *rb) void prb_init(struct printk_ringbuffer *rb, char *text_buf, unsigned int textbits, char *dict_buf, unsigned int dictbits, - struct prb_desc *descs, unsigned int descbits) + struct prb_desc *descs, unsigned int descbits, + struct printk_info *infos) { memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(descs[0])); + memset(infos, 0, _DESCS_COUNT(descbits) * sizeof(infos[0])); rb->desc_ring.count_bits = descbits; rb->desc_ring.descs = descs; + rb->desc_ring.infos = infos; atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits)); atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits)); @@ -2081,14 +2124,14 @@ void prb_init(struct printk_ringbuffer *rb, atomic_long_set(&rb->fail, 0); - descs[0].info.seq = -(u64)_DESCS_COUNT(descbits); - - descs[_DESCS_COUNT(descbits) - 1].info.seq = 0; atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits)); descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS; descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS; descs[_DESCS_COUNT(descbits) - 1].dict_blk_lpos.begin = FAILED_LPOS; descs[_DESCS_COUNT(descbits) - 1].dict_blk_lpos.next = FAILED_LPOS; + + infos[0].seq = -(u64)_DESCS_COUNT(descbits); + infos[_DESCS_COUNT(descbits) - 1].seq = 0; } /** diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 853ea62dc5f2..97c8561e74e0 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -58,7 +58,6 @@ struct prb_data_blk_lpos { * @state_var: A bitwise combination of descriptor ID and descriptor state. */ struct prb_desc { - struct printk_info info; atomic_long_t state_var; struct prb_data_blk_lpos text_blk_lpos; struct prb_data_blk_lpos dict_blk_lpos; @@ -76,6 +75,7 @@ struct prb_data_ring { struct prb_desc_ring { unsigned int count_bits; struct prb_desc *descs; + struct printk_info *infos; atomic_long_t head_id; atomic_long_t tail_id; }; @@ -237,19 +237,8 @@ enum desc_state { static char _##name##_dict[1U << ((avgdictbits) + (descbits))] \ __aligned(__alignof__(unsigned long)); \ static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \ - /* this will be the first record reserved by a writer */ \ - [0] = { \ - .info = { \ - /* will be incremented to 0 on the first reservation */ \ - .seq = -(u64)_DESCS_COUNT(descbits), \ - }, \ - }, \ /* the initial head and tail */ \ [_DESCS_COUNT(descbits) - 1] = { \ - .info = { \ - /* reports the first seq value during the bootstrap phase */ \ - .seq = 0, \ - }, \ /* reusable */ \ .state_var = ATOMIC_INIT(DESC0_SV(descbits)), \ /* no associated data block */ \ @@ -257,10 +246,23 @@ static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \ .dict_blk_lpos = FAILED_BLK_LPOS, \ }, \ }; \ +static struct printk_info _##name##_infos[_DESCS_COUNT(descbits)] = { \ + /* this will be the first record reserved by a writer */ \ + [0] = { \ + /* will be incremented to 0 on the first reservation */ \ + .seq = -(u64)_DESCS_COUNT(descbits), \ + }, \ + /* the initial head and tail */ \ + [_DESCS_COUNT(descbits) - 1] = { \ + /* reports the first seq value during the bootstrap phase */ \ + .seq = 0, \ + }, \ +}; \ static struct printk_ringbuffer name = { \ .desc_ring = { \ .count_bits = descbits, \ .descs = &_##name##_descs[0], \ + .infos = &_##name##_infos[0], \ .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \ .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \ }, \ @@ -336,7 +338,8 @@ void prb_final_commit(struct prb_reserved_entry *e); void prb_init(struct printk_ringbuffer *rb, char *text_buf, unsigned int text_buf_size, char *dict_buf, unsigned int dict_buf_size, - struct prb_desc *descs, unsigned int descs_count_bits); + struct prb_desc *descs, unsigned int descs_count_bits, + struct printk_info *infos); unsigned int prb_record_text_space(struct prb_reserved_entry *e); /* Reader Interface */ -- cgit v1.2.3 From 74caba7f2a0685575b3ee5330a118f5922485e02 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 21 Sep 2020 13:24:45 +0206 Subject: printk: move dictionary keys to dev_printk_info Dictionaries are only used for SUBSYSTEM and DEVICE properties. The current implementation stores the property names each time they are used. This requires more space than otherwise necessary. Also, because the dictionary entries are currently considered optional, it cannot be relied upon that they are always available, even if the writer wanted to store them. These issues will increase should new dictionary properties be introduced. Rather than storing the subsystem and device properties in the dict ring, introduce a struct dev_printk_info with separate fields to store only the property values. Embed this struct within the struct printk_info to provide guaranteed availability. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/87mu1jl6ne.fsf@jogness.linutronix.de --- kernel/printk/internal.h | 4 +- kernel/printk/printk.c | 166 ++++++++++++++++++++------------------ kernel/printk/printk_ringbuffer.h | 3 + kernel/printk/printk_safe.c | 2 +- 4 files changed, 92 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 660f9a6bf73a..3a8fd491758c 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -14,9 +14,9 @@ extern raw_spinlock_t logbuf_lock; -__printf(5, 0) +__printf(4, 0) int vprintk_store(int facility, int level, - const char *dict, size_t dictlen, + const struct dev_printk_info *dev_info, const char *fmt, va_list args); __printf(1, 0) int vprintk_default(const char *fmt, va_list args); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 25cfe4fe48af..f1e243cc284a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -296,8 +296,8 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; /* * The printk log buffer consists of a sequenced collection of records, each - * containing variable length message and dictionary text. Every record - * also contains its own meta-data (@info). + * containing variable length message text. Every record also contains its + * own meta-data (@info). * * Every record meta-data carries the timestamp in microseconds, as well as * the standard userspace syslog level and syslog facility. The usual kernel @@ -310,9 +310,7 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; * terminated. * * Optionally, a record can carry a dictionary of properties (key/value - * pairs), to provide userspace with a machine-readable message context. The - * length of the dictionary is available in @dict_len. The dictionary is not - * terminated. + * pairs), to provide userspace with a machine-readable message context. * * Examples for well-defined, commonly used property names are: * DEVICE=b12:8 device identifier @@ -322,21 +320,20 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; * +sound:card0 subsystem:devname * SUBSYSTEM=pci driver-core subsystem name * - * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value - * follows directly after a '=' character. Every property is terminated by - * a '\0' character. The last property is not terminated. + * Valid characters in property names are [a-zA-Z0-9.-_]. Property names + * and values are terminated by a '\0' character. * * Example of record values: - * record.text_buf = "it's a line" (unterminated) - * record.dict_buf = "DEVICE=b8:2\0DRIVER=bug" (unterminated) - * record.info.seq = 56 - * record.info.ts_nsec = 36863 - * record.info.text_len = 11 - * record.info.dict_len = 22 - * record.info.facility = 0 (LOG_KERN) - * record.info.flags = 0 - * record.info.level = 3 (LOG_ERR) - * record.info.caller_id = 299 (task 299) + * record.text_buf = "it's a line" (unterminated) + * record.info.seq = 56 + * record.info.ts_nsec = 36863 + * record.info.text_len = 11 + * record.info.facility = 0 (LOG_KERN) + * record.info.flags = 0 + * record.info.level = 3 (LOG_ERR) + * record.info.caller_id = 299 (task 299) + * record.info.dev_info.subsystem = "pci" (terminated) + * record.info.dev_info.device = "+pci:0000:00:01.0" (terminated) * * The 'struct printk_info' buffer must never be directly exported to * userspace, it is a kernel-private implementation detail that might @@ -498,19 +495,19 @@ static void truncate_msg(u16 *text_len, u16 *trunc_msg_len) /* insert record into the buffer, discard old ones, update heads */ static int log_store(u32 caller_id, int facility, int level, enum log_flags flags, u64 ts_nsec, - const char *dict, u16 dict_len, + const struct dev_printk_info *dev_info, const char *text, u16 text_len) { struct prb_reserved_entry e; struct printk_record r; u16 trunc_msg_len = 0; - prb_rec_init_wr(&r, text_len, dict_len); + prb_rec_init_wr(&r, text_len, 0); if (!prb_reserve(&e, prb, &r)) { /* truncate the message if it is too long for empty buffer */ truncate_msg(&text_len, &trunc_msg_len); - prb_rec_init_wr(&r, text_len + trunc_msg_len, dict_len); + prb_rec_init_wr(&r, text_len + trunc_msg_len, 0); /* survive when the log buffer is too small for trunc_msg */ if (!prb_reserve(&e, prb, &r)) return 0; @@ -521,10 +518,6 @@ static int log_store(u32 caller_id, int facility, int level, if (trunc_msg_len) memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); r.info->text_len = text_len + trunc_msg_len; - if (r.dict_buf) { - memcpy(&r.dict_buf[0], dict, dict_len); - r.info->dict_len = dict_len; - } r.info->facility = facility; r.info->level = level & 7; r.info->flags = flags & 0x1f; @@ -533,6 +526,8 @@ static int log_store(u32 caller_id, int facility, int level, else r.info->ts_nsec = local_clock(); r.info->caller_id = caller_id; + if (dev_info) + memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); /* insert message */ if ((flags & LOG_CONT) || !(flags & LOG_NEWLINE)) @@ -613,9 +608,9 @@ static ssize_t info_print_ext_header(char *buf, size_t size, ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller); } -static ssize_t msg_print_ext_body(char *buf, size_t size, - char *dict, size_t dict_len, - char *text, size_t text_len) +static ssize_t msg_add_ext_text(char *buf, size_t size, + const char *text, size_t text_len, + unsigned char endc) { char *p = buf, *e = buf + size; size_t i; @@ -629,36 +624,44 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, else append_char(&p, e, c); } - append_char(&p, e, '\n'); + append_char(&p, e, endc); - if (dict_len) { - bool line = true; + return p - buf; +} - for (i = 0; i < dict_len; i++) { - unsigned char c = dict[i]; +static ssize_t msg_add_dict_text(char *buf, size_t size, + const char *key, const char *val) +{ + size_t val_len = strlen(val); + ssize_t len; - if (line) { - append_char(&p, e, ' '); - line = false; - } + if (!val_len) + return 0; - if (c == '\0') { - append_char(&p, e, '\n'); - line = true; - continue; - } + len = msg_add_ext_text(buf, size, "", 0, ' '); /* dict prefix */ + len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '='); + len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n'); - if (c < ' ' || c >= 127 || c == '\\') { - p += scnprintf(p, e - p, "\\x%02x", c); - continue; - } + return len; +} - append_char(&p, e, c); - } - append_char(&p, e, '\n'); - } +static ssize_t msg_print_ext_body(char *buf, size_t size, + char *text, size_t text_len, + struct dev_printk_info *dev_info) +{ + ssize_t len; - return p - buf; + len = msg_add_ext_text(buf, size, text, text_len, '\n'); + + if (!dev_info) + goto out; + + len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM", + dev_info->subsystem); + len += msg_add_dict_text(buf + len, size - len, "DEVICE", + dev_info->device); +out: + return len; } /* /dev/kmsg - userspace message inject/listen interface */ @@ -670,7 +673,6 @@ struct devkmsg_user { struct printk_info info; char text_buf[CONSOLE_EXT_LOG_MAX]; - char dict_buf[CONSOLE_EXT_LOG_MAX]; struct printk_record record; }; @@ -681,7 +683,7 @@ int devkmsg_emit(int facility, int level, const char *fmt, ...) int r; va_start(args, fmt); - r = vprintk_emit(facility, level, NULL, 0, fmt, args); + r = vprintk_emit(facility, level, NULL, fmt, args); va_end(args); return r; @@ -791,8 +793,8 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, len = info_print_ext_header(user->buf, sizeof(user->buf), r->info); len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len, - &r->dict_buf[0], r->info->dict_len, - &r->text_buf[0], r->info->text_len); + &r->text_buf[0], r->info->text_len, + &r->info->dev_info); user->seq = r->info->seq + 1; logbuf_unlock_irq(); @@ -897,7 +899,7 @@ static int devkmsg_open(struct inode *inode, struct file *file) prb_rec_init_rd(&user->record, &user->info, &user->text_buf[0], sizeof(user->text_buf), - &user->dict_buf[0], sizeof(user->dict_buf)); + NULL, 0); logbuf_lock_irq(); user->seq = prb_first_valid_seq(prb); @@ -941,6 +943,8 @@ const struct file_operations kmsg_fops = { */ void log_buf_vmcoreinfo_setup(void) { + struct dev_printk_info *dev_info = NULL; + VMCOREINFO_SYMBOL(prb); VMCOREINFO_SYMBOL(printk_rb_static); VMCOREINFO_SYMBOL(clear_seq); @@ -978,6 +982,13 @@ void log_buf_vmcoreinfo_setup(void) VMCOREINFO_OFFSET(printk_info, text_len); VMCOREINFO_OFFSET(printk_info, dict_len); VMCOREINFO_OFFSET(printk_info, caller_id); + VMCOREINFO_OFFSET(printk_info, dev_info); + + VMCOREINFO_STRUCT_SIZE(dev_printk_info); + VMCOREINFO_OFFSET(dev_printk_info, subsystem); + VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem)); + VMCOREINFO_OFFSET(dev_printk_info, device); + VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device)); VMCOREINFO_STRUCT_SIZE(prb_data_ring); VMCOREINFO_OFFSET(prb_data_ring, size_bits); @@ -1070,22 +1081,19 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, struct prb_reserved_entry e; struct printk_record dest_r; - prb_rec_init_wr(&dest_r, r->info->text_len, r->info->dict_len); + prb_rec_init_wr(&dest_r, r->info->text_len, 0); if (!prb_reserve(&e, rb, &dest_r)) return 0; memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len); dest_r.info->text_len = r->info->text_len; - if (dest_r.dict_buf) { - memcpy(&dest_r.dict_buf[0], &r->dict_buf[0], r->info->dict_len); - dest_r.info->dict_len = r->info->dict_len; - } dest_r.info->facility = r->info->facility; dest_r.info->level = r->info->level; dest_r.info->flags = r->info->flags; dest_r.info->ts_nsec = r->info->ts_nsec; dest_r.info->caller_id = r->info->caller_id; + memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info)); prb_final_commit(&e); @@ -1093,7 +1101,6 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, } static char setup_text_buf[CONSOLE_EXT_LOG_MAX] __initdata; -static char setup_dict_buf[CONSOLE_EXT_LOG_MAX] __initdata; void __init setup_log_buf(int early) { @@ -1165,7 +1172,7 @@ void __init setup_log_buf(int early) prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf), - &setup_dict_buf[0], sizeof(setup_dict_buf)); + NULL, 0); prb_init(&printk_rb_dynamic, new_log_buf, ilog2(new_log_buf_len), @@ -1903,7 +1910,9 @@ static inline u32 printk_caller_id(void) 0x80000000 + raw_smp_processor_id(); } -static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) +static size_t log_output(int facility, int level, enum log_flags lflags, + const struct dev_printk_info *dev_info, + char *text, size_t text_len) { const u32 caller_id = printk_caller_id(); @@ -1927,12 +1936,12 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c /* Store it in the record log */ return log_store(caller_id, facility, level, lflags, 0, - dict, dictlen, text, text_len); + dev_info, text, text_len); } /* Must be called under logbuf_lock. */ int vprintk_store(int facility, int level, - const char *dict, size_t dictlen, + const struct dev_printk_info *dev_info, const char *fmt, va_list args) { static char textbuf[LOG_LINE_MAX]; @@ -1974,15 +1983,14 @@ int vprintk_store(int facility, int level, if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; - if (dict) + if (dev_info) lflags |= LOG_NEWLINE; - return log_output(facility, level, lflags, - dict, dictlen, text, text_len); + return log_output(facility, level, lflags, dev_info, text, text_len); } asmlinkage int vprintk_emit(int facility, int level, - const char *dict, size_t dictlen, + const struct dev_printk_info *dev_info, const char *fmt, va_list args) { int printed_len; @@ -2003,7 +2011,7 @@ asmlinkage int vprintk_emit(int facility, int level, /* This stops the holder of console_sem just where we want him */ logbuf_lock_irqsave(flags); - printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args); + printed_len = vprintk_store(facility, level, dev_info, fmt, args); logbuf_unlock_irqrestore(flags); /* If called from the scheduler, we can not call up(). */ @@ -2037,7 +2045,7 @@ EXPORT_SYMBOL(vprintk); int vprintk_default(const char *fmt, va_list args) { - return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); } EXPORT_SYMBOL_GPL(vprintk_default); @@ -2100,8 +2108,8 @@ static ssize_t info_print_ext_header(char *buf, size_t size, return 0; } static ssize_t msg_print_ext_body(char *buf, size_t size, - char *dict, size_t dict_len, - char *text, size_t text_len) { return 0; } + char *text, size_t text_len, + struct dev_printk_info *dev_info) { return 0; } static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(void) { return 0; } static void call_console_drivers(const char *ext_text, size_t ext_len, @@ -2390,7 +2398,6 @@ void console_unlock(void) { static char ext_text[CONSOLE_EXT_LOG_MAX]; static char text[LOG_LINE_MAX + PREFIX_MAX]; - static char dict[LOG_LINE_MAX]; unsigned long flags; bool do_cond_resched, retry; struct printk_info info; @@ -2401,7 +2408,7 @@ void console_unlock(void) return; } - prb_rec_init_rd(&r, &info, text, sizeof(text), dict, sizeof(dict)); + prb_rec_init_rd(&r, &info, text, sizeof(text), NULL, 0); /* * Console drivers are called with interrupts disabled, so @@ -2473,10 +2480,9 @@ skip: r.info); ext_len += msg_print_ext_body(ext_text + ext_len, sizeof(ext_text) - ext_len, - &r.dict_buf[0], - r.info->dict_len, &r.text_buf[0], - r.info->text_len); + r.info->text_len, + &r.info->dev_info); } len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, @@ -3055,7 +3061,7 @@ int vprintk_deferred(const char *fmt, va_list args) { int r; - r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); + r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); defer_console_output(); return r; diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 97c8561e74e0..480499ce3c6b 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -4,6 +4,7 @@ #define _KERNEL_PRINTK_RINGBUFFER_H #include +#include /* * Meta information about each stored message. @@ -21,6 +22,8 @@ struct printk_info { u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ u32 caller_id; /* thread id or processor id */ + + struct dev_printk_info dev_info; }; /* diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 50aeae770434..5dbc40160990 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -375,7 +375,7 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args) raw_spin_trylock(&logbuf_lock)) { int len; - len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); raw_spin_unlock(&logbuf_lock); defer_console_output(); return len; -- cgit v1.2.3 From f35efc78add6439a9fbe611f2dd517641e82d067 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Sat, 19 Sep 2020 00:40:21 +0206 Subject: printk: remove dict ring Since there is no code that will ever store anything into the dict ring, remove it. If any future dictionary properties are to be added, these should be added to the struct printk_info. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200918223421.21621-4-john.ogness@linutronix.de --- kernel/printk/printk.c | 46 ++++------- kernel/printk/printk_ringbuffer.c | 155 +++++++++----------------------------- kernel/printk/printk_ringbuffer.h | 63 ++++------------ 3 files changed, 64 insertions(+), 200 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f1e243cc284a..1fe3d0cb2fe0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -427,7 +427,6 @@ static u32 log_buf_len = __LOG_BUF_LEN; * Define the average message size. This only affects the number of * descriptors that will be available. Underestimating is better than * overestimating (too many available descriptors is better than not enough). - * The dictionary buffer will be the same size as the text buffer. */ #define PRB_AVGBITS 5 /* 32 character average length */ @@ -435,7 +434,7 @@ static u32 log_buf_len = __LOG_BUF_LEN; #error CONFIG_LOG_BUF_SHIFT value too small. #endif _DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS, - PRB_AVGBITS, PRB_AVGBITS, &__log_buf[0]); + PRB_AVGBITS, &__log_buf[0]); static struct printk_ringbuffer printk_rb_dynamic; @@ -502,12 +501,12 @@ static int log_store(u32 caller_id, int facility, int level, struct printk_record r; u16 trunc_msg_len = 0; - prb_rec_init_wr(&r, text_len, 0); + prb_rec_init_wr(&r, text_len); if (!prb_reserve(&e, prb, &r)) { /* truncate the message if it is too long for empty buffer */ truncate_msg(&text_len, &trunc_msg_len); - prb_rec_init_wr(&r, text_len + trunc_msg_len, 0); + prb_rec_init_wr(&r, text_len + trunc_msg_len); /* survive when the log buffer is too small for trunc_msg */ if (!prb_reserve(&e, prb, &r)) return 0; @@ -898,8 +897,7 @@ static int devkmsg_open(struct inode *inode, struct file *file) mutex_init(&user->lock); prb_rec_init_rd(&user->record, &user->info, - &user->text_buf[0], sizeof(user->text_buf), - NULL, 0); + &user->text_buf[0], sizeof(user->text_buf)); logbuf_lock_irq(); user->seq = prb_first_valid_seq(prb); @@ -957,7 +955,6 @@ void log_buf_vmcoreinfo_setup(void) VMCOREINFO_STRUCT_SIZE(printk_ringbuffer); VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring); VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring); - VMCOREINFO_OFFSET(printk_ringbuffer, dict_data_ring); VMCOREINFO_OFFSET(printk_ringbuffer, fail); VMCOREINFO_STRUCT_SIZE(prb_desc_ring); @@ -970,7 +967,6 @@ void log_buf_vmcoreinfo_setup(void) VMCOREINFO_STRUCT_SIZE(prb_desc); VMCOREINFO_OFFSET(prb_desc, state_var); VMCOREINFO_OFFSET(prb_desc, text_blk_lpos); - VMCOREINFO_OFFSET(prb_desc, dict_blk_lpos); VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos); VMCOREINFO_OFFSET(prb_data_blk_lpos, begin); @@ -980,7 +976,6 @@ void log_buf_vmcoreinfo_setup(void) VMCOREINFO_OFFSET(printk_info, seq); VMCOREINFO_OFFSET(printk_info, ts_nsec); VMCOREINFO_OFFSET(printk_info, text_len); - VMCOREINFO_OFFSET(printk_info, dict_len); VMCOREINFO_OFFSET(printk_info, caller_id); VMCOREINFO_OFFSET(printk_info, dev_info); @@ -1081,7 +1076,7 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, struct prb_reserved_entry e; struct printk_record dest_r; - prb_rec_init_wr(&dest_r, r->info->text_len, 0); + prb_rec_init_wr(&dest_r, r->info->text_len); if (!prb_reserve(&e, rb, &dest_r)) return 0; @@ -1112,7 +1107,6 @@ void __init setup_log_buf(int early) size_t new_descs_size; size_t new_infos_size; unsigned long flags; - char *new_dict_buf; char *new_log_buf; unsigned int free; u64 seq; @@ -1147,19 +1141,12 @@ void __init setup_log_buf(int early) return; } - new_dict_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); - if (unlikely(!new_dict_buf)) { - pr_err("log_buf_len: %lu dict bytes not available\n", - new_log_buf_len); - goto err_free_log_buf; - } - new_descs_size = new_descs_count * sizeof(struct prb_desc); new_descs = memblock_alloc(new_descs_size, LOG_ALIGN); if (unlikely(!new_descs)) { pr_err("log_buf_len: %zu desc bytes not available\n", new_descs_size); - goto err_free_dict_buf; + goto err_free_log_buf; } new_infos_size = new_descs_count * sizeof(struct printk_info); @@ -1170,13 +1157,10 @@ void __init setup_log_buf(int early) goto err_free_descs; } - prb_rec_init_rd(&r, &info, - &setup_text_buf[0], sizeof(setup_text_buf), - NULL, 0); + prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf)); prb_init(&printk_rb_dynamic, new_log_buf, ilog2(new_log_buf_len), - new_dict_buf, ilog2(new_log_buf_len), new_descs, ilog2(new_descs_count), new_infos); @@ -1211,8 +1195,6 @@ void __init setup_log_buf(int early) err_free_descs: memblock_free(__pa(new_descs), new_descs_size); -err_free_dict_buf: - memblock_free(__pa(new_dict_buf), new_log_buf_len); err_free_log_buf: memblock_free(__pa(new_log_buf), new_log_buf_len); } @@ -1463,7 +1445,7 @@ static int syslog_print(char __user *buf, int size) if (!text) return -ENOMEM; - prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX, NULL, 0); + prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); while (size > 0) { size_t n; @@ -1550,7 +1532,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) len -= get_record_print_text_size(&info, line_count, true, time); } - prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX, NULL, 0); + prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); len = 0; prb_for_each_record(seq, prb, seq, &r) { @@ -1920,7 +1902,7 @@ static size_t log_output(int facility, int level, enum log_flags lflags, struct prb_reserved_entry e; struct printk_record r; - prb_rec_init_wr(&r, text_len, 0); + prb_rec_init_wr(&r, text_len); if (prb_reserve_in_last(&e, prb, &r, caller_id)) { memcpy(&r.text_buf[r.info->text_len], text, text_len); r.info->text_len += text_len; @@ -2408,7 +2390,7 @@ void console_unlock(void) return; } - prb_rec_init_rd(&r, &info, text, sizeof(text), NULL, 0); + prb_rec_init_rd(&r, &info, text, sizeof(text)); /* * Console drivers are called with interrupts disabled, so @@ -3266,7 +3248,7 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, size_t l = 0; bool ret = false; - prb_rec_init_rd(&r, &info, line, size, NULL, 0); + prb_rec_init_rd(&r, &info, line, size); if (!dumper->active) goto out; @@ -3357,7 +3339,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, bool ret = false; bool time = printk_time; - prb_rec_init_rd(&r, &info, buf, size, NULL, 0); + prb_rec_init_rd(&r, &info, buf, size); if (!dumper->active || !buf || !size) goto out; @@ -3405,7 +3387,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, l += record_print_text(&r, syslog, time); /* adjust record to store to remaining buffer space */ - prb_rec_init_rd(&r, &info, buf + l, size - l, NULL, 0); + prb_rec_init_rd(&r, &info, buf + l, size - l); seq = r.info->seq + 1; } diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index de4b10a98623..13b94b92342e 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -18,18 +18,13 @@ * A ring of descriptors and their meta data (such as sequence number, * timestamp, loglevel, etc.) as well as internal state information about * the record and logical positions specifying where in the other - * ringbuffers the text and dictionary strings are located. + * ringbuffer the text strings are located. * * text_data_ring * A ring of data blocks. A data block consists of an unsigned long * integer (ID) that maps to a desc_ring index followed by the text * string of the record. * - * dict_data_ring - * A ring of data blocks. A data block consists of an unsigned long - * integer (ID) that maps to a desc_ring index followed by the dictionary - * string of the record. - * * The internal state information of a descriptor is the key element to allow * readers and writers to locklessly synchronize access to the data. * @@ -40,8 +35,8 @@ * ~~~~~~~~~~~~~~~ * The descriptor ring is an array of descriptors. A descriptor contains * essential meta data to track the data of a printk record using - * blk_lpos structs pointing to associated text and dictionary data blocks - * (see "Data Rings" below). Each descriptor is assigned an ID that maps + * blk_lpos structs pointing to associated text data blocks (see + * "Data Rings" below). Each descriptor is assigned an ID that maps * directly to index values of the descriptor array and has a state. The ID * and the state are bitwise combined into a single descriptor field named * @state_var, allowing ID and state to be synchronously and atomically @@ -62,8 +57,8 @@ * writer cannot reopen the descriptor. * * reusable - * The record exists, but its text and/or dictionary data may no longer - * be available. + * The record exists, but its text and/or meta data may no longer be + * available. * * Querying the @state_var of a record requires providing the ID of the * descriptor to query. This can yield a possible fifth (pseudo) state: @@ -77,7 +72,7 @@ * When a new descriptor should be created (and the ring is full), the tail * descriptor is invalidated by first transitioning to the reusable state and * then invalidating all tail data blocks up to and including the data blocks - * associated with the tail descriptor (for text and dictionary rings). Then + * associated with the tail descriptor (for the text ring). Then * @tail_id is advanced, followed by advancing @head_id. And finally the * @state_var of the new descriptor is initialized to the new ID and reserved * state. @@ -108,13 +103,9 @@ * 3) When a record is committed via prb_commit() and a newer record * already exists, the record being committed is automatically finalized. * - * Data Rings - * ~~~~~~~~~~ - * The two data rings (text and dictionary) function identically. They exist - * separately so that their buffer sizes can be individually set and they do - * not affect one another. - * - * Data rings are byte arrays composed of data blocks. Data blocks are + * Data Ring + * ~~~~~~~~~ + * The text data ring is a byte array composed of data blocks. Data blocks are * referenced by blk_lpos structs that point to the logical position of the * beginning of a data block and the beginning of the next adjacent data * block. Logical positions are mapped directly to index values of the byte @@ -165,34 +156,28 @@ * examples a global ringbuffer (test_rb) is available (which is not the * actual ringbuffer used by printk):: * - * DEFINE_PRINTKRB(test_rb, 15, 5, 3); + * DEFINE_PRINTKRB(test_rb, 15, 5); * * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of - * 1 MiB (2 ^ (15 + 5)) for text data and 256 KiB (2 ^ (15 + 3)) for - * dictionary data. + * 1 MiB (2 ^ (15 + 5)) for text data. * * Sample writer code:: * - * const char *dictstr = "dictionary text"; * const char *textstr = "message text"; * struct prb_reserved_entry e; * struct printk_record r; * * // specify how much to allocate - * prb_rec_init_wr(&r, strlen(textstr) + 1, strlen(dictstr) + 1); + * prb_rec_init_wr(&r, strlen(textstr) + 1); * * if (prb_reserve(&e, &test_rb, &r)) { * snprintf(r.text_buf, r.text_buf_size, "%s", textstr); - * r.info->text_len = strlen(textstr); - * - * // dictionary allocation may have failed - * if (r.dict_buf) { - * snprintf(r.dict_buf, r.dict_buf_size, "%s", dictstr); - * r.info->dict_len = strlen(dictstr); - * } * + * r.info->text_len = strlen(textstr); * r.info->ts_nsec = local_clock(); + * r.info->caller_id = printk_caller_id(); * + * // commit and finalize the record * prb_final_commit(&e); * } * @@ -203,8 +188,9 @@ * Sample writer code (record extending):: * * // alternate rest of previous example - * r.info->ts_nsec = local_clock(); + * * r.info->text_len = strlen(textstr); + * r.info->ts_nsec = local_clock(); * r.info->caller_id = printk_caller_id(); * * // commit the record (but do not finalize yet) @@ -214,7 +200,7 @@ * ... * * // specify additional 5 bytes text space to extend - * prb_rec_init_wr(&r, 5, 0); + * prb_rec_init_wr(&r, 5); * * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id())) { * snprintf(&r.text_buf[r.info->text_len], @@ -222,6 +208,7 @@ * * r.info->text_len += 5; * + * // commit and finalize the record * prb_final_commit(&e); * } * @@ -230,11 +217,9 @@ * struct printk_info info; * struct printk_record r; * char text_buf[32]; - * char dict_buf[32]; * u64 seq; * - * prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf), - * &dict_buf[0], sizeof(dict_buf)); + * prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf)); * * prb_for_each_record(0, &test_rb, &seq, &r) { * if (info.seq != seq) @@ -245,13 +230,8 @@ * text_buf[r.text_buf_size - 1] = 0; * } * - * if (info.dict_len > r.dict_buf_size) { - * pr_warn("record %llu dict truncated\n", info.seq); - * dict_buf[r.dict_buf_size - 1] = 0; - * } - * - * pr_info("%llu: %llu: %s;%s\n", info.seq, info.ts_nsec, - * &text_buf[0], info.dict_len ? &dict_buf[0] : ""); + * pr_info("%llu: %llu: %s\n", info.seq, info.ts_nsec, + * &text_buf[0]); * } * * Note that additional less convenient reader functions are available to @@ -495,8 +475,6 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, */ memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ - memcpy(&desc_out->dict_blk_lpos, &desc->dict_blk_lpos, - sizeof(desc_out->dict_blk_lpos)); /* also part of desc_read:C */ if (seq_out) *seq_out = info->seq; /* also part of desc_read:C */ if (caller_id_out) @@ -571,7 +549,7 @@ static void desc_make_reusable(struct prb_desc_ring *desc_ring, } /* - * Given a data ring (text or dict), put the associated descriptor of each + * Given the text data ring, put the associated descriptor of each * data block from @lpos_begin until @lpos_end into the reusable state. * * If there is any problem making the associated descriptor reusable, either @@ -586,21 +564,12 @@ static bool data_make_reusable(struct printk_ringbuffer *rb, unsigned long *lpos_out) { struct prb_desc_ring *desc_ring = &rb->desc_ring; - struct prb_data_blk_lpos *blk_lpos; struct prb_data_block *blk; enum desc_state d_state; struct prb_desc desc; + struct prb_data_blk_lpos *blk_lpos = &desc.text_blk_lpos; unsigned long id; - /* - * Using the provided @data_ring, point @blk_lpos to the correct - * blk_lpos within the local copy of the descriptor. - */ - if (data_ring == &rb->text_data_ring) - blk_lpos = &desc.text_blk_lpos; - else - blk_lpos = &desc.dict_blk_lpos; - /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */ while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) { blk = to_block(data_ring, lpos_begin); @@ -839,8 +808,6 @@ static bool desc_push_tail(struct printk_ringbuffer *rb, if (!data_push_tail(rb, &rb->text_data_ring, desc.text_blk_lpos.next)) return false; - if (!data_push_tail(rb, &rb->dict_data_ring, desc.dict_blk_lpos.next)) - return false; /* * Check the next descriptor after @tail_id before pushing the tail @@ -1347,9 +1314,8 @@ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, * data. * * The writer specifies the text size to extend (not the new total size) by - * setting the @text_buf_size field of @r. Extending dictionaries is not - * supported, so @dict_buf_size of @r should be set to 0. To ensure proper - * initialization of @r, prb_rec_init_wr() should be used. + * setting the @text_buf_size field of @r. To ensure proper initialization + * of @r, prb_rec_init_wr() should be used. * * This function will fail if @caller_id does not match the caller ID of the * newest record. In that case the caller must reserve new data using @@ -1364,9 +1330,6 @@ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, * * - @r->text_buf_size is set to the new total size of the buffer. * - * - @r->dict_buf and @r->dict_buf_size are cleared because extending - * the dict buffer is not supported. - * * - @r->info is not touched so that @r->info->text_len could be used * to append the text. * @@ -1375,8 +1338,7 @@ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, * * Important: All @r->info fields will already be set with the current values * for the record. I.e. @r->info->text_len will be less than - * @text_buf_size and @r->info->dict_len may be set, even though - * @dict_buf_size is 0. Writers can use @r->info->text_len to know + * @text_buf_size. Writers can use @r->info->text_len to know * where concatenation begins and writers should update * @r->info->text_len after concatenating. */ @@ -1454,10 +1416,6 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer if (r->text_buf_size && !r->text_buf) goto fail; - /* Although dictionary data may be in use, it cannot be extended. */ - r->dict_buf = NULL; - r->dict_buf_size = 0; - r->info = info; e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); @@ -1494,27 +1452,21 @@ static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id) * * This is the public function available to writers to reserve data. * - * The writer specifies the text and dict sizes to reserve by setting the - * @text_buf_size and @dict_buf_size fields of @r, respectively. Dictionaries - * are optional, so @dict_buf_size is allowed to be 0. To ensure proper - * initialization of @r, prb_rec_init_wr() should be used. + * The writer specifies the text size to reserve by setting the + * @text_buf_size field of @r. To ensure proper initialization of @r, + * prb_rec_init_wr() should be used. * * Context: Any context. Disables local interrupts on success. * Return: true if at least text data could be allocated, otherwise false. * - * On success, the fields @info, @text_buf, @dict_buf of @r will be set by - * this function and should be filled in by the writer before committing. Also + * On success, the fields @info and @text_buf of @r will be set by this + * function and should be filled in by the writer before committing. Also * on success, prb_record_text_space() can be used on @e to query the actual * space used for the text data block. * - * If the function fails to reserve dictionary space (but all else succeeded), - * it will still report success. In that case @dict_buf is set to NULL and - * @dict_buf_size is set to 0. Writers must check this before writing to - * dictionary space. - * - * Important: @info->text_len and @info->dict_len need to be set correctly by - * the writer in order for data to be readable and/or extended. - * Their values are initialized to 0. + * Important: @info->text_len needs to be set correctly by the writer in + * order for data to be readable and/or extended. Its value + * is initialized to 0. */ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r) @@ -1528,9 +1480,6 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; - if (!data_check_size(&rb->dict_data_ring, r->dict_buf_size)) - goto fail; - /* * Descriptors in the reserved state act as blockers to all further * reservations once the desc_ring has fully wrapped. Disable @@ -1598,15 +1547,6 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, goto fail; } - r->dict_buf = data_alloc(rb, &rb->dict_data_ring, r->dict_buf_size, - &d->dict_blk_lpos, id); - /* - * If dict data allocation fails, the caller can still commit - * text. But dictionary information will not be available. - */ - if (r->dict_buf_size && !r->dict_buf) - r->dict_buf_size = 0; - r->info = info; /* Record full text space used by record. */ @@ -1869,17 +1809,6 @@ static int prb_read(struct printk_ringbuffer *rb, u64 seq, return -ENOENT; } - /* - * Copy dict data. Although this should not fail, dict data is not - * important. So if it fails, modify the copied meta data to report - * that there is no dict data, thus silently dropping the dict data. - */ - if (!copy_data(&rb->dict_data_ring, &desc.dict_blk_lpos, info->dict_len, - r->dict_buf, r->dict_buf_size, NULL)) { - if (r->info) - r->info->dict_len = 0; - } - /* Ensure the record is still finalized and has the same @seq. */ return desc_read_finalized_seq(desc_ring, id, seq, &desc); } @@ -1974,7 +1903,7 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, * * This is the public function available to readers to read a record. * - * The reader provides the @info, @text_buf, @dict_buf buffers of @r to be + * The reader provides the @info and @text_buf buffers of @r to be * filled in. Any of the buffer pointers can be set to NULL if the reader * is not interested in that data. To ensure proper initialization of @r, * prb_rec_init_rd() should be used. @@ -2022,7 +1951,7 @@ bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, { struct printk_record r; - prb_rec_init_rd(&r, info, NULL, 0, NULL, 0); + prb_rec_init_rd(&r, info, NULL, 0); return _prb_read_valid(rb, &seq, &r, line_count); } @@ -2084,8 +2013,6 @@ u64 prb_next_seq(struct printk_ringbuffer *rb) * @rb: The ringbuffer to initialize. * @text_buf: The data buffer for text data. * @textbits: The size of @text_buf as a power-of-2 value. - * @dict_buf: The data buffer for dictionary data. - * @dictbits: The size of @dict_buf as a power-of-2 value. * @descs: The descriptor buffer for ringbuffer records. * @descbits: The count of @descs items as a power-of-2 value. * @infos: The printk_info buffer for ringbuffer records. @@ -2099,7 +2026,6 @@ u64 prb_next_seq(struct printk_ringbuffer *rb) */ void prb_init(struct printk_ringbuffer *rb, char *text_buf, unsigned int textbits, - char *dict_buf, unsigned int dictbits, struct prb_desc *descs, unsigned int descbits, struct printk_info *infos) { @@ -2117,18 +2043,11 @@ void prb_init(struct printk_ringbuffer *rb, atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits)); atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits)); - rb->dict_data_ring.size_bits = dictbits; - rb->dict_data_ring.data = dict_buf; - atomic_long_set(&rb->dict_data_ring.head_lpos, BLK0_LPOS(dictbits)); - atomic_long_set(&rb->dict_data_ring.tail_lpos, BLK0_LPOS(dictbits)); - atomic_long_set(&rb->fail, 0); atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits)); descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS; descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS; - descs[_DESCS_COUNT(descbits) - 1].dict_blk_lpos.begin = FAILED_LPOS; - descs[_DESCS_COUNT(descbits) - 1].dict_blk_lpos.next = FAILED_LPOS; infos[0].seq = -(u64)_DESCS_COUNT(descbits); infos[_DESCS_COUNT(descbits) - 1].seq = 0; diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 480499ce3c6b..0adaa685d1ca 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -9,15 +9,13 @@ /* * Meta information about each stored message. * - * All fields are set and used by the printk code except for - * @seq, @text_len, @dict_len, which are set and/or modified - * by the ringbuffer code. + * All fields are set by the printk code except for @seq, which is + * set by the ringbuffer code. */ struct printk_info { u64 seq; /* sequence number */ u64 ts_nsec; /* timestamp in nanoseconds */ u16 text_len; /* length of text message */ - u16 dict_len; /* length of dictionary message */ u8 facility; /* syslog facility */ u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ @@ -30,23 +28,20 @@ struct printk_info { * A structure providing the buffers, used by writers and readers. * * Writers: - * Using prb_rec_init_wr(), a writer sets @text_buf_size and @dict_buf_size - * before calling prb_reserve(). On success, prb_reserve() sets @info, - * @text_buf, @dict_buf to buffers reserved for that writer. + * Using prb_rec_init_wr(), a writer sets @text_buf_size before calling + * prb_reserve(). On success, prb_reserve() sets @info and @text_buf to + * buffers reserved for that writer. * * Readers: * Using prb_rec_init_rd(), a reader sets all fields before calling - * prb_read_valid(). Note that the reader provides the @info, @text_buf, - * @dict_buf buffers. On success, the struct pointed to by @info will be - * filled and the char arrays pointed to by @text_buf and @dict_buf will - * be filled with text and dict data. + * prb_read_valid(). Note that the reader provides the @info and @text_buf, + * buffers. On success, the struct pointed to by @info will be filled and + * the char array pointed to by @text_buf will be filled with text data. */ struct printk_record { struct printk_info *info; char *text_buf; - char *dict_buf; unsigned int text_buf_size; - unsigned int dict_buf_size; }; /* Specifies the logical position and span of a data block. */ @@ -63,7 +58,6 @@ struct prb_data_blk_lpos { struct prb_desc { atomic_long_t state_var; struct prb_data_blk_lpos text_blk_lpos; - struct prb_data_blk_lpos dict_blk_lpos; }; /* A ringbuffer of "ID + data" elements. */ @@ -92,7 +86,6 @@ struct prb_desc_ring { struct printk_ringbuffer { struct prb_desc_ring desc_ring; struct prb_data_ring text_data_ring; - struct prb_data_ring dict_data_ring; atomic_long_t fail; }; @@ -236,9 +229,7 @@ enum desc_state { * Note: The specified external buffer must be of the size: * 2 ^ (descbits + avgtextbits) */ -#define _DEFINE_PRINTKRB(name, descbits, avgtextbits, avgdictbits, text_buf) \ -static char _##name##_dict[1U << ((avgdictbits) + (descbits))] \ - __aligned(__alignof__(unsigned long)); \ +#define _DEFINE_PRINTKRB(name, descbits, avgtextbits, text_buf) \ static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \ /* the initial head and tail */ \ [_DESCS_COUNT(descbits) - 1] = { \ @@ -246,7 +237,6 @@ static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \ .state_var = ATOMIC_INIT(DESC0_SV(descbits)), \ /* no associated data block */ \ .text_blk_lpos = FAILED_BLK_LPOS, \ - .dict_blk_lpos = FAILED_BLK_LPOS, \ }, \ }; \ static struct printk_info _##name##_infos[_DESCS_COUNT(descbits)] = { \ @@ -275,12 +265,6 @@ static struct printk_ringbuffer name = { \ .head_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ .tail_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ }, \ - .dict_data_ring = { \ - .size_bits = (avgtextbits) + (descbits), \ - .data = &_##name##_dict[0], \ - .head_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ - .tail_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ - }, \ .fail = ATOMIC_LONG_INIT(0), \ } @@ -290,17 +274,15 @@ static struct printk_ringbuffer name = { \ * @name: The name of the ringbuffer variable. * @descbits: The number of descriptors as a power-of-2 value. * @avgtextbits: The average text data size per record as a power-of-2 value. - * @avgdictbits: The average dictionary data size per record as a - * power-of-2 value. * * This is a macro for defining a ringbuffer and all internal structures * such that it is ready for immediate use. See _DEFINE_PRINTKRB() for a * variant where the text data buffer can be specified externally. */ -#define DEFINE_PRINTKRB(name, descbits, avgtextbits, avgdictbits) \ +#define DEFINE_PRINTKRB(name, descbits, avgtextbits) \ static char _##name##_text[1U << ((avgtextbits) + (descbits))] \ __aligned(__alignof__(unsigned long)); \ -_DEFINE_PRINTKRB(name, descbits, avgtextbits, avgdictbits, &_##name##_text[0]) +_DEFINE_PRINTKRB(name, descbits, avgtextbits, &_##name##_text[0]) /* Writer Interface */ @@ -309,26 +291,13 @@ _DEFINE_PRINTKRB(name, descbits, avgtextbits, avgdictbits, &_##name##_text[0]) * * @r: The record to initialize. * @text_buf_size: The needed text buffer size. - * @dict_buf_size: The needed dictionary buffer size. - * - * Initialize all the fields that a writer is interested in. If - * @dict_buf_size is 0, a dictionary buffer will not be reserved. - * @text_buf_size must be greater than 0. - * - * Note that although @dict_buf_size may be initialized to non-zero, - * its value must be rechecked after a successful call to prb_reserve() - * to verify a dictionary buffer was actually reserved. Dictionary buffer - * reservation is allowed to fail. */ static inline void prb_rec_init_wr(struct printk_record *r, - unsigned int text_buf_size, - unsigned int dict_buf_size) + unsigned int text_buf_size) { r->info = NULL; r->text_buf = NULL; - r->dict_buf = NULL; r->text_buf_size = text_buf_size; - r->dict_buf_size = dict_buf_size; } bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, @@ -340,7 +309,6 @@ void prb_final_commit(struct prb_reserved_entry *e); void prb_init(struct printk_ringbuffer *rb, char *text_buf, unsigned int text_buf_size, - char *dict_buf, unsigned int dict_buf_size, struct prb_desc *descs, unsigned int descs_count_bits, struct printk_info *infos); unsigned int prb_record_text_space(struct prb_reserved_entry *e); @@ -354,8 +322,6 @@ unsigned int prb_record_text_space(struct prb_reserved_entry *e); * @info: A buffer to store record meta-data. * @text_buf: A buffer to store text data. * @text_buf_size: The size of @text_buf. - * @dict_buf: A buffer to store dictionary data. - * @dict_buf_size: The size of @dict_buf. * * Initialize all the fields that a reader is interested in. All arguments * (except @r) are optional. Only record data for arguments that are @@ -363,14 +329,11 @@ unsigned int prb_record_text_space(struct prb_reserved_entry *e); */ static inline void prb_rec_init_rd(struct printk_record *r, struct printk_info *info, - char *text_buf, unsigned int text_buf_size, - char *dict_buf, unsigned int dict_buf_size) + char *text_buf, unsigned int text_buf_size) { r->info = info; r->text_buf = text_buf; - r->dict_buf = dict_buf; r->text_buf_size = text_buf_size; - r->dict_buf_size = dict_buf_size; } /** -- cgit v1.2.3 From 900ffe39fec908e0aa26a30612e43ebc7140db79 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 19 Sep 2020 01:09:36 -0700 Subject: x86/entry: Fix typo in comments for syscall_enter_from_user_mode() Just to help myself and others with finding the correct function names, fix a typo for "usermode" vs "user_mode". Signed-off-by: Kees Cook Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200919080936.259819-1-keescook@chromium.org --- kernel/entry/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index fcae019158ca..7c7b9d00338a 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -183,7 +183,7 @@ static inline bool report_single_step(unsigned long ti_work) /* * If TIF_SYSCALL_EMU is set, then the only reason to report is when * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall - * instruction has been already reported in syscall_enter_from_usermode(). + * instruction has been already reported in syscall_enter_from_user_mode(). */ #define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU) -- cgit v1.2.3 From bb3247a399801ebba20bef101c89e563f5fe7f02 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 21 Sep 2020 09:19:55 +0200 Subject: PM: rewrite is_hibernate_resume_dev to not require an inode Just check the dev_t to help simplifying the code. Signed-off-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Jens Axboe --- kernel/power/user.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index d5eedc2baa2a..b5815685b944 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -35,12 +35,12 @@ static struct snapshot_data { bool ready; bool platform_support; bool free_bitmaps; - struct inode *bd_inode; + dev_t dev; } snapshot_state; -int is_hibernate_resume_dev(const struct inode *bd_inode) +int is_hibernate_resume_dev(dev_t dev) { - return hibernation_available() && snapshot_state.bd_inode == bd_inode; + return hibernation_available() && snapshot_state.dev == dev; } static int snapshot_open(struct inode *inode, struct file *filp) @@ -101,7 +101,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->frozen = false; data->ready = false; data->platform_support = false; - data->bd_inode = NULL; + data->dev = 0; Unlock: unlock_system_sleep(); @@ -117,7 +117,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) swsusp_free(); data = filp->private_data; - data->bd_inode = NULL; + data->dev = 0; free_all_swap_pages(data->swap); if (data->frozen) { pm_restore_gfp_mask(); @@ -245,7 +245,7 @@ static int snapshot_set_swap_area(struct snapshot_data *data, if (data->swap < 0) return -ENODEV; - data->bd_inode = bdev->bd_inode; + data->dev = bdev->bd_dev; bdput(bdev); return 0; } -- cgit v1.2.3 From 21bd900572f3708e281ea25f051fc92462eb1193 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 21 Sep 2020 09:19:56 +0200 Subject: mm: split swap_type_of swap_type_of is used for two entirely different purposes: (1) check what swap type a given device/offset corresponds to (2) find the first available swap device that can be written to Mixing both in a single function creates an unreadable mess. Create two separate functions instead, and switch both to pass a dev_t instead of a struct block_device to further simplify the code. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- kernel/power/swap.c | 17 ++++++++--------- kernel/power/user.c | 16 ++++------------ 2 files changed, 12 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 01e2858b5fe3..9d3ffbfe08db 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -335,12 +335,17 @@ static int swsusp_swap_check(void) { int res; - res = swap_type_of(swsusp_resume_device, swsusp_resume_block, - &hib_resume_bdev); + if (swsusp_resume_device) + res = swap_type_of(swsusp_resume_device, swsusp_resume_block); + else + res = find_first_swap(&swsusp_resume_device); if (res < 0) return res; - root_swap = res; + + hib_resume_bdev = bdget(swsusp_resume_device); + if (!hib_resume_bdev) + return -ENOMEM; res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL); if (res) return res; @@ -349,12 +354,6 @@ static int swsusp_swap_check(void) if (res < 0) blkdev_put(hib_resume_bdev, FMODE_WRITE); - /* - * Update the resume device to the one actually used, - * so the test_resume mode can use it in case it is - * invoked from hibernate() to test the snapshot. - */ - swsusp_resume_device = hib_resume_bdev->bd_dev; return res; } diff --git a/kernel/power/user.c b/kernel/power/user.c index b5815685b944..6510a8f7687f 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -69,8 +69,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) memset(&data->handle, 0, sizeof(struct snapshot_handle)); if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { /* Hibernating. The image device should be accessible. */ - data->swap = swsusp_resume_device ? - swap_type_of(swsusp_resume_device, 0, NULL) : -1; + data->swap = swap_type_of(swsusp_resume_device, 0); data->mode = O_RDONLY; data->free_bitmaps = false; error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); @@ -210,7 +209,6 @@ struct compat_resume_swap_area { static int snapshot_set_swap_area(struct snapshot_data *data, void __user *argp) { - struct block_device *bdev; sector_t offset; dev_t swdev; @@ -237,16 +235,10 @@ static int snapshot_set_swap_area(struct snapshot_data *data, * User space encodes device types as two-byte values, * so we need to recode them */ - if (!swdev) { - data->swap = -1; - return -EINVAL; - } - data->swap = swap_type_of(swdev, offset, &bdev); + data->swap = swap_type_of(swdev, offset); if (data->swap < 0) - return -ENODEV; - - data->dev = bdev->bd_dev; - bdput(bdev); + return swdev ? -ENODEV : -EINVAL; + data->dev = swdev; return 0; } -- cgit v1.2.3 From 36daaa98f7ad9224421b18549a3f7bdf4d220414 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 21 Sep 2020 09:19:57 +0200 Subject: PM: mm: cleanup swsusp_swap_check Use blkdev_get_by_dev instead of bdget + blkdev_get. Signed-off-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Signed-off-by: Jens Axboe --- kernel/power/swap.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 9d3ffbfe08db..71385bedcc3a 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -343,12 +343,10 @@ static int swsusp_swap_check(void) return res; root_swap = res; - hib_resume_bdev = bdget(swsusp_resume_device); - if (!hib_resume_bdev) - return -ENOMEM; - res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL); - if (res) - return res; + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, FMODE_WRITE, + NULL); + if (IS_ERR(hib_resume_bdev)) + return PTR_ERR(hib_resume_bdev); res = set_blocksize(hib_resume_bdev, PAGE_SIZE); if (res < 0) -- cgit v1.2.3 From 0789e13bc3f84f0adafe1935af036956638950f9 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 23 Sep 2020 17:01:55 +0100 Subject: bpf: Explicitly size compatible_reg_types Arrays with designated initializers have an implicit length of the highest initialized value plus one. I used this to ensure that newly added entries in enum bpf_reg_type get a NULL entry in compatible_reg_types. This is difficult to understand since it requires knowledge of the peculiarities of designated initializers. Use __BPF_ARG_TYPE_MAX to size the array instead. Suggested-by: Alexei Starovoitov Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200923160156.80814-1-lmb@cloudflare.com --- kernel/bpf/verifier.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 15ab889b0a3f..d7c993ded26a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4002,7 +4002,7 @@ static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_T static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; -static const struct bpf_reg_types *compatible_reg_types[] = { +static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types, @@ -4025,7 +4025,6 @@ static const struct bpf_reg_types *compatible_reg_types[] = { [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, - [__BPF_ARG_TYPE_MAX] = NULL, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, -- cgit v1.2.3 From f00f2f7fe86036a586e26bd55ad86310477199a3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 23 Sep 2020 19:10:38 -0700 Subject: Revert "bpf: Fix potential call bpf_link_free() in atomic context" This reverts commit 31f23a6a181c81543b10a1a9056b0e6c7ef1c747. This change made many selftests/bpf flaky: flow_dissector, sk_lookup, sk_assign and others. There was no issue in the code. Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 34268491d2de..2740df19f55e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2345,8 +2345,12 @@ void bpf_link_put(struct bpf_link *link) if (!atomic64_dec_and_test(&link->refcnt)) return; - INIT_WORK(&link->work, bpf_link_put_deferred); - schedule_work(&link->work); + if (in_atomic()) { + INIT_WORK(&link->work, bpf_link_put_deferred); + schedule_work(&link->work); + } else { + bpf_link_free(link); + } } static int bpf_link_release(struct inode *inode, struct file *filp) -- cgit v1.2.3 From e9ffc8c1b83931599663c21ba9082bcafa51d333 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 21 Sep 2020 21:24:25 -0400 Subject: kprobes: Use module_name() macro It is advised to use module_name() macro instead of dereferencing mod->name directly. This makes sense for consistencys sake and also it prevents a hard dependency to CONFIG_MODULES. Signed-off-by: Jarkko Sakkinen Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu Link: https://lkml.kernel.org/r/20200818050857.117998-1-jarkko.sakkinen@linux.intel.com --- kernel/trace/trace_kprobe.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index aefb6065b508..19c00ee90945 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -106,9 +106,10 @@ static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk) static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, struct module *mod) { - int len = strlen(mod->name); + int len = strlen(module_name(mod)); const char *name = trace_kprobe_symbol(tk); - return strncmp(mod->name, name, len) == 0 && name[len] == ':'; + + return strncmp(module_name(mod), name, len) == 0 && name[len] == ':'; } static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) @@ -688,7 +689,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, if (ret) pr_warn("Failed to re-register probe %s on %s: %d\n", trace_probe_name(&tk->tp), - mod->name, ret); + module_name(mod), ret); } } mutex_unlock(&event_mutex); -- cgit v1.2.3 From f9e62f318fd706a54b7ce9b28e5c7e49bbde8788 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Fri, 14 Aug 2020 17:40:27 -0700 Subject: treewide: Make all debug_obj_descriptors const This should make it harder for the kernel to corrupt the debug object descriptor, used to call functions to fixup state and track debug objects, by moving the structure to read-only memory. Signed-off-by: Stephen Boyd Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20200815004027.2046113-3-swboyd@chromium.org --- kernel/rcu/rcu.h | 2 +- kernel/rcu/update.c | 2 +- kernel/time/hrtimer.c | 4 ++-- kernel/time/timer.c | 4 ++-- kernel/workqueue.c | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index cf66a3ccd757..e01cba5e4b52 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -167,7 +167,7 @@ static inline unsigned long rcu_seq_diff(unsigned long new, unsigned long old) # define STATE_RCU_HEAD_READY 0 # define STATE_RCU_HEAD_QUEUED 1 -extern struct debug_obj_descr rcuhead_debug_descr; +extern const struct debug_obj_descr rcuhead_debug_descr; static inline int debug_rcu_head_queue(struct rcu_head *head) { diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 2de49b5d8dd2..3e0f4bcb558f 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -469,7 +469,7 @@ void destroy_rcu_head_on_stack(struct rcu_head *head) } EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); -struct debug_obj_descr rcuhead_debug_descr = { +const struct debug_obj_descr rcuhead_debug_descr = { .name = "rcu_head", .is_static_object = rcuhead_is_static_object, }; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 95b6a708b040..3624b9b5835d 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -342,7 +342,7 @@ EXPORT_SYMBOL_GPL(ktime_add_safe); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS -static struct debug_obj_descr hrtimer_debug_descr; +static const struct debug_obj_descr hrtimer_debug_descr; static void *hrtimer_debug_hint(void *addr) { @@ -401,7 +401,7 @@ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) } } -static struct debug_obj_descr hrtimer_debug_descr = { +static const struct debug_obj_descr hrtimer_debug_descr = { .name = "hrtimer", .debug_hint = hrtimer_debug_hint, .fixup_init = hrtimer_fixup_init, diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a50364df1054..8b17cf28e54b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -611,7 +611,7 @@ static void internal_add_timer(struct timer_base *base, struct timer_list *timer #ifdef CONFIG_DEBUG_OBJECTS_TIMERS -static struct debug_obj_descr timer_debug_descr; +static const struct debug_obj_descr timer_debug_descr; static void *timer_debug_hint(void *addr) { @@ -707,7 +707,7 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) } } -static struct debug_obj_descr timer_debug_descr = { +static const struct debug_obj_descr timer_debug_descr = { .name = "timer_list", .debug_hint = timer_debug_hint, .is_static_object = timer_is_static_object, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c41c3c17b86a..ac088ce6059b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -427,7 +427,7 @@ static void show_pwq(struct pool_workqueue *pwq); #ifdef CONFIG_DEBUG_OBJECTS_WORK -static struct debug_obj_descr work_debug_descr; +static const struct debug_obj_descr work_debug_descr; static void *work_debug_hint(void *addr) { @@ -477,7 +477,7 @@ static bool work_fixup_free(void *addr, enum debug_obj_state state) } } -static struct debug_obj_descr work_debug_descr = { +static const struct debug_obj_descr work_debug_descr = { .name = "work_struct", .debug_hint = work_debug_hint, .is_static_object = work_is_static_object, -- cgit v1.2.3 From b952caf2d5ca898cc10d63be7722ae7a5daca696 Mon Sep 17 00:00:00 2001 From: Qianli Zhao Date: Thu, 13 Aug 2020 23:03:14 +0800 Subject: timers: Mask invalid flags in do_init_timer() do_init_timer() accepts any combination of timer flags handed in by the caller without a sanity check, but only TIMER_DEFFERABLE, TIMER_PINNED and TIMER_IRQSAFE are valid. If the supplied flags have other bits set, this could result in malfunction. If bits are set in TIMER_CPUMASK the first timer usage could deference a cpu base which is outside the range of possible CPUs. If TIMER_MIGRATION is set, then the switch_timer_base() will live lock. Prevent that with a sanity check which warns when invalid flags are supplied and masks them out. [ tglx: Made it WARN_ON_ONCE() and added context to the changelog ] Signed-off-by: Qianli Zhao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/9d79a8aa4eb56713af7379f99f062dedabcde140.1597326756.git.zhaoqianli@xiaomi.com --- kernel/time/timer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a16764b0116e..25e048d0e660 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -794,6 +794,8 @@ static void do_init_timer(struct timer_list *timer, { timer->entry.pprev = NULL; timer->function = func; + if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS)) + flags &= TIMER_INIT_FLAGS; timer->flags = flags | raw_smp_processor_id(); lockdep_init_map(&timer->lockdep_map, name, key, 0); } -- cgit v1.2.3 From 5a84292271402cffe0717bc58e2ad9a0c7977272 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Sep 2020 13:28:13 +0200 Subject: dma-mapping: remove dma_cache_sync All users are gone now, remove the API. Signed-off-by: Christoph Hellwig Acked-by: Thomas Bogendoerfer (MIPS part) --- kernel/dma/Kconfig | 3 --- kernel/dma/mapping.c | 14 -------------- 2 files changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 281785feb874..c5f717021f56 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -75,9 +75,6 @@ config ARCH_HAS_DMA_PREP_COHERENT config ARCH_HAS_FORCE_DMA_UNENCRYPTED bool -config DMA_NONCOHERENT_CACHE_SYNC - bool - config DMA_VIRT_OPS bool depends on HAS_DMA diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 9a045e51df17..673ea6759c15 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -529,20 +529,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask) EXPORT_SYMBOL(dma_set_coherent_mask); #endif -void dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - - if (dma_alloc_direct(dev, ops)) - arch_dma_cache_sync(dev, vaddr, size, dir); - else if (ops->cache_sync) - ops->cache_sync(dev, vaddr, size, dir); -} -EXPORT_SYMBOL(dma_cache_sync); - size_t dma_max_mapping_size(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); -- cgit v1.2.3 From efa70f2fdc842e63a0a13223e0e83cedcc2117f1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Sep 2020 13:34:33 +0200 Subject: dma-mapping: add a new dma_alloc_pages API This API is the equivalent of alloc_pages, except that the returned memory is guaranteed to be DMA addressable by the passed in device. The implementation will also be used to provide a more sensible replacement for DMA_ATTR_NON_CONSISTENT flag. Additionally dma_alloc_noncoherent is switched over to use dma_alloc_pages as its backend. Signed-off-by: Christoph Hellwig Acked-by: Thomas Bogendoerfer (MIPS part) --- kernel/dma/direct.c | 52 ++++++++++++++++++++++++++++++++++++++- kernel/dma/mapping.c | 63 +++++++++++++++++++++++++++++++++++++++++++++--- kernel/dma/ops_helpers.c | 35 +++++++++++++++++++++++++++ kernel/dma/virt.c | 2 ++ 4 files changed, 148 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 750659f7447c..121a9c1969dd 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (C) 2018 Christoph Hellwig. + * Copyright (C) 2018-2020 Christoph Hellwig. * * DMA operations that map physical memory directly without using an IOMMU. */ @@ -292,6 +292,56 @@ void dma_direct_free(struct device *dev, size_t size, dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size); } +struct page *dma_direct_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) +{ + struct page *page; + void *ret; + + if (dma_should_alloc_from_pool(dev, gfp, 0)) { + page = dma_alloc_from_pool(dev, size, &ret, gfp, + dma_coherent_ok); + if (!page) + return NULL; + goto done; + } + + page = __dma_direct_alloc_pages(dev, size, gfp); + if (!page) + return NULL; + ret = page_address(page); + if (force_dma_unencrypted(dev)) { + if (set_memory_decrypted((unsigned long)ret, + 1 << get_order(size))) + goto out_free_pages; + } + memset(ret, 0, size); +done: + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); + return page; +out_free_pages: + dma_free_contiguous(dev, page, size); + return NULL; +} + +void dma_direct_free_pages(struct device *dev, size_t size, + struct page *page, dma_addr_t dma_addr, + enum dma_data_direction dir) +{ + unsigned int page_order = get_order(size); + void *vaddr = page_address(page); + + /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ + if (dma_should_free_from_pool(dev, 0) && + dma_free_from_pool(dev, vaddr, size)) + return; + + if (force_dma_unencrypted(dev)) + set_memory_encrypted((unsigned long)vaddr, 1 << page_order); + + dma_free_contiguous(dev, page, size); +} + #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_SWIOTLB) void dma_direct_sync_sg_for_device(struct device *dev, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 673ea6759c15..06115f59f4ff 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -341,9 +341,7 @@ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) { if (force_dma_unencrypted(dev)) prot = pgprot_decrypted(prot); - if (dev_is_dma_coherent(dev) || - (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) && - (attrs & DMA_ATTR_NON_CONSISTENT))) + if (dev_is_dma_coherent(dev)) return prot; #ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE if (attrs & DMA_ATTR_WRITE_COMBINE) @@ -472,6 +470,65 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, } EXPORT_SYMBOL(dma_free_attrs); +struct page *dma_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + struct page *page; + + if (WARN_ON_ONCE(!dev->coherent_dma_mask)) + return NULL; + if (WARN_ON_ONCE(gfp & (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM))) + return NULL; + + size = PAGE_ALIGN(size); + if (dma_alloc_direct(dev, ops)) + page = dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); + else if (ops->alloc_pages) + page = ops->alloc_pages(dev, size, dma_handle, dir, gfp); + else + return NULL; + + debug_dma_map_page(dev, page, 0, size, dir, *dma_handle); + + return page; +} +EXPORT_SYMBOL_GPL(dma_alloc_pages); + +void dma_free_pages(struct device *dev, size_t size, struct page *page, + dma_addr_t dma_handle, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + size = PAGE_ALIGN(size); + debug_dma_unmap_page(dev, dma_handle, size, dir); + + if (dma_alloc_direct(dev, ops)) + dma_direct_free_pages(dev, size, page, dma_handle, dir); + else if (ops->free_pages) + ops->free_pages(dev, size, page, dma_handle, dir); +} +EXPORT_SYMBOL_GPL(dma_free_pages); + +void *dma_alloc_noncoherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) +{ + struct page *page; + + page = dma_alloc_pages(dev, size, dma_handle, dir, gfp); + if (!page) + return NULL; + return page_address(page); +} +EXPORT_SYMBOL_GPL(dma_alloc_noncoherent); + +void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, enum dma_data_direction dir) +{ + dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir); +} +EXPORT_SYMBOL_GPL(dma_free_noncoherent); + int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index e443c69be429..5828e5e01b79 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -3,6 +3,7 @@ * Helpers for DMA ops implementations. These generally rely on the fact that * the allocated memory contains normal pages in the direct kernel mapping. */ +#include #include /* @@ -49,3 +50,37 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, return -ENXIO; #endif /* CONFIG_MMU */ } + +struct page *dma_common_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + struct page *page; + + page = dma_alloc_contiguous(dev, size, gfp); + if (!page) + page = alloc_pages_node(dev_to_node(dev), gfp, get_order(size)); + if (!page) + return NULL; + + *dma_handle = ops->map_page(dev, page, 0, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + if (*dma_handle == DMA_MAPPING_ERROR) { + dma_free_contiguous(dev, page, size); + return NULL; + } + + memset(page_address(page), 0, size); + return page; +} + +void dma_common_free_pages(struct device *dev, size_t size, struct page *page, + dma_addr_t dma_handle, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (ops->unmap_page) + ops->unmap_page(dev, dma_handle, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + dma_free_contiguous(dev, page, size); +} diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c index ebe128833af7..6986bf1fd668 100644 --- a/kernel/dma/virt.c +++ b/kernel/dma/virt.c @@ -55,5 +55,7 @@ const struct dma_map_ops dma_virt_ops = { .free = dma_virt_free, .map_page = dma_virt_map_page, .map_sg = dma_virt_map_sg, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, }; EXPORT_SYMBOL(dma_virt_ops); -- cgit v1.2.3 From de7cf917768f438aae6d2f4e9bced3739f15f5b6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Sep 2020 13:59:45 +0200 Subject: dma-mapping: add new {alloc,free}_noncoherent dma_map_ops methods This will allow IOMMU drivers to allocate non-contigous memory and return a vmapped virtual address. Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 06115f59f4ff..9669550656a0 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -513,19 +513,40 @@ EXPORT_SYMBOL_GPL(dma_free_pages); void *dma_alloc_noncoherent(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) { - struct page *page; + const struct dma_map_ops *ops = get_dma_ops(dev); + void *vaddr; - page = dma_alloc_pages(dev, size, dma_handle, dir, gfp); - if (!page) - return NULL; - return page_address(page); + if (!ops || !ops->alloc_noncoherent) { + struct page *page; + + page = dma_alloc_pages(dev, size, dma_handle, dir, gfp); + if (!page) + return NULL; + return page_address(page); + } + + size = PAGE_ALIGN(size); + vaddr = ops->alloc_noncoherent(dev, size, dma_handle, dir, gfp); + if (vaddr) + debug_dma_map_page(dev, virt_to_page(vaddr), 0, size, dir, + *dma_handle); + return vaddr; } EXPORT_SYMBOL_GPL(dma_alloc_noncoherent); void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, enum dma_data_direction dir) { - dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir); + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (!ops || !ops->free_noncoherent) { + dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir); + return; + } + + size = PAGE_ALIGN(size); + debug_dma_unmap_page(dev, dma_handle, size, dir); + ops->free_noncoherent(dev, size, vaddr, dma_handle, dir); } EXPORT_SYMBOL_GPL(dma_free_noncoherent); -- cgit v1.2.3 From 8e0e0eda6a13e242239799263cc354796c05434a Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 21 Sep 2020 09:29:59 +0200 Subject: sched/numa: Use runnable_avg to classify node Use runnable_avg to classify numa node state similarly to what is done for normal load balancer. This helps to ensure that numa and normal balancers use the same view of the state of the system. Large arm64system: 2 nodes / 224 CPUs: hackbench -l (256000/#grp) -g #grp grp tip/sched/core +patchset improvement 1 14,008(+/- 4,99 %) 13,800(+/- 3.88 %) 1,48 % 4 4,340(+/- 5.35 %) 4.283(+/- 4.85 %) 1,33 % 16 3,357(+/- 0.55 %) 3.359(+/- 0.54 %) -0,06 % 32 3,050(+/- 0.94 %) 3.039(+/- 1,06 %) 0,38 % 64 2.968(+/- 1,85 %) 3.006(+/- 2.92 %) -1.27 % 128 3,290(+/-12.61 %) 3,108(+/- 5.97 %) 5.51 % 256 3.235(+/- 3.95 %) 3,188(+/- 2.83 %) 1.45 % Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mel Gorman Link: https://lkml.kernel.org/r/20200921072959.16317-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 33699db27ed5..a15deb210a17 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1504,6 +1504,7 @@ enum numa_type { /* Cached statistics for all CPUs within a node */ struct numa_stats { unsigned long load; + unsigned long runnable; unsigned long util; /* Total compute capacity of CPUs on a node */ unsigned long compute_capacity; @@ -1547,6 +1548,7 @@ struct task_numa_env { }; static unsigned long cpu_load(struct rq *rq); +static unsigned long cpu_runnable(struct rq *rq); static unsigned long cpu_util(int cpu); static inline long adjust_numa_imbalance(int imbalance, int src_nr_running); @@ -1555,11 +1557,13 @@ numa_type numa_classify(unsigned int imbalance_pct, struct numa_stats *ns) { if ((ns->nr_running > ns->weight) && - ((ns->compute_capacity * 100) < (ns->util * imbalance_pct))) + (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) || + ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100)))) return node_overloaded; if ((ns->nr_running < ns->weight) || - ((ns->compute_capacity * 100) > (ns->util * imbalance_pct))) + (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) && + ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100)))) return node_has_spare; return node_fully_busy; @@ -1610,6 +1614,7 @@ static void update_numa_stats(struct task_numa_env *env, struct rq *rq = cpu_rq(cpu); ns->load += cpu_load(rq); + ns->runnable += cpu_runnable(rq); ns->util += cpu_util(cpu); ns->nr_running += rq->cfs.h_nr_running; ns->compute_capacity += capacity_of(cpu); -- cgit v1.2.3 From 46fcc4b00c3cca8adb9b7c9afdd499f64e427135 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 31 Aug 2020 13:07:19 +0200 Subject: sched/deadline: Fix stale throttling on de-/boosted tasks When a boosted task gets throttled, what normally happens is that it's immediately enqueued again with ENQUEUE_REPLENISH, which replenishes the runtime and clears the dl_throttled flag. There is a special case however: if the throttling happened on sched-out and the task has been deboosted in the meantime, the replenish is skipped as the task will return to its normal scheduling class. This leaves the task with the dl_throttled flag set. Now if the task gets boosted up to the deadline scheduling class again while it is sleeping, it's still in the throttled state. The normal wakeup however will enqueue the task with ENQUEUE_REPLENISH not set, so we don't actually place it on the rq. Thus we end up with a task that is runnable, but not actually on the rq and neither a immediate replenishment happens, nor is the replenishment timer set up, so the task is stuck in forever-throttled limbo. Clear the dl_throttled flag before dropping back to the normal scheduling class to fix this issue. Signed-off-by: Lucas Stach Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20200831110719.2126930-1-l.stach@pengutronix.de --- kernel/sched/deadline.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3862a28cd05d..c19c1883d695 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1527,12 +1527,15 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) pi_se = &pi_task->dl; } else if (!dl_prio(p->normal_prio)) { /* - * Special case in which we have a !SCHED_DEADLINE task - * that is going to be deboosted, but exceeds its - * runtime while doing so. No point in replenishing - * it, as it's going to return back to its original - * scheduling class after this. + * Special case in which we have a !SCHED_DEADLINE task that is going + * to be deboosted, but exceeds its runtime while doing so. No point in + * replenishing it, as it's going to return back to its original + * scheduling class after this. If it has been throttled, we need to + * clear the flag, otherwise the task may wake up as throttled after + * being boosted again with no means to replenish the runtime and clear + * the throttle. */ + p->dl.dl_throttled = 0; BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); return; } -- cgit v1.2.3 From 2586af1ac187f6b3a50930a4e33497074e81762d Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Mon, 21 Sep 2020 16:39:49 +0200 Subject: sched/rt: Disable RT_RUNTIME_SHARE by default The RT_RUNTIME_SHARE sched feature enables the sharing of rt_runtime between CPUs, allowing a CPU to run a real-time task up to 100% of the time while leaving more space for non-real-time tasks to run on the CPU that lend rt_runtime. The problem is that a CPU can easily borrow enough rt_runtime to allow a spinning rt-task to run forever, starving per-cpu tasks like kworkers, which are non-real-time by design. This patch disables RT_RUNTIME_SHARE by default, avoiding this problem. The feature will still be present for users that want to enable it, though. Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Tested-by: Wei Wang Link: https://lkml.kernel.org/r/b776ab46817e3db5d8ef79175fa0d71073c051c7.1600697903.git.bristot@redhat.com --- kernel/sched/features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 7481cd96f391..68d369cba9e4 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -77,7 +77,7 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false) SCHED_FEAT(RT_PUSH_IPI, true) #endif -SCHED_FEAT(RT_RUNTIME_SHARE, true) +SCHED_FEAT(RT_RUNTIME_SHARE, false) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) -- cgit v1.2.3 From 51bd5121c4eb25b911f6bc1ab4de5fe865fe0dcb Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 22 Sep 2020 21:24:10 +0800 Subject: sched: Remove unused inline function uclamp_bucket_base_value() There is no caller in tree, so can remove it. Signed-off-by: YueHaibing Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/20200922132410.48440-1-yuehaibing@huawei.com --- kernel/sched/core.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c36dc1ae58be..dd32d859ab40 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -940,11 +940,6 @@ static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) return clamp_value / UCLAMP_BUCKET_DELTA; } -static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) -{ - return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); -} - static inline unsigned int uclamp_none(enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) -- cgit v1.2.3 From df3cb4ea1fb63ff326488efd671ba3c39034255e Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Thu, 24 Sep 2020 14:48:47 +0800 Subject: sched/fair: Fix wrong cpu selecting from isolated domain We've met problems that occasionally tasks with full cpumask (e.g. by putting it into a cpuset or setting to full affinity) were migrated to our isolated cpus in production environment. After some analysis, we found that it is due to the current select_idle_smt() not considering the sched_domain mask. Steps to reproduce on my 31-CPU hyperthreads machine: 1. with boot parameter: "isolcpus=domain,2-31" (thread lists: 0,16 and 1,17) 2. cgcreate -g cpu:test; cgexec -g cpu:test "test_threads" 3. some threads will be migrated to the isolated cpu16~17. Fix it by checking the valid domain mask in select_idle_smt(). Fixes: 10e2f1acd010 ("sched/core: Rewrite and improve select_idle_siblings()) Reported-by: Wetp Zhang Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiang Biao Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/1600930127-76857-1-git-send-email-xlpang@linux.alibaba.com --- kernel/sched/fair.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a15deb210a17..9613e5d39d8a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6080,7 +6080,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int /* * Scan the local SMT mask for idle CPUs. */ -static int select_idle_smt(struct task_struct *p, int target) +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) { int cpu; @@ -6088,7 +6088,8 @@ static int select_idle_smt(struct task_struct *p, int target) return -1; for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + if (!cpumask_test_cpu(cpu, p->cpus_ptr) || + !cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; @@ -6104,7 +6105,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s return -1; } -static inline int select_idle_smt(struct task_struct *p, int target) +static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) { return -1; } @@ -6279,7 +6280,7 @@ symmetric: if ((unsigned)i < nr_cpumask_bits) return i; - i = select_idle_smt(p, target); + i = select_idle_smt(p, sd, target); if ((unsigned)i < nr_cpumask_bits) return i; -- cgit v1.2.3 From fe7491580d7c56152ea8d9d3124201191617435d Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Thu, 24 Sep 2020 09:47:55 +0800 Subject: sched/fair: Remove the force parameter of update_tg_load_avg() In the file fair.c, sometims update_tg_load_avg(cfs_rq, 0) is used, sometimes update_tg_load_avg(cfs_rq, false) is used. update_tg_load_avg() has the parameter force, but in current code, it never set 1 or true to it, so remove the force parameter. Signed-off-by: Xianting Tian Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200924014755.36253-1-tian.xianting@h3c.com --- kernel/sched/fair.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9613e5d39d8a..b56276ab2525 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -831,7 +831,7 @@ void init_entity_runnable_average(struct sched_entity *se) void post_init_entity_util_avg(struct task_struct *p) { } -static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +static void update_tg_load_avg(struct cfs_rq *cfs_rq) { } #endif /* CONFIG_SMP */ @@ -3293,7 +3293,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) /** * update_tg_load_avg - update the tg's load avg * @cfs_rq: the cfs_rq whose avg changed - * @force: update regardless of how small the difference * * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. * However, because tg->load_avg is a global value there are performance @@ -3305,7 +3304,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) * * Updating tg's load_avg is necessary before update_cfs_share(). */ -static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) { long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; @@ -3315,7 +3314,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) if (cfs_rq->tg == &root_task_group) return; - if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { + if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } @@ -3617,7 +3616,7 @@ static inline bool skip_blocked_update(struct sched_entity *se) #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} static inline int propagate_entity_load_avg(struct sched_entity *se) { @@ -3805,13 +3804,13 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * IOW we're enqueueing a task on a new CPU. */ attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, 0); + update_tg_load_avg(cfs_rq); } else if (decayed) { cfs_rq_util_change(cfs_rq, 0); if (flags & UPDATE_TG) - update_tg_load_avg(cfs_rq, 0); + update_tg_load_avg(cfs_rq); } } @@ -7898,7 +7897,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) struct sched_entity *se; if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { - update_tg_load_avg(cfs_rq, 0); + update_tg_load_avg(cfs_rq); if (cfs_rq == &rq->cfs) decayed = true; @@ -10797,7 +10796,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se) /* Catch up with the cfs_rq and remove our load when we leave */ update_load_avg(cfs_rq, se, 0); detach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq); propagate_entity_cfs_rq(se); } @@ -10816,7 +10815,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq); propagate_entity_cfs_rq(se); } -- cgit v1.2.3 From 5a7f555904671c0737819fe4d19bd6143de3f6c0 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 21 Sep 2020 09:24:21 +0200 Subject: sched/fair: Relax constraint on task's load during load balance Some UCs like 9 always running tasks on 8 CPUs can't be balanced and the load balancer currently migrates the waiting task between the CPUs in an almost random manner. The success of a rq pulling a task depends of the value of nr_balance_failed of its domains and its ability to be faster than others to detach it. This behavior results in an unfair distribution of the running time between tasks because some CPUs will run most of the time, if not always, the same task whereas others will share their time between several tasks. Instead of using nr_balance_failed as a boolean to relax the condition for detaching task, the LB will use nr_balanced_failed to relax the threshold between the tasks'load and the imbalance. This mecanism prevents the same rq or domain to always win the load balance fight. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Link: https://lkml.kernel.org/r/20200921072424.14813-2-vincent.guittot@linaro.org --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b56276ab2525..5e3add351b89 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7679,8 +7679,8 @@ static int detach_tasks(struct lb_env *env) * scheduler fails to find a good waiting task to * migrate. */ - if (load/2 > env->imbalance && - env->sd->nr_balance_failed <= env->sd->cache_nice_tries) + + if ((load >> env->sd->nr_balance_failed) > env->imbalance) goto next; env->imbalance -= load; -- cgit v1.2.3 From 2208cdaa56c957e20d8e16f28819aeb47851cb1e Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 21 Sep 2020 09:24:22 +0200 Subject: sched/fair: Reduce minimal imbalance threshold The 25% default imbalance threshold for DIE and NUMA domain is large enough to generate significant unfairness between threads. A typical example is the case of 11 threads running on 2x4 CPUs. The imbalance of 20% between the 2 groups of 4 cores is just low enough to not trigger the load balance between the 2 groups. We will have always the same 6 threads on one group of 4 CPUs and the other 5 threads on the other group of CPUS. With a fair time sharing in each group, we ends up with +20% running time for the group of 5 threads. Consider decreasing the imbalance threshold for overloaded case where we use the load to balance task and to ensure fair time sharing. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Acked-by: Hillf Danton Link: https://lkml.kernel.org/r/20200921072424.14813-3-vincent.guittot@linaro.org --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 249bec7b0a4c..41df62884cea 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1349,7 +1349,7 @@ sd_init(struct sched_domain_topology_level *tl, .min_interval = sd_weight, .max_interval = 2*sd_weight, .busy_factor = 32, - .imbalance_pct = 125, + .imbalance_pct = 117, .cache_nice_tries = 0, -- cgit v1.2.3 From e4d32e4d5444977d8dc25fa98b3ce0a65544db8c Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 21 Sep 2020 09:24:23 +0200 Subject: sched/fair: Minimize concurrent LBs between domain level sched domains tend to trigger simultaneously the load balance loop but the larger domains often need more time to collect statistics. This slowness makes the larger domain trying to detach tasks from a rq whereas tasks already migrated somewhere else at a sub-domain level. This is not a real problem for idle LB because the period of smaller domains will increase with its CPUs being busy and this will let time for higher ones to pulled tasks. But this becomes a problem when all CPUs are already busy because all domains stay synced when they trigger their LB. A simple way to minimize simultaneous LB of all domains is to decrement the the busy interval by 1 jiffies. Because of the busy_factor, the interval of larger domain will not be a multiple of smaller ones anymore. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Link: https://lkml.kernel.org/r/20200921072424.14813-4-vincent.guittot@linaro.org --- kernel/sched/fair.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5e3add351b89..24a5ee63718f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9790,6 +9790,15 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) /* scale ms to jiffies */ interval = msecs_to_jiffies(interval); + + /* + * Reduce likelihood of busy balancing at higher domains racing with + * balancing at lower domains by preventing their balancing periods + * from being multiples of each other. + */ + if (cpu_busy) + interval -= 1; + interval = clamp(interval, 1UL, max_load_balance_interval); return interval; -- cgit v1.2.3 From 6e7499135db724539ca887b3aa64122502875c71 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 21 Sep 2020 09:24:24 +0200 Subject: sched/fair: Reduce busy load balance interval The busy_factor, which increases load balance interval when a cpu is busy, is set to 32 by default. This value generates some huge LB interval on large system like the THX2 made of 2 node x 28 cores x 4 threads. For such system, the interval increases from 112ms to 3584ms at MC level. And from 228ms to 7168ms at NUMA level. Even on smaller system, a lower busy factor has shown improvement on the fair distribution of the running time so let reduce it for all. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Link: https://lkml.kernel.org/r/20200921072424.14813-5-vincent.guittot@linaro.org --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 41df62884cea..a3a2417fec54 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1348,7 +1348,7 @@ sd_init(struct sched_domain_topology_level *tl, *sd = (struct sched_domain){ .min_interval = sd_weight, .max_interval = 2*sd_weight, - .busy_factor = 32, + .busy_factor = 16, .imbalance_pct = 117, .cache_nice_tries = 0, -- cgit v1.2.3 From 233e7aca4c8a2c764f556bba9644c36154017e7f Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 21 Sep 2020 23:18:49 +0100 Subject: sched/fair: Use dst group while checking imbalance for NUMA balancer Barry Song noted the following Something is wrong. In find_busiest_group(), we are checking if src has higher load, however, in task_numa_find_cpu(), we are checking if dst will have higher load after balancing. It seems it is not sensible to check src. It maybe cause wrong imbalance value, for example, if dst_running = env->dst_stats.nr_running + 1 results in 3 or above, and src_running = env->src_stats.nr_running - 1 results in 1; The current code is thinking imbalance as 0 since src_running is smaller than 2. This is inconsistent with load balancer. Basically, in find_busiest_group(), the NUMA imbalance is ignored if moving a task "from an almost idle domain" to a "domain with spare capacity". This patch forbids movement "from a misplaced domain" to "an almost idle domain" as that is closer to what the CPU load balancer expects. This patch is not a universal win. The old behaviour was intended to allow a task from an almost idle NUMA node to migrate to its preferred node if the destination had capacity but there are corner cases. For example, a NAS compute load could be parallelised to use 1/3rd of available CPUs but not all those potential tasks are active at all times allowing this logic to trigger. An obvious example is specjbb 2005 running various numbers of warehouses on a 2 socket box with 80 cpus. specjbb 5.9.0-rc4 5.9.0-rc4 vanilla dstbalance-v1r1 Hmean tput-1 46425.00 ( 0.00%) 43394.00 * -6.53%* Hmean tput-2 98416.00 ( 0.00%) 96031.00 * -2.42%* Hmean tput-3 150184.00 ( 0.00%) 148783.00 * -0.93%* Hmean tput-4 200683.00 ( 0.00%) 197906.00 * -1.38%* Hmean tput-5 236305.00 ( 0.00%) 245549.00 * 3.91%* Hmean tput-6 281559.00 ( 0.00%) 285692.00 * 1.47%* Hmean tput-7 338558.00 ( 0.00%) 334467.00 * -1.21%* Hmean tput-8 340745.00 ( 0.00%) 372501.00 * 9.32%* Hmean tput-9 424343.00 ( 0.00%) 413006.00 * -2.67%* Hmean tput-10 421854.00 ( 0.00%) 434261.00 * 2.94%* Hmean tput-11 493256.00 ( 0.00%) 485330.00 * -1.61%* Hmean tput-12 549573.00 ( 0.00%) 529959.00 * -3.57%* Hmean tput-13 593183.00 ( 0.00%) 555010.00 * -6.44%* Hmean tput-14 588252.00 ( 0.00%) 599166.00 * 1.86%* Hmean tput-15 623065.00 ( 0.00%) 642713.00 * 3.15%* Hmean tput-16 703924.00 ( 0.00%) 660758.00 * -6.13%* Hmean tput-17 666023.00 ( 0.00%) 697675.00 * 4.75%* Hmean tput-18 761502.00 ( 0.00%) 758360.00 * -0.41%* Hmean tput-19 796088.00 ( 0.00%) 798368.00 * 0.29%* Hmean tput-20 733564.00 ( 0.00%) 823086.00 * 12.20%* Hmean tput-21 840980.00 ( 0.00%) 856711.00 * 1.87%* Hmean tput-22 804285.00 ( 0.00%) 872238.00 * 8.45%* Hmean tput-23 795208.00 ( 0.00%) 889374.00 * 11.84%* Hmean tput-24 848619.00 ( 0.00%) 966783.00 * 13.92%* Hmean tput-25 750848.00 ( 0.00%) 903790.00 * 20.37%* Hmean tput-26 780523.00 ( 0.00%) 962254.00 * 23.28%* Hmean tput-27 1042245.00 ( 0.00%) 991544.00 * -4.86%* Hmean tput-28 1090580.00 ( 0.00%) 1035926.00 * -5.01%* Hmean tput-29 999483.00 ( 0.00%) 1082948.00 * 8.35%* Hmean tput-30 1098663.00 ( 0.00%) 1113427.00 * 1.34%* Hmean tput-31 1125671.00 ( 0.00%) 1134175.00 * 0.76%* Hmean tput-32 968167.00 ( 0.00%) 1250286.00 * 29.14%* Hmean tput-33 1077676.00 ( 0.00%) 1060893.00 * -1.56%* Hmean tput-34 1090538.00 ( 0.00%) 1090933.00 * 0.04%* Hmean tput-35 967058.00 ( 0.00%) 1107421.00 * 14.51%* Hmean tput-36 1051745.00 ( 0.00%) 1210663.00 * 15.11%* Hmean tput-37 1019465.00 ( 0.00%) 1351446.00 * 32.56%* Hmean tput-38 1083102.00 ( 0.00%) 1064541.00 * -1.71%* Hmean tput-39 1232990.00 ( 0.00%) 1303623.00 * 5.73%* Hmean tput-40 1175542.00 ( 0.00%) 1340943.00 * 14.07%* Hmean tput-41 1127826.00 ( 0.00%) 1339492.00 * 18.77%* Hmean tput-42 1198313.00 ( 0.00%) 1411023.00 * 17.75%* Hmean tput-43 1163733.00 ( 0.00%) 1228253.00 * 5.54%* Hmean tput-44 1305562.00 ( 0.00%) 1357886.00 * 4.01%* Hmean tput-45 1326752.00 ( 0.00%) 1406061.00 * 5.98%* Hmean tput-46 1339424.00 ( 0.00%) 1418451.00 * 5.90%* Hmean tput-47 1415057.00 ( 0.00%) 1381570.00 * -2.37%* Hmean tput-48 1392003.00 ( 0.00%) 1421167.00 * 2.10%* Hmean tput-49 1408374.00 ( 0.00%) 1418659.00 * 0.73%* Hmean tput-50 1359822.00 ( 0.00%) 1391070.00 * 2.30%* Hmean tput-51 1414246.00 ( 0.00%) 1392679.00 * -1.52%* Hmean tput-52 1432352.00 ( 0.00%) 1354020.00 * -5.47%* Hmean tput-53 1387563.00 ( 0.00%) 1409563.00 * 1.59%* Hmean tput-54 1406420.00 ( 0.00%) 1388711.00 * -1.26%* Hmean tput-55 1438804.00 ( 0.00%) 1387472.00 * -3.57%* Hmean tput-56 1399465.00 ( 0.00%) 1400296.00 * 0.06%* Hmean tput-57 1428132.00 ( 0.00%) 1396399.00 * -2.22%* Hmean tput-58 1432385.00 ( 0.00%) 1386253.00 * -3.22%* Hmean tput-59 1421612.00 ( 0.00%) 1371416.00 * -3.53%* Hmean tput-60 1429423.00 ( 0.00%) 1389412.00 * -2.80%* Hmean tput-61 1396230.00 ( 0.00%) 1351122.00 * -3.23%* Hmean tput-62 1418396.00 ( 0.00%) 1383098.00 * -2.49%* Hmean tput-63 1409918.00 ( 0.00%) 1374662.00 * -2.50%* Hmean tput-64 1410236.00 ( 0.00%) 1376216.00 * -2.41%* Hmean tput-65 1396405.00 ( 0.00%) 1364418.00 * -2.29%* Hmean tput-66 1395975.00 ( 0.00%) 1357326.00 * -2.77%* Hmean tput-67 1392986.00 ( 0.00%) 1349642.00 * -3.11%* Hmean tput-68 1386541.00 ( 0.00%) 1343261.00 * -3.12%* Hmean tput-69 1374407.00 ( 0.00%) 1342588.00 * -2.32%* Hmean tput-70 1377513.00 ( 0.00%) 1334654.00 * -3.11%* Hmean tput-71 1369319.00 ( 0.00%) 1334952.00 * -2.51%* Hmean tput-72 1354635.00 ( 0.00%) 1329005.00 * -1.89%* Hmean tput-73 1350933.00 ( 0.00%) 1318942.00 * -2.37%* Hmean tput-74 1351714.00 ( 0.00%) 1316347.00 * -2.62%* Hmean tput-75 1352198.00 ( 0.00%) 1309974.00 * -3.12%* Hmean tput-76 1349490.00 ( 0.00%) 1286064.00 * -4.70%* Hmean tput-77 1336131.00 ( 0.00%) 1303684.00 * -2.43%* Hmean tput-78 1308896.00 ( 0.00%) 1271024.00 * -2.89%* Hmean tput-79 1326703.00 ( 0.00%) 1290862.00 * -2.70%* Hmean tput-80 1336199.00 ( 0.00%) 1291629.00 * -3.34%* The performance at the mid-point is better but not universally better. The patch is a mixed bag depending on the workload, machine and overall levels of utilisation. Sometimes it's better (sometimes much better), other times it is worse (sometimes much worse). Given that there isn't a universally good decision in this section and more people seem to prefer the patch then it may be best to keep the LB decisions consistent and revisit imbalance handling when the load balancer code changes settle down. Jirka Hladky added the following observation. Our results are mostly in line with what you see. We observe big gains (20-50%) when the system is loaded to 1/3 of the maximum capacity and mixed results at the full load - some workloads benefit from the patch at the full load, others not, but performance changes at the full load are mostly within the noise of results (+/-5%). Overall, we think this patch is helpful. [mgorman@techsingularity.net: Rewrote changelog] Fixes: fb86f5b211 ("sched/numa: Use similar logic to the load balancer for moving between domains with spare capacity") Signed-off-by: Barry Song Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200921221849.GI3179@techsingularity.net --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 24a5ee63718f..fc3410b8b990 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1550,7 +1550,7 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); static unsigned long cpu_util(int cpu); -static inline long adjust_numa_imbalance(int imbalance, int src_nr_running); +static inline long adjust_numa_imbalance(int imbalance, int nr_running); static inline enum numa_type numa_classify(unsigned int imbalance_pct, @@ -1930,7 +1930,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, src_running = env->src_stats.nr_running - 1; dst_running = env->dst_stats.nr_running + 1; imbalance = max(0, dst_running - src_running); - imbalance = adjust_numa_imbalance(imbalance, src_running); + imbalance = adjust_numa_imbalance(imbalance, dst_running); /* Use idle CPU if there is no imbalance */ if (!imbalance) { @@ -8967,7 +8967,7 @@ next_group: } } -static inline long adjust_numa_imbalance(int imbalance, int src_nr_running) +static inline long adjust_numa_imbalance(int imbalance, int nr_running) { unsigned int imbalance_min; @@ -8976,7 +8976,7 @@ static inline long adjust_numa_imbalance(int imbalance, int src_nr_running) * tasks that remain local when the source domain is almost idle. */ imbalance_min = 2; - if (src_nr_running <= imbalance_min) + if (nr_running <= imbalance_min) return 0; return imbalance; -- cgit v1.2.3 From 2a36ab717e8fe678d98f81c14a0b124712719840 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Wed, 23 Sep 2020 16:36:16 -0700 Subject: rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ This patchset is based on Google-internal RSEQ work done by Paul Turner and Andrew Hunter. When working with per-CPU RSEQ-based memory allocations, it is sometimes important to make sure that a global memory location is no longer accessed from RSEQ critical sections. For example, there can be two per-CPU lists, one is "active" and accessed per-CPU, while another one is inactive and worked on asynchronously "off CPU" (e.g. garbage collection is performed). Then at some point the two lists are swapped, and a fast RCU-like mechanism is required to make sure that the previously active list is no longer accessed. This patch introduces such a mechanism: in short, membarrier() syscall issues an IPI to a CPU, restarting a potentially active RSEQ critical section on the CPU. Signed-off-by: Peter Oskolkov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mathieu Desnoyers Link: https://lkml.kernel.org/r/20200923233618.2572849-1-posk@google.com --- kernel/sched/membarrier.c | 136 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 106 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 168479a7d61b..e23e74d52db5 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -18,6 +18,14 @@ #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 #endif +#ifdef CONFIG_RSEQ +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK \ + (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \ + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK) +#else +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK 0 +#endif + #define MEMBARRIER_CMD_BITMASK \ (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ @@ -30,6 +38,11 @@ static void ipi_mb(void *info) smp_mb(); /* IPIs should be serializing but paranoid. */ } +static void ipi_rseq(void *info) +{ + rseq_preempt(current); +} + static void ipi_sync_rq_state(void *info) { struct mm_struct *mm = (struct mm_struct *) info; @@ -129,19 +142,27 @@ static int membarrier_global_expedited(void) return 0; } -static int membarrier_private_expedited(int flags) +static int membarrier_private_expedited(int flags, int cpu_id) { - int cpu; cpumask_var_t tmpmask; struct mm_struct *mm = current->mm; + smp_call_func_t ipi_func = ipi_mb; - if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (flags == MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; + } else if (flags == MEMBARRIER_FLAG_RSEQ) { + if (!IS_ENABLED(CONFIG_RSEQ)) + return -EINVAL; + if (!(atomic_read(&mm->membarrier_state) & + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY)) + return -EPERM; + ipi_func = ipi_rseq; } else { + WARN_ON_ONCE(flags); if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) return -EPERM; @@ -156,35 +177,59 @@ static int membarrier_private_expedited(int flags) */ smp_mb(); /* system call entry is not a mb. */ - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) return -ENOMEM; cpus_read_lock(); - rcu_read_lock(); - for_each_online_cpu(cpu) { + + if (cpu_id >= 0) { struct task_struct *p; - /* - * Skipping the current CPU is OK even through we can be - * migrated at any point. The current CPU, at the point - * where we read raw_smp_processor_id(), is ensured to - * be in program order with respect to the caller - * thread. Therefore, we can skip this CPU from the - * iteration. - */ - if (cpu == raw_smp_processor_id()) - continue; - p = rcu_dereference(cpu_rq(cpu)->curr); - if (p && p->mm == mm) - __cpumask_set_cpu(cpu, tmpmask); + if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id)) + goto out; + if (cpu_id == raw_smp_processor_id()) + goto out; + rcu_read_lock(); + p = rcu_dereference(cpu_rq(cpu_id)->curr); + if (!p || p->mm != mm) { + rcu_read_unlock(); + goto out; + } + rcu_read_unlock(); + } else { + int cpu; + + rcu_read_lock(); + for_each_online_cpu(cpu) { + struct task_struct *p; + + /* + * Skipping the current CPU is OK even through we can be + * migrated at any point. The current CPU, at the point + * where we read raw_smp_processor_id(), is ensured to + * be in program order with respect to the caller + * thread. Therefore, we can skip this CPU from the + * iteration. + */ + if (cpu == raw_smp_processor_id()) + continue; + p = rcu_dereference(cpu_rq(cpu)->curr); + if (p && p->mm == mm) + __cpumask_set_cpu(cpu, tmpmask); + } + rcu_read_unlock(); } - rcu_read_unlock(); preempt_disable(); - smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + if (cpu_id >= 0) + smp_call_function_single(cpu_id, ipi_func, NULL, 1); + else + smp_call_function_many(tmpmask, ipi_func, NULL, 1); preempt_enable(); - free_cpumask_var(tmpmask); +out: + if (cpu_id < 0) + free_cpumask_var(tmpmask); cpus_read_unlock(); /* @@ -283,11 +328,18 @@ static int membarrier_register_private_expedited(int flags) set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED, ret; - if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (flags == MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; + } else if (flags == MEMBARRIER_FLAG_RSEQ) { + if (!IS_ENABLED(CONFIG_RSEQ)) + return -EINVAL; + ready_state = + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY; + } else { + WARN_ON_ONCE(flags); } /* @@ -299,6 +351,8 @@ static int membarrier_register_private_expedited(int flags) return 0; if (flags & MEMBARRIER_FLAG_SYNC_CORE) set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE; + if (flags & MEMBARRIER_FLAG_RSEQ) + set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ; atomic_or(set_state, &mm->membarrier_state); ret = sync_runqueues_membarrier_state(mm); if (ret) @@ -310,8 +364,15 @@ static int membarrier_register_private_expedited(int flags) /** * sys_membarrier - issue memory barriers on a set of threads - * @cmd: Takes command values defined in enum membarrier_cmd. - * @flags: Currently needs to be 0. For future extensions. + * @cmd: Takes command values defined in enum membarrier_cmd. + * @flags: Currently needs to be 0 for all commands other than + * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter + * case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id + * contains the CPU on which to interrupt (= restart) + * the RSEQ critical section. + * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which + * RSEQ CS should be interrupted (@cmd must be + * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ). * * If this system call is not implemented, -ENOSYS is returned. If the * command specified does not exist, not available on the running @@ -337,10 +398,21 @@ static int membarrier_register_private_expedited(int flags) * smp_mb() X O O * sys_membarrier() O O O */ -SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id) { - if (unlikely(flags)) - return -EINVAL; + switch (cmd) { + case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: + if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU)) + return -EINVAL; + break; + default: + if (unlikely(flags)) + return -EINVAL; + } + + if (!(flags & MEMBARRIER_CMD_FLAG_CPU)) + cpu_id = -1; + switch (cmd) { case MEMBARRIER_CMD_QUERY: { @@ -362,13 +434,17 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: return membarrier_register_global_expedited(); case MEMBARRIER_CMD_PRIVATE_EXPEDITED: - return membarrier_private_expedited(0); + return membarrier_private_expedited(0, cpu_id); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: return membarrier_register_private_expedited(0); case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE: - return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); + return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE: return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); + case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: + return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id); + case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ: + return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ); default: return -EINVAL; } -- cgit v1.2.3 From fa01b1e9733fd59ecb8b5b6d85dfb481d2025fbf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Sep 2020 07:40:57 +0200 Subject: block: add a bdev_is_partition helper Add a littler helper to make the somewhat arcane bd_contains checks a little more obvious. Signed-off-by: Christoph Hellwig Acked-by: Ulf Hansson Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c0887f32f647..ec874ea04092 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -527,7 +527,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, * and scsi-generic block devices we create a temporary new debugfs * directory that will be removed once the trace ends. */ - if (bdev && bdev == bdev->bd_contains) + if (bdev && !bdev_is_partition(bdev)) dir = q->debugfs_dir; else bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); -- cgit v1.2.3 From 720dee53ad8dfd528941fbbc264574601b04488a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 25 Sep 2020 01:40:08 +0900 Subject: tracing/boot: Initialize per-instance event list in early boot Initialize per-instance event list in early boot time (before initializing instance directory on tracefs). This fixes boot-time tracing to correctly handle the boot-time per-instance settings. Link: https://lkml.kernel.org/r/160096560826.182763.17110991546046128881.stgit@devnote2 Fixes: 4114fbfd02f1 ("tracing: Enable creating new instance early boot") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 3 ++- kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 30 ++++++++++++++++-------------- 3 files changed, 19 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6211a13b3327..3f2533adae72 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8700,7 +8700,8 @@ static struct trace_array *trace_array_create(const char *name) ret = trace_array_create_dir(tr); if (ret) goto out_free_tr; - } + } else + __trace_early_add_events(tr); list_add(&tr->list, &ftrace_trace_arrays); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 525434145eea..5b0e797cacdd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1658,6 +1658,7 @@ extern void trace_event_enable_tgid_record(bool enable); extern int event_trace_init(void); extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); +extern void __trace_early_add_events(struct trace_array *tr); extern struct trace_event_file *__find_event_file(struct trace_array *tr, const char *system, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 42c0e7df6e70..851ab37058dd 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3131,14 +3131,13 @@ static inline int register_event_cmds(void) { return 0; } #endif /* CONFIG_DYNAMIC_FTRACE */ /* - * The top level array has already had its trace_event_file - * descriptors created in order to allow for early events to - * be recorded. This function is called after the tracefs has been - * initialized, and we now have to create the files associated - * to the events. + * The top level array and trace arrays created by boot-time tracing + * have already had its trace_event_file descriptors created in order + * to allow for early events to be recorded. + * This function is called after the tracefs has been initialized, + * and we now have to create the files associated to the events. */ -static __init void -__trace_early_add_event_dirs(struct trace_array *tr) +static void __trace_early_add_event_dirs(struct trace_array *tr) { struct trace_event_file *file; int ret; @@ -3153,13 +3152,12 @@ __trace_early_add_event_dirs(struct trace_array *tr) } /* - * For early boot up, the top trace array requires to have - * a list of events that can be enabled. This must be done before - * the filesystem is set up in order to allow events to be traced - * early. + * For early boot up, the top trace array and the trace arrays created + * by boot-time tracing require to have a list of events that can be + * enabled. This must be done before the filesystem is set up in order + * to allow events to be traced early. */ -static __init void -__trace_early_add_events(struct trace_array *tr) +void __trace_early_add_events(struct trace_array *tr) { struct trace_event_call *call; int ret; @@ -3290,7 +3288,11 @@ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) goto out; down_write(&trace_event_sem); - __trace_add_event_dirs(tr); + /* If tr already has the event list, it is initialized in early boot. */ + if (unlikely(!list_empty(&tr->events))) + __trace_early_add_event_dirs(tr); + else + __trace_add_event_dirs(tr); up_write(&trace_event_sem); out: -- cgit v1.2.3 From a968d5e277f1a640a3184561ac2c39261ba79196 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Sep 2020 17:03:44 -0700 Subject: bpf: Move the PTR_TO_BTF_ID check to check_reg_type() check_reg_type() checks whether a reg can be used as an arg of a func_proto. For PTR_TO_BTF_ID, the check is actually not completely done until the reg->btf_id is pointing to a kernel struct that is acceptable by the func_proto. Thus, this patch moves the btf_id check into check_reg_type(). "arg_type" and "arg_btf_id" are passed to check_reg_type() instead of "compatible". The compatible_reg_types[] usage is localized in check_reg_type() now. The "if (!btf_id) verbose(...); " is also removed since it won't happen. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Lorenz Bauer Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200925000344.3854828-1-kafai@fb.com --- kernel/bpf/verifier.c | 60 ++++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 42dee5dcbc74..945fa2b4d096 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4028,19 +4028,27 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, - const struct bpf_reg_types *compatible) + enum bpf_arg_type arg_type, + const u32 *arg_btf_id) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_reg_type expected, type = reg->type; + const struct bpf_reg_types *compatible; int i, j; + compatible = compatible_reg_types[arg_type]; + if (!compatible) { + verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); + return -EFAULT; + } + for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { expected = compatible->types[i]; if (expected == NOT_INIT) break; if (type == expected) - return 0; + goto found; } verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]); @@ -4048,6 +4056,25 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, verbose(env, "%s, ", reg_type_str[compatible->types[j]]); verbose(env, "%s\n", reg_type_str[compatible->types[j]]); return -EACCES; + +found: + if (type == PTR_TO_BTF_ID) { + if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id, + *arg_btf_id)) { + verbose(env, "R%d is of type %s but %s is expected\n", + regno, kernel_type_name(reg->btf_id), + kernel_type_name(*arg_btf_id)); + return -EACCES; + } + + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", + regno); + return -EACCES; + } + } + + return 0; } static int check_func_arg(struct bpf_verifier_env *env, u32 arg, @@ -4057,7 +4084,6 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, u32 regno = BPF_REG_1 + arg; struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_arg_type arg_type = fn->arg_type[arg]; - const struct bpf_reg_types *compatible; enum bpf_reg_type type = reg->type; int err = 0; @@ -4097,35 +4123,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, */ goto skip_type_check; - compatible = compatible_reg_types[arg_type]; - if (!compatible) { - verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); - return -EFAULT; - } - - err = check_reg_type(env, regno, compatible); + err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg]); if (err) return err; - if (type == PTR_TO_BTF_ID) { - const u32 *btf_id = fn->arg_btf_id[arg]; - - if (!btf_id) { - verbose(env, "verifier internal error: missing BTF ID\n"); - return -EFAULT; - } - - if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id, *btf_id)) { - verbose(env, "R%d is of type %s but %s is expected\n", - regno, kernel_type_name(reg->btf_id), kernel_type_name(*btf_id)); - return -EACCES; - } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", - regno); - return -EACCES; - } - } else if (type == PTR_TO_CTX) { + if (type == PTR_TO_CTX) { err = check_ctx_reg(env, reg, regno); if (err < 0) return err; -- cgit v1.2.3 From 1df8f55a37bd286a3d40192980050bc3d7d78887 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Sep 2020 17:03:50 -0700 Subject: bpf: Enable bpf_skc_to_* sock casting helper to networking prog type There is a constant need to add more fields into the bpf_tcp_sock for the bpf programs running at tc, sock_ops...etc. A current workaround could be to use bpf_probe_read_kernel(). However, other than making another helper call for reading each field and missing CO-RE, it is also not as intuitive to use as directly reading "tp->lsndtime" for example. While already having perfmon cap to do bpf_probe_read_kernel(), it will be much easier if the bpf prog can directly read from the tcp_sock. This patch tries to do that by using the existing casting-helpers bpf_skc_to_*() whose func_proto returns a btf_id. For example, the func_proto of bpf_skc_to_tcp_sock returns the btf_id of the kernel "struct tcp_sock". These helpers are also added to is_ptr_cast_function(). It ensures the returning reg (BPF_REF_0) will also carries the ref_obj_id. That will keep the ref-tracking works properly. The bpf_skc_to_* helpers are made available to most of the bpf prog types in filter.c. The bpf_skc_to_* helpers will be limited by perfmon cap. This patch adds a ARG_PTR_TO_BTF_ID_SOCK_COMMON. The helper accepting this arg can accept a btf-id-ptr (PTR_TO_BTF_ID + &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]) or a legacy-ctx-convert-skc-ptr (PTR_TO_SOCK_COMMON). The bpf_skc_to_*() helpers are changed to take ARG_PTR_TO_BTF_ID_SOCK_COMMON such that they will accept pointer obtained from skb->sk. Instead of specifying both arg_type and arg_btf_id in the same func_proto which is how the current ARG_PTR_TO_BTF_ID does, the arg_btf_id of the new ARG_PTR_TO_BTF_ID_SOCK_COMMON is specified in the compatible_reg_types[] in verifier.c. The reason is the arg_btf_id is always the same. Discussion in this thread: https://lore.kernel.org/bpf/20200922070422.1917351-1-kafai@fb.com/ The ARG_PTR_TO_BTF_ID_ part gives a clear expectation that the helper is expecting a PTR_TO_BTF_ID which could be NULL. This is the same behavior as the existing helper taking ARG_PTR_TO_BTF_ID. The _SOCK_COMMON part means the helper is also expecting the legacy SOCK_COMMON pointer. By excluding the _OR_NULL part, the bpf prog cannot call helper with a literal NULL which doesn't make sense in most cases. e.g. bpf_skc_to_tcp_sock(NULL) will be rejected. All PTR_TO_*_OR_NULL reg has to do a NULL check first before passing into the helper or else the bpf prog will be rejected. This behavior is nothing new and consistent with the current expectation during bpf-prog-load. [ ARG_PTR_TO_BTF_ID_SOCK_COMMON will be used to replace ARG_PTR_TO_SOCK* of other existing helpers later such that those existing helpers can take the PTR_TO_BTF_ID returned by the bpf_skc_to_*() helpers. The only special case is bpf_sk_lookup_assign() which can accept a literal NULL ptr. It has to be handled specially in another follow up patch if there is a need (e.g. by renaming ARG_PTR_TO_SOCKET_OR_NULL to ARG_PTR_TO_BTF_ID_SOCK_COMMON_OR_NULL). ] [ When converting the older helpers that take ARG_PTR_TO_SOCK* in the later patch, if the kernel does not support BTF, ARG_PTR_TO_BTF_ID_SOCK_COMMON will behave like ARG_PTR_TO_SOCK_COMMON because no reg->type could have PTR_TO_BTF_ID in this case. It is not a concern for the newer-btf-only helper like the bpf_skc_to_*() here though because these helpers must require BTF vmlinux to begin with. ] Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200925000350.3855720-1-kafai@fb.com --- kernel/bpf/verifier.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 945fa2b4d096..d4ba29fb17a6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -486,7 +486,12 @@ static bool is_acquire_function(enum bpf_func_id func_id, static bool is_ptr_cast_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_tcp_sock || - func_id == BPF_FUNC_sk_fullsock; + func_id == BPF_FUNC_sk_fullsock || + func_id == BPF_FUNC_skc_to_tcp_sock || + func_id == BPF_FUNC_skc_to_tcp6_sock || + func_id == BPF_FUNC_skc_to_udp6_sock || + func_id == BPF_FUNC_skc_to_tcp_timewait_sock || + func_id == BPF_FUNC_skc_to_tcp_request_sock; } /* string representation of 'enum bpf_reg_type' */ @@ -3953,6 +3958,7 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env, struct bpf_reg_types { const enum bpf_reg_type types[10]; + u32 *btf_id; }; static const struct bpf_reg_types map_key_value_types = { @@ -3973,6 +3979,17 @@ static const struct bpf_reg_types sock_types = { }, }; +static const struct bpf_reg_types btf_id_sock_common_types = { + .types = { + PTR_TO_SOCK_COMMON, + PTR_TO_SOCKET, + PTR_TO_TCP_SOCK, + PTR_TO_XDP_SOCK, + PTR_TO_BTF_ID, + }, + .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], +}; + static const struct bpf_reg_types mem_types = { .types = { PTR_TO_STACK, @@ -4014,6 +4031,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_CTX] = &context_types, [ARG_PTR_TO_CTX_OR_NULL] = &context_types, [ARG_PTR_TO_SOCK_COMMON] = &sock_types, + [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types, [ARG_PTR_TO_SOCKET] = &fullsock_types, [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types, [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, @@ -4059,6 +4077,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, found: if (type == PTR_TO_BTF_ID) { + if (!arg_btf_id) { + if (!compatible->btf_id) { + verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); + return -EFAULT; + } + arg_btf_id = compatible->btf_id; + } + if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id, *arg_btf_id)) { verbose(env, "R%d is of type %s but %s is expected\n", @@ -4575,10 +4601,14 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) { int i; - for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) + for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) return false; + if (fn->arg_type[i] != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) + return false; + } + return true; } -- cgit v1.2.3 From 592a3498648af000e93dff2d36229ab11cd8c7f6 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Sep 2020 17:04:02 -0700 Subject: bpf: Change bpf_sk_storage_*() to accept ARG_PTR_TO_BTF_ID_SOCK_COMMON This patch changes the bpf_sk_storage_*() to take ARG_PTR_TO_BTF_ID_SOCK_COMMON such that they will work with the pointer returned by the bpf_skc_to_*() helpers also. A micro benchmark has been done on a "cgroup_skb/egress" bpf program which does a bpf_sk_storage_get(). It was driven by netperf doing a 4096 connected UDP_STREAM test with 64bytes packet. The stats from "kernel.bpf_stats_enabled" shows no meaningful difference. The sk_storage_get_btf_proto, sk_storage_delete_btf_proto, btf_sk_storage_get_proto, and btf_sk_storage_delete_proto are no longer needed, so they are removed. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20200925000402.3856307-1-kafai@fb.com --- kernel/bpf/bpf_lsm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 9cd1428c7199..78ea8a7bd27f 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -56,9 +56,9 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_inode_storage_delete: return &bpf_inode_storage_delete_proto; case BPF_FUNC_sk_storage_get: - return &sk_storage_get_btf_proto; + return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: - return &sk_storage_delete_btf_proto; + return &bpf_sk_storage_delete_proto; default: return tracing_prog_func_proto(func_id, prog); } -- cgit v1.2.3 From 4fbb38a3b2cc7bf4340fe9bd9c73622760adc19d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 24 Sep 2020 11:45:06 -0700 Subject: bpf, verifier: Remove redundant var_off.value ops in scalar known reg cases In BPF_AND and BPF_OR alu cases we have this pattern when the src and dst tnum is a constant. 1 dst_reg->var_off = tnum_[op](dst_reg->var_off, src_reg.var_off) 2 scalar32_min_max_[op] 3 if (known) return 4 scalar_min_max_[op] 5 if (known) 6 __mark_reg_known(dst_reg, dst_reg->var_off.value [op] src_reg.var_off.value) The result is in 1 we calculate the var_off value and store it in the dst_reg. Then in 6 we duplicate this logic doing the op again on the value. The duplication comes from the the tnum_[op] handlers because they have already done the value calcuation. For example this is tnum_and(). struct tnum tnum_and(struct tnum a, struct tnum b) { u64 alpha, beta, v; alpha = a.value | a.mask; beta = b.value | b.mask; v = a.value & b.value; return TNUM(v, alpha & beta & ~v); } So lets remove the redundant op calculation. Its confusing for readers and unnecessary. Its also not harmful because those ops have the property, r1 & r1 = r1 and r1 | r1 = r1. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d4ba29fb17a6..b25ba989c2dc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5849,8 +5849,7 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, u64 umax_val = src_reg->umax_value; if (src_known && dst_known) { - __mark_reg_known(dst_reg, dst_reg->var_off.value & - src_reg->var_off.value); + __mark_reg_known(dst_reg, dst_reg->var_off.value); return; } @@ -5920,8 +5919,7 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, u64 umin_val = src_reg->umin_value; if (src_known && dst_known) { - __mark_reg_known(dst_reg, dst_reg->var_off.value | - src_reg->var_off.value); + __mark_reg_known(dst_reg, dst_reg->var_off.value); return; } -- cgit v1.2.3 From 008cfe4418b3dbda2ff820cdd7b1a5ce458ae444 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 25 Sep 2020 18:25:57 -0400 Subject: mm: Introduce mm_struct.has_pinned (Commit message majorly collected from Jason Gunthorpe) Reduce the chance of false positive from page_maybe_dma_pinned() by keeping track if the mm_struct has ever been used with pin_user_pages(). This allows cases that might drive up the page ref_count to avoid any penalty from handling dma_pinned pages. Future work is planned, to provide a more sophisticated solution, likely to turn it into a real counter. For now, make it atomic_t but use it as a boolean for simplicity. Suggested-by: Jason Gunthorpe Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 49677d668de4..e65d8192d080 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1011,6 +1011,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; + atomic_set(&mm->has_pinned, 0); atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); -- cgit v1.2.3 From 7a4830c380f3a8b3425f6383deff58e65b2557b5 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 25 Sep 2020 18:25:58 -0400 Subject: mm/fork: Pass new vma pointer into copy_page_range() This prepares for the future work to trigger early cow on pinned pages during fork(). No functional change intended. Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e65d8192d080..da8d360fb032 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -589,7 +589,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt); + retval = copy_page_range(mm, oldmm, mpnt, tmp); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); -- cgit v1.2.3 From f2d10ff4a903813df767a4b56b651a26b938df06 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Sun, 27 Sep 2020 22:15:29 +0100 Subject: kgdb: Honour the kprobe blocklist when setting breakpoints Currently kgdb has absolutely no safety rails in place to discourage or prevent a user from placing a breakpoint in dangerous places such as the debugger's own trap entry/exit and other places where it is not safe to take synchronous traps. Introduce a new config symbol KGDB_HONOUR_BLOCKLIST and modify the default implementation of kgdb_validate_break_address() so that we use the kprobe blocklist to prohibit instrumentation of critical functions if the config symbol is set. The config symbol dependencies are set to ensure that the blocklist will be enabled by default if we enable KGDB and are compiling for an architecture where we HAVE_KPROBES. Suggested-by: Peter Zijlstra Reviewed-by: Douglas Anderson Reviewed-by: Masami Hiramatsu Link: https://lore.kernel.org/r/20200927211531.1380577-2-daniel.thompson@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 4 ++++ kernel/debug/kdb/kdb_bp.c | 9 +++++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 165e5b0c2083..6b9383fa8278 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -180,6 +180,10 @@ int __weak kgdb_validate_break_address(unsigned long addr) { struct kgdb_bkpt tmp; int err; + + if (kgdb_within_blocklist(addr)) + return -EINVAL; + /* Validate setting the breakpoint and then removing it. If the * remove fails, the kernel needs to emit a bad message because we * are deep trouble not being able to put things back the way we diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index d7ebb2c79cb8..ec4940146612 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -306,6 +306,15 @@ static int kdb_bp(int argc, const char **argv) if (!template.bp_addr) return KDB_BADINT; + /* + * This check is redundant (since the breakpoint machinery should + * be doing the same check during kdb_bp_install) but gives the + * user immediate feedback. + */ + diag = kgdb_validate_break_address(template.bp_addr); + if (diag) + return diag; + /* * Find an empty bp structure to allocate */ -- cgit v1.2.3 From 55c4478a8f0ecedc0c1a0c9379380249985c372a Mon Sep 17 00:00:00 2001 From: Xiaoyi Chen Date: Tue, 22 Sep 2020 16:19:19 +0000 Subject: PM: hibernate: Batch hibernate and resume IO requests Hibernate and resume process submits individual IO requests for each page of the data, so use blk_plug to improve the batching of these requests. Testing this change with hibernate and resumes consistently shows merging of the IO requests and more than an order of magnitude improvement in hibernate and resume speed is observed. One hibernate and resume cycle for 16GB RAM out of 32GB in use takes around 21 minutes before the change, and 1 minutes after the change on a system with limited storage IOPS. Signed-off-by: Xiaoyi Chen Co-Developed-by: Anchal Agarwal Signed-off-by: Anchal Agarwal [ rjw: Subject and changelog edits, white space damage fixes ] Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 01e2858b5fe3..116320a0394d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -226,6 +226,7 @@ struct hib_bio_batch { atomic_t count; wait_queue_head_t wait; blk_status_t error; + struct blk_plug plug; }; static void hib_init_batch(struct hib_bio_batch *hb) @@ -233,6 +234,12 @@ static void hib_init_batch(struct hib_bio_batch *hb) atomic_set(&hb->count, 0); init_waitqueue_head(&hb->wait); hb->error = BLK_STS_OK; + blk_start_plug(&hb->plug); +} + +static void hib_finish_batch(struct hib_bio_batch *hb) +{ + blk_finish_plug(&hb->plug); } static void hib_end_io(struct bio *bio) @@ -294,6 +301,10 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, static blk_status_t hib_wait_io(struct hib_bio_batch *hb) { + /* + * We are relying on the behavior of blk_plug that a thread with + * a plug will flush the plug list before sleeping. + */ wait_event(hb->wait, atomic_read(&hb->count) == 0); return blk_status_to_errno(hb->error); } @@ -561,6 +572,7 @@ static int save_image(struct swap_map_handle *handle, nr_pages++; } err2 = hib_wait_io(&hb); + hib_finish_batch(&hb); stop = ktime_get(); if (!ret) ret = err2; @@ -854,6 +866,7 @@ out_finish: pr_info("Image saving done\n"); swsusp_show_speed(start, stop, nr_to_write, "Wrote"); out_clean: + hib_finish_batch(&hb); if (crc) { if (crc->thr) kthread_stop(crc->thr); @@ -1084,6 +1097,7 @@ static int load_image(struct swap_map_handle *handle, nr_pages++; } err2 = hib_wait_io(&hb); + hib_finish_batch(&hb); stop = ktime_get(); if (!ret) ret = err2; @@ -1447,6 +1461,7 @@ out_finish: } swsusp_show_speed(start, stop, nr_to_read, "Read"); out_clean: + hib_finish_batch(&hb); for (i = 0; i < ring_size; i++) free_page((unsigned long)page[i]); if (crc) { -- cgit v1.2.3 From 1b4d60ec162f82ea29a2e7a907b5c6cc9f926321 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 25 Sep 2020 13:54:29 -0700 Subject: bpf: Enable BPF_PROG_TEST_RUN for raw_tracepoint Add .test_run for raw_tracepoint. Also, introduce a new feature that runs the target program on a specific CPU. This is achieved by a new flag in bpf_attr.test, BPF_F_TEST_RUN_ON_CPU. When this flag is set, the program is triggered on cpu with id bpf_attr.test.cpu. This feature is needed for BPF programs that handle perf_event and other percpu resources, as the program can access these resource locally. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200925205432.1777-2-songliubraving@fb.com --- kernel/bpf/syscall.c | 2 +- kernel/trace/bpf_trace.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2740df19f55e..3bc2ed2e171b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2979,7 +2979,7 @@ static int bpf_prog_query(const union bpf_attr *attr, } } -#define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out +#define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 36508f46a8db..2834866d379a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1678,6 +1678,7 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { }; const struct bpf_prog_ops raw_tracepoint_prog_ops = { + .test_run = bpf_prog_test_run_raw_tp, }; const struct bpf_verifier_ops tracing_verifier_ops = { -- cgit v1.2.3 From efa90b50934c93647e41da23d87e5d8b670014d4 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 28 Sep 2020 13:24:57 +0200 Subject: bpf, cpumap: Remove rcpu pointer from cpu_map_build_skb signature Get rid of bpf_cpu_map_entry pointer in cpu_map_build_skb routine signature since it is no longer needed. Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/33cb9b7dc447de3ea6fd6ce713ac41bca8794423.1601292015.git.lorenzo@kernel.org --- kernel/bpf/cpumap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 7e1a8ad0c32a..c61a23b564aa 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -155,8 +155,7 @@ static void cpu_map_kthread_stop(struct work_struct *work) kthread_stop(rcpu->kthread); } -static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_frame *xdpf, +static struct sk_buff *cpu_map_build_skb(struct xdp_frame *xdpf, struct sk_buff *skb) { unsigned int hard_start_headroom; @@ -365,7 +364,7 @@ static int cpu_map_kthread_run(void *data) struct sk_buff *skb = skbs[i]; int ret; - skb = cpu_map_build_skb(rcpu, xdpf, skb); + skb = cpu_map_build_skb(xdpf, skb); if (!skb) { xdp_return_frame(xdpf); continue; -- cgit v1.2.3 From 6550f2dddfab02a5b948369eeeaedfbc4ae3cc16 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 28 Sep 2020 10:08:02 +0100 Subject: bpf: sockmap: Enable map_update_elem from bpf_iter Allow passing a pointer to a BTF struct sock_common* when updating a sockmap or sockhash. Since BTF pointers can fault and therefore be NULL at runtime we need to add an additional !sk check to sock_map_update_elem. Since we may be passed a request or timewait socket we also need to check sk_fullsock. Doing this allows calling map_update_elem on sockmap from bpf_iter context, which uses BTF pointers. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200928090805.23343-2-lmb@cloudflare.com --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b25ba989c2dc..cc9c90d74dc1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3943,7 +3943,7 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env, case BPF_MAP_TYPE_SOCKMAP: case BPF_MAP_TYPE_SOCKHASH: if (*arg_type == ARG_PTR_TO_MAP_VALUE) { - *arg_type = ARG_PTR_TO_SOCKET; + *arg_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON; } else { verbose(env, "invalid arg_type for sockmap/sockhash\n"); return -EINVAL; -- cgit v1.2.3 From 1af9270e908cd50a4f5d815c9b6f794c7d57ed07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 25 Sep 2020 23:25:00 +0200 Subject: bpf: disallow attaching modify_return tracing functions to other BPF programs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From the checks and commit messages for modify_return, it seems it was never the intention that it should be possible to attach a tracing program with expected_attach_type == BPF_MODIFY_RETURN to another BPF program. However, check_attach_modify_return() will only look at the function name, so if the target function starts with "security_", the attach will be allowed even for bpf2bpf attachment. Fix this oversight by also blocking the modification if a target program is supplied. Fixes: 18644cec714a ("bpf: Fix use-after-free in fmod_ret check") Fixes: 6ba43b761c41 ("bpf: Attachment verification for BPF_MODIFY_RETURN") Acked-by: Andrii Nakryiko Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cc9c90d74dc1..52649a5497ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11500,6 +11500,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) verbose(env, "%s is not sleepable\n", prog->aux->attach_func_name); } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) { + if (tgt_prog) { + verbose(env, "can't modify return codes of BPF programs\n"); + ret = -EINVAL; + goto out; + } ret = check_attach_modify_return(prog, addr); if (ret) verbose(env, "%s() is not modifiable\n", -- cgit v1.2.3 From efc68158c429f37d87fd02ee9a26913c78546fc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 25 Sep 2020 23:25:01 +0200 Subject: bpf: change logging calls from verbose() to bpf_log() and use log pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for moving code around, change a bunch of references to env->log (and the verbose() logging helper) to use bpf_log() and a direct pointer to struct bpf_verifier_log. While we're touching the function signature, mark the 'prog' argument to bpf_check_type_match() as const. Also enhance the bpf_verifier_log_needed() check to handle NULL pointers for the log struct so we can re-use the code with logging disabled. Acked-by: Andrii Nakryiko Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 6 +++--- kernel/bpf/verifier.c | 50 +++++++++++++++++++++++++------------------------- 2 files changed, 28 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 5d3c36e13139..868c03a24d0a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4388,7 +4388,7 @@ static int btf_check_func_type_match(struct bpf_verifier_log *log, } /* Compare BTFs of given program with BTF of target program */ -int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, +int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf2, const struct btf_type *t2) { struct btf *btf1 = prog->aux->btf; @@ -4396,7 +4396,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, u32 btf_id = 0; if (!prog->aux->func_info) { - bpf_log(&env->log, "Program extension requires BTF\n"); + bpf_log(log, "Program extension requires BTF\n"); return -EINVAL; } @@ -4408,7 +4408,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, if (!t1 || !btf_type_is_func(t1)) return -EFAULT; - return btf_check_func_type_match(&env->log, btf1, t1, btf2, t2); + return btf_check_func_type_match(log, btf1, t1, btf2, t2); } /* Compare BTF of a function with given bpf_reg_state. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 52649a5497ce..d93489c1ac65 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11249,6 +11249,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) struct bpf_prog *prog = env->prog; bool prog_extension = prog->type == BPF_PROG_TYPE_EXT; struct bpf_prog *tgt_prog = prog->aux->linked_prog; + struct bpf_verifier_log *log = &env->log; u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; struct btf_func_model fmodel; @@ -11276,23 +11277,23 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return 0; if (!btf_id) { - verbose(env, "Tracing programs must provide btf_id\n"); + bpf_log(log, "Tracing programs must provide btf_id\n"); return -EINVAL; } btf = bpf_prog_get_target_btf(prog); if (!btf) { - verbose(env, + bpf_log(log, "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n"); return -EINVAL; } t = btf_type_by_id(btf, btf_id); if (!t) { - verbose(env, "attach_btf_id %u is invalid\n", btf_id); + bpf_log(log, "attach_btf_id %u is invalid\n", btf_id); return -EINVAL; } tname = btf_name_by_offset(btf, t->name_off); if (!tname) { - verbose(env, "attach_btf_id %u doesn't have a name\n", btf_id); + bpf_log(log, "attach_btf_id %u doesn't have a name\n", btf_id); return -EINVAL; } if (tgt_prog) { @@ -11304,18 +11305,18 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) break; } if (subprog == -1) { - verbose(env, "Subprog %s doesn't exist\n", tname); + bpf_log(log, "Subprog %s doesn't exist\n", tname); return -EINVAL; } conservative = aux->func_info_aux[subprog].unreliable; if (prog_extension) { if (conservative) { - verbose(env, + bpf_log(log, "Cannot replace static functions\n"); return -EINVAL; } if (!prog->jit_requested) { - verbose(env, + bpf_log(log, "Extension programs should be JITed\n"); return -EINVAL; } @@ -11323,7 +11324,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->expected_attach_type = tgt_prog->expected_attach_type; } if (!tgt_prog->jited) { - verbose(env, "Can attach to only JITed progs\n"); + bpf_log(log, "Can attach to only JITed progs\n"); return -EINVAL; } if (tgt_prog->type == prog->type) { @@ -11331,7 +11332,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) * Cannot attach program extension to another extension. * It's ok to attach fentry/fexit to extension program. */ - verbose(env, "Cannot recursively attach\n"); + bpf_log(log, "Cannot recursively attach\n"); return -EINVAL; } if (tgt_prog->type == BPF_PROG_TYPE_TRACING && @@ -11353,13 +11354,13 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) * reasonable stack size. Hence extending fentry is not * allowed. */ - verbose(env, "Cannot extend fentry/fexit\n"); + bpf_log(log, "Cannot extend fentry/fexit\n"); return -EINVAL; } key = ((u64)aux->id) << 32 | btf_id; } else { if (prog_extension) { - verbose(env, "Cannot replace kernel functions\n"); + bpf_log(log, "Cannot replace kernel functions\n"); return -EINVAL; } key = btf_id; @@ -11368,17 +11369,17 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) switch (prog->expected_attach_type) { case BPF_TRACE_RAW_TP: if (tgt_prog) { - verbose(env, + bpf_log(log, "Only FENTRY/FEXIT progs are attachable to another BPF prog\n"); return -EINVAL; } if (!btf_type_is_typedef(t)) { - verbose(env, "attach_btf_id %u is not a typedef\n", + bpf_log(log, "attach_btf_id %u is not a typedef\n", btf_id); return -EINVAL; } if (strncmp(prefix, tname, sizeof(prefix) - 1)) { - verbose(env, "attach_btf_id %u points to wrong type name %s\n", + bpf_log(log, "attach_btf_id %u points to wrong type name %s\n", btf_id, tname); return -EINVAL; } @@ -11401,7 +11402,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return 0; case BPF_TRACE_ITER: if (!btf_type_is_func(t)) { - verbose(env, "attach_btf_id %u is not a function\n", + bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); return -EINVAL; } @@ -11412,8 +11413,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; if (!bpf_iter_prog_supported(prog)) return -EINVAL; - ret = btf_distill_func_proto(&env->log, btf, t, - tname, &fmodel); + ret = btf_distill_func_proto(log, btf, t, tname, &fmodel); return ret; default: if (!prog_extension) @@ -11425,18 +11425,18 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) case BPF_TRACE_FEXIT: prog->aux->attach_func_name = tname; if (prog->type == BPF_PROG_TYPE_LSM) { - ret = bpf_lsm_verify_prog(&env->log, prog); + ret = bpf_lsm_verify_prog(log, prog); if (ret < 0) return ret; } if (!btf_type_is_func(t)) { - verbose(env, "attach_btf_id %u is not a function\n", + bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); return -EINVAL; } if (prog_extension && - btf_check_type_match(env, prog, btf, t)) + btf_check_type_match(log, prog, btf, t)) return -EINVAL; t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) @@ -11455,7 +11455,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = NULL; t = NULL; } - ret = btf_distill_func_proto(&env->log, btf, t, + ret = btf_distill_func_proto(log, btf, t, tname, &tr->func.model); if (ret < 0) goto out; @@ -11467,7 +11467,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } else { addr = kallsyms_lookup_name(tname); if (!addr) { - verbose(env, + bpf_log(log, "The address of function %s cannot be found\n", tname); ret = -ENOENT; @@ -11497,17 +11497,17 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) break; } if (ret) - verbose(env, "%s is not sleepable\n", + bpf_log(log, "%s is not sleepable\n", prog->aux->attach_func_name); } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) { if (tgt_prog) { - verbose(env, "can't modify return codes of BPF programs\n"); + bpf_log(log, "can't modify return codes of BPF programs\n"); ret = -EINVAL; goto out; } ret = check_attach_modify_return(prog, addr); if (ret) - verbose(env, "%s() is not modifiable\n", + bpf_log(log, "%s() is not modifiable\n", prog->aux->attach_func_name); } if (ret) -- cgit v1.2.3 From f7b12b6fea00988496b7606d4964cd77beef46a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 25 Sep 2020 23:25:02 +0200 Subject: bpf: verifier: refactor check_attach_btf_id() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The check_attach_btf_id() function really does three things: 1. It performs a bunch of checks on the program to ensure that the attachment is valid. 2. It stores a bunch of state about the attachment being requested in the verifier environment and struct bpf_prog objects. 3. It allocates a trampoline for the attachment. This patch splits out (1.) and (3.) into separate functions which will perform the checks, but return the computed values instead of directly modifying the environment. This is done in preparation for reusing the checks when the actual attachment is happening, which will allow tracing programs to have multiple (compatible) attachments. This also fixes a bug where a bunch of checks were skipped if a trampoline already existed for the tracing target. Fixes: 6ba43b761c41 ("bpf: Attachment verification for BPF_MODIFY_RETURN") Fixes: 1e6c62a88215 ("bpf: Introduce sleepable BPF programs") Acked-by: Andrii Nakryiko Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 22 +++++- kernel/bpf/verifier.c | 183 +++++++++++++++++++++++++----------------------- 2 files changed, 118 insertions(+), 87 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 7dd523a7e32d..28c1899949e0 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -65,7 +65,7 @@ static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr) bpf_image_ksym_add(tr->image, ksym); } -struct bpf_trampoline *bpf_trampoline_lookup(u64 key) +static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { struct bpf_trampoline *tr; struct hlist_head *head; @@ -336,6 +336,26 @@ out: return err; } +struct bpf_trampoline *bpf_trampoline_get(u64 key, + struct bpf_attach_target_info *tgt_info) +{ + struct bpf_trampoline *tr; + + tr = bpf_trampoline_lookup(key); + if (!tr) + return NULL; + + mutex_lock(&tr->mutex); + if (tr->func.addr) + goto out; + + memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel)); + tr->func.addr = (void *)tgt_info->tgt_addr; +out: + mutex_unlock(&tr->mutex); + return tr; +} + void bpf_trampoline_put(struct bpf_trampoline *tr) { if (!tr) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d93489c1ac65..7ff05a79984a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11203,11 +11203,10 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) } #define SECURITY_PREFIX "security_" -static int check_attach_modify_return(struct bpf_prog *prog, unsigned long addr) +static int check_attach_modify_return(unsigned long addr, const char *func_name) { if (within_error_injection_list(addr) || - !strncmp(SECURITY_PREFIX, prog->aux->attach_func_name, - sizeof(SECURITY_PREFIX) - 1)) + !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1)) return 0; return -EINVAL; @@ -11244,43 +11243,26 @@ static int check_non_sleepable_error_inject(u32 btf_id) return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id); } -static int check_attach_btf_id(struct bpf_verifier_env *env) +int bpf_check_attach_target(struct bpf_verifier_log *log, + const struct bpf_prog *prog, + const struct bpf_prog *tgt_prog, + u32 btf_id, + struct bpf_attach_target_info *tgt_info) { - struct bpf_prog *prog = env->prog; bool prog_extension = prog->type == BPF_PROG_TYPE_EXT; - struct bpf_prog *tgt_prog = prog->aux->linked_prog; - struct bpf_verifier_log *log = &env->log; - u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; - struct btf_func_model fmodel; int ret = 0, subprog = -1, i; - struct bpf_trampoline *tr; const struct btf_type *t; bool conservative = true; const char *tname; struct btf *btf; - long addr; - u64 key; - - if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING && - prog->type != BPF_PROG_TYPE_LSM) { - verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n"); - return -EINVAL; - } - - if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) - return check_struct_ops_btf_id(env); - - if (prog->type != BPF_PROG_TYPE_TRACING && - prog->type != BPF_PROG_TYPE_LSM && - !prog_extension) - return 0; + long addr = 0; if (!btf_id) { bpf_log(log, "Tracing programs must provide btf_id\n"); return -EINVAL; } - btf = bpf_prog_get_target_btf(prog); + btf = tgt_prog ? tgt_prog->aux->btf : btf_vmlinux; if (!btf) { bpf_log(log, "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n"); @@ -11320,8 +11302,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) "Extension programs should be JITed\n"); return -EINVAL; } - env->ops = bpf_verifier_ops[tgt_prog->type]; - prog->expected_attach_type = tgt_prog->expected_attach_type; } if (!tgt_prog->jited) { bpf_log(log, "Can attach to only JITed progs\n"); @@ -11357,13 +11337,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) bpf_log(log, "Cannot extend fentry/fexit\n"); return -EINVAL; } - key = ((u64)aux->id) << 32 | btf_id; } else { if (prog_extension) { bpf_log(log, "Cannot replace kernel functions\n"); return -EINVAL; } - key = btf_id; } switch (prog->expected_attach_type) { @@ -11393,13 +11371,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) /* should never happen in valid vmlinux build */ return -EINVAL; - /* remember two read only pointers that are valid for - * the life time of the kernel - */ - prog->aux->attach_func_name = tname; - prog->aux->attach_func_proto = t; - prog->aux->attach_btf_trace = true; - return 0; + break; case BPF_TRACE_ITER: if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", @@ -11409,12 +11381,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) return -EINVAL; - prog->aux->attach_func_name = tname; - prog->aux->attach_func_proto = t; - if (!bpf_iter_prog_supported(prog)) - return -EINVAL; - ret = btf_distill_func_proto(log, btf, t, tname, &fmodel); - return ret; + ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel); + if (ret) + return ret; + break; default: if (!prog_extension) return -EINVAL; @@ -11423,13 +11393,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) case BPF_LSM_MAC: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: - prog->aux->attach_func_name = tname; - if (prog->type == BPF_PROG_TYPE_LSM) { - ret = bpf_lsm_verify_prog(log, prog); - if (ret < 0) - return ret; - } - if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); @@ -11441,24 +11404,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) return -EINVAL; - tr = bpf_trampoline_lookup(key); - if (!tr) - return -ENOMEM; - /* t is either vmlinux type or another program's type */ - prog->aux->attach_func_proto = t; - mutex_lock(&tr->mutex); - if (tr->func.addr) { - prog->aux->trampoline = tr; - goto out; - } - if (tgt_prog && conservative) { - prog->aux->attach_func_proto = NULL; + + if (tgt_prog && conservative) t = NULL; - } - ret = btf_distill_func_proto(log, btf, t, - tname, &tr->func.model); + + ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel); if (ret < 0) - goto out; + return ret; + if (tgt_prog) { if (subprog == 0) addr = (long) tgt_prog->bpf_func; @@ -11470,8 +11423,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) bpf_log(log, "The address of function %s cannot be found\n", tname); - ret = -ENOENT; - goto out; + return -ENOENT; } } @@ -11496,30 +11448,89 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) default: break; } - if (ret) - bpf_log(log, "%s is not sleepable\n", - prog->aux->attach_func_name); + if (ret) { + bpf_log(log, "%s is not sleepable\n", tname); + return ret; + } } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) { if (tgt_prog) { bpf_log(log, "can't modify return codes of BPF programs\n"); - ret = -EINVAL; - goto out; + return -EINVAL; + } + ret = check_attach_modify_return(addr, tname); + if (ret) { + bpf_log(log, "%s() is not modifiable\n", tname); + return ret; } - ret = check_attach_modify_return(prog, addr); - if (ret) - bpf_log(log, "%s() is not modifiable\n", - prog->aux->attach_func_name); } - if (ret) - goto out; - tr->func.addr = (void *)addr; - prog->aux->trampoline = tr; -out: - mutex_unlock(&tr->mutex); - if (ret) - bpf_trampoline_put(tr); + + break; + } + tgt_info->tgt_addr = addr; + tgt_info->tgt_name = tname; + tgt_info->tgt_type = t; + return 0; +} + +static int check_attach_btf_id(struct bpf_verifier_env *env) +{ + struct bpf_prog *prog = env->prog; + struct bpf_prog *tgt_prog = prog->aux->linked_prog; + struct bpf_attach_target_info tgt_info = {}; + u32 btf_id = prog->aux->attach_btf_id; + struct bpf_trampoline *tr; + int ret; + u64 key; + + if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING && + prog->type != BPF_PROG_TYPE_LSM) { + verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n"); + return -EINVAL; + } + + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) + return check_struct_ops_btf_id(env); + + if (prog->type != BPF_PROG_TYPE_TRACING && + prog->type != BPF_PROG_TYPE_LSM && + prog->type != BPF_PROG_TYPE_EXT) + return 0; + + ret = bpf_check_attach_target(&env->log, prog, tgt_prog, btf_id, &tgt_info); + if (ret) return ret; + + if (tgt_prog && prog->type == BPF_PROG_TYPE_EXT) { + env->ops = bpf_verifier_ops[tgt_prog->type]; + prog->expected_attach_type = tgt_prog->expected_attach_type; + } + + /* store info about the attachment target that will be used later */ + prog->aux->attach_func_proto = tgt_info.tgt_type; + prog->aux->attach_func_name = tgt_info.tgt_name; + + if (prog->expected_attach_type == BPF_TRACE_RAW_TP) { + prog->aux->attach_btf_trace = true; + return 0; + } else if (prog->expected_attach_type == BPF_TRACE_ITER) { + if (!bpf_iter_prog_supported(prog)) + return -EINVAL; + return 0; + } + + if (prog->type == BPF_PROG_TYPE_LSM) { + ret = bpf_lsm_verify_prog(&env->log, prog); + if (ret < 0) + return ret; } + + key = bpf_trampoline_compute_key(tgt_prog, btf_id); + tr = bpf_trampoline_get(key, &tgt_info); + if (!tr) + return -ENOMEM; + + prog->aux->trampoline = tr; + return 0; } int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, -- cgit v1.2.3 From 76654e67f3a01c50dc13dd6dea75e58943413956 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 28 Sep 2020 12:31:03 +0100 Subject: bpf: Provide function to get vmlinux BTF information It will be used later for BPF structure display support Signed-off-by: Alan Maguire Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1601292670-1616-2-git-send-email-alan.maguire@oracle.com --- kernel/bpf/verifier.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7ff05a79984a..2ee343dda73a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11533,6 +11533,17 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return 0; } +struct btf *bpf_get_btf_vmlinux(void) +{ + if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { + mutex_lock(&bpf_verifier_lock); + if (!btf_vmlinux) + btf_vmlinux = btf_parse_vmlinux(); + mutex_unlock(&bpf_verifier_lock); + } + return btf_vmlinux; +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -11566,12 +11577,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->ops = bpf_verifier_ops[env->prog->type]; is_priv = bpf_capable(); - if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { - mutex_lock(&bpf_verifier_lock); - if (!btf_vmlinux) - btf_vmlinux = btf_parse_vmlinux(); - mutex_unlock(&bpf_verifier_lock); - } + bpf_get_btf_vmlinux(); /* grab the mutex to protect few globals used by verifier */ if (!is_priv) -- cgit v1.2.3 From 31d0bc81637d8d974a6dad9827b765b4b70c89d7 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 28 Sep 2020 12:31:04 +0100 Subject: bpf: Move to generic BTF show support, apply it to seq files/strings generalize the "seq_show" seq file support in btf.c to support a generic show callback of which we support two instances; the current seq file show, and a show with snprintf() behaviour which instead writes the type data to a supplied string. Both classes of show function call btf_type_show() with different targets; the seq file or the string to be written. In the string case we need to track additional data - length left in string to write and length to return that we would have written (a la snprintf). By default show will display type information, field members and their types and values etc, and the information is indented based upon structure depth. Zeroed fields are omitted. Show however supports flags which modify its behaviour: BTF_SHOW_COMPACT - suppress newline/indent. BTF_SHOW_NONAME - suppress show of type and member names. BTF_SHOW_PTR_RAW - do not obfuscate pointer values. BTF_SHOW_UNSAFE - do not copy data to safe buffer before display. BTF_SHOW_ZERO - show zeroed values (by default they are not shown). Signed-off-by: Alan Maguire Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1601292670-1616-3-git-send-email-alan.maguire@oracle.com --- kernel/bpf/btf.c | 1007 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 905 insertions(+), 102 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 868c03a24d0a..dcdd7109aa29 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -284,6 +284,91 @@ static const char *btf_type_str(const struct btf_type *t) return btf_kind_str[BTF_INFO_KIND(t->info)]; } +/* Chunk size we use in safe copy of data to be shown. */ +#define BTF_SHOW_OBJ_SAFE_SIZE 32 + +/* + * This is the maximum size of a base type value (equivalent to a + * 128-bit int); if we are at the end of our safe buffer and have + * less than 16 bytes space we can't be assured of being able + * to copy the next type safely, so in such cases we will initiate + * a new copy. + */ +#define BTF_SHOW_OBJ_BASE_TYPE_SIZE 16 + +/* Type name size */ +#define BTF_SHOW_NAME_SIZE 80 + +/* + * Common data to all BTF show operations. Private show functions can add + * their own data to a structure containing a struct btf_show and consult it + * in the show callback. See btf_type_show() below. + * + * One challenge with showing nested data is we want to skip 0-valued + * data, but in order to figure out whether a nested object is all zeros + * we need to walk through it. As a result, we need to make two passes + * when handling structs, unions and arrays; the first path simply looks + * for nonzero data, while the second actually does the display. The first + * pass is signalled by show->state.depth_check being set, and if we + * encounter a non-zero value we set show->state.depth_to_show to + * the depth at which we encountered it. When we have completed the + * first pass, we will know if anything needs to be displayed if + * depth_to_show > depth. See btf_[struct,array]_show() for the + * implementation of this. + * + * Another problem is we want to ensure the data for display is safe to + * access. To support this, the anonymous "struct {} obj" tracks the data + * object and our safe copy of it. We copy portions of the data needed + * to the object "copy" buffer, but because its size is limited to + * BTF_SHOW_OBJ_COPY_LEN bytes, multiple copies may be required as we + * traverse larger objects for display. + * + * The various data type show functions all start with a call to + * btf_show_start_type() which returns a pointer to the safe copy + * of the data needed (or if BTF_SHOW_UNSAFE is specified, to the + * raw data itself). btf_show_obj_safe() is responsible for + * using copy_from_kernel_nofault() to update the safe data if necessary + * as we traverse the object's data. skbuff-like semantics are + * used: + * + * - obj.head points to the start of the toplevel object for display + * - obj.size is the size of the toplevel object + * - obj.data points to the current point in the original data at + * which our safe data starts. obj.data will advance as we copy + * portions of the data. + * + * In most cases a single copy will suffice, but larger data structures + * such as "struct task_struct" will require many copies. The logic in + * btf_show_obj_safe() handles the logic that determines if a new + * copy_from_kernel_nofault() is needed. + */ +struct btf_show { + u64 flags; + void *target; /* target of show operation (seq file, buffer) */ + void (*showfn)(struct btf_show *show, const char *fmt, va_list args); + const struct btf *btf; + /* below are used during iteration */ + struct { + u8 depth; + u8 depth_to_show; + u8 depth_check; + u8 array_member:1, + array_terminated:1; + u16 array_encoding; + u32 type_id; + int status; /* non-zero for error */ + const struct btf_type *type; + const struct btf_member *member; + char name[BTF_SHOW_NAME_SIZE]; /* space for member name/type */ + } state; + struct { + u32 size; + void *head; + void *data; + u8 safe[BTF_SHOW_OBJ_SAFE_SIZE]; + } obj; +}; + struct btf_kind_operations { s32 (*check_meta)(struct btf_verifier_env *env, const struct btf_type *t, @@ -300,9 +385,9 @@ struct btf_kind_operations { const struct btf_type *member_type); void (*log_details)(struct btf_verifier_env *env, const struct btf_type *t); - void (*seq_show)(const struct btf *btf, const struct btf_type *t, + void (*show)(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offsets, - struct seq_file *m); + struct btf_show *show); }; static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; @@ -679,6 +764,488 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, return true; } +/* Similar to btf_type_skip_modifiers() but does not skip typedefs. */ +static const struct btf_type *btf_type_skip_qualifiers(const struct btf *btf, + u32 id) +{ + const struct btf_type *t = btf_type_by_id(btf, id); + + while (btf_type_is_modifier(t) && + BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) { + id = t->type; + t = btf_type_by_id(btf, t->type); + } + + return t; +} + +#define BTF_SHOW_MAX_ITER 10 + +#define BTF_KIND_BIT(kind) (1ULL << kind) + +/* + * Populate show->state.name with type name information. + * Format of type name is + * + * [.member_name = ] (type_name) + */ +static const char *btf_show_name(struct btf_show *show) +{ + /* BTF_MAX_ITER array suffixes "[]" */ + const char *array_suffixes = "[][][][][][][][][][]"; + const char *array_suffix = &array_suffixes[strlen(array_suffixes)]; + /* BTF_MAX_ITER pointer suffixes "*" */ + const char *ptr_suffixes = "**********"; + const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)]; + const char *name = NULL, *prefix = "", *parens = ""; + const struct btf_member *m = show->state.member; + const struct btf_type *t = show->state.type; + const struct btf_array *array; + u32 id = show->state.type_id; + const char *member = NULL; + bool show_member = false; + u64 kinds = 0; + int i; + + show->state.name[0] = '\0'; + + /* + * Don't show type name if we're showing an array member; + * in that case we show the array type so don't need to repeat + * ourselves for each member. + */ + if (show->state.array_member) + return ""; + + /* Retrieve member name, if any. */ + if (m) { + member = btf_name_by_offset(show->btf, m->name_off); + show_member = strlen(member) > 0; + id = m->type; + } + + /* + * Start with type_id, as we have resolved the struct btf_type * + * via btf_modifier_show() past the parent typedef to the child + * struct, int etc it is defined as. In such cases, the type_id + * still represents the starting type while the struct btf_type * + * in our show->state points at the resolved type of the typedef. + */ + t = btf_type_by_id(show->btf, id); + if (!t) + return ""; + + /* + * The goal here is to build up the right number of pointer and + * array suffixes while ensuring the type name for a typedef + * is represented. Along the way we accumulate a list of + * BTF kinds we have encountered, since these will inform later + * display; for example, pointer types will not require an + * opening "{" for struct, we will just display the pointer value. + * + * We also want to accumulate the right number of pointer or array + * indices in the format string while iterating until we get to + * the typedef/pointee/array member target type. + * + * We start by pointing at the end of pointer and array suffix + * strings; as we accumulate pointers and arrays we move the pointer + * or array string backwards so it will show the expected number of + * '*' or '[]' for the type. BTF_SHOW_MAX_ITER of nesting of pointers + * and/or arrays and typedefs are supported as a precaution. + * + * We also want to get typedef name while proceeding to resolve + * type it points to so that we can add parentheses if it is a + * "typedef struct" etc. + */ + for (i = 0; i < BTF_SHOW_MAX_ITER; i++) { + + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_TYPEDEF: + if (!name) + name = btf_name_by_offset(show->btf, + t->name_off); + kinds |= BTF_KIND_BIT(BTF_KIND_TYPEDEF); + id = t->type; + break; + case BTF_KIND_ARRAY: + kinds |= BTF_KIND_BIT(BTF_KIND_ARRAY); + parens = "["; + if (!t) + return ""; + array = btf_type_array(t); + if (array_suffix > array_suffixes) + array_suffix -= 2; + id = array->type; + break; + case BTF_KIND_PTR: + kinds |= BTF_KIND_BIT(BTF_KIND_PTR); + if (ptr_suffix > ptr_suffixes) + ptr_suffix -= 1; + id = t->type; + break; + default: + id = 0; + break; + } + if (!id) + break; + t = btf_type_skip_qualifiers(show->btf, id); + } + /* We may not be able to represent this type; bail to be safe */ + if (i == BTF_SHOW_MAX_ITER) + return ""; + + if (!name) + name = btf_name_by_offset(show->btf, t->name_off); + + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + prefix = BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT ? + "struct" : "union"; + /* if it's an array of struct/union, parens is already set */ + if (!(kinds & (BTF_KIND_BIT(BTF_KIND_ARRAY)))) + parens = "{"; + break; + case BTF_KIND_ENUM: + prefix = "enum"; + break; + default: + break; + } + + /* pointer does not require parens */ + if (kinds & BTF_KIND_BIT(BTF_KIND_PTR)) + parens = ""; + /* typedef does not require struct/union/enum prefix */ + if (kinds & BTF_KIND_BIT(BTF_KIND_TYPEDEF)) + prefix = ""; + + if (!name) + name = ""; + + /* Even if we don't want type name info, we want parentheses etc */ + if (show->flags & BTF_SHOW_NONAME) + snprintf(show->state.name, sizeof(show->state.name), "%s", + parens); + else + snprintf(show->state.name, sizeof(show->state.name), + "%s%s%s(%s%s%s%s%s%s)%s", + /* first 3 strings comprise ".member = " */ + show_member ? "." : "", + show_member ? member : "", + show_member ? " = " : "", + /* ...next is our prefix (struct, enum, etc) */ + prefix, + strlen(prefix) > 0 && strlen(name) > 0 ? " " : "", + /* ...this is the type name itself */ + name, + /* ...suffixed by the appropriate '*', '[]' suffixes */ + strlen(ptr_suffix) > 0 ? " " : "", ptr_suffix, + array_suffix, parens); + + return show->state.name; +} + +static const char *__btf_show_indent(struct btf_show *show) +{ + const char *indents = " "; + const char *indent = &indents[strlen(indents)]; + + if ((indent - show->state.depth) >= indents) + return indent - show->state.depth; + return indents; +} + +static const char *btf_show_indent(struct btf_show *show) +{ + return show->flags & BTF_SHOW_COMPACT ? "" : __btf_show_indent(show); +} + +static const char *btf_show_newline(struct btf_show *show) +{ + return show->flags & BTF_SHOW_COMPACT ? "" : "\n"; +} + +static const char *btf_show_delim(struct btf_show *show) +{ + if (show->state.depth == 0) + return ""; + + if ((show->flags & BTF_SHOW_COMPACT) && show->state.type && + BTF_INFO_KIND(show->state.type->info) == BTF_KIND_UNION) + return "|"; + + return ","; +} + +__printf(2, 3) static void btf_show(struct btf_show *show, const char *fmt, ...) +{ + va_list args; + + if (!show->state.depth_check) { + va_start(args, fmt); + show->showfn(show, fmt, args); + va_end(args); + } +} + +/* Macros are used here as btf_show_type_value[s]() prepends and appends + * format specifiers to the format specifier passed in; these do the work of + * adding indentation, delimiters etc while the caller simply has to specify + * the type value(s) in the format specifier + value(s). + */ +#define btf_show_type_value(show, fmt, value) \ + do { \ + if ((value) != 0 || (show->flags & BTF_SHOW_ZERO) || \ + show->state.depth == 0) { \ + btf_show(show, "%s%s" fmt "%s%s", \ + btf_show_indent(show), \ + btf_show_name(show), \ + value, btf_show_delim(show), \ + btf_show_newline(show)); \ + if (show->state.depth > show->state.depth_to_show) \ + show->state.depth_to_show = show->state.depth; \ + } \ + } while (0) + +#define btf_show_type_values(show, fmt, ...) \ + do { \ + btf_show(show, "%s%s" fmt "%s%s", btf_show_indent(show), \ + btf_show_name(show), \ + __VA_ARGS__, btf_show_delim(show), \ + btf_show_newline(show)); \ + if (show->state.depth > show->state.depth_to_show) \ + show->state.depth_to_show = show->state.depth; \ + } while (0) + +/* How much is left to copy to safe buffer after @data? */ +static int btf_show_obj_size_left(struct btf_show *show, void *data) +{ + return show->obj.head + show->obj.size - data; +} + +/* Is object pointed to by @data of @size already copied to our safe buffer? */ +static bool btf_show_obj_is_safe(struct btf_show *show, void *data, int size) +{ + return data >= show->obj.data && + (data + size) < (show->obj.data + BTF_SHOW_OBJ_SAFE_SIZE); +} + +/* + * If object pointed to by @data of @size falls within our safe buffer, return + * the equivalent pointer to the same safe data. Assumes + * copy_from_kernel_nofault() has already happened and our safe buffer is + * populated. + */ +static void *__btf_show_obj_safe(struct btf_show *show, void *data, int size) +{ + if (btf_show_obj_is_safe(show, data, size)) + return show->obj.safe + (data - show->obj.data); + return NULL; +} + +/* + * Return a safe-to-access version of data pointed to by @data. + * We do this by copying the relevant amount of information + * to the struct btf_show obj.safe buffer using copy_from_kernel_nofault(). + * + * If BTF_SHOW_UNSAFE is specified, just return data as-is; no + * safe copy is needed. + * + * Otherwise we need to determine if we have the required amount + * of data (determined by the @data pointer and the size of the + * largest base type we can encounter (represented by + * BTF_SHOW_OBJ_BASE_TYPE_SIZE). Having that much data ensures + * that we will be able to print some of the current object, + * and if more is needed a copy will be triggered. + * Some objects such as structs will not fit into the buffer; + * in such cases additional copies when we iterate over their + * members may be needed. + * + * btf_show_obj_safe() is used to return a safe buffer for + * btf_show_start_type(); this ensures that as we recurse into + * nested types we always have safe data for the given type. + * This approach is somewhat wasteful; it's possible for example + * that when iterating over a large union we'll end up copying the + * same data repeatedly, but the goal is safety not performance. + * We use stack data as opposed to per-CPU buffers because the + * iteration over a type can take some time, and preemption handling + * would greatly complicate use of the safe buffer. + */ +static void *btf_show_obj_safe(struct btf_show *show, + const struct btf_type *t, + void *data) +{ + const struct btf_type *rt; + int size_left, size; + void *safe = NULL; + + if (show->flags & BTF_SHOW_UNSAFE) + return data; + + rt = btf_resolve_size(show->btf, t, &size); + if (IS_ERR(rt)) { + show->state.status = PTR_ERR(rt); + return NULL; + } + + /* + * Is this toplevel object? If so, set total object size and + * initialize pointers. Otherwise check if we still fall within + * our safe object data. + */ + if (show->state.depth == 0) { + show->obj.size = size; + show->obj.head = data; + } else { + /* + * If the size of the current object is > our remaining + * safe buffer we _may_ need to do a new copy. However + * consider the case of a nested struct; it's size pushes + * us over the safe buffer limit, but showing any individual + * struct members does not. In such cases, we don't need + * to initiate a fresh copy yet; however we definitely need + * at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes left + * in our buffer, regardless of the current object size. + * The logic here is that as we resolve types we will + * hit a base type at some point, and we need to be sure + * the next chunk of data is safely available to display + * that type info safely. We cannot rely on the size of + * the current object here because it may be much larger + * than our current buffer (e.g. task_struct is 8k). + * All we want to do here is ensure that we can print the + * next basic type, which we can if either + * - the current type size is within the safe buffer; or + * - at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes are left in + * the safe buffer. + */ + safe = __btf_show_obj_safe(show, data, + min(size, + BTF_SHOW_OBJ_BASE_TYPE_SIZE)); + } + + /* + * We need a new copy to our safe object, either because we haven't + * yet copied and are intializing safe data, or because the data + * we want falls outside the boundaries of the safe object. + */ + if (!safe) { + size_left = btf_show_obj_size_left(show, data); + if (size_left > BTF_SHOW_OBJ_SAFE_SIZE) + size_left = BTF_SHOW_OBJ_SAFE_SIZE; + show->state.status = copy_from_kernel_nofault(show->obj.safe, + data, size_left); + if (!show->state.status) { + show->obj.data = data; + safe = show->obj.safe; + } + } + + return safe; +} + +/* + * Set the type we are starting to show and return a safe data pointer + * to be used for showing the associated data. + */ +static void *btf_show_start_type(struct btf_show *show, + const struct btf_type *t, + u32 type_id, void *data) +{ + show->state.type = t; + show->state.type_id = type_id; + show->state.name[0] = '\0'; + + return btf_show_obj_safe(show, t, data); +} + +static void btf_show_end_type(struct btf_show *show) +{ + show->state.type = NULL; + show->state.type_id = 0; + show->state.name[0] = '\0'; +} + +static void *btf_show_start_aggr_type(struct btf_show *show, + const struct btf_type *t, + u32 type_id, void *data) +{ + void *safe_data = btf_show_start_type(show, t, type_id, data); + + if (!safe_data) + return safe_data; + + btf_show(show, "%s%s%s", btf_show_indent(show), + btf_show_name(show), + btf_show_newline(show)); + show->state.depth++; + return safe_data; +} + +static void btf_show_end_aggr_type(struct btf_show *show, + const char *suffix) +{ + show->state.depth--; + btf_show(show, "%s%s%s%s", btf_show_indent(show), suffix, + btf_show_delim(show), btf_show_newline(show)); + btf_show_end_type(show); +} + +static void btf_show_start_member(struct btf_show *show, + const struct btf_member *m) +{ + show->state.member = m; +} + +static void btf_show_start_array_member(struct btf_show *show) +{ + show->state.array_member = 1; + btf_show_start_member(show, NULL); +} + +static void btf_show_end_member(struct btf_show *show) +{ + show->state.member = NULL; +} + +static void btf_show_end_array_member(struct btf_show *show) +{ + show->state.array_member = 0; + btf_show_end_member(show); +} + +static void *btf_show_start_array_type(struct btf_show *show, + const struct btf_type *t, + u32 type_id, + u16 array_encoding, + void *data) +{ + show->state.array_encoding = array_encoding; + show->state.array_terminated = 0; + return btf_show_start_aggr_type(show, t, type_id, data); +} + +static void btf_show_end_array_type(struct btf_show *show) +{ + show->state.array_encoding = 0; + show->state.array_terminated = 0; + btf_show_end_aggr_type(show, "]"); +} + +static void *btf_show_start_struct_type(struct btf_show *show, + const struct btf_type *t, + u32 type_id, + void *data) +{ + return btf_show_start_aggr_type(show, t, type_id, data); +} + +static void btf_show_end_struct_type(struct btf_show *show) +{ + btf_show_end_aggr_type(show, "}"); +} + __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, const char *fmt, ...) { @@ -1268,11 +1835,11 @@ static int btf_df_resolve(struct btf_verifier_env *env, return -EINVAL; } -static void btf_df_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offsets, - struct seq_file *m) +static void btf_df_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offsets, + struct btf_show *show) { - seq_printf(m, "", BTF_INFO_KIND(t->info)); + btf_show(show, "", BTF_INFO_KIND(t->info)); } static int btf_int_check_member(struct btf_verifier_env *env, @@ -1445,7 +2012,7 @@ static void btf_int_log(struct btf_verifier_env *env, btf_int_encoding_str(BTF_INT_ENCODING(int_data))); } -static void btf_int128_print(struct seq_file *m, void *data) +static void btf_int128_print(struct btf_show *show, void *data) { /* data points to a __int128 number. * Suppose @@ -1464,9 +2031,10 @@ static void btf_int128_print(struct seq_file *m, void *data) lower_num = *(u64 *)data; #endif if (upper_num == 0) - seq_printf(m, "0x%llx", lower_num); + btf_show_type_value(show, "0x%llx", lower_num); else - seq_printf(m, "0x%llx%016llx", upper_num, lower_num); + btf_show_type_values(show, "0x%llx%016llx", upper_num, + lower_num); } static void btf_int128_shift(u64 *print_num, u16 left_shift_bits, @@ -1510,8 +2078,8 @@ static void btf_int128_shift(u64 *print_num, u16 left_shift_bits, #endif } -static void btf_bitfield_seq_show(void *data, u8 bits_offset, - u8 nr_bits, struct seq_file *m) +static void btf_bitfield_show(void *data, u8 bits_offset, + u8 nr_bits, struct btf_show *show) { u16 left_shift_bits, right_shift_bits; u8 nr_copy_bytes; @@ -1531,14 +2099,14 @@ static void btf_bitfield_seq_show(void *data, u8 bits_offset, right_shift_bits = BITS_PER_U128 - nr_bits; btf_int128_shift(print_num, left_shift_bits, right_shift_bits); - btf_int128_print(m, print_num); + btf_int128_print(show, print_num); } -static void btf_int_bits_seq_show(const struct btf *btf, - const struct btf_type *t, - void *data, u8 bits_offset, - struct seq_file *m) +static void btf_int_bits_show(const struct btf *btf, + const struct btf_type *t, + void *data, u8 bits_offset, + struct btf_show *show) { u32 int_data = btf_type_int(t); u8 nr_bits = BTF_INT_BITS(int_data); @@ -1551,55 +2119,77 @@ static void btf_int_bits_seq_show(const struct btf *btf, total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); data += BITS_ROUNDDOWN_BYTES(total_bits_offset); bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); - btf_bitfield_seq_show(data, bits_offset, nr_bits, m); + btf_bitfield_show(data, bits_offset, nr_bits, show); } -static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) +static void btf_int_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) { u32 int_data = btf_type_int(t); u8 encoding = BTF_INT_ENCODING(int_data); bool sign = encoding & BTF_INT_SIGNED; u8 nr_bits = BTF_INT_BITS(int_data); + void *safe_data; + + safe_data = btf_show_start_type(show, t, type_id, data); + if (!safe_data) + return; if (bits_offset || BTF_INT_OFFSET(int_data) || BITS_PER_BYTE_MASKED(nr_bits)) { - btf_int_bits_seq_show(btf, t, data, bits_offset, m); - return; + btf_int_bits_show(btf, t, safe_data, bits_offset, show); + goto out; } switch (nr_bits) { case 128: - btf_int128_print(m, data); + btf_int128_print(show, safe_data); break; case 64: if (sign) - seq_printf(m, "%lld", *(s64 *)data); + btf_show_type_value(show, "%lld", *(s64 *)safe_data); else - seq_printf(m, "%llu", *(u64 *)data); + btf_show_type_value(show, "%llu", *(u64 *)safe_data); break; case 32: if (sign) - seq_printf(m, "%d", *(s32 *)data); + btf_show_type_value(show, "%d", *(s32 *)safe_data); else - seq_printf(m, "%u", *(u32 *)data); + btf_show_type_value(show, "%u", *(u32 *)safe_data); break; case 16: if (sign) - seq_printf(m, "%d", *(s16 *)data); + btf_show_type_value(show, "%d", *(s16 *)safe_data); else - seq_printf(m, "%u", *(u16 *)data); + btf_show_type_value(show, "%u", *(u16 *)safe_data); break; case 8: + if (show->state.array_encoding == BTF_INT_CHAR) { + /* check for null terminator */ + if (show->state.array_terminated) + break; + if (*(char *)data == '\0') { + show->state.array_terminated = 1; + break; + } + if (isprint(*(char *)data)) { + btf_show_type_value(show, "'%c'", + *(char *)safe_data); + break; + } + } if (sign) - seq_printf(m, "%d", *(s8 *)data); + btf_show_type_value(show, "%d", *(s8 *)safe_data); else - seq_printf(m, "%u", *(u8 *)data); + btf_show_type_value(show, "%u", *(u8 *)safe_data); break; default: - btf_int_bits_seq_show(btf, t, data, bits_offset, m); + btf_int_bits_show(btf, t, safe_data, bits_offset, show); + break; } +out: + btf_show_end_type(show); } static const struct btf_kind_operations int_ops = { @@ -1608,7 +2198,7 @@ static const struct btf_kind_operations int_ops = { .check_member = btf_int_check_member, .check_kflag_member = btf_int_check_kflag_member, .log_details = btf_int_log, - .seq_show = btf_int_seq_show, + .show = btf_int_show, }; static int btf_modifier_check_member(struct btf_verifier_env *env, @@ -1872,34 +2462,44 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, return 0; } -static void btf_modifier_seq_show(const struct btf *btf, - const struct btf_type *t, - u32 type_id, void *data, - u8 bits_offset, struct seq_file *m) +static void btf_modifier_show(const struct btf *btf, + const struct btf_type *t, + u32 type_id, void *data, + u8 bits_offset, struct btf_show *show) { if (btf->resolved_ids) t = btf_type_id_resolve(btf, &type_id); else t = btf_type_skip_modifiers(btf, type_id, NULL); - btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); + btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show); } -static void btf_var_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) +static void btf_var_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) { t = btf_type_id_resolve(btf, &type_id); - btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); + btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show); } -static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) +static void btf_ptr_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) { - /* It is a hashed value */ - seq_printf(m, "%p", *(void **)data); + void *safe_data; + + safe_data = btf_show_start_type(show, t, type_id, data); + if (!safe_data) + return; + + /* It is a hashed value unless BTF_SHOW_PTR_RAW is specified */ + if (show->flags & BTF_SHOW_PTR_RAW) + btf_show_type_value(show, "0x%px", *(void **)safe_data); + else + btf_show_type_value(show, "0x%p", *(void **)safe_data); + btf_show_end_type(show); } static void btf_ref_type_log(struct btf_verifier_env *env, @@ -1914,7 +2514,7 @@ static struct btf_kind_operations modifier_ops = { .check_member = btf_modifier_check_member, .check_kflag_member = btf_modifier_check_kflag_member, .log_details = btf_ref_type_log, - .seq_show = btf_modifier_seq_show, + .show = btf_modifier_show, }; static struct btf_kind_operations ptr_ops = { @@ -1923,7 +2523,7 @@ static struct btf_kind_operations ptr_ops = { .check_member = btf_ptr_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_ref_type_log, - .seq_show = btf_ptr_seq_show, + .show = btf_ptr_show, }; static s32 btf_fwd_check_meta(struct btf_verifier_env *env, @@ -1964,7 +2564,7 @@ static struct btf_kind_operations fwd_ops = { .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_fwd_type_log, - .seq_show = btf_df_seq_show, + .show = btf_df_show, }; static int btf_array_check_member(struct btf_verifier_env *env, @@ -2123,28 +2723,90 @@ static void btf_array_log(struct btf_verifier_env *env, array->type, array->index_type, array->nelems); } -static void btf_array_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) +static void __btf_array_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) { const struct btf_array *array = btf_type_array(t); const struct btf_kind_operations *elem_ops; const struct btf_type *elem_type; - u32 i, elem_size, elem_type_id; + u32 i, elem_size = 0, elem_type_id; + u16 encoding = 0; elem_type_id = array->type; - elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); + elem_type = btf_type_skip_modifiers(btf, elem_type_id, NULL); + if (elem_type && btf_type_has_size(elem_type)) + elem_size = elem_type->size; + + if (elem_type && btf_type_is_int(elem_type)) { + u32 int_type = btf_type_int(elem_type); + + encoding = BTF_INT_ENCODING(int_type); + + /* + * BTF_INT_CHAR encoding never seems to be set for + * char arrays, so if size is 1 and element is + * printable as a char, we'll do that. + */ + if (elem_size == 1) + encoding = BTF_INT_CHAR; + } + + if (!btf_show_start_array_type(show, t, type_id, encoding, data)) + return; + + if (!elem_type) + goto out; elem_ops = btf_type_ops(elem_type); - seq_puts(m, "["); + for (i = 0; i < array->nelems; i++) { - if (i) - seq_puts(m, ","); - elem_ops->seq_show(btf, elem_type, elem_type_id, data, - bits_offset, m); + btf_show_start_array_member(show); + + elem_ops->show(btf, elem_type, elem_type_id, data, + bits_offset, show); data += elem_size; + + btf_show_end_array_member(show); + + if (show->state.array_terminated) + break; } - seq_puts(m, "]"); +out: + btf_show_end_array_type(show); +} + +static void btf_array_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) +{ + const struct btf_member *m = show->state.member; + + /* + * First check if any members would be shown (are non-zero). + * See comments above "struct btf_show" definition for more + * details on how this works at a high-level. + */ + if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) { + if (!show->state.depth_check) { + show->state.depth_check = show->state.depth + 1; + show->state.depth_to_show = 0; + } + __btf_array_show(btf, t, type_id, data, bits_offset, show); + show->state.member = m; + + if (show->state.depth_check != show->state.depth + 1) + return; + show->state.depth_check = 0; + + if (show->state.depth_to_show <= show->state.depth) + return; + /* + * Reaching here indicates we have recursed and found + * non-zero array member(s). + */ + } + __btf_array_show(btf, t, type_id, data, bits_offset, show); } static struct btf_kind_operations array_ops = { @@ -2153,7 +2815,7 @@ static struct btf_kind_operations array_ops = { .check_member = btf_array_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_array_log, - .seq_show = btf_array_seq_show, + .show = btf_array_show, }; static int btf_struct_check_member(struct btf_verifier_env *env, @@ -2376,15 +3038,18 @@ int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) return off; } -static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) +static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) { - const char *seq = BTF_INFO_KIND(t->info) == BTF_KIND_UNION ? "|" : ","; const struct btf_member *member; + void *safe_data; u32 i; - seq_puts(m, "{"); + safe_data = btf_show_start_struct_type(show, t, type_id, data); + if (!safe_data) + return; + for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); @@ -2393,23 +3058,65 @@ static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, u32 bytes_offset; u8 bits8_offset; - if (i) - seq_puts(m, seq); + btf_show_start_member(show, member); member_offset = btf_member_bit_offset(t, member); bitfield_size = btf_member_bitfield_size(t, member); bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); bits8_offset = BITS_PER_BYTE_MASKED(member_offset); if (bitfield_size) { - btf_bitfield_seq_show(data + bytes_offset, bits8_offset, - bitfield_size, m); + safe_data = btf_show_start_type(show, member_type, + member->type, + data + bytes_offset); + if (safe_data) + btf_bitfield_show(safe_data, + bits8_offset, + bitfield_size, show); + btf_show_end_type(show); } else { ops = btf_type_ops(member_type); - ops->seq_show(btf, member_type, member->type, - data + bytes_offset, bits8_offset, m); + ops->show(btf, member_type, member->type, + data + bytes_offset, bits8_offset, show); } + + btf_show_end_member(show); } - seq_puts(m, "}"); + + btf_show_end_struct_type(show); +} + +static void btf_struct_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) +{ + const struct btf_member *m = show->state.member; + + /* + * First check if any members would be shown (are non-zero). + * See comments above "struct btf_show" definition for more + * details on how this works at a high-level. + */ + if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) { + if (!show->state.depth_check) { + show->state.depth_check = show->state.depth + 1; + show->state.depth_to_show = 0; + } + __btf_struct_show(btf, t, type_id, data, bits_offset, show); + /* Restore saved member data here */ + show->state.member = m; + if (show->state.depth_check != show->state.depth + 1) + return; + show->state.depth_check = 0; + + if (show->state.depth_to_show <= show->state.depth) + return; + /* + * Reaching here indicates we have recursed and found + * non-zero child values. + */ + } + + __btf_struct_show(btf, t, type_id, data, bits_offset, show); } static struct btf_kind_operations struct_ops = { @@ -2418,7 +3125,7 @@ static struct btf_kind_operations struct_ops = { .check_member = btf_struct_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_struct_log, - .seq_show = btf_struct_seq_show, + .show = btf_struct_show, }; static int btf_enum_check_member(struct btf_verifier_env *env, @@ -2549,24 +3256,35 @@ static void btf_enum_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } -static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) +static void btf_enum_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) { const struct btf_enum *enums = btf_type_enum(t); u32 i, nr_enums = btf_type_vlen(t); - int v = *(int *)data; + void *safe_data; + int v; + + safe_data = btf_show_start_type(show, t, type_id, data); + if (!safe_data) + return; + + v = *(int *)safe_data; for (i = 0; i < nr_enums; i++) { - if (v == enums[i].val) { - seq_printf(m, "%s", - __btf_name_by_offset(btf, - enums[i].name_off)); - return; - } + if (v != enums[i].val) + continue; + + btf_show_type_value(show, "%s", + __btf_name_by_offset(btf, + enums[i].name_off)); + + btf_show_end_type(show); + return; } - seq_printf(m, "%d", v); + btf_show_type_value(show, "%d", v); + btf_show_end_type(show); } static struct btf_kind_operations enum_ops = { @@ -2575,7 +3293,7 @@ static struct btf_kind_operations enum_ops = { .check_member = btf_enum_check_member, .check_kflag_member = btf_enum_check_kflag_member, .log_details = btf_enum_log, - .seq_show = btf_enum_seq_show, + .show = btf_enum_show, }; static s32 btf_func_proto_check_meta(struct btf_verifier_env *env, @@ -2662,7 +3380,7 @@ static struct btf_kind_operations func_proto_ops = { .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_func_proto_log, - .seq_show = btf_df_seq_show, + .show = btf_df_show, }; static s32 btf_func_check_meta(struct btf_verifier_env *env, @@ -2696,7 +3414,7 @@ static struct btf_kind_operations func_ops = { .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_ref_type_log, - .seq_show = btf_df_seq_show, + .show = btf_df_show, }; static s32 btf_var_check_meta(struct btf_verifier_env *env, @@ -2760,7 +3478,7 @@ static const struct btf_kind_operations var_ops = { .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_var_log, - .seq_show = btf_var_seq_show, + .show = btf_var_show, }; static s32 btf_datasec_check_meta(struct btf_verifier_env *env, @@ -2886,24 +3604,28 @@ static void btf_datasec_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } -static void btf_datasec_seq_show(const struct btf *btf, - const struct btf_type *t, u32 type_id, - void *data, u8 bits_offset, - struct seq_file *m) +static void btf_datasec_show(const struct btf *btf, + const struct btf_type *t, u32 type_id, + void *data, u8 bits_offset, + struct btf_show *show) { const struct btf_var_secinfo *vsi; const struct btf_type *var; u32 i; - seq_printf(m, "section (\"%s\") = {", __btf_name_by_offset(btf, t->name_off)); + if (!btf_show_start_type(show, t, type_id, data)) + return; + + btf_show_type_value(show, "section (\"%s\") = {", + __btf_name_by_offset(btf, t->name_off)); for_each_vsi(i, t, vsi) { var = btf_type_by_id(btf, vsi->type); if (i) - seq_puts(m, ","); - btf_type_ops(var)->seq_show(btf, var, vsi->type, - data + vsi->offset, bits_offset, m); + btf_show(show, ","); + btf_type_ops(var)->show(btf, var, vsi->type, + data + vsi->offset, bits_offset, show); } - seq_puts(m, "}"); + btf_show_end_type(show); } static const struct btf_kind_operations datasec_ops = { @@ -2912,7 +3634,7 @@ static const struct btf_kind_operations datasec_ops = { .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_datasec_log, - .seq_show = btf_datasec_seq_show, + .show = btf_datasec_show, }; static int btf_func_proto_check(struct btf_verifier_env *env, @@ -4606,12 +5328,93 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, return 0; } +static void btf_type_show(const struct btf *btf, u32 type_id, void *obj, + struct btf_show *show) +{ + const struct btf_type *t = btf_type_by_id(btf, type_id); + + show->btf = btf; + memset(&show->state, 0, sizeof(show->state)); + memset(&show->obj, 0, sizeof(show->obj)); + + btf_type_ops(t)->show(btf, t, type_id, obj, 0, show); +} + +static void btf_seq_show(struct btf_show *show, const char *fmt, + va_list args) +{ + seq_vprintf((struct seq_file *)show->target, fmt, args); +} + +static int btf_type_seq_show_flags(const struct btf *btf, u32 type_id, + void *obj, struct seq_file *m, u64 flags) +{ + struct btf_show sseq; + + sseq.target = m; + sseq.showfn = btf_seq_show; + sseq.flags = flags; + + btf_type_show(btf, type_id, obj, &sseq); + + return sseq.state.status; +} + void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m) { - const struct btf_type *t = btf_type_by_id(btf, type_id); + (void) btf_type_seq_show_flags(btf, type_id, obj, m, + BTF_SHOW_NONAME | BTF_SHOW_COMPACT | + BTF_SHOW_ZERO | BTF_SHOW_UNSAFE); +} + +struct btf_show_snprintf { + struct btf_show show; + int len_left; /* space left in string */ + int len; /* length we would have written */ +}; + +static void btf_snprintf_show(struct btf_show *show, const char *fmt, + va_list args) +{ + struct btf_show_snprintf *ssnprintf = (struct btf_show_snprintf *)show; + int len; + + len = vsnprintf(show->target, ssnprintf->len_left, fmt, args); + + if (len < 0) { + ssnprintf->len_left = 0; + ssnprintf->len = len; + } else if (len > ssnprintf->len_left) { + /* no space, drive on to get length we would have written */ + ssnprintf->len_left = 0; + ssnprintf->len += len; + } else { + ssnprintf->len_left -= len; + ssnprintf->len += len; + show->target += len; + } +} + +int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, + char *buf, int len, u64 flags) +{ + struct btf_show_snprintf ssnprintf; + + ssnprintf.show.target = buf; + ssnprintf.show.flags = flags; + ssnprintf.show.showfn = btf_snprintf_show; + ssnprintf.len_left = len; + ssnprintf.len = 0; + + btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf); + + /* If we encontered an error, return it. */ + if (ssnprintf.show.state.status) + return ssnprintf.show.state.status; - btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m); + /* Otherwise return length we would have written */ + return ssnprintf.len; } #ifdef CONFIG_PROC_FS -- cgit v1.2.3 From c4d0bfb45068d853a478b9067a95969b1886a30f Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 28 Sep 2020 12:31:05 +0100 Subject: bpf: Add bpf_snprintf_btf helper A helper is added to support tracing kernel type information in BPF using the BPF Type Format (BTF). Its signature is long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags); struct btf_ptr * specifies - a pointer to the data to be traced - the BTF id of the type of data pointed to - a flags field is provided for future use; these flags are not to be confused with the BTF_F_* flags below that control how the btf_ptr is displayed; the flags member of the struct btf_ptr may be used to disambiguate types in kernel versus module BTF, etc; the main distinction is the flags relate to the type and information needed in identifying it; not how it is displayed. For example a BPF program with a struct sk_buff *skb could do the following: static struct btf_ptr b = { }; b.ptr = skb; b.type_id = __builtin_btf_type_id(struct sk_buff, 1); bpf_snprintf_btf(str, sizeof(str), &b, sizeof(b), 0, 0); Default output looks like this: (struct sk_buff){ .transport_header = (__u16)65535, .mac_header = (__u16)65535, .end = (sk_buff_data_t)192, .head = (unsigned char *)0x000000007524fd8b, .data = (unsigned char *)0x000000007524fd8b, .truesize = (unsigned int)768, .users = (refcount_t){ .refs = (atomic_t){ .counter = (int)1, }, }, } Flags modifying display are as follows: - BTF_F_COMPACT: no formatting around type information - BTF_F_NONAME: no struct/union member names/types - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values; equivalent to %px. - BTF_F_ZERO: show zero-valued struct/union members; they are not displayed by default Signed-off-by: Alan Maguire Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1601292670-1616-4-git-send-email-alan.maguire@oracle.com --- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 4 +++ kernel/trace/bpf_trace.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c4811b139caa..403fb2341a86 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2216,6 +2216,7 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; +const struct bpf_func_proto bpf_snprintf_btf_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5cc7425ee476..e825441781ab 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -683,6 +683,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) if (!perfmon_capable()) return NULL; return bpf_get_trace_printk_proto(); + case BPF_FUNC_snprintf_btf: + if (!perfmon_capable()) + return NULL; + return &bpf_snprintf_btf_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; default: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2834866d379a..140e1be9dab6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,9 @@ #include #include +#include +#include + #include #include "trace_probe.h" @@ -1147,6 +1151,65 @@ static const struct bpf_func_proto bpf_d_path_proto = { .allowed = bpf_d_path_allowed, }; +#define BTF_F_ALL (BTF_F_COMPACT | BTF_F_NONAME | \ + BTF_F_PTR_RAW | BTF_F_ZERO) + +static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, + u64 flags, const struct btf **btf, + s32 *btf_id) +{ + const struct btf_type *t; + + if (unlikely(flags & ~(BTF_F_ALL))) + return -EINVAL; + + if (btf_ptr_size != sizeof(struct btf_ptr)) + return -EINVAL; + + *btf = bpf_get_btf_vmlinux(); + + if (IS_ERR_OR_NULL(*btf)) + return PTR_ERR(*btf); + + if (ptr->type_id > 0) + *btf_id = ptr->type_id; + else + return -EINVAL; + + if (*btf_id > 0) + t = btf_type_by_id(*btf, *btf_id); + if (*btf_id <= 0 || !t) + return -ENOENT; + + return 0; +} + +BPF_CALL_5(bpf_snprintf_btf, char *, str, u32, str_size, struct btf_ptr *, ptr, + u32, btf_ptr_size, u64, flags) +{ + const struct btf *btf; + s32 btf_id; + int ret; + + ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id); + if (ret) + return ret; + + return btf_type_snprintf_show(btf, btf_id, ptr->ptr, str, str_size, + flags); +} + +const struct bpf_func_proto bpf_snprintf_btf_proto = { + .func = bpf_snprintf_btf, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1233,6 +1296,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_task_stack_proto; case BPF_FUNC_copy_from_user: return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; + case BPF_FUNC_snprintf_btf: + return &bpf_snprintf_btf_proto; default: return NULL; } -- cgit v1.2.3 From af65320948b848dd1ae65eb4bc3e164c53e741d8 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 28 Sep 2020 12:31:07 +0100 Subject: bpf: Bump iter seq size to support BTF representation of large data structures BPF iter size is limited to PAGE_SIZE; if we wish to display BTF-based representations of larger kernel data structures such as task_struct, this will be insufficient. Suggested-by: Alexei Starovoitov Signed-off-by: Alan Maguire Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1601292670-1616-6-git-send-email-alan.maguire@oracle.com --- kernel/bpf/bpf_iter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 30833bbf3019..8f10e30ea0b0 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -88,8 +88,8 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, mutex_lock(&seq->lock); if (!seq->buf) { - seq->size = PAGE_SIZE; - seq->buf = kmalloc(seq->size, GFP_KERNEL); + seq->size = PAGE_SIZE << 3; + seq->buf = kvmalloc(seq->size, GFP_KERNEL); if (!seq->buf) { err = -ENOMEM; goto done; -- cgit v1.2.3 From eb411377aed9e27835e77ee0710ee8f4649958f3 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 28 Sep 2020 12:31:09 +0100 Subject: bpf: Add bpf_seq_printf_btf helper A helper is added to allow seq file writing of kernel data structures using vmlinux BTF. Its signature is long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags); Flags and struct btf_ptr definitions/use are identical to the bpf_snprintf_btf helper, and the helper returns 0 on success or a negative error value. Suggested-by: Alexei Starovoitov Signed-off-by: Alan Maguire Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1601292670-1616-8-git-send-email-alan.maguire@oracle.com --- kernel/bpf/btf.c | 4 ++-- kernel/bpf/core.c | 1 + kernel/trace/bpf_trace.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dcdd7109aa29..498e5e553825 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5346,8 +5346,8 @@ static void btf_seq_show(struct btf_show *show, const char *fmt, seq_vprintf((struct seq_file *)show->target, fmt, args); } -static int btf_type_seq_show_flags(const struct btf *btf, u32 type_id, - void *obj, struct seq_file *m, u64 flags) +int btf_type_seq_show_flags(const struct btf *btf, u32 type_id, + void *obj, struct seq_file *m, u64 flags) { struct btf_show sseq; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 403fb2341a86..c4ba45fa4fe1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2217,6 +2217,7 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_snprintf_btf_proto __weak; +const struct bpf_func_proto bpf_seq_printf_btf_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 140e1be9dab6..e118a83439c3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -71,6 +71,10 @@ static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, + u64 flags, const struct btf **btf, + s32 *btf_id); + /** * trace_call_bpf - invoke BPF program * @call: tracepoint event @@ -776,6 +780,31 @@ static const struct bpf_func_proto bpf_seq_write_proto = { .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; +BPF_CALL_4(bpf_seq_printf_btf, struct seq_file *, m, struct btf_ptr *, ptr, + u32, btf_ptr_size, u64, flags) +{ + const struct btf *btf; + s32 btf_id; + int ret; + + ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id); + if (ret) + return ret; + + return btf_type_seq_show_flags(btf, btf_id, ptr->ptr, m, flags); +} + +static const struct bpf_func_proto bpf_seq_printf_btf_proto = { + .func = bpf_seq_printf_btf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + static __always_inline int get_map_perf_counter(struct bpf_map *map, u64 flags, u64 *value, u64 *enabled, u64 *running) @@ -1695,6 +1724,10 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->expected_attach_type == BPF_TRACE_ITER ? &bpf_seq_write_proto : NULL; + case BPF_FUNC_seq_printf_btf: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_printf_btf_proto : + NULL; case BPF_FUNC_d_path: return &bpf_d_path_proto; default: -- cgit v1.2.3 From 6d1823ccc480866e571ab1206665d693aeb600cf Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 17 Sep 2020 16:01:50 +0800 Subject: lockdep: Optimize the memory usage of circular queue Qian Cai reported a BFS_EQUEUEFULL warning [1] after read recursive deadlock detection merged into tip tree recently. Unlike the previous lockep graph searching, which iterate every lock class (every node in the graph) exactly once, the graph searching for read recurisve deadlock detection needs to iterate every lock dependency (every edge in the graph) once, as a result, the maximum memory cost of the circular queue changes from O(V), where V is the number of lock classes (nodes or vertices) in the graph, to O(E), where E is the number of lock dependencies (edges), because every lock class or dependency gets enqueued once in the BFS. Therefore we hit the BFS_EQUEUEFULL case. However, actually we don't need to enqueue all dependencies for the BFS, because every time we enqueue a dependency, we almostly enqueue all other dependencies in the same dependency list ("almostly" is because we currently check before enqueue, so if a dependency doesn't pass the check stage we won't enqueue it, however, we can always do in reverse ordering), based on this, we can only enqueue the first dependency from a dependency list and every time we want to fetch a new dependency to work, we can either: 1) fetch the dependency next to the current dependency in the dependency list or 2) if the dependency in 1) doesn't exist, fetch the dependency from the queue. With this approach, the "max bfs queue depth" for a x86_64_defconfig + lockdep and selftest config kernel can get descreased from: max bfs queue depth: 201 to (after apply this patch) max bfs queue depth: 61 While I'm at it, clean up the code logic a little (e.g. directly return other than set a "ret" value and goto the "exit" label). [1]: https://lore.kernel.org/lkml/17343f6f7f2438fc376125384133c5ba70c2a681.camel@redhat.com/ Reported-by: Qian Cai Reported-by: syzbot+62ebe501c1ce9a91f68c@syzkaller.appspotmail.com Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200917080210.108095-1-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 99 +++++++++++++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index cccf4bc759c6..9560a4e55277 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1606,6 +1606,15 @@ static inline void bfs_init_rootb(struct lock_list *lock, lock->only_xr = (hlock->read != 0); } +static inline struct lock_list *__bfs_next(struct lock_list *lock, int offset) +{ + if (!lock || !lock->parent) + return NULL; + + return list_next_or_null_rcu(get_dep_list(lock->parent, offset), + &lock->entry, struct lock_list, entry); +} + /* * Breadth-First Search to find a strong path in the dependency graph. * @@ -1639,36 +1648,25 @@ static enum bfs_result __bfs(struct lock_list *source_entry, struct lock_list **target_entry, int offset) { + struct circular_queue *cq = &lock_cq; + struct lock_list *lock = NULL; struct lock_list *entry; - struct lock_list *lock; struct list_head *head; - struct circular_queue *cq = &lock_cq; - enum bfs_result ret = BFS_RNOMATCH; + unsigned int cq_depth; + bool first; lockdep_assert_locked(); - if (match(source_entry, data)) { - *target_entry = source_entry; - ret = BFS_RMATCH; - goto exit; - } - - head = get_dep_list(source_entry, offset); - if (list_empty(head)) - goto exit; - __cq_init(cq); __cq_enqueue(cq, source_entry); - while ((lock = __cq_dequeue(cq))) { - bool prev_only_xr; - - if (!lock->class) { - ret = BFS_EINVALIDNODE; - goto exit; - } + while ((lock = __bfs_next(lock, offset)) || (lock = __cq_dequeue(cq))) { + if (!lock->class) + return BFS_EINVALIDNODE; /* + * Step 1: check whether we already finish on this one. + * * If we have visited all the dependencies from this @lock to * others (iow, if we have visited all lock_list entries in * @lock->class->locks_{after,before}) we skip, otherwise go @@ -1680,13 +1678,13 @@ static enum bfs_result __bfs(struct lock_list *source_entry, else mark_lock_accessed(lock); - head = get_dep_list(lock, offset); - - prev_only_xr = lock->only_xr; - - list_for_each_entry_rcu(entry, head, entry) { - unsigned int cq_depth; - u8 dep = entry->dep; + /* + * Step 2: check whether prev dependency and this form a strong + * dependency path. + */ + if (lock->parent) { /* Parent exists, check prev dependency */ + u8 dep = lock->dep; + bool prev_only_xr = lock->parent->only_xr; /* * Mask out all -(S*)-> if we only have *R in previous @@ -1701,26 +1699,49 @@ static enum bfs_result __bfs(struct lock_list *source_entry, continue; /* If there are only -(*R)-> left, set that for the next step */ - entry->only_xr = !(dep & (DEP_SN_MASK | DEP_EN_MASK)); + lock->only_xr = !(dep & (DEP_SN_MASK | DEP_EN_MASK)); + } + /* + * Step 3: we haven't visited this and there is a strong + * dependency path to this, so check with @match. + */ + if (match(lock, data)) { + *target_entry = lock; + return BFS_RMATCH; + } + + /* + * Step 4: if not match, expand the path by adding the + * forward or backwards dependencis in the search + * + */ + first = true; + head = get_dep_list(lock, offset); + list_for_each_entry_rcu(entry, head, entry) { visit_lock_entry(entry, lock); - if (match(entry, data)) { - *target_entry = entry; - ret = BFS_RMATCH; - goto exit; - } - if (__cq_enqueue(cq, entry)) { - ret = BFS_EQUEUEFULL; - goto exit; - } + /* + * Note we only enqueue the first of the list into the + * queue, because we can always find a sibling + * dependency from one (see __bfs_next()), as a result + * the space of queue is saved. + */ + if (!first) + continue; + + first = false; + + if (__cq_enqueue(cq, entry)) + return BFS_EQUEUEFULL; + cq_depth = __cq_get_elem_count(cq); if (max_bfs_queue_depth < cq_depth) max_bfs_queue_depth = cq_depth; } } -exit: - return ret; + + return BFS_RNOMATCH; } static inline enum bfs_result -- cgit v1.2.3 From 851e6f61cd076954f9d521e0d79b173ad3a2453b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 29 Sep 2020 12:27:23 -0400 Subject: tracing: Fix trace_find_next_entry() accounting of temp buffer size The temp buffer size variable for trace_find_next_entry() was incorrectly being updated when the size did not change. The temp buffer size should only be updated when it is reallocated. This is mostly an issue when used with ftrace_dump(). That's because ftrace_dump() can not allocate a new buffer, and instead uses a temporary buffer with a fix size. But the variable that keeps track of that size is incorrectly updated with each call, and it could fall into the path that would try to reallocate the buffer and produce a warning. ------------[ cut here ]------------ WARNING: CPU: 1 PID: 1601 at kernel/trace/trace.c:3548 trace_find_next_entry+0xd0/0xe0 Modules linked in [..] CPU: 1 PID: 1601 Comm: bash Not tainted 5.9.0-rc5-test+ #521 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 RIP: 0010:trace_find_next_entry+0xd0/0xe0 Code: 40 21 00 00 4c 89 e1 31 d2 4c 89 ee 48 89 df e8 c6 9e ff ff 89 ab 54 21 00 00 5b 5d 41 5c 41 5d c3 48 63 d5 eb bf 31 c0 eb f0 <0f> 0b 48 63 d5 eb b4 66 0f 1f 84 00 00 00 00 00 53 48 8d 8f 60 21 RSP: 0018:ffff95a4f2e8bd70 EFLAGS: 00010046 RAX: ffffffff96679fc0 RBX: ffffffff97910de0 RCX: ffffffff96679fc0 RDX: ffff95a4f2e8bd98 RSI: ffff95a4ee321098 RDI: ffffffff97913000 RBP: 0000000000000018 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000046 R12: ffff95a4f2e8bd98 R13: 0000000000000000 R14: ffff95a4ee321098 R15: 00000000009aa301 FS: 00007f8565484740(0000) GS:ffff95a55aa40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055876bd43d90 CR3: 00000000b76e6003 CR4: 00000000001706e0 Call Trace: trace_print_lat_context+0x58/0x2d0 ? cpumask_next+0x16/0x20 print_trace_line+0x1a4/0x4f0 ftrace_dump.cold+0xad/0x12c __handle_sysrq.cold+0x51/0x126 write_sysrq_trigger+0x3f/0x4a proc_reg_write+0x53/0x80 vfs_write+0xca/0x210 ksys_write+0x70/0xf0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f8565579487 Code: 64 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 RSP: 002b:00007ffd40707948 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f8565579487 RDX: 0000000000000002 RSI: 000055876bd74de0 RDI: 0000000000000001 RBP: 000055876bd74de0 R08: 000000000000000a R09: 0000000000000001 R10: 000055876bdec280 R11: 0000000000000246 R12: 0000000000000002 R13: 00007f856564a500 R14: 0000000000000002 R15: 00007f856564a700 irq event stamp: 109958 ---[ end trace 7aab5b7e51484b00 ]--- Not only fix the updating of the temp buffer, but also do not free the temp buffer before a new buffer is allocated (there's no reason to not continue to use the current temp buffer if an allocation fails). Cc: stable@vger.kernel.org Fixes: 8e99cf91b99bb ("tracing: Do not allocate buffer in trace_find_next_entry() in atomic") Reported-by: Anna-Maria Behnsen Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2a7c26345e83..d3e5de717df2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3546,13 +3546,15 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, if (iter->ent && iter->ent != iter->temp) { if ((!iter->temp || iter->temp_size < iter->ent_size) && !WARN_ON_ONCE(iter->temp == static_temp_buf)) { - kfree(iter->temp); - iter->temp = kmalloc(iter->ent_size, GFP_KERNEL); - if (!iter->temp) + void *temp; + temp = kmalloc(iter->ent_size, GFP_KERNEL); + if (!temp) return NULL; + kfree(iter->temp); + iter->temp = temp; + iter->temp_size = iter->ent_size; } memcpy(iter->temp, iter->ent, iter->ent_size); - iter->temp_size = iter->ent_size; iter->ent = iter->temp; } entry = __find_next_entry(iter, ent_cpu, NULL, ent_ts); -- cgit v1.2.3 From b40341fad6cc2daa195f8090fd3348f18fff640a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 29 Sep 2020 12:40:31 -0400 Subject: ftrace: Move RCU is watching check after recursion check The first thing that the ftrace function callback helper functions should do is to check for recursion. Peter Zijlstra found that when "rcu_is_watching()" had its notrace removed, it caused perf function tracing to crash. This is because the call of rcu_is_watching() is tested before function recursion is checked and and if it is traced, it will cause an infinite recursion loop. rcu_is_watching() should still stay notrace, but to prevent this should never had crashed in the first place. The recursion prevention must be the first thing done in callback functions. Link: https://lore.kernel.org/r/20200929112541.GM2628@hirez.programming.kicks-ass.net Cc: stable@vger.kernel.org Cc: Paul McKenney Fixes: c68c0fa293417 ("ftrace: Have ftrace_ops_get_func() handle RCU and PER_CPU flags too") Acked-by: Peter Zijlstra (Intel) Reported-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 603255f5f085..541453927c82 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6993,16 +6993,14 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, { int bit; - if ((op->flags & FTRACE_OPS_FL_RCU) && !rcu_is_watching()) - return; - bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; preempt_disable_notrace(); - op->func(ip, parent_ip, op, regs); + if (!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) + op->func(ip, parent_ip, op, regs); preempt_enable_notrace(); trace_clear_recursion(bit); -- cgit v1.2.3 From 9d9aae53b96d0659e9085221453e48b7df9edbed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Sun, 27 Sep 2020 21:30:05 +0200 Subject: bpf/preload: Make sure Makefile cleans up after itself, and add .gitignore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Makefile in bpf/preload builds a local copy of libbpf, but does not properly clean up after itself. This can lead to subsequent compilation failures, since the feature detection cache is kept around which can lead subsequent detection to fail. Fix this by properly setting clean-files, and while we're at it, also add a .gitignore for the directory to ignore the build artifacts. Fixes: d71fa5c9763c ("bpf: Add kernel module with user mode driver that populates bpffs.") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200927193005.8459-1-toke@redhat.com --- kernel/bpf/preload/.gitignore | 4 ++++ kernel/bpf/preload/Makefile | 2 ++ 2 files changed, 6 insertions(+) create mode 100644 kernel/bpf/preload/.gitignore (limited to 'kernel') diff --git a/kernel/bpf/preload/.gitignore b/kernel/bpf/preload/.gitignore new file mode 100644 index 000000000000..856a4c5ad0dd --- /dev/null +++ b/kernel/bpf/preload/.gitignore @@ -0,0 +1,4 @@ +/FEATURE-DUMP.libbpf +/bpf_helper_defs.h +/feature +/bpf_preload_umd diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile index 12c7b62b9b6e..23ee310b6eb4 100644 --- a/kernel/bpf/preload/Makefile +++ b/kernel/bpf/preload/Makefile @@ -12,6 +12,8 @@ userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \ userprogs := bpf_preload_umd +clean-files := $(userprogs) bpf_helper_defs.h FEATURE-DUMP.libbpf staticobjs/ feature/ + bpf_preload_umd-objs := iterators/iterators.o bpf_preload_umd-userldlibs := $(LIBBPF_A) -lelf -lz -- cgit v1.2.3 From 3aac1ead5eb6b76f3d2b8d111e6de1c2de23fb34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 29 Sep 2020 14:45:50 +0200 Subject: bpf: Move prog->aux->linked_prog and trampoline into bpf_link on attach MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for allowing multiple attachments of freplace programs, move the references to the target program and trampoline into the bpf_tracing_link structure when that is created. To do this atomically, introduce a new mutex in prog->aux to protect writing to the two pointers to target prog and trampoline, and rename the members to make it clear that they are related. With this change, it is no longer possible to attach the same tracing program multiple times (detaching in-between), since the reference from the tracing program to the target disappears on the first attach. However, since the next patch will let the caller supply an attach target, that will also make it possible to attach to the same place multiple times. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/160138355059.48470.2503076992210324984.stgit@toke.dk --- kernel/bpf/btf.c | 6 +- kernel/bpf/core.c | 9 +- kernel/bpf/preload/iterators/iterators.bpf.c | 4 +- kernel/bpf/preload/iterators/iterators.skel.h | 444 +++++++++++++------------- kernel/bpf/syscall.c | 56 +++- kernel/bpf/trampoline.c | 12 +- kernel/bpf/verifier.c | 11 +- 7 files changed, 289 insertions(+), 253 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 498e5e553825..05816471bac6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4428,7 +4428,7 @@ errout: struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) { - struct bpf_prog *tgt_prog = prog->aux->linked_prog; + struct bpf_prog *tgt_prog = prog->aux->dst_prog; if (tgt_prog) { return tgt_prog->aux->btf; @@ -4455,7 +4455,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { const struct btf_type *t = prog->aux->attach_func_proto; - struct bpf_prog *tgt_prog = prog->aux->linked_prog; + struct bpf_prog *tgt_prog = prog->aux->dst_prog; struct btf *btf = bpf_prog_get_target_btf(prog); const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; @@ -5281,7 +5281,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, return -EFAULT; } if (prog_type == BPF_PROG_TYPE_EXT) - prog_type = prog->aux->linked_prog->type; + prog_type = prog->aux->dst_prog->type; t = btf_type_by_id(btf, t->type); if (!t || !btf_type_is_func_proto(t)) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c4ba45fa4fe1..cda674f1392f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -99,6 +99,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); mutex_init(&fp->aux->used_maps_mutex); + mutex_init(&fp->aux->dst_mutex); return fp; } @@ -255,6 +256,7 @@ void __bpf_prog_free(struct bpf_prog *fp) { if (fp->aux) { mutex_destroy(&fp->aux->used_maps_mutex); + mutex_destroy(&fp->aux->dst_mutex); free_percpu(fp->aux->stats); kfree(fp->aux->poke_tab); kfree(fp->aux); @@ -2138,7 +2140,8 @@ static void bpf_prog_free_deferred(struct work_struct *work) if (aux->prog->has_callchain_buf) put_callchain_buffers(); #endif - bpf_trampoline_put(aux->trampoline); + if (aux->dst_trampoline) + bpf_trampoline_put(aux->dst_trampoline); for (i = 0; i < aux->func_cnt; i++) bpf_jit_free(aux->func[i]); if (aux->func_cnt) { @@ -2154,8 +2157,8 @@ void bpf_prog_free(struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; - if (aux->linked_prog) - bpf_prog_put(aux->linked_prog); + if (aux->dst_prog) + bpf_prog_put(aux->dst_prog); INIT_WORK(&aux->work, bpf_prog_free_deferred); schedule_work(&aux->work); } diff --git a/kernel/bpf/preload/iterators/iterators.bpf.c b/kernel/bpf/preload/iterators/iterators.bpf.c index 5ded550b2ed6..52aa7b38e8b8 100644 --- a/kernel/bpf/preload/iterators/iterators.bpf.c +++ b/kernel/bpf/preload/iterators/iterators.bpf.c @@ -42,7 +42,7 @@ struct bpf_prog_aux { __u32 id; char name[16]; const char *attach_func_name; - struct bpf_prog *linked_prog; + struct bpf_prog *dst_prog; struct bpf_func_info *func_info; struct btf *btf; }; @@ -108,7 +108,7 @@ int dump_bpf_prog(struct bpf_iter__bpf_prog *ctx) BPF_SEQ_PRINTF(seq, "%4u %-16s %s %s\n", aux->id, get_name(aux->btf, aux->func_info[0].type_id, aux->name), - aux->attach_func_name, aux->linked_prog->aux->name); + aux->attach_func_name, aux->dst_prog->aux->name); return 0; } char LICENSE[] SEC("license") = "GPL"; diff --git a/kernel/bpf/preload/iterators/iterators.skel.h b/kernel/bpf/preload/iterators/iterators.skel.h index c3171357dc4f..cf9a6a94b3a4 100644 --- a/kernel/bpf/preload/iterators/iterators.skel.h +++ b/kernel/bpf/preload/iterators/iterators.skel.h @@ -47,7 +47,7 @@ iterators_bpf__open_opts(const struct bpf_object_open_opts *opts) { struct iterators_bpf *obj; - obj = (typeof(obj))calloc(1, sizeof(*obj)); + obj = (struct iterators_bpf *)calloc(1, sizeof(*obj)); if (!obj) return NULL; if (iterators_bpf__create_skeleton(obj)) @@ -105,7 +105,7 @@ iterators_bpf__create_skeleton(struct iterators_bpf *obj) { struct bpf_object_skeleton *s; - s = (typeof(s))calloc(1, sizeof(*s)); + s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s)); if (!s) return -1; obj->skeleton = s; @@ -117,7 +117,7 @@ iterators_bpf__create_skeleton(struct iterators_bpf *obj) /* maps */ s->map_cnt = 1; s->map_skel_sz = sizeof(*s->maps); - s->maps = (typeof(s->maps))calloc(s->map_cnt, s->map_skel_sz); + s->maps = (struct bpf_map_skeleton *)calloc(s->map_cnt, s->map_skel_sz); if (!s->maps) goto err; @@ -128,7 +128,7 @@ iterators_bpf__create_skeleton(struct iterators_bpf *obj) /* programs */ s->prog_cnt = 2; s->prog_skel_sz = sizeof(*s->progs); - s->progs = (typeof(s->progs))calloc(s->prog_cnt, s->prog_skel_sz); + s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz); if (!s->progs) goto err; @@ -140,10 +140,10 @@ iterators_bpf__create_skeleton(struct iterators_bpf *obj) s->progs[1].prog = &obj->progs.dump_bpf_prog; s->progs[1].link = &obj->links.dump_bpf_prog; - s->data_sz = 7128; + s->data_sz = 7176; s->data = (void *)"\ \x7f\x45\x4c\x46\x02\x01\x01\0\0\0\0\0\0\0\0\0\x01\0\xf7\0\x01\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\x18\x18\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x0f\0\ +\0\0\0\0\0\0\0\0\0\0\0\x48\x18\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x0f\0\ \x0e\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\ \x1a\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\ \x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\0\ @@ -164,7 +164,7 @@ iterators_bpf__create_skeleton(struct iterators_bpf *obj) \x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\ \0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\ \x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\ -\0\x04\0\0\0\x85\0\0\0\x04\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\ +\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\ \x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\ \0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\ \x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\ @@ -176,230 +176,232 @@ iterators_bpf__create_skeleton(struct iterators_bpf *obj) \x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\ \x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\ \x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\x47\x50\x4c\0\x9f\ -\xeb\x01\0\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\0\x05\0\0\0\0\0\0\0\0\0\ +\xeb\x01\0\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\x09\x05\0\0\0\0\0\0\0\0\0\ \x02\x02\0\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\ \0\0\0\x04\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\ \0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\ -\0\0\0\x20\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xa3\0\0\0\x03\0\0\x04\x18\0\ -\0\0\xb1\0\0\0\x09\0\0\0\0\0\0\0\xb5\0\0\0\x0b\0\0\0\x40\0\0\0\xc0\0\0\0\x0b\0\ -\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xc8\0\0\0\0\0\0\x07\0\0\0\0\xd1\0\0\ -\0\0\0\0\x08\x0c\0\0\0\xd7\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\x98\x01\0\0\x03\ -\0\0\x04\x18\0\0\0\xa0\x01\0\0\x0e\0\0\0\0\0\0\0\xa3\x01\0\0\x11\0\0\0\x20\0\0\ -\0\xa8\x01\0\0\x0e\0\0\0\xa0\0\0\0\xb4\x01\0\0\0\0\0\x08\x0f\0\0\0\xba\x01\0\0\ -\0\0\0\x01\x04\0\0\0\x20\0\0\0\xc7\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\ -\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xcc\x01\0\0\0\0\0\x01\x04\ -\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x30\x02\0\0\x02\0\0\x04\x10\0\0\0\ -\x13\0\0\0\x03\0\0\0\0\0\0\0\x43\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\ -\x18\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x48\x02\0\0\x01\0\ -\0\x0c\x16\0\0\0\x94\x02\0\0\x01\0\0\x04\x08\0\0\0\x9d\x02\0\0\x19\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\x02\x1a\0\0\0\xee\x02\0\0\x06\0\0\x04\x38\0\0\0\xa0\x01\0\0\ -\x0e\0\0\0\0\0\0\0\xa3\x01\0\0\x11\0\0\0\x20\0\0\0\xfb\x02\0\0\x1b\0\0\0\xc0\0\ -\0\0\x0c\x03\0\0\x15\0\0\0\0\x01\0\0\x18\x03\0\0\x1d\0\0\0\x40\x01\0\0\x22\x03\ +\0\0\0\x20\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xaf\0\0\0\x03\0\0\x04\x18\0\ +\0\0\xbd\0\0\0\x09\0\0\0\0\0\0\0\xc1\0\0\0\x0b\0\0\0\x40\0\0\0\xcc\0\0\0\x0b\0\ +\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd4\0\0\0\0\0\0\x07\0\0\0\0\xdd\0\0\ +\0\0\0\0\x08\x0c\0\0\0\xe3\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xa4\x01\0\0\x03\ +\0\0\x04\x18\0\0\0\xac\x01\0\0\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\ +\0\xb4\x01\0\0\x0e\0\0\0\xa0\0\0\0\xc0\x01\0\0\0\0\0\x08\x0f\0\0\0\xc6\x01\0\0\ +\0\0\0\x01\x04\0\0\0\x20\0\0\0\xd3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\ +\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xd8\x01\0\0\0\0\0\x01\x04\ +\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x3c\x02\0\0\x02\0\0\x04\x10\0\0\0\ +\x13\0\0\0\x03\0\0\0\0\0\0\0\x4f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\ +\x18\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x54\x02\0\0\x01\0\ +\0\x0c\x16\0\0\0\xa0\x02\0\0\x01\0\0\x04\x08\0\0\0\xa9\x02\0\0\x19\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\x02\x1a\0\0\0\xfa\x02\0\0\x06\0\0\x04\x38\0\0\0\xac\x01\0\0\ +\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\0\x07\x03\0\0\x1b\0\0\0\xc0\0\ +\0\0\x18\x03\0\0\x15\0\0\0\0\x01\0\0\x21\x03\0\0\x1d\0\0\0\x40\x01\0\0\x2b\x03\ \0\0\x1e\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\ -\0\0\0\0\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x6c\x03\0\0\x02\0\ -\0\x04\x08\0\0\0\x7a\x03\0\0\x0e\0\0\0\0\0\0\0\x83\x03\0\0\x0e\0\0\0\x20\0\0\0\ -\x22\x03\0\0\x03\0\0\x04\x18\0\0\0\x8d\x03\0\0\x1b\0\0\0\0\0\0\0\x95\x03\0\0\ -\x21\0\0\0\x40\0\0\0\x9b\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\ -\0\0\0\0\0\0\0\0\x02\x24\0\0\0\x9f\x03\0\0\x01\0\0\x04\x04\0\0\0\xaa\x03\0\0\ -\x0e\0\0\0\0\0\0\0\x13\x04\0\0\x01\0\0\x04\x04\0\0\0\x1c\x04\0\0\x0e\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x92\x04\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x75\x03\0\0\x02\0\ +\0\x04\x08\0\0\0\x83\x03\0\0\x0e\0\0\0\0\0\0\0\x8c\x03\0\0\x0e\0\0\0\x20\0\0\0\ +\x2b\x03\0\0\x03\0\0\x04\x18\0\0\0\x96\x03\0\0\x1b\0\0\0\0\0\0\0\x9e\x03\0\0\ +\x21\0\0\0\x40\0\0\0\xa4\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\ +\0\0\0\0\0\0\0\0\x02\x24\0\0\0\xa8\x03\0\0\x01\0\0\x04\x04\0\0\0\xb3\x03\0\0\ +\x0e\0\0\0\0\0\0\0\x1c\x04\0\0\x01\0\0\x04\x04\0\0\0\x25\x04\0\0\x0e\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x9b\x04\0\0\0\0\0\ \x0e\x25\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\ -\xa6\x04\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\ -\x12\0\0\0\x20\0\0\0\xbc\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\ -\0\0\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xd1\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xe8\x04\0\0\0\0\0\x0e\ -\x2d\0\0\0\x01\0\0\0\xf0\x04\0\0\x04\0\0\x0f\0\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\ +\xaf\x04\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\ +\x12\0\0\0\x20\0\0\0\xc5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\ +\0\0\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xda\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xf1\x04\0\0\0\0\0\x0e\ +\x2d\0\0\0\x01\0\0\0\xf9\x04\0\0\x04\0\0\x0f\0\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\ \0\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\ -\0\0\x11\0\0\0\xf8\x04\0\0\x01\0\0\x0f\0\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\ +\0\0\x11\0\0\0\x01\x05\0\0\x01\0\0\x0f\0\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\ \x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\ \x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\ \x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\ -\x30\x3a\x30\0\x2f\x77\x2f\x6e\x65\x74\x2d\x6e\x65\x78\x74\x2f\x6b\x65\x72\x6e\ -\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\ -\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\ -\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\ -\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\ -\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\ -\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\ -\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x6c\x6f\x6e\x67\x20\ -\x6c\x6f\x6e\x67\x20\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x30\x3a\ -\x31\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\ -\x61\x70\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\ -\x21\x6d\x61\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\ -\x5f\x6e\x75\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\ -\x65\x71\x5f\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\ -\x20\x3d\x3d\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\ -\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\ -\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\ -\x74\x72\x69\x65\x73\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\ -\x64\0\x6e\x61\x6d\x65\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\ -\x75\x33\x32\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\ -\x72\0\x5f\x5f\x41\x52\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\ -\x5f\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\ -\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\ -\x2c\x20\x6d\x61\x70\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\ -\x65\x2c\x20\x6d\x61\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\ -\x29\x3b\0\x7d\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\ -\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\ -\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\ -\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\ -\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\ -\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\ -\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\ -\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\ -\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ -\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\ -\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\ -\x63\x5f\x6e\x61\x6d\x65\0\x6c\x69\x6e\x6b\x65\x64\x5f\x70\x72\x6f\x67\0\x66\ -\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\ -\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\ -\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\ -\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\ -\x74\x66\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\ -\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\ -\x6e\x67\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\ -\x64\x65\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\ -\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\ -\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\ -\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\ -\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\ -\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\ -\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\ -\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\ -\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\ -\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\ -\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\ -\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\ -\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\ -\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\ -\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\ -\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\ -\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\ -\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x9f\xeb\x01\0\x20\0\0\0\0\0\0\0\x24\0\ -\0\0\x24\0\0\0\x44\x02\0\0\x68\x02\0\0\xa4\x01\0\0\x08\0\0\0\x31\0\0\0\x01\0\0\ -\0\0\0\0\0\x07\0\0\0\x56\x02\0\0\x01\0\0\0\0\0\0\0\x17\0\0\0\x10\0\0\0\x31\0\0\ -\0\x09\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x40\x01\0\x08\0\0\0\x42\0\0\0\x7b\ -\0\0\0\x24\x40\x01\0\x10\0\0\0\x42\0\0\0\xf2\0\0\0\x1d\x48\x01\0\x18\0\0\0\x42\ -\0\0\0\x13\x01\0\0\x06\x50\x01\0\x20\0\0\0\x42\0\0\0\x22\x01\0\0\x1d\x44\x01\0\ -\x28\0\0\0\x42\0\0\0\x47\x01\0\0\x06\x5c\x01\0\x38\0\0\0\x42\0\0\0\x5a\x01\0\0\ -\x03\x60\x01\0\x70\0\0\0\x42\0\0\0\xe0\x01\0\0\x02\x68\x01\0\xf0\0\0\0\x42\0\0\ -\0\x2e\x02\0\0\x01\x70\x01\0\x56\x02\0\0\x1a\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\ -\x1e\x84\x01\0\x08\0\0\0\x42\0\0\0\x7b\0\0\0\x24\x84\x01\0\x10\0\0\0\x42\0\0\0\ -\x64\x02\0\0\x1f\x8c\x01\0\x18\0\0\0\x42\0\0\0\x88\x02\0\0\x06\x98\x01\0\x20\0\ -\0\0\x42\0\0\0\xa1\x02\0\0\x0e\xa4\x01\0\x28\0\0\0\x42\0\0\0\x22\x01\0\0\x1d\ -\x88\x01\0\x30\0\0\0\x42\0\0\0\x47\x01\0\0\x06\xa8\x01\0\x40\0\0\0\x42\0\0\0\ -\xb3\x02\0\0\x03\xac\x01\0\x80\0\0\0\x42\0\0\0\x26\x03\0\0\x02\xb4\x01\0\xb8\0\ -\0\0\x42\0\0\0\x61\x03\0\0\x06\x08\x01\0\xd0\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\ -\xd8\0\0\0\x42\0\0\0\xb2\x03\0\0\x0f\x14\x01\0\xe0\0\0\0\x42\0\0\0\xc7\x03\0\0\ -\x2d\x18\x01\0\xf0\0\0\0\x42\0\0\0\xfe\x03\0\0\x0d\x10\x01\0\0\x01\0\0\x42\0\0\ -\0\0\0\0\0\0\0\0\0\x08\x01\0\0\x42\0\0\0\xc7\x03\0\0\x02\x18\x01\0\x20\x01\0\0\ -\x42\0\0\0\x25\x04\0\0\x0d\x1c\x01\0\x38\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x40\ -\x01\0\0\x42\0\0\0\x25\x04\0\0\x0d\x1c\x01\0\x58\x01\0\0\x42\0\0\0\x25\x04\0\0\ -\x0d\x1c\x01\0\x60\x01\0\0\x42\0\0\0\x53\x04\0\0\x1b\x20\x01\0\x68\x01\0\0\x42\ -\0\0\0\x53\x04\0\0\x06\x20\x01\0\x70\x01\0\0\x42\0\0\0\x76\x04\0\0\x0d\x28\x01\ -\0\x78\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x80\x01\0\0\x42\0\0\0\x26\x03\0\0\x02\ -\xb4\x01\0\xf8\x01\0\0\x42\0\0\0\x2e\x02\0\0\x01\xc4\x01\0\x10\0\0\0\x31\0\0\0\ -\x07\0\0\0\0\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\ -\0\0\0\x10\0\0\0\x02\0\0\0\xee\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x1e\x01\0\0\0\ -\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xee\0\0\0\0\0\ -\0\0\xa0\0\0\0\x0d\0\0\0\x1e\x01\0\0\0\0\0\0\x56\x02\0\0\x12\0\0\0\0\0\0\0\x14\ -\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x14\0\ -\0\0\xee\0\0\0\0\0\0\0\x20\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\ -\0\x1e\x01\0\0\0\0\0\0\x80\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1a\0\0\ -\0\xee\0\0\0\0\0\0\0\xa8\0\0\0\x1a\0\0\0\x59\x03\0\0\0\0\0\0\xb0\0\0\0\x1a\0\0\ -\0\x5d\x03\0\0\0\0\0\0\xc0\0\0\0\x1f\0\0\0\x8b\x03\0\0\0\0\0\0\xd8\0\0\0\x20\0\ -\0\0\xee\0\0\0\0\0\0\0\xf0\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x24\0\ -\0\0\x3e\0\0\0\0\0\0\0\x50\x01\0\0\x1a\0\0\0\xee\0\0\0\0\0\0\0\x60\x01\0\0\x20\ -\0\0\0\x4d\x04\0\0\0\0\0\0\x88\x01\0\0\x1a\0\0\0\x1e\x01\0\0\0\0\0\0\x98\x01\0\ -\0\x1a\0\0\0\x8e\x04\0\0\0\0\0\0\xa0\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd6\0\0\0\0\0\x02\0\x70\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\xc8\0\0\0\0\0\x02\0\xf0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\xcf\0\0\0\0\0\x03\0\x78\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xc1\0\0\0\0\0\x03\0\x80\ -\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xba\0\0\0\0\0\x03\0\xf8\x01\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\x14\0\0\0\x01\0\x04\0\0\0\0\0\0\0\0\0\x23\0\0\0\0\0\0\0\xf4\0\0\0\ -\x01\0\x04\0\x23\0\0\0\0\0\0\0\x0e\0\0\0\0\0\0\0\x28\0\0\0\x01\0\x04\0\x31\0\0\ -\0\0\0\0\0\x20\0\0\0\0\0\0\0\xdd\0\0\0\x01\0\x04\0\x51\0\0\0\0\0\0\0\x11\0\0\0\ -\0\0\0\0\0\0\0\0\x03\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\x03\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\xb2\0\0\0\x11\0\x05\0\0\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0\x3d\0\0\0\x12\ -\0\x02\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\x5b\0\0\0\x12\0\x03\0\0\0\0\0\0\0\0\ -\0\x08\x02\0\0\0\0\0\0\x48\0\0\0\0\0\0\0\x01\0\0\0\x0c\0\0\0\xc8\0\0\0\0\0\0\0\ -\x01\0\0\0\x0c\0\0\0\x50\0\0\0\0\0\0\0\x01\0\0\0\x0c\0\0\0\xd0\x01\0\0\0\0\0\0\ -\x01\0\0\0\x0c\0\0\0\xf0\x03\0\0\0\0\0\0\x0a\0\0\0\x0c\0\0\0\xfc\x03\0\0\0\0\0\ -\0\x0a\0\0\0\x0c\0\0\0\x08\x04\0\0\0\0\0\0\x0a\0\0\0\x0c\0\0\0\x14\x04\0\0\0\0\ -\0\0\x0a\0\0\0\x0c\0\0\0\x2c\x04\0\0\0\0\0\0\0\0\0\0\x0d\0\0\0\x2c\0\0\0\0\0\0\ -\0\0\0\0\0\x0a\0\0\0\x3c\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x50\0\0\0\0\0\0\0\0\0\ -\0\0\x0a\0\0\0\x60\0\0\0\0\0\0\0\0\0\0\0\x0a\0\0\0\x70\0\0\0\0\0\0\0\0\0\0\0\ -\x0a\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\x0a\0\0\0\x90\0\0\0\0\0\0\0\0\0\0\0\x0a\0\ -\0\0\xa0\0\0\0\0\0\0\0\0\0\0\0\x0a\0\0\0\xb0\0\0\0\0\0\0\0\0\0\0\0\x0a\0\0\0\ -\xc0\0\0\0\0\0\0\0\0\0\0\0\x0a\0\0\0\xd0\0\0\0\0\0\0\0\0\0\0\0\x0a\0\0\0\xe8\0\ -\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xf8\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x08\x01\0\0\ -\0\0\0\0\0\0\0\0\x0b\0\0\0\x18\x01\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x28\x01\0\0\0\ -\0\0\0\0\0\0\0\x0b\0\0\0\x38\x01\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x48\x01\0\0\0\0\ -\0\0\0\0\0\0\x0b\0\0\0\x58\x01\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x68\x01\0\0\0\0\0\ -\0\0\0\0\0\x0b\0\0\0\x78\x01\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x88\x01\0\0\0\0\0\0\ -\0\0\0\0\x0b\0\0\0\x98\x01\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa8\x01\0\0\0\0\0\0\0\ -\0\0\0\x0b\0\0\0\xb8\x01\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc8\x01\0\0\0\0\0\0\0\0\ -\0\0\x0b\0\0\0\xd8\x01\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xe8\x01\0\0\0\0\0\0\0\0\0\ -\0\x0b\0\0\0\xf8\x01\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x28\x02\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\x38\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x48\x02\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\x58\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x68\x02\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\x78\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x94\x02\0\0\0\0\0\0\0\0\0\0\ -\x0a\0\0\0\xa4\x02\0\0\0\0\0\0\0\0\0\0\x0a\0\0\0\xb4\x02\0\0\0\0\0\0\0\0\0\0\ -\x0a\0\0\0\xc4\x02\0\0\0\0\0\0\0\0\0\0\x0a\0\0\0\xd4\x02\0\0\0\0\0\0\0\0\0\0\ -\x0a\0\0\0\xe4\x02\0\0\0\0\0\0\0\0\0\0\x0a\0\0\0\xf4\x02\0\0\0\0\0\0\0\0\0\0\ -\x0a\0\0\0\x0c\x03\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x1c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\x2c\x03\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x3c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\x4c\x03\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x5c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\x6c\x03\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x7c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\x8c\x03\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x9c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\xac\x03\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xbc\x03\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\xcc\x03\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xdc\x03\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\xec\x03\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xfc\x03\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\x0c\x04\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x1c\x04\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\x4e\x4f\x41\x42\x43\x44\x4d\0\x2e\x74\x65\x78\x74\0\x2e\x72\x65\x6c\ -\x2e\x42\x54\x46\x2e\x65\x78\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\ -\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\ -\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ -\x61\x70\0\x2e\x72\x65\x6c\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\ -\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x2e\x72\x65\x6c\x69\x74\ -\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x2e\x6c\x6c\x76\x6d\x5f\x61\x64\ -\x64\x72\x73\x69\x67\0\x6c\x69\x63\x65\x6e\x73\x65\0\x2e\x73\x74\x72\x74\x61\ -\x62\0\x2e\x73\x79\x6d\x74\x61\x62\0\x2e\x72\x6f\x64\x61\x74\x61\0\x2e\x72\x65\ -\x6c\x2e\x42\x54\x46\0\x4c\x49\x43\x45\x4e\x53\x45\0\x4c\x42\x42\x31\x5f\x37\0\ -\x4c\x42\x42\x31\x5f\x36\0\x4c\x42\x42\x30\x5f\x34\0\x4c\x42\x42\x31\x5f\x33\0\ -\x4c\x42\x42\x30\x5f\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\ -\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ -\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x6c\x72\x75\x61\x2f\x62\x75\x69\x6c\ +\x64\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\ +\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\ +\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\ +\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\ +\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\ +\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\ +\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\ +\x65\0\x5f\x5f\x75\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\x20\x75\x6e\ +\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\ +\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\ +\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\ +\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\ +\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\ +\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\ +\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\ +\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\ +\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\ +\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\ +\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\ +\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\ +\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\ +\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\ +\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\ +\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\ +\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\ +\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\ +\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\ +\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\ +\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\ +\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\ +\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\ +\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\ +\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\ +\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\ +\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\ +\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\ +\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\ +\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\ +\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\ +\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\ +\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\ +\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\ +\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\ +\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\ +\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\ +\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\ +\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\ +\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\ +\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\ +\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\ +\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\ +\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\ +\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\ +\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\ +\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ +\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ +\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\ +\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\ +\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\ +\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x9f\ +\xeb\x01\0\x20\0\0\0\0\0\0\0\x24\0\0\0\x24\0\0\0\x44\x02\0\0\x68\x02\0\0\xa4\ +\x01\0\0\x08\0\0\0\x31\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\x62\x02\0\0\x01\0\0\0\ +\0\0\0\0\x17\0\0\0\x10\0\0\0\x31\0\0\0\x09\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\ +\x1e\x40\x01\0\x08\0\0\0\x42\0\0\0\x87\0\0\0\x24\x40\x01\0\x10\0\0\0\x42\0\0\0\ +\xfe\0\0\0\x1d\x48\x01\0\x18\0\0\0\x42\0\0\0\x1f\x01\0\0\x06\x50\x01\0\x20\0\0\ +\0\x42\0\0\0\x2e\x01\0\0\x1d\x44\x01\0\x28\0\0\0\x42\0\0\0\x53\x01\0\0\x06\x5c\ +\x01\0\x38\0\0\0\x42\0\0\0\x66\x01\0\0\x03\x60\x01\0\x70\0\0\0\x42\0\0\0\xec\ +\x01\0\0\x02\x68\x01\0\xf0\0\0\0\x42\0\0\0\x3a\x02\0\0\x01\x70\x01\0\x62\x02\0\ +\0\x1a\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\x1e\x84\x01\0\x08\0\0\0\x42\0\0\0\x87\ +\0\0\0\x24\x84\x01\0\x10\0\0\0\x42\0\0\0\x70\x02\0\0\x1f\x8c\x01\0\x18\0\0\0\ +\x42\0\0\0\x94\x02\0\0\x06\x98\x01\0\x20\0\0\0\x42\0\0\0\xad\x02\0\0\x0e\xa4\ +\x01\0\x28\0\0\0\x42\0\0\0\x2e\x01\0\0\x1d\x88\x01\0\x30\0\0\0\x42\0\0\0\x53\ +\x01\0\0\x06\xa8\x01\0\x40\0\0\0\x42\0\0\0\xbf\x02\0\0\x03\xac\x01\0\x80\0\0\0\ +\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xb8\0\0\0\x42\0\0\0\x6a\x03\0\0\x06\x08\ +\x01\0\xd0\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\x42\0\0\0\xbb\x03\0\0\x0f\ +\x14\x01\0\xe0\0\0\0\x42\0\0\0\xd0\x03\0\0\x2d\x18\x01\0\xf0\0\0\0\x42\0\0\0\ +\x07\x04\0\0\x0d\x10\x01\0\0\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x08\x01\0\0\x42\ +\0\0\0\xd0\x03\0\0\x02\x18\x01\0\x20\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\ +\0\x38\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x40\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\ +\x1c\x01\0\x58\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\0\x60\x01\0\0\x42\0\0\ +\0\x5c\x04\0\0\x1b\x20\x01\0\x68\x01\0\0\x42\0\0\0\x5c\x04\0\0\x06\x20\x01\0\ +\x70\x01\0\0\x42\0\0\0\x7f\x04\0\0\x0d\x28\x01\0\x78\x01\0\0\x42\0\0\0\0\0\0\0\ +\0\0\0\0\x80\x01\0\0\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xf8\x01\0\0\x42\0\0\0\ +\x3a\x02\0\0\x01\xc4\x01\0\x10\0\0\0\x31\0\0\0\x07\0\0\0\0\0\0\0\x02\0\0\0\x3e\ +\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\0\xfa\0\ +\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\0\ +\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xfa\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\0\x2a\x01\ +\0\0\0\0\0\0\x62\x02\0\0\x12\0\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\ +\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xfa\0\0\0\0\0\0\0\x20\0\0\0\ +\x18\0\0\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x80\0\0\0\ +\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\xa8\0\0\0\ +\x1a\0\0\0\x62\x03\0\0\0\0\0\0\xb0\0\0\0\x1a\0\0\0\x66\x03\0\0\0\0\0\0\xc0\0\0\ +\0\x1f\0\0\0\x94\x03\0\0\0\0\0\0\xd8\0\0\0\x20\0\0\0\xfa\0\0\0\0\0\0\0\xf0\0\0\ +\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\ +\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\x60\x01\0\0\x20\0\0\0\x56\x04\0\0\0\0\0\0\x88\ +\x01\0\0\x1a\0\0\0\x2a\x01\0\0\0\0\0\0\x98\x01\0\0\x1a\0\0\0\x97\x04\0\0\0\0\0\ +\0\xa0\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x91\0\0\0\x04\0\xf1\xff\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xe6\0\0\ +\0\0\0\x02\0\x70\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\0\0\x02\0\xf0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\xdf\0\0\0\0\0\x03\0\x78\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\xd1\0\0\0\0\0\x03\0\x80\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xca\0\0\0\0\0\x03\0\ +\xf8\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x14\0\0\0\x01\0\x04\0\0\0\0\0\0\0\0\0\x23\ +\0\0\0\0\0\0\0\x04\x01\0\0\x01\0\x04\0\x23\0\0\0\0\0\0\0\x0e\0\0\0\0\0\0\0\x28\ +\0\0\0\x01\0\x04\0\x31\0\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\xed\0\0\0\x01\0\x04\0\ +\x51\0\0\0\0\0\0\0\x11\0\0\0\0\0\0\0\0\0\0\0\x03\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\x03\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\ +\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xc2\0\0\0\x11\0\x05\0\0\0\0\0\0\0\0\0\ +\x04\0\0\0\0\0\0\0\x3d\0\0\0\x12\0\x02\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\x5b\ +\0\0\0\x12\0\x03\0\0\0\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\x48\0\0\0\0\0\0\0\x01\0\ +\0\0\x0d\0\0\0\xc8\0\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\x50\0\0\0\0\0\0\0\x01\0\0\ +\0\x0d\0\0\0\xd0\x01\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\xf0\x03\0\0\0\0\0\0\x0a\0\ +\0\0\x0d\0\0\0\xfc\x03\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x08\x04\0\0\0\0\0\0\x0a\ +\0\0\0\x0d\0\0\0\x14\x04\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x2c\x04\0\0\0\0\0\0\0\ +\0\0\0\x0e\0\0\0\x2c\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x3c\0\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x50\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x60\0\0\0\0\0\0\0\0\0\0\0\x0b\0\ +\0\0\x70\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\ +\x90\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xb0\0\ +\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xd0\0\0\0\0\ +\0\0\0\0\0\0\0\x0b\0\0\0\xe8\0\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\0\0\0\0\0\0\0\ +\0\0\0\0\x0c\0\0\0\x08\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x01\0\0\0\0\0\0\0\ +\0\0\0\x0c\0\0\0\x28\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x01\0\0\0\0\0\0\0\0\ +\0\0\x0c\0\0\0\x48\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x01\0\0\0\0\0\0\0\0\0\ +\0\x0c\0\0\0\x68\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x01\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x88\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x98\x01\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\xa8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xb8\x01\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\xc8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xd8\x01\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\xe8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\x01\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x28\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x02\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x48\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x02\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x68\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x02\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x94\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa4\x02\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\xb4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc4\x02\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\xd4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xe4\x02\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\xf4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x0c\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x1c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x2c\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x3c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4c\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x5c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x6c\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x7c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x8c\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x9c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xac\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\xbc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xcc\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\xdc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xec\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\xfc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x0c\x04\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x1c\x04\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4d\x4e\x40\x41\x42\x43\x4c\0\ +\x2e\x74\x65\x78\x74\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\x2e\x65\x78\x74\0\x64\ +\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\ +\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\ +\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x2e\x72\x65\x6c\x69\x74\x65\ +\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\ +\x72\x6f\x67\0\x2e\x72\x65\x6c\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\ +\x67\0\x2e\x6c\x6c\x76\x6d\x5f\x61\x64\x64\x72\x73\x69\x67\0\x6c\x69\x63\x65\ +\x6e\x73\x65\0\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\ +\x2e\x73\x74\x72\x74\x61\x62\0\x2e\x73\x79\x6d\x74\x61\x62\0\x2e\x72\x6f\x64\ +\x61\x74\x61\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\0\x4c\x49\x43\x45\x4e\x53\x45\0\ +\x4c\x42\x42\x31\x5f\x37\0\x4c\x42\x42\x31\x5f\x36\0\x4c\x42\x42\x30\x5f\x34\0\ +\x4c\x42\x42\x31\x5f\x33\0\x4c\x42\x42\x30\x5f\x33\0\x64\x75\x6d\x70\x5f\x62\ +\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x64\x75\x6d\ +\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\x4e\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\ -\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\x6d\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\x01\0\0\0\0\0\0\x08\ -\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa1\0\0\0\ -\x01\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x48\x03\0\0\0\0\0\0\x62\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x89\0\0\0\x01\0\0\0\x03\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xaa\x03\0\0\0\0\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xad\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\xae\x03\0\0\0\0\0\0\x34\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\xe2\x0c\0\0\0\0\0\0\x2c\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\x99\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\x11\0\0\0\ -\0\0\0\x80\x01\0\0\0\0\0\0\x0e\0\0\0\x0d\0\0\0\x08\0\0\0\0\0\0\0\x18\0\0\0\0\0\ -\0\0\x4a\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x90\x12\0\0\0\0\0\0\ -\x20\0\0\0\0\0\0\0\x08\0\0\0\x02\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x69\ -\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb0\x12\0\0\0\0\0\0\x20\0\0\0\ -\0\0\0\0\x08\0\0\0\x03\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\xa9\0\0\0\x09\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd0\x12\0\0\0\0\0\0\x50\0\0\0\0\0\0\0\ -\x08\0\0\0\x06\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x07\0\0\0\x09\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x13\0\0\0\0\0\0\xe0\x03\0\0\0\0\0\0\x08\0\0\ -\0\x07\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x7b\0\0\0\x03\x4c\xff\x6f\0\0\ -\0\x80\0\0\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x91\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\x07\x17\0\0\0\0\0\0\x0a\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0"; +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x01\0\0\ +\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x4e\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\x6d\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\x40\x01\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\xb1\0\0\0\x01\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x48\x03\0\ +\0\0\0\0\0\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\x89\0\0\0\x01\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xaa\x03\0\0\0\0\0\0\x04\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xbd\0\0\0\x01\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xae\x03\0\0\0\0\0\0\x3d\x09\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x01\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x0c\0\0\0\0\0\0\x2c\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa9\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\x18\x11\0\0\0\0\0\0\x98\x01\0\0\0\0\0\0\x0e\0\0\0\x0e\0\0\0\x08\0\0\ +\0\0\0\0\0\x18\0\0\0\0\0\0\0\x4a\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\xb0\x12\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x02\0\0\0\x08\0\0\0\0\0\0\0\ +\x10\0\0\0\0\0\0\0\x69\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd0\x12\ +\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x03\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\ +\0\0\0\0\xb9\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xf0\x12\0\0\0\0\0\ +\0\x50\0\0\0\0\0\0\0\x08\0\0\0\x06\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\ +\x07\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\x13\0\0\0\0\0\0\xe0\ +\x03\0\0\0\0\0\0\x08\0\0\0\x07\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x7b\0\ +\0\0\x03\x4c\xff\x6f\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\0\0\0\0\x07\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa1\0\0\0\x03\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x27\x17\0\0\0\0\0\0\x1a\x01\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; return 0; err: diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3bc2ed2e171b..e6a0a948e30c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2154,14 +2154,14 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) prog->expected_attach_type = attr->expected_attach_type; prog->aux->attach_btf_id = attr->attach_btf_id; if (attr->attach_prog_fd) { - struct bpf_prog *tgt_prog; + struct bpf_prog *dst_prog; - tgt_prog = bpf_prog_get(attr->attach_prog_fd); - if (IS_ERR(tgt_prog)) { - err = PTR_ERR(tgt_prog); + dst_prog = bpf_prog_get(attr->attach_prog_fd); + if (IS_ERR(dst_prog)) { + err = PTR_ERR(dst_prog); goto free_prog_nouncharge; } - prog->aux->linked_prog = tgt_prog; + prog->aux->dst_prog = dst_prog; } prog->aux->offload_requested = !!attr->prog_ifindex; @@ -2498,11 +2498,23 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) struct bpf_tracing_link { struct bpf_link link; enum bpf_attach_type attach_type; + struct bpf_trampoline *trampoline; + struct bpf_prog *tgt_prog; }; static void bpf_tracing_link_release(struct bpf_link *link) { - WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog)); + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog, + tr_link->trampoline)); + + bpf_trampoline_put(tr_link->trampoline); + + /* tgt_prog is NULL if target is a kernel function */ + if (tr_link->tgt_prog) + bpf_prog_put(tr_link->tgt_prog); } static void bpf_tracing_link_dealloc(struct bpf_link *link) @@ -2545,7 +2557,9 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { static int bpf_tracing_prog_attach(struct bpf_prog *prog) { struct bpf_link_primer link_primer; + struct bpf_prog *tgt_prog = NULL; struct bpf_tracing_link *link; + struct bpf_trampoline *tr; int err; switch (prog->type) { @@ -2583,19 +2597,37 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) &bpf_tracing_link_lops, prog); link->attach_type = prog->expected_attach_type; - err = bpf_link_prime(&link->link, &link_primer); - if (err) { - kfree(link); - goto out_put_prog; + mutex_lock(&prog->aux->dst_mutex); + + if (!prog->aux->dst_trampoline) { + err = -ENOENT; + goto out_unlock; } + tr = prog->aux->dst_trampoline; + tgt_prog = prog->aux->dst_prog; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) + goto out_unlock; - err = bpf_trampoline_link_prog(prog); + err = bpf_trampoline_link_prog(prog, tr); if (err) { bpf_link_cleanup(&link_primer); - goto out_put_prog; + link = NULL; + goto out_unlock; } + link->tgt_prog = tgt_prog; + link->trampoline = tr; + + prog->aux->dst_prog = NULL; + prog->aux->dst_trampoline = NULL; + mutex_unlock(&prog->aux->dst_mutex); + return bpf_link_settle(&link_primer); +out_unlock: + mutex_unlock(&prog->aux->dst_mutex); + kfree(link); out_put_prog: bpf_prog_put(prog); return err; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 28c1899949e0..35c5887d82ff 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -261,14 +261,12 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) } } -int bpf_trampoline_link_prog(struct bpf_prog *prog) +int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; - struct bpf_trampoline *tr; int err = 0; int cnt; - tr = prog->aux->trampoline; kind = bpf_attach_type_to_tramp(prog); mutex_lock(&tr->mutex); if (tr->extension_prog) { @@ -301,7 +299,7 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog) } hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]); tr->progs_cnt[kind]++; - err = bpf_trampoline_update(prog->aux->trampoline); + err = bpf_trampoline_update(tr); if (err) { hlist_del(&prog->aux->tramp_hlist); tr->progs_cnt[kind]--; @@ -312,13 +310,11 @@ out: } /* bpf_trampoline_unlink_prog() should never fail. */ -int bpf_trampoline_unlink_prog(struct bpf_prog *prog) +int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; - struct bpf_trampoline *tr; int err; - tr = prog->aux->trampoline; kind = bpf_attach_type_to_tramp(prog); mutex_lock(&tr->mutex); if (kind == BPF_TRAMP_REPLACE) { @@ -330,7 +326,7 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog) } hlist_del(&prog->aux->tramp_hlist); tr->progs_cnt[kind]--; - err = bpf_trampoline_update(prog->aux->trampoline); + err = bpf_trampoline_update(tr); out: mutex_unlock(&tr->mutex); return err; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2ee343dda73a..a97a2f2964e3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2648,8 +2648,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, static enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog) { - return prog->aux->linked_prog ? prog->aux->linked_prog->type - : prog->type; + return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type; } static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, @@ -11475,7 +11474,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; - struct bpf_prog *tgt_prog = prog->aux->linked_prog; + struct bpf_prog *tgt_prog = prog->aux->dst_prog; struct bpf_attach_target_info tgt_info = {}; u32 btf_id = prog->aux->attach_btf_id; struct bpf_trampoline *tr; @@ -11501,6 +11500,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return ret; if (tgt_prog && prog->type == BPF_PROG_TYPE_EXT) { + /* to make freplace equivalent to their targets, they need to + * inherit env->ops and expected_attach_type for the rest of the + * verification + */ env->ops = bpf_verifier_ops[tgt_prog->type]; prog->expected_attach_type = tgt_prog->expected_attach_type; } @@ -11529,7 +11532,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (!tr) return -ENOMEM; - prog->aux->trampoline = tr; + prog->aux->dst_trampoline = tr; return 0; } -- cgit v1.2.3 From 4a1e7c0c63e02daad751842b7880f9bbcdfb6e89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 29 Sep 2020 14:45:51 +0200 Subject: bpf: Support attaching freplace programs to multiple attach points MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This enables support for attaching freplace programs to multiple attach points. It does this by amending the UAPI for bpf_link_Create with a target btf ID that can be used to supply the new attachment point along with the target program fd. The target must be compatible with the target that was supplied at program load time. The implementation reuses the checks that were factored out of check_attach_btf_id() to ensure compatibility between the BTF types of the old and new attachment. If these match, a new bpf_tracing_link will be created for the new attach target, allowing multiple attachments to co-exist simultaneously. The code could theoretically support multiple-attach of other types of tracing programs as well, but since I don't have a use case for any of those, there is no API support for doing so. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/160138355169.48470.17165680973640685368.stgit@toke.dk --- kernel/bpf/syscall.c | 132 ++++++++++++++++++++++++++++++++++++++++++++------ kernel/bpf/verifier.c | 10 ++++ 2 files changed, 126 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e6a0a948e30c..f1528c2a6927 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -2554,12 +2555,15 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { .fill_link_info = bpf_tracing_link_fill_link_info, }; -static int bpf_tracing_prog_attach(struct bpf_prog *prog) +static int bpf_tracing_prog_attach(struct bpf_prog *prog, + int tgt_prog_fd, + u32 btf_id) { struct bpf_link_primer link_primer; struct bpf_prog *tgt_prog = NULL; + struct bpf_trampoline *tr = NULL; struct bpf_tracing_link *link; - struct bpf_trampoline *tr; + u64 key = 0; int err; switch (prog->type) { @@ -2588,6 +2592,28 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) goto out_put_prog; } + if (!!tgt_prog_fd != !!btf_id) { + err = -EINVAL; + goto out_put_prog; + } + + if (tgt_prog_fd) { + /* For now we only allow new targets for BPF_PROG_TYPE_EXT */ + if (prog->type != BPF_PROG_TYPE_EXT) { + err = -EINVAL; + goto out_put_prog; + } + + tgt_prog = bpf_prog_get(tgt_prog_fd); + if (IS_ERR(tgt_prog)) { + err = PTR_ERR(tgt_prog); + tgt_prog = NULL; + goto out_put_prog; + } + + key = bpf_trampoline_compute_key(tgt_prog, btf_id); + } + link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; @@ -2599,12 +2625,58 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) mutex_lock(&prog->aux->dst_mutex); - if (!prog->aux->dst_trampoline) { + /* There are a few possible cases here: + * + * - if prog->aux->dst_trampoline is set, the program was just loaded + * and not yet attached to anything, so we can use the values stored + * in prog->aux + * + * - if prog->aux->dst_trampoline is NULL, the program has already been + * attached to a target and its initial target was cleared (below) + * + * - if tgt_prog != NULL, the caller specified tgt_prog_fd + + * target_btf_id using the link_create API. + * + * - if tgt_prog == NULL when this function was called using the old + * raw_tracepoint_open API, and we need a target from prog->aux + * + * The combination of no saved target in prog->aux, and no target + * specified on load is illegal, and we reject that here. + */ + if (!prog->aux->dst_trampoline && !tgt_prog) { err = -ENOENT; goto out_unlock; } - tr = prog->aux->dst_trampoline; - tgt_prog = prog->aux->dst_prog; + + if (!prog->aux->dst_trampoline || + (key && key != prog->aux->dst_trampoline->key)) { + /* If there is no saved target, or the specified target is + * different from the destination specified at load time, we + * need a new trampoline and a check for compatibility + */ + struct bpf_attach_target_info tgt_info = {}; + + err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id, + &tgt_info); + if (err) + goto out_unlock; + + tr = bpf_trampoline_get(key, &tgt_info); + if (!tr) { + err = -ENOMEM; + goto out_unlock; + } + } else { + /* The caller didn't specify a target, or the target was the + * same as the destination supplied during program load. This + * means we can reuse the trampoline and reference from program + * load time, and there is no need to allocate a new one. This + * can only happen once for any program, as the saved values in + * prog->aux are cleared below. + */ + tr = prog->aux->dst_trampoline; + tgt_prog = prog->aux->dst_prog; + } err = bpf_link_prime(&link->link, &link_primer); if (err) @@ -2620,15 +2692,31 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) link->tgt_prog = tgt_prog; link->trampoline = tr; + /* Always clear the trampoline and target prog from prog->aux to make + * sure the original attach destination is not kept alive after a + * program is (re-)attached to another target. + */ + if (prog->aux->dst_prog && + (tgt_prog_fd || tr != prog->aux->dst_trampoline)) + /* got extra prog ref from syscall, or attaching to different prog */ + bpf_prog_put(prog->aux->dst_prog); + if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline) + /* we allocated a new trampoline, so free the old one */ + bpf_trampoline_put(prog->aux->dst_trampoline); + prog->aux->dst_prog = NULL; prog->aux->dst_trampoline = NULL; mutex_unlock(&prog->aux->dst_mutex); return bpf_link_settle(&link_primer); out_unlock: + if (tr && tr != prog->aux->dst_trampoline) + bpf_trampoline_put(tr); mutex_unlock(&prog->aux->dst_mutex); kfree(link); out_put_prog: + if (tgt_prog_fd && tgt_prog) + bpf_prog_put(tgt_prog); bpf_prog_put(prog); return err; } @@ -2742,7 +2830,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) tp_name = prog->aux->attach_func_name; break; } - return bpf_tracing_prog_attach(prog); + return bpf_tracing_prog_attach(prog, 0, 0); case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: if (strncpy_from_user(buf, @@ -3926,10 +4014,15 @@ err_put: static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { - if (attr->link_create.attach_type == BPF_TRACE_ITER && - prog->expected_attach_type == BPF_TRACE_ITER) - return bpf_iter_link_attach(attr, prog); + if (attr->link_create.attach_type != prog->expected_attach_type) + return -EINVAL; + if (prog->expected_attach_type == BPF_TRACE_ITER) + return bpf_iter_link_attach(attr, prog); + else if (prog->type == BPF_PROG_TYPE_EXT) + return bpf_tracing_prog_attach(prog, + attr->link_create.target_fd, + attr->link_create.target_btf_id); return -EINVAL; } @@ -3943,18 +4036,25 @@ static int link_create(union bpf_attr *attr) if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; - ptype = attach_type_to_prog_type(attr->link_create.attach_type); - if (ptype == BPF_PROG_TYPE_UNSPEC) - return -EINVAL; - - prog = bpf_prog_get_type(attr->link_create.prog_fd, ptype); + prog = bpf_prog_get(attr->link_create.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); ret = bpf_prog_attach_check_attach_type(prog, attr->link_create.attach_type); if (ret) - goto err_out; + goto out; + + if (prog->type == BPF_PROG_TYPE_EXT) { + ret = tracing_bpf_link_attach(attr, prog); + goto out; + } + + ptype = attach_type_to_prog_type(attr->link_create.attach_type); + if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { + ret = -EINVAL; + goto out; + } switch (ptype) { case BPF_PROG_TYPE_CGROUP_SKB: @@ -3982,7 +4082,7 @@ static int link_create(union bpf_attr *attr) ret = -EINVAL; } -err_out: +out: if (ret < 0) bpf_prog_put(prog); return ret; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a97a2f2964e3..015a1c074b6b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11404,6 +11404,11 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (!btf_type_is_func_proto(t)) return -EINVAL; + if ((prog->aux->saved_dst_prog_type || prog->aux->saved_dst_attach_type) && + (!tgt_prog || prog->aux->saved_dst_prog_type != tgt_prog->type || + prog->aux->saved_dst_attach_type != tgt_prog->expected_attach_type)) + return -EINVAL; + if (tgt_prog && conservative) t = NULL; @@ -11512,6 +11517,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = tgt_info.tgt_type; prog->aux->attach_func_name = tgt_info.tgt_name; + if (tgt_prog) { + prog->aux->saved_dst_prog_type = tgt_prog->type; + prog->aux->saved_dst_attach_type = tgt_prog->expected_attach_type; + } + if (prog->expected_attach_type == BPF_TRACE_RAW_TP) { prog->aux->attach_btf_trace = true; return 0; -- cgit v1.2.3 From 43bc2874e779c17932c447bb06ef4d5188578bdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 29 Sep 2020 14:45:52 +0200 Subject: bpf: Fix context type resolving for extension programs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Eelco reported we can't properly access arguments if the tracing program is attached to extension program. Having following program: SEC("classifier/test_pkt_md_access") int test_pkt_md_access(struct __sk_buff *skb) with its extension: SEC("freplace/test_pkt_md_access") int test_pkt_md_access_new(struct __sk_buff *skb) and tracing that extension with: SEC("fentry/test_pkt_md_access_new") int BPF_PROG(fentry, struct sk_buff *skb) It's not possible to access skb argument in the fentry program, with following error from verifier: ; int BPF_PROG(fentry, struct sk_buff *skb) 0: (79) r1 = *(u64 *)(r1 +0) invalid bpf_context access off=0 size=8 The problem is that btf_ctx_access gets the context type for the traced program, which is in this case the extension. But when we trace extension program, we want to get the context type of the program that the extension is attached to, so we can access the argument properly in the trace program. This version of the patch is tweaked slightly from Jiri's original one, since the refactoring in the previous patches means we have to get the target prog type from the new variable in prog->aux instead of directly from the target prog. Reported-by: Eelco Chaudron Suggested-by: Jiri Olsa Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/160138355278.48470.17057040257274725638.stgit@toke.dk --- kernel/bpf/btf.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 05816471bac6..4d0ee7839fdb 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4582,7 +4582,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->reg_type = PTR_TO_BTF_ID; if (tgt_prog) { - ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); + enum bpf_prog_type tgt_type; + + if (tgt_prog->type == BPF_PROG_TYPE_EXT) + tgt_type = tgt_prog->aux->saved_dst_prog_type; + else + tgt_type = tgt_prog->type; + + ret = btf_translate_to_vmlinux(log, btf, t, tgt_type, arg); if (ret > 0) { info->btf_id = ret; return true; -- cgit v1.2.3 From 59f8bcca1ef6a5326f7c127e11b949e745bfa230 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 30 Sep 2020 11:07:33 +0206 Subject: printk: avoid and/or handle record truncation If a reader provides a buffer that is smaller than the message text, the @text_len field of @info will have a value larger than the buffer size. If readers blindly read @text_len bytes of data without checking the size, they will read beyond their buffer. Add this check to record_print_text() to properly recognize when such truncation has occurred. Add a maximum size argument to the ringbuffer function to extend records so that records can not be created that are larger than the buffer size of readers. When extending records (LOG_CONT), do not extend records beyond LOG_LINE_MAX since that is the maximum size available in the buffers used by consoles and syslog. Fixes: f5f022e53b87 ("printk: reimplement log_cont using record extension") Reported-by: Marek Szyprowski Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200930090134.8723-2-john.ogness@linutronix.de --- kernel/printk/printk.c | 9 ++++++++- kernel/printk/printk_ringbuffer.c | 12 ++++++++++-- kernel/printk/printk_ringbuffer.h | 2 +- 3 files changed, 19 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1fe3d0cb2fe0..15cd73da0528 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1349,6 +1349,13 @@ static size_t record_print_text(struct printk_record *r, bool syslog, size_t len = 0; char *next; + /* + * If the message was truncated because the buffer was not large + * enough, treat the available text as if it were the full text. + */ + if (text_len > buf_size) + text_len = buf_size; + prefix_len = info_print_prefix(r->info, syslog, time, prefix); /* @@ -1903,7 +1910,7 @@ static size_t log_output(int facility, int level, enum log_flags lflags, struct printk_record r; prb_rec_init_wr(&r, text_len); - if (prb_reserve_in_last(&e, prb, &r, caller_id)) { + if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { memcpy(&r.text_buf[r.info->text_len], text, text_len); r.info->text_len += text_len; if (lflags & LOG_NEWLINE) { diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 13b94b92342e..2493348a1631 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -202,7 +202,8 @@ * // specify additional 5 bytes text space to extend * prb_rec_init_wr(&r, 5); * - * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id())) { + * // try to extend, but only if it does not exceed 32 bytes + * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id()), 32) { * snprintf(&r.text_buf[r.info->text_len], * r.text_buf_size - r.info->text_len, "hello"); * @@ -1309,6 +1310,7 @@ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, * @rb: The ringbuffer to re-reserve and extend data in. * @r: The record structure to allocate buffers for. * @caller_id: The caller ID of the caller (reserving writer). + * @max_size: Fail if the extended size would be greater than this. * * This is the public function available to writers to re-reserve and extend * data. @@ -1343,7 +1345,7 @@ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, * @r->info->text_len after concatenating. */ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, - struct printk_record *r, u32 caller_id) + struct printk_record *r, u32 caller_id, unsigned int max_size) { struct prb_desc_ring *desc_ring = &rb->desc_ring; struct printk_info *info; @@ -1389,6 +1391,9 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; + if (r->text_buf_size > max_size) + goto fail; + r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, &d->text_blk_lpos, id); } else { @@ -1410,6 +1415,9 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; + if (r->text_buf_size > max_size) + goto fail; + r->text_buf = data_realloc(rb, &rb->text_data_ring, r->text_buf_size, &d->text_blk_lpos, id); } diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 0adaa685d1ca..5dc9d022db07 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -303,7 +303,7 @@ static inline void prb_rec_init_wr(struct printk_record *r, bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r); bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, - struct printk_record *r, u32 caller_id); + struct printk_record *r, u32 caller_id, unsigned int max_size); void prb_commit(struct prb_reserved_entry *e); void prb_final_commit(struct prb_reserved_entry *e); -- cgit v1.2.3 From 0463d04ea03a12d8a5aad42197a5945dfd78d7d6 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 30 Sep 2020 11:07:34 +0206 Subject: printk: reduce setup_text_buf size to LOG_LINE_MAX @setup_text_buf only copies the original text messages (without any prefix or extended text). It only needs to be LOG_LINE_MAX in size. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200930090134.8723-3-john.ogness@linutronix.de --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 15cd73da0528..f5c2945d1e3f 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1095,7 +1095,7 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, return prb_record_text_space(&e); } -static char setup_text_buf[CONSOLE_EXT_LOG_MAX] __initdata; +static char setup_text_buf[LOG_LINE_MAX] __initdata; void __init setup_log_buf(int early) { -- cgit v1.2.3 From 95d325185c06cbef1c5be97825265a7129a03512 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 25 Sep 2020 17:11:10 +0800 Subject: cgroup: remove redundant kernfs_activate in cgroup_setup_root() This step is already done in rebind_subsystems(). Not necessary to do it again. Signed-off-by: Wei Yang Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index dd247747ec14..809b13588124 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2006,7 +2006,6 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); - kernfs_activate(root_cgrp->kn); ret = 0; goto out; -- cgit v1.2.3 From 65026da59cda16baf6c3e98b74ec439f366e468f Mon Sep 17 00:00:00 2001 From: Jouni Roivas Date: Wed, 30 Sep 2020 19:42:42 +0300 Subject: cgroup: Zero sized write should be no-op Do not report failure on zero sized writes, and handle them as no-op. There's issues for example in case of writev() when there's iovec containing zero buffer as a first one. It's expected writev() on below example to successfully perform the write to specified writable cgroup file expecting integer value, and to return 2. For now it's returning value -1, and skipping the write: int writetest(int fd) { const char *buf1 = ""; const char *buf2 = "1\n"; struct iovec iov[2] = { { .iov_base = (void*)buf1, .iov_len = 0 }, { .iov_base = (void*)buf2, .iov_len = 2 } }; return writev(fd, iov, 2); } This patch fixes the issue by checking if there's nothing to write, and handling the write as no-op by just returning 0. Signed-off-by: Jouni Roivas Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 809b13588124..e41c21819ba0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3681,6 +3681,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, struct cgroup_subsys_state *css; int ret; + if (!nbytes) + return 0; + /* * If namespaces are delegation boundaries, disallow writes to * files in an non-init namespace root from inside the namespace -- cgit v1.2.3 From 92acdc58ab11af66fcaef485433fde61b5e32fac Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Sep 2020 17:18:16 +0200 Subject: bpf, net: Rework cookie generator as per-cpu one With its use in BPF, the cookie generator can be called very frequently in particular when used out of cgroup v2 hooks (e.g. connect / sendmsg) and attached to the root cgroup, for example, when used in v1/v2 mixed environments. In particular, when there's a high churn on sockets in the system there can be many parallel requests to the bpf_get_socket_cookie() and bpf_get_netns_cookie() helpers which then cause contention on the atomic counter. As similarly done in f991bd2e1421 ("fs: introduce a per-cpu last_ino allocator"), add a small helper library that both can use for the 64 bit counters. Given this can be called from different contexts, we also need to deal with potential nested calls even though in practice they are considered extremely rare. One idea as suggested by Eric Dumazet was to use a reverse counter for this situation since we don't expect 64 bit overflows anyways; that way, we can avoid bigger gaps in the 64 bit counter space compared to just batch-wise increase. Even on machines with small number of cores (e.g. 4) the cookie generation shrinks from min/max/med/avg (ns) of 22/50/40/38.9 down to 10/35/14/17.3 when run in parallel from multiple CPUs. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Cc: Eric Dumazet Link: https://lore.kernel.org/bpf/8a80b8d27d3c49f9a14e1d5213c19d8be87d1dc8.1601477936.git.daniel@iogearbox.net --- kernel/bpf/reuseport_array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 5a2ba1182493..a55cd542f2ce 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -191,7 +191,7 @@ int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, rcu_read_lock(); sk = reuseport_array_lookup_elem(map, key); if (sk) { - *(u64 *)value = sock_gen_cookie(sk); + *(u64 *)value = __sock_gen_cookie(sk); err = 0; } else { err = -ENOENT; -- cgit v1.2.3 From 0f2122045b946241a9e549c2a76cea54fa58a7ff Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 13 Sep 2020 13:09:39 -0600 Subject: io_uring: don't rely on weak ->files references Grab actual references to the files_struct. To avoid circular references issues due to this, we add a per-task note that keeps track of what io_uring contexts a task has used. When the tasks execs or exits its assigned files, we cancel requests based on this tracking. With that, we can grab proper references to the files table, and no longer need to rely on stashing away ring_fd and ring_file to check if the ring_fd may have been closed. Cc: stable@vger.kernel.org # v5.5+ Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- kernel/fork.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index da8d360fb032..a3795aaaab5c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -95,6 +95,7 @@ #include #include #include +#include #include #include @@ -728,6 +729,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); + io_uring_free(tsk); cgroup_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); @@ -1983,6 +1985,10 @@ static __latent_entropy struct task_struct *copy_process( p->vtime.state = VTIME_INACTIVE; #endif +#ifdef CONFIG_IO_URING + p->io_uring = NULL; +#endif + #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); #endif -- cgit v1.2.3 From 792caccc4526bb489e054f9ab61d7c024b15dea2 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 30 Sep 2020 15:49:26 -0700 Subject: bpf: Introduce BPF_F_PRESERVE_ELEMS for perf event array Currently, perf event in perf event array is removed from the array when the map fd used to add the event is closed. This behavior makes it difficult to the share perf events with perf event array. Introduce perf event map that keeps the perf event open with a new flag BPF_F_PRESERVE_ELEMS. With this flag set, perf events in the array are not removed when the original map fd is closed. Instead, the perf event will stay in the map until 1) it is explicitly removed from the array; or 2) the array is freed. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200930224927.1936644-2-songliubraving@fb.com --- kernel/bpf/arraymap.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index e5fd31268ae0..bd777dd6f967 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -15,7 +15,8 @@ #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK) + (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \ + BPF_F_PRESERVE_ELEMS) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -64,6 +65,10 @@ int array_map_alloc_check(union bpf_attr *attr) attr->map_flags & BPF_F_MMAPABLE) return -EINVAL; + if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY && + attr->map_flags & BPF_F_PRESERVE_ELEMS) + return -EINVAL; + if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. @@ -1134,6 +1139,9 @@ static void perf_event_fd_array_release(struct bpf_map *map, struct bpf_event_entry *ee; int i; + if (map->map_flags & BPF_F_PRESERVE_ELEMS) + return; + rcu_read_lock(); for (i = 0; i < array->map.max_entries; i++) { ee = READ_ONCE(array->ptrs[i]); @@ -1143,12 +1151,19 @@ static void perf_event_fd_array_release(struct bpf_map *map, rcu_read_unlock(); } +static void perf_event_fd_array_map_free(struct bpf_map *map) +{ + if (map->map_flags & BPF_F_PRESERVE_ELEMS) + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + static int perf_event_array_map_btf_id; const struct bpf_map_ops perf_event_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, - .map_free = fd_array_map_free, + .map_free = perf_event_fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, -- cgit v1.2.3 From 4c4197eda710d197c7474abcceb3f8789ec22a64 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Sun, 27 Sep 2020 22:15:30 +0100 Subject: kgdb: Add NOKPROBE labels on the trap handler functions Currently kgdb honours the kprobe blocklist but doesn't place its own trap handling code on the list. Add labels to discourage attempting to use kgdb to debug itself. Not every functions that executes from the trap handler needs to be marked up: relatively early in the trap handler execution (just after we bring the other CPUs to a halt) all breakpoints are replaced with the original opcodes. This patch marks up code in the debug_core that executes between trap entry and the breakpoints being deactivated and, also, code that executes between breakpoint activation and trap exit. To be clear these changes are not sufficient to make recursive trapping impossible since they do not include library calls made during kgdb's entry/exit logic. However going much further whilst we are sharing the kprobe blocklist risks reducing the capabilities of kprobe and this would be a bad trade off (especially so given kgdb's users are currently conditioned to avoid recursive traps). Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20200927211531.1380577-3-daniel.thompson@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 6b9383fa8278..0761cbcbdd6d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -169,12 +169,14 @@ int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); return err; } +NOKPROBE_SYMBOL(kgdb_arch_set_breakpoint); int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) { return copy_to_kernel_nofault((char *)bpt->bpt_addr, (char *)bpt->saved_instr, BREAK_INSTR_SIZE); } +NOKPROBE_SYMBOL(kgdb_arch_remove_breakpoint); int __weak kgdb_validate_break_address(unsigned long addr) { @@ -204,6 +206,7 @@ unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) { return instruction_pointer(regs); } +NOKPROBE_SYMBOL(kgdb_arch_pc); int __weak kgdb_arch_init(void) { @@ -214,6 +217,7 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs) { return 0; } +NOKPROBE_SYMBOL(kgdb_skipexception); #ifdef CONFIG_SMP @@ -235,6 +239,7 @@ void __weak kgdb_call_nmi_hook(void *ignored) */ kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs()); } +NOKPROBE_SYMBOL(kgdb_call_nmi_hook); void __weak kgdb_roundup_cpus(void) { @@ -268,6 +273,7 @@ void __weak kgdb_roundup_cpus(void) kgdb_info[cpu].rounding_up = false; } } +NOKPROBE_SYMBOL(kgdb_roundup_cpus); #endif @@ -294,6 +300,7 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) /* Force flush instruction cache if it was outside the mm */ flush_icache_range(addr, addr + BREAK_INSTR_SIZE); } +NOKPROBE_SYMBOL(kgdb_flush_swbreak_addr); /* * SW breakpoint management: @@ -321,6 +328,7 @@ int dbg_activate_sw_breakpoints(void) } return ret; } +NOKPROBE_SYMBOL(dbg_activate_sw_breakpoints); int dbg_set_sw_break(unsigned long addr) { @@ -384,6 +392,7 @@ int dbg_deactivate_sw_breakpoints(void) } return ret; } +NOKPROBE_SYMBOL(dbg_deactivate_sw_breakpoints); int dbg_remove_sw_break(unsigned long addr) { @@ -505,6 +514,7 @@ static int kgdb_io_ready(int print_wait) } return 1; } +NOKPROBE_SYMBOL(kgdb_io_ready); static int kgdb_reenter_check(struct kgdb_state *ks) { @@ -552,6 +562,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) return 1; } +NOKPROBE_SYMBOL(kgdb_reenter_check); static void dbg_touch_watchdogs(void) { @@ -559,6 +570,7 @@ static void dbg_touch_watchdogs(void) clocksource_touch_watchdog(); rcu_cpu_stall_reset(); } +NOKPROBE_SYMBOL(dbg_touch_watchdogs); static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, int exception_state) @@ -790,6 +802,7 @@ kgdb_restore: return kgdb_info[cpu].ret_state; } +NOKPROBE_SYMBOL(kgdb_cpu_enter); /* * kgdb_handle_exception() - main entry point from a kernel exception @@ -834,6 +847,7 @@ out: arch_kgdb_ops.enable_nmi(1); return ret; } +NOKPROBE_SYMBOL(kgdb_handle_exception); /* * GDB places a breakpoint at this function to know dynamically loaded objects. @@ -868,6 +882,7 @@ int kgdb_nmicallback(int cpu, void *regs) #endif return 1; } +NOKPROBE_SYMBOL(kgdb_nmicallback); int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code, atomic_t *send_ready) @@ -893,6 +908,7 @@ int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code, #endif return 1; } +NOKPROBE_SYMBOL(kgdb_nmicallin); static void kgdb_console_write(struct console *co, const char *s, unsigned count) -- cgit v1.2.3 From 771910f719651789adee8260e1a2c4c0ba161007 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Sun, 27 Sep 2020 22:15:31 +0100 Subject: kernel: debug: Centralize dbg_[de]activate_sw_breakpoints During debug trap execution we expect dbg_deactivate_sw_breakpoints() to be paired with an dbg_activate_sw_breakpoint(). Currently although the calls are paired correctly they are needlessly smeared across three different functions. Worse this also results in code to drive polled I/O being called with breakpoints activated which, in turn, needlessly increases the set of functions that will recursively trap if breakpointed. Fix this by moving the activation of breakpoints into the debug core. Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20200927211531.1380577-4-daniel.thompson@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 2 ++ kernel/debug/gdbstub.c | 1 - kernel/debug/kdb/kdb_debugger.c | 2 -- 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0761cbcbdd6d..1e75a8923a8d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -760,6 +760,8 @@ cpu_master_loop: } } + dbg_activate_sw_breakpoints(); + /* Call the I/O driver's post_exception routine */ if (dbg_io_ops->post_exception) dbg_io_ops->post_exception(); diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index b52ebff09ac8..a77df59d9ca5 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1061,7 +1061,6 @@ int gdb_serial_stub(struct kgdb_state *ks) error_packet(remcom_out_buffer, -EINVAL); break; } - dbg_activate_sw_breakpoints(); fallthrough; /* to default processing */ default: default_handle: diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 53a0df6e4d92..0220afda3200 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -147,7 +147,6 @@ int kdb_stub(struct kgdb_state *ks) return DBG_PASS_EVENT; } kdb_bp_install(ks->linux_regs); - dbg_activate_sw_breakpoints(); /* Set the exit state to a single step or a continue */ if (KDB_STATE(DOING_SS)) gdbstub_state(ks, "s"); @@ -167,7 +166,6 @@ int kdb_stub(struct kgdb_state *ks) * differently vs the gdbstub */ kgdb_single_step = 0; - dbg_deactivate_sw_breakpoints(); return DBG_SWITCH_CPU_EVENT; } return kgdb_info[ks->cpu].ret_state; -- cgit v1.2.3 From d081a6e353168f15e63eb9e9334757f20343319f Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Wed, 9 Sep 2020 15:17:08 +0100 Subject: kdb: Fix pager search for multi-line strings Currently using forward search doesn't handle multi-line strings correctly. The search routine replaces line breaks with \0 during the search and, for regular searches ("help | grep Common\n"), there is code after the line has been discarded or printed to replace the break character. However during a pager search ("help\n" followed by "/Common\n") when the string is matched we will immediately return to normal output and the code that should restore the \n becomes unreachable. Fix this by restoring the replaced character when we disable the search mode and update the comment accordingly. Fixes: fb6daa7520f9d ("kdb: Provide forward search at more prompt") Link: https://lore.kernel.org/r/20200909141708.338273-1-daniel.thompson@linaro.org Reviewed-by: Douglas Anderson Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_io.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index e7835ca88e16..6735ac36b718 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -706,12 +706,16 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) size_avail = sizeof(kdb_buffer) - len; goto kdb_print_out; } - if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH) + if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH) { /* * This was a interactive search (using '/' at more - * prompt) and it has completed. Clear the flag. + * prompt) and it has completed. Replace the \0 with + * its original value to ensure multi-line strings + * are handled properly, and return to normal mode. */ + *cphold = replaced_byte; kdb_grepping_flag = 0; + } /* * at this point the string is a full line and * should be printed, up to the null. -- cgit v1.2.3 From fdda88d31addb19a35ac7962e752e55aaeb5c20a Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Fri, 2 Oct 2020 22:31:26 +0800 Subject: ftrace: Fix some typos in comment s/coorditate/coordinate/ s/emty/empty/ s/preeptive/preemptive/ s/succes/success/ s/carefule/careful/ Link: https://lkml.kernel.org/r/20201002143126.2890-1-hqjagain@gmail.com Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 84f32dbc7be8..123d520b9261 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -230,7 +230,7 @@ static void update_ftrace_function(void) /* * For static tracing, we need to be a bit more careful. * The function change takes affect immediately. Thus, - * we need to coorditate the setting of the function_trace_ops + * we need to coordinate the setting of the function_trace_ops * with the setting of the ftrace_trace_function. * * Set the function to the list ops, which will call the @@ -1451,7 +1451,7 @@ static bool hash_contains_ip(unsigned long ip, { /* * The function record is a match if it exists in the filter - * hash and not in the notrace hash. Note, an emty hash is + * hash and not in the notrace hash. Note, an empty hash is * considered a match for the filter hash, but an empty * notrace hash is considered not in the notrace hash. */ @@ -2976,7 +2976,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) synchronize_rcu_tasks_rude(); /* - * When the kernel is preeptive, tasks can be preempted + * When the kernel is preemptive, tasks can be preempted * while on a ftrace trampoline. Just scheduling a task on * a CPU is not good enough to flush them. Calling * synchornize_rcu_tasks() will wait for those tasks to @@ -4368,7 +4368,7 @@ void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper, * @ip: The instruction pointer address to map @data to * @data: The data to map to @ip * - * Returns 0 on succes otherwise an error. + * Returns 0 on success otherwise an error. */ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, unsigned long ip, void *data) @@ -4536,7 +4536,7 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, /* * Note, there's a small window here that the func_hash->filter_hash - * may be NULL or empty. Need to be carefule when reading the loop. + * may be NULL or empty. Need to be careful when reading the loop. */ mutex_lock(&probe->ops.func_hash->regex_lock); -- cgit v1.2.3 From 547305a64632813286700cb6d768bfe773df7d19 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 1 Oct 2020 21:27:57 -0400 Subject: tracepoint: Fix out of sync data passing by static caller Naresh reported a bug that appears to be a side effect of the static calls. It happens when going from more than one tracepoint callback to a single one, and removing the first callback on the list. The list of tracepoint callbacks holds data and a function to call with the parameters of that tracepoint and a handler to the associated data. old_list: 0: func = foo; data = NULL; 1: func = bar; data = &bar_struct; new_list: 0: func = bar; data = &bar_struct; CPU 0 CPU 1 ----- ----- tp_funcs = old_list; tp_static_caller = tp_interator __DO_TRACE() data = tp_funcs[0].data = NULL; tp_funcs = new_list; tracepoint_update_call() tp_static_caller = tp_funcs[0] = bar; tp_static_caller(data) bar(data) x = data->item = NULL->item BOOM! To solve this, add a tracepoint_synchronize_unregister() between changing tp_funcs and updating the static tracepoint, that does both a synchronize_rcu() and synchronize_srcu(). This will ensure that when the static call is updated to the single callback that it will be receiving the data that it registered with. Fixes: d25e37d89dd2f ("tracepoint: Optimize using static_call()") Reported-by: Naresh Kamboju Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/linux-next/CA+G9fYvPXVRO0NV7yL=FxCmFEMYkCwdz7R=9W+_votpT824YJA@mail.gmail.com --- kernel/tracepoint.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index e92f3fb8887a..26efd22f0633 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -221,7 +221,7 @@ static void *func_remove(struct tracepoint_func **funcs, return old; } -static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func *tp_funcs) +static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func *tp_funcs, bool sync) { void *func = tp->iterator; @@ -229,8 +229,17 @@ static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func if (!tp->static_call_key) return; - if (!tp_funcs[1].func) + if (!tp_funcs[1].func) { func = tp_funcs[0].func; + /* + * If going from the iterator back to a single caller, + * we need to synchronize with __DO_TRACE to make sure + * that the data passed to the callback is the one that + * belongs to that callback. + */ + if (sync) + tracepoint_synchronize_unregister(); + } __static_call_update(tp->static_call_key, tp->static_call_tramp, func); } @@ -265,7 +274,7 @@ static int tracepoint_add_func(struct tracepoint *tp, * include/linux/tracepoint.h using rcu_dereference_sched(). */ rcu_assign_pointer(tp->funcs, tp_funcs); - tracepoint_update_call(tp, tp_funcs); + tracepoint_update_call(tp, tp_funcs, false); static_key_enable(&tp->key); release_probes(old); @@ -297,11 +306,12 @@ static int tracepoint_remove_func(struct tracepoint *tp, tp->unregfunc(); static_key_disable(&tp->key); + rcu_assign_pointer(tp->funcs, tp_funcs); } else { - tracepoint_update_call(tp, tp_funcs); + rcu_assign_pointer(tp->funcs, tp_funcs); + tracepoint_update_call(tp, tp_funcs, + tp_funcs[0].func != old[0].func); } - - rcu_assign_pointer(tp->funcs, tp_funcs); release_probes(old); return 0; } -- cgit v1.2.3 From 69e0ad37c9f32d5aa1beb02aab4ec0cd055be013 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 28 Sep 2020 16:09:39 -0700 Subject: static_call: Fix return type of static_call_init Functions that are passed to early_initcall should be of type initcall_t, which expects a return type of int. This is not currently an error but a patch in the Clang LTO series could change that in the future. Fixes: 9183c3f9ed71 ("static_call: Add inline static call infrastructure") Signed-off-by: Nathan Chancellor Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sami Tolvanen Link: https://lore.kernel.org/lkml/20200903203053.3411268-17-samitolvanen@google.com/ --- kernel/static_call.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/static_call.c b/kernel/static_call.c index f8362b3f8fd5..84565c2a41b8 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -410,12 +410,12 @@ int static_call_text_reserved(void *start, void *end) return __static_call_mod_text_reserved(start, end); } -void __init static_call_init(void) +int __init static_call_init(void) { int ret; if (static_call_initialized) - return; + return 0; cpus_read_lock(); static_call_lock(); @@ -434,6 +434,7 @@ void __init static_call_init(void) #ifdef CONFIG_MODULES register_module_notifier(&static_call_module_nb); #endif + return 0; } early_initcall(static_call_init); -- cgit v1.2.3 From 4976b718c3551faba2c0616ef55ebeb74db1c5ca Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:44 -0700 Subject: bpf: Introduce pseudo_btf_id Pseudo_btf_id is a type of ld_imm insn that associates a btf_id to a ksym so that further dereferences on the ksym can use the BTF info to validate accesses. Internally, when seeing a pseudo_btf_id ld insn, the verifier reads the btf_id stored in the insn[0]'s imm field and marks the dst_reg as PTR_TO_BTF_ID. The btf_id points to a VAR_KIND, which is encoded in btf_vminux by pahole. If the VAR is not of a struct type, the dst reg will be marked as PTR_TO_MEM instead of PTR_TO_BTF_ID and the mem_size is resolved to the size of the VAR's type. >From the VAR btf_id, the verifier can also read the address of the ksym's corresponding kernel var from kallsyms and use that to fill dst_reg. Therefore, the proper functionality of pseudo_btf_id depends on (1) kallsyms and (2) the encoding of kernel global VARs in pahole, which should be available since pahole v1.18. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-2-haoluo@google.com --- kernel/bpf/btf.c | 15 ------ kernel/bpf/verifier.c | 125 ++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 112 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4d0ee7839fdb..00569afe3d0d 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -440,16 +440,6 @@ static bool btf_type_nosize_or_null(const struct btf_type *t) return !t || btf_type_nosize(t); } -/* union is only a special case of struct: - * all its offsetof(member) == 0 - */ -static bool btf_type_is_struct(const struct btf_type *t) -{ - u8 kind = BTF_INFO_KIND(t->info); - - return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; -} - static bool __btf_type_is_struct(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; @@ -460,11 +450,6 @@ static bool btf_type_is_array(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; } -static bool btf_type_is_var(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; -} - static bool btf_type_is_datasec(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 015a1c074b6b..fe4965079773 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7488,6 +7488,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_insn_aux_data *aux = cur_aux(env); struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *dst_reg; struct bpf_map *map; int err; @@ -7504,25 +7505,44 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; + dst_reg = ®s[insn->dst_reg]; if (insn->src_reg == 0) { u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - regs[insn->dst_reg].type = SCALAR_VALUE; + dst_reg->type = SCALAR_VALUE; __mark_reg_known(®s[insn->dst_reg], imm); return 0; } + if (insn->src_reg == BPF_PSEUDO_BTF_ID) { + mark_reg_known_zero(env, regs, insn->dst_reg); + + dst_reg->type = aux->btf_var.reg_type; + switch (dst_reg->type) { + case PTR_TO_MEM: + dst_reg->mem_size = aux->btf_var.mem_size; + break; + case PTR_TO_BTF_ID: + dst_reg->btf_id = aux->btf_var.btf_id; + break; + default: + verbose(env, "bpf verifier is misconfigured\n"); + return -EFAULT; + } + return 0; + } + map = env->used_maps[aux->map_index]; mark_reg_known_zero(env, regs, insn->dst_reg); - regs[insn->dst_reg].map_ptr = map; + dst_reg->map_ptr = map; if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { - regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - regs[insn->dst_reg].off = aux->map_off; + dst_reg->type = PTR_TO_MAP_VALUE; + dst_reg->off = aux->map_off; if (map_value_has_spin_lock(map)) - regs[insn->dst_reg].id = ++env->id_gen; + dst_reg->id = ++env->id_gen; } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) { - regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + dst_reg->type = CONST_PTR_TO_MAP; } else { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; @@ -9424,6 +9444,73 @@ process_bpf_exit: return 0; } +/* replace pseudo btf_id with kernel symbol address */ +static int check_pseudo_btf_id(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct bpf_insn_aux_data *aux) +{ + u32 type, id = insn->imm; + const struct btf_type *t; + const char *sym_name; + u64 addr; + + if (!btf_vmlinux) { + verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); + return -EINVAL; + } + + if (insn[1].imm != 0) { + verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n"); + return -EINVAL; + } + + t = btf_type_by_id(btf_vmlinux, id); + if (!t) { + verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id); + return -ENOENT; + } + + if (!btf_type_is_var(t)) { + verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", + id); + return -EINVAL; + } + + sym_name = btf_name_by_offset(btf_vmlinux, t->name_off); + addr = kallsyms_lookup_name(sym_name); + if (!addr) { + verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n", + sym_name); + return -ENOENT; + } + + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; + + type = t->type; + t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); + if (!btf_type_is_struct(t)) { + const struct btf_type *ret; + const char *tname; + u32 tsize; + + /* resolve the type size of ksym. */ + ret = btf_resolve_size(btf_vmlinux, t, &tsize); + if (IS_ERR(ret)) { + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n", + tname, PTR_ERR(ret)); + return -EINVAL; + } + aux->btf_var.reg_type = PTR_TO_MEM; + aux->btf_var.mem_size = tsize; + } else { + aux->btf_var.reg_type = PTR_TO_BTF_ID; + aux->btf_var.btf_id = type; + } + return 0; +} + static int check_map_prealloc(struct bpf_map *map) { return (map->map_type != BPF_MAP_TYPE_HASH && @@ -9534,10 +9621,14 @@ static bool bpf_map_is_cgroup_storage(struct bpf_map *map) map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); } -/* look for pseudo eBPF instructions that access map FDs and - * replace them with actual map pointers +/* find and rewrite pseudo imm in ld_imm64 instructions: + * + * 1. if it accesses map FD, replace it with actual map pointer. + * 2. if it accesses btf_id of a VAR, replace it with pointer to the var. + * + * NOTE: btf_vmlinux is required for converting pseudo btf_id. */ -static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) +static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -9578,6 +9669,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) /* valid generic load 64-bit imm */ goto next_insn; + if (insn[0].src_reg == BPF_PSEUDO_BTF_ID) { + aux = &env->insn_aux_data[i]; + err = check_pseudo_btf_id(env, insn, aux); + if (err) + return err; + goto next_insn; + } + /* In final convert_pseudo_ld_imm64() step, this is * converted into regular 64-bit imm load insn. */ @@ -11633,10 +11732,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; - ret = replace_map_fd_with_map_ptr(env); - if (ret < 0) - goto skip_full_check; - if (bpf_prog_is_dev_bound(env->prog->aux)) { ret = bpf_prog_offload_verifier_prep(env->prog); if (ret) @@ -11662,6 +11757,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret) goto skip_full_check; + ret = resolve_pseudo_ldimm64(env); + if (ret < 0) + goto skip_full_check; + ret = check_cfg(env); if (ret < 0) goto skip_full_check; -- cgit v1.2.3 From eaa6bcb71ef6ed3dc18fc525ee7e293b06b4882b Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:47 -0700 Subject: bpf: Introduce bpf_per_cpu_ptr() Add bpf_per_cpu_ptr() to help bpf programs access percpu vars. bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the kernel except that it may return NULL. This happens when the cpu parameter is out of range. So the caller must check the returned value. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-5-haoluo@google.com --- kernel/bpf/btf.c | 10 -------- kernel/bpf/helpers.c | 18 ++++++++++++++ kernel/bpf/verifier.c | 64 +++++++++++++++++++++++++++++++++++++++++++++--- kernel/trace/bpf_trace.c | 2 ++ 4 files changed, 81 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 00569afe3d0d..ed7d02e8bc93 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -188,11 +188,6 @@ i < btf_type_vlen(struct_type); \ i++, member++) -#define for_each_vsi(i, struct_type, member) \ - for (i = 0, member = btf_type_var_secinfo(struct_type); \ - i < btf_type_vlen(struct_type); \ - i++, member++) - #define for_each_vsi_from(i, from, struct_type, member) \ for (i = from, member = btf_type_var_secinfo(struct_type) + from; \ i < btf_type_vlen(struct_type); \ @@ -598,11 +593,6 @@ static const struct btf_var *btf_type_var(const struct btf_type *t) return (const struct btf_var *)(t + 1); } -static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t) -{ - return (const struct btf_var_secinfo *)(t + 1); -} - static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index e825441781ab..14fe3f64fd82 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -623,6 +623,22 @@ const struct bpf_func_proto bpf_copy_from_user_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) +{ + if (cpu >= nr_cpu_ids) + return (unsigned long)NULL; + + return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu); +} + +const struct bpf_func_proto bpf_per_cpu_ptr_proto = { + .func = bpf_per_cpu_ptr, + .gpl_only = false, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, + .arg2_type = ARG_ANYTHING, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -689,6 +705,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_snprintf_btf_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; + case BPF_FUNC_bpf_per_cpu_ptr: + return &bpf_per_cpu_ptr_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fe4965079773..216b8ece23ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -238,6 +238,8 @@ struct bpf_call_arg_meta { u64 msize_max_value; int ref_obj_id; int func_id; + u32 btf_id; + u32 ret_btf_id; }; struct btf *btf_vmlinux; @@ -517,6 +519,7 @@ static const char * const reg_type_str[] = { [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", + [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", [PTR_TO_MEM] = "mem", [PTR_TO_MEM_OR_NULL] = "mem_or_null", [PTR_TO_RDONLY_BUF] = "rdonly_buf", @@ -583,7 +586,9 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL) + if (t == PTR_TO_BTF_ID || + t == PTR_TO_BTF_ID_OR_NULL || + t == PTR_TO_PERCPU_BTF_ID) verbose(env, "%s", kernel_type_name(reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -2204,6 +2209,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_RDONLY_BUF_OR_NULL: case PTR_TO_RDWR_BUF: case PTR_TO_RDWR_BUF_OR_NULL: + case PTR_TO_PERCPU_BTF_ID: return true; default: return false; @@ -4017,6 +4023,7 @@ static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -4042,6 +4049,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, + [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -4205,6 +4213,12 @@ skip_type_check: err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, meta); + } else if (arg_type == ARG_PTR_TO_PERCPU_BTF_ID) { + if (!reg->btf_id) { + verbose(env, "Helper has invalid btf_id in R%d\n", regno); + return -EACCES; + } + meta->ret_btf_id = reg->btf_id; } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) @@ -5114,6 +5128,30 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; regs[BPF_REG_0].mem_size = meta.mem_size; + } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL) { + const struct btf_type *t; + + mark_reg_known_zero(env, regs, BPF_REG_0); + t = btf_type_skip_modifiers(btf_vmlinux, meta.ret_btf_id, NULL); + if (!btf_type_is_struct(t)) { + u32 tsize; + const struct btf_type *ret; + const char *tname; + + /* resolve the type size of ksym. */ + ret = btf_resolve_size(btf_vmlinux, t, &tsize); + if (IS_ERR(ret)) { + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + verbose(env, "unable to resolve the size of type '%s': %ld\n", + tname, PTR_ERR(ret)); + return -EINVAL; + } + regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].mem_size = tsize; + } else { + regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].btf_id = meta.ret_btf_id; + } } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { int ret_btf_id; @@ -7523,6 +7561,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) dst_reg->mem_size = aux->btf_var.mem_size; break; case PTR_TO_BTF_ID: + case PTR_TO_PERCPU_BTF_ID: dst_reg->btf_id = aux->btf_var.btf_id; break; default: @@ -9449,10 +9488,14 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn_aux_data *aux) { - u32 type, id = insn->imm; + u32 datasec_id, type, id = insn->imm; + const struct btf_var_secinfo *vsi; + const struct btf_type *datasec; const struct btf_type *t; const char *sym_name; + bool percpu = false; u64 addr; + int i; if (!btf_vmlinux) { verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); @@ -9484,12 +9527,27 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, return -ENOENT; } + datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu", + BTF_KIND_DATASEC); + if (datasec_id > 0) { + datasec = btf_type_by_id(btf_vmlinux, datasec_id); + for_each_vsi(i, datasec, vsi) { + if (vsi->type == id) { + percpu = true; + break; + } + } + } + insn[0].imm = (u32)addr; insn[1].imm = addr >> 32; type = t->type; t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); - if (!btf_type_is_struct(t)) { + if (percpu) { + aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; + aux->btf_var.btf_id = type; + } else if (!btf_type_is_struct(t)) { const struct btf_type *ret; const char *tname; u32 tsize; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e118a83439c3..364a322e2898 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1327,6 +1327,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; + case BPF_FUNC_bpf_per_cpu_ptr: + return &bpf_per_cpu_ptr_proto; default: return NULL; } -- cgit v1.2.3 From 63d9b80dcf2c67bc5ade61cbbaa09d7af21f43f1 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:48 -0700 Subject: bpf: Introducte bpf_this_cpu_ptr() Add bpf_this_cpu_ptr() to help access percpu var on this cpu. This helper always returns a valid pointer, therefore no need to check returned value for NULL. Also note that all programs run with preemption disabled, which means that the returned pointer is stable during all the execution of the program. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-6-haoluo@google.com --- kernel/bpf/helpers.c | 14 ++++++++++++++ kernel/bpf/verifier.c | 11 ++++++++--- kernel/trace/bpf_trace.c | 2 ++ 3 files changed, 24 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 14fe3f64fd82..25520f5eeaf6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -639,6 +639,18 @@ const struct bpf_func_proto bpf_per_cpu_ptr_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) +{ + return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr); +} + +const struct bpf_func_proto bpf_this_cpu_ptr_proto = { + .func = bpf_this_cpu_ptr, + .gpl_only = false, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID, + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -707,6 +719,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_jiffies64_proto; case BPF_FUNC_bpf_per_cpu_ptr: return &bpf_per_cpu_ptr_proto; + case BPF_FUNC_bpf_this_cpu_ptr: + return &bpf_this_cpu_ptr_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 216b8ece23ce..d9dbf271ebab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5128,7 +5128,8 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; regs[BPF_REG_0].mem_size = meta.mem_size; - } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL) { + } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL || + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) { const struct btf_type *t; mark_reg_known_zero(env, regs, BPF_REG_0); @@ -5146,10 +5147,14 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn tname, PTR_ERR(ret)); return -EINVAL; } - regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].type = + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? + PTR_TO_MEM : PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].mem_size = tsize; } else { - regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].type = + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? + PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 364a322e2898..a136a6a63a71 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1329,6 +1329,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_snprintf_btf_proto; case BPF_FUNC_bpf_per_cpu_ptr: return &bpf_per_cpu_ptr_proto; + case BPF_FUNC_bpf_this_cpu_ptr: + return &bpf_this_cpu_ptr_proto; default: return NULL; } -- cgit v1.2.3 From 66a9b9287d2447a91cef2fafc648dee32186f708 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 2 Oct 2020 14:49:54 -0700 Subject: genetlink: move to smaller ops wherever possible Bulk of the genetlink users can use smaller ops, move them. Signed-off-by: Jakub Kicinski Reviewed-by: Johannes Berg Signed-off-by: David S. Miller --- kernel/taskstats.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index e2ac0e37c4ae..ef4de29fbe8a 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -644,7 +644,7 @@ err: nlmsg_free(rep_skb); } -static const struct genl_ops taskstats_ops[] = { +static const struct genl_small_ops taskstats_ops[] = { { .cmd = TASKSTATS_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -687,8 +687,8 @@ static struct genl_family family __ro_after_init = { .version = TASKSTATS_GENL_VERSION, .maxattr = TASKSTATS_CMD_ATTR_MAX, .module = THIS_MODULE, - .ops = taskstats_ops, - .n_ops = ARRAY_SIZE(taskstats_ops), + .small_ops = taskstats_ops, + .n_small_ops = ARRAY_SIZE(taskstats_ops), .pre_doit = taskstats_pre_doit, }; -- cgit v1.2.3 From 7c1e0926da18051f4a09da1e380591213b0cdb5f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 2 Oct 2020 14:49:58 -0700 Subject: taskstats: move specifying netlink policy back to ops commit 3b0f31f2b8c9 ("genetlink: make policy common to family") had to work around removal of policy from ops by parsing in the pre_doit callback. Now that policy is back in full ops we can switch again. Set maxattr to actual size of the policies - both commands set GENL_DONT_VALIDATE_STRICT so out of range attributes will be silently ignored, anyway. v2: - remove stale comment Suggested-by: Johannes Berg Signed-off-by: Jakub Kicinski Reviewed-by: Johannes Berg Signed-off-by: David S. Miller --- kernel/taskstats.c | 46 ++++++++++------------------------------------ 1 file changed, 10 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index ef4de29fbe8a..a2802b6ff4bb 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -34,17 +34,13 @@ struct kmem_cache *taskstats_cache; static struct genl_family family; -static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { +static const struct nla_policy taskstats_cmd_get_policy[] = { [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; -/* - * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family. - * Make sure they are always aligned. - */ -static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { +static const struct nla_policy cgroupstats_cmd_get_policy[] = { [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, }; @@ -644,52 +640,30 @@ err: nlmsg_free(rep_skb); } -static const struct genl_small_ops taskstats_ops[] = { +static const struct genl_ops taskstats_ops[] = { { .cmd = TASKSTATS_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = taskstats_user_cmd, - /* policy enforced later */ - .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_HASPOL, + .policy = taskstats_cmd_get_policy, + .maxattr = ARRAY_SIZE(taskstats_cmd_get_policy) - 1, + .flags = GENL_ADMIN_PERM, }, { .cmd = CGROUPSTATS_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = cgroupstats_user_cmd, - /* policy enforced later */ - .flags = GENL_CMD_CAP_HASPOL, + .policy = cgroupstats_cmd_get_policy, + .maxattr = ARRAY_SIZE(cgroupstats_cmd_get_policy) - 1, }, }; -static int taskstats_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, - struct genl_info *info) -{ - const struct nla_policy *policy = NULL; - - switch (ops->cmd) { - case TASKSTATS_CMD_GET: - policy = taskstats_cmd_get_policy; - break; - case CGROUPSTATS_CMD_GET: - policy = cgroupstats_cmd_get_policy; - break; - default: - return -EINVAL; - } - - return nlmsg_validate_deprecated(info->nlhdr, GENL_HDRLEN, - TASKSTATS_CMD_ATTR_MAX, policy, - info->extack); -} - static struct genl_family family __ro_after_init = { .name = TASKSTATS_GENL_NAME, .version = TASKSTATS_GENL_VERSION, - .maxattr = TASKSTATS_CMD_ATTR_MAX, .module = THIS_MODULE, - .small_ops = taskstats_ops, - .n_small_ops = ARRAY_SIZE(taskstats_ops), - .pre_doit = taskstats_pre_doit, + .ops = taskstats_ops, + .n_ops = ARRAY_SIZE(taskstats_ops), }; /* Needed early in initialization */ -- cgit v1.2.3 From 1028ae4069991e26d1522e957939fb61d2da1d12 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 2 Oct 2020 17:25:44 -0700 Subject: bpf: Deref map in BPF_PROG_BIND_MAP when it's already used We are missing a deref for the case when we are doing BPF_PROG_BIND_MAP on a map that's being already held by the program. There is 'if (ret) bpf_map_put(map)' below which doesn't trigger because we don't consider this an error. Let's add missing bpf_map_put() for this specific condition. Fixes: ef15314aa5de ("bpf: Add BPF_PROG_BIND_MAP syscall") Reported-by: Alexei Starovoitov Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201003002544.3601440-1-sdf@google.com --- kernel/bpf/syscall.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f1528c2a6927..1110ecd7d1f3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4323,8 +4323,10 @@ static int bpf_prog_bind_map(union bpf_attr *attr) used_maps_old = prog->aux->used_maps; for (i = 0; i < prog->aux->used_map_cnt; i++) - if (used_maps_old[i] == map) + if (used_maps_old[i] == map) { + bpf_map_put(map); goto out_unlock; + } used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1, sizeof(used_maps_new[0]), -- cgit v1.2.3 From 9abb897345ce1d41257567f571a78137c961c405 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Wed, 30 Sep 2020 10:35:32 -0700 Subject: sched/fair: Tweak pick_next_entity() Currently, pick_next_entity(...) has the following structure (simplified): [...] if (last_buddy_ok()) result = last_buddy; if (next_buddy_ok()) result = next_buddy; [...] The intended behavior is to prefer next buddy over last buddy; the current code somewhat obfuscates this, and also wastes cycles checking the last buddy when eventually the next buddy is picked up. So this patch refactors two 'ifs' above into [...] if (next_buddy_ok()) result = next_buddy; else if (last_buddy_ok()) result = last_buddy; [...] Signed-off-by: Peter Oskolkov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20200930173532.1069092-1-posk@google.com --- kernel/sched/fair.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fc3410b8b990..cec6cf9b2bb3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4465,17 +4465,17 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) se = second; } - /* - * Prefer last buddy, try to return the CPU to a preempted task. - */ - if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) - se = cfs_rq->last; - - /* - * Someone really wants this to run. If it's not unfair, run it. - */ - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) { + /* + * Someone really wants this to run. If it's not unfair, run it. + */ se = cfs_rq->next; + } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) { + /* + * Prefer last buddy, try to return the CPU to a preempted task. + */ + se = cfs_rq->last; + } clear_buddies(cfs_rq, se); -- cgit v1.2.3 From 51cf18c90ca1b51d1cb4af3064e85fcf8610b5d2 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Fri, 28 Aug 2020 10:00:49 +0100 Subject: sched/debug: Add new tracepoint to track cpu_capacity rq->cpu_capacity is a key element in several scheduler parts, such as EAS task placement and load balancing. Tracking this value enables testing and/or debugging by a toolkit. Signed-off-by: Vincent Donnefort Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1598605249-72651-1-git-send-email-vincent.donnefort@arm.com --- kernel/sched/core.c | 1 + kernel/sched/fair.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dd32d859ab40..3dc415f58bd7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -36,6 +36,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cec6cf9b2bb3..aa4c6227cd6d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8108,6 +8108,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) capacity = 1; cpu_rq(cpu)->cpu_capacity = capacity; + trace_sched_cpu_capacity_tp(cpu_rq(cpu)); + sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; sdg->sgc->max_capacity = capacity; @@ -11321,6 +11323,18 @@ int sched_trace_rq_cpu(struct rq *rq) } EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); +int sched_trace_rq_cpu_capacity(struct rq *rq) +{ + return rq ? +#ifdef CONFIG_SMP + rq->cpu_capacity +#else + SCHED_CAPACITY_SCALE +#endif + : -1; +} +EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity); + const struct cpumask *sched_trace_rd_span(struct root_domain *rd) { #ifdef CONFIG_SMP -- cgit v1.2.3 From feff2e65efd8d84cf831668e182b2ce73c604bbb Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 16 Sep 2020 09:06:39 +0200 Subject: sched/deadline: Unthrottle PI boosted threads while enqueuing stress-ng has a test (stress-ng --cyclic) that creates a set of threads under SCHED_DEADLINE with the following parameters: dl_runtime = 10000 (10 us) dl_deadline = 100000 (100 us) dl_period = 100000 (100 us) These parameters are very aggressive. When using a system without HRTICK set, these threads can easily execute longer than the dl_runtime because the throttling happens with 1/HZ resolution. During the main part of the test, the system works just fine because the workload does not try to run over the 10 us. The problem happens at the end of the test, on the exit() path. During exit(), the threads need to do some cleanups that require real-time mutex locks, mainly those related to memory management, resulting in this scenario: Note: locks are rt_mutexes... ------------------------------------------------------------------------ TASK A: TASK B: TASK C: activation activation activation lock(a): OK! lock(b): OK! lock(a) -> block (task A owns it) -> self notice/set throttled +--< -> arm replenished timer | switch-out | lock(b) | -> B prio> | -> boost TASK B | unlock(a) switch-out | -> handle lock a to B | -> wakeup(B) | -> B is throttled: | -> do not enqueue | switch-out | | +---------------------> replenishment timer -> TASK B is boosted: -> do not enqueue ------------------------------------------------------------------------ BOOM: TASK B is runnable but !enqueued, holding TASK C: the system crashes with hung task C. This problem is avoided by removing the throttle state from the boosted thread while boosting it (by TASK A in the example above), allowing it to be queued and run boosted. The next replenishment will take care of the runtime overrun, pushing the deadline further away. See the "while (dl_se->runtime <= 0)" on replenish_dl_entity() for more information. Reported-by: Mark Simmons Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Tested-by: Mark Simmons Link: https://lkml.kernel.org/r/5076e003450835ec74e6fa5917d02c4fa41687e6.1600170294.git.bristot@redhat.com --- kernel/sched/deadline.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index c19c1883d695..6d93f4518734 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1525,6 +1525,27 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) */ if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { pi_se = &pi_task->dl; + /* + * Because of delays in the detection of the overrun of a + * thread's runtime, it might be the case that a thread + * goes to sleep in a rt mutex with negative runtime. As + * a consequence, the thread will be throttled. + * + * While waiting for the mutex, this thread can also be + * boosted via PI, resulting in a thread that is throttled + * and boosted at the same time. + * + * In this case, the boost overrides the throttle. + */ + if (p->dl.dl_throttled) { + /* + * The replenish timer needs to be canceled. No + * problem if it fires concurrently: boosted threads + * are ignored in dl_task_timer(). + */ + hrtimer_try_to_cancel(&p->dl.dl_timer); + p->dl.dl_throttled = 0; + } } else if (!dl_prio(p->normal_prio)) { /* * Special case in which we have a !SCHED_DEADLINE task that is going -- cgit v1.2.3 From 8438f5211479e4b8433f641634362264bc3bbd9e Mon Sep 17 00:00:00 2001 From: Tingwei Zhang Date: Mon, 5 Oct 2020 10:13:13 +0300 Subject: tracing: Add flag to control different traces More traces like event trace or trace marker will be supported. Add flag for difference traces, so that they can be controlled separately. Move current function trace to it's own flag instead of global ftrace enable flag. Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Alexander Shishkin Signed-off-by: Tingwei Zhang Signed-off-by: Alexander Shishkin Link: https://lore.kernel.org/r/20201005071319.78508-3-alexander.shishkin@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f40d850ebabc..3ca121ad8728 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2744,33 +2744,37 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, static void trace_process_export(struct trace_export *export, - struct ring_buffer_event *event) + struct ring_buffer_event *event, int flag) { struct trace_entry *entry; unsigned int size = 0; - entry = ring_buffer_event_data(event); - size = ring_buffer_event_length(event); - export->write(export, entry, size); + if (export->flags & flag) { + entry = ring_buffer_event_data(event); + size = ring_buffer_event_length(event); + export->write(export, entry, size); + } } static DEFINE_MUTEX(ftrace_export_lock); static struct trace_export __rcu *ftrace_exports_list __read_mostly; -static DEFINE_STATIC_KEY_FALSE(ftrace_exports_enabled); +static DEFINE_STATIC_KEY_FALSE(trace_function_exports_enabled); -static inline void ftrace_exports_enable(void) +static inline void ftrace_exports_enable(struct trace_export *export) { - static_branch_enable(&ftrace_exports_enabled); + if (export->flags & TRACE_EXPORT_FUNCTION) + static_branch_inc(&trace_function_exports_enabled); } -static inline void ftrace_exports_disable(void) +static inline void ftrace_exports_disable(struct trace_export *export) { - static_branch_disable(&ftrace_exports_enabled); + if (export->flags & TRACE_EXPORT_FUNCTION) + static_branch_dec(&trace_function_exports_enabled); } -static void ftrace_exports(struct ring_buffer_event *event) +static void ftrace_exports(struct ring_buffer_event *event, int flag) { struct trace_export *export; @@ -2778,7 +2782,7 @@ static void ftrace_exports(struct ring_buffer_event *event) export = rcu_dereference_raw_check(ftrace_exports_list); while (export) { - trace_process_export(export, event); + trace_process_export(export, event, flag); export = rcu_dereference_raw_check(export->next); } @@ -2818,8 +2822,7 @@ rm_trace_export(struct trace_export **list, struct trace_export *export) static inline void add_ftrace_export(struct trace_export **list, struct trace_export *export) { - if (*list == NULL) - ftrace_exports_enable(); + ftrace_exports_enable(export); add_trace_export(list, export); } @@ -2830,8 +2833,7 @@ rm_ftrace_export(struct trace_export **list, struct trace_export *export) int ret; ret = rm_trace_export(list, export); - if (*list == NULL) - ftrace_exports_disable(); + ftrace_exports_disable(export); return ret; } @@ -2884,8 +2886,8 @@ trace_function(struct trace_array *tr, entry->parent_ip = parent_ip; if (!call_filter_check_discard(call, entry, buffer, event)) { - if (static_branch_unlikely(&ftrace_exports_enabled)) - ftrace_exports(event); + if (static_branch_unlikely(&trace_function_exports_enabled)) + ftrace_exports(event, TRACE_EXPORT_FUNCTION); __buffer_unlock_commit(buffer, event); } } -- cgit v1.2.3 From 8ab7a2b7055c88c3da5e4684dfa015c6a8987c28 Mon Sep 17 00:00:00 2001 From: Tingwei Zhang Date: Mon, 5 Oct 2020 10:13:14 +0300 Subject: tracing: Add trace_export support for event trace Only function traces can be exported to other destinations currently. This patch exports event trace as well. Move trace export related function to the beginning of file so other trace can call trace_process_export() to export. Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Alexander Shishkin Signed-off-by: Tingwei Zhang Signed-off-by: Alexander Shishkin Link: https://lore.kernel.org/r/20201005071319.78508-4-alexander.shishkin@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 259 ++++++++++++++++++++++++++------------------------- 1 file changed, 134 insertions(+), 125 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3ca121ad8728..a40ee413123c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -251,6 +251,138 @@ unsigned long long ns2usecs(u64 nsec) return nsec; } +static void +trace_process_export(struct trace_export *export, + struct ring_buffer_event *event, int flag) +{ + struct trace_entry *entry; + unsigned int size = 0; + + if (export->flags & flag) { + entry = ring_buffer_event_data(event); + size = ring_buffer_event_length(event); + export->write(export, entry, size); + } +} + +static DEFINE_MUTEX(ftrace_export_lock); + +static struct trace_export __rcu *ftrace_exports_list __read_mostly; + +static DEFINE_STATIC_KEY_FALSE(trace_function_exports_enabled); +static DEFINE_STATIC_KEY_FALSE(trace_event_exports_enabled); + +static inline void ftrace_exports_enable(struct trace_export *export) +{ + if (export->flags & TRACE_EXPORT_FUNCTION) + static_branch_inc(&trace_function_exports_enabled); + + if (export->flags & TRACE_EXPORT_EVENT) + static_branch_inc(&trace_event_exports_enabled); +} + +static inline void ftrace_exports_disable(struct trace_export *export) +{ + if (export->flags & TRACE_EXPORT_FUNCTION) + static_branch_dec(&trace_function_exports_enabled); + + if (export->flags & TRACE_EXPORT_EVENT) + static_branch_dec(&trace_event_exports_enabled); +} + +static void ftrace_exports(struct ring_buffer_event *event, int flag) +{ + struct trace_export *export; + + preempt_disable_notrace(); + + export = rcu_dereference_raw_check(ftrace_exports_list); + while (export) { + trace_process_export(export, event, flag); + export = rcu_dereference_raw_check(export->next); + } + + preempt_enable_notrace(); +} + +static inline void +add_trace_export(struct trace_export **list, struct trace_export *export) +{ + rcu_assign_pointer(export->next, *list); + /* + * We are entering export into the list but another + * CPU might be walking that list. We need to make sure + * the export->next pointer is valid before another CPU sees + * the export pointer included into the list. + */ + rcu_assign_pointer(*list, export); +} + +static inline int +rm_trace_export(struct trace_export **list, struct trace_export *export) +{ + struct trace_export **p; + + for (p = list; *p != NULL; p = &(*p)->next) + if (*p == export) + break; + + if (*p != export) + return -1; + + rcu_assign_pointer(*p, (*p)->next); + + return 0; +} + +static inline void +add_ftrace_export(struct trace_export **list, struct trace_export *export) +{ + ftrace_exports_enable(export); + + add_trace_export(list, export); +} + +static inline int +rm_ftrace_export(struct trace_export **list, struct trace_export *export) +{ + int ret; + + ret = rm_trace_export(list, export); + ftrace_exports_disable(export); + + return ret; +} + +int register_ftrace_export(struct trace_export *export) +{ + if (WARN_ON_ONCE(!export->write)) + return -1; + + mutex_lock(&ftrace_export_lock); + + add_ftrace_export(&ftrace_exports_list, export); + + mutex_unlock(&ftrace_export_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(register_ftrace_export); + +int unregister_ftrace_export(struct trace_export *export) +{ + int ret; + + mutex_lock(&ftrace_export_lock); + + ret = rm_ftrace_export(&ftrace_exports_list, export); + + mutex_unlock(&ftrace_export_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_export); + /* trace_flags holds trace_options default values */ #define TRACE_DEFAULT_FLAGS \ (FUNCTION_DEFAULT_FLAGS | \ @@ -2699,6 +2831,8 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) if (static_key_false(&tracepoint_printk_key.key)) output_printk(fbuffer); + if (static_branch_unlikely(&trace_event_exports_enabled)) + ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT); event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer, fbuffer->event, fbuffer->entry, fbuffer->flags, fbuffer->pc, fbuffer->regs); @@ -2742,131 +2876,6 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, __buffer_unlock_commit(buffer, event); } -static void -trace_process_export(struct trace_export *export, - struct ring_buffer_event *event, int flag) -{ - struct trace_entry *entry; - unsigned int size = 0; - - if (export->flags & flag) { - entry = ring_buffer_event_data(event); - size = ring_buffer_event_length(event); - export->write(export, entry, size); - } -} - -static DEFINE_MUTEX(ftrace_export_lock); - -static struct trace_export __rcu *ftrace_exports_list __read_mostly; - -static DEFINE_STATIC_KEY_FALSE(trace_function_exports_enabled); - -static inline void ftrace_exports_enable(struct trace_export *export) -{ - if (export->flags & TRACE_EXPORT_FUNCTION) - static_branch_inc(&trace_function_exports_enabled); -} - -static inline void ftrace_exports_disable(struct trace_export *export) -{ - if (export->flags & TRACE_EXPORT_FUNCTION) - static_branch_dec(&trace_function_exports_enabled); -} - -static void ftrace_exports(struct ring_buffer_event *event, int flag) -{ - struct trace_export *export; - - preempt_disable_notrace(); - - export = rcu_dereference_raw_check(ftrace_exports_list); - while (export) { - trace_process_export(export, event, flag); - export = rcu_dereference_raw_check(export->next); - } - - preempt_enable_notrace(); -} - -static inline void -add_trace_export(struct trace_export **list, struct trace_export *export) -{ - rcu_assign_pointer(export->next, *list); - /* - * We are entering export into the list but another - * CPU might be walking that list. We need to make sure - * the export->next pointer is valid before another CPU sees - * the export pointer included into the list. - */ - rcu_assign_pointer(*list, export); -} - -static inline int -rm_trace_export(struct trace_export **list, struct trace_export *export) -{ - struct trace_export **p; - - for (p = list; *p != NULL; p = &(*p)->next) - if (*p == export) - break; - - if (*p != export) - return -1; - - rcu_assign_pointer(*p, (*p)->next); - - return 0; -} - -static inline void -add_ftrace_export(struct trace_export **list, struct trace_export *export) -{ - ftrace_exports_enable(export); - - add_trace_export(list, export); -} - -static inline int -rm_ftrace_export(struct trace_export **list, struct trace_export *export) -{ - int ret; - - ret = rm_trace_export(list, export); - ftrace_exports_disable(export); - - return ret; -} - -int register_ftrace_export(struct trace_export *export) -{ - if (WARN_ON_ONCE(!export->write)) - return -1; - - mutex_lock(&ftrace_export_lock); - - add_ftrace_export(&ftrace_exports_list, export); - - mutex_unlock(&ftrace_export_lock); - - return 0; -} -EXPORT_SYMBOL_GPL(register_ftrace_export); - -int unregister_ftrace_export(struct trace_export *export) -{ - int ret; - - mutex_lock(&ftrace_export_lock); - - ret = rm_ftrace_export(&ftrace_exports_list, export); - - mutex_unlock(&ftrace_export_lock); - - return ret; -} -EXPORT_SYMBOL_GPL(unregister_ftrace_export); - void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, -- cgit v1.2.3 From 458999c6f67b0ffcc704a4892041dd700adf7d83 Mon Sep 17 00:00:00 2001 From: Tingwei Zhang Date: Mon, 5 Oct 2020 10:13:15 +0300 Subject: tracing: Add trace_export support for trace_marker Add the support to route trace_marker buffer to other destination via trace_export. Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Alexander Shishkin Signed-off-by: Tingwei Zhang Signed-off-by: Alexander Shishkin Link: https://lore.kernel.org/r/20201005071319.78508-5-alexander.shishkin@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a40ee413123c..6048fba2f590 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -271,6 +271,7 @@ static struct trace_export __rcu *ftrace_exports_list __read_mostly; static DEFINE_STATIC_KEY_FALSE(trace_function_exports_enabled); static DEFINE_STATIC_KEY_FALSE(trace_event_exports_enabled); +static DEFINE_STATIC_KEY_FALSE(trace_marker_exports_enabled); static inline void ftrace_exports_enable(struct trace_export *export) { @@ -279,6 +280,9 @@ static inline void ftrace_exports_enable(struct trace_export *export) if (export->flags & TRACE_EXPORT_EVENT) static_branch_inc(&trace_event_exports_enabled); + + if (export->flags & TRACE_EXPORT_MARKER) + static_branch_inc(&trace_marker_exports_enabled); } static inline void ftrace_exports_disable(struct trace_export *export) @@ -288,6 +292,9 @@ static inline void ftrace_exports_disable(struct trace_export *export) if (export->flags & TRACE_EXPORT_EVENT) static_branch_dec(&trace_event_exports_enabled); + + if (export->flags & TRACE_EXPORT_MARKER) + static_branch_dec(&trace_marker_exports_enabled); } static void ftrace_exports(struct ring_buffer_event *event, int flag) @@ -6687,6 +6694,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, } else entry->buf[cnt] = '\0'; + if (static_branch_unlikely(&trace_marker_exports_enabled)) + ftrace_exports(event, TRACE_EXPORT_MARKER); __buffer_unlock_commit(buffer, event); if (tt) -- cgit v1.2.3 From c307459b9d1fcb8bbf3ea5a4162979532322ef77 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Oct 2020 10:38:13 -0700 Subject: fs/kernel_read_file: Remove FIRMWARE_PREALLOC_BUFFER enum FIRMWARE_PREALLOC_BUFFER is a "how", not a "what", and confuses the LSMs that are interested in filtering between types of things. The "how" should be an internal detail made uninteresting to the LSMs. Fixes: a098ecd2fa7d ("firmware: support loading into a pre-allocated buffer") Fixes: fd90bc559bfb ("ima: based on policy verify firmware signatures (pre-allocated buffer)") Fixes: 4f0496d8ffa3 ("ima: based on policy warn about loading firmware (pre-allocated buffer)") Signed-off-by: Kees Cook Reviewed-by: Mimi Zohar Reviewed-by: Luis Chamberlain Acked-by: Scott Branden Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20201002173828.2099543-2-keescook@chromium.org Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 1c5cff34d9f2..b2808acac46b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4035,7 +4035,7 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) { struct load_info info = { }; loff_t size; - void *hdr; + void *hdr = NULL; int err; err = may_init_module(); -- cgit v1.2.3 From b89999d004931ab2e5123611ace7dab77328f8d6 Mon Sep 17 00:00:00 2001 From: Scott Branden Date: Fri, 2 Oct 2020 10:38:15 -0700 Subject: fs/kernel_read_file: Split into separate include file Move kernel_read_file* out of linux/fs.h to its own linux/kernel_read_file.h include file. That header gets pulled in just about everywhere and doesn't really need functions not related to the general fs interface. Suggested-by: Christoph Hellwig Signed-off-by: Scott Branden Signed-off-by: Kees Cook Reviewed-by: Christoph Hellwig Reviewed-by: Mimi Zohar Reviewed-by: Luis Chamberlain Acked-by: Greg Kroah-Hartman Acked-by: James Morris Link: https://lore.kernel.org/r/20200706232309.12010-2-scott.branden@broadcom.com Link: https://lore.kernel.org/r/20201002173828.2099543-4-keescook@chromium.org Signed-off-by: Greg Kroah-Hartman --- kernel/kexec_file.c | 1 + kernel/module.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index ca40bef75a61..1cc82557f4c1 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include "kexec_internal.h" diff --git a/kernel/module.c b/kernel/module.c index b2808acac46b..4218abd272ee 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From f7a4f689bca6072492626938aad6dd2f32c5bf97 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Oct 2020 10:38:17 -0700 Subject: fs/kernel_read_file: Remove redundant size argument In preparation for refactoring kernel_read_file*(), remove the redundant "size" argument which is not needed: it can be included in the return code, with callers adjusted. (VFS reads already cannot be larger than INT_MAX.) Signed-off-by: Kees Cook Reviewed-by: Mimi Zohar Reviewed-by: Luis Chamberlain Reviewed-by: James Morris Acked-by: Scott Branden Link: https://lore.kernel.org/r/20201002173828.2099543-6-keescook@chromium.org Signed-off-by: Greg Kroah-Hartman --- kernel/kexec_file.c | 14 +++++++------- kernel/module.c | 7 +++---- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 1cc82557f4c1..b20cfde8a01d 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -220,13 +220,12 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, { int ret; void *ldata; - loff_t size; ret = kernel_read_file_from_fd(kernel_fd, &image->kernel_buf, - &size, INT_MAX, READING_KEXEC_IMAGE); - if (ret) + INT_MAX, READING_KEXEC_IMAGE); + if (ret < 0) return ret; - image->kernel_buf_len = size; + image->kernel_buf_len = ret; /* Call arch image probe handlers */ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, @@ -243,11 +242,12 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { ret = kernel_read_file_from_fd(initrd_fd, &image->initrd_buf, - &size, INT_MAX, + INT_MAX, READING_KEXEC_INITRAMFS); - if (ret) + if (ret < 0) goto out; - image->initrd_buf_len = size; + image->initrd_buf_len = ret; + ret = 0; } if (cmdline_len) { diff --git a/kernel/module.c b/kernel/module.c index 4218abd272ee..9faa6322f17b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4035,7 +4035,6 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) { struct load_info info = { }; - loff_t size; void *hdr = NULL; int err; @@ -4049,12 +4048,12 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) |MODULE_INIT_IGNORE_VERMAGIC)) return -EINVAL; - err = kernel_read_file_from_fd(fd, &hdr, &size, INT_MAX, + err = kernel_read_file_from_fd(fd, &hdr, INT_MAX, READING_MODULE); - if (err) + if (err < 0) return err; info.hdr = hdr; - info.len = size; + info.len = err; return load_module(&info, uargs, flags); } -- cgit v1.2.3 From 885352881f11f1f3113d8eb877786bcb6d720544 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Oct 2020 10:38:19 -0700 Subject: fs/kernel_read_file: Add file_size output argument In preparation for adding partial read support, add an optional output argument to kernel_read_file*() that reports the file size so callers can reason more easily about their reading progress. Signed-off-by: Kees Cook Reviewed-by: Mimi Zohar Reviewed-by: Luis Chamberlain Reviewed-by: James Morris Acked-by: Scott Branden Link: https://lore.kernel.org/r/20201002173828.2099543-8-keescook@chromium.org Signed-off-by: Greg Kroah-Hartman --- kernel/kexec_file.c | 4 ++-- kernel/module.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b20cfde8a01d..ee51c1028658 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -222,7 +222,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, void *ldata; ret = kernel_read_file_from_fd(kernel_fd, &image->kernel_buf, - INT_MAX, READING_KEXEC_IMAGE); + INT_MAX, NULL, READING_KEXEC_IMAGE); if (ret < 0) return ret; image->kernel_buf_len = ret; @@ -242,7 +242,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { ret = kernel_read_file_from_fd(initrd_fd, &image->initrd_buf, - INT_MAX, + INT_MAX, NULL, READING_KEXEC_INITRAMFS); if (ret < 0) goto out; diff --git a/kernel/module.c b/kernel/module.c index 9faa6322f17b..0f11eaed047e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4048,7 +4048,7 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) |MODULE_INIT_IGNORE_VERMAGIC)) return -EINVAL; - err = kernel_read_file_from_fd(fd, &hdr, INT_MAX, + err = kernel_read_file_from_fd(fd, &hdr, INT_MAX, NULL, READING_MODULE); if (err < 0) return err; -- cgit v1.2.3 From b64fcae74b6d6940d14243c963ab0089e8f0d82d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Oct 2020 10:38:20 -0700 Subject: LSM: Introduce kernel_post_load_data() hook There are a few places in the kernel where LSMs would like to have visibility into the contents of a kernel buffer that has been loaded or read. While security_kernel_post_read_file() (which includes the buffer) exists as a pairing for security_kernel_read_file(), no such hook exists to pair with security_kernel_load_data(). Earlier proposals for just using security_kernel_post_read_file() with a NULL file argument were rejected (i.e. "file" should always be valid for the security_..._file hooks, but it appears at least one case was left in the kernel during earlier refactoring. (This will be fixed in a subsequent patch.) Since not all cases of security_kernel_load_data() can have a single contiguous buffer made available to the LSM hook (e.g. kexec image segments are separately loaded), there needs to be a way for the LSM to reason about its expectations of the hook coverage. In order to handle this, add a "contents" argument to the "kernel_load_data" hook that indicates if the newly added "kernel_post_load_data" hook will be called with the full contents once loaded. That way, LSMs requiring full contents can choose to unilaterally reject "kernel_load_data" with contents=false (which is effectively the existing hook coverage), but when contents=true they can allow it and later evaluate the "kernel_post_load_data" hook once the buffer is loaded. With this change, LSMs can gain coverage over non-file-backed data loads (e.g. init_module(2) and firmware userspace helper), which will happen in subsequent patches. Additionally prepare IMA to start processing these cases. Signed-off-by: Kees Cook Reviewed-by: KP Singh Link: https://lore.kernel.org/r/20201002173828.2099543-9-keescook@chromium.org Signed-off-by: Greg Kroah-Hartman --- kernel/kexec.c | 2 +- kernel/module.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index f977786fe498..c82c6c06f051 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -205,7 +205,7 @@ static inline int kexec_load_check(unsigned long nr_segments, return -EPERM; /* Permit LSMs and IMA to fail the kexec */ - result = security_kernel_load_data(LOADING_KEXEC_IMAGE); + result = security_kernel_load_data(LOADING_KEXEC_IMAGE, false); if (result < 0) return result; diff --git a/kernel/module.c b/kernel/module.c index 0f11eaed047e..f47209e0fde6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3014,7 +3014,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, if (info->len < sizeof(*(info->hdr))) return -ENOEXEC; - err = security_kernel_load_data(LOADING_MODULE); + err = security_kernel_load_data(LOADING_MODULE, false); if (err) return err; -- cgit v1.2.3 From 38f901735a9e2b3d182fc04a1dfcdb0d3325ea5d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Oct 2020 10:38:22 -0700 Subject: module: Call security_kernel_post_load_data() Now that there is an API for checking loaded contents for modules loaded without a file, call into the LSM hooks. Signed-off-by: Kees Cook Reviewed-by: KP Singh Acked-by: Jessica Yu Link: https://lore.kernel.org/r/20201002173828.2099543-11-keescook@chromium.org Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f47209e0fde6..adfa21dd3842 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3014,7 +3014,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, if (info->len < sizeof(*(info->hdr))) return -ENOEXEC; - err = security_kernel_load_data(LOADING_MODULE, false); + err = security_kernel_load_data(LOADING_MODULE, true); if (err) return err; @@ -3024,11 +3024,17 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return -ENOMEM; if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) { - vfree(info->hdr); - return -EFAULT; + err = -EFAULT; + goto out; } - return 0; + err = security_kernel_post_load_data((char *)info->hdr, info->len, + LOADING_MODULE, "init_module"); +out: + if (err) + vfree(info->hdr); + + return err; } static void free_copy(struct load_info *info) -- cgit v1.2.3 From 0fa8e084648779eeb8929ae004301b3acf3bad84 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Oct 2020 10:38:25 -0700 Subject: fs/kernel_file_read: Add "offset" arg for partial reads To perform partial reads, callers of kernel_read_file*() must have a non-NULL file_size argument and a preallocated buffer. The new "offset" argument can then be used to seek to specific locations in the file to fill the buffer to, at most, "buf_size" per call. Where possible, the LSM hooks can report whether a full file has been read or not so that the contents can be reasoned about. Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20201002173828.2099543-14-keescook@chromium.org Signed-off-by: Greg Kroah-Hartman --- kernel/kexec_file.c | 4 ++-- kernel/module.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index ee51c1028658..84f7316792a7 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -221,7 +221,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, int ret; void *ldata; - ret = kernel_read_file_from_fd(kernel_fd, &image->kernel_buf, + ret = kernel_read_file_from_fd(kernel_fd, 0, &image->kernel_buf, INT_MAX, NULL, READING_KEXEC_IMAGE); if (ret < 0) return ret; @@ -241,7 +241,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, #endif /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { - ret = kernel_read_file_from_fd(initrd_fd, &image->initrd_buf, + ret = kernel_read_file_from_fd(initrd_fd, 0, &image->initrd_buf, INT_MAX, NULL, READING_KEXEC_INITRAMFS); if (ret < 0) diff --git a/kernel/module.c b/kernel/module.c index adfa21dd3842..9c578e44abe7 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4054,7 +4054,7 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) |MODULE_INIT_IGNORE_VERMAGIC)) return -EINVAL; - err = kernel_read_file_from_fd(fd, &hdr, INT_MAX, NULL, + err = kernel_read_file_from_fd(fd, 0, &hdr, INT_MAX, NULL, READING_MODULE); if (err < 0) return err; -- cgit v1.2.3 From 08d8c65e849d7579bafe2b03eab844d7860e3682 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 5 Oct 2020 13:26:05 +0530 Subject: cpufreq: Move traces and update to policy->cur to cpufreq core The cpufreq core handles the updates to policy->cur and recording of cpufreq trace events for all the governors except schedutil's fast switch case. Move that as well to cpufreq core for consistency and readability. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e39008242cf4..28f6d1ad608b 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -115,21 +115,11 @@ static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { struct cpufreq_policy *policy = sg_policy->policy; - int cpu; if (!sugov_update_next_freq(sg_policy, time, next_freq)) return; - next_freq = cpufreq_driver_fast_switch(policy, next_freq); - if (!next_freq) - return; - - policy->cur = next_freq; - - if (trace_cpu_frequency_enabled()) { - for_each_cpu(cpu, policy->cpus) - trace_cpu_frequency(next_freq, cpu); - } + cpufreq_driver_fast_switch(policy, next_freq); } static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, -- cgit v1.2.3 From 8731745e4821b6738189b7ac490cf042fa9f1fe0 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 2 Oct 2020 18:42:17 -0500 Subject: bpf, verifier: Use fallthrough pseudo-keyword Replace /* fallthrough */ comments with the new pseudo-keyword macro fallthrough [1]. [1] https://www.kernel.org/doc/html/v5.7/process/deprecated.html?highlight=fallthrough#implicit-switch-case-fall-through Signed-off-by: Gustavo A. R. Silva Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201002234217.GA12280@embeddedor --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d9dbf271ebab..01120acab09a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2673,7 +2673,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_CGROUP_SKB: if (t == BPF_WRITE) return false; - /* fallthrough */ + fallthrough; /* Program types with direct read + write access go here! */ case BPF_PROG_TYPE_SCHED_CLS: @@ -5475,7 +5475,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* smin_val represents the known value */ if (known && smin_val == 0 && opcode == BPF_ADD) break; - /* fall-through */ + fallthrough; case PTR_TO_PACKET_END: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: -- cgit v1.2.3 From 4e797e6ec79c0705791c14f3e60f38b01c78ea1d Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 2 Oct 2020 17:46:27 -0500 Subject: printk: Use fallthrough pseudo-keyword Replace /* FALL THRU */ comment with the new pseudo-keyword macro fallthrough[1]. [1] https://www.kernel.org/doc/html/v5.7/process/deprecated.html?highlight=fallthrough#implicit-switch-case-fall-through Signed-off-by: Gustavo A. R. Silva Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20201002224627.GA30475@embeddedor --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e4987ebe21b4..8c870ba76071 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1567,7 +1567,7 @@ int do_syslog(int type, char __user *buf, int len, int source) /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: clear = true; - /* FALL THRU */ + fallthrough; /* Read last kernel messages */ case SYSLOG_ACTION_READ_ALL: if (!buf || len < 0) -- cgit v1.2.3 From 10ed16662da9e28a33b6c991c36c6b323b03dd5b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Sep 2020 18:06:18 +0200 Subject: block: add a bdget_part helper All remaining callers of bdget() outside of fs/block_dev.c want to get a reference to the struct block_device for a given struct hd_struct. Add a helper just for that and then mark bdget static. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ec874ea04092..f1022945e346 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1827,13 +1827,11 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); + struct block_device *bdev = bdget_part(dev_to_part(dev)); struct request_queue *q; - struct block_device *bdev; struct blk_trace *bt; ssize_t ret = -ENXIO; - bdev = bdget(part_devt(p)); if (bdev == NULL) goto out; @@ -1875,7 +1873,6 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, { struct block_device *bdev; struct request_queue *q; - struct hd_struct *p; struct blk_trace *bt; u64 value; ssize_t ret = -EINVAL; @@ -1895,9 +1892,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, goto out; ret = -ENXIO; - - p = dev_to_part(dev); - bdev = bdget(part_devt(p)); + bdev = bdget_part(dev_to_part(dev)); if (bdev == NULL) goto out; -- cgit v1.2.3 From 428805c0c5e76ef643b1fbc893edfb636b3d8aef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Sep 2020 18:14:47 +0200 Subject: PM: hibernate: remove the bogus call to get_gendisk() in software_resume() get_gendisk grabs a reference on the disk and file operation, so this code will leak both of them while having absolutely no use for the gendisk itself. This effectively reverts commit 2df83fa4bce421f ("PM / Hibernate: Use get_gendisk to verify partition if resume_file is integer format") Signed-off-by: Christoph Hellwig Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e7aa57fb2fdc..7d0b99d2e696 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -948,17 +948,6 @@ static int software_resume(void) /* Check if the device is there */ swsusp_resume_device = name_to_dev_t(resume_file); - - /* - * name_to_dev_t is ineffective to verify parition if resume_file is in - * integer format. (e.g. major:minor) - */ - if (isdigit(resume_file[0]) && resume_wait) { - int partno; - while (!get_gendisk(swsusp_resume_device, &partno)) - msleep(10); - } - if (!swsusp_resume_device) { /* * Some device discovery might still be in progress; we need -- cgit v1.2.3 From 39d8f0d1026a990604770a658708f5845f7dbec0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 5 Oct 2020 09:58:38 -0700 Subject: bpf: Use raw_spin_trylock() for pcpu_freelist_push/pop in NMI Recent improvements in LOCKDEP highlighted a potential A-A deadlock with pcpu_freelist in NMI: ./tools/testing/selftests/bpf/test_progs -t stacktrace_build_id_nmi [ 18.984807] ================================ [ 18.984807] WARNING: inconsistent lock state [ 18.984808] 5.9.0-rc6-01771-g1466de1330e1 #2967 Not tainted [ 18.984809] -------------------------------- [ 18.984809] inconsistent {INITIAL USE} -> {IN-NMI} usage. [ 18.984810] test_progs/1990 [HC2[2]:SC0[0]:HE0:SE1] takes: [ 18.984810] ffffe8ffffc219c0 (&head->lock){....}-{2:2}, at: __pcpu_freelist_pop+0xe3/0x180 [ 18.984813] {INITIAL USE} state was registered at: [ 18.984814] lock_acquire+0x175/0x7c0 [ 18.984814] _raw_spin_lock+0x2c/0x40 [ 18.984815] __pcpu_freelist_pop+0xe3/0x180 [ 18.984815] pcpu_freelist_pop+0x31/0x40 [ 18.984816] htab_map_alloc+0xbbf/0xf40 [ 18.984816] __do_sys_bpf+0x5aa/0x3ed0 [ 18.984817] do_syscall_64+0x2d/0x40 [ 18.984818] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 18.984818] irq event stamp: 12 [...] [ 18.984822] other info that might help us debug this: [ 18.984823] Possible unsafe locking scenario: [ 18.984823] [ 18.984824] CPU0 [ 18.984824] ---- [ 18.984824] lock(&head->lock); [ 18.984826] [ 18.984826] lock(&head->lock); [ 18.984827] [ 18.984828] *** DEADLOCK *** [ 18.984828] [ 18.984829] 2 locks held by test_progs/1990: [...] [ 18.984838] [ 18.984838] dump_stack+0x9a/0xd0 [ 18.984839] lock_acquire+0x5c9/0x7c0 [ 18.984839] ? lock_release+0x6f0/0x6f0 [ 18.984840] ? __pcpu_freelist_pop+0xe3/0x180 [ 18.984840] _raw_spin_lock+0x2c/0x40 [ 18.984841] ? __pcpu_freelist_pop+0xe3/0x180 [ 18.984841] __pcpu_freelist_pop+0xe3/0x180 [ 18.984842] pcpu_freelist_pop+0x17/0x40 [ 18.984842] ? lock_release+0x6f0/0x6f0 [ 18.984843] __bpf_get_stackid+0x534/0xaf0 [ 18.984843] bpf_prog_1fd9e30e1438d3c5_oncpu+0x73/0x350 [ 18.984844] bpf_overflow_handler+0x12f/0x3f0 This is because pcpu_freelist_head.lock is accessed in both NMI and non-NMI context. Fix this issue by using raw_spin_trylock() in NMI. Since NMI interrupts non-NMI context, when NMI context tries to lock the raw_spinlock, non-NMI context of the same CPU may already have locked a lock and is blocked from unlocking the lock. For a system with N CPUs, there could be N NMIs at the same time, and they may block N non-NMI raw_spinlocks. This is tricky for pcpu_freelist_push(), where unlike _pop(), failing _push() means leaking memory. This issue is more likely to trigger in non-SMP system. Fix this issue with an extra list, pcpu_freelist.extralist. The extralist is primarily used to take _push() when raw_spin_trylock() failed on all the per CPU lists. It should be empty most of the time. The following table summarizes the behavior of pcpu_freelist in NMI and non-NMI: non-NMI pop(): use _lock(); check per CPU lists first; if all per CPU lists are empty, check extralist; if extralist is empty, return NULL. non-NMI push(): use _lock(); only push to per CPU lists. NMI pop(): use _trylock(); check per CPU lists first; if all per CPU lists are locked or empty, check extralist; if extralist is locked or empty, return NULL. NMI push(): use _trylock(); check per CPU lists first; if all per CPU lists are locked; try push to extralist; if extralist is also locked, keep trying on per CPU lists. Reported-by: Alexei Starovoitov Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201005165838.3735218-1-songliubraving@fb.com --- kernel/bpf/percpu_freelist.c | 101 ++++++++++++++++++++++++++++++++++++++++--- kernel/bpf/percpu_freelist.h | 1 + 2 files changed, 97 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index b367430e611c..3d897de89061 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -17,6 +17,8 @@ int pcpu_freelist_init(struct pcpu_freelist *s) raw_spin_lock_init(&head->lock); head->first = NULL; } + raw_spin_lock_init(&s->extralist.lock); + s->extralist.first = NULL; return 0; } @@ -40,12 +42,50 @@ static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, raw_spin_unlock(&head->lock); } +static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + if (!raw_spin_trylock(&s->extralist.lock)) + return false; + + pcpu_freelist_push_node(&s->extralist, node); + raw_spin_unlock(&s->extralist.lock); + return true; +} + +static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + int cpu, orig_cpu; + + orig_cpu = cpu = raw_smp_processor_id(); + while (1) { + struct pcpu_freelist_head *head; + + head = per_cpu_ptr(s->freelist, cpu); + if (raw_spin_trylock(&head->lock)) { + pcpu_freelist_push_node(head, node); + raw_spin_unlock(&head->lock); + return; + } + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = 0; + + /* cannot lock any per cpu lock, try extralist */ + if (cpu == orig_cpu && + pcpu_freelist_try_push_extra(s, node)) + return; + } +} + void __pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { - struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); - - ___pcpu_freelist_push(head, node); + if (in_nmi()) + ___pcpu_freelist_push_nmi(s, node); + else + ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node); } void pcpu_freelist_push(struct pcpu_freelist *s, @@ -81,7 +121,7 @@ again: } } -struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) +static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; @@ -102,8 +142,59 @@ struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) if (cpu >= nr_cpu_ids) cpu = 0; if (cpu == orig_cpu) - return NULL; + break; + } + + /* per cpu lists are all empty, try extralist */ + raw_spin_lock(&s->extralist.lock); + node = s->extralist.first; + if (node) + s->extralist.first = node->next; + raw_spin_unlock(&s->extralist.lock); + return node; +} + +static struct pcpu_freelist_node * +___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) +{ + struct pcpu_freelist_head *head; + struct pcpu_freelist_node *node; + int orig_cpu, cpu; + + orig_cpu = cpu = raw_smp_processor_id(); + while (1) { + head = per_cpu_ptr(s->freelist, cpu); + if (raw_spin_trylock(&head->lock)) { + node = head->first; + if (node) { + head->first = node->next; + raw_spin_unlock(&head->lock); + return node; + } + raw_spin_unlock(&head->lock); + } + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = 0; + if (cpu == orig_cpu) + break; } + + /* cannot pop from per cpu lists, try extralist */ + if (!raw_spin_trylock(&s->extralist.lock)) + return NULL; + node = s->extralist.first; + if (node) + s->extralist.first = node->next; + raw_spin_unlock(&s->extralist.lock); + return node; +} + +struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) +{ + if (in_nmi()) + return ___pcpu_freelist_pop_nmi(s); + return ___pcpu_freelist_pop(s); } struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h index fbf8a8a28979..3c76553cfe57 100644 --- a/kernel/bpf/percpu_freelist.h +++ b/kernel/bpf/percpu_freelist.h @@ -13,6 +13,7 @@ struct pcpu_freelist_head { struct pcpu_freelist { struct pcpu_freelist_head __percpu *freelist; + struct pcpu_freelist_head extralist; }; struct pcpu_freelist_node { -- cgit v1.2.3 From 4a4a56b4e76bbe3211c4f93d99c0c1543f5f3230 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 4 Oct 2020 17:14:03 -0500 Subject: tracing: Change STR_VAR_MAX_LEN 32 is too small for this value, and anyway it makes more sense to use MAX_FILTER_STR_VAL, as this is also the value used for variable-length __strings. Link: https://lkml.kernel.org/r/6adfd1668ac1fd8670bd58206944a762061a5559.1601848695.git.zanussi@kernel.org Tested-by: Axel Rasmussen Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 ++ kernel/trace/trace_synth.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1b2ef6490229..3b22e2122d1a 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1398,6 +1398,8 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) n_str = hist_data->n_field_var_str + hist_data->n_save_var_str; + BUILD_BUG_ON(STR_VAR_LEN_MAX & (sizeof(u64) - 1)); + size = STR_VAR_LEN_MAX; for (i = 0; i < n_str; i++) { diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h index ac35c45207c4..5166705d1556 100644 --- a/kernel/trace/trace_synth.h +++ b/kernel/trace/trace_synth.h @@ -7,7 +7,7 @@ #define SYNTH_SYSTEM "synthetic" #define SYNTH_FIELDS_MAX 32 -#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ +#define STR_VAR_LEN_MAX MAX_FILTER_STR_VAL /* must be multiple of sizeof(u64) */ struct synth_field { char *type; -- cgit v1.2.3 From 8fbeb52a598c7ab5aa603d6bb083b8a8d16d607a Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 4 Oct 2020 17:14:04 -0500 Subject: tracing: Fix parse_synth_field() error handling synth_field_size() returns either a positive size or an error (zero or a negative value). However, the existing code assumes the only error value is 0. It doesn't handle negative error codes, as it assigns directly to field->size (a size_t; unsigned), thereby interpreting the error code as a valid size instead. Do the test before assignment to field->size. [ axelrasmussen@google.com: changelog addition, first paragraph above ] Link: https://lkml.kernel.org/r/9b6946d9776b2eeb43227678158196de1c3c6e1d.1601848695.git.zanussi@kernel.org Fixes: 4b147936fa50 (tracing: Add support for 'synthetic' events) Reviewed-by: Masami Hiramatsu Tested-by: Axel Rasmussen Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_synth.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index a9cd7793f7ea..fa8a99828f41 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -465,6 +465,7 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, struct synth_field *field; const char *prefix = NULL, *field_type = argv[0], *field_name, *array; int len, ret = 0; + ssize_t size; if (field_type[0] == ';') field_type++; @@ -520,11 +521,12 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, field->type[len - 1] = '\0'; } - field->size = synth_field_size(field->type); - if (!field->size) { + size = synth_field_size(field->type); + if (size <= 0) { ret = -EINVAL; goto free; } + field->size = size; if (synth_field_is_string(field->type)) field->is_string = true; -- cgit v1.2.3 From 63a1e5de3006f4ad713e4d72bcb404d0301e853d Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 4 Oct 2020 17:14:05 -0500 Subject: tracing: Save normal string variables String variables created as field variables and save variables are already handled properly by having their values copied when set. The same isn't done for normal variables, but needs to be - simply saving a pointer to a string contained in an old event isn't sufficient, since that event's data may quickly become overwritten and therefore a string pointer to it could yield garbage. This change uses the same mechanism as field variables and simply appends the new strings to the existing per-element field_var_str[] array allocated for that purpose. Link: https://lkml.kernel.org/r/1c1a03798b02e67307412a0c719d1bfb69b13007.1601848695.git.zanussi@kernel.org Fixes: 02205a6752f2 (tracing: Add support for 'field variables') Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 3b22e2122d1a..812bc5f94b5c 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -147,6 +147,8 @@ struct hist_field { */ unsigned int var_ref_idx; bool read_once; + + unsigned int var_str_idx; }; static u64 hist_field_none(struct hist_field *field, @@ -349,6 +351,7 @@ struct hist_trigger_data { unsigned int n_keys; unsigned int n_fields; unsigned int n_vars; + unsigned int n_var_str; unsigned int key_size; struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX]; unsigned int n_sort_keys; @@ -1396,7 +1399,12 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) } } - n_str = hist_data->n_field_var_str + hist_data->n_save_var_str; + n_str = hist_data->n_field_var_str + hist_data->n_save_var_str + + hist_data->n_var_str; + if (n_str > SYNTH_FIELDS_MAX) { + hist_elt_data_free(elt_data); + return -EINVAL; + } BUILD_BUG_ON(STR_VAR_LEN_MAX & (sizeof(u64) - 1)); @@ -3653,6 +3661,7 @@ static int create_var_field(struct hist_trigger_data *hist_data, { struct trace_array *tr = hist_data->event_file->tr; unsigned long flags = 0; + int ret; if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) return -EINVAL; @@ -3667,7 +3676,12 @@ static int create_var_field(struct hist_trigger_data *hist_data, if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX)) return -EINVAL; - return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags); + ret = __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags); + + if (hist_data->fields[val_idx]->flags & HIST_FIELD_FL_STRING) + hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++; + + return ret; } static int create_val_fields(struct hist_trigger_data *hist_data, @@ -4394,6 +4408,22 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, hist_val = hist_field->fn(hist_field, elt, rbe, rec); if (hist_field->flags & HIST_FIELD_FL_VAR) { var_idx = hist_field->var.idx; + + if (hist_field->flags & HIST_FIELD_FL_STRING) { + unsigned int str_start, var_str_idx, idx; + char *str, *val_str; + + str_start = hist_data->n_field_var_str + + hist_data->n_save_var_str; + var_str_idx = hist_field->var_str_idx; + idx = str_start + var_str_idx; + + str = elt_data->field_var_str[idx]; + val_str = (char *)(uintptr_t)hist_val; + strscpy(str, val_str, STR_VAR_LEN_MAX); + + hist_val = (u64)(uintptr_t)str; + } tracing_map_set_var(elt, var_idx, hist_val); continue; } -- cgit v1.2.3 From bd82631d7ccdc894af2738e47abcba2cb6e7dea9 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 4 Oct 2020 17:14:06 -0500 Subject: tracing: Add support for dynamic strings to synthetic events Currently, sythetic events only support static string fields such as: # echo 'test_latency u64 lat; char somename[32]' > /sys/kernel/debug/tracing/synthetic_events Which is fine, but wastes a lot of space in the event. It also prevents the most commonly-defined strings in the existing trace events e.g. those defined using __string(), from being passed to synthetic events via the trace() action. With this change, synthetic events with dynamic fields can be defined: # echo 'test_latency u64 lat; char somename[]' > /sys/kernel/debug/tracing/synthetic_events And the trace() action can be used to generate events using either dynamic or static strings: # echo 'hist:keys=name:lat=common_timestamp.usecs-$ts0:onmatch(sys.event).test_latency($lat,name)' > /sys/kernel/debug/tracing/events The synthetic event dynamic strings are implemented in the same way as the existing __data_loc strings and appear as such in the format file. [ : added __set_synth_event_print_fmt() changes: I added the following to make it work with trace-cmd. Dynamic strings must have __get_str() for events in the print_fmt otherwise it can't be parsed correctly. ] Link: https://lore.kernel.org/r/cover.1601588066.git.zanussi@kernel.org Link: https://lkml.kernel.org/r/3ed35b6d0e390f5b94cb4a9ba1cc18f5982ab277.1601848695.git.zanussi@kernel.org Tested-by: Axel Rasmussen Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/synth_event_gen_test.c | 18 ++- kernel/trace/trace_events_hist.c | 9 ++ kernel/trace/trace_events_synth.c | 248 +++++++++++++++++++++++++++++++----- kernel/trace/trace_synth.h | 4 + 4 files changed, 241 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 7d56d621ffea..edd912cd14aa 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -242,9 +242,11 @@ static struct synth_field_desc create_synth_test_fields[] = { { .type = "pid_t", .name = "next_pid_field" }, { .type = "char[16]", .name = "next_comm_field" }, { .type = "u64", .name = "ts_ns" }, + { .type = "char[]", .name = "dynstring_field_1" }, { .type = "u64", .name = "ts_ms" }, { .type = "unsigned int", .name = "cpu" }, { .type = "char[64]", .name = "my_string_field" }, + { .type = "char[]", .name = "dynstring_field_2" }, { .type = "int", .name = "my_int_field" }, }; @@ -254,7 +256,7 @@ static struct synth_field_desc create_synth_test_fields[] = { */ static int __init test_create_synth_event(void) { - u64 vals[7]; + u64 vals[9]; int ret; /* Create the create_synth_test event with the fields above */ @@ -292,10 +294,12 @@ static int __init test_create_synth_event(void) vals[0] = 777; /* next_pid_field */ vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ - vals[3] = 1000; /* ts_ms */ - vals[4] = raw_smp_processor_id(); /* cpu */ - vals[5] = (u64)(long)"thneed"; /* my_string_field */ - vals[6] = 398; /* my_int_field */ + vals[3] = (u64)(long)"xrayspecs"; /* dynstring_field_1 */ + vals[4] = 1000; /* ts_ms */ + vals[5] = raw_smp_processor_id(); /* cpu */ + vals[6] = (u64)(long)"thneed"; /* my_string_field */ + vals[7] = (u64)(long)"kerplunk"; /* dynstring_field_2 */ + vals[8] = 398; /* my_int_field */ /* Now generate a create_synth_test event */ ret = synth_event_trace_array(create_synth_test, vals, ARRAY_SIZE(vals)); @@ -422,13 +426,15 @@ static int __init test_trace_synth_event(void) int ret; /* Trace some bogus values just for testing */ - ret = synth_event_trace(create_synth_test, 7, /* number of values */ + ret = synth_event_trace(create_synth_test, 9, /* number of values */ (u64)444, /* next_pid_field */ (u64)(long)"clackers", /* next_comm_field */ (u64)1000000, /* ts_ns */ + (u64)(long)"viewmaster",/* dynstring_field_1 */ (u64)1000, /* ts_ms */ (u64)raw_smp_processor_id(), /* cpu */ (u64)(long)"Thneed", /* my_string_field */ + (u64)(long)"yoyos", /* dynstring_field_2 */ (u64)999); /* my_int_field */ return ret; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 812bc5f94b5c..c74a7d157306 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3289,6 +3289,15 @@ static int check_synth_field(struct synth_event *event, field = event->fields[field_pos]; + /* + * A dynamic string synth field can accept static or + * dynamic. A static string synth field can only accept a + * same-sized static string, which is checked for later. + */ + if (strstr(hist_field->type, "char[") && field->is_string + && field->is_dynamic) + return 0; + if (strcmp(field->type, hist_field->type) != 0) { if (field->size != hist_field->size || field->is_signed != hist_field->is_signed) diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index fa8a99828f41..24bc6d61aa40 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -88,7 +88,7 @@ static int synth_event_define_fields(struct trace_event_call *call) event->fields[i]->offset = n_u64; - if (event->fields[i]->is_string) { + if (event->fields[i]->is_string && !event->fields[i]->is_dynamic) { offset += STR_VAR_LEN_MAX; n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { @@ -139,6 +139,9 @@ static int synth_field_string_size(char *type) if (len > 3) return -EINVAL; + if (len == 0) + return 0; /* variable-length string */ + strncpy(buf, start, len); buf[len] = '\0'; @@ -290,10 +293,25 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, /* parameter values */ if (se->fields[i]->is_string) { - trace_seq_printf(s, print_fmt, se->fields[i]->name, - (char *)&entry->fields[n_u64], - i == se->n_fields - 1 ? "" : " "); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + if (se->fields[i]->is_dynamic) { + u32 offset, data_offset; + char *str_field; + + offset = (u32)entry->fields[n_u64]; + data_offset = offset & 0xffff; + + str_field = (char *)entry + data_offset; + + trace_seq_printf(s, print_fmt, se->fields[i]->name, + str_field, + i == se->n_fields - 1 ? "" : " "); + n_u64++; + } else { + trace_seq_printf(s, print_fmt, se->fields[i]->name, + (char *)&entry->fields[n_u64], + i == se->n_fields - 1 ? "" : " "); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } } else { struct trace_print_flags __flags[] = { __def_gfpflag_names, {-1, NULL} }; @@ -325,16 +343,52 @@ static struct trace_event_functions synth_event_funcs = { .trace = print_synth_event }; +static unsigned int trace_string(struct synth_trace_event *entry, + struct synth_event *event, + char *str_val, + bool is_dynamic, + unsigned int data_size, + unsigned int *n_u64) +{ + unsigned int len = 0; + char *str_field; + + if (is_dynamic) { + u32 data_offset; + + data_offset = offsetof(typeof(*entry), fields); + data_offset += event->n_u64 * sizeof(u64); + data_offset += data_size; + + str_field = (char *)entry + data_offset; + + len = strlen(str_val) + 1; + strscpy(str_field, str_val, len); + + data_offset |= len << 16; + *(u32 *)&entry->fields[*n_u64] = data_offset; + + (*n_u64)++; + } else { + str_field = (char *)&entry->fields[*n_u64]; + + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + (*n_u64) += STR_VAR_LEN_MAX / sizeof(u64); + } + + return len; +} + static notrace void trace_event_raw_event_synth(void *__data, u64 *var_ref_vals, unsigned int *var_ref_idx) { + unsigned int i, n_u64, val_idx, len, data_size = 0; struct trace_event_file *trace_file = __data; struct synth_trace_event *entry; struct trace_event_buffer fbuffer; struct trace_buffer *buffer; struct synth_event *event; - unsigned int i, n_u64, val_idx; int fields_size = 0; event = trace_file->event_call->data; @@ -344,6 +398,18 @@ static notrace void trace_event_raw_event_synth(void *__data, fields_size = event->n_u64 * sizeof(u64); + for (i = 0; i < event->n_dynamic_fields; i++) { + unsigned int field_pos = event->dynamic_fields[i]->field_pos; + char *str_val; + + val_idx = var_ref_idx[field_pos]; + str_val = (char *)(long)var_ref_vals[val_idx]; + + len = strlen(str_val) + 1; + + fields_size += len; + } + /* * Avoid ring buffer recursion detection, as this event * is being performed within another event. @@ -360,10 +426,11 @@ static notrace void trace_event_raw_event_synth(void *__data, val_idx = var_ref_idx[i]; if (event->fields[i]->is_string) { char *str_val = (char *)(long)var_ref_vals[val_idx]; - char *str_field = (char *)&entry->fields[n_u64]; - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + len = trace_string(entry, event, str_val, + event->fields[i]->is_dynamic, + data_size, &n_u64); + data_size += len; /* only dynamic string increments */ } else { struct synth_field *field = event->fields[i]; u64 val = var_ref_vals[val_idx]; @@ -422,8 +489,13 @@ static int __set_synth_event_print_fmt(struct synth_event *event, pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); for (i = 0; i < event->n_fields; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, - ", REC->%s", event->fields[i]->name); + if (event->fields[i]->is_dynamic && + event->fields[i]->is_dynamic) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", __get_str(%s)", event->fields[i]->name); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", REC->%s", event->fields[i]->name); } #undef LEN_OR_ZERO @@ -522,9 +594,30 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, } size = synth_field_size(field->type); - if (size <= 0) { + if (size < 0) { ret = -EINVAL; goto free; + } else if (size == 0) { + if (synth_field_is_string(field->type)) { + char *type; + + type = kzalloc(sizeof("__data_loc ") + strlen(field->type) + 1, GFP_KERNEL); + if (!type) { + ret = -ENOMEM; + goto free; + } + + strcat(type, "__data_loc "); + strcat(type, field->type); + kfree(field->type); + field->type = type; + + field->is_dynamic = true; + size = sizeof(u64); + } else { + ret = -EINVAL; + goto free; + } } field->size = size; @@ -532,7 +625,6 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, field->is_string = true; field->is_signed = synth_field_signed(field->type); - out: return field; free: @@ -663,6 +755,7 @@ static void free_synth_event(struct synth_event *event) free_synth_field(event->fields[i]); kfree(event->fields); + kfree(event->dynamic_fields); kfree(event->name); kfree(event->class.system); free_synth_tracepoint(event->tp); @@ -673,8 +766,8 @@ static void free_synth_event(struct synth_event *event) static struct synth_event *alloc_synth_event(const char *name, int n_fields, struct synth_field **fields) { + unsigned int i, j, n_dynamic_fields = 0; struct synth_event *event; - unsigned int i; event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) { @@ -696,11 +789,33 @@ static struct synth_event *alloc_synth_event(const char *name, int n_fields, goto out; } + for (i = 0; i < n_fields; i++) + if (fields[i]->is_dynamic) + n_dynamic_fields++; + + if (n_dynamic_fields) { + event->dynamic_fields = kcalloc(n_dynamic_fields, + sizeof(*event->dynamic_fields), + GFP_KERNEL); + if (!event->dynamic_fields) { + free_synth_event(event); + event = ERR_PTR(-ENOMEM); + goto out; + } + } + dyn_event_init(&event->devent, &synth_event_ops); - for (i = 0; i < n_fields; i++) + for (i = 0, j = 0; i < n_fields; i++) { event->fields[i] = fields[i]; + if (fields[i]->is_dynamic) { + event->dynamic_fields[j] = fields[i]; + event->dynamic_fields[j]->field_pos = i; + event->dynamic_fields[j++] = fields[i]; + event->n_dynamic_fields++; + } + } event->n_fields = n_fields; out: return event; @@ -712,6 +827,10 @@ static int synth_event_check_arg_fn(void *data) int size; size = synth_field_size((char *)arg_pair->lhs); + if (size == 0) { + if (strstr((char *)arg_pair->lhs, "[")) + return 0; + } return size ? 0 : -EINVAL; } @@ -1200,10 +1319,9 @@ void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) EXPORT_SYMBOL_GPL(synth_event_cmd_init); static inline int -__synth_event_trace_start(struct trace_event_file *file, - struct synth_event_trace_state *trace_state) +__synth_event_trace_init(struct trace_event_file *file, + struct synth_event_trace_state *trace_state) { - int entry_size, fields_size = 0; int ret = 0; memset(trace_state, '\0', sizeof(*trace_state)); @@ -1225,8 +1343,20 @@ __synth_event_trace_start(struct trace_event_file *file, } trace_state->event = file->event_call->data; +out: + return ret; +} + +static inline int +__synth_event_trace_start(struct trace_event_file *file, + struct synth_event_trace_state *trace_state, + int dynamic_fields_size) +{ + int entry_size, fields_size = 0; + int ret = 0; fields_size = trace_state->event->n_u64 * sizeof(u64); + fields_size += dynamic_fields_size; /* * Avoid ring buffer recursion detection, as this event @@ -1243,7 +1373,7 @@ __synth_event_trace_start(struct trace_event_file *file, ring_buffer_nest_end(trace_state->buffer); ret = -EINVAL; } -out: + return ret; } @@ -1276,23 +1406,46 @@ __synth_event_trace_end(struct synth_event_trace_state *trace_state) */ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) { + unsigned int i, n_u64, len, data_size = 0; struct synth_event_trace_state state; - unsigned int i, n_u64; va_list args; int ret; - ret = __synth_event_trace_start(file, &state); + ret = __synth_event_trace_init(file, &state); if (ret) { if (ret == -ENOENT) ret = 0; /* just disabled, not really an error */ return ret; } + if (state.event->n_dynamic_fields) { + va_start(args, n_vals); + + for (i = 0; i < state.event->n_fields; i++) { + u64 val = va_arg(args, u64); + + if (state.event->fields[i]->is_string && + state.event->fields[i]->is_dynamic) { + char *str_val = (char *)(long)val; + + data_size += strlen(str_val) + 1; + } + } + + va_end(args); + } + + ret = __synth_event_trace_start(file, &state, data_size); + if (ret) + return ret; + if (n_vals != state.event->n_fields) { ret = -EINVAL; goto out; } + data_size = 0; + va_start(args, n_vals); for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { u64 val; @@ -1301,10 +1454,11 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) if (state.event->fields[i]->is_string) { char *str_val = (char *)(long)val; - char *str_field = (char *)&state.entry->fields[n_u64]; - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + len = trace_string(state.entry, state.event, str_val, + state.event->fields[i]->is_dynamic, + data_size, &n_u64); + data_size += len; /* only dynamic string increments */ } else { struct synth_field *field = state.event->fields[i]; @@ -1357,29 +1511,46 @@ EXPORT_SYMBOL_GPL(synth_event_trace); int synth_event_trace_array(struct trace_event_file *file, u64 *vals, unsigned int n_vals) { + unsigned int i, n_u64, field_pos, len, data_size = 0; struct synth_event_trace_state state; - unsigned int i, n_u64; + char *str_val; int ret; - ret = __synth_event_trace_start(file, &state); + ret = __synth_event_trace_init(file, &state); if (ret) { if (ret == -ENOENT) ret = 0; /* just disabled, not really an error */ return ret; } + if (state.event->n_dynamic_fields) { + for (i = 0; i < state.event->n_dynamic_fields; i++) { + field_pos = state.event->dynamic_fields[i]->field_pos; + str_val = (char *)(long)vals[field_pos]; + len = strlen(str_val) + 1; + data_size += len; + } + } + + ret = __synth_event_trace_start(file, &state, data_size); + if (ret) + return ret; + if (n_vals != state.event->n_fields) { ret = -EINVAL; goto out; } + data_size = 0; + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { if (state.event->fields[i]->is_string) { char *str_val = (char *)(long)vals[i]; - char *str_field = (char *)&state.entry->fields[n_u64]; - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + len = trace_string(state.entry, state.event, str_val, + state.event->fields[i]->is_dynamic, + data_size, &n_u64); + data_size += len; /* only dynamic string increments */ } else { struct synth_field *field = state.event->fields[i]; u64 val = vals[i]; @@ -1447,9 +1618,17 @@ int synth_event_trace_start(struct trace_event_file *file, if (!trace_state) return -EINVAL; - ret = __synth_event_trace_start(file, trace_state); - if (ret == -ENOENT) - ret = 0; /* just disabled, not really an error */ + ret = __synth_event_trace_init(file, trace_state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; + } + + if (trace_state->event->n_dynamic_fields) + return -ENOTSUPP; + + ret = __synth_event_trace_start(file, trace_state, 0); return ret; } @@ -1510,6 +1689,11 @@ static int __synth_event_add_val(const char *field_name, u64 val, char *str_val = (char *)(long)val; char *str_field; + if (field->is_dynamic) { /* add_val can't do dynamic strings */ + ret = -EINVAL; + goto out; + } + if (!str_val) { ret = -EINVAL; goto out; diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h index 5166705d1556..6e146b959dcd 100644 --- a/kernel/trace/trace_synth.h +++ b/kernel/trace/trace_synth.h @@ -16,6 +16,8 @@ struct synth_field { unsigned int offset; bool is_signed; bool is_string; + bool is_dynamic; + bool field_pos; }; struct synth_event { @@ -24,6 +26,8 @@ struct synth_event { char *name; struct synth_field **fields; unsigned int n_fields; + struct synth_field **dynamic_fields; + unsigned int n_dynamic_fields; unsigned int n_u64; struct trace_event_class class; struct trace_event_call call; -- cgit v1.2.3 From 0de327969b61a245e3a47b60009eae73fe513cef Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Wed, 30 Sep 2020 12:28:21 +0200 Subject: cma: decrease CMA_ALIGNMENT lower limit to 2 On an embedded system with a tiny (1 MiB) CMA area for video memory, and a simple enough video pipeline, we can decrease the CMA_ALIGNMENT by a factor of 2 to avoid wasting memory, as all the allocations for video buffers will be of the exact same size (dictated by the size of the screen). Signed-off-by: Paul Cercueil Signed-off-by: Christoph Hellwig --- kernel/dma/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index c5f717021f56..647996702bc9 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -177,7 +177,7 @@ endchoice config CMA_ALIGNMENT int "Maximum PAGE_SIZE order of alignment for contiguous buffers" - range 4 12 + range 2 12 default 8 help DMA mapping framework by default aligns all buffers to the smallest -- cgit v1.2.3 From 0a0f0d8be76dcd4390ff538e7060fda34db79717 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 22 Sep 2020 15:31:03 +0200 Subject: dma-mapping: split Split out all the bits that are purely for dma_map_ops implementations and related code into a new header so that they don't get pulled into all the drivers. That also means the architecture specific is not pulled in by any more, which leads to a missing includes that were pulled in by the x86 or arm versions in a few not overly portable drivers. Signed-off-by: Christoph Hellwig --- kernel/dma/coherent.c | 1 + kernel/dma/direct.c | 1 + kernel/dma/dummy.c | 2 +- kernel/dma/mapping.c | 2 +- kernel/dma/ops_helpers.c | 1 + kernel/dma/virt.c | 2 +- 6 files changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index c0685196fb6d..5b5b6c7ec7f2 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -8,6 +8,7 @@ #include #include #include +#include struct dma_coherent_mem { void *virt_base; diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 121a9c1969dd..8cf5689a8c40 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c index 6974b1bd7d0b..eacd4c5b10bf 100644 --- a/kernel/dma/dummy.c +++ b/kernel/dma/dummy.c @@ -2,7 +2,7 @@ /* * Dummy DMA ops that always fail. */ -#include +#include static int dma_dummy_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 9669550656a0..2e13e6d3903f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -8,7 +8,7 @@ #include /* for max_pfn */ #include #include -#include +#include #include #include #include diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index 5828e5e01b79..60d7b6cdfd8d 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -4,6 +4,7 @@ * the allocated memory contains normal pages in the direct kernel mapping. */ #include +#include #include /* diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c index 6986bf1fd668..59d32317dd57 100644 --- a/kernel/dma/virt.c +++ b/kernel/dma/virt.c @@ -4,7 +4,7 @@ */ #include #include -#include +#include #include static void *dma_virt_alloc(struct device *dev, size_t size, -- cgit v1.2.3 From 5af638931eb374aa0894d8343cee72f50307ef20 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Sep 2020 10:56:03 +0200 Subject: dma-contiguous: remove dev_set_cma_area dev_set_cma_area contains a trivial assignment. It has just three callers that all have a non-NULL device and depend on CONFIG_DMA_CMA, so remove the wrapper. Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index f4c150810fd2..95adcee972e8 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -359,14 +359,14 @@ void dma_free_contiguous(struct device *dev, struct page *page, size_t size) static int rmem_cma_device_init(struct reserved_mem *rmem, struct device *dev) { - dev_set_cma_area(dev, rmem->priv); + dev->cma_area = rmem->priv; return 0; } static void rmem_cma_device_release(struct reserved_mem *rmem, struct device *dev) { - dev_set_cma_area(dev, NULL); + dev->cma_area = NULL; } static const struct reserved_mem_ops rmem_cma_ops = { -- cgit v1.2.3 From 580a0cc9c3f662e0b10136bc8af1e672e472806f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Sep 2020 10:56:40 +0200 Subject: dma-contiguous: remove dma_contiguous_set_default dma_contiguous_set_default contains a trivial assignment, and has a single caller that is compiled if CONFIG_CMA_DMA is enabled. Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 95adcee972e8..bf05ec2256e1 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -407,7 +407,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) dma_contiguous_early_fixup(rmem->base, rmem->size); if (default_cma) - dma_contiguous_set_default(cma); + dma_contiguous_default_area = cma; rmem->ops = &rmem_cma_ops; rmem->priv = cma; -- cgit v1.2.3 From 0b1abd1fb7efafc25231c54a67c6fbb3d3127efd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Sep 2020 10:56:52 +0200 Subject: dma-mapping: merge into Merge dma-contiguous.h into dma-map-ops.h, after removing the comment describing the contiguous allocator into kernel/dma/contigous.c. Signed-off-by: Christoph Hellwig --- kernel/dma/Kconfig | 2 +- kernel/dma/contiguous.c | 30 +++++++++++++++++++++++++++++- kernel/dma/direct.c | 1 - kernel/dma/ops_helpers.c | 1 - kernel/dma/pool.c | 2 +- 5 files changed, 31 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 647996702bc9..c99de4a21458 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -118,7 +118,7 @@ config DMA_CMA You can disable CMA by specifying "cma=0" on the kernel's command line. - For more information see . + For more information see . If unsure, say "n". if DMA_CMA diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index bf05ec2256e1..6bfb763fff6f 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -5,6 +5,34 @@ * Written by: * Marek Szyprowski * Michal Nazarewicz + * + * Contiguous Memory Allocator + * + * The Contiguous Memory Allocator (CMA) makes it possible to + * allocate big contiguous chunks of memory after the system has + * booted. + * + * Why is it needed? + * + * Various devices on embedded systems have no scatter-getter and/or + * IO map support and require contiguous blocks of memory to + * operate. They include devices such as cameras, hardware video + * coders, etc. + * + * Such devices often require big memory buffers (a full HD frame + * is, for instance, more then 2 mega pixels large, i.e. more than 6 + * MB of memory), which makes mechanisms such as kmalloc() or + * alloc_page() ineffective. + * + * At the same time, a solution where a big memory region is + * reserved for a device is suboptimal since often more memory is + * reserved then strictly required and, moreover, the memory is + * inaccessible to page system even if device drivers don't use it. + * + * CMA tries to solve this issue by operating on memory regions + * where only movable pages can be allocated from. This way, kernel + * can use the memory for pagecache and when device driver requests + * it, allocated pages can be migrated. */ #define pr_fmt(fmt) "cma: " fmt @@ -21,7 +49,7 @@ #include #include #include -#include +#include #include #ifdef CONFIG_CMA_SIZE_MBYTES diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8cf5689a8c40..87697c86f0b8 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index 60d7b6cdfd8d..bc80fd2648bc 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -3,7 +3,6 @@ * Helpers for DMA ops implementations. These generally rely on the fact that * the allocated memory contains normal pages in the direct kernel mapping. */ -#include #include #include diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index fe11643ff9cc..c9fb5c3d8bd0 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -5,7 +5,7 @@ */ #include #include -#include +#include #include #include #include -- cgit v1.2.3 From 5db5d93089880c3cc9e83ca8bba68a5502e92dfe Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Sep 2020 11:04:43 +0200 Subject: dma-mapping: remove Just provide a weak default definition of dma_contiguous_early_fixup and let arm override it. Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 6bfb763fff6f..a2ee330a3749 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -44,7 +44,6 @@ #endif #include -#include #include #include @@ -212,6 +211,11 @@ void __init dma_contiguous_reserve(phys_addr_t limit) } } +void __weak +dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) +{ +} + /** * dma_contiguous_reserve_area() - reserve custom contiguous area * @size: Size of the reserved area (in bytes), -- cgit v1.2.3 From a1fd09e8e6ae35228ecc7c1e4bfff1fd725f78a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Sep 2020 10:12:44 +0200 Subject: dma-mapping: move dma-debug.h to kernel/dma/ Most of dma-debug.h is not required by anything outside of kernel/dma. Move the four declarations needed by dma-mappin.h or dma-ops providers into dma-mapping.h and dma-map-ops.h, and move the remainder of the file to kernel/dma/debug.h. Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 5 +-- kernel/dma/debug.h | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/dma/mapping.c | 1 + 3 files changed, 125 insertions(+), 3 deletions(-) create mode 100644 kernel/dma/debug.h (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 4211800d9f3e..14de1271463f 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -9,10 +9,9 @@ #include #include -#include +#include #include #include -#include #include #include #include @@ -24,8 +23,8 @@ #include #include #include - #include +#include "debug.h" #define HASH_SIZE 16384ULL #define HASH_FN_SHIFT 13 diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h new file mode 100644 index 000000000000..83643b3010b2 --- /dev/null +++ b/kernel/dma/debug.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2008 Advanced Micro Devices, Inc. + * + * Author: Joerg Roedel + */ + +#ifndef _KERNEL_DMA_DEBUG_H +#define _KERNEL_DMA_DEBUG_H + +#ifdef CONFIG_DMA_API_DEBUG +extern void debug_dma_map_page(struct device *dev, struct page *page, + size_t offset, size_t size, + int direction, dma_addr_t dma_addr); + +extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, int direction); + +extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, int mapped_ents, int direction); + +extern void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, + int nelems, int dir); + +extern void debug_dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t dma_addr, void *virt); + +extern void debug_dma_free_coherent(struct device *dev, size_t size, + void *virt, dma_addr_t addr); + +extern void debug_dma_map_resource(struct device *dev, phys_addr_t addr, + size_t size, int direction, + dma_addr_t dma_addr); + +extern void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, + size_t size, int direction); + +extern void debug_dma_sync_single_for_cpu(struct device *dev, + dma_addr_t dma_handle, size_t size, + int direction); + +extern void debug_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, + size_t size, int direction); + +extern void debug_dma_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sg, + int nelems, int direction); + +extern void debug_dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, + int nelems, int direction); +#else /* CONFIG_DMA_API_DEBUG */ +static inline void debug_dma_map_page(struct device *dev, struct page *page, + size_t offset, size_t size, + int direction, dma_addr_t dma_addr) +{ +} + +static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, int direction) +{ +} + +static inline void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, int mapped_ents, int direction) +{ +} + +static inline void debug_dma_unmap_sg(struct device *dev, + struct scatterlist *sglist, + int nelems, int dir) +{ +} + +static inline void debug_dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t dma_addr, void *virt) +{ +} + +static inline void debug_dma_free_coherent(struct device *dev, size_t size, + void *virt, dma_addr_t addr) +{ +} + +static inline void debug_dma_map_resource(struct device *dev, phys_addr_t addr, + size_t size, int direction, + dma_addr_t dma_addr) +{ +} + +static inline void debug_dma_unmap_resource(struct device *dev, + dma_addr_t dma_addr, size_t size, + int direction) +{ +} + +static inline void debug_dma_sync_single_for_cpu(struct device *dev, + dma_addr_t dma_handle, + size_t size, int direction) +{ +} + +static inline void debug_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, + size_t size, int direction) +{ +} + +static inline void debug_dma_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sg, + int nelems, int direction) +{ +} + +static inline void debug_dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, + int nelems, int direction) +{ +} +#endif /* CONFIG_DMA_API_DEBUG */ +#endif /* _KERNEL_DMA_DEBUG_H */ diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 2e13e6d3903f..335ba183e095 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -14,6 +14,7 @@ #include #include #include +#include "debug.h" /* * Managed DMA API -- cgit v1.2.3 From 19c65c3d30bb5a97170e425979d2e44ab2096c7d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 22 Sep 2020 15:34:22 +0200 Subject: dma-mapping: move large parts of to kernel/dma Most of the dma_direct symbols should only be used by direct.c and mapping.c, so move them to kernel/dma. In fact more of dma-direct.h should eventually move, but that will require more coordination with other subsystems. Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 2 +- kernel/dma/direct.h | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/dma/mapping.c | 2 +- 3 files changed, 121 insertions(+), 2 deletions(-) create mode 100644 kernel/dma/direct.h (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 87697c86f0b8..bf9f77623022 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -7,13 +7,13 @@ #include /* for max_pfn */ #include #include -#include #include #include #include #include #include #include +#include "direct.h" /* * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h new file mode 100644 index 000000000000..b98615578737 --- /dev/null +++ b/kernel/dma/direct.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 Christoph Hellwig. + * + * DMA operations that map physical memory directly without using an IOMMU. + */ +#ifndef _KERNEL_DMA_DIRECT_H +#define _KERNEL_DMA_DIRECT_H + +#include + +int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs); +bool dma_direct_can_mmap(struct device *dev); +int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs); +bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr); +int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, + enum dma_data_direction dir, unsigned long attrs); +size_t dma_direct_max_mapping_size(struct device *dev); + +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ + defined(CONFIG_SWIOTLB) +void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir); +#else +static inline void dma_direct_sync_sg_for_device(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ +} +#endif + +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ + defined(CONFIG_SWIOTLB) +void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs); +void dma_direct_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir); +#else +static inline void dma_direct_unmap_sg(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir, + unsigned long attrs) +{ +} +static inline void dma_direct_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ +} +#endif + +static inline void dma_direct_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + phys_addr_t paddr = dma_to_phys(dev, addr); + + if (unlikely(is_swiotlb_buffer(paddr))) + swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); + + if (!dev_is_dma_coherent(dev)) + arch_sync_dma_for_device(paddr, size, dir); +} + +static inline void dma_direct_sync_single_for_cpu(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + phys_addr_t paddr = dma_to_phys(dev, addr); + + if (!dev_is_dma_coherent(dev)) { + arch_sync_dma_for_cpu(paddr, size, dir); + arch_sync_dma_for_cpu_all(); + } + + if (unlikely(is_swiotlb_buffer(paddr))) + swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); + + if (dir == DMA_FROM_DEVICE) + arch_dma_mark_clean(paddr, size); +} + +static inline dma_addr_t dma_direct_map_page(struct device *dev, + struct page *page, unsigned long offset, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + phys_addr_t phys = page_to_phys(page) + offset; + dma_addr_t dma_addr = phys_to_dma(dev, phys); + + if (unlikely(swiotlb_force == SWIOTLB_FORCE)) + return swiotlb_map(dev, phys, size, dir, attrs); + + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + if (swiotlb_force != SWIOTLB_NO_FORCE) + return swiotlb_map(dev, phys, size, dir, attrs); + + dev_WARN_ONCE(dev, 1, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; + } + + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(phys, size, dir); + return dma_addr; +} + +static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + phys_addr_t phys = dma_to_phys(dev, addr); + + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_direct_sync_single_for_cpu(dev, addr, size, dir); + + if (unlikely(is_swiotlb_buffer(phys))) + swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); +} +#endif /* _KERNEL_DMA_DIRECT_H */ diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 335ba183e095..51bb8fa8eb89 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -7,7 +7,6 @@ */ #include /* for max_pfn */ #include -#include #include #include #include @@ -15,6 +14,7 @@ #include #include #include "debug.h" +#include "direct.h" /* * Managed DMA API -- cgit v1.2.3 From 9f4df96b8781e40d0cb0e32eb3d1f6d87375adf9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 22 Sep 2020 15:36:11 +0200 Subject: dma-mapping: merge into Move more nitty gritty DMA implementation details into the common internal header. Signed-off-by: Christoph Hellwig --- kernel/dma/ops_helpers.c | 1 - kernel/dma/pool.c | 1 - kernel/dma/swiotlb.c | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index bc80fd2648bc..910ae69cae77 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -4,7 +4,6 @@ * the allocated memory contains normal pages in the direct kernel mapping. */ #include -#include /* * Create scatter-list for the already allocated DMA buffer. diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index c9fb5c3d8bd0..d4637f72239b 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 4ea72d145cd2..2be1e8b34ae3 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include #include -- cgit v1.2.3 From 90428a8eb4947f9c7c905a178f3520dc7e2ee6d2 Mon Sep 17 00:00:00 2001 From: Maulik Shah Date: Mon, 28 Sep 2020 10:02:01 +0530 Subject: genirq/PM: Introduce IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND flag An interrupt that is disabled/masked but set for wakeup may still need to be able to wake up the system from sleep states like "suspend to RAM". To that effect, introduce the IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND flag. If the irqchip have this flag set, the irq PM code will enable/unmask the irqs that are marked for wakeup, but that are in a disabled state. On resume, such irqs will be restored back to their disabled state. Suggested-by: Thomas Gleixner Signed-off-by: Maulik Shah [maz: commit message fix-up] Signed-off-by: Marc Zyngier Tested-by: Stephen Boyd Reviewed-by: Thomas Gleixner Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/1601267524-20199-4-git-send-email-mkshah@codeaurora.org --- kernel/irq/debugfs.c | 3 +++ kernel/irq/pm.c | 34 ++++++++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index b95ff5d5f4bd..b6e10945e8be 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -57,6 +57,7 @@ static const struct irq_bit_descr irqchip_flags[] = { BIT_MASK_DESCR(IRQCHIP_EOI_THREADED), BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI), BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI), + BIT_MASK_DESCR(IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND), }; static void @@ -125,6 +126,8 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_DEFAULT_TRIGGER_SET), BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX), + + BIT_MASK_DESCR(IRQD_IRQ_ENABLED_ON_SUSPEND), }; static const struct irq_bit_descr irqdesc_states[] = { diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index c6c7e187ae74..ce0adb22ee96 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -69,12 +69,26 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) static bool suspend_device_irq(struct irq_desc *desc) { + unsigned long chipflags = irq_desc_get_chip(desc)->flags; + struct irq_data *irqd = &desc->irq_data; + if (!desc->action || irq_desc_is_chained(desc) || desc->no_suspend_depth) return false; - if (irqd_is_wakeup_set(&desc->irq_data)) { - irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); + if (irqd_is_wakeup_set(irqd)) { + irqd_set(irqd, IRQD_WAKEUP_ARMED); + + if ((chipflags & IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND) && + irqd_irq_disabled(irqd)) { + /* + * Interrupt marked for wakeup is in disabled state. + * Enable interrupt here to unmask/enable in irqchip + * to be able to resume with such interrupts. + */ + __enable_irq(desc); + irqd_set(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND); + } /* * We return true here to force the caller to issue * synchronize_irq(). We need to make sure that the @@ -93,7 +107,7 @@ static bool suspend_device_irq(struct irq_desc *desc) * chip level. The chip implementation indicates that with * IRQCHIP_MASK_ON_SUSPEND. */ - if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) + if (chipflags & IRQCHIP_MASK_ON_SUSPEND) mask_irq(desc); return true; } @@ -137,7 +151,19 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs); static void resume_irq(struct irq_desc *desc) { - irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); + struct irq_data *irqd = &desc->irq_data; + + irqd_clear(irqd, IRQD_WAKEUP_ARMED); + + if (irqd_is_enabled_on_suspend(irqd)) { + /* + * Interrupt marked for wakeup was enabled during suspend + * entry. Disable such interrupts to restore them back to + * original state. + */ + __disable_irq(desc); + irqd_clear(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND); + } if (desc->istate & IRQS_SUSPENDED) goto resume; -- cgit v1.2.3 From 4013c1496c49615d90d36b9d513eee8e369778e9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 5 Oct 2020 10:56:22 -0700 Subject: usermodehelper: reset umask to default before executing user process Kernel threads intentionally do CLONE_FS in order to follow any changes that 'init' does to set up the root directory (or cwd). It is admittedly a bit odd, but it avoids the situation where 'init' does some extensive setup to initialize the system environment, and then we execute a usermode helper program, and it uses the original FS setup from boot time that may be very limited and incomplete. [ Both Al Viro and Eric Biederman point out that 'pivot_root()' will follow the root regardless, since it fixes up other users of root (see chroot_fs_refs() for details), but overmounting root and doing a chroot() would not. ] However, Vegard Nossum noticed that the CLONE_FS not only means that we follow the root and current working directories, it also means we share umask with whatever init changed it to. That wasn't intentional. Just reset umask to the original default (0022) before actually starting the usermode helper program. Reported-by: Vegard Nossum Cc: Al Viro Acked-by: Eric W. Biederman Signed-off-by: Linus Torvalds --- kernel/umh.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/umh.c b/kernel/umh.c index fcf3ee803630..3f646613a9d3 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -71,6 +72,14 @@ static int call_usermodehelper_exec_async(void *data) flush_signal_handlers(current, 1); spin_unlock_irq(¤t->sighand->siglock); + /* + * Initial kernel threads share ther FS with init, in order to + * get the init root directory. But we've now created a new + * thread that is going to execve a user process and has its own + * 'struct fs_struct'. Reset umask to the default. + */ + current->fs->umask = 0022; + /* * Our parent (unbound workqueue) runs with elevated scheduling * priority. Avoid propagating that into the userspace child. -- cgit v1.2.3 From 08a89c28304ae74e4c7422f784359e41a37e3e7c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 26 Sep 2020 16:39:36 +0200 Subject: dma-direct check for highmem pages in dma_direct_alloc_pages Check for highmem pages from CMA, just like in the dma_direct_alloc path. Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index bf9f77623022..f5ecadd4e1c1 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -309,6 +309,17 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, page = __dma_direct_alloc_pages(dev, size, gfp); if (!page) return NULL; + if (PageHighMem(page)) { + /* + * Depending on the cma= arguments and per-arch setup + * dma_alloc_contiguous could return highmem pages. + * Without remapping there is no way to return them here, + * so log an error and fail. + */ + dev_info(dev, "Rejecting highmem page from CMA.\n"); + goto out_free_pages; + } + ret = page_address(page); if (force_dma_unencrypted(dev)) { if (set_memory_decrypted((unsigned long)ret, -- cgit v1.2.3 From 5b138c534fda57c2ebc1e6de72578aa1d70788a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 7 Oct 2020 11:06:09 +0200 Subject: dma-direct: factor out a dma_direct_alloc_from_pool helper This ensures dma_direct_alloc_pages will use the right gfp mask, as well as keeping the code for that common between the two allocators. Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index f5ecadd4e1c1..eddcb70251c9 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -147,6 +147,22 @@ again: return page; } +static void *dma_direct_alloc_from_pool(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) +{ + struct page *page; + u64 phys_mask; + void *ret; + + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_mask); + page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok); + if (!page) + return NULL; + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); + return ret; +} + void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { @@ -163,17 +179,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, if (attrs & DMA_ATTR_NO_WARN) gfp |= __GFP_NOWARN; - if (dma_should_alloc_from_pool(dev, gfp, attrs)) { - u64 phys_mask; - - gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_mask); - page = dma_alloc_from_pool(dev, size, &ret, gfp, - dma_coherent_ok); - if (!page) - return NULL; - goto done; - } + if (dma_should_alloc_from_pool(dev, gfp, attrs)) + return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); /* we always manually zero the memory once we are done */ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); @@ -298,13 +305,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; - if (dma_should_alloc_from_pool(dev, gfp, 0)) { - page = dma_alloc_from_pool(dev, size, &ret, gfp, - dma_coherent_ok); - if (!page) - return NULL; - goto done; - } + if (dma_should_alloc_from_pool(dev, gfp, 0)) + return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); page = __dma_direct_alloc_pages(dev, size, gfp); if (!page) @@ -327,7 +329,6 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, goto out_free_pages; } memset(ret, 0, size); -done: *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return page; out_free_pages: -- cgit v1.2.3 From 849facea92fa68d9292f9b06d7c4ee9e7a06b8dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 7 Oct 2020 11:04:08 +0200 Subject: dma-direct: simplify the DMA_ATTR_NO_KERNEL_MAPPING handling Use and entirely separate code path for the DMA_ATTR_NO_KERNEL_MAPPING path. This avoids any confusion about the ret type, and avoids lots of attr checks and helpers that can be significantly simplified now. It also ensures that common handling is applied to architetures still using the arch alloc/free hooks. Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 100 ++++++++++++++++++++-------------------------------- 1 file changed, 39 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index eddcb70251c9..b92d08e65999 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -75,39 +75,6 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } -/* - * Decrypting memory is allowed to block, so if this device requires - * unencrypted memory it must come from atomic pools. - */ -static inline bool dma_should_alloc_from_pool(struct device *dev, gfp_t gfp, - unsigned long attrs) -{ - if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) - return false; - if (gfpflags_allow_blocking(gfp)) - return false; - if (force_dma_unencrypted(dev)) - return true; - if (!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) - return false; - if (dma_alloc_need_uncached(dev, attrs)) - return true; - return false; -} - -static inline bool dma_should_free_from_pool(struct device *dev, - unsigned long attrs) -{ - if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) - return true; - if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && - !force_dma_unencrypted(dev)) - return false; - if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) - return true; - return false; -} - static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp) { @@ -170,35 +137,45 @@ void *dma_direct_alloc(struct device *dev, size_t size, void *ret; int err; - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs)) - return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); - size = PAGE_ALIGN(size); if (attrs & DMA_ATTR_NO_WARN) gfp |= __GFP_NOWARN; - if (dma_should_alloc_from_pool(dev, gfp, attrs)) - return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); - - /* we always manually zero the memory once we are done */ - page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); - if (!page) - return NULL; - if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev)) { + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); + if (!page) + return NULL; /* remove any dirty cache lines on the kernel alias */ if (!PageHighMem(page)) arch_dma_prep_coherent(page, size); + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); /* return the page pointer as the opaque cookie */ - ret = page; - goto done; + return page; } + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + !dev_is_dma_coherent(dev)) + return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); + + /* + * Remapping or decrypting memory may block. If either is required and + * we can't block, allocate the memory from the atomic pools. + */ + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && + !gfpflags_allow_blocking(gfp) && + (force_dma_unencrypted(dev) || + (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev)))) + return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); + + /* we always manually zero the memory once we are done */ + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); + if (!page) + return NULL; + if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs)) || + !dev_is_dma_coherent(dev)) || (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) { /* remove any dirty cache lines on the kernel alias */ arch_dma_prep_coherent(page, size); @@ -241,7 +218,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, memset(ret, 0, size); if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - dma_alloc_need_uncached(dev, attrs)) { + !dev_is_dma_coherent(dev)) { arch_dma_prep_coherent(page, size); ret = arch_dma_set_uncached(ret, size); if (IS_ERR(ret)) @@ -269,25 +246,25 @@ void dma_direct_free(struct device *dev, size_t size, { unsigned int page_order = get_order(size); + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && + !force_dma_unencrypted(dev)) { + /* cpu_addr is a struct page cookie, not a kernel address */ + dma_free_contiguous(dev, cpu_addr, size); + return; + } + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs)) { + !dev_is_dma_coherent(dev)) { arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); return; } /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ - if (dma_should_free_from_pool(dev, attrs) && + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) return; - if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && - !force_dma_unencrypted(dev)) { - /* cpu_addr is a struct page cookie, not a kernel address */ - dma_free_contiguous(dev, cpu_addr, size); - return; - } - if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); @@ -305,7 +282,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; - if (dma_should_alloc_from_pool(dev, gfp, 0)) + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && + force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); page = __dma_direct_alloc_pages(dev, size, gfp); @@ -344,7 +322,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *vaddr = page_address(page); /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ - if (dma_should_free_from_pool(dev, 0) && + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && dma_free_from_pool(dev, vaddr, size)) return; -- cgit v1.2.3 From 86836bac55f971995499978df4e62115d7baf5ef Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 6 Oct 2020 14:01:31 +0200 Subject: cpufreq: schedutil: Simplify sugov_fast_switch() Drop a redundant local variable definition from sugov_fast_switch() and rearrange the code in there to avoid the redundant logical negation. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- kernel/sched/cpufreq_schedutil.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 28f6d1ad608b..5ae7b4e6e8d6 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -114,12 +114,8 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - struct cpufreq_policy *policy = sg_policy->policy; - - if (!sugov_update_next_freq(sg_policy, time, next_freq)) - return; - - cpufreq_driver_fast_switch(policy, next_freq); + if (sugov_update_next_freq(sg_policy, time, next_freq)) + cpufreq_driver_fast_switch(sg_policy->policy, next_freq); } static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, -- cgit v1.2.3 From 49a2a4d4163f95e36e2bec50e06813b73401bc76 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 6 Oct 2020 19:16:13 -0700 Subject: kernel/bpf/verifier: Fix build when NET is not enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix build errors in kernel/bpf/verifier.c when CONFIG_NET is not enabled. ../kernel/bpf/verifier.c:3995:13: error: ‘btf_sock_ids’ undeclared here (not in a function); did you mean ‘bpf_sock_ops’? .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], ../kernel/bpf/verifier.c:3995:26: error: ‘BTF_SOCK_TYPE_SOCK_COMMON’ undeclared here (not in a function); did you mean ‘PTR_TO_SOCK_COMMON’? .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], Fixes: 1df8f55a37bd ("bpf: Enable bpf_skc_to_* sock casting helper to networking prog type") Signed-off-by: Randy Dunlap Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201007021613.13646-1-rdunlap@infradead.org --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 01120acab09a..62b804651a48 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3984,6 +3984,7 @@ static const struct bpf_reg_types sock_types = { }, }; +#ifdef CONFIG_NET static const struct bpf_reg_types btf_id_sock_common_types = { .types = { PTR_TO_SOCK_COMMON, @@ -3994,6 +3995,7 @@ static const struct bpf_reg_types btf_id_sock_common_types = { }, .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], }; +#endif static const struct bpf_reg_types mem_types = { .types = { @@ -4037,7 +4039,9 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_CTX] = &context_types, [ARG_PTR_TO_CTX_OR_NULL] = &context_types, [ARG_PTR_TO_SOCK_COMMON] = &sock_types, +#ifdef CONFIG_NET [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types, +#endif [ARG_PTR_TO_SOCKET] = &fullsock_types, [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types, [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, -- cgit v1.2.3 From ebfb4d40ed9dd45d2d6780f58d0cd9336d647890 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 6 Oct 2020 23:29:33 -0700 Subject: bpf: Fix build failure for kernel/trace/bpf_trace.c with CONFIG_NET=n When CONFIG_NET is not defined, I hit the following build error: kernel/trace/bpf_trace.o:(.rodata+0x110): undefined reference to `bpf_prog_test_run_raw_tp' Commit 1b4d60ec162f ("bpf: Enable BPF_PROG_TEST_RUN for raw_tracepoint") added test_run support for raw_tracepoint in /kernel/trace/bpf_trace.c. But the test_run function bpf_prog_test_run_raw_tp is defined in net/bpf/test_run.c, only available with CONFIG_NET=y. Adding a CONFIG_NET guard for .test_run = bpf_prog_test_run_raw_tp; fixed the above build issue. Fixes: 1b4d60ec162f ("bpf: Enable BPF_PROG_TEST_RUN for raw_tracepoint") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201007062933.3425899-1-yhs@fb.com --- kernel/trace/bpf_trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a136a6a63a71..a2a4535b6277 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1780,7 +1780,9 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { }; const struct bpf_prog_ops raw_tracepoint_prog_ops = { +#ifdef CONFIG_NET .test_run = bpf_prog_test_run_raw_tp, +#endif }; const struct bpf_verifier_ops tracing_verifier_ops = { -- cgit v1.2.3 From 5b9fbeb75b6a98955f628e205ac26689bcb1383e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 7 Oct 2020 15:48:58 +0200 Subject: bpf: Fix scalar32_min_max_or bounds tracking Simon reported an issue with the current scalar32_min_max_or() implementation. That is, compared to the other 32 bit subreg tracking functions, the code in scalar32_min_max_or() stands out that it's using the 64 bit registers instead of 32 bit ones. This leads to bounds tracking issues, for example: [...] 8: R0=map_value(id=0,off=0,ks=4,vs=48,imm=0) R10=fp0 fp-8=mmmmmmmm 8: (79) r1 = *(u64 *)(r0 +0) R0=map_value(id=0,off=0,ks=4,vs=48,imm=0) R10=fp0 fp-8=mmmmmmmm 9: R0=map_value(id=0,off=0,ks=4,vs=48,imm=0) R1_w=inv(id=0) R10=fp0 fp-8=mmmmmmmm 9: (b7) r0 = 1 10: R0_w=inv1 R1_w=inv(id=0) R10=fp0 fp-8=mmmmmmmm 10: (18) r2 = 0x600000002 12: R0_w=inv1 R1_w=inv(id=0) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 12: (ad) if r1 < r2 goto pc+1 R0_w=inv1 R1_w=inv(id=0,umin_value=25769803778) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 13: R0_w=inv1 R1_w=inv(id=0,umin_value=25769803778) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 13: (95) exit 14: R0_w=inv1 R1_w=inv(id=0,umax_value=25769803777,var_off=(0x0; 0x7ffffffff)) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 14: (25) if r1 > 0x0 goto pc+1 R0_w=inv1 R1_w=inv(id=0,umax_value=0,var_off=(0x0; 0x7fffffff),u32_max_value=2147483647) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 15: R0_w=inv1 R1_w=inv(id=0,umax_value=0,var_off=(0x0; 0x7fffffff),u32_max_value=2147483647) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 15: (95) exit 16: R0_w=inv1 R1_w=inv(id=0,umin_value=1,umax_value=25769803777,var_off=(0x0; 0x77fffffff),u32_max_value=2147483647) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 16: (47) r1 |= 0 17: R0_w=inv1 R1_w=inv(id=0,umin_value=1,umax_value=32212254719,var_off=(0x1; 0x700000000),s32_max_value=1,u32_max_value=1) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm [...] The bound tests on the map value force the upper unsigned bound to be 25769803777 in 64 bit (0b11000000000000000000000000000000001) and then lower one to be 1. By using OR they are truncated and thus result in the range [1,1] for the 32 bit reg tracker. This is incorrect given the only thing we know is that the value must be positive and thus 2147483647 (0b1111111111111111111111111111111) at max for the subregs. Fix it by using the {u,s}32_{min,max}_value vars instead. This also makes sense, for example, for the case where we update dst_reg->s32_{min,max}_value in the else branch we need to use the newly computed dst_reg->u32_{min,max}_value as we know that these are positive. Previously, in the else branch the 64 bit values of umin_value=1 and umax_value=32212254719 were used and latter got truncated to be 1 as upper bound there. After the fix the subreg range is now correct: [...] 8: R0=map_value(id=0,off=0,ks=4,vs=48,imm=0) R10=fp0 fp-8=mmmmmmmm 8: (79) r1 = *(u64 *)(r0 +0) R0=map_value(id=0,off=0,ks=4,vs=48,imm=0) R10=fp0 fp-8=mmmmmmmm 9: R0=map_value(id=0,off=0,ks=4,vs=48,imm=0) R1_w=inv(id=0) R10=fp0 fp-8=mmmmmmmm 9: (b7) r0 = 1 10: R0_w=inv1 R1_w=inv(id=0) R10=fp0 fp-8=mmmmmmmm 10: (18) r2 = 0x600000002 12: R0_w=inv1 R1_w=inv(id=0) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 12: (ad) if r1 < r2 goto pc+1 R0_w=inv1 R1_w=inv(id=0,umin_value=25769803778) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 13: R0_w=inv1 R1_w=inv(id=0,umin_value=25769803778) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 13: (95) exit 14: R0_w=inv1 R1_w=inv(id=0,umax_value=25769803777,var_off=(0x0; 0x7ffffffff)) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 14: (25) if r1 > 0x0 goto pc+1 R0_w=inv1 R1_w=inv(id=0,umax_value=0,var_off=(0x0; 0x7fffffff),u32_max_value=2147483647) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 15: R0_w=inv1 R1_w=inv(id=0,umax_value=0,var_off=(0x0; 0x7fffffff),u32_max_value=2147483647) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 15: (95) exit 16: R0_w=inv1 R1_w=inv(id=0,umin_value=1,umax_value=25769803777,var_off=(0x0; 0x77fffffff),u32_max_value=2147483647) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm 16: (47) r1 |= 0 17: R0_w=inv1 R1_w=inv(id=0,umin_value=1,umax_value=32212254719,var_off=(0x0; 0x77fffffff),u32_max_value=2147483647) R2_w=inv25769803778 R10=fp0 fp-8=mmmmmmmm [...] Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Reported-by: Simon Scannell Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 47e74f09fa37..fba52d9ec8fc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5667,8 +5667,8 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - s32 smin_val = src_reg->smin_value; - u32 umin_val = src_reg->umin_value; + s32 smin_val = src_reg->s32_min_value; + u32 umin_val = src_reg->u32_min_value; /* Assuming scalar64_min_max_or will be called so it is safe * to skip updating register for known case. @@ -5691,8 +5691,8 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, /* ORing two positives gives a positive, so safe to * cast result into s64. */ - dst_reg->s32_min_value = dst_reg->umin_value; - dst_reg->s32_max_value = dst_reg->umax_value; + dst_reg->s32_min_value = dst_reg->u32_min_value; + dst_reg->s32_max_value = dst_reg->u32_max_value; } } -- cgit v1.2.3 From 1bc36bd4a8557285870b34cfec7910871049e93e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 4 Oct 2020 17:14:07 -0500 Subject: tracing: Add README information for synthetic_events file Add an entry with a basic description of events/synthetic_events along with a simple example. Link: https://lkml.kernel.org/r/3c7f178cf95aaeebc01eda7d95600dd937233eb7.1601848695.git.zanussi@kernel.org Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3f2533adae72..73fd0e0c0f39 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5249,7 +5249,12 @@ static const char readme_msg[] = "\t trace(,param list) - generate synthetic event\n" "\t save(field,...) - save current event fields\n" #ifdef CONFIG_TRACER_SNAPSHOT - "\t snapshot() - snapshot the trace buffer\n" + "\t snapshot() - snapshot the trace buffer\n\n" +#endif +#ifdef CONFIG_SYNTH_EVENTS + " events/synthetic_events\t- Create/append/remove/show synthetic events\n" + "\t Write into this file to define/undefine new synthetic events.\n" + "\t example: echo 'myevent u64 lat; char name[]' >> synthetic_events\n" #endif #endif ; -- cgit v1.2.3 From 8db4d6bfbbf9206567fd529dc73dc058b3929db0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 4 Oct 2020 17:14:09 -0500 Subject: tracing: Change synthetic event string format to limit printed length Change the format for printing synthetic field strings to limit the length of the string printed even if it's not correctly terminated. Link: https://lore.kernel.org/r/20201002210036.0200371b@oasis.local.home Link: https://lkml.kernel.org/r/b6bdb34e70d970e8026daa3503db6b8e5cdad524.1601848695.git.zanussi@kernel.org Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_synth.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 24bc6d61aa40..742ce5f62d6d 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -234,7 +234,7 @@ static const char *synth_field_fmt(char *type) else if (strcmp(type, "gfp_t") == 0) fmt = "%x"; else if (synth_field_is_string(type)) - fmt = "%s"; + fmt = "%.*s"; return fmt; } @@ -303,11 +303,13 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, str_field = (char *)entry + data_offset; trace_seq_printf(s, print_fmt, se->fields[i]->name, + STR_VAR_LEN_MAX, str_field, i == se->n_fields - 1 ? "" : " "); n_u64++; } else { trace_seq_printf(s, print_fmt, se->fields[i]->name, + STR_VAR_LEN_MAX, (char *)&entry->fields[n_u64], i == se->n_fields - 1 ? "" : " "); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); -- cgit v1.2.3 From 59e65b3358f44d4d0134eca3b6c269f359f21cd5 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 31 Aug 2020 11:11:00 +0800 Subject: ftrace: Use fls() to get the bits for dup_hash() The effect here is to get the number of bits, lets use fls() to do this job. Link: https://lkml.kernel.org/r/20200831031104.23322-3-richard.weiyang@linux.alibaba.com Signed-off-by: Wei Yang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 123d520b9261..5633d37d8806 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1370,8 +1370,9 @@ static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size) /* * Make the hash size about 1/2 the # found */ - for (size /= 2; size; size >>= 1) - bits++; + bits = fls(size); + if (bits) + bits--; /* Don't allocate too much */ if (bits > FTRACE_HASH_MAX_BITS) -- cgit v1.2.3 From be49313273211c47d1b317d6b2dbe02637c2794c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 5 Oct 2020 20:21:14 -0400 Subject: ftrace: Simplify the hash calculation No need to add a check to subtract the number of bits if bits is zero after fls(). Just divide the size by two before calling it. This does give the same answer for size of 0 and 1, but that's fine. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5633d37d8806..c51a91aea1fd 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1368,11 +1368,10 @@ static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size) int i; /* - * Make the hash size about 1/2 the # found + * Use around half the size (max bit of it), but + * a minimum of 2 is fine (as size of 0 or 1 both give 1 for bits). */ - bits = fls(size); - if (bits) - bits--; + bits = fls(size / 2); /* Don't allocate too much */ if (bits > FTRACE_HASH_MAX_BITS) -- cgit v1.2.3 From b40c6eabfcd409e022fcb377ac268e3ef9446fde Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 31 Aug 2020 11:11:02 +0800 Subject: ftrace: Simplify the calculation of page number for ftrace_page->records Based on the following two reasones, we could simplify the calculation: - If the number after roundup count is not power of 2, we would definitely have more than 1 empty page with a higher order. - get_count_order() just return current order, so one lower order could meet the requirement. The calculation could be simplified by lower one order level when pages are not power of 2. Link: https://lkml.kernel.org/r/20200831031104.23322-5-richard.weiyang@linux.alibaba.com Signed-off-by: Wei Yang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c51a91aea1fd..c3be18b4f27b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3129,18 +3129,19 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) static int ftrace_allocate_records(struct ftrace_page *pg, int count) { int order; - int cnt; + int pages, cnt; if (WARN_ON(!count)) return -EINVAL; - order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); + pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE); + order = get_count_order(pages); /* * We want to fill as much as possible. No more than a page * may be empty. */ - while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE) + if (!is_power_of_2(pages)) order--; again: -- cgit v1.2.3 From 7ba031e8b74c6aa156a0d9867dc13cf817d52047 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 5 Oct 2020 20:37:41 -0400 Subject: ftrace: Format variable declarations of ftrace_allocate_records I hate when unrelated variables are declared on the same line. Split them. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c3be18b4f27b..4833b6a82ce7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3129,7 +3129,8 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) static int ftrace_allocate_records(struct ftrace_page *pg, int count) { int order; - int pages, cnt; + int pages; + int cnt; if (WARN_ON(!count)) return -EINVAL; -- cgit v1.2.3 From 43aa422c0c07135236bd91cbb45b048fd85e73b5 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Wed, 30 Sep 2020 19:43:03 +0100 Subject: tracing: Remove a pointless assignment The variable 'len' has been assigned a value but is not used after that. So, remove the assignement. Link: https://lkml.kernel.org/r/20200930184303.22896-1-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 73fd0e0c0f39..0806fa9f2815 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6667,7 +6667,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, written = -EFAULT; } else written = cnt; - len = cnt; if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) { /* do not add \n before testing triggers, but add \0 */ -- cgit v1.2.3 From 848183553e431e6e9c2ea2f72421a7a1bbc6532e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 7 Oct 2020 10:34:34 -0400 Subject: tracing: Fix synthetic print fmt check for use of __get_str() A cut and paste error had the check to use __get_str() test "is_dynamic" twice, instead of checking "is_string && is_dynamic". Link: https://lore.kernel.org/r/d34dccd5-96ba-a2d9-46ea-de8807525deb@canonical.com Reported-by: Colin Ian King Acked-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_synth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 742ce5f62d6d..3b2dcc42b8ee 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -491,7 +491,7 @@ static int __set_synth_event_print_fmt(struct synth_event *event, pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); for (i = 0; i < event->n_fields; i++) { - if (event->fields[i]->is_dynamic && + if (event->fields[i]->is_string && event->fields[i]->is_dynamic) pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(%s)", event->fields[i]->name); -- cgit v1.2.3 From dfe719fef03d752f1682fa8aeddf30ba501c8555 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 5 Oct 2020 03:44:01 +0200 Subject: seccomp: Make duplicate listener detection non-racy Currently, init_listener() tries to prevent adding a filter with SECCOMP_FILTER_FLAG_NEW_LISTENER if one of the existing filters already has a listener. However, this check happens without holding any lock that would prevent another thread from concurrently installing a new filter (potentially with a listener) on top of the ones we already have. Theoretically, this is also a data race: The plain load from current->seccomp.filter can race with concurrent writes to the same location. Fix it by moving the check into the region that holds the siglock to guard against concurrent TSYNC. (The "Fixes" tag points to the commit that introduced the theoretical data race; concurrent installation of another filter with TSYNC only became possible later, in commit 51891498f2da ("seccomp: allow TSYNC and USER_NOTIF together").) Fixes: 6a21cc50f0c7 ("seccomp: add a return code to trap to userspace") Reviewed-by: Tycho Andersen Signed-off-by: Jann Horn Signed-off-by: Kees Cook Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20201005014401.490175-1-jannh@google.com --- kernel/seccomp.c | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ae6b40cc39f4..8ad7a293255a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1476,13 +1476,7 @@ static const struct file_operations seccomp_notify_ops = { static struct file *init_listener(struct seccomp_filter *filter) { - struct file *ret = ERR_PTR(-EBUSY); - struct seccomp_filter *cur; - - for (cur = current->seccomp.filter; cur; cur = cur->prev) { - if (cur->notif) - goto out; - } + struct file *ret; ret = ERR_PTR(-ENOMEM); filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL); @@ -1508,6 +1502,31 @@ out: return ret; } +/* + * Does @new_child have a listener while an ancestor also has a listener? + * If so, we'll want to reject this filter. + * This only has to be tested for the current process, even in the TSYNC case, + * because TSYNC installs @child with the same parent on all threads. + * Note that @new_child is not hooked up to its parent at this point yet, so + * we use current->seccomp.filter. + */ +static bool has_duplicate_listener(struct seccomp_filter *new_child) +{ + struct seccomp_filter *cur; + + /* must be protected against concurrent TSYNC */ + lockdep_assert_held(¤t->sighand->siglock); + + if (!new_child->notif) + return false; + for (cur = current->seccomp.filter; cur; cur = cur->prev) { + if (cur->notif) + return true; + } + + return false; +} + /** * seccomp_set_mode_filter: internal function for setting seccomp filter * @flags: flags to change filter behavior @@ -1579,6 +1598,11 @@ static long seccomp_set_mode_filter(unsigned int flags, if (!seccomp_may_assign_mode(seccomp_mode)) goto out; + if (has_duplicate_listener(prepared)) { + ret = -EBUSY; + goto out; + } + ret = seccomp_attach_filter(flags, prepared); if (ret) goto out; -- cgit v1.2.3 From 6d6b8b9f4fceab7266ca03d194f60ec72bd4b654 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Thu, 27 Aug 2020 12:17:32 +0530 Subject: perf: Fix task_function_call() error handling The error handling introduced by commit: 2ed6edd33a21 ("perf: Add cond_resched() to task_function_call()") looses any return value from smp_call_function_single() that is not {0, -EINVAL}. This is a problem because it will return -EXNIO when the target CPU is offline. Worse, in that case it'll turn into an infinite loop. Fixes: 2ed6edd33a21 ("perf: Add cond_resched() to task_function_call()") Reported-by: Srikar Dronamraju Signed-off-by: Kajol Jain Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Barret Rhoden Tested-by: Srikar Dronamraju Link: https://lkml.kernel.org/r/20200827064732.20860-1-kjain@linux.ibm.com --- kernel/events/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7ed5248f0445..e8bf92202542 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -99,7 +99,7 @@ static void remote_function(void *data) * retry due to any failures in smp_call_function_single(), such as if the * task_cpu() goes offline concurrently. * - * returns @func return value or -ESRCH when the process isn't running + * returns @func return value or -ESRCH or -ENXIO when the process isn't running */ static int task_function_call(struct task_struct *p, remote_function_f func, void *info) @@ -115,7 +115,8 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) for (;;) { ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); - ret = !ret ? data.ret : -EAGAIN; + if (!ret) + ret = data.ret; if (ret != -EAGAIN) break; -- cgit v1.2.3 From 2bb8945bcc1a768f2bc402a16c9610bba8d5187d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Sep 2020 11:49:37 +0200 Subject: lockdep: Fix usage_traceoverflow Basically print_lock_class_header()'s for loop is out of sync with the the size of of ->usage_traces[]. Also clean things up a bit while at it, to avoid such mishaps in the future. Fixes: 23870f122768 ("locking/lockdep: Fix "USED" <- "IN-NMI" inversions") Reported-by: Qian Cai Debugged-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Qian Cai Link: https://lkml.kernel.org/r/20200930094937.GE2651@hirez.programming.kicks-ass.net --- kernel/locking/lockdep.c | 32 +++++++++++++++----------------- kernel/locking/lockdep_internals.h | 7 +++++-- 2 files changed, 20 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 2facbbd146ec..a430fbb01eb1 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -585,6 +585,8 @@ static const char *usage_str[] = #include "lockdep_states.h" #undef LOCKDEP_STATE [LOCK_USED] = "INITIAL USE", + [LOCK_USED_READ] = "INITIAL READ USE", + /* abused as string storage for verify_lock_unused() */ [LOCK_USAGE_STATES] = "IN-NMI", }; #endif @@ -1939,7 +1941,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) #endif printk(KERN_CONT " {\n"); - for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { + for (bit = 0; bit < LOCK_TRACE_STATES; bit++) { if (class->usage_mask & (1 << bit)) { int len = depth; @@ -3969,7 +3971,7 @@ static int separate_irq_context(struct task_struct *curr, static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { - unsigned int old_mask, new_mask, ret = 1; + unsigned int new_mask, ret = 1; if (new_bit >= LOCK_USAGE_STATES) { DEBUG_LOCKS_WARN_ON(1); @@ -3996,30 +3998,26 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, if (unlikely(hlock_class(this)->usage_mask & new_mask)) goto unlock; - old_mask = hlock_class(this)->usage_mask; hlock_class(this)->usage_mask |= new_mask; - /* - * Save one usage_traces[] entry and map both LOCK_USED and - * LOCK_USED_READ onto the same entry. - */ - if (new_bit == LOCK_USED || new_bit == LOCK_USED_READ) { - if (old_mask & (LOCKF_USED | LOCKF_USED_READ)) - goto unlock; - new_bit = LOCK_USED; + if (new_bit < LOCK_TRACE_STATES) { + if (!(hlock_class(this)->usage_traces[new_bit] = save_trace())) + return 0; } - if (!(hlock_class(this)->usage_traces[new_bit] = save_trace())) - return 0; - switch (new_bit) { + case 0 ... LOCK_USED-1: + ret = mark_lock_irq(curr, this, new_bit); + if (!ret) + return 0; + break; + case LOCK_USED: debug_atomic_dec(nr_unused_locks); break; + default: - ret = mark_lock_irq(curr, this, new_bit); - if (!ret) - return 0; + break; } unlock: diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index b0be1560ed17..de49f9e1c11b 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -20,9 +20,12 @@ enum lock_usage_bit { #undef LOCKDEP_STATE LOCK_USED, LOCK_USED_READ, - LOCK_USAGE_STATES + LOCK_USAGE_STATES, }; +/* states after LOCK_USED_READ are not traced and printed */ +static_assert(LOCK_TRACE_STATES == LOCK_USAGE_STATES); + #define LOCK_USAGE_READ_MASK 1 #define LOCK_USAGE_DIR_MASK 2 #define LOCK_USAGE_STATE_MASK (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK)) @@ -121,7 +124,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; -#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2) +#define LOCK_USAGE_CHARS (2*XXX_LOCK_USAGE_STATES + 1) extern void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]); -- cgit v1.2.3 From 4d004099a668c41522242aa146a38cc4eb59cb1e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 Oct 2020 11:04:21 +0200 Subject: lockdep: Fix lockdep recursion Steve reported that lockdep_assert*irq*(), when nested inside lockdep itself, will trigger a false-positive. One example is the stack-trace code, as called from inside lockdep, triggering tracing, which in turn calls RCU, which then uses lockdep_assert_irqs_disabled(). Fixes: a21ee6055c30 ("lockdep: Change hardirq{s_enabled,_context} to per-cpu variables") Reported-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 99 +++++++++++++++++++++++++++++------------------- 1 file changed, 59 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index a430fbb01eb1..85d15f0362dc 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -76,6 +76,23 @@ module_param(lock_stat, int, 0644); #define lock_stat 0 #endif +DEFINE_PER_CPU(unsigned int, lockdep_recursion); +EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion); + +static inline bool lockdep_enabled(void) +{ + if (!debug_locks) + return false; + + if (raw_cpu_read(lockdep_recursion)) + return false; + + if (current->lockdep_recursion) + return false; + + return true; +} + /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. @@ -93,7 +110,7 @@ static inline void lockdep_lock(void) arch_spin_lock(&__lock); __owner = current; - current->lockdep_recursion++; + __this_cpu_inc(lockdep_recursion); } static inline void lockdep_unlock(void) @@ -101,7 +118,7 @@ static inline void lockdep_unlock(void) if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current)) return; - current->lockdep_recursion--; + __this_cpu_dec(lockdep_recursion); __owner = NULL; arch_spin_unlock(&__lock); } @@ -393,10 +410,15 @@ void lockdep_init_task(struct task_struct *task) task->lockdep_recursion = 0; } +static __always_inline void lockdep_recursion_inc(void) +{ + __this_cpu_inc(lockdep_recursion); +} + static __always_inline void lockdep_recursion_finish(void) { - if (WARN_ON_ONCE((--current->lockdep_recursion) & LOCKDEP_RECURSION_MASK)) - current->lockdep_recursion = 0; + if (WARN_ON_ONCE(__this_cpu_dec_return(lockdep_recursion))) + __this_cpu_write(lockdep_recursion, 0); } void lockdep_set_selftest_task(struct task_struct *task) @@ -3659,7 +3681,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) if (unlikely(in_nmi())) return; - if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) + if (unlikely(__this_cpu_read(lockdep_recursion))) return; if (unlikely(lockdep_hardirqs_enabled())) { @@ -3695,7 +3717,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) current->hardirq_chain_key = current->curr_chain_key; - current->lockdep_recursion++; + lockdep_recursion_inc(); __trace_hardirqs_on_caller(); lockdep_recursion_finish(); } @@ -3728,7 +3750,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) goto skip_checks; } - if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) + if (unlikely(__this_cpu_read(lockdep_recursion))) return; if (lockdep_hardirqs_enabled()) { @@ -3781,7 +3803,7 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) if (in_nmi()) { if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI)) return; - } else if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK) + } else if (__this_cpu_read(lockdep_recursion)) return; /* @@ -3814,7 +3836,7 @@ void lockdep_softirqs_on(unsigned long ip) { struct irqtrace_events *trace = ¤t->irqtrace; - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; /* @@ -3829,7 +3851,7 @@ void lockdep_softirqs_on(unsigned long ip) return; } - current->lockdep_recursion++; + lockdep_recursion_inc(); /* * We'll do an OFF -> ON transition: */ @@ -3852,7 +3874,7 @@ void lockdep_softirqs_on(unsigned long ip) */ void lockdep_softirqs_off(unsigned long ip) { - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; /* @@ -4233,11 +4255,11 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, if (subclass) { unsigned long flags; - if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion)) + if (DEBUG_LOCKS_WARN_ON(!lockdep_enabled())) return; raw_local_irq_save(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); register_lock_class(lock, subclass, 1); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -4920,11 +4942,11 @@ void lock_set_class(struct lockdep_map *lock, const char *name, { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); check_flags(flags); if (__lock_set_class(lock, name, key, subclass, ip)) check_chain_key(current); @@ -4937,11 +4959,11 @@ void lock_downgrade(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); check_flags(flags); if (__lock_downgrade(lock, ip)) check_chain_key(current); @@ -4979,7 +5001,7 @@ static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock static bool lockdep_nmi(void) { - if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK) + if (raw_cpu_read(lockdep_recursion)) return false; if (!in_nmi()) @@ -5000,7 +5022,10 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); - if (unlikely(current->lockdep_recursion)) { + if (!debug_locks) + return; + + if (unlikely(!lockdep_enabled())) { /* XXX allow trylock from NMI ?!? */ if (lockdep_nmi() && !trylock) { struct held_lock hlock; @@ -5023,7 +5048,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_acquire(lock, subclass, trylock, read, check, irqs_disabled_flags(flags), nest_lock, ip, 0, 0); lockdep_recursion_finish(); @@ -5037,13 +5062,13 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) trace_lock_release(lock, ip); - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); if (__lock_release(lock, ip)) check_chain_key(current); lockdep_recursion_finish(); @@ -5056,13 +5081,13 @@ noinstr int lock_is_held_type(const struct lockdep_map *lock, int read) unsigned long flags; int ret = 0; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return 1; /* avoid false negative lockdep_assert_held() */ raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); ret = __lock_is_held(lock, read); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5077,13 +5102,13 @@ struct pin_cookie lock_pin_lock(struct lockdep_map *lock) struct pin_cookie cookie = NIL_COOKIE; unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return cookie; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); cookie = __lock_pin_lock(lock); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5096,13 +5121,13 @@ void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_repin_lock(lock, cookie); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5113,13 +5138,13 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_unpin_lock(lock, cookie); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5249,15 +5274,12 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) trace_lock_acquired(lock, ip); - if (unlikely(!lock_stat || !debug_locks)) - return; - - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lock_stat || !lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_contended(lock, ip); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5270,15 +5292,12 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip) trace_lock_contended(lock, ip); - if (unlikely(!lock_stat || !debug_locks)) - return; - - if (unlikely(current->lockdep_recursion)) + if (unlikely(!lock_stat || !lockdep_enabled())) return; raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion++; + lockdep_recursion_inc(); __lock_acquired(lock, ip); lockdep_recursion_finish(); raw_local_irq_restore(flags); -- cgit v1.2.3 From 75748837b7e56919679e02163f45d5818c644d03 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 8 Oct 2020 18:12:37 -0700 Subject: bpf: Propagate scalar ranges through register assignments. The llvm register allocator may use two different registers representing the same virtual register. In such case the following pattern can be observed: 1047: (bf) r9 = r6 1048: (a5) if r6 < 0x1000 goto pc+1 1050: ... 1051: (a5) if r9 < 0x2 goto pc+66 1052: ... 1053: (bf) r2 = r9 /* r2 needs to have upper and lower bounds */ This is normal behavior of greedy register allocator. The slides 137+ explain why regalloc introduces such register copy: http://llvm.org/devmtg/2018-04/slides/Yatsina-LLVM%20Greedy%20Register%20Allocator.pdf There is no way to tell llvm 'not to do this'. Hence the verifier has to recognize such patterns. In order to track this information without backtracking allocate ID for scalars in a similar way as it's done for find_good_pkt_pointers(). When the verifier encounters r9 = r6 assignment it will assign the same ID to both registers. Later if either register range is narrowed via conditional jump propagate the register state into the other register. Clear register ID in adjust_reg_min_max_vals() for any alu instruction. The register ID is ignored for scalars in regsafe() and doesn't affect state pruning. mark_reg_unknown() clears the ID. It's used to process call, endian and other instructions. Hence ID is explicitly cleared only in adjust_reg_min_max_vals() and in 32-bit mov. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20201009011240.48506-2-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 62b804651a48..ba96f7e9bbc0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6436,6 +6436,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, src_reg = NULL; if (dst_reg->type != SCALAR_VALUE) ptr_reg = dst_reg; + else + /* Make sure ID is cleared otherwise dst_reg min/max could be + * incorrectly propagated into other registers by find_equal_scalars() + */ + dst_reg->id = 0; if (BPF_SRC(insn->code) == BPF_X) { src_reg = ®s[insn->src_reg]; if (src_reg->type != SCALAR_VALUE) { @@ -6569,6 +6574,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) /* case: R1 = R2 * copy register state to dest reg */ + if (src_reg->type == SCALAR_VALUE && !src_reg->id) + /* Assign src and dst registers the same ID + * that will be used by find_equal_scalars() + * to propagate min/max range. + */ + src_reg->id = ++env->id_gen; *dst_reg = *src_reg; dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; @@ -6581,6 +6592,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EACCES; } else if (src_reg->type == SCALAR_VALUE) { *dst_reg = *src_reg; + /* Make sure ID is cleared otherwise + * dst_reg min/max could be incorrectly + * propagated into src_reg by find_equal_scalars() + */ + dst_reg->id = 0; dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = env->insn_idx + 1; } else { @@ -7369,6 +7385,30 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, return true; } +static void find_equal_scalars(struct bpf_verifier_state *vstate, + struct bpf_reg_state *known_reg) +{ + struct bpf_func_state *state; + struct bpf_reg_state *reg; + int i, j; + + for (i = 0; i <= vstate->curframe; i++) { + state = vstate->frame[i]; + for (j = 0; j < MAX_BPF_REG; j++) { + reg = &state->regs[j]; + if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) + *reg = *known_reg; + } + + bpf_for_each_spilled_reg(j, state, reg) { + if (!reg) + continue; + if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) + *reg = *known_reg; + } + } +} + static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { @@ -7497,6 +7537,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, reg_combine_min_max(&other_branch_regs[insn->src_reg], &other_branch_regs[insn->dst_reg], src_reg, dst_reg, opcode); + if (src_reg->id) { + find_equal_scalars(this_branch, src_reg); + find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]); + } + } } else if (dst_reg->type == SCALAR_VALUE) { reg_set_min_max(&other_branch_regs[insn->dst_reg], @@ -7504,6 +7549,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, opcode, is_jmp32); } + if (dst_reg->type == SCALAR_VALUE && dst_reg->id) { + find_equal_scalars(this_branch, dst_reg); + find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]); + } + /* detect if R == 0 where R is returned from bpf_map_lookup_elem(). * NOTE: these optimizations below are related with pointer comparison * which will never be JMP32. -- cgit v1.2.3 From 5689d49b71ad25409e41c6052dd9549fbeaf9bde Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 8 Oct 2020 18:12:38 -0700 Subject: bpf: Track spill/fill of bounded scalars. Under register pressure the llvm may spill registers with bounds into the stack. The verifier has to track them through spill/fill otherwise many kinds of bound errors will be seen. The spill/fill of induction variables was already happening. This patch extends this logic from tracking spill/fill of a constant into any bounded register. There is no need to track spill/fill of unbounded, since no new information will be retrieved from the stack during register fill. Though extra stack difference could cause state pruning to be less effective, no adverse affects were seen from this patch on selftests and on cilium programs. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20201009011240.48506-3-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ba96f7e9bbc0..f3e36eade3d4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2227,6 +2227,20 @@ static bool register_is_const(struct bpf_reg_state *reg) return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off); } +static bool __is_scalar_unbounded(struct bpf_reg_state *reg) +{ + return tnum_is_unknown(reg->var_off) && + reg->smin_value == S64_MIN && reg->smax_value == S64_MAX && + reg->umin_value == 0 && reg->umax_value == U64_MAX && + reg->s32_min_value == S32_MIN && reg->s32_max_value == S32_MAX && + reg->u32_min_value == 0 && reg->u32_max_value == U32_MAX; +} + +static bool register_is_bounded(struct bpf_reg_state *reg) +{ + return reg->type == SCALAR_VALUE && !__is_scalar_unbounded(reg); +} + static bool __is_pointer_value(bool allow_ptr_leaks, const struct bpf_reg_state *reg) { @@ -2278,7 +2292,7 @@ static int check_stack_write(struct bpf_verifier_env *env, if (value_regno >= 0) reg = &cur->regs[value_regno]; - if (reg && size == BPF_REG_SIZE && register_is_const(reg) && + if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) && !register_is_null(reg) && env->bpf_capable) { if (dst_reg != BPF_REG_FP) { /* The backtracking logic can only recognize explicit -- cgit v1.2.3 From 55567976629e58fde28fb70612ca73228271eef2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Oct 2020 10:10:20 +0100 Subject: genirq/irqdomain: Allow partial trimming of irq_data hierarchy It appears that some HW is ugly enough that not all the interrupts connected to a particular interrupt controller end up with the same hierarchy depth (some of them are terminated early). This leaves the irqchip hacker with only two choices, both equally bad: - create discrete domain chains, one for each "hierarchy depth", which is very hard to maintain - create fake hierarchy levels for the shallow paths, leading to all kind of problems (what are the safe hwirq values for these fake levels?) Implement the ability to cut short a single interrupt hierarchy from a level marked as being disconnected by using the new irq_domain_disconnect_hierarchy() helper. The irqdomain allocation code will then perform the trimming Signed-off-by: Marc Zyngier --- kernel/irq/irqdomain.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 95 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 76cd7ebd1178..cf8b374b892d 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1136,6 +1136,17 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain, return irq_data; } +static void __irq_domain_free_hierarchy(struct irq_data *irq_data) +{ + struct irq_data *tmp; + + while (irq_data) { + tmp = irq_data; + irq_data = irq_data->parent_data; + kfree(tmp); + } +} + static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs) { struct irq_data *irq_data, *tmp; @@ -1147,12 +1158,83 @@ static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs) irq_data->parent_data = NULL; irq_data->domain = NULL; - while (tmp) { - irq_data = tmp; - tmp = tmp->parent_data; - kfree(irq_data); + __irq_domain_free_hierarchy(tmp); + } +} + +/** + * irq_domain_disconnect_hierarchy - Mark the first unused level of a hierarchy + * @domain: IRQ domain from which the hierarchy is to be disconnected + * @virq: IRQ number where the hierarchy is to be trimmed + * + * Marks the @virq level belonging to @domain as disconnected. + * Returns -EINVAL if @virq doesn't have a valid irq_data pointing + * to @domain. + * + * Its only use is to be able to trim levels of hierarchy that do not + * have any real meaning for this interrupt, and that the driver marks + * as such from its .alloc() callback. + */ +int irq_domain_disconnect_hierarchy(struct irq_domain *domain, + unsigned int virq) +{ + struct irq_data *irqd; + + irqd = irq_domain_get_irq_data(domain, virq); + if (!irqd) + return -EINVAL; + + irqd->chip = ERR_PTR(-ENOTCONN); + return 0; +} + +static int irq_domain_trim_hierarchy(unsigned int virq) +{ + struct irq_data *tail, *irqd, *irq_data; + + irq_data = irq_get_irq_data(virq); + tail = NULL; + + /* The first entry must have a valid irqchip */ + if (!irq_data->chip || IS_ERR(irq_data->chip)) + return -EINVAL; + + /* + * Validate that the irq_data chain is sane in the presence of + * a hierarchy trimming marker. + */ + for (irqd = irq_data->parent_data; irqd; irq_data = irqd, irqd = irqd->parent_data) { + /* Can't have a valid irqchip after a trim marker */ + if (irqd->chip && tail) + return -EINVAL; + + /* Can't have an empty irqchip before a trim marker */ + if (!irqd->chip && !tail) + return -EINVAL; + + if (IS_ERR(irqd->chip)) { + /* Only -ENOTCONN is a valid trim marker */ + if (PTR_ERR(irqd->chip) != -ENOTCONN) + return -EINVAL; + + tail = irq_data; } } + + /* No trim marker, nothing to do */ + if (!tail) + return 0; + + pr_info("IRQ%d: trimming hierarchy from %s\n", + virq, tail->parent_data->domain->name); + + /* Sever the inner part of the hierarchy... */ + irqd = tail; + tail = tail->parent_data; + irqd->parent_data = NULL; + __irq_domain_free_hierarchy(tail); + + return 0; } static int irq_domain_alloc_irq_data(struct irq_domain *domain, @@ -1362,6 +1444,15 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, mutex_unlock(&irq_domain_mutex); goto out_free_irq_data; } + + for (i = 0; i < nr_irqs; i++) { + ret = irq_domain_trim_hierarchy(virq + i); + if (ret) { + mutex_unlock(&irq_domain_mutex); + goto out_free_irq_data; + } + } + for (i = 0; i < nr_irqs; i++) irq_domain_insert_irq(virq + i); mutex_unlock(&irq_domain_mutex); -- cgit v1.2.3 From 4a8f87e60f6db40e640f1db555d063b2c4dea5f1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:03 +0200 Subject: bpf: Allow for map-in-map with dynamic inner array map entries Recent work in f4d05259213f ("bpf: Add map_meta_equal map ops") and 134fede4eecf ("bpf: Relax max_entries check for most of the inner map types") added support for dynamic inner max elements for most map-in-map types. Exceptions were maps like array or prog array where the map_gen_lookup() callback uses the maps' max_entries field as a constant when emitting instructions. We recently implemented Maglev consistent hashing into Cilium's load balancer which uses map-in-map with an outer map being hash and inner being array holding the Maglev backend table for each service. This has been designed this way in order to reduce overall memory consumption given the outer hash map allows to avoid preallocating a large, flat memory area for all services. Also, the number of service mappings is not always known a-priori. The use case for dynamic inner array map entries is to further reduce memory overhead, for example, some services might just have a small number of back ends while others could have a large number. Right now the Maglev backend table for small and large number of backends would need to have the same inner array map entries which adds a lot of unneeded overhead. Dynamic inner array map entries can be realized by avoiding the inlined code generation for their lookup. The lookup will still be efficient since it will be calling into array_map_lookup_elem() directly and thus avoiding retpoline. The patch adds a BPF_F_INNER_MAP flag to map creation which therefore skips inline code generation and relaxes array_map_meta_equal() check to ignore both maps' max_entries. This also still allows to have faster lookups for map-in-map when BPF_F_INNER_MAP is not specified and hence dynamic max_entries not needed. Example code generation where inner map is dynamic sized array: # bpftool p d x i 125 int handle__sys_enter(void * ctx): ; int handle__sys_enter(void *ctx) 0: (b4) w1 = 0 ; int key = 0; 1: (63) *(u32 *)(r10 -4) = r1 2: (bf) r2 = r10 ; 3: (07) r2 += -4 ; inner_map = bpf_map_lookup_elem(&outer_arr_dyn, &key); 4: (18) r1 = map[id:468] 6: (07) r1 += 272 7: (61) r0 = *(u32 *)(r2 +0) 8: (35) if r0 >= 0x3 goto pc+5 9: (67) r0 <<= 3 10: (0f) r0 += r1 11: (79) r0 = *(u64 *)(r0 +0) 12: (15) if r0 == 0x0 goto pc+1 13: (05) goto pc+1 14: (b7) r0 = 0 15: (b4) w6 = -1 ; if (!inner_map) 16: (15) if r0 == 0x0 goto pc+6 17: (bf) r2 = r10 ; 18: (07) r2 += -4 ; val = bpf_map_lookup_elem(inner_map, &key); 19: (bf) r1 = r0 | No inlining but instead 20: (85) call array_map_lookup_elem#149280 | call to array_map_lookup_elem() ; return val ? *val : -1; | for inner array lookup. 21: (15) if r0 == 0x0 goto pc+1 ; return val ? *val : -1; 22: (61) r6 = *(u32 *)(r0 +0) ; } 23: (bc) w0 = w6 24: (95) exit Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201010234006.7075-4-daniel@iogearbox.net --- kernel/bpf/arraymap.c | 17 +++++++++++------ kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/verifier.c | 6 ++++-- 3 files changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index bd777dd6f967..c6c81eceb68f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -16,7 +16,7 @@ #define ARRAY_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \ - BPF_F_PRESERVE_ELEMS) + BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -62,7 +62,7 @@ int array_map_alloc_check(union bpf_attr *attr) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_ARRAY && - attr->map_flags & BPF_F_MMAPABLE) + attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP)) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY && @@ -214,7 +214,7 @@ static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ -static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; @@ -223,6 +223,9 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; + if (map->map_flags & BPF_F_INNER_MAP) + return -EOPNOTSUPP; + *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); if (!map->bypass_spec_v1) { @@ -496,8 +499,10 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) static bool array_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1) { - return meta0->max_entries == meta1->max_entries && - bpf_map_meta_equal(meta0, meta1); + if (!bpf_map_meta_equal(meta0, meta1)) + return false; + return meta0->map_flags & BPF_F_INNER_MAP ? true : + meta0->max_entries == meta1->max_entries; } struct bpf_iter_seq_array_map_info { @@ -1251,7 +1256,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) return READ_ONCE(*inner_map); } -static u32 array_of_map_gen_lookup(struct bpf_map *map, +static int array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3395cf140d22..1815e97d4c9c 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -612,7 +612,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) * bpf_prog * __htab_map_lookup_elem */ -static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; @@ -651,7 +651,7 @@ static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) return __htab_lru_map_lookup_elem(map, key, false); } -static u32 htab_lru_map_gen_lookup(struct bpf_map *map, +static int htab_lru_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; @@ -2070,7 +2070,7 @@ static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) return READ_ONCE(*inner_map); } -static u32 htab_of_map_gen_lookup(struct bpf_map *map, +static int htab_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f3e36eade3d4..fa5badc9279a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11049,7 +11049,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) if (insn->imm == BPF_FUNC_map_lookup_elem && ops->map_gen_lookup) { cnt = ops->map_gen_lookup(map_ptr, insn_buf); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + if (cnt == -EOPNOTSUPP) + goto patch_map_ops_generic; + if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -11079,7 +11081,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) (int (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_peek_elem, (int (*)(struct bpf_map *map, void *value))NULL)); - +patch_map_ops_generic: switch (insn->imm) { case BPF_FUNC_map_lookup_elem: insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) - -- cgit v1.2.3 From f91072ed1b7283b13ca57fcfbece5a3b92726143 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 16 Sep 2020 13:53:11 +0200 Subject: perf/core: Fix race in the perf_mmap_close() function There's a possible race in perf_mmap_close() when checking ring buffer's mmap_count refcount value. The problem is that the mmap_count check is not atomic because we call atomic_dec() and atomic_read() separately. perf_mmap_close: ... atomic_dec(&rb->mmap_count); ... if (atomic_read(&rb->mmap_count)) goto out_put; free_uid out_put: ring_buffer_put(rb); /* could be last */ The race can happen when we have two (or more) events sharing same ring buffer and they go through atomic_dec() and then they both see 0 as refcount value later in atomic_read(). Then both will go on and execute code which is meant to be run just once. The code that detaches ring buffer is probably fine to be executed more than once, but the problem is in calling free_uid(), which will later on demonstrate in related crashes and refcount warnings, like: refcount_t: addition on 0; use-after-free. ... RIP: 0010:refcount_warn_saturate+0x6d/0xf ... Call Trace: prepare_creds+0x190/0x1e0 copy_creds+0x35/0x172 copy_process+0x471/0x1a80 _do_fork+0x83/0x3a0 __do_sys_wait4+0x83/0x90 __do_sys_clone+0x85/0xa0 do_syscall_64+0x5b/0x1e0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Using atomic decrease and check instead of separated calls. Tested-by: Michael Petlan Signed-off-by: Jiri Olsa Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Namhyung Kim Acked-by: Wade Mealing Fixes: 9bb5d40cd93c ("perf: Fix mmap() accounting hole"); Link: https://lore.kernel.org/r/20200916115311.GE2301783@krava --- kernel/events/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 45edb85344a1..fb662eb4fb69 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5895,11 +5895,11 @@ static void perf_pmu_output_stop(struct perf_event *event); static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); + bool detach_rest = false; if (event->pmu->event_unmapped) event->pmu->event_unmapped(event, vma->vm_mm); @@ -5930,7 +5930,8 @@ static void perf_mmap_close(struct vm_area_struct *vma) mutex_unlock(&event->mmap_mutex); } - atomic_dec(&rb->mmap_count); + if (atomic_dec_and_test(&rb->mmap_count)) + detach_rest = true; if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) goto out_put; @@ -5939,7 +5940,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) mutex_unlock(&event->mmap_mutex); /* If there's still other mmap()s of this buffer, we're done. */ - if (atomic_read(&rb->mmap_count)) + if (!detach_rest) goto out_put; /* -- cgit v1.2.3 From fdf09ab887829cd1b671e45d9549f8ec1ffda0fa Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Thu, 8 Oct 2020 13:32:20 -0400 Subject: module: statically initialize init section freeing data Corentin hit the following workqueue warning when running with CRYPTO_MANAGER_EXTRA_TESTS: WARNING: CPU: 2 PID: 147 at kernel/workqueue.c:1473 __queue_work+0x3b8/0x3d0 Modules linked in: ghash_generic CPU: 2 PID: 147 Comm: modprobe Not tainted 5.6.0-rc1-next-20200214-00068-g166c9264f0b1-dirty #545 Hardware name: Pine H64 model A (DT) pc : __queue_work+0x3b8/0x3d0 Call trace: __queue_work+0x3b8/0x3d0 queue_work_on+0x6c/0x90 do_init_module+0x188/0x1f0 load_module+0x1d00/0x22b0 I wasn't able to reproduce on x86 or rpi 3b+. This is WARN_ON(!list_empty(&work->entry)) from __queue_work(), and it happens because the init_free_wq work item isn't initialized in time for a crypto test that requests the gcm module. Some crypto tests were recently moved earlier in boot as explained in commit c4741b230597 ("crypto: run initcalls for generic implementations earlier"), which went into mainline less than two weeks before the Fixes commit. Avoid the warning by statically initializing init_free_wq and the corresponding llist. Link: https://lore.kernel.org/lkml/20200217204803.GA13479@Red/ Fixes: 1a7b7d922081 ("modules: Use vmalloc special flag") Reported-by: Corentin Labbe Tested-by: Corentin Labbe Tested-on: sun50i-h6-pine-h64 Tested-on: imx8mn-ddr4-evk Tested-on: sun50i-a64-bananapi-m64 Reviewed-by: Eric Biggers Signed-off-by: Daniel Jordan Signed-off-by: Jessica Yu --- kernel/module.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 2c00059ac1c9..7ff2dd301bd1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -91,8 +91,9 @@ EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); /* Work queue for freeing init sections in success case */ -static struct work_struct init_free_wq; -static struct llist_head init_free_list; +static void do_free_init(struct work_struct *w); +static DECLARE_WORK(init_free_wq, do_free_init); +static LLIST_HEAD(init_free_list); #ifdef CONFIG_MODULES_TREE_LOOKUP @@ -3582,14 +3583,6 @@ static void do_free_init(struct work_struct *w) } } -static int __init modules_wq_init(void) -{ - INIT_WORK(&init_free_wq, do_free_init); - init_llist_head(&init_free_list); - return 0; -} -module_init(modules_wq_init); - /* * This is where the real work happens. * -- cgit v1.2.3 From 111767c1d86bd9661f8b72ace50cbcb13507a1bf Mon Sep 17 00:00:00 2001 From: Thomas Cedeno Date: Thu, 16 Jul 2020 19:13:57 +0000 Subject: LSM: Signal to SafeSetID when setting group IDs For SafeSetID to properly gate set*gid() calls, it needs to know whether ns_capable() is being called from within a sys_set*gid() function or is being called from elsewhere in the kernel. This allows SafeSetID to deny CAP_SETGID to restricted groups when they are attempting to use the capability for code paths other than updating GIDs (e.g. setting up userns GID mappings). This is the identical approach to what is currently done for CAP_SETUID. NOTE: We also add signaling to SafeSetID from the setgroups() syscall, as we have future plans to restrict a process' ability to set supplementary groups in addition to what is added in this series for restricting setting of the primary group. Signed-off-by: Thomas Cedeno Signed-off-by: Micah Morton --- kernel/capability.c | 2 +- kernel/groups.c | 2 +- kernel/sys.c | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 7c59b096c98a..de7eac903a2a 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -418,7 +418,7 @@ EXPORT_SYMBOL(ns_capable_noaudit); /** * ns_capable_setid - Determine if the current task has a superior capability * in effect, while signalling that this check is being done from within a - * setid syscall. + * setid or setgroups syscall. * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * diff --git a/kernel/groups.c b/kernel/groups.c index 6ee6691f6839..fe7e6385530e 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -178,7 +178,7 @@ bool may_setgroups(void) { struct user_namespace *user_ns = current_user_ns(); - return ns_capable(user_ns, CAP_SETGID) && + return ns_capable_setid(user_ns, CAP_SETGID) && userns_may_setgroups(user_ns); } diff --git a/kernel/sys.c b/kernel/sys.c index ab6c409b1159..ad80f9767f27 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -373,7 +373,7 @@ long __sys_setregid(gid_t rgid, gid_t egid) if (rgid != (gid_t) -1) { if (gid_eq(old->gid, krgid) || gid_eq(old->egid, krgid) || - ns_capable(old->user_ns, CAP_SETGID)) + ns_capable_setid(old->user_ns, CAP_SETGID)) new->gid = krgid; else goto error; @@ -382,7 +382,7 @@ long __sys_setregid(gid_t rgid, gid_t egid) if (gid_eq(old->gid, kegid) || gid_eq(old->egid, kegid) || gid_eq(old->sgid, kegid) || - ns_capable(old->user_ns, CAP_SETGID)) + ns_capable_setid(old->user_ns, CAP_SETGID)) new->egid = kegid; else goto error; @@ -432,7 +432,7 @@ long __sys_setgid(gid_t gid) old = current_cred(); retval = -EPERM; - if (ns_capable(old->user_ns, CAP_SETGID)) + if (ns_capable_setid(old->user_ns, CAP_SETGID)) new->gid = new->egid = new->sgid = new->fsgid = kgid; else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) new->egid = new->fsgid = kgid; @@ -744,7 +744,7 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) old = current_cred(); retval = -EPERM; - if (!ns_capable(old->user_ns, CAP_SETGID)) { + if (!ns_capable_setid(old->user_ns, CAP_SETGID)) { if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) goto error; @@ -871,7 +871,7 @@ long __sys_setfsgid(gid_t gid) if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || - ns_capable(old->user_ns, CAP_SETGID)) { + ns_capable_setid(old->user_ns, CAP_SETGID)) { if (!gid_eq(kgid, old->fsgid)) { new->fsgid = kgid; if (security_task_fix_setgid(new,old,LSM_SETID_FS) == 0) -- cgit v1.2.3 From 73fb952d83717697910d981e27fe2c252f64662b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2020 16:49:18 -0700 Subject: resource: report parent to walk_iomem_res_desc() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In support of detecting whether a resource might have been been claimed, report the parent to the walk_iomem_res_desc() callback. For example, the ACPI HMAT parser publishes "hmem" platform devices per target range. However, if the HMAT is disabled / missing a fallback driver can attach devices to the raw memory ranges as a fallback if it sees unclaimed / orphan "Soft Reserved" resources in the resource tree. Otherwise, find_next_iomem_res() returns a resource with garbage data from the stack allocation in __walk_iomem_res_desc() for the res->parent field. There are currently no users that expect ->child and ->sibling to be valid, and the resource_lock would be needed to traverse them. Use a compound literal to implicitly zero initialize the fields that are not being returned in addition to setting ->parent. Signed-off-by: Dan Williams Signed-off-by: Andrew Morton Cc: Jason Gunthorpe Cc: Dave Hansen Cc: Wei Yang Cc: Tom Lendacky Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Benjamin Herrenschmidt Cc: Ben Skeggs Cc: Borislav Petkov Cc: Brice Goglin Cc: Catalin Marinas Cc: Daniel Vetter Cc: Dave Jiang Cc: David Airlie Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Ira Weiny Cc: Jeff Moyer Cc: Jia He Cc: Joao Martins Cc: Jonathan Cameron Cc: Michael Ellerman Cc: Mike Rapoport Cc: Paul Mackerras Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Vishal Verma Cc: Will Deacon Cc: Ard Biesheuvel Cc: Bjorn Helgaas Cc: Boris Ostrovsky Cc: Hulk Robot Cc: Jason Yan Cc: "Jérôme Glisse" Cc: Juergen Gross Cc: kernel test robot Cc: Randy Dunlap Cc: Stefano Stabellini Cc: Vivek Goyal Link: https://lkml.kernel.org/r/159643097166.4062302.11875688887228572793.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Linus Torvalds --- kernel/resource.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 841737bbda9e..f1175ce93a1d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -382,10 +382,13 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, if (p) { /* copy data */ - res->start = max(start, p->start); - res->end = min(end, p->end); - res->flags = p->flags; - res->desc = p->desc; + *res = (struct resource) { + .start = max(start, p->start), + .end = min(end, p->end), + .flags = p->flags, + .desc = p->desc, + .parent = p->parent, + }; } read_unlock(&resource_lock); -- cgit v1.2.3 From cf508b58457cfe51b168d49ea6c79a221900d354 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 13 Oct 2020 16:54:10 -0700 Subject: mm: use helper function mapping_allow_writable() Commit 4bb5f5d9395b ("mm: allow drivers to prevent new writable mappings") changed i_mmap_writable from unsigned int to atomic_t and add the helper function mapping_allow_writable() to atomic_inc i_mmap_writable. But it forgot to use this helper function in dup_mmap() and __vma_link_file(). Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Cc: Christian Brauner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: "Eric W. Biederman" Cc: Christian Kellner Cc: Suren Baghdasaryan Cc: Adrian Reber Cc: Shakeel Butt Cc: Aleksa Sarai Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20200917112736.7789-1-linmiaohe@huawei.com Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a3795aaaab5c..2dcb19a30650 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -559,7 +559,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, atomic_dec(&inode->i_writecount); i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) - atomic_inc(&mapping->i_mmap_writable); + mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_interval_tree_insert_after(tmp, mpnt, -- cgit v1.2.3 From c78f463649d60f4ed12df97a32def4d77b583853 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 13 Oct 2020 16:54:21 -0700 Subject: mm: remove src/dst mm parameter in copy_page_range() Both of the mm pointers are not needed after commit 7a4830c380f3 ("mm/fork: Pass new vma pointer into copy_page_range()"). Jason Gunthorpe also reported that the ordering of copy_page_range() is odd. Since working at it, reorder the parameters to be logical, by (1) always put the dst_* fields to be before src_* fields, and (2) keep the same type of parameters together. [peterx@redhat.com: further reorder some parameters and line format, per Jason] Link: https://lkml.kernel.org/r/20201002192647.7161-1-peterx@redhat.com [peterx@redhat.com: fix warnings] Link: https://lkml.kernel.org/r/20201006200138.GA6026@xz-x1 Reported-by: Kirill A. Shutemov Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Reviewed-by: Jason Gunthorpe Acked-by: Kirill A. Shutemov Link: https://lkml.kernel.org/r/20200930204950.6668-1-peterx@redhat.com Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 2dcb19a30650..ede26e5a6097 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -590,7 +590,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt, tmp); + retval = copy_page_range(tmp, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); -- cgit v1.2.3 From e9aa36ccbb4e845d043bb51de274ae877ae925a7 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 13 Oct 2020 16:57:22 -0700 Subject: dma-contiguous: simplify cma_early_percent_memory() The memory size calculation in cma_early_percent_memory() traverses memblock.memory rather than simply call memblock_phys_mem_size(). The comment in that function suggests that at some point there should have been call to memblock_analyze() before memblock_phys_mem_size() could be used. As of now, there is no memblock_analyze() at all and memblock_phys_mem_size() can be used as soon as cold-plug memory is registered with memblock. Replace loop over memblock.memory with a call to memblock_phys_mem_size(). Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Reviewed-by: Baoquan He Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Daniel Axtens Cc: Dave Hansen Cc: Emil Renner Berthing Cc: Hari Bathini Cc: Ingo Molnar Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Marek Szyprowski Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Miguel Ojeda Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Link: https://lkml.kernel.org/r/20200818151634.14343-3-rppt@kernel.org Signed-off-by: Linus Torvalds --- kernel/dma/contiguous.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index cff7e60968b9..0369fd5fda8f 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -73,16 +73,7 @@ early_param("cma", early_cma); static phys_addr_t __init __maybe_unused cma_early_percent_memory(void) { - struct memblock_region *reg; - unsigned long total_pages = 0; - - /* - * We cannot use memblock_phys_mem_size() here, because - * memblock_analyze() has not been called yet. - */ - for_each_memblock(memory, reg) - total_pages += memblock_region_memory_end_pfn(reg) - - memblock_region_memory_base_pfn(reg); + unsigned long total_pages = PHYS_PFN(memblock_phys_mem_size()); return (total_pages * CONFIG_CMA_SIZE_PERCENTAGE / 100) << PAGE_SHIFT; } -- cgit v1.2.3 From 67197a4f28d28d0b073ab0427b03cb2ee5382578 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 13 Oct 2020 16:58:35 -0700 Subject: mm, oom_adj: don't loop through tasks in __set_oom_adj when not necessary Currently __set_oom_adj loops through all processes in the system to keep oom_score_adj and oom_score_adj_min in sync between processes sharing their mm. This is done for any task with more that one mm_users, which includes processes with multiple threads (sharing mm and signals). However for such processes the loop is unnecessary because their signal structure is shared as well. Android updates oom_score_adj whenever a tasks changes its role (background/foreground/...) or binds to/unbinds from a service, making it more/less important. Such operation can happen frequently. We noticed that updates to oom_score_adj became more expensive and after further investigation found out that the patch mentioned in "Fixes" introduced a regression. Using Pixel 4 with a typical Android workload, write time to oom_score_adj increased from ~3.57us to ~362us. Moreover this regression linearly depends on the number of multi-threaded processes running on the system. Mark the mm with a new MMF_MULTIPROCESS flag bit when task is created with (CLONE_VM && !CLONE_THREAD && !CLONE_VFORK). Change __set_oom_adj to use MMF_MULTIPROCESS instead of mm_users to decide whether oom_score_adj update should be synchronized between multiple processes. To prevent races between clone() and __set_oom_adj(), when oom_score_adj of the process being cloned might be modified from userspace, we use oom_adj_mutex. Its scope is changed to global. The combination of (CLONE_VM && !CLONE_THREAD) is rarely used except for the case of vfork(). To prevent performance regressions of vfork(), we skip taking oom_adj_mutex and setting MMF_MULTIPROCESS when CLONE_VFORK is specified. Clearing the MMF_MULTIPROCESS flag (when the last process sharing the mm exits) is left out of this patch to keep it simple and because it is believed that this threading model is rare. Should there ever be a need for optimizing that case as well, it can be done by hooking into the exit path, likely following the mm_update_next_owner pattern. With the combination of (CLONE_VM && !CLONE_THREAD && !CLONE_VFORK) being quite rare, the regression is gone after the change is applied. [surenb@google.com: v3] Link: https://lkml.kernel.org/r/20200902012558.2335613-1-surenb@google.com Fixes: 44a70adec910 ("mm, oom_adj: make sure processes sharing mm have same view of oom_score_adj") Reported-by: Tim Murray Suggested-by: Michal Hocko Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton Acked-by: Christian Brauner Acked-by: Michal Hocko Acked-by: Oleg Nesterov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Eugene Syromiatnikov Cc: Christian Kellner Cc: Adrian Reber Cc: Shakeel Butt Cc: Aleksa Sarai Cc: Alexey Dobriyan Cc: "Eric W. Biederman" Cc: Alexey Gladkov Cc: Michel Lespinasse Cc: Daniel Jordan Cc: Andrei Vagin Cc: Bernd Edlinger Cc: John Johansen Cc: Yafang Shao Link: https://lkml.kernel.org/r/20200824153036.3201505-1-surenb@google.com Debugged-by: Minchan Kim Signed-off-by: Linus Torvalds --- kernel/fork.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ede26e5a6097..50c90d368117 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1812,6 +1812,25 @@ static __always_inline void delayed_free_task(struct task_struct *tsk) free_task(tsk); } +static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) +{ + /* Skip if kernel thread */ + if (!tsk->mm) + return; + + /* Skip if spawning a thread or using vfork */ + if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM) + return; + + /* We need to synchronize with __set_oom_adj */ + mutex_lock(&oom_adj_mutex); + set_bit(MMF_MULTIPROCESS, &tsk->mm->flags); + /* Update the values in case they were changed after copy_signal */ + tsk->signal->oom_score_adj = current->signal->oom_score_adj; + tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min; + mutex_unlock(&oom_adj_mutex); +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -2288,6 +2307,8 @@ static __latent_entropy struct task_struct *copy_process( trace_task_newtask(p, clone_flags); uprobe_copy_process(p, clone_flags); + copy_oom_score_adj(clone_flags, p); + return p; bad_fork_cancel_cgroup: -- cgit v1.2.3 From eba9f08293d76370049ec85581ab3d7f6d069e3e Mon Sep 17 00:00:00 2001 From: zhuguangqing Date: Wed, 14 Oct 2020 22:02:20 +0800 Subject: sched: Replace zero-length array with flexible-array In the following commit: 04f5c362ec6d: ("sched/fair: Replace zero-length array with flexible-array") a zero-length array cpumask[0] has been replaced with cpumask[]. But there is still a cpumask[0] in 'struct sched_group_capacity' which was missed. The point of using [] instead of [0] is that with [] the compiler will generate a build warning if it isn't the last member of a struct. [ mingo: Rewrote the changelog. ] Signed-off-by: zhuguangqing Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20201014140220.11384-1-zhuguangqing83@gmail.com --- kernel/sched/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 28709f6b0975..648f02363ff9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1471,7 +1471,7 @@ struct sched_group_capacity { int id; #endif - unsigned long cpumask[0]; /* Balance mask */ + unsigned long cpumask[]; /* Balance mask */ }; struct sched_group { -- cgit v1.2.3 From a73f863af4ce9730795eab7097fb2102e6854365 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 13 Oct 2020 07:31:14 +0200 Subject: sched/features: Fix !CONFIG_JUMP_LABEL case Commit: 765cc3a4b224e ("sched/core: Optimize sched_feat() for !CONFIG_SCHED_DEBUG builds") made sched features static for !CONFIG_SCHED_DEBUG configurations, but overlooked the CONFIG_SCHED_DEBUG=y and !CONFIG_JUMP_LABEL cases. For the latter echoing changes to /sys/kernel/debug/sched_features has the nasty effect of effectively changing what sched_features reports, but without actually changing the scheduler behaviour (since different translation units get different sysctl_sched_features). Fix CONFIG_SCHED_DEBUG=y and !CONFIG_JUMP_LABEL configurations by properly restructuring ifdefs. Fixes: 765cc3a4b224e ("sched/core: Optimize sched_feat() for !CONFIG_SCHED_DEBUG builds") Co-developed-by: Daniel Bristot de Oliveira Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Juri Lelli Signed-off-by: Ingo Molnar Acked-by: Patrick Bellasi Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/20201013053114.160628-1-juri.lelli@redhat.com --- kernel/sched/core.c | 2 +- kernel/sched/sched.h | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8160ab5263f8..d2003a7d5ab5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -44,7 +44,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) +#ifdef CONFIG_SCHED_DEBUG /* * Debugging: various feature bits * diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 648f02363ff9..df80bfcea92e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1629,7 +1629,7 @@ enum { #undef SCHED_FEAT -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) +#ifdef CONFIG_SCHED_DEBUG /* * To support run-time toggling of sched features, all the translation units @@ -1637,6 +1637,7 @@ enum { */ extern const_debug unsigned int sysctl_sched_features; +#ifdef CONFIG_JUMP_LABEL #define SCHED_FEAT(name, enabled) \ static __always_inline bool static_branch_##name(struct static_key *key) \ { \ @@ -1649,7 +1650,13 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) -#else /* !(SCHED_DEBUG && CONFIG_JUMP_LABEL) */ +#else /* !CONFIG_JUMP_LABEL */ + +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) + +#endif /* CONFIG_JUMP_LABEL */ + +#else /* !SCHED_DEBUG */ /* * Each translation unit has its own copy of sysctl_sched_features to allow @@ -1665,7 +1672,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features = #define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) -#endif /* SCHED_DEBUG && CONFIG_JUMP_LABEL */ +#endif /* SCHED_DEBUG */ extern struct static_key_false sched_numa_balancing; extern struct static_key_false sched_schedstats; -- cgit v1.2.3 From 72a2fbda53d057081d0bca2db221995435fb0d1e Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 10 Sep 2020 10:20:34 +0200 Subject: rcu/tree: docs: document bkvcache new members at struct kfree_rcu_cpu Changeset 53c72b590b3a ("rcu/tree: cache specified number of objects") added new members for struct kfree_rcu_cpu, but didn't add the corresponding at the kernel-doc markup, as repoted when doing "make htmldocs": ./kernel/rcu/tree.c:3113: warning: Function parameter or member 'bkvcache' not described in 'kfree_rcu_cpu' ./kernel/rcu/tree.c:3113: warning: Function parameter or member 'nr_bkv_objs' not described in 'kfree_rcu_cpu' So, move the description for bkvcache to kernel-doc, and add a description for nr_bkv_objs. Fixes: 53c72b590b3a ("rcu/tree: cache specified number of objects") Acked-by: Paul E. McKenney Signed-off-by: Mauro Carvalho Chehab --- kernel/rcu/tree.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f78ee759af9c..03c54c3478b7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3022,6 +3022,12 @@ struct kfree_rcu_cpu_work { * @monitor_todo: Tracks whether a @monitor_work delayed work is pending * @initialized: The @rcu_work fields have been initialized * @count: Number of objects for which GP not started + * @bkvcache: + * A simple cache list that contains objects for reuse purpose. + * In order to save some per-cpu space the list is singular. + * Even though it is lockless an access has to be protected by the + * per-cpu lock. + * @nr_bkv_objs: number of allocated objects at @bkvcache. * * This is a per-CPU structure. The reason that it is not included in * the rcu_data structure is to permit this code to be extracted from @@ -3037,14 +3043,6 @@ struct kfree_rcu_cpu { bool monitor_todo; bool initialized; int count; - - /* - * A simple cache list that contains objects for - * reuse purpose. In order to save some per-cpu - * space the list is singular. Even though it is - * lockless an access has to be protected by the - * per-cpu lock. - */ struct llist_head bkvcache; int nr_bkv_objs; }; -- cgit v1.2.3 From eac48eb6ce10c1dc6fd3366608f4d3ca2430166c Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 14 Oct 2020 19:50:51 +0200 Subject: printk: ringbuffer: Wrong data pointer when appending small string data_realloc() returns wrong data pointer when the block is wrapped and the size is not increased. It might happen when pr_cont() wants to add only few characters and there is already a space for them because of alignment. It might cause writing outsite the buffer. It has been detected by LTP tests with KASAN enabled: [ 221.921944] oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),cpuset=c,mems_allowed=0,oom_memcg=/0,task_memcg=in [ 221.922108] ================================================================== [ 221.922111] BUG: KASAN: global-out-of-bounds in vprintk_store+0x362/0x3d0 [ 221.922112] Write of size 2 at addr ffffffffba51dbcd by task memcg_test_1/11282 [ 221.922113] [ 221.922114] CPU: 1 PID: 11282 Comm: memcg_test_1 Not tainted 5.9.0-next-20201013 #1 [ 221.922116] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS 2.0b 07/27/2017 [ 221.922116] Call Trace: [ 221.922117] dump_stack+0xa4/0xd9 [ 221.922118] print_address_description.constprop.0+0x21/0x210 [ 221.922119] ? _raw_write_lock_bh+0xe0/0xe0 [ 221.922120] ? vprintk_store+0x362/0x3d0 [ 221.922121] kasan_report.cold+0x37/0x7c [ 221.922122] ? vprintk_store+0x362/0x3d0 [ 221.922123] check_memory_region+0x18c/0x1f0 [ 221.922124] memcpy+0x3c/0x60 [ 221.922125] vprintk_store+0x362/0x3d0 [ 221.922125] ? __ia32_sys_syslog+0x50/0x50 [ 221.922126] ? _raw_spin_lock_irqsave+0x9b/0x100 [ 221.922127] ? _raw_spin_lock_irq+0xf0/0xf0 [ 221.922128] ? __kasan_check_write+0x14/0x20 [ 221.922129] vprintk_emit+0x8d/0x1f0 [ 221.922130] vprintk_default+0x1d/0x20 [ 221.922131] vprintk_func+0x5a/0x100 [ 221.922132] printk+0xb2/0xe3 [ 221.922133] ? swsusp_write.cold+0x189/0x189 [ 221.922134] ? kernfs_vfs_xattr_set+0x60/0x60 [ 221.922134] ? _raw_write_lock_bh+0xe0/0xe0 [ 221.922135] ? trace_hardirqs_on+0x38/0x100 [ 221.922136] pr_cont_kernfs_path.cold+0x49/0x4b [ 221.922137] mem_cgroup_print_oom_context.cold+0x74/0xc3 [ 221.922138] dump_header+0x340/0x3bf [ 221.922139] oom_kill_process.cold+0xb/0x10 [ 221.922140] out_of_memory+0x1e9/0x860 [ 221.922141] ? oom_killer_disable+0x210/0x210 [ 221.922142] mem_cgroup_out_of_memory+0x198/0x1c0 [ 221.922143] ? mem_cgroup_count_precharge_pte_range+0x250/0x250 [ 221.922144] try_charge+0xa9b/0xc50 [ 221.922145] ? arch_stack_walk+0x9e/0xf0 [ 221.922146] ? memory_high_write+0x230/0x230 [ 221.922146] ? avc_has_extended_perms+0x830/0x830 [ 221.922147] ? stack_trace_save+0x94/0xc0 [ 221.922148] ? stack_trace_consume_entry+0x90/0x90 [ 221.922149] __memcg_kmem_charge+0x73/0x120 [ 221.922150] ? cred_has_capability+0x10f/0x200 [ 221.922151] ? mem_cgroup_can_attach+0x260/0x260 [ 221.922152] ? selinux_sb_eat_lsm_opts+0x2f0/0x2f0 [ 221.922153] ? obj_cgroup_charge+0x16b/0x220 [ 221.922154] ? kmem_cache_alloc+0x78/0x4c0 [ 221.922155] obj_cgroup_charge+0x122/0x220 [ 221.922156] ? vm_area_alloc+0x20/0x90 [ 221.922156] kmem_cache_alloc+0x78/0x4c0 [ 221.922157] vm_area_alloc+0x20/0x90 [ 221.922158] mmap_region+0x3ed/0x9a0 [ 221.922159] ? cap_mmap_addr+0x1d/0x80 [ 221.922160] do_mmap+0x3ee/0x720 [ 221.922161] vm_mmap_pgoff+0x16a/0x1c0 [ 221.922162] ? randomize_stack_top+0x90/0x90 [ 221.922163] ? copy_page_range+0x1980/0x1980 [ 221.922163] ksys_mmap_pgoff+0xab/0x350 [ 221.922164] ? find_mergeable_anon_vma+0x110/0x110 [ 221.922165] ? __audit_syscall_entry+0x1a6/0x1e0 [ 221.922166] __x64_sys_mmap+0x8d/0xb0 [ 221.922167] do_syscall_64+0x38/0x50 [ 221.922168] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 221.922169] RIP: 0033:0x7fe8f5e75103 [ 221.922172] Code: 54 41 89 d4 55 48 89 fd 53 4c 89 cb 48 85 ff 74 56 49 89 d9 45 89 f8 45 89 f2 44 89 e2 4c 89 ee 48 89 ef b8 09 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 7d 5b 5d 41 5c 41 5d 41 5e 41 5f c3 66 2e 0f [ 221.922173] RSP: 002b:00007ffd38c90198 EFLAGS: 00000246 ORIG_RAX: 0000000000000009 [ 221.922175] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fe8f5e75103 [ 221.922176] RDX: 0000000000000003 RSI: 0000000000001000 RDI: 0000000000000000 [ 221.922178] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 [ 221.922179] R10: 0000000000002022 R11: 0000000000000246 R12: 0000000000000003 [ 221.922180] R13: 0000000000001000 R14: 0000000000002022 R15: 0000000000000000 [ 221.922181] [ 213O[ 221.922182] The buggy address belongs to the variable: [ 221.922183] clear_seq+0x2d/0x40 [ 221.922183] [ 221.922184] Memory state around the buggy address: [ 221.922185] ffffffffba51da80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 221.922187] ffffffffba51db00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 221.922188] >ffffffffba51db80: f9 f9 f9 f9 00 f9 f9 f9 f9 f9 f9 f9 00 f9 f9 f9 [ 221.922189] ^ [ 221.922190] ffffffffba51dc00: f9 f9 f9 f9 00 f9 f9 f9 f9 f9 f9 f9 00 f9 f9 f9 [ 221.922191] ffffffffba51dc80: f9 f9 f9 f9 01 f9 f9 f9 f9 f9 f9 f9 00 f9 f9 f9 [ 221.922193] ================================================================== [ 221.922194] Disabling lock debugging due to kernel taint [ 221.922196] ,task=memcg_test_1,pid=11280,uid=0 [ 221.922205] Memory cgroup out of memory: Killed process 11280 Link: https://lore.kernel.org/r/CA+G9fYt46oC7-BKryNDaaXPJ9GztvS2cs_7GjYRjanRi4+ryCQ@mail.gmail.com Fixes: 4cfc7258f876a7feba673ac ("printk: ringbuffer: add finalization/extension support") Reported-by: Naresh Kamboju Reviewed-by: John Ogness Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20201014175051.GC13775@alley --- kernel/printk/printk_ringbuffer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 2493348a1631..24a960a89aa8 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -1125,7 +1125,10 @@ static char *data_realloc(struct printk_ringbuffer *rb, /* If the data block does not increase, there is nothing to do. */ if (head_lpos - next_lpos < DATA_SIZE(data_ring)) { - blk = to_block(data_ring, blk_lpos->begin); + if (wrapped) + blk = to_block(data_ring, 0); + else + blk = to_block(data_ring, blk_lpos->begin); return &blk->data[0]; } -- cgit v1.2.3 From e688c3db7ca69bea1872c5706aec6a7fdf89df17 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Oct 2020 10:56:08 -0700 Subject: bpf: Fix register equivalence tracking. The 64-bit JEQ/JNE handling in reg_set_min_max() was clearing reg->id in either true or false branch. In the case 'if (reg->id)' check was done on the other branch the counter part register would have reg->id == 0 when called into find_equal_scalars(). In such case the helper would incorrectly identify other registers with id == 0 as equivalent and propagate the state incorrectly. Fix it by preserving ID across reg_set_min_max(). In other words any kind of comparison operator on the scalar register should preserve its ID to recognize: r1 = r2 if (r1 == 20) { #1 here both r1 and r2 == 20 } else if (r2 < 20) { #2 here both r1 and r2 < 20 } The patch is addressing #1 case. The #2 was working correctly already. Fixes: 75748837b7e5 ("bpf: Propagate scalar ranges through register assignments.") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: John Fastabend Tested-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201014175608.1416-1-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c43a5e8f0818..39d7f44e7c92 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1010,14 +1010,9 @@ static const int caller_saved[CALLER_SAVED_REGS] = { static void __mark_reg_not_init(const struct bpf_verifier_env *env, struct bpf_reg_state *reg); -/* Mark the unknown part of a register (variable offset or scalar value) as - * known to have the value @imm. - */ -static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) +/* This helper doesn't clear reg->id */ +static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm) { - /* Clear id, off, and union(map_ptr, range) */ - memset(((u8 *)reg) + sizeof(reg->type), 0, - offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type)); reg->var_off = tnum_const(imm); reg->smin_value = (s64)imm; reg->smax_value = (s64)imm; @@ -1030,6 +1025,17 @@ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) reg->u32_max_value = (u32)imm; } +/* Mark the unknown part of a register (variable offset or scalar value) as + * known to have the value @imm. + */ +static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) +{ + /* Clear id, off, and union(map_ptr, range) */ + memset(((u8 *)reg) + sizeof(reg->type), 0, + offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type)); + ___mark_reg_known(reg, imm); +} + static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm) { reg->var_off = tnum_const_subreg(reg->var_off, imm); @@ -7001,14 +7007,18 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *reg = opcode == BPF_JEQ ? true_reg : false_reg; - /* For BPF_JEQ, if this is false we know nothing Jon Snow, but - * if it is true we know the value for sure. Likewise for - * BPF_JNE. + /* JEQ/JNE comparison doesn't change the register equivalence. + * r1 = r2; + * if (r1 == 42) goto label; + * ... + * label: // here both r1 and r2 are known to be 42. + * + * Hence when marking register as known preserve it's ID. */ if (is_jmp32) __mark_reg32_known(reg, val32); else - __mark_reg_known(reg, val); + ___mark_reg_known(reg, val); break; } case BPF_JSET: @@ -7551,7 +7561,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, reg_combine_min_max(&other_branch_regs[insn->src_reg], &other_branch_regs[insn->dst_reg], src_reg, dst_reg, opcode); - if (src_reg->id) { + if (src_reg->id && + !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { find_equal_scalars(this_branch, src_reg); find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]); } @@ -7563,7 +7574,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, opcode, is_jmp32); } - if (dst_reg->type == SCALAR_VALUE && dst_reg->id) { + if (dst_reg->type == SCALAR_VALUE && dst_reg->id && + !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) { find_equal_scalars(this_branch, dst_reg); find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]); } -- cgit v1.2.3 From 6d9bd139455d9d40fec8c242985996468b34180c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 13 Oct 2020 15:48:52 -0400 Subject: tracing: Check return value of __create_val_fields() before using its result After having a typo for writing a histogram trigger. Wrote: echo 'hist:key=pid:ts=common_timestamp.usec' > events/sched/sched_waking/trigger Instead of: echo 'hist:key=pid:ts=common_timestamp.usecs' > events/sched/sched_waking/trigger and the following crash happened: BUG: kernel NULL pointer dereference, address: 0000000000000008 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 4 PID: 1641 Comm: sh Not tainted 5.9.0-rc5-test+ #549 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 RIP: 0010:event_hist_trigger_func+0x70b/0x1ee0 Code: 24 08 89 d5 49 89 cc e9 8c 00 00 00 4c 89 f2 41 b9 00 10 00 00 4c 89 e1 44 89 ee 4c 89 ff e8 dc d3 ff ff 45 89 ea 4b 8b 14 d7 42 08 04 74 17 41 8b 8f c0 00 00 00 8d 71 01 41 89 b7 c0 00 00 RSP: 0018:ffff959213d53db0 EFLAGS: 00010202 RAX: ffffffffffffffea RBX: 0000000000000000 RCX: 0000000000084c04 RDX: 0000000000000000 RSI: df7326aefebd174c RDI: 0000000000031080 RBP: 0000000000000002 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000000000001 R11: 0000000000000046 R12: ffff959211dcf690 R13: 0000000000000001 R14: ffff95925a36e370 R15: ffff959251c89800 FS: 00007fb9ea934740(0000) GS:ffff95925ab00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 00000000c976c005 CR4: 00000000001706e0 Call Trace: ? trigger_process_regex+0x78/0x110 trigger_process_regex+0xc5/0x110 event_trigger_write+0x71/0xd0 vfs_write+0xca/0x210 ksys_write+0x70/0xf0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7fb9eaa29487 Code: 64 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 This was caused by accessing the hlist_data fields after the call to __create_val_fields() without checking if the creation succeed. Link: https://lkml.kernel.org/r/20201013154852.3abd8702@gandalf.local.home Fixes: 63a1e5de3006 ("tracing: Save normal string variables") Reviewed-by: Masami Hiramatsu Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index c74a7d157306..96c3f86b81c5 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3687,7 +3687,7 @@ static int create_var_field(struct hist_trigger_data *hist_data, ret = __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags); - if (hist_data->fields[val_idx]->flags & HIST_FIELD_FL_STRING) + if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_STRING) hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++; return ret; -- cgit v1.2.3 From bbeb97464eefc65f506084fd9f18f21653e01137 Mon Sep 17 00:00:00 2001 From: Gaurav Kohli Date: Tue, 6 Oct 2020 15:03:53 +0530 Subject: tracing: Fix race in trace_open and buffer resize call Below race can come, if trace_open and resize of cpu buffer is running parallely on different cpus CPUX CPUY ring_buffer_resize atomic_read(&buffer->resize_disabled) tracing_open tracing_reset_online_cpus ring_buffer_reset_cpu rb_reset_cpu rb_update_pages remove/insert pages resetting pointer This race can cause data abort or some times infinte loop in rb_remove_pages and rb_insert_pages while checking pages for sanity. Take buffer lock to fix this. Link: https://lkml.kernel.org/r/1601976833-24377-1-git-send-email-gkohli@codeaurora.org Cc: stable@vger.kernel.org Fixes: b23d7a5f4a07a ("ring-buffer: speed up buffer resets by avoiding synchronize_rcu for each CPU") Signed-off-by: Gaurav Kohli Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 93ef0ab6ea20..15bf28b13e50 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4866,6 +4866,9 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; + /* prevent another thread from changing buffer sizes */ + mutex_lock(&buffer->mutex); + atomic_inc(&cpu_buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); @@ -4876,6 +4879,8 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); + + mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -4889,6 +4894,9 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) struct ring_buffer_per_cpu *cpu_buffer; int cpu; + /* prevent another thread from changing buffer sizes */ + mutex_lock(&buffer->mutex); + for_each_online_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; @@ -4907,6 +4915,8 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); } + + mutex_unlock(&buffer->mutex); } /** -- cgit v1.2.3 From c16340971949ba6560c8e7c985bad8a180c57aa3 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sun, 11 Oct 2020 00:28:09 +0900 Subject: tracing/boot: Add ftrace.instance.*.alloc_snapshot option Add ftrace.instance.*.alloc_snapshot option. This option has been described in Documentation/trace/boottime-trace.rst but not implemented yet. ftrace.[instance.INSTANCE.]alloc_snapshot Allocate snapshot buffer. The difference from kernel.alloc_snapshot is that the kernel.alloc_snapshot will allocate the buffer only for the main instance, but this can allocate buffer for any new instances. Link: https://lkml.kernel.org/r/160234368948.400560.15313384470765915015.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 754e3cf2df3a..c22a152ef0b4 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -284,6 +284,12 @@ trace_boot_enable_tracer(struct trace_array *tr, struct xbc_node *node) if (tracing_set_tracer(tr, p) < 0) pr_err("Failed to set given tracer: %s\n", p); } + + /* Since tracer can free snapshot buffer, allocate snapshot here.*/ + if (xbc_node_find_value(node, "alloc_snapshot", NULL)) { + if (tracing_alloc_snapshot_instance(tr) < 0) + pr_err("Failed to allocate snapshot buffer\n"); + } } static void __init -- cgit v1.2.3 From 499f7bb0853570c5d9cbf2a2ecbed517852cacfa Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Sat, 10 Oct 2020 22:09:24 +0800 Subject: tracing: Fix some typos in comments s/wihin/within/ s/retrieven/retrieved/ s/suppport/support/ s/wil/will/ s/accidently/accidentally/ s/if the if the/if the/ Link: https://lkml.kernel.org/r/20201010140924.3809-1-hqjagain@gmail.com Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 ++-- kernel/trace/trace.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0806fa9f2815..63c97012ed39 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9465,7 +9465,7 @@ __init static int tracer_alloc_buffers(void) } /* - * Make sure we don't accidently add more trace options + * Make sure we don't accidentally add more trace options * than we have bits for. */ BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE); @@ -9494,7 +9494,7 @@ __init static int tracer_alloc_buffers(void) /* * The prepare callbacks allocates some memory for the ring buffer. We - * don't free the buffer if the if the CPU goes down. If we were to free + * don't free the buffer if the CPU goes down. If we were to free * the buffer, then the user would lose any trace that was in the * buffer. The memory will be removed once the "instance" is removed. */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5b0e797cacdd..f777bb68e660 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -246,7 +246,7 @@ typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data); * tracing_snapshot_cond(tr, cond_data), the cond_data passed in is * passed in turn to the cond_snapshot.update() function. That data * can be compared by the update() implementation with the cond_data - * contained wihin the struct cond_snapshot instance associated with + * contained within the struct cond_snapshot instance associated with * the trace_array. Because the tr->max_lock is held throughout the * update() call, the update() function can directly retrieve the * cond_snapshot and cond_data associated with the per-instance @@ -271,7 +271,7 @@ typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data); * take the snapshot, by returning 'true' if so, 'false' if no * snapshot should be taken. Because the max_lock is held for * the duration of update(), the implementation is safe to - * directly retrieven and save any implementation data it needs + * directly retrieved and save any implementation data it needs * to in association with the snapshot. */ struct cond_snapshot { @@ -573,7 +573,7 @@ struct tracer { * The function callback, which can use the FTRACE bits to * check for recursion. * - * Now if the arch does not suppport a feature, and it calls + * Now if the arch does not support a feature, and it calls * the global list function which calls the ftrace callback * all three of these steps will do a recursion protection. * There's no reason to do one if the previous caller already @@ -1479,7 +1479,7 @@ __trace_event_discard_commit(struct trace_buffer *buffer, /* * Helper function for event_trigger_unlock_commit{_regs}(). * If there are event triggers attached to this event that requires - * filtering against its fields, then they wil be called as the + * filtering against its fields, then they will be called as the * entry already holds the field information of the current event. * * It also checks if the event should be discarded or not. -- cgit v1.2.3 From 7d27adf575e7e917666f4eeca62188353a358060 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 13 Oct 2020 09:17:52 -0500 Subject: tracing: Don't show dynamic string internals in synthetic event description For synthetic event dynamic fields, the type contains "__data_loc", which is basically an internal part of the type which is only meant to be displayed in the format, not in the event description itself, which is confusing to users since they can't use __data_loc on the command-line to define an event field, which printing it would lead them to believe. So filter it out from the description, while leaving it in the type. Link: https://lkml.kernel.org/r/b3b7baf7813298a5ede4ff02e2e837b91c05a724.1602598160.git.zanussi@kernel.org Reported-by: Masami Hiramatsu Tested-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_synth.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 3b2dcc42b8ee..b19e2f4159ab 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1867,14 +1867,22 @@ static int __synth_event_show(struct seq_file *m, struct synth_event *event) { struct synth_field *field; unsigned int i; + char *type, *t; seq_printf(m, "%s\t", event->name); for (i = 0; i < event->n_fields; i++) { field = event->fields[i]; + type = field->type; + t = strstr(type, "__data_loc"); + if (t) { /* __data_loc belongs in format but not event desc */ + t += sizeof("__data_loc"); + type = t; + } + /* parameter values */ - seq_printf(m, "%s %s%s", field->type, field->name, + seq_printf(m, "%s %s%s", type, field->name, i == event->n_fields - 1 ? "" : "; "); } -- cgit v1.2.3 From 42d120e2dda5724ea789461413b8691abc315ba1 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 13 Oct 2020 09:17:53 -0500 Subject: tracing: Move is_good_name() from trace_probe.h to trace.h is_good_name() is useful for other trace infrastructure, such as synthetic events, so make it available via trace.h. Link: https://lkml.kernel.org/r/cc6d6a2d7da6957fcbe1e2922e76d18d2bb459b4.1602598160.git.zanussi@kernel.org Acked-by: Masami Hiramatsu Tested-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 13 +++++++++++++ kernel/trace/trace_probe.h | 13 ------------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f777bb68e660..34e0c4d5a6e7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -19,6 +19,7 @@ #include #include #include +#include #ifdef CONFIG_FTRACE_SYSCALLS #include /* For NR_SYSCALLS */ @@ -2090,4 +2091,16 @@ static __always_inline void trace_iterator_reset(struct trace_iterator *iter) iter->pos = -1; } +/* Check the name is good for event/group/fields */ +static inline bool is_good_name(const char *name) +{ + if (!isalpha(*name) && *name != '_') + return false; + while (*++name != '\0') { + if (!isalpha(*name) && !isdigit(*name) && *name != '_') + return false; + } + return true; +} + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 04d00987da69..2f703a20c724 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -348,18 +347,6 @@ bool trace_probe_match_command_args(struct trace_probe *tp, #define trace_probe_for_each_link_rcu(pos, tp) \ list_for_each_entry_rcu(pos, &(tp)->event->files, list) -/* Check the name is good for event/group/fields */ -static inline bool is_good_name(const char *name) -{ - if (!isalpha(*name) && *name != '_') - return false; - while (*++name != '\0') { - if (!isalpha(*name) && !isdigit(*name) && *name != '_') - return false; - } - return true; -} - #define TPARG_FL_RETURN BIT(0) #define TPARG_FL_KERNEL BIT(1) #define TPARG_FL_FENTRY BIT(2) -- cgit v1.2.3 From 9bbb33291f8e44819aaed32d367f702303ff663e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 13 Oct 2020 09:17:54 -0500 Subject: tracing: Check that the synthetic event and field names are legal Call the is_good_name() function used by probe events to make sure synthetic event and field names don't contain illegal characters and cause unexpected parsing of synthetic event commands. Link: https://lkml.kernel.org/r/c4d4bb59d3ac39bcbd70fba0cf837d6b1cedb015.1602598160.git.zanussi@kernel.org Fixes: 4b147936fa50 (tracing: Add support for 'synthetic' events) Reported-by: Masami Hiramatsu Reviewed-by: Masami Hiramatsu Tested-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_synth.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index b19e2f4159ab..8c9d6e464da0 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -572,6 +572,10 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, ret = -ENOMEM; goto free; } + if (!is_good_name(field->name)) { + ret = -EINVAL; + goto free; + } if (field_type[0] == ';') field_type++; @@ -1112,6 +1116,11 @@ static int __create_synth_event(int argc, const char *name, const char **argv) mutex_lock(&event_mutex); + if (!is_good_name(name)) { + ret = -EINVAL; + goto out; + } + event = find_synth_event(name); if (event) { ret = -EEXIST; -- cgit v1.2.3 From d4d704637d935ef5e588b0610b647376dd9f37d4 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 13 Oct 2020 09:17:55 -0500 Subject: tracing: Add synthetic event error logging Add support for synthetic event error logging, which entails adding a logging function for it, a way to save the synthetic event command, and a set of specific synthetic event parse error strings and handling. Link: https://lkml.kernel.org/r/ed099c66df13b40cfc633aaeb17f66c37a923066.1602598160.git.zanussi@kernel.org [ : wrote save_cmdstr() seq_buf implementation. ] Tested-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_synth.c | 92 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 90 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 8c9d6e464da0..f77851018121 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -20,6 +20,48 @@ #include "trace_synth.h" +#undef ERRORS +#define ERRORS \ + C(BAD_NAME, "Illegal name"), \ + C(CMD_INCOMPLETE, "Incomplete command"), \ + C(EVENT_EXISTS, "Event already exists"), \ + C(TOO_MANY_FIELDS, "Too many fields"), \ + C(INCOMPLETE_TYPE, "Incomplete type"), \ + C(INVALID_TYPE, "Invalid type"), \ + C(INVALID_FIELD, "Invalid field"), \ + C(CMD_TOO_LONG, "Command too long"), + +#undef C +#define C(a, b) SYNTH_ERR_##a + +enum { ERRORS }; + +#undef C +#define C(a, b) b + +static const char *err_text[] = { ERRORS }; + +static char last_cmd[MAX_FILTER_STR_VAL]; + +static int errpos(const char *str) +{ + return err_pos(last_cmd, str); +} + +static void last_cmd_set(char *str) +{ + if (!str) + return; + + strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1); +} + +static void synth_err(u8 err_type, u8 err_pos) +{ + tracing_log_err(NULL, "synthetic_events", last_cmd, err_text, + err_type, err_pos); +} + static int create_synth_event(int argc, const char **argv); static int synth_event_show(struct seq_file *m, struct dyn_event *ev); static int synth_event_release(struct dyn_event *ev); @@ -545,8 +587,10 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, field_type++; if (!strcmp(field_type, "unsigned")) { - if (argc < 3) + if (argc < 3) { + synth_err(SYNTH_ERR_INCOMPLETE_TYPE, errpos(field_type)); return ERR_PTR(-EINVAL); + } prefix = "unsigned "; field_type = argv[1]; field_name = argv[2]; @@ -573,6 +617,7 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, goto free; } if (!is_good_name(field->name)) { + synth_err(SYNTH_ERR_BAD_NAME, errpos(field_name)); ret = -EINVAL; goto free; } @@ -601,6 +646,7 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, size = synth_field_size(field->type); if (size < 0) { + synth_err(SYNTH_ERR_INVALID_TYPE, errpos(field_type)); ret = -EINVAL; goto free; } else if (size == 0) { @@ -621,6 +667,7 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, field->is_dynamic = true; size = sizeof(u64); } else { + synth_err(SYNTH_ERR_INVALID_TYPE, errpos(field_type)); ret = -EINVAL; goto free; } @@ -1098,12 +1145,47 @@ int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name, } EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start); +static int save_cmdstr(int argc, const char *name, const char **argv) +{ + struct seq_buf s; + char *buf; + int i; + + buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + seq_buf_init(&s, buf, MAX_DYNEVENT_CMD_LEN); + + seq_buf_puts(&s, name); + + for (i = 0; i < argc; i++) { + seq_buf_putc(&s, ' '); + seq_buf_puts(&s, argv[i]); + } + + if (!seq_buf_buffer_left(&s)) { + synth_err(SYNTH_ERR_CMD_TOO_LONG, 0); + kfree(buf); + return -EINVAL; + } + buf[s.len] = 0; + last_cmd_set(buf); + + kfree(buf); + return 0; +} + static int __create_synth_event(int argc, const char *name, const char **argv) { struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; struct synth_event *event = NULL; int i, consumed = 0, n_fields = 0, ret = 0; + ret = save_cmdstr(argc, name, argv); + if (ret) + return ret; + /* * Argument syntax: * - Add synthetic event: field[;field] ... @@ -1111,18 +1193,22 @@ static int __create_synth_event(int argc, const char *name, const char **argv) * where 'field' = type field_name */ - if (name[0] == '\0' || argc < 1) + if (name[0] == '\0' || argc < 1) { + synth_err(SYNTH_ERR_CMD_INCOMPLETE, 0); return -EINVAL; + } mutex_lock(&event_mutex); if (!is_good_name(name)) { + synth_err(SYNTH_ERR_BAD_NAME, errpos(name)); ret = -EINVAL; goto out; } event = find_synth_event(name); if (event) { + synth_err(SYNTH_ERR_EVENT_EXISTS, errpos(name)); ret = -EEXIST; goto out; } @@ -1131,6 +1217,7 @@ static int __create_synth_event(int argc, const char *name, const char **argv) if (strcmp(argv[i], ";") == 0) continue; if (n_fields == SYNTH_FIELDS_MAX) { + synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0); ret = -EINVAL; goto err; } @@ -1145,6 +1232,7 @@ static int __create_synth_event(int argc, const char *name, const char **argv) } if (i < argc && strcmp(argv[i], ";") != 0) { + synth_err(SYNTH_ERR_INVALID_FIELD, errpos(argv[i])); ret = -EINVAL; goto err; } -- cgit v1.2.3 From 10819e25799aae564005b6049a45e9808797b3bb Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 13 Oct 2020 09:17:57 -0500 Subject: tracing: Handle synthetic event array field type checking correctly Since synthetic event array types are derived from the field name, there may be a semicolon at the end of the type which should be stripped off. If there are more characters following that, normal type string checking will result in an invalid type. Without this patch, you can end up with an invalid field type string that gets displayed in both the synthetic event description and the event format: Before: # echo 'myevent char str[16]; int v' >> synthetic_events # cat synthetic_events myevent char[16]; str; int v name: myevent ID: 1936 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:char str[16];; offset:8; size:16; signed:1; field:int v; offset:40; size:4; signed:1; print fmt: "str=%s, v=%d", REC->str, REC->v After: # echo 'myevent char str[16]; int v' >> synthetic_events # cat synthetic_events myevent char[16] str; int v # cat events/synthetic/myevent/format name: myevent ID: 1936 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:char str[16]; offset:8; size:16; signed:1; field:int v; offset:40; size:4; signed:1; print fmt: "str=%s, v=%d", REC->str, REC->v Link: https://lkml.kernel.org/r/6587663b56c2d45ab9d8c8472a2110713cdec97d.1602598160.git.zanussi@kernel.org [ : wrote parse_synth_field() snippet. ] Fixes: 4b147936fa50 (tracing: Add support for 'synthetic' events) Reported-by: Masami Hiramatsu Tested-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_synth.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index f77851018121..d239f0e2af8f 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -174,7 +174,7 @@ static int synth_field_string_size(char *type) start += sizeof("char[") - 1; end = strchr(type, ']'); - if (!end || end < start) + if (!end || end < start || type + strlen(type) > end + 1) return -EINVAL; len = end - start; @@ -625,8 +625,14 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, if (field_type[0] == ';') field_type++; len = strlen(field_type) + 1; - if (array) - len += strlen(array); + + if (array) { + int l = strlen(array); + + if (l && array[l - 1] == ';') + l--; + len += l; + } if (prefix) len += strlen(prefix); -- cgit v1.2.3 From 6107742d15832011cd0396d821f3225b52551f1f Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 9 Oct 2020 15:05:23 -0700 Subject: tracing: support "bool" type in synthetic trace events It's common [1] to define tracepoint fields as "bool" when they contain a true / false value. Currently, defining a synthetic event with a "bool" field yields EINVAL. It's possible to work around this by using e.g. u8 (assuming sizeof(bool) is 1, and bool is unsigned; if either of these properties don't match, you get EINVAL [2]). Supporting "bool" explicitly makes hooking this up easier and more portable for userspace. [1]: grep -r "bool" include/trace/events/ [2]: check_synth_field() in kernel/trace/trace_events_hist.c Link: https://lkml.kernel.org/r/20201009220524.485102-2-axelrasmussen@google.com Acked-by: Michel Lespinasse Acked-by: David Rientjes Signed-off-by: Axel Rasmussen Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_synth.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index d239f0e2af8f..3212e2c653b3 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -229,6 +229,8 @@ static int synth_field_size(char *type) size = sizeof(long); else if (strcmp(type, "unsigned long") == 0) size = sizeof(unsigned long); + else if (strcmp(type, "bool") == 0) + size = sizeof(bool); else if (strcmp(type, "pid_t") == 0) size = sizeof(pid_t); else if (strcmp(type, "gfp_t") == 0) @@ -271,6 +273,8 @@ static const char *synth_field_fmt(char *type) fmt = "%ld"; else if (strcmp(type, "unsigned long") == 0) fmt = "%lu"; + else if (strcmp(type, "bool") == 0) + fmt = "%d"; else if (strcmp(type, "pid_t") == 0) fmt = "%d"; else if (strcmp(type, "gfp_t") == 0) -- cgit v1.2.3 From 3eb6b31bfb1732f4c48ef5a271dc15158efe8323 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 29 Sep 2020 13:12:51 +0200 Subject: workqueue: fix a kernel-doc warning As warned by Sphinx: ./Documentation/core-api/workqueue:400: ./kernel/workqueue.c:1218: WARNING: Unexpected indentation. the return code table is currently not recognized, as it lacks markups. Acked-by: Tejun Heo Signed-off-by: Mauro Carvalho Chehab --- kernel/workqueue.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ac088ce6059b..437935e7a199 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1212,11 +1212,14 @@ out_put: * stable state - idle, on timer or on worklist. * * Return: + * + * ======== ================================================================ * 1 if @work was pending and we successfully stole PENDING * 0 if @work was idle and we claimed PENDING * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry * -ENOENT if someone else is canceling @work, this state may persist * for arbitrarily long + * ======== ================================================================ * * Note: * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting -- cgit v1.2.3 From ce66f6136460a51acfc32de4481fe8fd69dfd50b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 16 Oct 2020 13:20:02 +0900 Subject: tracing: Remove __init from __trace_early_add_new_event() The commit 720dee53ad8d ("tracing/boot: Initialize per-instance event list in early boot") removes __init from __trace_early_add_events() but __trace_early_add_new_event() still has __init and will cause a section mismatch. Remove __init from __trace_early_add_new_event() as same as __trace_early_add_events(). Link: https://lore.kernel.org/lkml/CAHk-=wjU86UhovK4XuwvCqTOfc+nvtpAuaN2PJBz15z=w=u0Xg@mail.gmail.com/ Reported-by: Linus Torvalds Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 851ab37058dd..e705f06c68c6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2498,7 +2498,7 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) * for enabling events at boot. We want to enable events before * the filesystem is initialized. */ -static __init int +static int __trace_early_add_new_event(struct trace_event_call *call, struct trace_array *tr) { -- cgit v1.2.3 From ec62d04e3fdc4ba3a7912cd7f6da1a4e787a0d75 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:28 -0700 Subject: kernel/resource: make release_mem_region_adjustable() never fail Patch series "selective merging of system ram resources", v4. Some add_memory*() users add memory in small, contiguous memory blocks. Examples include virtio-mem, hyper-v balloon, and the XEN balloon. This can quickly result in a lot of memory resources, whereby the actual resource boundaries are not of interest (e.g., it might be relevant for DIMMs, exposed via /proc/iomem to user space). We really want to merge added resources in this scenario where possible. Resources are effectively stored in a list-based tree. Having a lot of resources not only wastes memory, it also makes traversing that tree more expensive, and makes /proc/iomem explode in size (e.g., requiring kexec-tools to manually merge resources when creating a kdump header. The current kexec-tools resource count limit does not allow for more than ~100GB of memory with a memory block size of 128MB on x86-64). Let's allow to selectively merge system ram resources by specifying a new flag for add_memory*(). Patch #5 contains a /proc/iomem example. Only tested with virtio-mem. This patch (of 8): Let's make sure splitting a resource on memory hotunplug will never fail. This will become more relevant once we merge selected System RAM resources - then, we'll trigger that case more often on memory hotunplug. In general, this function is already unlikely to fail. When we remove memory, we free up quite a lot of metadata (memmap, page tables, memory block device, etc.). The only reason it could really fail would be when injecting allocation errors. All other error cases inside release_mem_region_adjustable() seem to be sanity checks if the function would be abused in different context - let's add WARN_ON_ONCE() in these cases so we can catch them. [natechancellor@gmail.com: fix use of ternary condition in release_mem_region_adjustable] Link: https://lkml.kernel.org/r/20200922060748.2452056-1-natechancellor@gmail.com Link: https://github.com/ClangBuiltLinux/linux/issues/1159 Signed-off-by: David Hildenbrand Signed-off-by: Nathan Chancellor Signed-off-by: Andrew Morton Cc: Michal Hocko Cc: Dan Williams Cc: Jason Gunthorpe Cc: Kees Cook Cc: Ard Biesheuvel Cc: Pankaj Gupta Cc: Baoquan He Cc: Wei Yang Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Boris Ostrovsky Cc: Christian Borntraeger Cc: Dave Jiang Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Heiko Carstens Cc: Jason Wang Cc: Juergen Gross Cc: Julien Grall Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Leonardo Bras Cc: Libor Pechacek Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Nathan Lynch Cc: "Oliver O'Halloran" Cc: Paul Mackerras Cc: Pingfan Liu Cc: "Rafael J. Wysocki" Cc: Roger Pau Monn Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vishal Verma Cc: Wei Liu Link: https://lkml.kernel.org/r/20200911103459.10306-2-david@redhat.com Signed-off-by: Linus Torvalds --- kernel/resource.c | 49 ++++++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index f1175ce93a1d..4c1c487abbfb 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1258,21 +1258,28 @@ EXPORT_SYMBOL(__release_region); * assumes that all children remain in the lower address entry for * simplicity. Enhance this logic when necessary. */ -int release_mem_region_adjustable(struct resource *parent, - resource_size_t start, resource_size_t size) +void release_mem_region_adjustable(struct resource *parent, + resource_size_t start, resource_size_t size) { + struct resource *new_res = NULL; + bool alloc_nofail = false; struct resource **p; struct resource *res; - struct resource *new_res; resource_size_t end; - int ret = -EINVAL; end = start + size - 1; - if ((start < parent->start) || (end > parent->end)) - return ret; + if (WARN_ON_ONCE((start < parent->start) || (end > parent->end))) + return; - /* The alloc_resource() result gets checked later */ - new_res = alloc_resource(GFP_KERNEL); + /* + * We free up quite a lot of memory on memory hotunplug (esp., memap), + * just before releasing the region. This is highly unlikely to + * fail - let's play save and make it never fail as the caller cannot + * perform any error handling (e.g., trying to re-add memory will fail + * similarly). + */ +retry: + new_res = alloc_resource(GFP_KERNEL | (alloc_nofail ? __GFP_NOFAIL : 0)); p = &parent->child; write_lock(&resource_lock); @@ -1298,7 +1305,6 @@ int release_mem_region_adjustable(struct resource *parent, * so if we are dealing with them, let us just back off here. */ if (!(res->flags & IORESOURCE_SYSRAM)) { - ret = 0; break; } @@ -1315,20 +1321,23 @@ int release_mem_region_adjustable(struct resource *parent, /* free the whole entry */ *p = res->sibling; free_resource(res); - ret = 0; } else if (res->start == start && res->end != end) { /* adjust the start */ - ret = __adjust_resource(res, end + 1, - res->end - end); + WARN_ON_ONCE(__adjust_resource(res, end + 1, + res->end - end)); } else if (res->start != start && res->end == end) { /* adjust the end */ - ret = __adjust_resource(res, res->start, - start - res->start); + WARN_ON_ONCE(__adjust_resource(res, res->start, + start - res->start)); } else { - /* split into two entries */ + /* split into two entries - we need a new resource */ if (!new_res) { - ret = -ENOMEM; - break; + new_res = alloc_resource(GFP_ATOMIC); + if (!new_res) { + alloc_nofail = true; + write_unlock(&resource_lock); + goto retry; + } } new_res->name = res->name; new_res->start = end + 1; @@ -1339,9 +1348,8 @@ int release_mem_region_adjustable(struct resource *parent, new_res->sibling = res->sibling; new_res->child = NULL; - ret = __adjust_resource(res, res->start, - start - res->start); - if (ret) + if (WARN_ON_ONCE(__adjust_resource(res, res->start, + start - res->start))) break; res->sibling = new_res; new_res = NULL; @@ -1352,7 +1360,6 @@ int release_mem_region_adjustable(struct resource *parent, write_unlock(&resource_lock); free_resource(new_res); - return ret; } #endif /* CONFIG_MEMORY_HOTREMOVE */ -- cgit v1.2.3 From 7cf603d17d9bddbda90c424b6f30c7bc2e6f48f2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:33 -0700 Subject: kernel/resource: move and rename IORESOURCE_MEM_DRIVER_MANAGED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IORESOURCE_MEM_DRIVER_MANAGED currently uses an unused PnP bit, which is always set to 0 by hardware. This is far from beautiful (and confusing), and the bit only applies to SYSRAM. So let's move it out of the bus-specific (PnP) defined bits. We'll add another SYSRAM specific bit soon. If we ever need more bits for other purposes, we can steal some from "desc", or reshuffle/regroup what we have. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Cc: Michal Hocko Cc: Dan Williams Cc: Jason Gunthorpe Cc: Kees Cook Cc: Ard Biesheuvel Cc: Pankaj Gupta Cc: Baoquan He Cc: Wei Yang Cc: Eric Biederman Cc: Thomas Gleixner Cc: Greg Kroah-Hartman Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Boris Ostrovsky Cc: Christian Borntraeger Cc: Dave Jiang Cc: Haiyang Zhang Cc: Heiko Carstens Cc: Jason Wang Cc: Juergen Gross Cc: Julien Grall Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Leonardo Bras Cc: Libor Pechacek Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Nathan Lynch Cc: "Oliver O'Halloran" Cc: Paul Mackerras Cc: Pingfan Liu Cc: "Rafael J. Wysocki" Cc: Roger Pau Monné Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Vasily Gorbik Cc: Vishal Verma Cc: Wei Liu Link: https://lkml.kernel.org/r/20200911103459.10306-3-david@redhat.com Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 84f7316792a7..e21f6b9234f7 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -521,7 +521,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) /* Returning 0 will take to next memory range */ /* Don't use memory that will be detected and handled by a driver. */ - if (res->flags & IORESOURCE_MEM_DRIVER_MANAGED) + if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED) return 0; if (sz < kbuf->memsz) -- cgit v1.2.3 From 9ca6551ee24368a4d2b09566ea4d10fe87860379 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:49 -0700 Subject: mm/memory_hotplug: MEMHP_MERGE_RESOURCE to specify merging of System RAM resources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some add_memory*() users add memory in small, contiguous memory blocks. Examples include virtio-mem, hyper-v balloon, and the XEN balloon. This can quickly result in a lot of memory resources, whereby the actual resource boundaries are not of interest (e.g., it might be relevant for DIMMs, exposed via /proc/iomem to user space). We really want to merge added resources in this scenario where possible. Let's provide a flag (MEMHP_MERGE_RESOURCE) to specify that a resource either created within add_memory*() or passed via add_memory_resource() shall be marked mergeable and merged with applicable siblings. To implement that, we need a kernel/resource interface to mark selected System RAM resources mergeable (IORESOURCE_SYSRAM_MERGEABLE) and trigger merging. Note: We really want to merge after the whole operation succeeded, not directly when adding a resource to the resource tree (it would break add_memory_resource() and require splitting resources again when the operation failed - e.g., due to -ENOMEM). Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Pankaj Gupta Cc: Michal Hocko Cc: Dan Williams Cc: Jason Gunthorpe Cc: Kees Cook Cc: Ard Biesheuvel Cc: Thomas Gleixner Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Wei Liu Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: Roger Pau Monné Cc: Julien Grall Cc: Baoquan He Cc: Wei Yang Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Christian Borntraeger Cc: Dave Jiang Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Jason Wang Cc: Len Brown Cc: Leonardo Bras Cc: Libor Pechacek Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Nathan Lynch Cc: "Oliver O'Halloran" Cc: Paul Mackerras Cc: Pingfan Liu Cc: "Rafael J. Wysocki" Cc: Vasily Gorbik Cc: Vishal Verma Link: https://lkml.kernel.org/r/20200911103459.10306-6-david@redhat.com Signed-off-by: Linus Torvalds --- kernel/resource.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 4c1c487abbfb..92026827d95b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1363,6 +1363,66 @@ retry: } #endif /* CONFIG_MEMORY_HOTREMOVE */ +#ifdef CONFIG_MEMORY_HOTPLUG +static bool system_ram_resources_mergeable(struct resource *r1, + struct resource *r2) +{ + /* We assume either r1 or r2 is IORESOURCE_SYSRAM_MERGEABLE. */ + return r1->flags == r2->flags && r1->end + 1 == r2->start && + r1->name == r2->name && r1->desc == r2->desc && + !r1->child && !r2->child; +} + +/* + * merge_system_ram_resource - mark the System RAM resource mergeable and try to + * merge it with adjacent, mergeable resources + * @res: resource descriptor + * + * This interface is intended for memory hotplug, whereby lots of contiguous + * system ram resources are added (e.g., via add_memory*()) by a driver, and + * the actual resource boundaries are not of interest (e.g., it might be + * relevant for DIMMs). Only resources that are marked mergeable, that have the + * same parent, and that don't have any children are considered. All mergeable + * resources must be immutable during the request. + * + * Note: + * - The caller has to make sure that no pointers to resources that are + * marked mergeable are used anymore after this call - the resource might + * be freed and the pointer might be stale! + * - release_mem_region_adjustable() will split on demand on memory hotunplug + */ +void merge_system_ram_resource(struct resource *res) +{ + const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + struct resource *cur; + + if (WARN_ON_ONCE((res->flags & flags) != flags)) + return; + + write_lock(&resource_lock); + res->flags |= IORESOURCE_SYSRAM_MERGEABLE; + + /* Try to merge with next item in the list. */ + cur = res->sibling; + if (cur && system_ram_resources_mergeable(res, cur)) { + res->end = cur->end; + res->sibling = cur->sibling; + free_resource(cur); + } + + /* Try to merge with previous item in the list. */ + cur = res->parent->child; + while (cur && cur->sibling != res) + cur = cur->sibling; + if (cur && system_ram_resources_mergeable(cur, res)) { + cur->end = res->end; + cur->sibling = res->sibling; + free_resource(res); + } + write_unlock(&resource_lock); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + /* * Managed region resource */ -- cgit v1.2.3 From cb8e3c8b4f45e4ed8987a581956dc9c3827a5bcf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:09:12 -0700 Subject: kernel/resource: make iomem_resource implicit in release_mem_region_adjustable() "mem" in the name already indicates the root, similar to release_mem_region() and devm_request_mem_region(). Make it implicit. The only single caller always passes iomem_resource, other parents are not applicable. Suggested-by: Wei Yang Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Yang Cc: Michal Hocko Cc: Dan Williams Cc: Jason Gunthorpe Cc: Kees Cook Cc: Ard Biesheuvel Cc: Pankaj Gupta Cc: Baoquan He Link: https://lkml.kernel.org/r/20200916073041.10355-1-david@redhat.com Signed-off-by: Linus Torvalds --- kernel/resource.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 92026827d95b..3ae2f56cc79d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1240,7 +1240,6 @@ EXPORT_SYMBOL(__release_region); #ifdef CONFIG_MEMORY_HOTREMOVE /** * release_mem_region_adjustable - release a previously reserved memory region - * @parent: parent resource descriptor * @start: resource start address * @size: resource region size * @@ -1258,9 +1257,9 @@ EXPORT_SYMBOL(__release_region); * assumes that all children remain in the lower address entry for * simplicity. Enhance this logic when necessary. */ -void release_mem_region_adjustable(struct resource *parent, - resource_size_t start, resource_size_t size) +void release_mem_region_adjustable(resource_size_t start, resource_size_t size) { + struct resource *parent = &iomem_resource; struct resource *new_res = NULL; bool alloc_nofail = false; struct resource **p; -- cgit v1.2.3 From 73eb7f9a4ff034d20c8f9454a5a6f45a60cf8dfc Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 15 Oct 2020 20:10:08 -0700 Subject: mm: use helper function put_write_access() In commit 1da177e4c3f4 ("Linux-2.6.12-rc2"), the helper put_write_access() came with the atomic_dec operation of the i_writecount field. But it forgot to use this helper in __vma_link_file() and dup_mmap(). Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20200924115235.5111-1-linmiaohe@huawei.com Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 3ca8f1f83fb3..e0f74be1423c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -556,7 +556,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, get_file(file); if (tmp->vm_flags & VM_DENYWRITE) - atomic_dec(&inode->i_writecount); + put_write_access(inode); i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) mapping_allow_writable(mapping); -- cgit v1.2.3 From b296a6d53339a79082c1d2c1761e948e8b3def69 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 15 Oct 2020 20:10:21 -0700 Subject: kernel.h: split out min()/max() et al. helpers kernel.h is being used as a dump for all kinds of stuff for a long time. Here is the attempt to start cleaning it up by splitting out min()/max() et al. helpers. At the same time convert users in header and lib folder to use new header. Though for time being include new header back to kernel.h to avoid twisted indirected includes for other existing users. Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Rasmus Villemoes Cc: Joe Perches Cc: Linus Torvalds Link: https://lkml.kernel.org/r/20200910164152.GA1891694@smile.fi.intel.com Signed-off-by: Linus Torvalds --- kernel/range.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/range.c b/kernel/range.c index d84de6766472..56435f96da73 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -2,8 +2,9 @@ /* * Range add and subtract */ -#include #include +#include +#include #include #include #include -- cgit v1.2.3 From 15ec0fcff6dab37533e02e2092d9228726186366 Mon Sep 17 00:00:00 2001 From: Liao Pingfang Date: Thu, 15 Oct 2020 20:10:25 -0700 Subject: kernel/sys.c: replace do_brk with do_brk_flags in comment of prctl_set_mm_map() Replace do_brk with do_brk_flags in comment of prctl_set_mm_map(), since do_brk was removed in following commit. Fixes: bb177a732c4369 ("mm: do not bug_on on incorrect length in __mm_populate()") Signed-off-by: Liao Pingfang Signed-off-by: Yi Wang Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/1600650751-43127-1-git-send-email-wang.yi59@zte.com.cn Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index ab6c409b1159..6401880dff74 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2034,7 +2034,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * VMAs already unmapped and kernel uses these members for statistics * output in procfs mostly, except * - * - @start_brk/@brk which are used in do_brk but kernel lookups + * - @start_brk/@brk which are used in do_brk_flags but kernel lookups * for VMAs when updating these memvers so anything wrong written * here cause kernel to swear at userspace program but won't lead * to any problem in kernel itself -- cgit v1.2.3 From 7b7b8a2c9560efb5874ea1d84d1dce5ba4c8c487 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Oct 2020 20:10:28 -0700 Subject: kernel/: fix repeated words in comments Fix multiple occurrences of duplicated words in kernel/. Fix one typo/spello on the same line as a duplicate word. Change one instance of "the the" to "that the". Otherwise just drop one of the repeated words. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/98202fa6-8919-ef63-9efe-c0fad5ca7af1@infradead.org Signed-off-by: Linus Torvalds --- kernel/acct.c | 2 +- kernel/cgroup/cpuset.c | 2 +- kernel/dma/direct.c | 2 +- kernel/fork.c | 2 +- kernel/futex.c | 2 +- kernel/irq/timings.c | 2 +- kernel/jump_label.c | 2 +- kernel/kcsan/encoding.h | 2 +- kernel/kexec_core.c | 2 +- kernel/kthread.c | 2 +- kernel/livepatch/state.c | 2 +- kernel/pid_namespace.c | 2 +- kernel/power/snapshot.c | 2 +- kernel/smp.c | 2 +- kernel/user_namespace.c | 2 +- 15 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index b0c5b3a9f5af..a1b3cf7b3c4b 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -25,7 +25,7 @@ * Now we silently close acct_file on attempt to reopen. Cleaned sys_acct(). * XTerms and EMACS are manifestations of pure evil. 21/10/98, AV. * - * Fixed a nasty interaction with with sys_umount(). If the accointing + * Fixed a nasty interaction with sys_umount(). If the accounting * was suspeneded we failed to stop it on umount(). Messy. * Another one: remount to readonly didn't stop accounting. * Question: what should we do if we have CAP_SYS_ADMIN but not diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 642415b8c3c9..57b5b5d0a5fd 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -390,7 +390,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) * The top cpuset doesn't have any online cpu as a * consequence of a race between cpuset_hotplug_work * and cpu hotplug notifier. But we know the top - * cpuset's effective_cpus is on its way to to be + * cpuset's effective_cpus is on its way to be * identical to cpu_online_mask. */ cpumask_copy(pmask, cpu_online_mask); diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index b92d08e65999..06c111544f61 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -16,7 +16,7 @@ #include "direct.h" /* - * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it + * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use * it for entirely different regions. In that case the arch code needs to * override the variable below for dma-direct to work properly. */ diff --git a/kernel/fork.c b/kernel/fork.c index e0f74be1423c..32083db7a2a2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2189,7 +2189,7 @@ static __latent_entropy struct task_struct *copy_process( /* * Ensure that the cgroup subsystem policies allow the new process to be - * forked. It should be noted the the new process's css_set can be changed + * forked. It should be noted that the new process's css_set can be changed * between here and cgroup_post_fork() if an organisation operation is in * progress. */ diff --git a/kernel/futex.c b/kernel/futex.c index a5876694a60e..680854dcf156 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -916,7 +916,7 @@ static inline void exit_pi_state_list(struct task_struct *curr) { } * [10] Found | Found | task | !=taskTID | 0/1 | Invalid * * [1] Indicates that the kernel can acquire the futex atomically. We - * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. + * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. * * [2] Valid, if TID does not belong to a kernel thread. If no matching * thread is found then it indicates that the owner TID has died. diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index e960d7ce7bcc..773b6105c4ae 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -604,7 +604,7 @@ int irq_timings_alloc(int irq) /* * Some platforms can have the same private interrupt per cpu, - * so this function may be be called several times with the + * so this function may be called several times with the * same interrupt number. Just bail out in case the per cpu * stat structure is already allocated. */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index e661c61b3d6b..015ef903ce8c 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -19,7 +19,7 @@ #include #include -/* mutex to protect coming/going of the the jump_label table */ +/* mutex to protect coming/going of the jump_label table */ static DEFINE_MUTEX(jump_label_mutex); void jump_label_lock(void) diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h index f03562aaf2eb..1a6db2f797ac 100644 --- a/kernel/kcsan/encoding.h +++ b/kernel/kcsan/encoding.h @@ -32,7 +32,7 @@ * 1. different addresses but with the same encoded address race; * 2. and both map onto the same watchpoint slots; * - * Both these are assumed to be very unlikely. However, in case it still happens + * Both these are assumed to be very unlikely. However, in case it still * happens, the report logic will filter out the false positive (see report.c). */ #define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c5e5e5a11535..8798a8183974 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -109,7 +109,7 @@ EXPORT_SYMBOL_GPL(kexec_crash_loaded); * defined more restrictively in . * * The code for the transition from the current kernel to the - * the new kernel is placed in the control_code_buffer, whose size + * new kernel is placed in the control_code_buffer, whose size * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single * page of memory is necessary, but some architectures require more. * Because this memory must be identity mapped in the transition from diff --git a/kernel/kthread.c b/kernel/kthread.c index 3edaa380dc7b..e29773c82b70 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -775,7 +775,7 @@ EXPORT_SYMBOL(kthread_create_worker); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it - * it to a given CPU and the associated NUMA node. + * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). diff --git a/kernel/livepatch/state.c b/kernel/livepatch/state.c index 7ee19476de9d..2565d039ade0 100644 --- a/kernel/livepatch/state.c +++ b/kernel/livepatch/state.c @@ -55,7 +55,7 @@ EXPORT_SYMBOL_GPL(klp_get_state); * * The function can be called only during transition when a new * livepatch is being enabled or when such a transition is reverted. - * It is typically called only from from pre/post (un)patch + * It is typically called only from pre/post (un)patch * callbacks. * * Return: pointer to the latest struct klp_state from already diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ac135bd600eb..9de21803a8ae 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -233,7 +233,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * to pid_ns->child_reaper. Thus pidns->child_reaper needs to * stay valid until they all go away. * - * The code relies on the the pid_ns->child_reaper ignoring + * The code relies on the pid_ns->child_reaper ignoring * SIGCHILD to cause those EXIT_ZOMBIE processes to be * autoreaped if reparented. * diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d25749bce7cf..46b1804c1ddf 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -735,7 +735,7 @@ zone_found: */ /* - * If the zone we wish to scan is the the current zone and the + * If the zone we wish to scan is the current zone and the * pfn falls into the current node then we do not need to walk * the tree. */ diff --git a/kernel/smp.c b/kernel/smp.c index d0ae8eb6bf8b..d9832a171046 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -741,7 +741,7 @@ EXPORT_SYMBOL(on_each_cpu_mask); * for all the required CPUs to finish. This may include the local * processor. * @cond_func: A callback function that is passed a cpu id and - * the the info parameter. The function is called + * the info parameter. The function is called * with preemption disabled. The function should * return a blooean value indicating whether to IPI * the specified CPU. diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 87804e0371fe..e703d5d9cbe8 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -515,7 +515,7 @@ EXPORT_SYMBOL(from_kgid_munged); * * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test - * for and handle handle INVALID_PROJID being returned. INVALID_PROJID + * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) -- cgit v1.2.3 From b7621ebf8a0870f8d822a09dbd040aa5c4909da5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Oct 2020 20:10:31 -0700 Subject: kernel: acct.c: fix some kernel-doc nits Fix kernel-doc notation to use the documented Returns: syntax and place the function description for acct_process() on the first line where it should be. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Cc: Alexander Viro Link: https://lkml.kernel.org/r/b4c33e5d-98e8-0c47-77b6-ac1859f94d7f@infradead.org Signed-off-by: Linus Torvalds --- kernel/acct.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index a1b3cf7b3c4b..f175df8f6aa4 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -263,12 +263,12 @@ static DEFINE_MUTEX(acct_on_mutex); * sys_acct - enable/disable process accounting * @name: file name for accounting records or NULL to shutdown accounting * - * Returns 0 for success or negative errno values for failure. - * * sys_acct() is the only system call needed to implement process * accounting. It takes the name of the file where accounting records * should be written. If the filename is NULL, accounting will be * shutdown. + * + * Returns: 0 for success or negative errno values for failure. */ SYSCALL_DEFINE1(acct, const char __user *, name) { @@ -586,9 +586,7 @@ static void slow_acct_process(struct pid_namespace *ns) } /** - * acct_process - * - * handles process accounting for an exiting task + * acct_process - handles process accounting for an exiting task */ void acct_process(void) { -- cgit v1.2.3 From 3f388f28639fd19d5bf6df7a882c94ccfbf49c2b Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 15 Oct 2020 20:13:22 -0700 Subject: panic: dump registers on panic_on_warn Currently we print stack and registers for ordinary warnings but we do not for panic_on_warn which looks as oversight - panic() will reboot the machine but won't print registers. This moves printing of registers and modules earlier. This does not move the stack dumping as panic() dumps it. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Cc: Douglas Anderson Cc: Ingo Molnar Cc: Kees Cook Cc: Rafael Aquini Cc: Thomas Gleixner Cc: Will Deacon Cc: Nicholas Piggin Link: https://lkml.kernel.org/r/20200804095054.68724-1-aik@ozlabs.ru Signed-off-by: Linus Torvalds --- kernel/panic.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index aef8872ba843..396142ee43fd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -589,6 +589,11 @@ void __warn(const char *file, int line, void *caller, unsigned taint, if (args) vprintk(args->fmt, args->args); + print_modules(); + + if (regs) + show_regs(regs); + if (panic_on_warn) { /* * This thread may hit another WARN() in the panic path. @@ -600,12 +605,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, panic("panic_on_warn set ...\n"); } - print_modules(); - - if (regs) - show_regs(regs); - else - dump_stack(); + dump_stack(); print_irqtrace_events(current); -- cgit v1.2.3 From ac05b7a1b48ba9fc79937a08db4c7131dba8fc5f Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Thu, 15 Oct 2020 20:13:25 -0700 Subject: kernel/relay.c: drop unneeded initialization The variable 'consumed' is initialized with the consumed count but immediately after that the consumed count is updated and assigned to 'consumed' again thus overwriting the previous value. So, drop the unneeded initialization. Signed-off-by: Sudip Mukherjee Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Link: https://lkml.kernel.org/r/20201005205727.1147-1-sudipm.mukherjee@gmail.com Signed-off-by: Linus Torvalds --- kernel/relay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index fb4e0c530c08..b08d936d5fa7 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1002,7 +1002,7 @@ static int relay_file_read_avail(struct rchan_buf *buf) size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t produced = buf->subbufs_produced; - size_t consumed = buf->subbufs_consumed; + size_t consumed; relay_file_read_consume(buf, 0, 0); -- cgit v1.2.3 From 3c532798ec96b6c2d77706f04ed1d8b566a805df Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 3 Oct 2020 10:49:22 -0600 Subject: tracehook: clear TIF_NOTIFY_RESUME in tracehook_notify_resume() All the callers currently do this, clean it up and move the clearing into tracehook_notify_resume() instead. Reviewed-by: Oleg Nesterov Reviewed-by: Thomas Gleixner Signed-off-by: Jens Axboe --- kernel/entry/common.c | 1 - kernel/entry/kvm.c | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 145ab11b8318..971ef788b9ae 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -161,7 +161,6 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, arch_do_signal(regs); if (ti_work & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); } diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index eb1a8a4c867c..b6678a5e3cf6 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -16,10 +16,8 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) if (ti_work & _TIF_NEED_RESCHED) schedule(); - if (ti_work & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); + if (ti_work & _TIF_NOTIFY_RESUME) tracehook_notify_resume(NULL); - } ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); if (ret) -- cgit v1.2.3 From 91989c707884ecc7cd537281ab1a4b8fb7219da3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 Oct 2020 09:02:26 -0600 Subject: task_work: cleanup notification modes A previous commit changed the notification mode from true/false to an int, allowing notify-no, notify-yes, or signal-notify. This was backwards compatible in the sense that any existing true/false user would translate to either 0 (on notification sent) or 1, the latter which mapped to TWA_RESUME. TWA_SIGNAL was assigned a value of 2. Clean this up properly, and define a proper enum for the notification mode. Now we have: - TWA_NONE. This is 0, same as before the original change, meaning no notification requested. - TWA_RESUME. This is 1, same as before the original change, meaning that we use TIF_NOTIFY_RESUME. - TWA_SIGNAL. This uses TIF_SIGPENDING/JOBCTL_TASK_WORK for the notification. Clean up all the callers, switching their 0/1/false/true to using the appropriate TWA_* mode for notifications. Fixes: e91b48162332 ("task_work: teach task_work_add() to do signal_wake_up()") Reviewed-by: Thomas Gleixner Signed-off-by: Jens Axboe --- kernel/events/uprobes.c | 2 +- kernel/irq/manage.c | 2 +- kernel/sched/fair.c | 2 +- kernel/task_work.c | 30 ++++++++++++++++++++---------- 4 files changed, 23 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0e18aaf23a7b..00b0358739ab 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1823,7 +1823,7 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags) t->utask->dup_xol_addr = area->vaddr; init_task_work(&t->utask->dup_xol_work, dup_xol_work); - task_work_add(t, &t->utask->dup_xol_work, true); + task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME); } /* diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5df903fccb60..c460e0496006 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1162,7 +1162,7 @@ static int irq_thread(void *data) handler_fn = irq_thread_fn; init_task_work(&on_exit_work, irq_thread_dtor); - task_work_add(current, &on_exit_work, false); + task_work_add(current, &on_exit_work, TWA_NONE); irq_thread_check_affinity(desc, action); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aa4c6227cd6d..e17012be4d14 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2928,7 +2928,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) curr->node_stamp += period; if (!time_before(jiffies, curr->mm->numa_next_scan)) - task_work_add(curr, work, true); + task_work_add(curr, work, TWA_RESUME); } } diff --git a/kernel/task_work.c b/kernel/task_work.c index 613b2d634af8..8d6e1217c451 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -9,23 +9,28 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback * @work: the callback to run - * @notify: send the notification if true + * @notify: how to notify the targeted task * - * Queue @work for task_work_run() below and notify the @task if @notify. - * Fails if the @task is exiting/exited and thus it can't process this @work. - * Otherwise @work->func() will be called when the @task returns from kernel - * mode or exits. + * Queue @work for task_work_run() below and notify the @task if @notify + * is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that the + * it will interrupt the targeted task and run the task_work. @TWA_RESUME + * work is run only when the task exits the kernel and returns to user mode, + * or before entering guest mode. Fails if the @task is exiting/exited and thus + * it can't process this @work. Otherwise @work->func() will be called when the + * @task goes through one of the aforementioned transitions, or exits. * - * This is like the signal handler which runs in kernel mode, but it doesn't - * try to wake up the @task. + * If the targeted task is exiting, then an error is returned and the work item + * is not queued. It's up to the caller to arrange for an alternative mechanism + * in that case. * - * Note: there is no ordering guarantee on works queued here. + * Note: there is no ordering guarantee on works queued here. The task_work + * list is LIFO. * * RETURNS: * 0 if succeeds or -ESRCH. */ -int -task_work_add(struct task_struct *task, struct callback_head *work, int notify) +int task_work_add(struct task_struct *task, struct callback_head *work, + enum task_work_notify_mode notify) { struct callback_head *head; unsigned long flags; @@ -38,6 +43,8 @@ task_work_add(struct task_struct *task, struct callback_head *work, int notify) } while (cmpxchg(&task->task_works, head, work) != head); switch (notify) { + case TWA_NONE: + break; case TWA_RESUME: set_notify_resume(task); break; @@ -54,6 +61,9 @@ task_work_add(struct task_struct *task, struct callback_head *work, int notify) unlock_task_sighand(task, &flags); } break; + default: + WARN_ON_ONCE(1); + break; } return 0; -- cgit v1.2.3 From 1aa92cd31c1c032ddfed27e79d646bbb429e9b52 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Sat, 17 Oct 2020 16:14:54 -0700 Subject: pid: move pidfd_get_pid() to pid.c process_madvise syscall needs pidfd_get_pid function to translate pidfd to pid so this patch move the function to kernel/pid.c. Suggested-by: Alexander Duyck Signed-off-by: Minchan Kim Signed-off-by: Andrew Morton Reviewed-by: Suren Baghdasaryan Reviewed-by: Alexander Duyck Reviewed-by: Vlastimil Babka Acked-by: Christian Brauner Acked-by: David Rientjes Cc: Jens Axboe Cc: Jann Horn Cc: Brian Geffon Cc: Daniel Colascione Cc: Joel Fernandes Cc: Johannes Weiner Cc: John Dias Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleksandr Natalenko Cc: Sandeep Patil Cc: SeongJae Park Cc: SeongJae Park Cc: Shakeel Butt Cc: Sonny Rao Cc: Tim Murray Cc: Christian Brauner Cc: Florian Weimer Cc: Link: http://lkml.kernel.org/r/20200302193630.68771-5-minchan@kernel.org Link: http://lkml.kernel.org/r/20200622192900.22757-3-minchan@kernel.org Link: https://lkml.kernel.org/r/20200901000633.1920247-3-minchan@kernel.org Signed-off-by: Linus Torvalds --- kernel/exit.c | 19 ------------------- kernel/pid.c | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 1f51c27bae59..87a2d515de0d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1474,25 +1474,6 @@ end: return retval; } -static struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) -{ - struct fd f; - struct pid *pid; - - f = fdget(fd); - if (!f.file) - return ERR_PTR(-EBADF); - - pid = pidfd_pid(f.file); - if (!IS_ERR(pid)) { - get_pid(pid); - *flags = f.file->f_flags; - } - - fdput(f); - return pid; -} - static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { diff --git a/kernel/pid.c b/kernel/pid.c index 74ddbff1a6ba..a96bc4bf4f86 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -520,6 +520,25 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) return idr_get_next(&ns->idr, &nr); } +struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) +{ + struct fd f; + struct pid *pid; + + f = fdget(fd); + if (!f.file) + return ERR_PTR(-EBADF); + + pid = pidfd_pid(f.file); + if (!IS_ERR(pid)) { + get_pid(pid); + *flags = f.file->f_flags; + } + + fdput(f); + return pid; +} + /** * pidfd_create() - Create a new pid file descriptor. * -- cgit v1.2.3 From ecb8ac8b1f146915aa6b96449b66dd48984caacc Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Sat, 17 Oct 2020 16:14:59 -0700 Subject: mm/madvise: introduce process_madvise() syscall: an external memory hinting API There is usecase that System Management Software(SMS) want to give a memory hint like MADV_[COLD|PAGEEOUT] to other processes and in the case of Android, it is the ActivityManagerService. The information required to make the reclaim decision is not known to the app. Instead, it is known to the centralized userspace daemon(ActivityManagerService), and that daemon must be able to initiate reclaim on its own without any app involvement. To solve the issue, this patch introduces a new syscall process_madvise(2). It uses pidfd of an external process to give the hint. It also supports vector address range because Android app has thousands of vmas due to zygote so it's totally waste of CPU and power if we should call the syscall one by one for each vma.(With testing 2000-vma syscall vs 1-vector syscall, it showed 15% performance improvement. I think it would be bigger in real practice because the testing ran very cache friendly environment). Another potential use case for the vector range is to amortize the cost ofTLB shootdowns for multiple ranges when using MADV_DONTNEED; this could benefit users like TCP receive zerocopy and malloc implementations. In future, we could find more usecases for other advises so let's make it happens as API since we introduce a new syscall at this moment. With that, existing madvise(2) user could replace it with process_madvise(2) with their own pid if they want to have batch address ranges support feature. ince it could affect other process's address range, only privileged process(PTRACE_MODE_ATTACH_FSCREDS) or something else(e.g., being the same UID) gives it the right to ptrace the process could use it successfully. The flag argument is reserved for future use if we need to extend the API. I think supporting all hints madvise has/will supported/support to process_madvise is rather risky. Because we are not sure all hints make sense from external process and implementation for the hint may rely on the caller being in the current context so it could be error-prone. Thus, I just limited hints as MADV_[COLD|PAGEOUT] in this patch. If someone want to add other hints, we could hear the usecase and review it for each hint. It's safer for maintenance rather than introducing a buggy syscall but hard to fix it later. So finally, the API is as follows, ssize_t process_madvise(int pidfd, const struct iovec *iovec, unsigned long vlen, int advice, unsigned int flags); DESCRIPTION The process_madvise() system call is used to give advice or directions to the kernel about the address ranges from external process as well as local process. It provides the advice to address ranges of process described by iovec and vlen. The goal of such advice is to improve system or application performance. The pidfd selects the process referred to by the PID file descriptor specified in pidfd. (See pidofd_open(2) for further information) The pointer iovec points to an array of iovec structures, defined in as: struct iovec { void *iov_base; /* starting address */ size_t iov_len; /* number of bytes to be advised */ }; The iovec describes address ranges beginning at address(iov_base) and with size length of bytes(iov_len). The vlen represents the number of elements in iovec. The advice is indicated in the advice argument, which is one of the following at this moment if the target process specified by pidfd is external. MADV_COLD MADV_PAGEOUT Permission to provide a hint to external process is governed by a ptrace access mode PTRACE_MODE_ATTACH_FSCREDS check; see ptrace(2). The process_madvise supports every advice madvise(2) has if target process is in same thread group with calling process so user could use process_madvise(2) to extend existing madvise(2) to support vector address ranges. RETURN VALUE On success, process_madvise() returns the number of bytes advised. This return value may be less than the total number of requested bytes, if an error occurred. The caller should check return value to determine whether a partial advice occurred. FAQ: Q.1 - Why does any external entity have better knowledge? Quote from Sandeep "For Android, every application (including the special SystemServer) are forked from Zygote. The reason of course is to share as many libraries and classes between the two as possible to benefit from the preloading during boot. After applications start, (almost) all of the APIs end up calling into this SystemServer process over IPC (binder) and back to the application. In a fully running system, the SystemServer monitors every single process periodically to calculate their PSS / RSS and also decides which process is "important" to the user for interactivity. So, because of how these processes start _and_ the fact that the SystemServer is looping to monitor each process, it does tend to *know* which address range of the application is not used / useful. Besides, we can never rely on applications to clean things up themselves. We've had the "hey app1, the system is low on memory, please trim your memory usage down" notifications for a long time[1]. They rely on applications honoring the broadcasts and very few do. So, if we want to avoid the inevitable killing of the application and restarting it, some way to be able to tell the OS about unimportant memory in these applications will be useful. - ssp Q.2 - How to guarantee the race(i.e., object validation) between when giving a hint from an external process and get the hint from the target process? process_madvise operates on the target process's address space as it exists at the instant that process_madvise is called. If the space target process can run between the time the process_madvise process inspects the target process address space and the time that process_madvise is actually called, process_madvise may operate on memory regions that the calling process does not expect. It's the responsibility of the process calling process_madvise to close this race condition. For example, the calling process can suspend the target process with ptrace, SIGSTOP, or the freezer cgroup so that it doesn't have an opportunity to change its own address space before process_madvise is called. Another option is to operate on memory regions that the caller knows a priori will be unchanged in the target process. Yet another option is to accept the race for certain process_madvise calls after reasoning that mistargeting will do no harm. The suggested API itself does not provide synchronization. It also apply other APIs like move_pages, process_vm_write. The race isn't really a problem though. Why is it so wrong to require that callers do their own synchronization in some manner? Nobody objects to write(2) merely because it's possible for two processes to open the same file and clobber each other's writes --- instead, we tell people to use flock or something. Think about mmap. It never guarantees newly allocated address space is still valid when the user tries to access it because other threads could unmap the memory right before. That's where we need synchronization by using other API or design from userside. It shouldn't be part of API itself. If someone needs more fine-grained synchronization rather than process level, there were two ideas suggested - cookie[2] and anon-fd[3]. Both are applicable via using last reserved argument of the API but I don't think it's necessary right now since we have already ways to prevent the race so don't want to add additional complexity with more fine-grained optimization model. To make the API extend, it reserved an unsigned long as last argument so we could support it in future if someone really needs it. Q.3 - Why doesn't ptrace work? Injecting an madvise in the target process using ptrace would not work for us because such injected madvise would have to be executed by the target process, which means that process would have to be runnable and that creates the risk of the abovementioned race and hinting a wrong VMA. Furthermore, we want to act the hint in caller's context, not the callee's, because the callee is usually limited in cpuset/cgroups or even freezed state so they can't act by themselves quick enough, which causes more thrashing/kill. It doesn't work if the target process are ptraced(e.g., strace, debugger, minidump) because a process can have at most one ptracer. [1] https://developer.android.com/topic/performance/memory" [2] process_getinfo for getting the cookie which is updated whenever vma of process address layout are changed - Daniel Colascione - https://lore.kernel.org/lkml/20190520035254.57579-1-minchan@kernel.org/T/#m7694416fd179b2066a2c62b5b139b14e3894e224 [3] anonymous fd which is used for the object(i.e., address range) validation - Michal Hocko - https://lore.kernel.org/lkml/20200120112722.GY18451@dhcp22.suse.cz/ [minchan@kernel.org: fix process_madvise build break for arm64] Link: http://lkml.kernel.org/r/20200303145756.GA219683@google.com [minchan@kernel.org: fix build error for mips of process_madvise] Link: http://lkml.kernel.org/r/20200508052517.GA197378@google.com [akpm@linux-foundation.org: fix patch ordering issue] [akpm@linux-foundation.org: fix arm64 whoops] [minchan@kernel.org: make process_madvise() vlen arg have type size_t, per Florian] [akpm@linux-foundation.org: fix i386 build] [sfr@canb.auug.org.au: fix syscall numbering] Link: https://lkml.kernel.org/r/20200905142639.49fc3f1a@canb.auug.org.au [sfr@canb.auug.org.au: madvise.c needs compat.h] Link: https://lkml.kernel.org/r/20200908204547.285646b4@canb.auug.org.au [minchan@kernel.org: fix mips build] Link: https://lkml.kernel.org/r/20200909173655.GC2435453@google.com [yuehaibing@huawei.com: remove duplicate header which is included twice] Link: https://lkml.kernel.org/r/20200915121550.30584-1-yuehaibing@huawei.com [minchan@kernel.org: do not use helper functions for process_madvise] Link: https://lkml.kernel.org/r/20200921175539.GB387368@google.com [akpm@linux-foundation.org: pidfd_get_pid() gained an argument] [sfr@canb.auug.org.au: fix up for "iov_iter: transparently handle compat iovecs in import_iovec"] Link: https://lkml.kernel.org/r/20200928212542.468e1fef@canb.auug.org.au Signed-off-by: Minchan Kim Signed-off-by: YueHaibing Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Reviewed-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Cc: Alexander Duyck Cc: Brian Geffon Cc: Christian Brauner Cc: Daniel Colascione Cc: Jann Horn Cc: Jens Axboe Cc: Joel Fernandes Cc: Johannes Weiner Cc: John Dias Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleksandr Natalenko Cc: Sandeep Patil Cc: SeongJae Park Cc: SeongJae Park Cc: Shakeel Butt Cc: Sonny Rao Cc: Tim Murray Cc: Christian Brauner Cc: Florian Weimer Cc: Link: http://lkml.kernel.org/r/20200302193630.68771-3-minchan@kernel.org Link: http://lkml.kernel.org/r/20200508183320.GA125527@google.com Link: http://lkml.kernel.org/r/20200622192900.22757-4-minchan@kernel.org Link: https://lkml.kernel.org/r/20200901000633.1920247-4-minchan@kernel.org Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c925d1e1777e..f27ac94d5fa7 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -280,6 +280,7 @@ COND_SYSCALL(mlockall); COND_SYSCALL(munlockall); COND_SYSCALL(mincore); COND_SYSCALL(madvise); +COND_SYSCALL(process_madvise); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); COND_SYSCALL_COMPAT(mbind); -- cgit v1.2.3 From 0070ea29623904224c0f5fa279a16a4ac9223295 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 16 Oct 2020 11:17:22 -0700 Subject: cpufreq: schedutil: restore cached freq when next_f is not changed We have the raw cached freq to reduce the chance in calling cpufreq driver where it could be costly in some arch/SoC. Currently, the raw cached freq is reset in sugov_update_single() when it avoids frequency reduction (which is not desirable sometimes), but it is better to restore the previous value of it in that case, because it may not change in the next cycle and it is not necessary to change the CPU frequency then. Adapted from https://android-review.googlesource.com/1352810/ Signed-off-by: Wei Wang Acked-by: Viresh Kumar [ rjw: Subject edit and changelog rewrite ] Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 5ae7b4e6e8d6..e254745a82cb 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -441,6 +441,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned long util, max; unsigned int next_f; bool busy; + unsigned int cached_freq = sg_policy->cached_raw_freq; sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -464,8 +465,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (busy && next_f < sg_policy->next_freq) { next_f = sg_policy->next_freq; - /* Reset cached freq as next_freq has changed */ - sg_policy->cached_raw_freq = 0; + /* Restore cached freq as next_freq has changed */ + sg_policy->cached_raw_freq = cached_freq; } /* -- cgit v1.2.3 From 76702a2e7280594a0add4c1283623c81a868373f Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 19 Oct 2020 10:38:46 -0700 Subject: bpf: Remove unneeded break A break is not needed if it is preceded by a return. Signed-off-by: Tom Rix Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201019173846.1021-1-trix@redhat.com --- kernel/bpf/syscall.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1110ecd7d1f3..8f50c9c19f1b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2913,7 +2913,6 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: return BPF_PROG_TYPE_CGROUP_SKB; - break; case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: -- cgit v1.2.3 From 93c230e3f5bd6e1d2b2759d582fdfe9c2731473b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 19 Oct 2020 12:42:12 -0700 Subject: bpf: Enforce id generation for all may-be-null register type The commit af7ec1383361 ("bpf: Add bpf_skc_to_tcp6_sock() helper") introduces RET_PTR_TO_BTF_ID_OR_NULL and the commit eaa6bcb71ef6 ("bpf: Introduce bpf_per_cpu_ptr()") introduces RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL. Note that for RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, the reg0->type could become PTR_TO_MEM_OR_NULL which is not covered by BPF_PROBE_MEM. The BPF_REG_0 will then hold a _OR_NULL pointer type. This _OR_NULL pointer type requires the bpf program to explicitly do a NULL check first. After NULL check, the verifier will mark all registers having the same reg->id as safe to use. However, the reg->id is not set for those new _OR_NULL return types. One of the ways that may be wrong is, checking NULL for one btf_id typed pointer will end up validating all other btf_id typed pointers because all of them have id == 0. The later tests will exercise this path. To fix it and also avoid similar issue in the future, this patch moves the id generation logic out of each individual RET type test in check_helper_call(). Instead, it does one reg_type_may_be_null() test and then do the id generation if needed. This patch also adds a WARN_ON_ONCE in mark_ptr_or_null_reg() to catch future breakage. The _OR_NULL pointer usage in the bpf_iter_reg.ctx_arg_info is fine because it just happens that the existing id generation after check_ctx_access() has covered it. It is also using the reg_type_may_be_null() to decide if id generation is needed or not. Fixes: af7ec1383361 ("bpf: Add bpf_skc_to_tcp6_sock() helper") Fixes: eaa6bcb71ef6 ("bpf: Introduce bpf_per_cpu_ptr()") Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201019194212.1050855-1-kafai@fb.com --- kernel/bpf/verifier.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 39d7f44e7c92..6200519582a6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5133,24 +5133,19 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].id = ++env->id_gen; } else { regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; } } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; regs[BPF_REG_0].mem_size = meta.mem_size; } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL || fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) { @@ -5199,6 +5194,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } + if (reg_type_may_be_null(regs[BPF_REG_0].type)) + regs[BPF_REG_0].id = ++env->id_gen; + if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; @@ -7212,7 +7210,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, struct bpf_reg_state *reg, u32 id, bool is_null) { - if (reg_type_may_be_null(reg->type) && reg->id == id) { + if (reg_type_may_be_null(reg->type) && reg->id == id && + !WARN_ON_ONCE(!reg->id)) { /* Old offset (both fixed and variable parts) should * have been known-zero, because we don't allow pointer * arithmetic on pointers that might be NULL. -- cgit v1.2.3 From 695cebe58dcf3d9802cdfa9c327b5c7641a5914b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Oct 2020 10:41:07 +0200 Subject: dma-mapping: move more functions to dma-map-ops.h Due to a mismerge a bunch of prototypes that should have moved to dma-map-ops.h are still in dma-mapping.h, fix that up. Signed-off-by: Christoph Hellwig --- kernel/dma/remap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 78b23f089cf1..905c3fa005f1 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -2,7 +2,7 @@ /* * Copyright (c) 2014 The Linux Foundation */ -#include +#include #include #include -- cgit v1.2.3 From c2f7d08cccf4af2ce6992feaabb9e68e4ae0bff3 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 15 Oct 2020 09:00:19 -0700 Subject: futex: Adjust absolute futex timeouts with per time namespace offset For all commands except FUTEX_WAIT, the timeout is interpreted as an absolute value. This absolute value is inside the task's time namespace and has to be converted to the host's time. Fixes: 5a590f35add9 ("posix-clocks: Wire up clock_gettime() with timens offsets") Reported-by: Hans van der Laan Signed-off-by: Andrei Vagin Signed-off-by: Thomas Gleixner Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Link: https://lore.kernel.org/r/20201015160020.293748-1-avagin@gmail.com --- kernel/futex.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 680854dcf156..be68ac0d49ad 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -3797,6 +3798,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, t = timespec64_to_ktime(ts); if (cmd == FUTEX_WAIT) t = ktime_add_safe(ktime_get(), t); + else if (!(op & FUTEX_CLOCK_REALTIME)) + t = timens_ktime_to_host(CLOCK_MONOTONIC, t); tp = &t; } /* @@ -3989,6 +3992,8 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, t = timespec64_to_ktime(ts); if (cmd == FUTEX_WAIT) t = ktime_add_safe(ktime_get(), t); + else if (!(op & FUTEX_CLOCK_REALTIME)) + t = timens_ktime_to_host(CLOCK_MONOTONIC, t); tp = &t; } if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || -- cgit v1.2.3 From 0f6372e522237f39aff63f2e158d629038f26238 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 12 Oct 2020 17:31:45 -0700 Subject: treewide: remove DISABLE_LTO This change removes all instances of DISABLE_LTO from Makefiles, as they are currently unused, and the preferred method of disabling LTO is to filter out the flags instead. Note added by Masahiro Yamada: DISABLE_LTO was added as preparation for GCC LTO, but GCC LTO was not pulled into the mainline. (https://lkml.org/lkml/2014/4/8/272) Suggested-by: Kees Cook Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Signed-off-by: Masahiro Yamada --- kernel/Makefile | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 9a20016d4900..347254f07dab 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -38,9 +38,6 @@ KASAN_SANITIZE_kcov.o := n KCSAN_SANITIZE_kcov.o := n CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector -# cond_syscall is currently not LTO compatible -CFLAGS_sys_ni.o = $(DISABLE_LTO) - obj-y += sched/ obj-y += locking/ obj-y += power/ -- cgit v1.2.3 From 0a1754b2a97efa644aa6e84d1db5b17c42251483 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Mon, 19 Oct 2020 22:22:42 +0800 Subject: ring-buffer: Return 0 on success from ring_buffer_resize() We don't need to check the new buffer size, and the return value had confused resize_buffer_duplicate_size(). ... ret = ring_buffer_resize(trace_buf->buffer, per_cpu_ptr(size_buf->data,cpu_id)->entries, cpu_id); if (ret == 0) per_cpu_ptr(trace_buf->data, cpu_id)->entries = per_cpu_ptr(size_buf->data, cpu_id)->entries; ... Link: https://lkml.kernel.org/r/20201019142242.11560-1-hqjagain@gmail.com Cc: stable@vger.kernel.org Fixes: d60da506cbeb3 ("tracing: Add a resize function to make one buffer equivalent to another buffer") Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 15bf28b13e50..5c6a9c6a058f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1952,18 +1952,18 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, { struct ring_buffer_per_cpu *cpu_buffer; unsigned long nr_pages; - int cpu, err = 0; + int cpu, err; /* * Always succeed at resizing a non-existent buffer: */ if (!buffer) - return size; + return 0; /* Make sure the requested buffer exists */ if (cpu_id != RING_BUFFER_ALL_CPUS && !cpumask_test_cpu(cpu_id, buffer->cpumask)) - return size; + return 0; nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); @@ -2119,7 +2119,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } mutex_unlock(&buffer->mutex); - return size; + return 0; out_err: for_each_buffer_cpu(buffer, cpu) { -- cgit v1.2.3 From e1981f75d398c0afe83c8ffa4e5864f037967409 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Sat, 17 Oct 2020 17:52:46 +0800 Subject: ring-buffer: Update the description for ring_buffer_wait The function changed at some point, but the description was not updated. Link: https://lkml.kernel.org/r/20201017095246.5170-1-hqjagain@gmail.com Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5c6a9c6a058f..7f45fd9d5a45 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -793,7 +793,7 @@ static void rb_wake_up_waiters(struct irq_work *work) * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on - * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS + * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise -- cgit v1.2.3 From c51f8f88d705e06bd696d7510aff22b33eb8e638 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Sun, 9 Aug 2020 06:57:44 +0000 Subject: random32: make prandom_u32() output unpredictable Non-cryptographic PRNGs may have great statistical properties, but are usually trivially predictable to someone who knows the algorithm, given a small sample of their output. An LFSR like prandom_u32() is particularly simple, even if the sample is widely scattered bits. It turns out the network stack uses prandom_u32() for some things like random port numbers which it would prefer are *not* trivially predictable. Predictability led to a practical DNS spoofing attack. Oops. This patch replaces the LFSR with a homebrew cryptographic PRNG based on the SipHash round function, which is in turn seeded with 128 bits of strong random key. (The authors of SipHash have *not* been consulted about this abuse of their algorithm.) Speed is prioritized over security; attacks are rare, while performance is always wanted. Replacing all callers of prandom_u32() is the quick fix. Whether to reinstate a weaker PRNG for uses which can tolerate it is an open question. Commit f227e3ec3b5c ("random32: update the net random state on interrupt and activity") was an earlier attempt at a solution. This patch replaces it. Reported-by: Amit Klein Cc: Willy Tarreau Cc: Eric Dumazet Cc: "Jason A. Donenfeld" Cc: Andy Lutomirski Cc: Kees Cook Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Linus Torvalds Cc: tytso@mit.edu Cc: Florian Westphal Cc: Marc Plumb Fixes: f227e3ec3b5c ("random32: update the net random state on interrupt and activity") Signed-off-by: George Spelvin Link: https://lore.kernel.org/netdev/20200808152628.GA27941@SDF.ORG/ [ willy: partial reversal of f227e3ec3b5c; moved SIPROUND definitions to prandom.h for later use; merged George's prandom_seed() proposal; inlined siprand_u32(); replaced the net_rand_state[] array with 4 members to fix a build issue; cosmetic cleanups to make checkpatch happy; fixed RANDOM32_SELFTEST build ] Signed-off-by: Willy Tarreau --- kernel/time/timer.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index dda05f4b7a1f..3e341af741b9 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1717,13 +1717,6 @@ void update_process_times(int user_tick) scheduler_tick(); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) run_posix_cpu_timers(); - - /* The current CPU might make use of net randoms without receiving IRQs - * to renew them often enough. Let's update the net_rand_state from a - * non-constant value that's not affine to the number of calls to make - * sure it's updated when there's some activity (we don't care in idle). - */ - this_cpu_add(net_rand_state.s1, rol32(jiffies, 24) + user_tick); } /** -- cgit v1.2.3 From 3744741adab6d9195551ce30e65e726c7a408421 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 10 Aug 2020 10:27:42 +0200 Subject: random32: add noise from network and scheduling activity With the removal of the interrupt perturbations in previous random32 change (random32: make prandom_u32() output unpredictable), the PRNG has become 100% deterministic again. While SipHash is expected to be way more robust against brute force than the previous Tausworthe LFSR, there's still the risk that whoever has even one temporary access to the PRNG's internal state is able to predict all subsequent draws till the next reseed (roughly every minute). This may happen through a side channel attack or any data leak. This patch restores the spirit of commit f227e3ec3b5c ("random32: update the net random state on interrupt and activity") in that it will perturb the internal PRNG's statee using externally collected noise, except that it will not pick that noise from the random pool's bits nor upon interrupt, but will rather combine a few elements along the Tx path that are collectively hard to predict, such as dev, skb and txq pointers, packet length and jiffies values. These ones are combined using a single round of SipHash into a single long variable that is mixed with the net_rand_state upon each invocation. The operation was inlined because it produces very small and efficient code, typically 3 xor, 2 add and 2 rol. The performance was measured to be the same (even very slightly better) than before the switch to SipHash; on a 6-core 12-thread Core i7-8700k equipped with a 40G NIC (i40e), the connection rate dropped from 556k/s to 555k/s while the SYN cookie rate grew from 5.38 Mpps to 5.45 Mpps. Link: https://lore.kernel.org/netdev/20200808152628.GA27941@SDF.ORG/ Cc: George Spelvin Cc: Amit Klein Cc: Eric Dumazet Cc: "Jason A. Donenfeld" Cc: Andy Lutomirski Cc: Kees Cook Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Linus Torvalds Cc: tytso@mit.edu Cc: Florian Westphal Cc: Marc Plumb Tested-by: Sedat Dilek Signed-off-by: Willy Tarreau --- kernel/time/timer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 3e341af741b9..de37e33a868d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1706,6 +1706,8 @@ void update_process_times(int user_tick) { struct task_struct *p = current; + PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0); + /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); run_local_timers(); -- cgit v1.2.3 From 986b9eacb25910865b50e5f298aa8e2df7642f1b Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Sat, 24 Oct 2020 03:04:26 +0200 Subject: kernel/sys.c: fix prototype of prctl_get_tid_address() tid_addr is not a "pointer to (pointer to int in userspace)"; it is in fact a "pointer to (pointer to int in userspace) in userspace". So sparse rightfully complains about passing a kernel pointer to put_user(). Reported-by: kernel test robot Signed-off-by: Rasmus Villemoes Signed-off-by: Linus Torvalds --- kernel/sys.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 84594bcd886e..a730c03ee607 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2238,12 +2238,12 @@ out: } #ifdef CONFIG_CHECKPOINT_RESTORE -static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr) { return put_user(me->clear_child_tid, tid_addr); } #else -static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr) { return -EINVAL; } @@ -2427,7 +2427,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = prctl_set_mm(arg2, arg3, arg4, arg5); break; case PR_GET_TID_ADDRESS: - error = prctl_get_tid_address(me, (int __user **)arg2); + error = prctl_get_tid_address(me, (int __user * __user *)arg2); break; case PR_SET_CHILD_SUBREAPER: me->signal->is_child_subreaper = !!arg2; -- cgit v1.2.3 From 33def8498fdde180023444b08e12b72a9efed41d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 21 Oct 2020 19:36:07 -0700 Subject: treewide: Convert macro and uses of __section(foo) to __section("foo") Use a more generic form for __section that requires quotes to avoid complications with clang and gcc differences. Remove the quote operator # from compiler_attributes.h __section macro. Convert all unquoted __section(foo) uses to quoted __section("foo"). Also convert __attribute__((section("foo"))) uses to __section("foo") even if the __attribute__ has multiple list entry forms. Conversion done using the script at: https://lore.kernel.org/lkml/75393e5ddc272dc7403de74d645e6c6e0f4e70eb.camel@perches.com/2-convert_section.pl Signed-off-by: Joe Perches Reviewed-by: Nick Desaulniers Reviewed-by: Miguel Ojeda Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 4 ++-- kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/idle.c | 2 +- kernel/sched/rt.c | 2 +- kernel/sched/stop_task.c | 2 +- kernel/trace/trace.h | 2 +- kernel/trace/trace_export.c | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 4fb15fa96734..fe9de067771c 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -40,10 +40,10 @@ extern const u8 kallsyms_names[] __weak; * has one (eg: FRV). */ extern const unsigned int kallsyms_num_syms -__attribute__((weak, section(".rodata"))); +__section(".rodata") __attribute__((weak)); extern const unsigned long kallsyms_relative_base -__attribute__((weak, section(".rodata"))); +__section(".rodata") __attribute__((weak)); extern const char kallsyms_token_table[] __weak; extern const u16 kallsyms_token_index[] __weak; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6d93f4518734..f232305dcefe 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2504,7 +2504,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, } const struct sched_class dl_sched_class - __attribute__((section("__dl_sched_class"))) = { + __section("__dl_sched_class") = { .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e17012be4d14..290f9e38378c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11159,7 +11159,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * All the scheduling class methods: */ const struct sched_class fair_sched_class - __attribute__((section("__fair_sched_class"))) = { + __section("__fair_sched_class") = { .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f324dc36fc43..24d0ee26377d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -458,7 +458,7 @@ static void update_curr_idle(struct rq *rq) * Simple, special scheduling class for the per-CPU idle tasks: */ const struct sched_class idle_sched_class - __attribute__((section("__idle_sched_class"))) = { + __section("__idle_sched_class") = { /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f215eea6a966..49ec096a8aa1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2430,7 +2430,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) } const struct sched_class rt_sched_class - __attribute__((section("__rt_sched_class"))) = { + __section("__rt_sched_class") = { .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 394bc8126a1e..ceb5b6b12561 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -110,7 +110,7 @@ static void update_curr_stop(struct rq *rq) * Simple, special scheduling class for the per-CPU stop tasks: */ const struct sched_class stop_sched_class - __attribute__((section("__stop_sched_class"))) = { + __section("__stop_sched_class") = { .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 34e0c4d5a6e7..f3f5e77123ad 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -99,7 +99,7 @@ enum trace_type { /* Use this for memory failure errors */ #define MEM_FAIL(condition, fmt, ...) ({ \ - static bool __section(.data.once) __warned; \ + static bool __section(".data.once") __warned; \ int __ret_warn_once = !!(condition); \ \ if (unlikely(__ret_warn_once && !__warned)) { \ diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 70d3d0a09053..90f81d33fa3f 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -176,7 +176,7 @@ struct trace_event_call __used event_##call = { \ .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ }; \ static struct trace_event_call __used \ -__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; +__section("_ftrace_events") *__event_##call = &event_##call; #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ -- cgit v1.2.3