From b2b018ef48675a9a524fa9791ea7d67fdac405f7 Mon Sep 17 00:00:00 2001 From: Chris J Arges Date: Tue, 1 Dec 2015 20:40:54 -0600 Subject: livepatch: add old_sympos as disambiguator field to klp_func Currently, patching objects with duplicate symbol names fail because the creation of the sysfs function directory collides with the previous attempt. Appending old_addr to the function name is problematic as it reveals the address of the function being patch to a normal user. Using the symbol's occurrence in kallsyms to postfix the function name in the sysfs directory solves the issue of having consistent unique names and ensuring that the address is not exposed to a normal user. In addition, using the symbol position as the user's method to disambiguate symbols instead of addr allows for disambiguating symbols in modules as well for both function addresses and for relocs. This also simplifies much of the code. Special handling for kASLR is no longer needed and can be removed. The klp_find_verify_func_addr function can be replaced by klp_find_object_symbol, and klp_verify_vmlinux_symbol and its callback can be removed completely. In cases of duplicate symbols, old_sympos will be used to disambiguate instead of old_addr. By default old_sympos will be 0, and patching will only succeed if the symbol is unique. Specifying a positive value will ensure that occurrence of the symbol in kallsyms for the patched object will be used for patching if it is valid. In addition, make old_addr an internal structure field not to be specified by the user. Finally, remove klp_find_verify_func_addr as it can be replaced by klp_find_object_symbol directly. Support for symbol position disambiguation for relocations is added in the next patch in this series. Signed-off-by: Chris J Arges Reviewed-by: Petr Mladek Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 72 +++++++++++++++++++++---------------------------- 1 file changed, 31 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index db545cbcdb89..e416f96e938d 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -135,13 +135,8 @@ struct klp_find_arg { const char *objname; const char *name; unsigned long addr; - /* - * If count == 0, the symbol was not found. If count == 1, a unique - * match was found and addr is set. If count > 1, there is - * unresolvable ambiguity among "count" number of symbols with the same - * name in the same object. - */ unsigned long count; + unsigned long pos; }; static int klp_find_callback(void *data, const char *name, @@ -158,37 +153,48 @@ static int klp_find_callback(void *data, const char *name, if (args->objname && strcmp(args->objname, mod->name)) return 0; - /* - * args->addr might be overwritten if another match is found - * but klp_find_object_symbol() handles this and only returns the - * addr if count == 1. - */ args->addr = addr; args->count++; + /* + * Finish the search when the symbol is found for the desired position + * or the position is not defined for a non-unique symbol. + */ + if ((args->pos && (args->count == args->pos)) || + (!args->pos && (args->count > 1))) + return 1; + return 0; } static int klp_find_object_symbol(const char *objname, const char *name, - unsigned long *addr) + unsigned long sympos, unsigned long *addr) { struct klp_find_arg args = { .objname = objname, .name = name, .addr = 0, - .count = 0 + .count = 0, + .pos = sympos, }; mutex_lock(&module_mutex); kallsyms_on_each_symbol(klp_find_callback, &args); mutex_unlock(&module_mutex); - if (args.count == 0) + /* + * Ensure an address was found. If sympos is 0, ensure symbol is unique; + * otherwise ensure the symbol position count matches sympos. + */ + if (args.addr == 0) pr_err("symbol '%s' not found in symbol table\n", name); - else if (args.count > 1) + else if (args.count > 1 && sympos == 0) { pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n", args.count, name, objname); - else { + } else if (sympos != args.count && sympos > 0) { + pr_err("symbol position %lu for symbol '%s' in object '%s' not found\n", + sympos, name, objname ? objname : "vmlinux"); + } else { *addr = args.addr; return 0; } @@ -236,27 +242,6 @@ static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr) return 0; } -static int klp_find_verify_func_addr(struct klp_object *obj, - struct klp_func *func) -{ - int ret; - -#if defined(CONFIG_RANDOMIZE_BASE) - /* If KASLR has been enabled, adjust old_addr accordingly */ - if (kaslr_enabled() && func->old_addr) - func->old_addr += kaslr_offset(); -#endif - - if (!func->old_addr || klp_is_module(obj)) - ret = klp_find_object_symbol(obj->name, func->old_name, - &func->old_addr); - else - ret = klp_verify_vmlinux_symbol(func->old_name, - func->old_addr); - - return ret; -} - /* * external symbols are located outside the parent object (where the parent * object is either vmlinux or the kmod being patched). @@ -276,8 +261,11 @@ static int klp_find_external_symbol(struct module *pmod, const char *name, } preempt_enable(); - /* otherwise check if it's in another .o within the patch module */ - return klp_find_object_symbol(pmod->name, name, addr); + /* + * Check if it's in another .o within the patch module. This also + * checks that the external symbol is unique. + */ + return klp_find_object_symbol(pmod->name, name, 0, addr); } static int klp_write_object_relocations(struct module *pmod, @@ -313,7 +301,7 @@ static int klp_write_object_relocations(struct module *pmod, else ret = klp_find_object_symbol(obj->mod->name, reloc->name, - &reloc->val); + 0, &reloc->val); if (ret) return ret; } @@ -756,7 +744,9 @@ static int klp_init_object_loaded(struct klp_patch *patch, } klp_for_each_func(obj, func) { - ret = klp_find_verify_func_addr(obj, func); + ret = klp_find_object_symbol(obj->name, func->old_name, + func->old_sympos, + &func->old_addr); if (ret) return ret; } -- cgit v1.2.3 From 064c89df6247cd829a7880cc8a87b7ed2cdfccd8 Mon Sep 17 00:00:00 2001 From: Chris J Arges Date: Tue, 1 Dec 2015 20:40:55 -0600 Subject: livepatch: add sympos as disambiguator field to klp_reloc In cases of duplicate symbols, sympos will be used to disambiguate instead of val. By default sympos will be 0, and patching will only succeed if the symbol is unique. Specifying a positive value will ensure that occurrence of the symbol in kallsyms for the patched object will be used for patching if it is valid. For external relocations sympos is not supported. Remove klp_verify_callback, klp_verify_args and klp_verify_vmlinux_symbol as they are no longer used. From the klp_reloc structure remove val, as it can be refactored as a local variable in klp_write_object_relocations. Signed-off-by: Chris J Arges Reviewed-by: Petr Mladek Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 84 +++++++++++-------------------------------------- 1 file changed, 19 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index e416f96e938d..e842534d3493 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -203,45 +203,6 @@ static int klp_find_object_symbol(const char *objname, const char *name, return -EINVAL; } -struct klp_verify_args { - const char *name; - const unsigned long addr; -}; - -static int klp_verify_callback(void *data, const char *name, - struct module *mod, unsigned long addr) -{ - struct klp_verify_args *args = data; - - if (!mod && - !strcmp(args->name, name) && - args->addr == addr) - return 1; - - return 0; -} - -static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr) -{ - struct klp_verify_args args = { - .name = name, - .addr = addr, - }; - int ret; - - mutex_lock(&module_mutex); - ret = kallsyms_on_each_symbol(klp_verify_callback, &args); - mutex_unlock(&module_mutex); - - if (!ret) { - pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n", - name, addr); - return -EINVAL; - } - - return 0; -} - /* * external symbols are located outside the parent object (where the parent * object is either vmlinux or the kmod being patched). @@ -272,6 +233,7 @@ static int klp_write_object_relocations(struct module *pmod, struct klp_object *obj) { int ret; + unsigned long val; struct klp_reloc *reloc; if (WARN_ON(!klp_is_object_loaded(obj))) @@ -281,35 +243,27 @@ static int klp_write_object_relocations(struct module *pmod, return -EINVAL; for (reloc = obj->relocs; reloc->name; reloc++) { - if (!klp_is_module(obj)) { - -#if defined(CONFIG_RANDOMIZE_BASE) - /* If KASLR has been enabled, adjust old value accordingly */ - if (kaslr_enabled()) - reloc->val += kaslr_offset(); -#endif - ret = klp_verify_vmlinux_symbol(reloc->name, - reloc->val); - if (ret) - return ret; - } else { - /* module, reloc->val needs to be discovered */ - if (reloc->external) - ret = klp_find_external_symbol(pmod, - reloc->name, - &reloc->val); - else - ret = klp_find_object_symbol(obj->mod->name, - reloc->name, - 0, &reloc->val); - if (ret) - return ret; - } + /* discover the address of the referenced symbol */ + if (reloc->external) { + if (reloc->sympos > 0) { + pr_err("non-zero sympos for external reloc symbol '%s' is not supported\n", + reloc->name); + return -EINVAL; + } + ret = klp_find_external_symbol(pmod, reloc->name, &val); + } else + ret = klp_find_object_symbol(obj->name, + reloc->name, + reloc->sympos, + &val); + if (ret) + return ret; + ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc, - reloc->val + reloc->addend); + val + reloc->addend); if (ret) { pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n", - reloc->name, reloc->val, ret); + reloc->name, val, ret); return ret; } } -- cgit v1.2.3 From 444f9e99a840c4050c0530cfef81801a21a59f4c Mon Sep 17 00:00:00 2001 From: Chris J Arges Date: Tue, 1 Dec 2015 20:40:56 -0600 Subject: livepatch: function,sympos scheme in livepatch sysfs directory The following directory structure will allow for cases when the same function name exists in a single object. /sys/kernel/livepatch/// The sympos number corresponds to the nth occurrence of the symbol name in kallsyms for the patched object. An example of patching multiple symbols can be found here: https://github.com/dynup/kpatch/issues/493 Signed-off-by: Chris J Arges Reviewed-by: Petr Mladek Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index e842534d3493..94893e844e44 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -535,7 +535,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * /sys/kernel/livepatch/ * /sys/kernel/livepatch//enabled * /sys/kernel/livepatch// - * /sys/kernel/livepatch/// + * /sys/kernel/livepatch/// */ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -680,8 +680,14 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) INIT_LIST_HEAD(&func->stack_node); func->state = KLP_DISABLED; + /* The format for the sysfs directory is where sympos + * is the nth occurrence of this symbol in kallsyms for the patched + * object. If the user selects 0 for old_sympos, then 1 will be used + * since a unique symbol will be the first occurrence. + */ return kobject_init_and_add(&func->kobj, &klp_ktype_func, - &obj->kobj, "%s", func->old_name); + &obj->kobj, "%s,%lu", func->old_name, + func->old_sympos ? func->old_sympos : 1); } /* parts of the initialization that is done only when the object is loaded */ -- cgit v1.2.3 From 20ef10c1b3068f105004e247d8e7dd8120fa4b9a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 26 Nov 2015 09:42:08 +1030 Subject: module: Use the same logic for setting and unsetting RO/NX When setting a module's RO and NX permissions, set_section_ro_nx() is used, but when clearing them, unset_module_{init,core}_ro_nx() are used. The unset functions don't have the same checks the set function has for partial page protections. It's probably harmless, but it's still confusingly asymmetrical. Instead, use the same logic to do both. Also add some new set_module_{init,core}_ro_nx() helper functions for more symmetry with the unset functions. Signed-off-by: Josh Poimboeuf Signed-off-by: Rusty Russell Signed-off-by: Jiri Kosina --- kernel/module.c | 57 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 8f051a106676..14b224967e7b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1886,7 +1886,9 @@ void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, static void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, - unsigned long total_size) + unsigned long total_size, + int (*set_ro)(unsigned long start, int num_pages), + int (*set_nx)(unsigned long start, int num_pages)) { /* begin and end PFNs of the current subsection */ unsigned long begin_pfn; @@ -1898,7 +1900,7 @@ static void set_section_ro_nx(void *base, * - Do not protect last partial page. */ if (ro_size > 0) - set_page_attributes(base, base + ro_size, set_memory_ro); + set_page_attributes(base, base + ro_size, set_ro); /* * Set NX permissions for module data: @@ -1909,28 +1911,36 @@ static void set_section_ro_nx(void *base, begin_pfn = PFN_UP((unsigned long)base + text_size); end_pfn = PFN_UP((unsigned long)base + total_size); if (end_pfn > begin_pfn) - set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); + set_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); } } +static void set_module_core_ro_nx(struct module *mod) +{ + set_section_ro_nx(mod->module_core, mod->core_text_size, + mod->core_ro_size, mod->core_size, + set_memory_ro, set_memory_nx); +} + static void unset_module_core_ro_nx(struct module *mod) { - set_page_attributes(mod->module_core + mod->core_text_size, - mod->module_core + mod->core_size, - set_memory_x); - set_page_attributes(mod->module_core, - mod->module_core + mod->core_ro_size, - set_memory_rw); + set_section_ro_nx(mod->module_core, mod->core_text_size, + mod->core_ro_size, mod->core_size, + set_memory_rw, set_memory_x); +} + +static void set_module_init_ro_nx(struct module *mod) +{ + set_section_ro_nx(mod->module_init, mod->init_text_size, + mod->init_ro_size, mod->init_size, + set_memory_ro, set_memory_nx); } static void unset_module_init_ro_nx(struct module *mod) { - set_page_attributes(mod->module_init + mod->init_text_size, - mod->module_init + mod->init_size, - set_memory_x); - set_page_attributes(mod->module_init, - mod->module_init + mod->init_ro_size, - set_memory_rw); + set_section_ro_nx(mod->module_init, mod->init_text_size, + mod->init_ro_size, mod->init_size, + set_memory_rw, set_memory_x); } /* Iterate through all modules and set each module's text as RW */ @@ -1979,7 +1989,8 @@ void set_all_modules_text_ro(void) mutex_unlock(&module_mutex); } #else -static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } +static void set_module_core_ro_nx(struct module *mod) { } +static void set_module_init_ro_nx(struct module *mod) { } static void unset_module_core_ro_nx(struct module *mod) { } static void unset_module_init_ro_nx(struct module *mod) { } #endif @@ -3373,17 +3384,9 @@ static int complete_formation(struct module *mod, struct load_info *info) /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); - /* Set RO and NX regions for core */ - set_section_ro_nx(mod->module_core, - mod->core_text_size, - mod->core_ro_size, - mod->core_size); - - /* Set RO and NX regions for init */ - set_section_ro_nx(mod->module_init, - mod->init_text_size, - mod->init_ro_size, - mod->init_size); + /* Set RO and NX regions */ + set_module_init_ro_nx(mod); + set_module_core_ro_nx(mod); /* Mark state as coming so strong_try_module_get() ignores us, * but kallsyms etc. can see us. */ -- cgit v1.2.3 From c65abf358f211c3f88c8ed714dff25775ab49fc1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 26 Nov 2015 09:43:08 +1030 Subject: gcov: use within_module() helper. An exact mapping would be within_module_core(), but at this stage (MODULE_STATE_GOING) the init section is empty, and this is clearer. Reviewed-by: Peter Oberparleiter Signed-off-by: Rusty Russell Signed-off-by: Jiri Kosina --- kernel/gcov/base.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 7080ae1eb6c1..2f9df37940a0 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -123,11 +123,6 @@ void gcov_enable_events(void) } #ifdef CONFIG_MODULES -static inline int within(void *addr, void *start, unsigned long size) -{ - return ((addr >= start) && (addr < start + size)); -} - /* Update list and generate events when modules are unloaded. */ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, void *data) @@ -142,7 +137,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, /* Remove entries located in module from linked list. */ while ((info = gcov_info_next(info))) { - if (within(info, mod->module_core, mod->core_size)) { + if (within_module((unsigned long)info, mod)) { gcov_info_unlink(prev, info); if (gcov_events_enabled) gcov_event(GCOV_REMOVE, info); -- cgit v1.2.3 From 7523e4dc5057e157212b4741abd6256e03404cf1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 26 Nov 2015 09:44:08 +1030 Subject: module: use a structure to encapsulate layout. Makes it easier to handle init vs core cleanly, though the change is fairly invasive across random architectures. It simplifies the rbtree code immediately, however, while keeping the core data together in the same cachline (now iff the rbtree code is enabled). Acked-by: Peter Zijlstra Reviewed-by: Josh Poimboeuf Signed-off-by: Rusty Russell Signed-off-by: Jiri Kosina --- kernel/debug/kdb/kdb_main.c | 4 +- kernel/module.c | 199 ++++++++++++++++++++------------------------ 2 files changed, 94 insertions(+), 109 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 4121345498e0..2a20c0dfdafc 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2021,7 +2021,7 @@ static int kdb_lsmod(int argc, const char **argv) continue; kdb_printf("%-20s%8u 0x%p ", mod->name, - mod->core_size, (void *)mod); + mod->core_layout.size, (void *)mod); #ifdef CONFIG_MODULE_UNLOAD kdb_printf("%4d ", module_refcount(mod)); #endif @@ -2031,7 +2031,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf(" (Loading)"); else kdb_printf(" (Live)"); - kdb_printf(" 0x%p", mod->module_core); + kdb_printf(" 0x%p", mod->core_layout.base); #ifdef CONFIG_MODULE_UNLOAD { diff --git a/kernel/module.c b/kernel/module.c index 14b224967e7b..a0a3d6d9d5e8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -108,13 +108,6 @@ static LIST_HEAD(modules); * Use a latched RB-tree for __module_address(); this allows us to use * RCU-sched lookups of the address from any context. * - * Because modules have two address ranges: init and core, we need two - * latch_tree_nodes entries. Therefore we need the back-pointer from - * mod_tree_node. - * - * Because init ranges are short lived we mark them unlikely and have placed - * them outside the critical cacheline in struct module. - * * This is conditional on PERF_EVENTS || TRACING because those can really hit * __module_address() hard by doing a lot of stack unwinding; potentially from * NMI context. @@ -122,24 +115,16 @@ static LIST_HEAD(modules); static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) { - struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node); - struct module *mod = mtn->mod; + struct module_layout *layout = container_of(n, struct module_layout, mtn.node); - if (unlikely(mtn == &mod->mtn_init)) - return (unsigned long)mod->module_init; - - return (unsigned long)mod->module_core; + return (unsigned long)layout->base; } static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n) { - struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node); - struct module *mod = mtn->mod; - - if (unlikely(mtn == &mod->mtn_init)) - return (unsigned long)mod->init_size; + struct module_layout *layout = container_of(n, struct module_layout, mtn.node); - return (unsigned long)mod->core_size; + return (unsigned long)layout->size; } static __always_inline bool @@ -197,23 +182,23 @@ static void __mod_tree_remove(struct mod_tree_node *node) */ static void mod_tree_insert(struct module *mod) { - mod->mtn_core.mod = mod; - mod->mtn_init.mod = mod; + mod->core_layout.mtn.mod = mod; + mod->init_layout.mtn.mod = mod; - __mod_tree_insert(&mod->mtn_core); - if (mod->init_size) - __mod_tree_insert(&mod->mtn_init); + __mod_tree_insert(&mod->core_layout.mtn); + if (mod->init_layout.size) + __mod_tree_insert(&mod->init_layout.mtn); } static void mod_tree_remove_init(struct module *mod) { - if (mod->init_size) - __mod_tree_remove(&mod->mtn_init); + if (mod->init_layout.size) + __mod_tree_remove(&mod->init_layout.mtn); } static void mod_tree_remove(struct module *mod) { - __mod_tree_remove(&mod->mtn_core); + __mod_tree_remove(&mod->core_layout.mtn); mod_tree_remove_init(mod); } @@ -267,9 +252,9 @@ static void __mod_update_bounds(void *base, unsigned int size) static void mod_update_bounds(struct module *mod) { - __mod_update_bounds(mod->module_core, mod->core_size); - if (mod->init_size) - __mod_update_bounds(mod->module_init, mod->init_size); + __mod_update_bounds(mod->core_layout.base, mod->core_layout.size); + if (mod->init_layout.size) + __mod_update_bounds(mod->init_layout.base, mod->init_layout.size); } #ifdef CONFIG_KGDB_KDB @@ -1214,7 +1199,7 @@ struct module_attribute module_uevent = static ssize_t show_coresize(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", mk->mod->core_size); + return sprintf(buffer, "%u\n", mk->mod->core_layout.size); } static struct module_attribute modinfo_coresize = @@ -1223,7 +1208,7 @@ static struct module_attribute modinfo_coresize = static ssize_t show_initsize(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", mk->mod->init_size); + return sprintf(buffer, "%u\n", mk->mod->init_layout.size); } static struct module_attribute modinfo_initsize = @@ -1917,29 +1902,29 @@ static void set_section_ro_nx(void *base, static void set_module_core_ro_nx(struct module *mod) { - set_section_ro_nx(mod->module_core, mod->core_text_size, - mod->core_ro_size, mod->core_size, + set_section_ro_nx(mod->core_layout.base, mod->core_layout.text_size, + mod->core_layout.ro_size, mod->core_layout.size, set_memory_ro, set_memory_nx); } static void unset_module_core_ro_nx(struct module *mod) { - set_section_ro_nx(mod->module_core, mod->core_text_size, - mod->core_ro_size, mod->core_size, + set_section_ro_nx(mod->core_layout.base, mod->core_layout.text_size, + mod->core_layout.ro_size, mod->core_layout.size, set_memory_rw, set_memory_x); } static void set_module_init_ro_nx(struct module *mod) { - set_section_ro_nx(mod->module_init, mod->init_text_size, - mod->init_ro_size, mod->init_size, + set_section_ro_nx(mod->init_layout.base, mod->init_layout.text_size, + mod->init_layout.ro_size, mod->init_layout.size, set_memory_ro, set_memory_nx); } static void unset_module_init_ro_nx(struct module *mod) { - set_section_ro_nx(mod->module_init, mod->init_text_size, - mod->init_ro_size, mod->init_size, + set_section_ro_nx(mod->init_layout.base, mod->init_layout.text_size, + mod->init_layout.ro_size, mod->init_layout.size, set_memory_rw, set_memory_x); } @@ -1952,14 +1937,14 @@ void set_all_modules_text_rw(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if ((mod->module_core) && (mod->core_text_size)) { - set_page_attributes(mod->module_core, - mod->module_core + mod->core_text_size, + if ((mod->core_layout.base) && (mod->core_layout.text_size)) { + set_page_attributes(mod->core_layout.base, + mod->core_layout.base + mod->core_layout.text_size, set_memory_rw); } - if ((mod->module_init) && (mod->init_text_size)) { - set_page_attributes(mod->module_init, - mod->module_init + mod->init_text_size, + if ((mod->init_layout.base) && (mod->init_layout.text_size)) { + set_page_attributes(mod->init_layout.base, + mod->init_layout.base + mod->init_layout.text_size, set_memory_rw); } } @@ -1975,14 +1960,14 @@ void set_all_modules_text_ro(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if ((mod->module_core) && (mod->core_text_size)) { - set_page_attributes(mod->module_core, - mod->module_core + mod->core_text_size, + if ((mod->core_layout.base) && (mod->core_layout.text_size)) { + set_page_attributes(mod->core_layout.base, + mod->core_layout.base + mod->core_layout.text_size, set_memory_ro); } - if ((mod->module_init) && (mod->init_text_size)) { - set_page_attributes(mod->module_init, - mod->module_init + mod->init_text_size, + if ((mod->init_layout.base) && (mod->init_layout.text_size)) { + set_page_attributes(mod->init_layout.base, + mod->init_layout.base + mod->init_layout.text_size, set_memory_ro); } } @@ -2047,16 +2032,16 @@ static void free_module(struct module *mod) /* This may be NULL, but that's OK */ unset_module_init_ro_nx(mod); module_arch_freeing_init(mod); - module_memfree(mod->module_init); + module_memfree(mod->init_layout.base); kfree(mod->args); percpu_modfree(mod); /* Free lock-classes; relies on the preceding sync_rcu(). */ - lockdep_free_key_range(mod->module_core, mod->core_size); + lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); /* Finally, free the core (containing the module structure) */ unset_module_core_ro_nx(mod); - module_memfree(mod->module_core); + module_memfree(mod->core_layout.base); #ifdef CONFIG_MPU update_protections(current->mm); @@ -2259,20 +2244,20 @@ static void layout_sections(struct module *mod, struct load_info *info) || s->sh_entsize != ~0UL || strstarts(sname, ".init")) continue; - s->sh_entsize = get_offset(mod, &mod->core_size, s, i); + s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ - mod->core_size = debug_align(mod->core_size); - mod->core_text_size = mod->core_size; + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.text_size = mod->core_layout.size; break; case 1: /* RO: text and ro-data */ - mod->core_size = debug_align(mod->core_size); - mod->core_ro_size = mod->core_size; + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.ro_size = mod->core_layout.size; break; case 3: /* whole core */ - mod->core_size = debug_align(mod->core_size); + mod->core_layout.size = debug_align(mod->core_layout.size); break; } } @@ -2288,21 +2273,21 @@ static void layout_sections(struct module *mod, struct load_info *info) || s->sh_entsize != ~0UL || !strstarts(sname, ".init")) continue; - s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) + s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i) | INIT_OFFSET_MASK); pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ - mod->init_size = debug_align(mod->init_size); - mod->init_text_size = mod->init_size; + mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.text_size = mod->init_layout.size; break; case 1: /* RO: text and ro-data */ - mod->init_size = debug_align(mod->init_size); - mod->init_ro_size = mod->init_size; + mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.ro_size = mod->init_layout.size; break; case 3: /* whole init */ - mod->init_size = debug_align(mod->init_size); + mod->init_layout.size = debug_align(mod->init_layout.size); break; } } @@ -2477,7 +2462,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; - symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, + symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect, info->index.sym) | INIT_OFFSET_MASK; pr_debug("\t%s\n", info->secstrings + symsect->sh_name); @@ -2494,16 +2479,16 @@ static void layout_symtab(struct module *mod, struct load_info *info) } /* Append room for core symbols at end of core part. */ - info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); - info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); - mod->core_size += strtab_size; - mod->core_size = debug_align(mod->core_size); + info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); + info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); + mod->core_layout.size += strtab_size; + mod->core_layout.size = debug_align(mod->core_layout.size); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; - strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, + strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect, info->index.str) | INIT_OFFSET_MASK; - mod->init_size = debug_align(mod->init_size); + mod->init_layout.size = debug_align(mod->init_layout.size); pr_debug("\t%s\n", info->secstrings + strsect->sh_name); } @@ -2524,8 +2509,8 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) for (i = 0; i < mod->num_symtab; i++) mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); - mod->core_symtab = dst = mod->module_core + info->symoffs; - mod->core_strtab = s = mod->module_core + info->stroffs; + mod->core_symtab = dst = mod->core_layout.base + info->symoffs; + mod->core_strtab = s = mod->core_layout.base + info->stroffs; src = mod->symtab; for (ndst = i = 0; i < mod->num_symtab; i++) { if (i == 0 || @@ -2975,7 +2960,7 @@ static int move_module(struct module *mod, struct load_info *info) void *ptr; /* Do the allocs. */ - ptr = module_alloc(mod->core_size); + ptr = module_alloc(mod->core_layout.size); /* * The pointer to this block is stored in the module structure * which is inside the block. Just mark it as not being a @@ -2985,11 +2970,11 @@ static int move_module(struct module *mod, struct load_info *info) if (!ptr) return -ENOMEM; - memset(ptr, 0, mod->core_size); - mod->module_core = ptr; + memset(ptr, 0, mod->core_layout.size); + mod->core_layout.base = ptr; - if (mod->init_size) { - ptr = module_alloc(mod->init_size); + if (mod->init_layout.size) { + ptr = module_alloc(mod->init_layout.size); /* * The pointer to this block is stored in the module structure * which is inside the block. This block doesn't need to be @@ -2998,13 +2983,13 @@ static int move_module(struct module *mod, struct load_info *info) */ kmemleak_ignore(ptr); if (!ptr) { - module_memfree(mod->module_core); + module_memfree(mod->core_layout.base); return -ENOMEM; } - memset(ptr, 0, mod->init_size); - mod->module_init = ptr; + memset(ptr, 0, mod->init_layout.size); + mod->init_layout.base = ptr; } else - mod->module_init = NULL; + mod->init_layout.base = NULL; /* Transfer each section which specifies SHF_ALLOC */ pr_debug("final section addresses:\n"); @@ -3016,10 +3001,10 @@ static int move_module(struct module *mod, struct load_info *info) continue; if (shdr->sh_entsize & INIT_OFFSET_MASK) - dest = mod->module_init + dest = mod->init_layout.base + (shdr->sh_entsize & ~INIT_OFFSET_MASK); else - dest = mod->module_core + shdr->sh_entsize; + dest = mod->core_layout.base + shdr->sh_entsize; if (shdr->sh_type != SHT_NOBITS) memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); @@ -3081,12 +3066,12 @@ static void flush_module_icache(const struct module *mod) * Do it before processing of module parameters, so the module * can provide parameter accessor functions of its own. */ - if (mod->module_init) - flush_icache_range((unsigned long)mod->module_init, - (unsigned long)mod->module_init - + mod->init_size); - flush_icache_range((unsigned long)mod->module_core, - (unsigned long)mod->module_core + mod->core_size); + if (mod->init_layout.base) + flush_icache_range((unsigned long)mod->init_layout.base, + (unsigned long)mod->init_layout.base + + mod->init_layout.size); + flush_icache_range((unsigned long)mod->core_layout.base, + (unsigned long)mod->core_layout.base + mod->core_layout.size); set_fs(old_fs); } @@ -3144,8 +3129,8 @@ static void module_deallocate(struct module *mod, struct load_info *info) { percpu_modfree(mod); module_arch_freeing_init(mod); - module_memfree(mod->module_init); - module_memfree(mod->module_core); + module_memfree(mod->init_layout.base); + module_memfree(mod->core_layout.base); } int __weak module_finalize(const Elf_Ehdr *hdr, @@ -3232,7 +3217,7 @@ static noinline int do_init_module(struct module *mod) ret = -ENOMEM; goto fail; } - freeinit->module_init = mod->module_init; + freeinit->module_init = mod->init_layout.base; /* * We want to find out whether @mod uses async during init. Clear @@ -3292,10 +3277,10 @@ static noinline int do_init_module(struct module *mod) mod_tree_remove_init(mod); unset_module_init_ro_nx(mod); module_arch_freeing_init(mod); - mod->module_init = NULL; - mod->init_size = 0; - mod->init_ro_size = 0; - mod->init_text_size = 0; + mod->init_layout.base = NULL; + mod->init_layout.size = 0; + mod->init_layout.ro_size = 0; + mod->init_layout.text_size = 0; /* * We want to free module_init, but be aware that kallsyms may be * walking this with preempt disabled. In all the failure paths, we @@ -3575,7 +3560,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mutex_unlock(&module_mutex); free_module: /* Free lock-classes; relies on the preceding sync_rcu() */ - lockdep_free_key_range(mod->module_core, mod->core_size); + lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); module_deallocate(mod, info); free_copy: @@ -3653,9 +3638,9 @@ static const char *get_ksymbol(struct module *mod, /* At worse, next value is at end of module */ if (within_module_init(addr, mod)) - nextval = (unsigned long)mod->module_init+mod->init_text_size; + nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size; else - nextval = (unsigned long)mod->module_core+mod->core_text_size; + nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size; /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ @@ -3902,7 +3887,7 @@ static int m_show(struct seq_file *m, void *p) return 0; seq_printf(m, "%s %u", - mod->name, mod->init_size + mod->core_size); + mod->name, mod->init_layout.size + mod->core_layout.size); print_unload_info(m, mod); /* Informative for users. */ @@ -3911,7 +3896,7 @@ static int m_show(struct seq_file *m, void *p) mod->state == MODULE_STATE_COMING ? "Loading" : "Live"); /* Used by oprofile and other similar tools. */ - seq_printf(m, " 0x%pK", mod->module_core); + seq_printf(m, " 0x%pK", mod->core_layout.base); /* Taints info */ if (mod->taints) @@ -4054,8 +4039,8 @@ struct module *__module_text_address(unsigned long addr) struct module *mod = __module_address(addr); if (mod) { /* Make sure it's within the text section. */ - if (!within(addr, mod->module_init, mod->init_text_size) - && !within(addr, mod->module_core, mod->core_text_size)) + if (!within(addr, mod->init_layout.base, mod->init_layout.text_size) + && !within(addr, mod->core_layout.base, mod->core_layout.text_size)) mod = NULL; } return mod; -- cgit v1.2.3 From 85c898db6327353d38f3dd428457384cf81f83f8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 26 Nov 2015 09:45:08 +1030 Subject: module: clean up RO/NX handling. Modules have three sections: text, rodata and writable data. The code handled the case where these overlapped, however they never can: debug_align() ensures they are always page-aligned. This is why we got away with manually traversing the pages in set_all_modules_text_rw() without rounding. We create three helper functions: frob_text(), frob_rodata() and frob_writable_data(). We then call these explicitly at every point, so it's clear what we're doing. We also expose module_enable_ro() and module_disable_ro() for livepatch to use. Reviewed-by: Josh Poimboeuf Signed-off-by: Rusty Russell Signed-off-by: Jiri Kosina --- kernel/module.c | 168 ++++++++++++++++++++++++++------------------------------ 1 file changed, 77 insertions(+), 91 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index a0a3d6d9d5e8..77212128f34a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -80,15 +80,6 @@ # define debug_align(X) (X) #endif -/* - * Given BASE and SIZE this macro calculates the number of pages the - * memory regions occupies - */ -#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \ - (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \ - PFN_DOWN((unsigned long)BASE) + 1) \ - : (0UL)) - /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) @@ -1858,74 +1849,75 @@ static void mod_sysfs_teardown(struct module *mod) /* * LKM RO/NX protection: protect module's text/ro-data * from modification and any data from execution. + * + * General layout of module is: + * [text] [read-only-data] [writable data] + * text_size -----^ ^ ^ + * ro_size ------------------------| | + * size -------------------------------------------| + * + * These values are always page-aligned (as is base) */ -void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages)) +static void frob_text(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) { - unsigned long begin_pfn = PFN_DOWN((unsigned long)start); - unsigned long end_pfn = PFN_DOWN((unsigned long)end); - - if (end_pfn > begin_pfn) - set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base, + layout->text_size >> PAGE_SHIFT); } -static void set_section_ro_nx(void *base, - unsigned long text_size, - unsigned long ro_size, - unsigned long total_size, - int (*set_ro)(unsigned long start, int num_pages), - int (*set_nx)(unsigned long start, int num_pages)) +static void frob_rodata(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) { - /* begin and end PFNs of the current subsection */ - unsigned long begin_pfn; - unsigned long end_pfn; - - /* - * Set RO for module text and RO-data: - * - Always protect first page. - * - Do not protect last partial page. - */ - if (ro_size > 0) - set_page_attributes(base, base + ro_size, set_ro); + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->text_size, + (layout->ro_size - layout->text_size) >> PAGE_SHIFT); +} - /* - * Set NX permissions for module data: - * - Do not protect first partial page. - * - Always protect last page. - */ - if (total_size > text_size) { - begin_pfn = PFN_UP((unsigned long)base + text_size); - end_pfn = PFN_UP((unsigned long)base + total_size); - if (end_pfn > begin_pfn) - set_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); - } +static void frob_writable_data(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->ro_size, + (layout->size - layout->ro_size) >> PAGE_SHIFT); } -static void set_module_core_ro_nx(struct module *mod) +/* livepatching wants to disable read-only so it can frob module. */ +void module_disable_ro(const struct module *mod) { - set_section_ro_nx(mod->core_layout.base, mod->core_layout.text_size, - mod->core_layout.ro_size, mod->core_layout.size, - set_memory_ro, set_memory_nx); + frob_text(&mod->core_layout, set_memory_rw); + frob_rodata(&mod->core_layout, set_memory_rw); + frob_text(&mod->init_layout, set_memory_rw); + frob_rodata(&mod->init_layout, set_memory_rw); } -static void unset_module_core_ro_nx(struct module *mod) +void module_enable_ro(const struct module *mod) { - set_section_ro_nx(mod->core_layout.base, mod->core_layout.text_size, - mod->core_layout.ro_size, mod->core_layout.size, - set_memory_rw, set_memory_x); + frob_text(&mod->core_layout, set_memory_ro); + frob_rodata(&mod->core_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_ro); + frob_rodata(&mod->init_layout, set_memory_ro); } -static void set_module_init_ro_nx(struct module *mod) +static void module_enable_nx(const struct module *mod) { - set_section_ro_nx(mod->init_layout.base, mod->init_layout.text_size, - mod->init_layout.ro_size, mod->init_layout.size, - set_memory_ro, set_memory_nx); + frob_rodata(&mod->core_layout, set_memory_nx); + frob_writable_data(&mod->core_layout, set_memory_nx); + frob_rodata(&mod->init_layout, set_memory_nx); + frob_writable_data(&mod->init_layout, set_memory_nx); } -static void unset_module_init_ro_nx(struct module *mod) +static void module_disable_nx(const struct module *mod) { - set_section_ro_nx(mod->init_layout.base, mod->init_layout.text_size, - mod->init_layout.ro_size, mod->init_layout.size, - set_memory_rw, set_memory_x); + frob_rodata(&mod->core_layout, set_memory_x); + frob_writable_data(&mod->core_layout, set_memory_x); + frob_rodata(&mod->init_layout, set_memory_x); + frob_writable_data(&mod->init_layout, set_memory_x); } /* Iterate through all modules and set each module's text as RW */ @@ -1937,16 +1929,9 @@ void set_all_modules_text_rw(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if ((mod->core_layout.base) && (mod->core_layout.text_size)) { - set_page_attributes(mod->core_layout.base, - mod->core_layout.base + mod->core_layout.text_size, - set_memory_rw); - } - if ((mod->init_layout.base) && (mod->init_layout.text_size)) { - set_page_attributes(mod->init_layout.base, - mod->init_layout.base + mod->init_layout.text_size, - set_memory_rw); - } + + frob_text(&mod->core_layout, set_memory_rw); + frob_text(&mod->init_layout, set_memory_rw); } mutex_unlock(&module_mutex); } @@ -1960,24 +1945,25 @@ void set_all_modules_text_ro(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if ((mod->core_layout.base) && (mod->core_layout.text_size)) { - set_page_attributes(mod->core_layout.base, - mod->core_layout.base + mod->core_layout.text_size, - set_memory_ro); - } - if ((mod->init_layout.base) && (mod->init_layout.text_size)) { - set_page_attributes(mod->init_layout.base, - mod->init_layout.base + mod->init_layout.text_size, - set_memory_ro); - } + + frob_text(&mod->core_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_ro); } mutex_unlock(&module_mutex); } + +static void disable_ro_nx(const struct module_layout *layout) +{ + frob_text(layout, set_memory_rw); + frob_rodata(layout, set_memory_rw); + frob_rodata(layout, set_memory_x); + frob_writable_data(layout, set_memory_x); +} + #else -static void set_module_core_ro_nx(struct module *mod) { } -static void set_module_init_ro_nx(struct module *mod) { } -static void unset_module_core_ro_nx(struct module *mod) { } -static void unset_module_init_ro_nx(struct module *mod) { } +static void disable_ro_nx(const struct module_layout *layout) { } +static void module_enable_nx(const struct module *mod) { } +static void module_disable_nx(const struct module *mod) { } #endif void __weak module_memfree(void *module_region) @@ -2029,8 +2015,8 @@ static void free_module(struct module *mod) synchronize_sched(); mutex_unlock(&module_mutex); - /* This may be NULL, but that's OK */ - unset_module_init_ro_nx(mod); + /* This may be empty, but that's OK */ + disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); module_memfree(mod->init_layout.base); kfree(mod->args); @@ -2040,7 +2026,7 @@ static void free_module(struct module *mod) lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); /* Finally, free the core (containing the module structure) */ - unset_module_core_ro_nx(mod); + disable_ro_nx(&mod->core_layout); module_memfree(mod->core_layout.base); #ifdef CONFIG_MPU @@ -3275,7 +3261,7 @@ static noinline int do_init_module(struct module *mod) mod->strtab = mod->core_strtab; #endif mod_tree_remove_init(mod); - unset_module_init_ro_nx(mod); + disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); mod->init_layout.base = NULL; mod->init_layout.size = 0; @@ -3370,8 +3356,8 @@ static int complete_formation(struct module *mod, struct load_info *info) module_bug_finalize(info->hdr, info->sechdrs, mod); /* Set RO and NX regions */ - set_module_init_ro_nx(mod); - set_module_core_ro_nx(mod); + module_enable_ro(mod); + module_enable_nx(mod); /* Mark state as coming so strong_try_module_get() ignores us, * but kallsyms etc. can see us. */ @@ -3536,8 +3522,8 @@ static int load_module(struct load_info *info, const char __user *uargs, MODULE_STATE_GOING, mod); /* we can't deallocate the module until we clear memory protection */ - unset_module_init_ro_nx(mod); - unset_module_core_ro_nx(mod); + module_disable_ro(mod); + module_disable_nx(mod); ddebug_cleanup: dynamic_debug_remove(info->debug); -- cgit v1.2.3 From e0224418516b4d8a6c2160574bac18447c354ef0 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Thu, 26 Nov 2015 13:18:06 +1030 Subject: module: keep percpu symbols in module's symtab Currently, percpu symbols from .data..percpu ELF section of a module are not copied over and stored in final symtab array of struct module. Consequently such symbol cannot be returned via kallsyms API (for example kallsyms_lookup_name). This can be especially confusing when the percpu symbol is exported. Only its __ksymtab et al. are present in its symtab. The culprit is in layout_and_allocate() function where SHF_ALLOC flag is dropped for .data..percpu section. There is in fact no need to copy the section to final struct module, because kernel module loader allocates extra percpu section by itself. Unfortunately only symbols from SHF_ALLOC sections are copied due to a check in is_core_symbol(). The patch changes is_core_symbol() function to copy over also percpu symbols (their st_shndx points to .data..percpu ELF section). We do it only if CONFIG_KALLSYMS_ALL is set to be consistent with the rest of the function (ELF section is SHF_ALLOC but !SHF_EXECINSTR). Finally elf_type() returns type 'a' for a percpu symbol because its address is absolute. Signed-off-by: Miroslav Benes Signed-off-by: Rusty Russell Signed-off-by: Jiri Kosina --- kernel/module.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 77212128f34a..912e891e0e2f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2383,7 +2383,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info) } if (sym->st_shndx == SHN_UNDEF) return 'U'; - if (sym->st_shndx == SHN_ABS) + if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu) return 'a'; if (sym->st_shndx >= SHN_LORESERVE) return '?'; @@ -2412,7 +2412,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info) } static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, - unsigned int shnum) + unsigned int shnum, unsigned int pcpundx) { const Elf_Shdr *sec; @@ -2421,6 +2421,11 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, || !src->st_name) return false; +#ifdef CONFIG_KALLSYMS_ALL + if (src->st_shndx == pcpundx) + return true; +#endif + sec = sechdrs + src->st_shndx; if (!(sec->sh_flags & SHF_ALLOC) #ifndef CONFIG_KALLSYMS_ALL @@ -2458,7 +2463,8 @@ static void layout_symtab(struct module *mod, struct load_info *info) /* Compute total space required for the core symbols' strtab. */ for (ndst = i = 0; i < nsrc; i++) { if (i == 0 || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { strtab_size += strlen(&info->strtab[src[i].st_name])+1; ndst++; } @@ -2500,7 +2506,8 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) src = mod->symtab; for (ndst = i = 0; i < mod->num_symtab; i++) { if (i == 0 || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { dst[ndst] = src[i]; dst[ndst++].st_name = s - mod->core_strtab; s += strlcpy(s, &mod->strtab[src[i].st_name], -- cgit v1.2.3 From b56b36ee6751abe7fb3890681e86fc8de2122953 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 3 Dec 2015 16:33:26 -0600 Subject: livepatch: Cleanup module page permission changes Calling set_memory_rw() and set_memory_ro() for every iteration of the loop in klp_write_object_relocations() is messy, inefficient, and error-prone. Change all the read-only pages to read-write before the loop and convert them back to read-only again afterwards. Suggested-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Reviewed-by: Petr Mladek Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 94893e844e44..bc2c85c064c1 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -28,6 +28,7 @@ #include #include #include +#include /** * struct klp_ops - structure for tracking registered ftrace ops structs @@ -232,7 +233,7 @@ static int klp_find_external_symbol(struct module *pmod, const char *name, static int klp_write_object_relocations(struct module *pmod, struct klp_object *obj) { - int ret; + int ret = 0; unsigned long val; struct klp_reloc *reloc; @@ -242,13 +243,16 @@ static int klp_write_object_relocations(struct module *pmod, if (WARN_ON(!obj->relocs)) return -EINVAL; + module_disable_ro(pmod); + for (reloc = obj->relocs; reloc->name; reloc++) { /* discover the address of the referenced symbol */ if (reloc->external) { if (reloc->sympos > 0) { pr_err("non-zero sympos for external reloc symbol '%s' is not supported\n", reloc->name); - return -EINVAL; + ret = -EINVAL; + goto out; } ret = klp_find_external_symbol(pmod, reloc->name, &val); } else @@ -257,18 +261,20 @@ static int klp_write_object_relocations(struct module *pmod, reloc->sympos, &val); if (ret) - return ret; + goto out; ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc, val + reloc->addend); if (ret) { pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n", reloc->name, val, ret); - return ret; + goto out; } } - return 0; +out: + module_enable_ro(pmod); + return ret; } static void notrace klp_ftrace_handler(unsigned long ip, -- cgit v1.2.3 From 90a545e981267e917b9d698ce07affd69787db87 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 23 Nov 2015 15:49:03 -0800 Subject: restrict /dev/mem to idle io memory ranges This effectively promotes IORESOURCE_BUSY to IORESOURCE_EXCLUSIVE semantics by default. If userspace really believes it is safe to access the memory region it can also perform the extra step of disabling an active driver. This protects device address ranges with read side effects and otherwise directs userspace to use the driver. Persistent memory presents a large "mistake surface" to /dev/mem as now accidental writes can corrupt a filesystem. In general if a device driver is busily using a memory region it already informs other parts of the kernel to not touch it via request_mem_region(). /dev/mem should honor the same safety restriction by default. Debugging a device driver from userspace becomes more difficult with this enabled. Any application using /dev/mem or mmap of sysfs pci resources will now need to perform the extra step of either: 1/ Disabling the driver, for example: echo > /dev/bus//drivers//unbind 2/ Rebooting with "iomem=relaxed" on the command line 3/ Recompiling with CONFIG_IO_STRICT_DEVMEM=n Traditional users of /dev/mem like dosemu are unaffected because the first 1MB of memory is not subject to the IO_STRICT_DEVMEM restriction. Legacy X configurations use /dev/mem to talk to graphics hardware, but that functionality has since moved to kernel graphics drivers. Cc: Arnd Bergmann Cc: Russell King Cc: Andrew Morton Cc: Greg Kroah-Hartman Acked-by: Kees Cook Acked-by: Ingo Molnar Signed-off-by: Dan Williams --- kernel/resource.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index f150dbbe6f62..09c0597840b0 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1498,8 +1498,15 @@ int iomem_is_exclusive(u64 addr) break; if (p->end < addr) continue; - if (p->flags & IORESOURCE_BUSY && - p->flags & IORESOURCE_EXCLUSIVE) { + /* + * A resource is exclusive if IORESOURCE_EXCLUSIVE is set + * or CONFIG_IO_STRICT_DEVMEM is enabled and the + * resource is busy. + */ + if ((p->flags & IORESOURCE_BUSY) == 0) + continue; + if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM) + || p->flags & IORESOURCE_EXCLUSIVE) { err = 1; break; } -- cgit v1.2.3 From 5d097056c9a017a3b720849efb5432f37acabbac Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Jan 2016 15:18:21 -0800 Subject: kmemcg: account certain kmem allocations to memcg Mark those kmem allocations that are known to be easily triggered from userspace as __GFP_ACCOUNT/SLAB_ACCOUNT, which makes them accounted to memcg. For the list, see below: - threadinfo - task_struct - task_delay_info - pid - cred - mm_struct - vm_area_struct and vm_region (nommu) - anon_vma and anon_vma_chain - signal_struct - sighand_struct - fs_struct - files_struct - fdtable and fdtable->full_fds_bits - dentry and external_name - inode for all filesystems. This is the most tedious part, because most filesystems overwrite the alloc_inode method. The list is far from complete, so feel free to add more objects. Nevertheless, it should be close to "account everything" approach and keep most workloads within bounds. Malevolent users will be able to breach the limit, but this was possible even with the former "account everything" approach (simply because it did not account everything in fact). [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Greg Thelen Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cred.c | 4 ++-- kernel/delayacct.c | 2 +- kernel/fork.c | 22 +++++++++++++--------- kernel/pid.c | 2 +- 4 files changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 71179a09c1d6..0c0cd8a62285 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -569,8 +569,8 @@ EXPORT_SYMBOL(revert_creds); void __init cred_init(void) { /* allocate a slab in which we can store credentials */ - cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); } /** diff --git a/kernel/delayacct.c b/kernel/delayacct.c index ef90b04d783f..435c14a45118 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -34,7 +34,7 @@ __setup("nodelayacct", delayacct_setup_disable); void delayacct_init(void) { - delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC); + delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT); delayacct_tsk_init(&init_task); } diff --git a/kernel/fork.c b/kernel/fork.c index 6774e6b2e96d..51915842f1c0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -300,9 +300,9 @@ void __init fork_init(void) #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES #endif /* create a slab on which task_structs can be allocated */ - task_struct_cachep = - kmem_cache_create("task_struct", arch_task_struct_size, - ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); + task_struct_cachep = kmem_cache_create("task_struct", + arch_task_struct_size, ARCH_MIN_TASKALIGN, + SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); #endif /* do the arch specific task caches init */ @@ -1848,16 +1848,19 @@ void __init proc_caches_init(void) sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| - SLAB_NOTRACK, sighand_ctor); + SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); /* * FIXME! The "sizeof(struct mm_struct)" currently includes the * whole struct cpumask for the OFFSTACK case. We could change @@ -1867,8 +1870,9 @@ void __init proc_caches_init(void) */ mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); + vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); } diff --git a/kernel/pid.c b/kernel/pid.c index 78b3d9f80d44..f4ad91b746f1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -604,5 +604,5 @@ void __init pidmap_init(void) atomic_dec(&init_pid_ns.pidmap[0].nr_free); init_pid_ns.pid_cachep = KMEM_CACHE(pid, - SLAB_HWCACHE_ALIGN | SLAB_PANIC); + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); } -- cgit v1.2.3 From eca56ff906bdd0239485e8b47154a6e73dd9a2f3 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Thu, 14 Jan 2016 15:19:26 -0800 Subject: mm, shmem: add internal shmem resident memory accounting Currently looking at /proc//status or statm, there is no way to distinguish shmem pages from pages mapped to a regular file (shmem pages are mapped to /dev/zero), even though their implication in actual memory use is quite different. The internal accounting currently counts shmem pages together with regular files. As a preparation to extend the userspace interfaces, this patch adds MM_SHMEMPAGES counter to mm_rss_stat to account for shmem pages separately from MM_FILEPAGES. The next patch will expose it to userspace - this patch doesn't change the exported values yet, by adding up MM_SHMEMPAGES to MM_FILEPAGES at places where MM_FILEPAGES was used before. The only user-visible change after this patch is the OOM killer message that separates the reported "shmem-rss" from "file-rss". [vbabka@suse.cz: forward-porting, tweak changelog] Signed-off-by: Jerome Marchand Signed-off-by: Vlastimil Babka Acked-by: Konstantin Khlebnikov Acked-by: Michal Hocko Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7dad84913abf..bb0669169716 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -180,7 +180,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, lru_cache_add_active_or_unevictable(kpage, vma); if (!PageAnon(page)) { - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter_file(page)); inc_mm_counter(mm, MM_ANONPAGES); } -- cgit v1.2.3 From d07e22597d1d355829b7b18ac19afa912cf758d1 Mon Sep 17 00:00:00 2001 From: Daniel Cashman Date: Thu, 14 Jan 2016 15:19:53 -0800 Subject: mm: mmap: add new /proc tunable for mmap_base ASLR Address Space Layout Randomization (ASLR) provides a barrier to exploitation of user-space processes in the presence of security vulnerabilities by making it more difficult to find desired code/data which could help an attack. This is done by adding a random offset to the location of regions in the process address space, with a greater range of potential offset values corresponding to better protection/a larger search-space for brute force, but also to greater potential for fragmentation. The offset added to the mmap_base address, which provides the basis for the majority of the mappings for a process, is set once on process exec in arch_pick_mmap_layout() and is done via hard-coded per-arch values, which reflect, hopefully, the best compromise for all systems. The trade-off between increased entropy in the offset value generation and the corresponding increased variability in address space fragmentation is not absolute, however, and some platforms may tolerate higher amounts of entropy. This patch introduces both new Kconfig values and a sysctl interface which may be used to change the amount of entropy used for offset generation on a system. The direct motivation for this change was in response to the libstagefright vulnerabilities that affected Android, specifically to information provided by Google's project zero at: http://googleprojectzero.blogspot.com/2015/09/stagefrightened.html The attack presented therein, by Google's project zero, specifically targeted the limited randomness used to generate the offset added to the mmap_base address in order to craft a brute-force-based attack. Concretely, the attack was against the mediaserver process, which was limited to respawning every 5 seconds, on an arm device. The hard-coded 8 bits used resulted in an average expected success rate of defeating the mmap ASLR after just over 10 minutes (128 tries at 5 seconds a piece). With this patch, and an accompanying increase in the entropy value to 16 bits, the same attack would take an average expected time of over 45 hours (32768 tries), which makes it both less feasible and more likely to be noticed. The introduced Kconfig and sysctl options are limited by per-arch minimum and maximum values, the minimum of which was chosen to match the current hard-coded value and the maximum of which was chosen so as to give the greatest flexibility without generating an invalid mmap_base address, generally a 3-4 bits less than the number of bits in the user-space accessible virtual address space. When decided whether or not to change the default value, a system developer should consider that mmap_base address could be placed anywhere up to 2^(value) bits away from the non-randomized location, which would introduce variable-sized areas above and below the mmap_base address such that the maximum vm_area_struct size may be reduced, preventing very large allocations. This patch (of 4): ASLR only uses as few as 8 bits to generate the random offset for the mmap base address on 32 bit architectures. This value was chosen to prevent a poorly chosen value from dividing the address space in such a way as to prevent large allocations. This may not be an issue on all platforms. Allow the specification of a minimum number of bits so that platforms desiring greater ASLR protection may determine where to place the trade-off. Signed-off-by: Daniel Cashman Cc: Russell King Acked-by: Kees Cook Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Don Zickus Cc: Eric W. Biederman Cc: Heinrich Schuchardt Cc: Josh Poimboeuf Cc: Kirill A. Shutemov Cc: Naoya Horiguchi Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Thomas Gleixner Cc: David Rientjes Cc: Mark Salyzyn Cc: Jeff Vander Stoep Cc: Nick Kralevich Cc: Catalin Marinas Cc: Will Deacon Cc: "H. Peter Anvin" Cc: Hector Marco-Gisbert Cc: Borislav Petkov Cc: Ralf Baechle Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5faf89ac9ec0..c810f8afdb7f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1568,6 +1568,28 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS + { + .procname = "mmap_rnd_bits", + .data = &mmap_rnd_bits, + .maxlen = sizeof(mmap_rnd_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_bits_min, + .extra2 = (void *)&mmap_rnd_bits_max, + }, +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS + { + .procname = "mmap_rnd_compat_bits", + .data = &mmap_rnd_compat_bits, + .maxlen = sizeof(mmap_rnd_compat_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_compat_bits_min, + .extra2 = (void *)&mmap_rnd_compat_bits_max, + }, +#endif { } }; -- cgit v1.2.3 From 0eb77e9880321915322d42913c3b53241739c8aa Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 14 Jan 2016 15:21:40 -0800 Subject: vmstat: make vmstat_updater deferrable again and shut down on idle Currently the vmstat updater is not deferrable as a result of commit ba4877b9ca51 ("vmstat: do not use deferrable delayed work for vmstat_update"). This in turn can cause multiple interruptions of the applications because the vmstat updater may run at Make vmstate_update deferrable again and provide a function that folds the differentials when the processor is going to idle mode thus addressing the issue of the above commit in a clean way. Note that the shepherd thread will continue scanning the differentials from another processor and will reenable the vmstat workers if it detects any changes. Fixes: ba4877b9ca51 ("vmstat: do not use deferrable delayed work for vmstat_update") Signed-off-by: Christoph Lameter Cc: Michal Hocko Cc: Johannes Weiner Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/idle.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 4a2ef5a02fd3..2489140a7c51 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -219,6 +219,7 @@ static void cpu_idle_loop(void) */ __current_set_polling(); + quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { -- cgit v1.2.3 From 84638335900f1995495838fe1bd4870c43ec1f67 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 14 Jan 2016 15:22:07 -0800 Subject: mm: rework virtual memory accounting When inspecting a vague code inside prctl(PR_SET_MM_MEM) call (which testing the RLIMIT_DATA value to figure out if we're allowed to assign new @start_brk, @brk, @start_data, @end_data from mm_struct) it's been commited that RLIMIT_DATA in a form it's implemented now doesn't do anything useful because most of user-space libraries use mmap() syscall for dynamic memory allocations. Linus suggested to convert RLIMIT_DATA rlimit into something suitable for anonymous memory accounting. But in this patch we go further, and the changes are bundled together as: * keep vma counting if CONFIG_PROC_FS=n, will be used for limits * replace mm->shared_vm with better defined mm->data_vm * account anonymous executable areas as executable * account file-backed growsdown/up areas as stack * drop struct file* argument from vm_stat_account * enforce RLIMIT_DATA for size of data areas This way code looks cleaner: now code/stack/data classification depends only on vm_flags state: VM_EXEC & ~VM_WRITE -> code (VmExe + VmLib in proc) VM_GROWSUP | VM_GROWSDOWN -> stack (VmStk) VM_WRITE & ~VM_SHARED & !stack -> data (VmData) The rest (VmSize - VmData - VmStk - VmExe - VmLib) could be called "shared", but that might be strange beast like readonly-private or VM_IO area. - RLIMIT_AS limits whole address space "VmSize" - RLIMIT_STACK limits stack "VmStk" (but each vma individually) - RLIMIT_DATA now limits "VmData" Signed-off-by: Konstantin Khlebnikov Signed-off-by: Cyrill Gorcunov Cc: Quentin Casasnovas Cc: Vegard Nossum Acked-by: Linus Torvalds Cc: Willy Tarreau Cc: Andy Lutomirski Cc: Kees Cook Cc: Vladimir Davydov Cc: Pavel Emelyanov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 51915842f1c0..2e391c754ae7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -414,7 +414,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); mm->total_vm = oldmm->total_vm; - mm->shared_vm = oldmm->shared_vm; + mm->data_vm = oldmm->data_vm; mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; @@ -433,8 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { - vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, - -vma_pages(mpnt)); + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } charge = 0; -- cgit v1.2.3