diff options
-rw-r--r-- | arch/Kconfig | 17 | ||||
-rw-r--r-- | arch/x86/Kconfig | 5 | ||||
-rw-r--r-- | arch/x86/include/asm/thread_info.h | 2 | ||||
-rw-r--r-- | arch/x86/include/asm/uprobes.h | 57 | ||||
-rw-r--r-- | arch/x86/kernel/Makefile | 1 | ||||
-rw-r--r-- | arch/x86/kernel/signal.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/uprobes.c | 674 | ||||
-rw-r--r-- | include/linux/mm_types.h | 2 | ||||
-rw-r--r-- | include/linux/sched.h | 4 | ||||
-rw-r--r-- | include/linux/uprobes.h | 164 | ||||
-rw-r--r-- | kernel/events/Makefile | 3 | ||||
-rw-r--r-- | kernel/events/uprobes.c | 1662 | ||||
-rw-r--r-- | kernel/fork.c | 9 | ||||
-rw-r--r-- | kernel/signal.c | 4 | ||||
-rw-r--r-- | mm/mmap.c | 33 |
15 files changed, 2641 insertions, 2 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 684eb5af439d..e5d3778d5554 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -76,6 +76,23 @@ config OPTPROBES depends on KPROBES && HAVE_OPTPROBES depends on !PREEMPT +config UPROBES + bool "Transparent user-space probes (EXPERIMENTAL)" + depends on ARCH_SUPPORTS_UPROBES && PERF_EVENTS + default n + help + Uprobes is the user-space counterpart to kprobes: they + enable instrumentation applications (such as 'perf probe') + to establish unintrusive probes in user-space binaries and + libraries, by executing handler functions when the probes + are hit by user-space applications. + + ( These probes come in the form of single-byte breakpoints, + managed by the kernel and kept transparent to the probed + application. ) + + If in doubt, say "N". + config HAVE_EFFICIENT_UNALIGNED_ACCESS bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d14cc6b79ad..76f5a466547a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -84,7 +84,7 @@ config X86 select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC config INSTRUCTION_DECODER - def_bool (KPROBES || PERF_EVENTS) + def_bool (KPROBES || PERF_EVENTS || UPROBES) config OUTPUT_FORMAT string @@ -243,6 +243,9 @@ config ARCH_CPU_PROBE_RELEASE def_bool y depends on HOTPLUG_CPU +config ARCH_SUPPORTS_UPROBES + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index ad6df8ccd715..0710c11305d4 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -85,6 +85,7 @@ struct thread_info { #define TIF_SECCOMP 8 /* secure computing */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ +#define TIF_UPROBE 12 /* breakpointed or singlestepping */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* IA32 compatibility process */ #define TIF_FORK 18 /* ret_from_fork */ @@ -109,6 +110,7 @@ struct thread_info { #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) +#define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h new file mode 100644 index 000000000000..1e9bed14f7ae --- /dev/null +++ b/arch/x86/include/asm/uprobes.h @@ -0,0 +1,57 @@ +#ifndef _ASM_UPROBES_H +#define _ASM_UPROBES_H +/* + * User-space Probes (UProbes) for x86 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2008-2011 + * Authors: + * Srikar Dronamraju + * Jim Keniston + */ + +#include <linux/notifier.h> + +typedef u8 uprobe_opcode_t; + +#define MAX_UINSN_BYTES 16 +#define UPROBE_XOL_SLOT_BYTES 128 /* to keep it cache aligned */ + +#define UPROBE_SWBP_INSN 0xcc +#define UPROBE_SWBP_INSN_SIZE 1 + +struct arch_uprobe { + u16 fixups; + u8 insn[MAX_UINSN_BYTES]; +#ifdef CONFIG_X86_64 + unsigned long rip_rela_target_address; +#endif +}; + +struct arch_uprobe_task { + unsigned long saved_trap_nr; +#ifdef CONFIG_X86_64 + unsigned long saved_scratch_register; +#endif +}; + +extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm); +extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs); +extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs); +extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); +extern int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data); +extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs); +#endif /* _ASM_UPROBES_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 532d2e090e6f..d23d83577d6b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -101,6 +101,7 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_OF) += devicetree.o +obj-$(CONFIG_UPROBES) += uprobes.o ### # 64 bit specific files diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 115eac431483..041af2fd088d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -18,6 +18,7 @@ #include <linux/personality.h> #include <linux/uaccess.h> #include <linux/user-return-notifier.h> +#include <linux/uprobes.h> #include <asm/processor.h> #include <asm/ucontext.h> @@ -824,6 +825,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) mce_notify_process(); #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ + if (thread_info_flags & _TIF_UPROBE) { + clear_thread_flag(TIF_UPROBE); + uprobe_notify_resume(regs); + } + /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c new file mode 100644 index 000000000000..dc4e910a7d96 --- /dev/null +++ b/arch/x86/kernel/uprobes.c @@ -0,0 +1,674 @@ +/* + * User-space Probes (UProbes) for x86 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2008-2011 + * Authors: + * Srikar Dronamraju + * Jim Keniston + */ +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/ptrace.h> +#include <linux/uprobes.h> +#include <linux/uaccess.h> + +#include <linux/kdebug.h> +#include <asm/processor.h> +#include <asm/insn.h> + +/* Post-execution fixups. */ + +/* No fixup needed */ +#define UPROBE_FIX_NONE 0x0 + +/* Adjust IP back to vicinity of actual insn */ +#define UPROBE_FIX_IP 0x1 + +/* Adjust the return address of a call insn */ +#define UPROBE_FIX_CALL 0x2 + +#define UPROBE_FIX_RIP_AX 0x8000 +#define UPROBE_FIX_RIP_CX 0x4000 + +#define UPROBE_TRAP_NR UINT_MAX + +/* Adaptations for mhiramat x86 decoder v14. */ +#define OPCODE1(insn) ((insn)->opcode.bytes[0]) +#define OPCODE2(insn) ((insn)->opcode.bytes[1]) +#define OPCODE3(insn) ((insn)->opcode.bytes[2]) +#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value) + +#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ + (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ + (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ + (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ + (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ + << (row % 32)) + +/* + * Good-instruction tables for 32-bit apps. This is non-const and volatile + * to keep gcc from statically optimizing it out, as variable_test_bit makes + * some versions of gcc to think only *(unsigned long*) is used. + */ +static volatile u32 good_insns_32[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */ + W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */ + W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ + W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ + W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ + W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ + W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ + /* ---------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; + +/* Using this for both 64-bit and 32-bit apps */ +static volatile u32 good_2byte_insns[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ + W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ + W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ + W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ + W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ + W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ + W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */ + /* ---------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; + +#ifdef CONFIG_X86_64 +/* Good-instruction tables for 64-bit apps */ +static volatile u32 good_insns_64[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */ + W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */ + W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */ + W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ + W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ + W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ + W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ + W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ + /* ---------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +#endif +#undef W + +/* + * opcodes we'll probably never support: + * + * 6c-6d, e4-e5, ec-ed - in + * 6e-6f, e6-e7, ee-ef - out + * cc, cd - int3, int + * cf - iret + * d6 - illegal instruction + * f1 - int1/icebp + * f4 - hlt + * fa, fb - cli, sti + * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2 + * + * invalid opcodes in 64-bit mode: + * + * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5 + * 63 - we support this opcode in x86_64 but not in i386. + * + * opcodes we may need to refine support for: + * + * 0f - 2-byte instructions: For many of these instructions, the validity + * depends on the prefix and/or the reg field. On such instructions, we + * just consider the opcode combination valid if it corresponds to any + * valid instruction. + * + * 8f - Group 1 - only reg = 0 is OK + * c6-c7 - Group 11 - only reg = 0 is OK + * d9-df - fpu insns with some illegal encodings + * f2, f3 - repnz, repz prefixes. These are also the first byte for + * certain floating-point instructions, such as addsd. + * + * fe - Group 4 - only reg = 0 or 1 is OK + * ff - Group 5 - only reg = 0-6 is OK + * + * others -- Do we need to support these? + * + * 0f - (floating-point?) prefetch instructions + * 07, 17, 1f - pop es, pop ss, pop ds + * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes -- + * but 64 and 65 (fs: and gs:) seem to be used, so we support them + * 67 - addr16 prefix + * ce - into + * f0 - lock prefix + */ + +/* + * TODO: + * - Where necessary, examine the modrm byte and allow only valid instructions + * in the different Groups and fpu instructions. + */ + +static bool is_prefix_bad(struct insn *insn) +{ + int i; + + for (i = 0; i < insn->prefixes.nbytes; i++) { + switch (insn->prefixes.bytes[i]) { + case 0x26: /* INAT_PFX_ES */ + case 0x2E: /* INAT_PFX_CS */ + case 0x36: /* INAT_PFX_DS */ + case 0x3E: /* INAT_PFX_SS */ + case 0xF0: /* INAT_PFX_LOCK */ + return true; + } + } + return false; +} + +static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn) +{ + insn_init(insn, auprobe->insn, false); + + /* Skip good instruction prefixes; reject "bad" ones. */ + insn_get_opcode(insn); + if (is_prefix_bad(insn)) + return -ENOTSUPP; + + if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32)) + return 0; + + if (insn->opcode.nbytes == 2) { + if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns)) + return 0; + } + + return -ENOTSUPP; +} + +/* + * Figure out which fixups arch_uprobe_post_xol() will need to perform, and + * annotate arch_uprobe->fixups accordingly. To start with, + * arch_uprobe->fixups is either zero or it reflects rip-related fixups. + */ +static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn) +{ + bool fix_ip = true, fix_call = false; /* defaults */ + int reg; + + insn_get_opcode(insn); /* should be a nop */ + + switch (OPCODE1(insn)) { + case 0xc3: /* ret/lret */ + case 0xcb: + case 0xc2: + case 0xca: + /* ip is correct */ + fix_ip = false; + break; + case 0xe8: /* call relative - Fix return addr */ + fix_call = true; + break; + case 0x9a: /* call absolute - Fix return addr, not ip */ + fix_call = true; + fix_ip = false; + break; + case 0xff: + insn_get_modrm(insn); + reg = MODRM_REG(insn); + if (reg == 2 || reg == 3) { + /* call or lcall, indirect */ + /* Fix return addr; ip is correct. */ + fix_call = true; + fix_ip = false; + } else if (reg == 4 || reg == 5) { + /* jmp or ljmp, indirect */ + /* ip is correct. */ + fix_ip = false; + } + break; + case 0xea: /* jmp absolute -- ip is correct */ + fix_ip = false; + break; + default: + break; + } + if (fix_ip) + auprobe->fixups |= UPROBE_FIX_IP; + if (fix_call) + auprobe->fixups |= UPROBE_FIX_CALL; +} + +#ifdef CONFIG_X86_64 +/* + * If arch_uprobe->insn doesn't use rip-relative addressing, return + * immediately. Otherwise, rewrite the instruction so that it accesses + * its memory operand indirectly through a scratch register. Set + * arch_uprobe->fixups and arch_uprobe->rip_rela_target_address + * accordingly. (The contents of the scratch register will be saved + * before we single-step the modified instruction, and restored + * afterward.) + * + * We do this because a rip-relative instruction can access only a + * relatively small area (+/- 2 GB from the instruction), and the XOL + * area typically lies beyond that area. At least for instructions + * that store to memory, we can't execute the original instruction + * and "fix things up" later, because the misdirected store could be + * disastrous. + * + * Some useful facts about rip-relative instructions: + * + * - There's always a modrm byte. + * - There's never a SIB byte. + * - The displacement is always 4 bytes. + */ +static void +handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +{ + u8 *cursor; + u8 reg; + + if (mm->context.ia32_compat) + return; + + auprobe->rip_rela_target_address = 0x0; + if (!insn_rip_relative(insn)) + return; + + /* + * insn_rip_relative() would have decoded rex_prefix, modrm. + * Clear REX.b bit (extension of MODRM.rm field): + * we want to encode rax/rcx, not r8/r9. + */ + if (insn->rex_prefix.nbytes) { + cursor = auprobe->insn + insn_offset_rex_prefix(insn); + *cursor &= 0xfe; /* Clearing REX.B bit */ + } + + /* + * Point cursor at the modrm byte. The next 4 bytes are the + * displacement. Beyond the displacement, for some instructions, + * is the immediate operand. + */ + cursor = auprobe->insn + insn_offset_modrm(insn); + insn_get_length(insn); + + /* + * Convert from rip-relative addressing to indirect addressing + * via a scratch register. Change the r/m field from 0x5 (%rip) + * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field. + */ + reg = MODRM_REG(insn); + if (reg == 0) { + /* + * The register operand (if any) is either the A register + * (%rax, %eax, etc.) or (if the 0x4 bit is set in the + * REX prefix) %r8. In any case, we know the C register + * is NOT the register operand, so we use %rcx (register + * #1) for the scratch register. + */ + auprobe->fixups = UPROBE_FIX_RIP_CX; + /* Change modrm from 00 000 101 to 00 000 001. */ + *cursor = 0x1; + } else { + /* Use %rax (register #0) for the scratch register. */ + auprobe->fixups = UPROBE_FIX_RIP_AX; + /* Change modrm from 00 xxx 101 to 00 xxx 000 */ + *cursor = (reg << 3); + } + + /* Target address = address of next instruction + (signed) offset */ + auprobe->rip_rela_target_address = (long)insn->length + insn->displacement.value; + + /* Displacement field is gone; slide immediate field (if any) over. */ + if (insn->immediate.nbytes) { + cursor++; + memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes); + } + return; +} + +static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn) +{ + insn_init(insn, auprobe->insn, true); + + /* Skip good instruction prefixes; reject "bad" ones. */ + insn_get_opcode(insn); + if (is_prefix_bad(insn)) + return -ENOTSUPP; + + if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64)) + return 0; + + if (insn->opcode.nbytes == 2) { + if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns)) + return 0; + } + return -ENOTSUPP; +} + +static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +{ + if (mm->context.ia32_compat) + return validate_insn_32bits(auprobe, insn); + return validate_insn_64bits(auprobe, insn); +} +#else /* 32-bit: */ +static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +{ + /* No RIP-relative addressing on 32-bit */ +} + +static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +{ + return validate_insn_32bits(auprobe, insn); +} +#endif /* CONFIG_X86_64 */ + +/** + * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. + * @mm: the probed address space. + * @arch_uprobe: the probepoint information. + * Return 0 on success or a -ve number on error. + */ +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm) +{ + int ret; + struct insn insn; + + auprobe->fixups = 0; + ret = validate_insn_bits(auprobe, mm, &insn); + if (ret != 0) + return ret; + + handle_riprel_insn(auprobe, mm, &insn); + prepare_fixups(auprobe, &insn); + + return 0; +} + +#ifdef CONFIG_X86_64 +/* + * If we're emulating a rip-relative instruction, save the contents + * of the scratch register and store the target address in that register. + */ +static void +pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, + struct arch_uprobe_task *autask) +{ + if (auprobe->fixups & UPROBE_FIX_RIP_AX) { + autask->saved_scratch_register = regs->ax; + regs->ax = current->utask->vaddr; + regs->ax += auprobe->rip_rela_target_address; + } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) { + autask->saved_scratch_register = regs->cx; + regs->cx = current->utask->vaddr; + regs->cx += auprobe->rip_rela_target_address; + } +} +#else +static void +pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, + struct arch_uprobe_task *autask) +{ + /* No RIP-relative addressing on 32-bit */ +} +#endif + +/* + * arch_uprobe_pre_xol - prepare to execute out of line. + * @auprobe: the probepoint information. + * @regs: reflects the saved user state of current task. + */ +int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct arch_uprobe_task *autask; + + autask = ¤t->utask->autask; + autask->saved_trap_nr = current->thread.trap_nr; + current->thread.trap_nr = UPROBE_TRAP_NR; + regs->ip = current->utask->xol_vaddr; + pre_xol_rip_insn(auprobe, regs, autask); + + return 0; +} + +/* + * This function is called by arch_uprobe_post_xol() to adjust the return + * address pushed by a call instruction executed out of line. + */ +static int adjust_ret_addr(unsigned long sp, long correction) +{ + int rasize, ncopied; + long ra = 0; + + if (is_ia32_task()) + rasize = 4; + else + rasize = 8; + + ncopied = copy_from_user(&ra, (void __user *)sp, rasize); + if (unlikely(ncopied)) + return -EFAULT; + + ra += correction; + ncopied = copy_to_user((void __user *)sp, &ra, rasize); + if (unlikely(ncopied)) + return -EFAULT; + + return 0; +} + +#ifdef CONFIG_X86_64 +static bool is_riprel_insn(struct arch_uprobe *auprobe) +{ + return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0); +} + +static void +handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) +{ + if (is_riprel_insn(auprobe)) { + struct arch_uprobe_task *autask; + + autask = ¤t->utask->autask; + if (auprobe->fixups & UPROBE_FIX_RIP_AX) + regs->ax = autask->saved_scratch_register; + else + regs->cx = autask->saved_scratch_register; + + /* + * The original instruction includes a displacement, and so + * is 4 bytes longer than what we've just single-stepped. + * Fall through to handle stuff like "jmpq *...(%rip)" and + * "callq *...(%rip)". + */ + if (correction) + *correction += 4; + } +} +#else +static void +handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) +{ + /* No RIP-relative addressing on 32-bit */ +} +#endif + +/* + * If xol insn itself traps and generates a signal(Say, + * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped + * instruction jumps back to its own address. It is assumed that anything + * like do_page_fault/do_trap/etc sets thread.trap_nr != -1. + * + * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr, + * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to + * UPROBE_TRAP_NR == -1 set by arch_uprobe_pre_xol(). + */ +bool arch_uprobe_xol_was_trapped(struct task_struct *t) +{ + if (t->thread.trap_nr != UPROBE_TRAP_NR) + return true; + + return false; +} + +/* + * Called after single-stepping. To avoid the SMP problems that can + * occur when we temporarily put back the original opcode to + * single-step, we single-stepped a copy of the instruction. + * + * This function prepares to resume execution after the single-step. + * We have to fix things up as follows: + * + * Typically, the new ip is relative to the copied instruction. We need + * to make it relative to the original instruction (FIX_IP). Exceptions + * are return instructions and absolute or indirect jump or call instructions. + * + * If the single-stepped instruction was a call, the return address that + * is atop the stack is the address following the copied instruction. We + * need to make it the address following the original instruction (FIX_CALL). + * + * If the original instruction was a rip-relative instruction such as + * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent + * instruction using a scratch register -- e.g., "movl %edx,(%rax)". + * We need to restore the contents of the scratch register and adjust + * the ip, keeping in mind that the instruction we executed is 4 bytes + * shorter than the original instruction (since we squeezed out the offset + * field). (FIX_RIP_AX or FIX_RIP_CX) + */ +int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask; + long correction; + int result = 0; + + WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); + + utask = current->utask; + current->thread.trap_nr = utask->autask.saved_trap_nr; + correction = (long)(utask->vaddr - utask->xol_vaddr); + handle_riprel_post_xol(auprobe, regs, &correction); + if (auprobe->fixups & UPROBE_FIX_IP) + regs->ip += correction; + + if (auprobe->fixups & UPROBE_FIX_CALL) + result = adjust_ret_addr(regs->sp, correction); + + return result; +} + +/* callback routine for handling exceptions. */ +int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data) +{ + struct die_args *args = data; + struct pt_regs *regs = args->regs; + int ret = NOTIFY_DONE; + + /* We are only interested in userspace traps */ + if (regs && !user_mode_vm(regs)) + return NOTIFY_DONE; + + switch (val) { + case DIE_INT3: + if (uprobe_pre_sstep_notifier(regs)) + ret = NOTIFY_STOP; + + break; + + case DIE_DEBUG: + if (uprobe_post_sstep_notifier(regs)) + ret = NOTIFY_STOP; + + default: + break; + } + + return ret; +} + +/* + * This function gets called when XOL instruction either gets trapped or + * the thread has a fatal signal, so reset the instruction pointer to its + * probed address. + */ +void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + current->thread.trap_nr = utask->autask.saved_trap_nr; + handle_riprel_post_xol(auprobe, regs, NULL); + instruction_pointer_set(regs, utask->vaddr); +} + +/* + * Skip these instructions as per the currently known x86 ISA. + * 0x66* { 0x90 | 0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0 } + */ +bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + int i; + + for (i = 0; i < MAX_UINSN_BYTES; i++) { + if ((auprobe->insn[i] == 0x66)) + continue; + + if (auprobe->insn[i] == 0x90) + return true; + + if (i == (MAX_UINSN_BYTES - 1)) + break; + + if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x1f)) + return true; + + if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x19)) + return true; + + if ((auprobe->insn[i] == 0x87) && (auprobe->insn[i+1] == 0xc0)) + return true; + + break; + } + return false; +} diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3cc3062b3767..26574c726121 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -12,6 +12,7 @@ #include <linux/completion.h> #include <linux/cpumask.h> #include <linux/page-debug-flags.h> +#include <linux/uprobes.h> #include <asm/page.h> #include <asm/mmu.h> @@ -388,6 +389,7 @@ struct mm_struct { #ifdef CONFIG_CPUMASK_OFFSTACK struct cpumask cpumask_allocation; #endif + struct uprobes_state uprobes_state; }; static inline void mm_init_cpumask(struct mm_struct *mm) diff --git a/include/linux/sched.h b/include/linux/sched.h index 81a173c0897d..cff94cda34b2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1617,6 +1617,10 @@ struct task_struct { #ifdef CONFIG_HAVE_HW_BREAKPOINT atomic_t ptrace_bp_refcnt; #endif +#ifdef CONFIG_UPROBES + struct uprobe_task *utask; + int uprobe_srcu_id; +#endif }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h new file mode 100644 index 000000000000..d594d3b3ad4c --- /dev/null +++ b/include/linux/uprobes.h @@ -0,0 +1,164 @@ +#ifndef _LINUX_UPROBES_H +#define _LINUX_UPROBES_H +/* + * User-space Probes (UProbes) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2008-2012 + * Authors: + * Srikar Dronamraju + * Jim Keniston + * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + */ + +#include <linux/errno.h> +#include <linux/rbtree.h> + +struct vm_area_struct; +struct mm_struct; +struct inode; + +#ifdef CONFIG_ARCH_SUPPORTS_UPROBES +# include <asm/uprobes.h> +#endif + +/* flags that denote/change uprobes behaviour */ + +/* Have a copy of original instruction */ +#define UPROBE_COPY_INSN 0x1 + +/* Dont run handlers when first register/ last unregister in progress*/ +#define UPROBE_RUN_HANDLER 0x2 +/* Can skip singlestep */ +#define UPROBE_SKIP_SSTEP 0x4 + +struct uprobe_consumer { + int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); + /* + * filter is optional; If a filter exists, handler is run + * if and only if filter returns true. + */ + bool (*filter)(struct uprobe_consumer *self, struct task_struct *task); + + struct uprobe_consumer *next; +}; + +#ifdef CONFIG_UPROBES +enum uprobe_task_state { + UTASK_RUNNING, + UTASK_BP_HIT, + UTASK_SSTEP, + UTASK_SSTEP_ACK, + UTASK_SSTEP_TRAPPED, +}; + +/* + * uprobe_task: Metadata of a task while it singlesteps. + */ +struct uprobe_task { + enum uprobe_task_state state; + struct arch_uprobe_task autask; + + struct uprobe *active_uprobe; + + unsigned long xol_vaddr; + unsigned long vaddr; +}; + +/* + * On a breakpoint hit, thread contests for a slot. It frees the + * slot after singlestep. Currently a fixed number of slots are + * allocated. + */ +struct xol_area { + wait_queue_head_t wq; /* if all slots are busy */ + atomic_t slot_count; /* number of in-use slots */ + unsigned long *bitmap; /* 0 = free slot */ + struct page *page; + + /* + * We keep the vma's vm_start rather than a pointer to the vma + * itself. The probed process or a naughty kernel module could make + * the vma go away, and we must handle that reasonably gracefully. + */ + unsigned long vaddr; /* Page(s) of instruction slots */ +}; + +struct uprobes_state { + struct xol_area *xol_area; + atomic_t count; +}; +extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); +extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr, bool verify); +extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); +extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); +extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); +extern int uprobe_mmap(struct vm_area_struct *vma); +extern void uprobe_munmap(struct vm_area_struct *vma); +extern void uprobe_free_utask(struct task_struct *t); +extern void uprobe_copy_process(struct task_struct *t); +extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs); +extern int uprobe_post_sstep_notifier(struct pt_regs *regs); +extern int uprobe_pre_sstep_notifier(struct pt_regs *regs); +extern void uprobe_notify_resume(struct pt_regs *regs); +extern bool uprobe_deny_signal(void); +extern bool __weak arch_uprobe_skip_sstep(struct arch_uprobe *aup, struct pt_regs *regs); +extern void uprobe_clear_state(struct mm_struct *mm); +extern void uprobe_reset_state(struct mm_struct *mm); +#else /* !CONFIG_UPROBES */ +struct uprobes_state { +}; +static inline int +uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +{ + return -ENOSYS; +} +static inline void +uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +{ +} +static inline int uprobe_mmap(struct vm_area_struct *vma) +{ + return 0; +} +static inline void uprobe_munmap(struct vm_area_struct *vma) +{ +} +static inline void uprobe_notify_resume(struct pt_regs *regs) +{ +} +static inline bool uprobe_deny_signal(void) +{ + return false; +} +static inline unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) +{ + return 0; +} +static inline void uprobe_free_utask(struct task_struct *t) +{ +} +static inline void uprobe_copy_process(struct task_struct *t) +{ +} +static inline void uprobe_clear_state(struct mm_struct *mm) +{ +} +static inline void uprobe_reset_state(struct mm_struct *mm) +{ +} +#endif /* !CONFIG_UPROBES */ +#endif /* _LINUX_UPROBES_H */ diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 22d901f9caf4..103f5d147b2f 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -3,4 +3,7 @@ CFLAGS_REMOVE_core.o = -pg endif obj-y := core.o ring_buffer.o callchain.o + obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o +obj-$(CONFIG_UPROBES) += uprobes.o + diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c new file mode 100644 index 000000000000..29e881b0137d --- /dev/null +++ b/kernel/events/uprobes.c @@ -0,0 +1,1662 @@ +/* + * User-space Probes (UProbes) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2008-2012 + * Authors: + * Srikar Dronamraju + * Jim Keniston + * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + */ + +#include <linux/kernel.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> /* read_mapping_page */ +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/rmap.h> /* anon_vma_prepare */ +#include <linux/mmu_notifier.h> /* set_pte_at_notify */ +#include <linux/swap.h> /* try_to_free_swap */ +#include <linux/ptrace.h> /* user_enable_single_step */ +#include <linux/kdebug.h> /* notifier mechanism */ + +#include <linux/uprobes.h> + +#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) +#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE + +static struct srcu_struct uprobes_srcu; +static struct rb_root uprobes_tree = RB_ROOT; + +static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ + +#define UPROBES_HASH_SZ 13 + +/* serialize (un)register */ +static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; + +#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) + +/* serialize uprobe->pending_list */ +static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; +#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) + +/* + * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe + * events active at this time. Probably a fine grained per inode count is + * better? + */ +static atomic_t uprobe_events = ATOMIC_INIT(0); + +/* + * Maintain a temporary per vma info that can be used to search if a vma + * has already been handled. This structure is introduced since extending + * vm_area_struct wasnt recommended. + */ +struct vma_info { + struct list_head probe_list; + struct mm_struct *mm; + loff_t vaddr; +}; + +struct uprobe { + struct rb_node rb_node; /* node in the rb tree */ + atomic_t ref; + struct rw_semaphore consumer_rwsem; + struct list_head pending_list; + struct uprobe_consumer *consumers; + struct inode *inode; /* Also hold a ref to inode */ + loff_t offset; + int flags; + struct arch_uprobe arch; +}; + +/* + * valid_vma: Verify if the specified vma is an executable vma + * Relax restrictions while unregistering: vm_flags might have + * changed after breakpoint was inserted. + * - is_register: indicates if we are in register context. + * - Return 1 if the specified virtual address is in an + * executable vma. + */ +static bool valid_vma(struct vm_area_struct *vma, bool is_register) +{ + if (!vma->vm_file) + return false; + + if (!is_register) + return true; + + if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC)) + return true; + + return false; +} + +static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) +{ + loff_t vaddr; + + vaddr = vma->vm_start + offset; + vaddr -= vma->vm_pgoff << PAGE_SHIFT; + + return vaddr; +} + +/** + * __replace_page - replace page in vma by new page. + * based on replace_page in mm/ksm.c + * + * @vma: vma that holds the pte pointing to page + * @page: the cowed page we are replacing by kpage + * @kpage: the modified page we replace page by + * + * Returns 0 on success, -EFAULT on failure. + */ +static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep; + spinlock_t *ptl; + unsigned long addr; + int err = -EFAULT; + + addr = page_address_in_vma(page, vma); + if (addr == -EFAULT) + goto out; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + goto out; + + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!ptep) + goto out; + + get_page(kpage); + page_add_new_anon_rmap(kpage, vma, addr); + + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush(vma, addr, ptep); + set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); + + page_remove_rmap(page); + if (!page_mapped(page)) + try_to_free_swap(page); + put_page(page); + pte_unmap_unlock(ptep, ptl); + err = 0; + +out: + return err; +} + +/** + * is_swbp_insn - check if instruction is breakpoint instruction. + * @insn: instruction to be checked. + * Default implementation of is_swbp_insn + * Returns true if @insn is a breakpoint instruction. + */ +bool __weak is_swbp_insn(uprobe_opcode_t *insn) +{ + return *insn == UPROBE_SWBP_INSN; +} + +/* + * NOTE: + * Expect the breakpoint instruction to be the smallest size instruction for + * the architecture. If an arch has variable length instruction and the + * breakpoint instruction is not of the smallest length instruction + * supported by that architecture then we need to modify read_opcode / + * write_opcode accordingly. This would never be a problem for archs that + * have fixed length instructions. + */ + +/* + * write_opcode - write the opcode at a given virtual address. + * @auprobe: arch breakpointing information. + * @mm: the probed process address space. + * @vaddr: the virtual address to store the opcode. + * @opcode: opcode to be written at @vaddr. + * + * Called with mm->mmap_sem held (for read and with a reference to + * mm). + * + * For mm @mm, write the opcode at @vaddr. + * Return 0 (success) or a negative errno. + */ +static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long vaddr, uprobe_opcode_t opcode) +{ + struct page *old_page, *new_page; + struct address_space *mapping; + void *vaddr_old, *vaddr_new; + struct vm_area_struct *vma; + struct uprobe *uprobe; + loff_t addr; + int ret; + + /* Read the page with vaddr into memory */ + ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); + if (ret <= 0) + return ret; + + ret = -EINVAL; + + /* + * We are interested in text pages only. Our pages of interest + * should be mapped for read and execute only. We desist from + * adding probes in write mapped pages since the breakpoints + * might end up in the file copy. + */ + if (!valid_vma(vma, is_swbp_insn(&opcode))) + goto put_out; + + uprobe = container_of(auprobe, struct uprobe, arch); + mapping = uprobe->inode->i_mapping; + if (mapping != vma->vm_file->f_mapping) + goto put_out; + + addr = vma_address(vma, uprobe->offset); + if (vaddr != (unsigned long)addr) + goto put_out; + + ret = -ENOMEM; + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); + if (!new_page) + goto put_out; + + __SetPageUptodate(new_page); + + /* + * lock page will serialize against do_wp_page()'s + * PageAnon() handling + */ + lock_page(old_page); + /* copy the page now that we've got it stable */ + vaddr_old = kmap_atomic(old_page); + vaddr_new = kmap_atomic(new_page); + + memcpy(vaddr_new, vaddr_old, PAGE_SIZE); + + /* poke the new insn in, ASSUMES we don't cross page boundary */ + vaddr &= ~PAGE_MASK; + BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); + memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + + kunmap_atomic(vaddr_new); + kunmap_atomic(vaddr_old); + + ret = anon_vma_prepare(vma); + if (ret) + goto unlock_out; + + lock_page(new_page); + ret = __replace_page(vma, old_page, new_page); + unlock_page(new_page); + +unlock_out: + unlock_page(old_page); + page_cache_release(new_page); + +put_out: + put_page(old_page); + + return ret; +} + +/** + * read_opcode - read the opcode at a given virtual address. + * @mm: the probed process address space. + * @vaddr: the virtual address to read the opcode. + * @opcode: location to store the read opcode. + * + * Called with mm->mmap_sem held (for read and with a reference to + * mm. + * + * For mm @mm, read the opcode at @vaddr and store it in @opcode. + * Return 0 (success) or a negative errno. + */ +static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode) +{ + struct page *page; + void *vaddr_new; + int ret; + + ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL); + if (ret <= 0) + return ret; + + lock_page(page); + vaddr_new = kmap_atomic(page); + vaddr &= ~PAGE_MASK; + memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE); + kunmap_atomic(vaddr_new); + unlock_page(page); + + put_page(page); + + return 0; +} + +static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) +{ + uprobe_opcode_t opcode; + int result; + + result = read_opcode(mm, vaddr, &opcode); + if (result) + return result; + + if (is_swbp_insn(&opcode)) + return 1; + + return 0; +} + +/** + * set_swbp - store breakpoint at a given address. + * @auprobe: arch specific probepoint information. + * @mm: the probed process address space. + * @vaddr: the virtual address to insert the opcode. + * + * For mm @mm, store the breakpoint instruction at @vaddr. + * Return 0 (success) or a negative errno. + */ +int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +{ + int result; + + result = is_swbp_at_addr(mm, vaddr); + if (result == 1) + return -EEXIST; + + if (result) + return result; + + return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); +} + +/** + * set_orig_insn - Restore the original instruction. + * @mm: the probed process address space. + * @auprobe: arch specific probepoint information. + * @vaddr: the virtual address to insert the opcode. + * @verify: if true, verify existance of breakpoint instruction. + * + * For mm @mm, restore the original opcode (opcode) at @vaddr. + * Return 0 (success) or a negative errno. + */ +int __weak +set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify) +{ + if (verify) { + int result; + + result = is_swbp_at_addr(mm, vaddr); + if (!result) + return -EINVAL; + + if (result != 1) + return result; + } + return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); +} + +static int match_uprobe(struct uprobe *l, struct uprobe *r) +{ + if (l->inode < r->inode) + return -1; + + if (l->inode > r->inode) + return 1; + + if (l->offset < r->offset) + return -1; + + if (l->offset > r->offset) + return 1; + + return 0; +} + +static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) +{ + struct uprobe u = { .inode = inode, .offset = offset }; + struct rb_node *n = uprobes_tree.rb_node; + struct uprobe *uprobe; + int match; + + while (n) { + uprobe = rb_entry(n, struct uprobe, rb_node); + match = match_uprobe(&u, uprobe); + if (!match) { + atomic_inc(&uprobe->ref); + return uprobe; + } + + if (match < 0) + n = n->rb_left; + else + n = n->rb_right; + } + return NULL; +} + +/* + * Find a uprobe corresponding to a given inode:offset + * Acquires uprobes_treelock + */ +static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) +{ + struct uprobe *uprobe; + unsigned long flags; + + spin_lock_irqsave(&uprobes_treelock, flags); + uprobe = __find_uprobe(inode, offset); + spin_unlock_irqrestore(&uprobes_treelock, flags); + + return uprobe; +} + +static struct uprobe *__insert_uprobe(struct uprobe *uprobe) +{ + struct rb_node **p = &uprobes_tree.rb_node; + struct rb_node *parent = NULL; + struct uprobe *u; + int match; + + while (*p) { + parent = *p; + u = rb_entry(parent, struct uprobe, rb_node); + match = match_uprobe(uprobe, u); + if (!match) { + atomic_inc(&u->ref); + return u; + } + + if (match < 0) + p = &parent->rb_left; + else + p = &parent->rb_right; + + } + + u = NULL; + rb_link_node(&uprobe->rb_node, parent, p); + rb_insert_color(&uprobe->rb_node, &uprobes_tree); + /* get access + creation ref */ + atomic_set(&uprobe->ref, 2); + + return u; +} + +/* + * Acquire uprobes_treelock. + * Matching uprobe already exists in rbtree; + * increment (access refcount) and return the matching uprobe. + * + * No matching uprobe; insert the uprobe in rb_tree; + * get a double refcount (access + creation) and return NULL. + */ +static struct uprobe *insert_uprobe(struct uprobe *uprobe) +{ + unsigned long flags; + struct uprobe *u; + + spin_lock_irqsave(&uprobes_treelock, flags); + u = __insert_uprobe(uprobe); + spin_unlock_irqrestore(&uprobes_treelock, flags); + + /* For now assume that the instruction need not be single-stepped */ + uprobe->flags |= UPROBE_SKIP_SSTEP; + + return u; +} + +static void put_uprobe(struct uprobe *uprobe) +{ + if (atomic_dec_and_test(&uprobe->ref)) + kfree(uprobe); +} + +static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) +{ + struct uprobe *uprobe, *cur_uprobe; + + uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL); + if (!uprobe) + return NULL; + + uprobe->inode = igrab(inode); + uprobe->offset = offset; + init_rwsem(&uprobe->consumer_rwsem); + INIT_LIST_HEAD(&uprobe->pending_list); + + /* add to uprobes_tree, sorted on inode:offset */ + cur_uprobe = insert_uprobe(uprobe); + + /* a uprobe exists for this inode:offset combination */ + if (cur_uprobe) { + kfree(uprobe); + uprobe = cur_uprobe; + iput(inode); + } else { + atomic_inc(&uprobe_events); + } + + return uprobe; +} + +static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) +{ + struct uprobe_consumer *uc; + + if (!(uprobe->flags & UPROBE_RUN_HANDLER)) + return; + + down_read(&uprobe->consumer_rwsem); + for (uc = uprobe->consumers; uc; uc = uc->next) { + if (!uc->filter || uc->filter(uc, current)) + uc->handler(uc, regs); + } + up_read(&uprobe->consumer_rwsem); +} + +/* Returns the previous consumer */ +static struct uprobe_consumer * +consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) +{ + down_write(&uprobe->consumer_rwsem); + uc->next = uprobe->consumers; + uprobe->consumers = uc; + up_write(&uprobe->consumer_rwsem); + + return uc->next; +} + +/* + * For uprobe @uprobe, delete the consumer @uc. + * Return true if the @uc is deleted successfully + * or return false. + */ +static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) +{ + struct uprobe_consumer **con; + bool ret = false; + + down_write(&uprobe->consumer_rwsem); + for (con = &uprobe->consumers; *con; con = &(*con)->next) { + if (*con == uc) { + *con = uc->next; + ret = true; + break; + } + } + up_write(&uprobe->consumer_rwsem); + + return ret; +} + +static int +__copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn, + unsigned long nbytes, unsigned long offset) +{ + struct file *filp = vma->vm_file; + struct page *page; + void *vaddr; + unsigned long off1; + unsigned long idx; + + if (!filp) + return -EINVAL; + + idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); + off1 = offset &= ~PAGE_MASK; + + /* + * Ensure that the page that has the original instruction is + * populated and in page-cache. + */ + page = read_mapping_page(mapping, idx, filp); + if (IS_ERR(page)) + return PTR_ERR(page); + + vaddr = kmap_atomic(page); + memcpy(insn, vaddr + off1, nbytes); + kunmap_atomic(vaddr); + page_cache_release(page); + + return 0; +} + +static int +copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) +{ + struct address_space *mapping; + unsigned long nbytes; + int bytes; + + addr &= ~PAGE_MASK; + nbytes = PAGE_SIZE - addr; + mapping = uprobe->inode->i_mapping; + + /* Instruction at end of binary; copy only available bytes */ + if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) + bytes = uprobe->inode->i_size - uprobe->offset; + else + bytes = MAX_UINSN_BYTES; + + /* Instruction at the page-boundary; copy bytes in second page */ + if (nbytes < bytes) { + if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes, + bytes - nbytes, uprobe->offset + nbytes)) + return -ENOMEM; + + bytes = nbytes; + } + return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset); +} + +/* + * How mm->uprobes_state.count gets updated + * uprobe_mmap() increments the count if + * - it successfully adds a breakpoint. + * - it cannot add a breakpoint, but sees that there is a underlying + * breakpoint (via a is_swbp_at_addr()). + * + * uprobe_munmap() decrements the count if + * - it sees a underlying breakpoint, (via is_swbp_at_addr) + * (Subsequent uprobe_unregister wouldnt find the breakpoint + * unless a uprobe_mmap kicks in, since the old vma would be + * dropped just after uprobe_munmap.) + * + * uprobe_register increments the count if: + * - it successfully adds a breakpoint. + * + * uprobe_unregister decrements the count if: + * - it sees a underlying breakpoint and removes successfully. + * (via is_swbp_at_addr) + * (Subsequent uprobe_munmap wouldnt find the breakpoint + * since there is no underlying breakpoint after the + * breakpoint removal.) + */ +static int +install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, + struct vm_area_struct *vma, loff_t vaddr) +{ + unsigned long addr; + int ret; + + /* + * If probe is being deleted, unregister thread could be done with + * the vma-rmap-walk through. Adding a probe now can be fatal since + * nobody will be able to cleanup. Also we could be from fork or + * mremap path, where the probe might have already been inserted. + * Hence behave as if probe already existed. + */ + if (!uprobe->consumers) + return -EEXIST; + + addr = (unsigned long)vaddr; + + if (!(uprobe->flags & UPROBE_COPY_INSN)) { + ret = copy_insn(uprobe, vma, addr); + if (ret) + return ret; + + if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) + return -EEXIST; + + ret = arch_uprobe_analyze_insn(&uprobe->arch, mm); + if (ret) + return ret; + + uprobe->flags |= UPROBE_COPY_INSN; + } + + /* + * Ideally, should be updating the probe count after the breakpoint + * has been successfully inserted. However a thread could hit the + * breakpoint we just inserted even before the probe count is + * incremented. If this is the first breakpoint placed, breakpoint + * notifier might ignore uprobes and pass the trap to the thread. + * Hence increment before and decrement on failure. + */ + atomic_inc(&mm->uprobes_state.count); + ret = set_swbp(&uprobe->arch, mm, addr); + if (ret) + atomic_dec(&mm->uprobes_state.count); + + return ret; +} + +static void +remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) +{ + if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true)) + atomic_dec(&mm->uprobes_state.count); +} + +/* + * There could be threads that have hit the breakpoint and are entering the + * notifier code and trying to acquire the uprobes_treelock. The thread + * calling delete_uprobe() that is removing the uprobe from the rb_tree can + * race with these threads and might acquire the uprobes_treelock compared + * to some of the breakpoint hit threads. In such a case, the breakpoint + * hit threads will not find the uprobe. The current unregistering thread + * waits till all other threads have hit a breakpoint, to acquire the + * uprobes_treelock before the uprobe is removed from the rbtree. + */ +static void delete_uprobe(struct uprobe *uprobe) +{ + unsigned long flags; + + synchronize_srcu(&uprobes_srcu); + spin_lock_irqsave(&uprobes_treelock, flags); + rb_erase(&uprobe->rb_node, &uprobes_tree); + spin_unlock_irqrestore(&uprobes_treelock, flags); + iput(uprobe->inode); + put_uprobe(uprobe); + atomic_dec(&uprobe_events); +} + +static struct vma_info * +__find_next_vma_info(struct address_space *mapping, struct list_head *head, + struct vma_info *vi, loff_t offset, bool is_register) +{ + struct prio_tree_iter iter; + struct vm_area_struct *vma; + struct vma_info *tmpvi; + unsigned long pgoff; + int existing_vma; + loff_t vaddr; + + pgoff = offset >> PAGE_SHIFT; + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + if (!valid_vma(vma, is_register)) + continue; + + existing_vma = 0; + vaddr = vma_address(vma, offset); + + list_for_each_entry(tmpvi, head, probe_list) { + if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) { + existing_vma = 1; + break; + } + } + + /* + * Another vma needs a probe to be installed. However skip + * installing the probe if the vma is about to be unlinked. + */ + if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) { + vi->mm = vma->vm_mm; + vi->vaddr = vaddr; + list_add(&vi->probe_list, head); + + return vi; + } + } + + return NULL; +} + +/* + * Iterate in the rmap prio tree and find a vma where a probe has not + * yet been inserted. + */ +static struct vma_info * +find_next_vma_info(struct address_space *mapping, struct list_head *head, + loff_t offset, bool is_register) +{ + struct vma_info *vi, *retvi; + + vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL); + if (!vi) + return ERR_PTR(-ENOMEM); + + mutex_lock(&mapping->i_mmap_mutex); + retvi = __find_next_vma_info(mapping, head, vi, offset, is_register); + mutex_unlock(&mapping->i_mmap_mutex); + + if (!retvi) + kfree(vi); + + return retvi; +} + +static int register_for_each_vma(struct uprobe *uprobe, bool is_register) +{ + struct list_head try_list; + struct vm_area_struct *vma; + struct address_space *mapping; + struct vma_info *vi, *tmpvi; + struct mm_struct *mm; + loff_t vaddr; + int ret; + + mapping = uprobe->inode->i_mapping; + INIT_LIST_HEAD(&try_list); + + ret = 0; + + for (;;) { + vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register); + if (!vi) + break; + + if (IS_ERR(vi)) { + ret = PTR_ERR(vi); + break; + } + + mm = vi->mm; + down_read(&mm->mmap_sem); + vma = find_vma(mm, (unsigned long)vi->vaddr); + if (!vma || !valid_vma(vma, is_register)) { + list_del(&vi->probe_list); + kfree(vi); + up_read(&mm->mmap_sem); + mmput(mm); + continue; + } + vaddr = vma_address(vma, uprobe->offset); + if (vma->vm_file->f_mapping->host != uprobe->inode || + vaddr != vi->vaddr) { + list_del(&vi->probe_list); + kfree(vi); + up_read(&mm->mmap_sem); + mmput(mm); + continue; + } + + if (is_register) + ret = install_breakpoint(uprobe, mm, vma, vi->vaddr); + else + remove_breakpoint(uprobe, mm, vi->vaddr); + + up_read(&mm->mmap_sem); + mmput(mm); + if (is_register) { + if (ret && ret == -EEXIST) + ret = 0; + if (ret) + break; + } + } + + list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) { + list_del(&vi->probe_list); + kfree(vi); + } + + return ret; +} + +static int __uprobe_register(struct uprobe *uprobe) +{ + return register_for_each_vma(uprobe, true); +} + +static void __uprobe_unregister(struct uprobe *uprobe) +{ + if (!register_for_each_vma(uprobe, false)) + delete_uprobe(uprobe); + + /* TODO : cant unregister? schedule a worker thread */ +} + +/* + * uprobe_register - register a probe + * @inode: the file in which the probe has to be placed. + * @offset: offset from the start of the file. + * @uc: information on howto handle the probe.. + * + * Apart from the access refcount, uprobe_register() takes a creation + * refcount (thro alloc_uprobe) if and only if this @uprobe is getting + * inserted into the rbtree (i.e first consumer for a @inode:@offset + * tuple). Creation refcount stops uprobe_unregister from freeing the + * @uprobe even before the register operation is complete. Creation + * refcount is released when the last @uc for the @uprobe + * unregisters. + * + * Return errno if it cannot successully install probes + * else return 0 (success) + */ +int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +{ + struct uprobe *uprobe; + int ret; + + if (!inode || !uc || uc->next) + return -EINVAL; + + if (offset > i_size_read(inode)) + return -EINVAL; + + ret = 0; + mutex_lock(uprobes_hash(inode)); + uprobe = alloc_uprobe(inode, offset); + + if (uprobe && !consumer_add(uprobe, uc)) { + ret = __uprobe_register(uprobe); + if (ret) { + uprobe->consumers = NULL; + __uprobe_unregister(uprobe); + } else { + uprobe->flags |= UPROBE_RUN_HANDLER; + } + } + + mutex_unlock(uprobes_hash(inode)); + put_uprobe(uprobe); + + return ret; +} + +/* + * uprobe_unregister - unregister a already registered probe. + * @inode: the file in which the probe has to be removed. + * @offset: offset from the start of the file. + * @uc: identify which probe if multiple probes are colocated. + */ +void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +{ + struct uprobe *uprobe; + + if (!inode || !uc) + return; + + uprobe = find_uprobe(inode, offset); + if (!uprobe) + return; + + mutex_lock(uprobes_hash(inode)); + + if (consumer_del(uprobe, uc)) { + if (!uprobe->consumers) { + __uprobe_unregister(uprobe); + uprobe->flags &= ~UPROBE_RUN_HANDLER; + } + } + + mutex_unlock(uprobes_hash(inode)); + if (uprobe) + put_uprobe(uprobe); +} + +/* + * Of all the nodes that correspond to the given inode, return the node + * with the least offset. + */ +static struct rb_node *find_least_offset_node(struct inode *inode) +{ + struct uprobe u = { .inode = inode, .offset = 0}; + struct rb_node *n = uprobes_tree.rb_node; + struct rb_node *close_node = NULL; + struct uprobe *uprobe; + int match; + + while (n) { + uprobe = rb_entry(n, struct uprobe, rb_node); + match = match_uprobe(&u, uprobe); + + if (uprobe->inode == inode) + close_node = n; + + if (!match) + return close_node; + + if (match < 0) + n = n->rb_left; + else + n = n->rb_right; + } + + return close_node; +} + +/* + * For a given inode, build a list of probes that need to be inserted. + */ +static void build_probe_list(struct inode *inode, struct list_head *head) +{ + struct uprobe *uprobe; + unsigned long flags; + struct rb_node *n; + + spin_lock_irqsave(&uprobes_treelock, flags); + + n = find_least_offset_node(inode); + + for (; n; n = rb_next(n)) { + uprobe = rb_entry(n, struct uprobe, rb_node); + if (uprobe->inode != inode) + break; + + list_add(&uprobe->pending_list, head); + atomic_inc(&uprobe->ref); + } + + spin_unlock_irqrestore(&uprobes_treelock, flags); +} + +/* + * Called from mmap_region. + * called with mm->mmap_sem acquired. + * + * Return -ve no if we fail to insert probes and we cannot + * bail-out. + * Return 0 otherwise. i.e: + * + * - successful insertion of probes + * - (or) no possible probes to be inserted. + * - (or) insertion of probes failed but we can bail-out. + */ +int uprobe_mmap(struct vm_area_struct *vma) +{ + struct list_head tmp_list; + struct uprobe *uprobe, *u; + struct inode *inode; + int ret, count; + + if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) + return 0; + + inode = vma->vm_file->f_mapping->host; + if (!inode) + return 0; + + INIT_LIST_HEAD(&tmp_list); + mutex_lock(uprobes_mmap_hash(inode)); + build_probe_list(inode, &tmp_list); + + ret = 0; + count = 0; + + list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { + loff_t vaddr; + + list_del(&uprobe->pending_list); + if (!ret) { + vaddr = vma_address(vma, uprobe->offset); + + if (vaddr < vma->vm_start || vaddr >= vma->vm_end) { + put_uprobe(uprobe); + continue; + } + + ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); + + /* Ignore double add: */ + if (ret == -EEXIST) { + ret = 0; + + if (!is_swbp_at_addr(vma->vm_mm, vaddr)) + continue; + + /* + * Unable to insert a breakpoint, but + * breakpoint lies underneath. Increment the + * probe count. + */ + atomic_inc(&vma->vm_mm->uprobes_state.count); + } + + if (!ret) + count++; + } + put_uprobe(uprobe); + } + + mutex_unlock(uprobes_mmap_hash(inode)); + + if (ret) + atomic_sub(count, &vma->vm_mm->uprobes_state.count); + + return ret; +} + +/* + * Called in context of a munmap of a vma. + */ +void uprobe_munmap(struct vm_area_struct *vma) +{ + struct list_head tmp_list; + struct uprobe *uprobe, *u; + struct inode *inode; + + if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) + return; + + if (!atomic_read(&vma->vm_mm->uprobes_state.count)) + return; + + inode = vma->vm_file->f_mapping->host; + if (!inode) + return; + + INIT_LIST_HEAD(&tmp_list); + mutex_lock(uprobes_mmap_hash(inode)); + build_probe_list(inode, &tmp_list); + + list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { + loff_t vaddr; + + list_del(&uprobe->pending_list); + vaddr = vma_address(vma, uprobe->offset); + + if (vaddr >= vma->vm_start && vaddr < vma->vm_end) { + /* + * An unregister could have removed the probe before + * unmap. So check before we decrement the count. + */ + if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1) + atomic_dec(&vma->vm_mm->uprobes_state.count); + } + put_uprobe(uprobe); + } + mutex_unlock(uprobes_mmap_hash(inode)); +} + +/* Slot allocation for XOL */ +static int xol_add_vma(struct xol_area *area) +{ + struct mm_struct *mm; + int ret; + + area->page = alloc_page(GFP_HIGHUSER); + if (!area->page) + return -ENOMEM; + + ret = -EALREADY; + mm = current->mm; + + down_write(&mm->mmap_sem); + if (mm->uprobes_state.xol_area) + goto fail; + + ret = -ENOMEM; + + /* Try to map as high as possible, this is only a hint. */ + area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); + if (area->vaddr & ~PAGE_MASK) { + ret = area->vaddr; + goto fail; + } + + ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, + VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page); + if (ret) + goto fail; + + smp_wmb(); /* pairs with get_xol_area() */ + mm->uprobes_state.xol_area = area; + ret = 0; + +fail: + up_write(&mm->mmap_sem); + if (ret) + __free_page(area->page); + + return ret; +} + +static struct xol_area *get_xol_area(struct mm_struct *mm) +{ + struct xol_area *area; + + area = mm->uprobes_state.xol_area; + smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ + + return area; +} + +/* + * xol_alloc_area - Allocate process's xol_area. + * This area will be used for storing instructions for execution out of + * line. + * + * Returns the allocated area or NULL. + */ +static struct xol_area *xol_alloc_area(void) +{ + struct xol_area *area; + + area = kzalloc(sizeof(*area), GFP_KERNEL); + if (unlikely(!area)) + return NULL; + + area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); + + if (!area->bitmap) + goto fail; + + init_waitqueue_head(&area->wq); + if (!xol_add_vma(area)) + return area; + +fail: + kfree(area->bitmap); + kfree(area); + + return get_xol_area(current->mm); +} + +/* + * uprobe_clear_state - Free the area allocated for slots. + */ +void uprobe_clear_state(struct mm_struct *mm) +{ + struct xol_area *area = mm->uprobes_state.xol_area; + + if (!area) + return; + + put_page(area->page); + kfree(area->bitmap); + kfree(area); +} + +/* + * uprobe_reset_state - Free the area allocated for slots. + */ +void uprobe_reset_state(struct mm_struct *mm) +{ + mm->uprobes_state.xol_area = NULL; + atomic_set(&mm->uprobes_state.count, 0); +} + +/* + * - search for a free slot. + */ +static unsigned long xol_take_insn_slot(struct xol_area *area) +{ + unsigned long slot_addr; + int slot_nr; + + do { + slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); + if (slot_nr < UINSNS_PER_PAGE) { + if (!test_and_set_bit(slot_nr, area->bitmap)) + break; + + slot_nr = UINSNS_PER_PAGE; + continue; + } + wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE)); + } while (slot_nr >= UINSNS_PER_PAGE); + + slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES); + atomic_inc(&area->slot_count); + + return slot_addr; +} + +/* + * xol_get_insn_slot - If was not allocated a slot, then + * allocate a slot. + * Returns the allocated slot address or 0. + */ +static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr) +{ + struct xol_area *area; + unsigned long offset; + void *vaddr; + + area = get_xol_area(current->mm); + if (!area) { + area = xol_alloc_area(); + if (!area) + return 0; + } + current->utask->xol_vaddr = xol_take_insn_slot(area); + + /* + * Initialize the slot if xol_vaddr points to valid + * instruction slot. + */ + if (unlikely(!current->utask->xol_vaddr)) + return 0; + + current->utask->vaddr = slot_addr; + offset = current->utask->xol_vaddr & ~PAGE_MASK; + vaddr = kmap_atomic(area->page); + memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); + kunmap_atomic(vaddr); + + return current->utask->xol_vaddr; +} + +/* + * xol_free_insn_slot - If slot was earlier allocated by + * @xol_get_insn_slot(), make the slot available for + * subsequent requests. + */ +static void xol_free_insn_slot(struct task_struct *tsk) +{ + struct xol_area *area; + unsigned long vma_end; + unsigned long slot_addr; + + if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask) + return; + + slot_addr = tsk->utask->xol_vaddr; + + if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr))) + return; + + area = tsk->mm->uprobes_state.xol_area; + vma_end = area->vaddr + PAGE_SIZE; + if (area->vaddr <= slot_addr && slot_addr < vma_end) { + unsigned long offset; + int slot_nr; + + offset = slot_addr - area->vaddr; + slot_nr = offset / UPROBE_XOL_SLOT_BYTES; + if (slot_nr >= UINSNS_PER_PAGE) + return; + + clear_bit(slot_nr, area->bitmap); + atomic_dec(&area->slot_count); + if (waitqueue_active(&area->wq)) + wake_up(&area->wq); + + tsk->utask->xol_vaddr = 0; + } +} + +/** + * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs + * @regs: Reflects the saved state of the task after it has hit a breakpoint + * instruction. + * Return the address of the breakpoint instruction. + */ +unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) +{ + return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; +} + +/* + * Called with no locks held. + * Called in context of a exiting or a exec-ing thread. + */ +void uprobe_free_utask(struct task_struct *t) +{ + struct uprobe_task *utask = t->utask; + + if (t->uprobe_srcu_id != -1) + srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id); + + if (!utask) + return; + + if (utask->active_uprobe) + put_uprobe(utask->active_uprobe); + + xol_free_insn_slot(t); + kfree(utask); + t->utask = NULL; +} + +/* + * Called in context of a new clone/fork from copy_process. + */ +void uprobe_copy_process(struct task_struct *t) +{ + t->utask = NULL; + t->uprobe_srcu_id = -1; +} + +/* + * Allocate a uprobe_task object for the task. + * Called when the thread hits a breakpoint for the first time. + * + * Returns: + * - pointer to new uprobe_task on success + * - NULL otherwise + */ +static struct uprobe_task *add_utask(void) +{ + struct uprobe_task *utask; + + utask = kzalloc(sizeof *utask, GFP_KERNEL); + if (unlikely(!utask)) + return NULL; + + utask->active_uprobe = NULL; + current->utask = utask; + return utask; +} + +/* Prepare to single-step probed instruction out of line. */ +static int +pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr) +{ + if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs)) + return 0; + + return -EFAULT; +} + +/* + * If we are singlestepping, then ensure this thread is not connected to + * non-fatal signals until completion of singlestep. When xol insn itself + * triggers the signal, restart the original insn even if the task is + * already SIGKILL'ed (since coredump should report the correct ip). This + * is even more important if the task has a handler for SIGSEGV/etc, The + * _same_ instruction should be repeated again after return from the signal + * handler, and SSTEP can never finish in this case. + */ +bool uprobe_deny_signal(void) +{ + struct task_struct *t = current; + struct uprobe_task *utask = t->utask; + + if (likely(!utask || !utask->active_uprobe)) + return false; + + WARN_ON_ONCE(utask->state != UTASK_SSTEP); + + if (signal_pending(t)) { + spin_lock_irq(&t->sighand->siglock); + clear_tsk_thread_flag(t, TIF_SIGPENDING); + spin_unlock_irq(&t->sighand->siglock); + + if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { + utask->state = UTASK_SSTEP_TRAPPED; + set_tsk_thread_flag(t, TIF_UPROBE); + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + } + } + + return true; +} + +/* + * Avoid singlestepping the original instruction if the original instruction + * is a NOP or can be emulated. + */ +static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) +{ + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) + return true; + + uprobe->flags &= ~UPROBE_SKIP_SSTEP; + return false; +} + +/* + * Run handler and ask thread to singlestep. + * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. + */ +static void handle_swbp(struct pt_regs *regs) +{ + struct vm_area_struct *vma; + struct uprobe_task *utask; + struct uprobe *uprobe; + struct mm_struct *mm; + unsigned long bp_vaddr; + + uprobe = NULL; + bp_vaddr = uprobe_get_swbp_addr(regs); + mm = current->mm; + down_read(&mm->mmap_sem); + vma = find_vma(mm, bp_vaddr); + + if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) { + struct inode *inode; + loff_t offset; + + inode = vma->vm_file->f_mapping->host; + offset = bp_vaddr - vma->vm_start; + offset += (vma->vm_pgoff << PAGE_SHIFT); + uprobe = find_uprobe(inode, offset); + } + + srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); + current->uprobe_srcu_id = -1; + up_read(&mm->mmap_sem); + + if (!uprobe) { + /* No matching uprobe; signal SIGTRAP. */ + send_sig(SIGTRAP, current, 0); + return; + } + + utask = current->utask; + if (!utask) { + utask = add_utask(); + /* Cannot allocate; re-execute the instruction. */ + if (!utask) + goto cleanup_ret; + } + utask->active_uprobe = uprobe; + handler_chain(uprobe, regs); + if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs)) + goto cleanup_ret; + + utask->state = UTASK_SSTEP; + if (!pre_ssout(uprobe, regs, bp_vaddr)) { + user_enable_single_step(current); + return; + } + +cleanup_ret: + if (utask) { + utask->active_uprobe = NULL; + utask->state = UTASK_RUNNING; + } + if (uprobe) { + if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) + + /* + * cannot singlestep; cannot skip instruction; + * re-execute the instruction. + */ + instruction_pointer_set(regs, bp_vaddr); + + put_uprobe(uprobe); + } +} + +/* + * Perform required fix-ups and disable singlestep. + * Allow pending signals to take effect. + */ +static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) +{ + struct uprobe *uprobe; + + uprobe = utask->active_uprobe; + if (utask->state == UTASK_SSTEP_ACK) + arch_uprobe_post_xol(&uprobe->arch, regs); + else if (utask->state == UTASK_SSTEP_TRAPPED) + arch_uprobe_abort_xol(&uprobe->arch, regs); + else + WARN_ON_ONCE(1); + + put_uprobe(uprobe); + utask->active_uprobe = NULL; + utask->state = UTASK_RUNNING; + user_disable_single_step(current); + xol_free_insn_slot(current); + + spin_lock_irq(¤t->sighand->siglock); + recalc_sigpending(); /* see uprobe_deny_signal() */ + spin_unlock_irq(¤t->sighand->siglock); +} + +/* + * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag. (and on + * subsequent probe hits on the thread sets the state to UTASK_BP_HIT) and + * allows the thread to return from interrupt. + * + * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag and + * also sets the state to UTASK_SSTEP_ACK and allows the thread to return from + * interrupt. + * + * While returning to userspace, thread notices the TIF_UPROBE flag and calls + * uprobe_notify_resume(). + */ +void uprobe_notify_resume(struct pt_regs *regs) +{ + struct uprobe_task *utask; + + utask = current->utask; + if (!utask || utask->state == UTASK_BP_HIT) + handle_swbp(regs); + else + handle_singlestep(utask, regs); +} + +/* + * uprobe_pre_sstep_notifier gets called from interrupt context as part of + * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit. + */ +int uprobe_pre_sstep_notifier(struct pt_regs *regs) +{ + struct uprobe_task *utask; + + if (!current->mm || !atomic_read(¤t->mm->uprobes_state.count)) + /* task is currently not uprobed */ + return 0; + + utask = current->utask; + if (utask) + utask->state = UTASK_BP_HIT; + + set_thread_flag(TIF_UPROBE); + current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu); + + return 1; +} + +/* + * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier + * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep. + */ +int uprobe_post_sstep_notifier(struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + if (!current->mm || !utask || !utask->active_uprobe) + /* task is currently not uprobed */ + return 0; + + utask->state = UTASK_SSTEP_ACK; + set_thread_flag(TIF_UPROBE); + return 1; +} + +static struct notifier_block uprobe_exception_nb = { + .notifier_call = arch_uprobe_exception_notify, + .priority = INT_MAX-1, /* notified after kprobes, kgdb */ +}; + +static int __init init_uprobes(void) +{ + int i; + + for (i = 0; i < UPROBES_HASH_SZ; i++) { + mutex_init(&uprobes_mutex[i]); + mutex_init(&uprobes_mmap_mutex[i]); + } + init_srcu_struct(&uprobes_srcu); + + return register_die_notifier(&uprobe_exception_nb); +} +module_init(init_uprobes); + +static void __exit exit_uprobes(void) +{ +} +module_exit(exit_uprobes); diff --git a/kernel/fork.c b/kernel/fork.c index b9372a0bff18..ca9a3845ef3e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -67,6 +67,7 @@ #include <linux/oom.h> #include <linux/khugepaged.h> #include <linux/signalfd.h> +#include <linux/uprobes.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -421,6 +422,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (retval) goto out; + + if (file && uprobe_mmap(tmp)) + goto out; } /* a new mm has just been created */ arch_dup_mmap(oldmm, mm); @@ -569,6 +573,7 @@ void mmput(struct mm_struct *mm) might_sleep(); if (atomic_dec_and_test(&mm->mm_users)) { + uprobe_clear_state(mm); exit_aio(mm); ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ @@ -747,6 +752,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) exit_pi_state_list(tsk); #endif + uprobe_free_utask(tsk); + /* Get rid of any cached register state */ deactivate_mm(tsk, mm); @@ -801,6 +808,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk) #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; #endif + uprobe_reset_state(mm); if (!mm_init(mm, tsk)) goto fail_nomem; @@ -1342,6 +1350,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->pi_state_list); p->pi_state_cache = NULL; #endif + uprobe_copy_process(p); /* * sigaltstack should be cleared when sharing the same VM */ diff --git a/kernel/signal.c b/kernel/signal.c index 17afcaf582d0..60d80ab2601c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -29,6 +29,7 @@ #include <linux/pid_namespace.h> #include <linux/nsproxy.h> #include <linux/user_namespace.h> +#include <linux/uprobes.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -2202,6 +2203,9 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct signal_struct *signal = current->signal; int signr; + if (unlikely(uprobe_deny_signal())) + return 0; + relock: /* * We'll jump back here after any time we were stopped in TASK_STOPPED. diff --git a/mm/mmap.c b/mm/mmap.c index a7bf6a31c9f6..b17a39f31a5e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -30,6 +30,7 @@ #include <linux/perf_event.h> #include <linux/audit.h> #include <linux/khugepaged.h> +#include <linux/uprobes.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -217,6 +218,7 @@ void unlink_file_vma(struct vm_area_struct *vma) mutex_lock(&mapping->i_mmap_mutex); __remove_shared_vm_struct(vma, file, mapping); mutex_unlock(&mapping->i_mmap_mutex); + uprobe_munmap(vma); } } @@ -544,8 +546,14 @@ again: remove_next = 1 + (end > next->vm_end); if (file) { mapping = file->f_mapping; - if (!(vma->vm_flags & VM_NONLINEAR)) + if (!(vma->vm_flags & VM_NONLINEAR)) { root = &mapping->i_mmap; + uprobe_munmap(vma); + + if (adjust_next) + uprobe_munmap(next); + } + mutex_lock(&mapping->i_mmap_mutex); if (insert) { /* @@ -615,8 +623,16 @@ again: remove_next = 1 + (end > next->vm_end); if (mapping) mutex_unlock(&mapping->i_mmap_mutex); + if (root) { + uprobe_mmap(vma); + + if (adjust_next) + uprobe_mmap(next); + } + if (remove_next) { if (file) { + uprobe_munmap(next); fput(file); if (next->vm_flags & VM_EXECUTABLE) removed_exe_file_vma(mm); @@ -636,6 +652,8 @@ again: remove_next = 1 + (end > next->vm_end); goto again; } } + if (insert && file) + uprobe_mmap(insert); validate_mm(mm); @@ -1344,6 +1362,11 @@ out: mm->locked_vm += (len >> PAGE_SHIFT); } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) make_pages_present(addr, addr + len); + + if (file && uprobe_mmap(vma)) + /* matching probes but cannot insert */ + goto unmap_and_free_vma; + return addr; unmap_and_free_vma: @@ -2311,6 +2334,10 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) if ((vma->vm_flags & VM_ACCOUNT) && security_vm_enough_memory_mm(mm, vma_pages(vma))) return -ENOMEM; + + if (vma->vm_file && uprobe_mmap(vma)) + return -EINVAL; + vma_link(mm, vma, prev, rb_link, rb_parent); return 0; } @@ -2380,6 +2407,10 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma->vm_pgoff = pgoff; if (new_vma->vm_file) { get_file(new_vma->vm_file); + + if (uprobe_mmap(new_vma)) + goto out_free_mempol; + if (vma->vm_flags & VM_EXECUTABLE) added_exe_file_vma(mm); } |