diff options
Diffstat (limited to 'arch/ia64/kernel')
27 files changed, 1780 insertions, 379 deletions
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index c1a02bbc252c..e1fb68ddec26 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -17,9 +17,12 @@ obj-$(CONFIG_IA64_PALINFO) += palinfo.o obj-$(CONFIG_IOSAPIC) += iosapic.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_SMP) += smp.o smpboot.o domain.o +obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o obj-$(CONFIG_IA64_CYCLONE) += cyclone.o obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o +obj-$(CONFIG_KPROBES) += kprobes.o jprobes.o +obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR) += uncached.o mca_recovery-y += mca_drv.o mca_drv_asm.o # The gate DSO image is built using a special linker script. diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 7513ff9361a0..d362ecf5381b 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -242,9 +242,7 @@ acpi_parse_iosapic (acpi_table_entry_header *header, const unsigned long end) if (BAD_MADT_ENTRY(iosapic, end)) return -EINVAL; - iosapic_init(iosapic->address, iosapic->global_irq_base); - - return 0; + return iosapic_init(iosapic->address, iosapic->global_irq_base); } @@ -700,9 +698,11 @@ acpi_boot_init (void) if (smp_boot_data.cpu_phys_id[cpu] != hard_smp_processor_id()) node_cpuid[i++].phys_id = smp_boot_data.cpu_phys_id[cpu]; } - build_cpu_to_node_map(); # endif #endif +#ifdef CONFIG_ACPI_NUMA + build_cpu_to_node_map(); +#endif /* Make boot-up look pretty */ printk(KERN_INFO "%d CPUs available, %d CPUs total\n", available_cpus, total_cpus); return 0; @@ -830,7 +830,7 @@ EXPORT_SYMBOL(acpi_unmap_lsapic); #ifdef CONFIG_ACPI_NUMA -acpi_status __init +acpi_status __devinit acpi_map_iosapic (acpi_handle handle, u32 depth, void *context, void **ret) { struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; @@ -883,4 +883,28 @@ acpi_map_iosapic (acpi_handle handle, u32 depth, void *context, void **ret) return AE_OK; } #endif /* CONFIG_NUMA */ + +int +acpi_register_ioapic (acpi_handle handle, u64 phys_addr, u32 gsi_base) +{ + int err; + + if ((err = iosapic_init(phys_addr, gsi_base))) + return err; + +#if CONFIG_ACPI_NUMA + acpi_map_iosapic(handle, 0, NULL, NULL); +#endif /* CONFIG_ACPI_NUMA */ + + return 0; +} +EXPORT_SYMBOL(acpi_register_ioapic); + +int +acpi_unregister_ioapic (acpi_handle handle, u32 gsi_base) +{ + return iosapic_remove(gsi_base); +} +EXPORT_SYMBOL(acpi_unregister_ioapic); + #endif /* CONFIG_ACPI_BOOT */ diff --git a/arch/ia64/kernel/domain.c b/arch/ia64/kernel/domain.c index fe532c970438..d65e87b6394f 100644 --- a/arch/ia64/kernel/domain.c +++ b/arch/ia64/kernel/domain.c @@ -14,7 +14,7 @@ #include <linux/topology.h> #include <linux/nodemask.h> -#define SD_NODES_PER_DOMAIN 6 +#define SD_NODES_PER_DOMAIN 16 #ifdef CONFIG_NUMA /** @@ -27,7 +27,7 @@ * * Should use nodemask_t. */ -static int __devinit find_next_best_node(int node, unsigned long *used_nodes) +static int find_next_best_node(int node, unsigned long *used_nodes) { int i, n, val, min_val, best_node = 0; @@ -66,7 +66,7 @@ static int __devinit find_next_best_node(int node, unsigned long *used_nodes) * should be one that prevents unnecessary balancing, but also spreads tasks * out optimally. */ -static cpumask_t __devinit sched_domain_node_span(int node) +static cpumask_t sched_domain_node_span(int node) { int i; cpumask_t span, nodemask; @@ -96,7 +96,7 @@ static cpumask_t __devinit sched_domain_node_span(int node) #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static struct sched_group sched_group_cpus[NR_CPUS]; -static int __devinit cpu_to_cpu_group(int cpu) +static int cpu_to_cpu_group(int cpu) { return cpu; } @@ -104,7 +104,7 @@ static int __devinit cpu_to_cpu_group(int cpu) static DEFINE_PER_CPU(struct sched_domain, phys_domains); static struct sched_group sched_group_phys[NR_CPUS]; -static int __devinit cpu_to_phys_group(int cpu) +static int cpu_to_phys_group(int cpu) { #ifdef CONFIG_SCHED_SMT return first_cpu(cpu_sibling_map[cpu]); @@ -125,44 +125,36 @@ static struct sched_group *sched_group_nodes[MAX_NUMNODES]; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); static struct sched_group sched_group_allnodes[MAX_NUMNODES]; -static int __devinit cpu_to_allnodes_group(int cpu) +static int cpu_to_allnodes_group(int cpu) { return cpu_to_node(cpu); } #endif /* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. + * Build sched domains for a given set of cpus and attach the sched domains + * to the individual cpus */ -void __devinit arch_init_sched_domains(void) +void build_sched_domains(const cpumask_t *cpu_map) { int i; - cpumask_t cpu_default_map; /* - * Setup mask for cpus without special case scheduling requirements. - * For now this just excludes isolated cpus, but could be used to - * exclude other special cases in the future. + * Set up domains for cpus specified by the cpu_map. */ - cpus_complement(cpu_default_map, cpu_isolated_map); - cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); - - /* - * Set up domains. Isolated domains just stay on the dummy domain. - */ - for_each_cpu_mask(i, cpu_default_map) { + for_each_cpu_mask(i, *cpu_map) { int group; struct sched_domain *sd = NULL, *p; cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); - cpus_and(nodemask, nodemask, cpu_default_map); + cpus_and(nodemask, nodemask, *cpu_map); #ifdef CONFIG_NUMA if (num_online_cpus() > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { sd = &per_cpu(allnodes_domains, i); *sd = SD_ALLNODES_INIT; - sd->span = cpu_default_map; + sd->span = *cpu_map; group = cpu_to_allnodes_group(i); sd->groups = &sched_group_allnodes[group]; p = sd; @@ -173,7 +165,7 @@ void __devinit arch_init_sched_domains(void) *sd = SD_NODE_INIT; sd->span = sched_domain_node_span(cpu_to_node(i)); sd->parent = p; - cpus_and(sd->span, sd->span, cpu_default_map); + cpus_and(sd->span, sd->span, *cpu_map); #endif p = sd; @@ -190,7 +182,7 @@ void __devinit arch_init_sched_domains(void) group = cpu_to_cpu_group(i); *sd = SD_SIBLING_INIT; sd->span = cpu_sibling_map[i]; - cpus_and(sd->span, sd->span, cpu_default_map); + cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; sd->groups = &sched_group_cpus[group]; #endif @@ -198,9 +190,9 @@ void __devinit arch_init_sched_domains(void) #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_cpu_mask(i, cpu_default_map) { + for_each_cpu_mask(i, *cpu_map) { cpumask_t this_sibling_map = cpu_sibling_map[i]; - cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); + cpus_and(this_sibling_map, this_sibling_map, *cpu_map); if (i != first_cpu(this_sibling_map)) continue; @@ -213,7 +205,7 @@ void __devinit arch_init_sched_domains(void) for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); - cpus_and(nodemask, nodemask, cpu_default_map); + cpus_and(nodemask, nodemask, *cpu_map); if (cpus_empty(nodemask)) continue; @@ -222,7 +214,7 @@ void __devinit arch_init_sched_domains(void) } #ifdef CONFIG_NUMA - init_sched_build_groups(sched_group_allnodes, cpu_default_map, + init_sched_build_groups(sched_group_allnodes, *cpu_map, &cpu_to_allnodes_group); for (i = 0; i < MAX_NUMNODES; i++) { @@ -233,12 +225,12 @@ void __devinit arch_init_sched_domains(void) cpumask_t covered = CPU_MASK_NONE; int j; - cpus_and(nodemask, nodemask, cpu_default_map); + cpus_and(nodemask, nodemask, *cpu_map); if (cpus_empty(nodemask)) continue; domainspan = sched_domain_node_span(i); - cpus_and(domainspan, domainspan, cpu_default_map); + cpus_and(domainspan, domainspan, *cpu_map); sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); sched_group_nodes[i] = sg; @@ -266,7 +258,7 @@ void __devinit arch_init_sched_domains(void) int n = (i + j) % MAX_NUMNODES; cpus_complement(notcovered, covered); - cpus_and(tmp, notcovered, cpu_default_map); + cpus_and(tmp, notcovered, *cpu_map); cpus_and(tmp, tmp, domainspan); if (cpus_empty(tmp)) break; @@ -293,7 +285,7 @@ void __devinit arch_init_sched_domains(void) #endif /* Calculate CPU power for physical packages and nodes */ - for_each_cpu_mask(i, cpu_default_map) { + for_each_cpu_mask(i, *cpu_map) { int power; struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT @@ -359,13 +351,35 @@ next_sg: cpu_attach_domain(sd, i); } } +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + */ +void arch_init_sched_domains(const cpumask_t *cpu_map) +{ + cpumask_t cpu_default_map; + + /* + * Setup mask for cpus without special case scheduling requirements. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. + */ + cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); + + build_sched_domains(&cpu_default_map); +} -void __devinit arch_destroy_sched_domains(void) +void arch_destroy_sched_domains(const cpumask_t *cpu_map) { #ifdef CONFIG_NUMA int i; for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t nodemask = node_to_cpumask(i); struct sched_group *oldsg, *sg = sched_group_nodes[i]; + + cpus_and(nodemask, nodemask, *cpu_map); + if (cpus_empty(nodemask)) + continue; + if (sg == NULL) continue; sg = sg->next; diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 4a3b1aac43e7..179f230816ed 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -410,6 +410,38 @@ efi_memmap_walk (efi_freemem_callback_t callback, void *arg) } /* + * Walk the EFI memory map to pull out leftover pages in the lower + * memory regions which do not end up in the regular memory map and + * stick them into the uncached allocator + * + * The regular walk function is significantly more complex than the + * uncached walk which means it really doesn't make sense to try and + * marge the two. + */ +void __init +efi_memmap_walk_uc (efi_freemem_callback_t callback) +{ + void *efi_map_start, *efi_map_end, *p; + efi_memory_desc_t *md; + u64 efi_desc_size, start, end; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + if (md->attribute == EFI_MEMORY_UC) { + start = PAGE_ALIGN(md->phys_addr); + end = PAGE_ALIGN((md->phys_addr+(md->num_pages << EFI_PAGE_SHIFT)) & PAGE_MASK); + if ((*callback)(start, end, NULL) < 0) + return; + } + } +} + + +/* * Look for the PAL_CODE region reported by EFI and maps it using an * ITR to enable safe PAL calls in virtual mode. See IA-64 Processor * Abstraction Layer chapter 11 in ADAG diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index d99316c9be28..9be53e1ea404 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -470,18 +470,6 @@ ENTRY(load_switch_stack) br.cond.sptk.many b7 END(load_switch_stack) -GLOBAL_ENTRY(__ia64_syscall) - .regstk 6,0,0,0 - mov r15=in5 // put syscall number in place - break __BREAK_SYSCALL - movl r2=errno - cmp.eq p6,p7=-1,r10 - ;; -(p6) st4 [r2]=r8 -(p6) mov r8=-1 - br.ret.sptk.many rp -END(__ia64_syscall) - GLOBAL_ENTRY(execve) mov r15=__NR_execve // put syscall number in place break __BREAK_SYSCALL @@ -637,7 +625,7 @@ END(ia64_ret_from_syscall) * r8-r11: restored (syscall return value(s)) * r12: restored (user-level stack pointer) * r13: restored (user-level thread pointer) - * r14: cleared + * r14: set to __kernel_syscall_via_epc * r15: restored (syscall #) * r16-r17: cleared * r18: user-level b6 @@ -658,7 +646,7 @@ END(ia64_ret_from_syscall) * pr: restored (user-level pr) * b0: restored (user-level rp) * b6: restored - * b7: cleared + * b7: set to __kernel_syscall_via_epc * ar.unat: restored (user-level ar.unat) * ar.pfs: restored (user-level ar.pfs) * ar.rsc: restored (user-level ar.rsc) @@ -704,72 +692,79 @@ ENTRY(ia64_leave_syscall) ;; (p6) ld4 r31=[r18] // load current_thread_info()->flags ld8 r19=[r2],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs" - mov b7=r0 // clear b7 + nop.i 0 ;; - ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE) // load ar.bspstore (may be garbage) + mov r16=ar.bsp // M2 get existing backing store pointer ld8 r18=[r2],PT(R9)-PT(B6) // load b6 (p6) and r15=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE? ;; - mov r16=ar.bsp // M2 get existing backing store pointer + ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE) // load ar.bspstore (may be garbage) (p6) cmp4.ne.unc p6,p0=r15, r0 // any special work pending? (p6) br.cond.spnt .work_pending_syscall ;; // start restoring the state saved on the kernel stack (struct pt_regs): ld8 r9=[r2],PT(CR_IPSR)-PT(R9) ld8 r11=[r3],PT(CR_IIP)-PT(R11) - mov f6=f0 // clear f6 +(pNonSys) break 0 // bug check: we shouldn't be here if pNonSys is TRUE! ;; invala // M0|1 invalidate ALAT - rsm psr.i | psr.ic // M2 initiate turning off of interrupt and interruption collection - mov f9=f0 // clear f9 + rsm psr.i | psr.ic // M2 turn off interrupts and interruption collection + cmp.eq p9,p0=r0,r0 // A set p9 to indicate that we should restore cr.ifs - ld8 r29=[r2],16 // load cr.ipsr - ld8 r28=[r3],16 // load cr.iip - mov f8=f0 // clear f8 + ld8 r29=[r2],16 // M0|1 load cr.ipsr + ld8 r28=[r3],16 // M0|1 load cr.iip + mov r22=r0 // A clear r22 ;; ld8 r30=[r2],16 // M0|1 load cr.ifs ld8 r25=[r3],16 // M0|1 load ar.unat - cmp.eq p9,p0=r0,r0 // set p9 to indicate that we should restore cr.ifs +(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13 ;; ld8 r26=[r2],PT(B0)-PT(AR_PFS) // M0|1 load ar.pfs -(pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled - mov f10=f0 // clear f10 +(pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled + nop 0 ;; - ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // load b0 - ld8 r27=[r3],PT(PR)-PT(AR_RSC) // load ar.rsc - mov f11=f0 // clear f11 + ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // M0|1 load b0 + ld8 r27=[r3],PT(PR)-PT(AR_RSC) // M0|1 load ar.rsc + mov f6=f0 // F clear f6 ;; - ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT) // load ar.rnat (may be garbage) - ld8 r31=[r3],PT(R1)-PT(PR) // load predicates -(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13 + ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT) // M0|1 load ar.rnat (may be garbage) + ld8 r31=[r3],PT(R1)-PT(PR) // M0|1 load predicates + mov f7=f0 // F clear f7 ;; - ld8 r20=[r2],PT(R12)-PT(AR_FPSR) // load ar.fpsr - ld8.fill r1=[r3],16 // load r1 -(pUStk) mov r17=1 + ld8 r20=[r2],PT(R12)-PT(AR_FPSR) // M0|1 load ar.fpsr + ld8.fill r1=[r3],16 // M0|1 load r1 +(pUStk) mov r17=1 // A ;; - srlz.d // M0 ensure interruption collection is off - ld8.fill r13=[r3],16 - mov f7=f0 // clear f7 +(pUStk) st1 [r14]=r17 // M2|3 + ld8.fill r13=[r3],16 // M0|1 + mov f8=f0 // F clear f8 ;; - ld8.fill r12=[r2] // restore r12 (sp) - mov.m ar.ssd=r0 // M2 clear ar.ssd - mov r22=r0 // clear r22 + ld8.fill r12=[r2] // M0|1 restore r12 (sp) + ld8.fill r15=[r3] // M0|1 restore r15 + mov b6=r18 // I0 restore b6 + + addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0 // A + mov f9=f0 // F clear f9 +(pKStk) br.cond.dpnt.many skip_rbs_switch // B - ld8.fill r15=[r3] // restore r15 -(pUStk) st1 [r14]=r17 - addl r3=THIS_CPU(ia64_phys_stacked_size_p8),r0 + srlz.d // M0 ensure interruption collection is off (for cover) + shr.u r18=r19,16 // I0|1 get byte size of existing "dirty" partition + cover // B add current frame into dirty partition & set cr.ifs ;; -(pUStk) ld4 r17=[r3] // r17 = cpu_data->phys_stacked_size_p8 - mov.m ar.csd=r0 // M2 clear ar.csd - mov b6=r18 // I0 restore b6 +(pUStk) ld4 r17=[r17] // M0|1 r17 = cpu_data->phys_stacked_size_p8 + mov r19=ar.bsp // M2 get new backing store pointer + mov f10=f0 // F clear f10 + + nop.m 0 + movl r14=__kernel_syscall_via_epc // X ;; - mov r14=r0 // clear r14 - shr.u r18=r19,16 // I0|1 get byte size of existing "dirty" partition -(pKStk) br.cond.dpnt.many skip_rbs_switch + mov.m ar.csd=r0 // M2 clear ar.csd + mov.m ar.ccv=r0 // M2 clear ar.ccv + mov b7=r14 // I0 clear b7 (hint with __kernel_syscall_via_epc) - mov.m ar.ccv=r0 // clear ar.ccv -(pNonSys) br.cond.dpnt.many dont_preserve_current_frame - br.cond.sptk.many rbs_switch + mov.m ar.ssd=r0 // M2 clear ar.ssd + mov f11=f0 // F clear f11 + br.cond.sptk.many rbs_switch // B END(ia64_leave_syscall) #ifdef CONFIG_IA32_SUPPORT @@ -885,7 +880,7 @@ GLOBAL_ENTRY(ia64_leave_kernel) ldf.fill f7=[r2],PT(F11)-PT(F7) ldf.fill f8=[r3],32 ;; - srlz.i // ensure interruption collection is off + srlz.d // ensure that inter. collection is off (VHPT is don't care, since text is pinned) mov ar.ccv=r15 ;; ldf.fill f11=[r2] @@ -945,11 +940,10 @@ GLOBAL_ENTRY(ia64_leave_kernel) * NOTE: alloc, loadrs, and cover can't be predicated. */ (pNonSys) br.cond.dpnt dont_preserve_current_frame - -rbs_switch: cover // add current frame into dirty partition and set cr.ifs ;; mov r19=ar.bsp // get new backing store pointer +rbs_switch: sub r16=r16,r18 // krbs = old bsp - size of dirty partition cmp.ne p9,p0=r0,r0 // clear p9 to skip restore of cr.ifs ;; @@ -1024,14 +1018,14 @@ rse_clear_invalid: mov loc5=0 mov loc6=0 mov loc7=0 -(pRecurse) br.call.sptk.few b0=rse_clear_invalid +(pRecurse) br.call.dptk.few b0=rse_clear_invalid ;; mov loc8=0 mov loc9=0 cmp.ne pReturn,p0=r0,in1 // if recursion count != 0, we need to do a br.ret mov loc10=0 mov loc11=0 -(pReturn) br.ret.sptk.many b0 +(pReturn) br.ret.dptk.many b0 #endif /* !CONFIG_ITANIUM */ # undef pRecurse # undef pReturn @@ -1255,7 +1249,7 @@ ENTRY(sys_rt_sigreturn) stf.spill [r17]=f11 adds out0=16,sp // out0 = &sigscratch br.call.sptk.many rp=ia64_rt_sigreturn -.ret19: .restore sp 0 +.ret19: .restore sp,0 adds sp=16,sp ;; ld8 r9=[sp] // load new ar.unat @@ -1577,11 +1571,11 @@ sys_call_table: data8 sys_add_key data8 sys_request_key data8 sys_keyctl + data8 sys_ioprio_set + data8 sys_ioprio_get // 1275 data8 sys_ni_syscall - data8 sys_ni_syscall // 1275 - data8 sys_ni_syscall - data8 sys_ni_syscall - data8 sys_ni_syscall - data8 sys_ni_syscall + data8 sys_inotify_init + data8 sys_inotify_add_watch + data8 sys_inotify_rm_watch .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls diff --git a/arch/ia64/kernel/entry.h b/arch/ia64/kernel/entry.h index 6d4ecec989b5..78eeb0793419 100644 --- a/arch/ia64/kernel/entry.h +++ b/arch/ia64/kernel/entry.h @@ -60,7 +60,7 @@ .spillsp @priunat,SW(AR_UNAT)+16+(off); \ .spillsp ar.rnat,SW(AR_RNAT)+16+(off); \ .spillsp ar.bspstore,SW(AR_BSPSTORE)+16+(off); \ - .spillsp pr,SW(PR)+16+(off)) + .spillsp pr,SW(PR)+16+(off) #define DO_SAVE_SWITCH_STACK \ movl r28=1f; \ diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index 962b6c4e32b5..7d7684a369d3 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -531,93 +531,114 @@ GLOBAL_ENTRY(fsys_bubble_down) .altrp b6 .body /* - * We get here for syscalls that don't have a lightweight handler. For those, we - * need to bubble down into the kernel and that requires setting up a minimal - * pt_regs structure, and initializing the CPU state more or less as if an - * interruption had occurred. To make syscall-restarts work, we setup pt_regs - * such that cr_iip points to the second instruction in syscall_via_break. - * Decrementing the IP hence will restart the syscall via break and not - * decrementing IP will return us to the caller, as usual. Note that we preserve - * the value of psr.pp rather than initializing it from dcr.pp. This makes it - * possible to distinguish fsyscall execution from other privileged execution. + * We get here for syscalls that don't have a lightweight + * handler. For those, we need to bubble down into the kernel + * and that requires setting up a minimal pt_regs structure, + * and initializing the CPU state more or less as if an + * interruption had occurred. To make syscall-restarts work, + * we setup pt_regs such that cr_iip points to the second + * instruction in syscall_via_break. Decrementing the IP + * hence will restart the syscall via break and not + * decrementing IP will return us to the caller, as usual. + * Note that we preserve the value of psr.pp rather than + * initializing it from dcr.pp. This makes it possible to + * distinguish fsyscall execution from other privileged + * execution. * * On entry: - * - normal fsyscall handler register usage, except that we also have: + * - normal fsyscall handler register usage, except + * that we also have: * - r18: address of syscall entry point * - r21: ar.fpsr * - r26: ar.pfs * - r27: ar.rsc * - r29: psr + * + * We used to clear some PSR bits here but that requires slow + * serialization. Fortuntely, that isn't really necessary. + * The rationale is as follows: we used to clear bits + * ~PSR_PRESERVED_BITS in PSR.L. Since + * PSR_PRESERVED_BITS==PSR.{UP,MFL,MFH,PK,DT,PP,SP,RT,IC}, we + * ended up clearing PSR.{BE,AC,I,DFL,DFH,DI,DB,SI,TB}. + * However, + * + * PSR.BE : already is turned off in __kernel_syscall_via_epc() + * PSR.AC : don't care (kernel normally turns PSR.AC on) + * PSR.I : already turned off by the time fsys_bubble_down gets + * invoked + * PSR.DFL: always 0 (kernel never turns it on) + * PSR.DFH: don't care --- kernel never touches f32-f127 on its own + * initiative + * PSR.DI : always 0 (kernel never turns it on) + * PSR.SI : always 0 (kernel never turns it on) + * PSR.DB : don't care --- kernel never enables kernel-level + * breakpoints + * PSR.TB : must be 0 already; if it wasn't zero on entry to + * __kernel_syscall_via_epc, the branch to fsys_bubble_down + * will trigger a taken branch; the taken-trap-handler then + * converts the syscall into a break-based system-call. */ -# define PSR_PRESERVED_BITS (IA64_PSR_UP | IA64_PSR_MFL | IA64_PSR_MFH | IA64_PSR_PK \ - | IA64_PSR_DT | IA64_PSR_PP | IA64_PSR_SP | IA64_PSR_RT \ - | IA64_PSR_IC) /* - * Reading psr.l gives us only bits 0-31, psr.it, and psr.mc. The rest we have - * to synthesize. + * Reading psr.l gives us only bits 0-31, psr.it, and psr.mc. + * The rest we have to synthesize. */ -# define PSR_ONE_BITS ((3 << IA64_PSR_CPL0_BIT) | (0x1 << IA64_PSR_RI_BIT) \ +# define PSR_ONE_BITS ((3 << IA64_PSR_CPL0_BIT) \ + | (0x1 << IA64_PSR_RI_BIT) \ | IA64_PSR_BN | IA64_PSR_I) - invala - movl r8=PSR_ONE_BITS + invala // M0|1 + movl r14=ia64_ret_from_syscall // X - mov r25=ar.unat // save ar.unat (5 cyc) - movl r9=PSR_PRESERVED_BITS + nop.m 0 + movl r28=__kernel_syscall_via_break // X create cr.iip + ;; - mov ar.rsc=0 // set enforced lazy mode, pl 0, little-endian, loadrs=0 - movl r28=__kernel_syscall_via_break + mov r2=r16 // A get task addr to addl-addressable register + adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 // A + mov r31=pr // I0 save pr (2 cyc) ;; - mov r23=ar.bspstore // save ar.bspstore (12 cyc) - mov r31=pr // save pr (2 cyc) - mov r20=r1 // save caller's gp in r20 + st1 [r16]=r0 // M2|3 clear current->thread.on_ustack flag + addl r22=IA64_RBS_OFFSET,r2 // A compute base of RBS + add r3=TI_FLAGS+IA64_TASK_SIZE,r2 // A ;; - mov r2=r16 // copy current task addr to addl-addressable register - and r9=r9,r29 - mov r19=b6 // save b6 (2 cyc) + ld4 r3=[r3] // M0|1 r3 = current_thread_info()->flags + lfetch.fault.excl.nt1 [r22] // M0|1 prefetch register backing-store + nop.i 0 ;; - mov psr.l=r9 // slam the door (17 cyc to srlz.i) - or r29=r8,r29 // construct cr.ipsr value to save - addl r22=IA64_RBS_OFFSET,r2 // compute base of RBS + mov ar.rsc=0 // M2 set enforced lazy mode, pl 0, LE, loadrs=0 + nop.m 0 + nop.i 0 ;; - // GAS reports a spurious RAW hazard on the read of ar.rnat because it thinks - // we may be reading ar.itc after writing to psr.l. Avoid that message with - // this directive: - dv_serialize_data - mov.m r24=ar.rnat // read ar.rnat (5 cyc lat) - lfetch.fault.excl.nt1 [r22] - adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r2 - - // ensure previous insn group is issued before we stall for srlz.i: + mov r23=ar.bspstore // M2 (12 cyc) save ar.bspstore + mov.m r24=ar.rnat // M2 (5 cyc) read ar.rnat (dual-issues!) + nop.i 0 ;; - srlz.i // ensure new psr.l has been established - ///////////////////////////////////////////////////////////////////////////// - ////////// from this point on, execution is not interruptible anymore - ///////////////////////////////////////////////////////////////////////////// - addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r2 // compute base of memory stack - cmp.ne pKStk,pUStk=r0,r0 // set pKStk <- 0, pUStk <- 1 + mov ar.bspstore=r22 // M2 (6 cyc) switch to kernel RBS + movl r8=PSR_ONE_BITS // X ;; - st1 [r16]=r0 // clear current->thread.on_ustack flag - mov ar.bspstore=r22 // switch to kernel RBS - mov b6=r18 // copy syscall entry-point to b6 (7 cyc) - add r3=TI_FLAGS+IA64_TASK_SIZE,r2 + mov r25=ar.unat // M2 (5 cyc) save ar.unat + mov r19=b6 // I0 save b6 (2 cyc) + mov r20=r1 // A save caller's gp in r20 ;; - ld4 r3=[r3] // r2 = current_thread_info()->flags - mov r18=ar.bsp // save (kernel) ar.bsp (12 cyc) - mov ar.rsc=0x3 // set eager mode, pl 0, little-endian, loadrs=0 - br.call.sptk.many b7=ia64_syscall_setup - ;; - ssm psr.i - movl r2=ia64_ret_from_syscall + or r29=r8,r29 // A construct cr.ipsr value to save + mov b6=r18 // I0 copy syscall entry-point to b6 (7 cyc) + addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r2 // A compute base of memory stack + + mov r18=ar.bsp // M2 save (kernel) ar.bsp (12 cyc) + cmp.ne pKStk,pUStk=r0,r0 // A set pKStk <- 0, pUStk <- 1 + br.call.sptk.many b7=ia64_syscall_setup // B ;; - mov rp=r2 // set the real return addr - and r3=_TIF_SYSCALL_TRACEAUDIT,r3 + mov ar.rsc=0x3 // M2 set eager mode, pl 0, LE, loadrs=0 + mov rp=r14 // I0 set the real return addr + and r3=_TIF_SYSCALL_TRACEAUDIT,r3 // A ;; - cmp.eq p8,p0=r3,r0 + ssm psr.i // M2 we're on kernel stacks now, reenable irqs + cmp.eq p8,p0=r3,r0 // A +(p10) br.cond.spnt.many ia64_ret_from_syscall // B return if bad call-frame or r15 is a NaT -(p10) br.cond.spnt.many ia64_ret_from_syscall // p10==true means out registers are more than 8 -(p8) br.call.sptk.many b6=b6 // ignore this return addr - br.cond.sptk ia64_trace_syscall + nop.m 0 +(p8) br.call.sptk.many b6=b6 // B (ignore return address) + br.cond.spnt ia64_trace_syscall // B END(fsys_bubble_down) .rodata diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S index facf75acdc85..86948ce63e43 100644 --- a/arch/ia64/kernel/gate.S +++ b/arch/ia64/kernel/gate.S @@ -72,38 +72,40 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc) * bundle get executed. The remaining code must be safe even if * they do not get executed. */ - adds r17=-1024,r15 - mov r10=0 // default to successful syscall execution - epc + adds r17=-1024,r15 // A + mov r10=0 // A default to successful syscall execution + epc // B causes split-issue } ;; - rsm psr.be // note: on McKinley "rsm psr.be/srlz.d" is slightly faster than "rum psr.be" - LOAD_FSYSCALL_TABLE(r14) - - mov r16=IA64_KR(CURRENT) // 12 cycle read latency - tnat.nz p10,p9=r15 - mov r19=NR_syscalls-1 + rsm psr.be | psr.i // M2 (5 cyc to srlz.d) + LOAD_FSYSCALL_TABLE(r14) // X ;; - shladd r18=r17,3,r14 - - srlz.d - cmp.ne p8,p0=r0,r0 // p8 <- FALSE - /* Note: if r17 is a NaT, p6 will be set to zero. */ - cmp.geu p6,p7=r19,r17 // (syscall > 0 && syscall < 1024+NR_syscalls)? - ;; -(p6) ld8 r18=[r18] - mov r21=ar.fpsr - add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry - ;; -(p6) mov b7=r18 -(p6) tbit.z p8,p0=r18,0 -(p8) br.dptk.many b7 - -(p6) rsm psr.i - mov r27=ar.rsc - mov r26=ar.pfs + mov r16=IA64_KR(CURRENT) // M2 (12 cyc) + shladd r18=r17,3,r14 // A + mov r19=NR_syscalls-1 // A + ;; + lfetch [r18] // M0|1 + mov r29=psr // M2 (12 cyc) + // If r17 is a NaT, p6 will be zero + cmp.geu p6,p7=r19,r17 // A (sysnr > 0 && sysnr < 1024+NR_syscalls)? + ;; + mov r21=ar.fpsr // M2 (12 cyc) + tnat.nz p10,p9=r15 // I0 + mov.i r26=ar.pfs // I0 (would stall anyhow due to srlz.d...) + ;; + srlz.d // M0 (forces split-issue) ensure PSR.BE==0 +(p6) ld8 r18=[r18] // M0|1 + nop.i 0 + ;; + nop.m 0 +(p6) tbit.z.unc p8,p0=r18,0 // I0 (dual-issues with "mov b7=r18"!) + nop.i 0 ;; - mov r29=psr // read psr (12 cyc load latency) +(p8) ssm psr.i +(p6) mov b7=r18 // I0 +(p8) br.dptk.many b7 // B + + mov r27=ar.rsc // M2 (12 cyc) /* * brl.cond doesn't work as intended because the linker would convert this branch * into a branch to a PLT. Perhaps there will be a way to avoid this with some @@ -111,6 +113,8 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc) * instead. */ #ifdef CONFIG_ITANIUM +(p6) add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry + ;; (p6) ld8 r14=[r14] // r14 <- fsys_bubble_down ;; (p6) mov b7=r14 @@ -118,7 +122,7 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc) #else BRL_COND_FSYS_BUBBLE_DOWN(p6) #endif - + ssm psr.i mov r10=-1 (p10) mov r8=EINVAL (p9) mov r8=ENOSYS diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c index 7bbf019c9867..01572814abe4 100644 --- a/arch/ia64/kernel/ia64_ksyms.c +++ b/arch/ia64/kernel/ia64_ksyms.c @@ -58,9 +58,6 @@ EXPORT_SYMBOL(__strlen_user); EXPORT_SYMBOL(__strncpy_from_user); EXPORT_SYMBOL(__strnlen_user); -#include <asm/unistd.h> -EXPORT_SYMBOL(__ia64_syscall); - /* from arch/ia64/lib */ extern void __divsi3(void); extern void __udivsi3(void); diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c index 40afbfa47eca..8f53915f4ae6 100644 --- a/arch/ia64/kernel/iosapic.c +++ b/arch/ia64/kernel/iosapic.c @@ -129,14 +129,13 @@ static struct iosapic { char __iomem *addr; /* base address of IOSAPIC */ unsigned int gsi_base; /* first GSI assigned to this IOSAPIC */ unsigned short num_rte; /* number of RTE in this IOSAPIC */ + int rtes_inuse; /* # of RTEs in use on this IOSAPIC */ #ifdef CONFIG_NUMA unsigned short node; /* numa node association via pxm */ #endif } iosapic_lists[NR_IOSAPICS]; -static int num_iosapic; - -static unsigned char pcat_compat __initdata; /* 8259 compatibility flag */ +static unsigned char pcat_compat __devinitdata; /* 8259 compatibility flag */ static int iosapic_kmalloc_ok; static LIST_HEAD(free_rte_list); @@ -149,7 +148,7 @@ find_iosapic (unsigned int gsi) { int i; - for (i = 0; i < num_iosapic; i++) { + for (i = 0; i < NR_IOSAPICS; i++) { if ((unsigned) (gsi - iosapic_lists[i].gsi_base) < iosapic_lists[i].num_rte) return i; } @@ -490,8 +489,6 @@ static int iosapic_find_sharable_vector (unsigned long trigger, unsigned long po } } } - if (vector < 0) - panic("%s: out of interrupt vectors!\n", __FUNCTION__); return vector; } @@ -507,6 +504,8 @@ iosapic_reassign_vector (int vector) if (!list_empty(&iosapic_intr_info[vector].rtes)) { new_vector = assign_irq_vector(AUTO_ASSIGN); + if (new_vector < 0) + panic("%s: out of interrupt vectors!\n", __FUNCTION__); printk(KERN_INFO "Reassigning vector %d to %d\n", vector, new_vector); memcpy(&iosapic_intr_info[new_vector], &iosapic_intr_info[vector], sizeof(struct iosapic_intr_info)); @@ -598,6 +597,7 @@ register_intr (unsigned int gsi, int vector, unsigned char delivery, rte->refcnt++; list_add_tail(&rte->rte_list, &iosapic_intr_info[vector].rtes); iosapic_intr_info[vector].count++; + iosapic_lists[index].rtes_inuse++; } else if (vector_is_shared(vector)) { struct iosapic_intr_info *info = &iosapic_intr_info[vector]; @@ -787,7 +787,7 @@ void iosapic_unregister_intr (unsigned int gsi) { unsigned long flags; - int irq, vector; + int irq, vector, index; irq_desc_t *idesc; u32 low32; unsigned long trigger, polarity; @@ -828,6 +828,9 @@ iosapic_unregister_intr (unsigned int gsi) list_del(&rte->rte_list); iosapic_intr_info[vector].count--; iosapic_free_rte(rte); + index = find_iosapic(gsi); + iosapic_lists[index].rtes_inuse--; + WARN_ON(iosapic_lists[index].rtes_inuse < 0); trigger = iosapic_intr_info[vector].trigger; polarity = iosapic_intr_info[vector].polarity; @@ -890,6 +893,8 @@ iosapic_register_platform_intr (u32 int_type, unsigned int gsi, break; case ACPI_INTERRUPT_INIT: vector = assign_irq_vector(AUTO_ASSIGN); + if (vector < 0) + panic("%s: out of interrupt vectors!\n", __FUNCTION__); delivery = IOSAPIC_INIT; break; case ACPI_INTERRUPT_CPEI: @@ -961,30 +966,86 @@ iosapic_system_init (int system_pcat_compat) } } -void __init +static inline int +iosapic_alloc (void) +{ + int index; + + for (index = 0; index < NR_IOSAPICS; index++) + if (!iosapic_lists[index].addr) + return index; + + printk(KERN_WARNING "%s: failed to allocate iosapic\n", __FUNCTION__); + return -1; +} + +static inline void +iosapic_free (int index) +{ + memset(&iosapic_lists[index], 0, sizeof(iosapic_lists[0])); +} + +static inline int +iosapic_check_gsi_range (unsigned int gsi_base, unsigned int ver) +{ + int index; + unsigned int gsi_end, base, end; + + /* check gsi range */ + gsi_end = gsi_base + ((ver >> 16) & 0xff); + for (index = 0; index < NR_IOSAPICS; index++) { + if (!iosapic_lists[index].addr) + continue; + + base = iosapic_lists[index].gsi_base; + end = base + iosapic_lists[index].num_rte - 1; + + if (gsi_base < base && gsi_end < base) + continue;/* OK */ + + if (gsi_base > end && gsi_end > end) + continue; /* OK */ + + return -EBUSY; + } + return 0; +} + +int __devinit iosapic_init (unsigned long phys_addr, unsigned int gsi_base) { - int num_rte; + int num_rte, err, index; unsigned int isa_irq, ver; char __iomem *addr; + unsigned long flags; - addr = ioremap(phys_addr, 0); - ver = iosapic_version(addr); + spin_lock_irqsave(&iosapic_lock, flags); + { + addr = ioremap(phys_addr, 0); + ver = iosapic_version(addr); - /* - * The MAX_REDIR register holds the highest input pin - * number (starting from 0). - * We add 1 so that we can use it for number of pins (= RTEs) - */ - num_rte = ((ver >> 16) & 0xff) + 1; + if ((err = iosapic_check_gsi_range(gsi_base, ver))) { + iounmap(addr); + spin_unlock_irqrestore(&iosapic_lock, flags); + return err; + } + + /* + * The MAX_REDIR register holds the highest input pin + * number (starting from 0). + * We add 1 so that we can use it for number of pins (= RTEs) + */ + num_rte = ((ver >> 16) & 0xff) + 1; - iosapic_lists[num_iosapic].addr = addr; - iosapic_lists[num_iosapic].gsi_base = gsi_base; - iosapic_lists[num_iosapic].num_rte = num_rte; + index = iosapic_alloc(); + iosapic_lists[index].addr = addr; + iosapic_lists[index].gsi_base = gsi_base; + iosapic_lists[index].num_rte = num_rte; #ifdef CONFIG_NUMA - iosapic_lists[num_iosapic].node = MAX_NUMNODES; + iosapic_lists[index].node = MAX_NUMNODES; #endif - num_iosapic++; + } + spin_unlock_irqrestore(&iosapic_lock, flags); if ((gsi_base == 0) && pcat_compat) { /* @@ -995,10 +1056,43 @@ iosapic_init (unsigned long phys_addr, unsigned int gsi_base) for (isa_irq = 0; isa_irq < 16; ++isa_irq) iosapic_override_isa_irq(isa_irq, isa_irq, IOSAPIC_POL_HIGH, IOSAPIC_EDGE); } + return 0; +} + +#ifdef CONFIG_HOTPLUG +int +iosapic_remove (unsigned int gsi_base) +{ + int index, err = 0; + unsigned long flags; + + spin_lock_irqsave(&iosapic_lock, flags); + { + index = find_iosapic(gsi_base); + if (index < 0) { + printk(KERN_WARNING "%s: No IOSAPIC for GSI base %u\n", + __FUNCTION__, gsi_base); + goto out; + } + + if (iosapic_lists[index].rtes_inuse) { + err = -EBUSY; + printk(KERN_WARNING "%s: IOSAPIC for GSI base %u is busy\n", + __FUNCTION__, gsi_base); + goto out; + } + + iounmap(iosapic_lists[index].addr); + iosapic_free(index); + } + out: + spin_unlock_irqrestore(&iosapic_lock, flags); + return err; } +#endif /* CONFIG_HOTPLUG */ #ifdef CONFIG_NUMA -void __init +void __devinit map_iosapic_to_node(unsigned int gsi_base, int node) { int index; diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index 4fe60c7a2e90..6c4d59fd0364 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -63,30 +63,19 @@ EXPORT_SYMBOL(isa_irq_to_vector_map); static unsigned long ia64_vector_mask[BITS_TO_LONGS(IA64_NUM_DEVICE_VECTORS)]; int -assign_irq_vector_nopanic (int irq) +assign_irq_vector (int irq) { int pos, vector; again: pos = find_first_zero_bit(ia64_vector_mask, IA64_NUM_DEVICE_VECTORS); vector = IA64_FIRST_DEVICE_VECTOR + pos; if (vector > IA64_LAST_DEVICE_VECTOR) - return -1; + return -ENOSPC; if (test_and_set_bit(pos, ia64_vector_mask)) goto again; return vector; } -int -assign_irq_vector (int irq) -{ - int vector = assign_irq_vector_nopanic(irq); - - if (vector < 0) - panic("assign_irq_vector: out of interrupt vectors!"); - - return vector; -} - void free_irq_vector (int vector) { diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S index d9c05d53435b..3bb3a13c4047 100644 --- a/arch/ia64/kernel/ivt.S +++ b/arch/ia64/kernel/ivt.S @@ -1,7 +1,7 @@ /* * arch/ia64/kernel/ivt.S * - * Copyright (C) 1998-2001, 2003 Hewlett-Packard Co + * Copyright (C) 1998-2001, 2003, 2005 Hewlett-Packard Co * Stephane Eranian <eranian@hpl.hp.com> * David Mosberger <davidm@hpl.hp.com> * Copyright (C) 2000, 2002-2003 Intel Co @@ -405,17 +405,22 @@ ENTRY(nested_dtlb_miss) * r30: continuation address * r31: saved pr * - * Clobbered: b0, r18, r19, r21, psr.dt (cleared) + * Clobbered: b0, r18, r19, r21, r22, psr.dt (cleared) */ rsm psr.dt // switch to using physical data addressing mov r19=IA64_KR(PT_BASE) // get the page table base address shl r21=r16,3 // shift bit 60 into sign bit + mov r18=cr.itir ;; shr.u r17=r16,61 // get the region number into r17 + extr.u r18=r18,2,6 // get the faulting page size ;; cmp.eq p6,p7=5,r17 // is faulting address in region 5? - shr.u r18=r16,PGDIR_SHIFT // get bits 33-63 of faulting address + add r22=-PAGE_SHIFT,r18 // adjustment for hugetlb address + add r18=PGDIR_SHIFT-PAGE_SHIFT,r18 ;; + shr.u r22=r16,r22 + shr.u r18=r16,r18 (p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place srlz.d @@ -428,7 +433,7 @@ ENTRY(nested_dtlb_miss) (p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8 (p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8) cmp.eq p7,p6=0,r21 // unused address bits all zeroes? - shr.u r18=r16,PMD_SHIFT // shift L2 index into position + shr.u r18=r22,PMD_SHIFT // shift L2 index into position ;; ld8 r17=[r17] // fetch the L1 entry (may be 0) ;; @@ -436,7 +441,7 @@ ENTRY(nested_dtlb_miss) dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry ;; (p7) ld8 r17=[r17] // fetch the L2 entry (may be 0) - shr.u r19=r16,PAGE_SHIFT // shift L3 index into position + shr.u r19=r22,PAGE_SHIFT // shift L3 index into position ;; (p7) cmp.eq.or.andcm p6,p7=r17,r0 // was L2 entry NULL? dep r17=r19,r17,3,(PAGE_SHIFT-3) // compute address of L3 page table entry @@ -687,82 +692,118 @@ ENTRY(break_fault) * to prevent leaking bits from kernel to user level. */ DBG_FAULT(11) - mov r16=IA64_KR(CURRENT) // r16 = current task; 12 cycle read lat. - mov r17=cr.iim - mov r18=__IA64_BREAK_SYSCALL - mov r21=ar.fpsr - mov r29=cr.ipsr - mov r19=b6 - mov r25=ar.unat - mov r27=ar.rsc - mov r26=ar.pfs - mov r28=cr.iip - mov r31=pr // prepare to save predicates - mov r20=r1 - ;; + mov.m r16=IA64_KR(CURRENT) // M2 r16 <- current task (12 cyc) + mov r29=cr.ipsr // M2 (12 cyc) + mov r31=pr // I0 (2 cyc) + + mov r17=cr.iim // M2 (2 cyc) + mov.m r27=ar.rsc // M2 (12 cyc) + mov r18=__IA64_BREAK_SYSCALL // A + + mov.m ar.rsc=0 // M2 + mov.m r21=ar.fpsr // M2 (12 cyc) + mov r19=b6 // I0 (2 cyc) + ;; + mov.m r23=ar.bspstore // M2 (12 cyc) + mov.m r24=ar.rnat // M2 (5 cyc) + mov.i r26=ar.pfs // I0 (2 cyc) + + invala // M0|1 + nop.m 0 // M + mov r20=r1 // A save r1 + + nop.m 0 + movl r30=sys_call_table // X + + mov r28=cr.iip // M2 (2 cyc) + cmp.eq p0,p7=r18,r17 // I0 is this a system call? +(p7) br.cond.spnt non_syscall // B no -> + // + // From this point on, we are definitely on the syscall-path + // and we can use (non-banked) scratch registers. + // +/////////////////////////////////////////////////////////////////////// + mov r1=r16 // A move task-pointer to "addl"-addressable reg + mov r2=r16 // A setup r2 for ia64_syscall_setup + add r9=TI_FLAGS+IA64_TASK_SIZE,r16 // A r9 = ¤t_thread_info()->flags + adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 - cmp.eq p0,p7=r18,r17 // is this a system call? (p7 <- false, if so) -(p7) br.cond.spnt non_syscall + adds r15=-1024,r15 // A subtract 1024 from syscall number + mov r3=NR_syscalls - 1 ;; - ld1 r17=[r16] // load current->thread.on_ustack flag - st1 [r16]=r0 // clear current->thread.on_ustack flag - add r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 // set r1 for MINSTATE_START_SAVE_MIN_VIRT + ld1.bias r17=[r16] // M0|1 r17 = current->thread.on_ustack flag + ld4 r9=[r9] // M0|1 r9 = current_thread_info()->flags + extr.u r8=r29,41,2 // I0 extract ei field from cr.ipsr + + shladd r30=r15,3,r30 // A r30 = sys_call_table + 8*(syscall-1024) + addl r22=IA64_RBS_OFFSET,r1 // A compute base of RBS + cmp.leu p6,p7=r15,r3 // A syscall number in range? ;; - invala - /* adjust return address so we skip over the break instruction: */ + lfetch.fault.excl.nt1 [r22] // M0|1 prefetch RBS +(p6) ld8 r30=[r30] // M0|1 load address of syscall entry point + tnat.nz.or p7,p0=r15 // I0 is syscall nr a NaT? - extr.u r8=r29,41,2 // extract ei field from cr.ipsr - ;; - cmp.eq p6,p7=2,r8 // isr.ei==2? - mov r2=r1 // setup r2 for ia64_syscall_setup - ;; -(p6) mov r8=0 // clear ei to 0 -(p6) adds r28=16,r28 // switch cr.iip to next bundle cr.ipsr.ei wrapped -(p7) adds r8=1,r8 // increment ei to next slot - ;; - cmp.eq pKStk,pUStk=r0,r17 // are we in kernel mode already? - dep r29=r8,r29,41,2 // insert new ei into cr.ipsr + mov.m ar.bspstore=r22 // M2 switch to kernel RBS + cmp.eq p8,p9=2,r8 // A isr.ei==2? ;; - // switch from user to kernel RBS: - MINSTATE_START_SAVE_MIN_VIRT - br.call.sptk.many b7=ia64_syscall_setup - ;; - MINSTATE_END_SAVE_MIN_VIRT // switch to bank 1 - ssm psr.ic | PSR_DEFAULT_BITS - ;; - srlz.i // guarantee that interruption collection is on - mov r3=NR_syscalls - 1 - ;; -(p15) ssm psr.i // restore psr.i - // p10==true means out registers are more than 8 or r15's Nat is true -(p10) br.cond.spnt.many ia64_ret_from_syscall - ;; - movl r16=sys_call_table +(p8) mov r8=0 // A clear ei to 0 +(p7) movl r30=sys_ni_syscall // X - adds r15=-1024,r15 // r15 contains the syscall number---subtract 1024 - movl r2=ia64_ret_from_syscall - ;; - shladd r20=r15,3,r16 // r20 = sys_call_table + 8*(syscall-1024) - cmp.leu p6,p7=r15,r3 // (syscall > 0 && syscall < 1024 + NR_syscalls) ? - mov rp=r2 // set the real return addr +(p8) adds r28=16,r28 // A switch cr.iip to next bundle +(p9) adds r8=1,r8 // A increment ei to next slot + nop.i 0 ;; -(p6) ld8 r20=[r20] // load address of syscall entry point -(p7) movl r20=sys_ni_syscall - add r2=TI_FLAGS+IA64_TASK_SIZE,r13 - ;; - ld4 r2=[r2] // r2 = current_thread_info()->flags - ;; - and r2=_TIF_SYSCALL_TRACEAUDIT,r2 // mask trace or audit + mov.m r25=ar.unat // M2 (5 cyc) + dep r29=r8,r29,41,2 // I0 insert new ei into cr.ipsr + adds r15=1024,r15 // A restore original syscall number + // + // If any of the above loads miss in L1D, we'll stall here until + // the data arrives. + // +/////////////////////////////////////////////////////////////////////// + st1 [r16]=r0 // M2|3 clear current->thread.on_ustack flag + mov b6=r30 // I0 setup syscall handler branch reg early + cmp.eq pKStk,pUStk=r0,r17 // A were we on kernel stacks already? + + and r9=_TIF_SYSCALL_TRACEAUDIT,r9 // A mask trace or audit + mov r18=ar.bsp // M2 (12 cyc) +(pKStk) br.cond.spnt .break_fixup // B we're already in kernel-mode -- fix up RBS + ;; +.back_from_break_fixup: +(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1 // A compute base of memory stack + cmp.eq p14,p0=r9,r0 // A are syscalls being traced/audited? + br.call.sptk.many b7=ia64_syscall_setup // B +1: + mov ar.rsc=0x3 // M2 set eager mode, pl 0, LE, loadrs=0 + nop 0 + bsw.1 // B (6 cyc) regs are saved, switch to bank 1 ;; - cmp.eq p8,p0=r2,r0 - mov b6=r20 + + ssm psr.ic | PSR_DEFAULT_BITS // M2 now it's safe to re-enable intr.-collection + movl r3=ia64_ret_from_syscall // X ;; -(p8) br.call.sptk.many b6=b6 // ignore this return addr - br.cond.sptk ia64_trace_syscall + + srlz.i // M0 ensure interruption collection is on + mov rp=r3 // I0 set the real return addr +(p10) br.cond.spnt.many ia64_ret_from_syscall // B return if bad call-frame or r15 is a NaT + +(p15) ssm psr.i // M2 restore psr.i +(p14) br.call.sptk.many b6=b6 // B invoke syscall-handker (ignore return addr) + br.cond.spnt.many ia64_trace_syscall // B do syscall-tracing thingamagic // NOT REACHED +/////////////////////////////////////////////////////////////////////// + // On entry, we optimistically assumed that we're coming from user-space. + // For the rare cases where a system-call is done from within the kernel, + // we fix things up at this point: +.break_fixup: + add r1=-IA64_PT_REGS_SIZE,sp // A allocate space for pt_regs structure + mov ar.rnat=r24 // M2 restore kernel's AR.RNAT + ;; + mov ar.bspstore=r23 // M2 restore kernel's AR.BSPSTORE + br.cond.sptk .back_from_break_fixup END(break_fault) .org ia64_ivt+0x3000 @@ -837,8 +878,6 @@ END(interrupt) * - r31: saved pr * - b0: original contents (to be saved) * On exit: - * - executing on bank 1 registers - * - psr.ic enabled, interrupts restored * - p10: TRUE if syscall is invoked with more than 8 out * registers or r15's Nat is true * - r1: kernel's gp @@ -846,8 +885,11 @@ END(interrupt) * - r8: -EINVAL if p10 is true * - r12: points to kernel stack * - r13: points to current task + * - r14: preserved (same as on entry) + * - p13: preserved * - p15: TRUE if interrupts need to be re-enabled * - ar.fpsr: set to kernel settings + * - b6: preserved (same as on entry) */ GLOBAL_ENTRY(ia64_syscall_setup) #if PT(B6) != 0 @@ -915,10 +957,10 @@ GLOBAL_ENTRY(ia64_syscall_setup) (p13) mov in5=-1 ;; st8 [r16]=r21,PT(R8)-PT(AR_FPSR) // save ar.fpsr - tnat.nz p14,p0=in6 + tnat.nz p13,p0=in6 cmp.lt p10,p9=r11,r8 // frame size can't be more than local+8 ;; - stf8 [r16]=f1 // ensure pt_regs.r8 != 0 (see handle_syscall_error) + mov r8=1 (p9) tnat.nz p10,p0=r15 adds r12=-16,r1 // switch to kernel memory stack (with 16 bytes of scratch) @@ -929,9 +971,9 @@ GLOBAL_ENTRY(ia64_syscall_setup) mov r13=r2 // establish `current' movl r1=__gp // establish kernel global pointer ;; -(p14) mov in6=-1 + st8 [r16]=r8 // ensure pt_regs.r8 != 0 (see handle_syscall_error) +(p13) mov in6=-1 (p8) mov in7=-1 - nop.i 0 cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0 movl r17=FPSR_DEFAULT @@ -1002,6 +1044,8 @@ END(dispatch_illegal_op_fault) FAULT(17) ENTRY(non_syscall) + mov ar.rsc=r27 // restore ar.rsc before SAVE_MIN_WITH_COVER + ;; SAVE_MIN_WITH_COVER // There is no particular reason for this code to be here, other than that @@ -1199,6 +1243,25 @@ END(disabled_fp_reg) // 0x5600 Entry 26 (size 16 bundles) Nat Consumption (11,23,37,50) ENTRY(nat_consumption) DBG_FAULT(26) + + mov r16=cr.ipsr + mov r17=cr.isr + mov r31=pr // save PR + ;; + and r18=0xf,r17 // r18 = cr.ipsr.code{3:0} + tbit.z p6,p0=r17,IA64_ISR_NA_BIT + ;; + cmp.ne.or p6,p0=IA64_ISR_CODE_LFETCH,r18 + dep r16=-1,r16,IA64_PSR_ED_BIT,1 +(p6) br.cond.spnt 1f // branch if (cr.ispr.na == 0 || cr.ipsr.code{3:0} != LFETCH) + ;; + mov cr.ipsr=r16 // set cr.ipsr.na + mov pr=r31,-1 + ;; + rfi + +1: mov pr=r31,-1 + ;; FAULT(26) END(nat_consumption) diff --git a/arch/ia64/kernel/jprobes.S b/arch/ia64/kernel/jprobes.S new file mode 100644 index 000000000000..b7fa3ccd2b0f --- /dev/null +++ b/arch/ia64/kernel/jprobes.S @@ -0,0 +1,61 @@ +/* + * Jprobe specific operations + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) Intel Corporation, 2005 + * + * 2005-May Rusty Lynch <rusty.lynch@intel.com> and Anil S Keshavamurthy + * <anil.s.keshavamurthy@intel.com> initial implementation + * + * Jprobes (a.k.a. "jump probes" which is built on-top of kprobes) allow a + * probe to be inserted into the beginning of a function call. The fundamental + * difference between a jprobe and a kprobe is the jprobe handler is executed + * in the same context as the target function, while the kprobe handlers + * are executed in interrupt context. + * + * For jprobes we initially gain control by placing a break point in the + * first instruction of the targeted function. When we catch that specific + * break, we: + * * set the return address to our jprobe_inst_return() function + * * jump to the jprobe handler function + * + * Since we fixed up the return address, the jprobe handler will return to our + * jprobe_inst_return() function, giving us control again. At this point we + * are back in the parents frame marker, so we do yet another call to our + * jprobe_break() function to fix up the frame marker as it would normally + * exist in the target function. + * + * Our jprobe_return function then transfers control back to kprobes.c by + * executing a break instruction using one of our reserved numbers. When we + * catch that break in kprobes.c, we continue like we do for a normal kprobe + * by single stepping the emulated instruction, and then returning execution + * to the correct location. + */ +#include <asm/asmmacro.h> + + /* + * void jprobe_break(void) + */ +ENTRY(jprobe_break) + break.m 0x80300 +END(jprobe_break) + + /* + * void jprobe_inst_return(void) + */ +GLOBAL_ENTRY(jprobe_inst_return) + br.call.sptk.many b0=jprobe_break +END(jprobe_inst_return) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c new file mode 100644 index 000000000000..884f5cd27d8a --- /dev/null +++ b/arch/ia64/kernel/kprobes.c @@ -0,0 +1,721 @@ +/* + * Kernel Probes (KProbes) + * arch/ia64/kernel/kprobes.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * Copyright (C) Intel Corporation, 2005 + * + * 2005-Apr Rusty Lynch <rusty.lynch@intel.com> and Anil S Keshavamurthy + * <anil.s.keshavamurthy@intel.com> adapted from i386 + */ + +#include <linux/config.h> +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/preempt.h> +#include <linux/moduleloader.h> + +#include <asm/pgtable.h> +#include <asm/kdebug.h> +#include <asm/sections.h> + +extern void jprobe_inst_return(void); + +/* kprobe_status settings */ +#define KPROBE_HIT_ACTIVE 0x00000001 +#define KPROBE_HIT_SS 0x00000002 + +static struct kprobe *current_kprobe, *kprobe_prev; +static unsigned long kprobe_status, kprobe_status_prev; +static struct pt_regs jprobe_saved_regs; + +enum instruction_type {A, I, M, F, B, L, X, u}; +static enum instruction_type bundle_encoding[32][3] = { + { M, I, I }, /* 00 */ + { M, I, I }, /* 01 */ + { M, I, I }, /* 02 */ + { M, I, I }, /* 03 */ + { M, L, X }, /* 04 */ + { M, L, X }, /* 05 */ + { u, u, u }, /* 06 */ + { u, u, u }, /* 07 */ + { M, M, I }, /* 08 */ + { M, M, I }, /* 09 */ + { M, M, I }, /* 0A */ + { M, M, I }, /* 0B */ + { M, F, I }, /* 0C */ + { M, F, I }, /* 0D */ + { M, M, F }, /* 0E */ + { M, M, F }, /* 0F */ + { M, I, B }, /* 10 */ + { M, I, B }, /* 11 */ + { M, B, B }, /* 12 */ + { M, B, B }, /* 13 */ + { u, u, u }, /* 14 */ + { u, u, u }, /* 15 */ + { B, B, B }, /* 16 */ + { B, B, B }, /* 17 */ + { M, M, B }, /* 18 */ + { M, M, B }, /* 19 */ + { u, u, u }, /* 1A */ + { u, u, u }, /* 1B */ + { M, F, B }, /* 1C */ + { M, F, B }, /* 1D */ + { u, u, u }, /* 1E */ + { u, u, u }, /* 1F */ +}; + +/* + * In this function we check to see if the instruction + * is IP relative instruction and update the kprobe + * inst flag accordingly + */ +static void update_kprobe_inst_flag(uint template, uint slot, uint major_opcode, + unsigned long kprobe_inst, struct kprobe *p) +{ + p->ainsn.inst_flag = 0; + p->ainsn.target_br_reg = 0; + + if (bundle_encoding[template][slot] == B) { + switch (major_opcode) { + case INDIRECT_CALL_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; + case IP_RELATIVE_PREDICT_OPCODE: + case IP_RELATIVE_BRANCH_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; + break; + case IP_RELATIVE_CALL_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; + p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; + } + } else if (bundle_encoding[template][slot] == X) { + switch (major_opcode) { + case LONG_CALL_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; + } + } + return; +} + +/* + * In this function we check to see if the instruction + * on which we are inserting kprobe is supported. + * Returns 0 if supported + * Returns -EINVAL if unsupported + */ +static int unsupported_inst(uint template, uint slot, uint major_opcode, + unsigned long kprobe_inst, struct kprobe *p) +{ + unsigned long addr = (unsigned long)p->addr; + + if (bundle_encoding[template][slot] == I) { + switch (major_opcode) { + case 0x0: //I_UNIT_MISC_OPCODE: + /* + * Check for Integer speculation instruction + * - Bit 33-35 to be equal to 0x1 + */ + if (((kprobe_inst >> 33) & 0x7) == 1) { + printk(KERN_WARNING + "Kprobes on speculation inst at <0x%lx> not supported\n", + addr); + return -EINVAL; + } + + /* + * IP relative mov instruction + * - Bit 27-35 to be equal to 0x30 + */ + if (((kprobe_inst >> 27) & 0x1FF) == 0x30) { + printk(KERN_WARNING + "Kprobes on \"mov r1=ip\" at <0x%lx> not supported\n", + addr); + return -EINVAL; + + } + } + } + return 0; +} + + +/* + * In this function we check to see if the instruction + * (qp) cmpx.crel.ctype p1,p2=r2,r3 + * on which we are inserting kprobe is cmp instruction + * with ctype as unc. + */ +static uint is_cmp_ctype_unc_inst(uint template, uint slot, uint major_opcode, +unsigned long kprobe_inst) +{ + cmp_inst_t cmp_inst; + uint ctype_unc = 0; + + if (!((bundle_encoding[template][slot] == I) || + (bundle_encoding[template][slot] == M))) + goto out; + + if (!((major_opcode == 0xC) || (major_opcode == 0xD) || + (major_opcode == 0xE))) + goto out; + + cmp_inst.l = kprobe_inst; + if ((cmp_inst.f.x2 == 0) || (cmp_inst.f.x2 == 1)) { + /* Integere compare - Register Register (A6 type)*/ + if ((cmp_inst.f.tb == 0) && (cmp_inst.f.ta == 0) + &&(cmp_inst.f.c == 1)) + ctype_unc = 1; + } else if ((cmp_inst.f.x2 == 2)||(cmp_inst.f.x2 == 3)) { + /* Integere compare - Immediate Register (A8 type)*/ + if ((cmp_inst.f.ta == 0) &&(cmp_inst.f.c == 1)) + ctype_unc = 1; + } +out: + return ctype_unc; +} + +/* + * In this function we override the bundle with + * the break instruction at the given slot. + */ +static void prepare_break_inst(uint template, uint slot, uint major_opcode, + unsigned long kprobe_inst, struct kprobe *p) +{ + unsigned long break_inst = BREAK_INST; + bundle_t *bundle = &p->ainsn.insn.bundle; + + /* + * Copy the original kprobe_inst qualifying predicate(qp) + * to the break instruction iff !is_cmp_ctype_unc_inst + * because for cmp instruction with ctype equal to unc, + * which is a special instruction always needs to be + * executed regradless of qp + */ + if (!is_cmp_ctype_unc_inst(template, slot, major_opcode, kprobe_inst)) + break_inst |= (0x3f & kprobe_inst); + + switch (slot) { + case 0: + bundle->quad0.slot0 = break_inst; + break; + case 1: + bundle->quad0.slot1_p0 = break_inst; + bundle->quad1.slot1_p1 = break_inst >> (64-46); + break; + case 2: + bundle->quad1.slot2 = break_inst; + break; + } + + /* + * Update the instruction flag, so that we can + * emulate the instruction properly after we + * single step on original instruction + */ + update_kprobe_inst_flag(template, slot, major_opcode, kprobe_inst, p); +} + +static inline void get_kprobe_inst(bundle_t *bundle, uint slot, + unsigned long *kprobe_inst, uint *major_opcode) +{ + unsigned long kprobe_inst_p0, kprobe_inst_p1; + unsigned int template; + + template = bundle->quad0.template; + + switch (slot) { + case 0: + *major_opcode = (bundle->quad0.slot0 >> SLOT0_OPCODE_SHIFT); + *kprobe_inst = bundle->quad0.slot0; + break; + case 1: + *major_opcode = (bundle->quad1.slot1_p1 >> SLOT1_p1_OPCODE_SHIFT); + kprobe_inst_p0 = bundle->quad0.slot1_p0; + kprobe_inst_p1 = bundle->quad1.slot1_p1; + *kprobe_inst = kprobe_inst_p0 | (kprobe_inst_p1 << (64-46)); + break; + case 2: + *major_opcode = (bundle->quad1.slot2 >> SLOT2_OPCODE_SHIFT); + *kprobe_inst = bundle->quad1.slot2; + break; + } +} + +/* Returns non-zero if the addr is in the Interrupt Vector Table */ +static inline int in_ivt_functions(unsigned long addr) +{ + return (addr >= (unsigned long)__start_ivt_text + && addr < (unsigned long)__end_ivt_text); +} + +static int valid_kprobe_addr(int template, int slot, unsigned long addr) +{ + if ((slot > 2) || ((bundle_encoding[template][1] == L) && slot > 1)) { + printk(KERN_WARNING "Attempting to insert unaligned kprobe " + "at 0x%lx\n", addr); + return -EINVAL; + } + + if (in_ivt_functions(addr)) { + printk(KERN_WARNING "Kprobes can't be inserted inside " + "IVT functions at 0x%lx\n", addr); + return -EINVAL; + } + + if (slot == 1 && bundle_encoding[template][1] != L) { + printk(KERN_WARNING "Inserting kprobes on slot #1 " + "is not supported\n"); + return -EINVAL; + } + + return 0; +} + +static inline void save_previous_kprobe(void) +{ + kprobe_prev = current_kprobe; + kprobe_status_prev = kprobe_status; +} + +static inline void restore_previous_kprobe(void) +{ + current_kprobe = kprobe_prev; + kprobe_status = kprobe_status_prev; +} + +static inline void set_current_kprobe(struct kprobe *p) +{ + current_kprobe = p; +} + +static void kretprobe_trampoline(void) +{ +} + +/* + * At this point the target function has been tricked into + * returning into our trampoline. Lookup the associated instance + * and then: + * - call the handler function + * - cleanup by marking the instance as unused + * - long jump back to the original return address + */ +int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kretprobe_instance *ri = NULL; + struct hlist_head *head; + struct hlist_node *node, *tmp; + unsigned long orig_ret_address = 0; + unsigned long trampoline_address = + ((struct fnptr *)kretprobe_trampoline)->ip; + + head = kretprobe_inst_table_head(current); + + /* + * It is possible to have multiple instances associated with a given + * task either because an multiple functions in the call path + * have a return probe installed on them, and/or more then one return + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) + ri->rp->handler(ri, regs); + + orig_ret_address = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); + regs->cr_iip = orig_ret_address; + + unlock_kprobes(); + preempt_enable_no_resched(); + + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we have handled unlocking + * and re-enabling preemption. + */ + return 1; +} + +void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) +{ + struct kretprobe_instance *ri; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->task = current; + ri->ret_addr = (kprobe_opcode_t *)regs->b0; + + /* Replace the return addr with trampoline addr */ + regs->b0 = ((struct fnptr *)kretprobe_trampoline)->ip; + + add_rp_inst(ri); + } else { + rp->nmissed++; + } +} + +int arch_prepare_kprobe(struct kprobe *p) +{ + unsigned long addr = (unsigned long) p->addr; + unsigned long *kprobe_addr = (unsigned long *)(addr & ~0xFULL); + unsigned long kprobe_inst=0; + unsigned int slot = addr & 0xf, template, major_opcode = 0; + bundle_t *bundle = &p->ainsn.insn.bundle; + + memcpy(&p->opcode.bundle, kprobe_addr, sizeof(bundle_t)); + memcpy(&p->ainsn.insn.bundle, kprobe_addr, sizeof(bundle_t)); + + template = bundle->quad0.template; + + if(valid_kprobe_addr(template, slot, addr)) + return -EINVAL; + + /* Move to slot 2, if bundle is MLX type and kprobe slot is 1 */ + if (slot == 1 && bundle_encoding[template][1] == L) + slot++; + + /* Get kprobe_inst and major_opcode from the bundle */ + get_kprobe_inst(bundle, slot, &kprobe_inst, &major_opcode); + + if (unsupported_inst(template, slot, major_opcode, kprobe_inst, p)) + return -EINVAL; + + prepare_break_inst(template, slot, major_opcode, kprobe_inst, p); + + return 0; +} + +void arch_arm_kprobe(struct kprobe *p) +{ + unsigned long addr = (unsigned long)p->addr; + unsigned long arm_addr = addr & ~0xFULL; + + memcpy((char *)arm_addr, &p->ainsn.insn.bundle, sizeof(bundle_t)); + flush_icache_range(arm_addr, arm_addr + sizeof(bundle_t)); +} + +void arch_disarm_kprobe(struct kprobe *p) +{ + unsigned long addr = (unsigned long)p->addr; + unsigned long arm_addr = addr & ~0xFULL; + + /* p->opcode contains the original unaltered bundle */ + memcpy((char *) arm_addr, (char *) &p->opcode.bundle, sizeof(bundle_t)); + flush_icache_range(arm_addr, arm_addr + sizeof(bundle_t)); +} + +void arch_remove_kprobe(struct kprobe *p) +{ +} + +/* + * We are resuming execution after a single step fault, so the pt_regs + * structure reflects the register state after we executed the instruction + * located in the kprobe (p->ainsn.insn.bundle). We still need to adjust + * the ip to point back to the original stack address. To set the IP address + * to original stack address, handle the case where we need to fixup the + * relative IP address and/or fixup branch register. + */ +static void resume_execution(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long bundle_addr = ((unsigned long) (&p->opcode.bundle)) & ~0xFULL; + unsigned long resume_addr = (unsigned long)p->addr & ~0xFULL; + unsigned long template; + int slot = ((unsigned long)p->addr & 0xf); + + template = p->opcode.bundle.quad0.template; + + if (slot == 1 && bundle_encoding[template][1] == L) + slot = 2; + + if (p->ainsn.inst_flag) { + + if (p->ainsn.inst_flag & INST_FLAG_FIX_RELATIVE_IP_ADDR) { + /* Fix relative IP address */ + regs->cr_iip = (regs->cr_iip - bundle_addr) + resume_addr; + } + + if (p->ainsn.inst_flag & INST_FLAG_FIX_BRANCH_REG) { + /* + * Fix target branch register, software convention is + * to use either b0 or b6 or b7, so just checking + * only those registers + */ + switch (p->ainsn.target_br_reg) { + case 0: + if ((regs->b0 == bundle_addr) || + (regs->b0 == bundle_addr + 0x10)) { + regs->b0 = (regs->b0 - bundle_addr) + + resume_addr; + } + break; + case 6: + if ((regs->b6 == bundle_addr) || + (regs->b6 == bundle_addr + 0x10)) { + regs->b6 = (regs->b6 - bundle_addr) + + resume_addr; + } + break; + case 7: + if ((regs->b7 == bundle_addr) || + (regs->b7 == bundle_addr + 0x10)) { + regs->b7 = (regs->b7 - bundle_addr) + + resume_addr; + } + break; + } /* end switch */ + } + goto turn_ss_off; + } + + if (slot == 2) { + if (regs->cr_iip == bundle_addr + 0x10) { + regs->cr_iip = resume_addr + 0x10; + } + } else { + if (regs->cr_iip == bundle_addr) { + regs->cr_iip = resume_addr; + } + } + +turn_ss_off: + /* Turn off Single Step bit */ + ia64_psr(regs)->ss = 0; +} + +static void prepare_ss(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long bundle_addr = (unsigned long) &p->opcode.bundle; + unsigned long slot = (unsigned long)p->addr & 0xf; + + /* Update instruction pointer (IIP) and slot number (IPSR.ri) */ + regs->cr_iip = bundle_addr & ~0xFULL; + + if (slot > 2) + slot = 0; + + ia64_psr(regs)->ri = slot; + + /* turn on single stepping */ + ia64_psr(regs)->ss = 1; +} + +static int pre_kprobes_handler(struct die_args *args) +{ + struct kprobe *p; + int ret = 0; + struct pt_regs *regs = args->regs; + kprobe_opcode_t *addr = (kprobe_opcode_t *)instruction_pointer(regs); + + preempt_disable(); + + /* Handle recursion cases */ + if (kprobe_running()) { + p = get_kprobe(addr); + if (p) { + if (kprobe_status == KPROBE_HIT_SS) { + unlock_kprobes(); + goto no_kprobe; + } + /* We have reentered the pre_kprobe_handler(), since + * another probe was hit while within the handler. + * We here save the original kprobes variables and + * just single step on the instruction of the new probe + * without calling any user handlers. + */ + save_previous_kprobe(); + set_current_kprobe(p); + p->nmissed++; + prepare_ss(p, regs); + kprobe_status = KPROBE_REENTER; + return 1; + } else if (args->err == __IA64_BREAK_JPROBE) { + /* + * jprobe instrumented function just completed + */ + p = current_kprobe; + if (p->break_handler && p->break_handler(p, regs)) { + goto ss_probe; + } + } else { + /* Not our break */ + goto no_kprobe; + } + } + + lock_kprobes(); + p = get_kprobe(addr); + if (!p) { + unlock_kprobes(); + goto no_kprobe; + } + + kprobe_status = KPROBE_HIT_ACTIVE; + set_current_kprobe(p); + + if (p->pre_handler && p->pre_handler(p, regs)) + /* + * Our pre-handler is specifically requesting that we just + * do a return. This is used for both the jprobe pre-handler + * and the kretprobe trampoline + */ + return 1; + +ss_probe: + prepare_ss(p, regs); + kprobe_status = KPROBE_HIT_SS; + return 1; + +no_kprobe: + preempt_enable_no_resched(); + return ret; +} + +static int post_kprobes_handler(struct pt_regs *regs) +{ + if (!kprobe_running()) + return 0; + + if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) { + kprobe_status = KPROBE_HIT_SSDONE; + current_kprobe->post_handler(current_kprobe, regs, 0); + } + + resume_execution(current_kprobe, regs); + + /*Restore back the original saved kprobes variables and continue. */ + if (kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(); + goto out; + } + + unlock_kprobes(); + +out: + preempt_enable_no_resched(); + return 1; +} + +static int kprobes_fault_handler(struct pt_regs *regs, int trapnr) +{ + if (!kprobe_running()) + return 0; + + if (current_kprobe->fault_handler && + current_kprobe->fault_handler(current_kprobe, regs, trapnr)) + return 1; + + if (kprobe_status & KPROBE_HIT_SS) { + resume_execution(current_kprobe, regs); + unlock_kprobes(); + preempt_enable_no_resched(); + } + + return 0; +} + +int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, + void *data) +{ + struct die_args *args = (struct die_args *)data; + switch(val) { + case DIE_BREAK: + if (pre_kprobes_handler(args)) + return NOTIFY_STOP; + break; + case DIE_SS: + if (post_kprobes_handler(args->regs)) + return NOTIFY_STOP; + break; + case DIE_PAGE_FAULT: + if (kprobes_fault_handler(args->regs, args->trapnr)) + return NOTIFY_STOP; + default: + break; + } + return NOTIFY_DONE; +} + +int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct jprobe *jp = container_of(p, struct jprobe, kp); + unsigned long addr = ((struct fnptr *)(jp->entry))->ip; + + /* save architectural state */ + jprobe_saved_regs = *regs; + + /* after rfi, execute the jprobe instrumented function */ + regs->cr_iip = addr & ~0xFULL; + ia64_psr(regs)->ri = addr & 0xf; + regs->r1 = ((struct fnptr *)(jp->entry))->gp; + + /* + * fix the return address to our jprobe_inst_return() function + * in the jprobes.S file + */ + regs->b0 = ((struct fnptr *)(jprobe_inst_return))->ip; + + return 1; +} + +int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + *regs = jprobe_saved_regs; + return 1; +} + +static struct kprobe trampoline_p = { + .pre_handler = trampoline_probe_handler +}; + +int __init arch_init_kprobes(void) +{ + trampoline_p.addr = + (kprobe_opcode_t *)((struct fnptr *)kretprobe_trampoline)->ip; + return register_kprobe(&trampoline_p); +} diff --git a/arch/ia64/kernel/numa.c b/arch/ia64/kernel/numa.c new file mode 100644 index 000000000000..a68ce6678092 --- /dev/null +++ b/arch/ia64/kernel/numa.c @@ -0,0 +1,57 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * ia64 kernel NUMA specific stuff + * + * Copyright (C) 2002 Erich Focht <efocht@ess.nec.de> + * Copyright (C) 2004 Silicon Graphics, Inc. + * Jesse Barnes <jbarnes@sgi.com> + */ +#include <linux/config.h> +#include <linux/topology.h> +#include <linux/module.h> +#include <asm/processor.h> +#include <asm/smp.h> + +u8 cpu_to_node_map[NR_CPUS] __cacheline_aligned; +EXPORT_SYMBOL(cpu_to_node_map); + +cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned; + +/** + * build_cpu_to_node_map - setup cpu to node and node to cpumask arrays + * + * Build cpu to node mapping and initialize the per node cpu masks using + * info from the node_cpuid array handed to us by ACPI. + */ +void __init build_cpu_to_node_map(void) +{ + int cpu, i, node; + + for(node=0; node < MAX_NUMNODES; node++) + cpus_clear(node_to_cpu_mask[node]); + + for(cpu = 0; cpu < NR_CPUS; ++cpu) { + node = -1; + for (i = 0; i < NR_CPUS; ++i) + if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) { + node = node_cpuid[i].nid; + break; + } + cpu_to_node_map[cpu] = (node >= 0) ? node : 0; + if (node >= 0) + cpu_set(cpu, node_to_cpu_mask[node]); + } +} diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 6407bff6bfd7..b8ebb8e427ef 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -37,7 +37,6 @@ #include <linux/vfs.h> #include <linux/pagemap.h> #include <linux/mount.h> -#include <linux/version.h> #include <linux/bitops.h> #include <asm/errno.h> diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index a9bfba46733d..66e840609808 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -27,6 +27,7 @@ #include <linux/efi.h> #include <linux/interrupt.h> #include <linux/delay.h> +#include <linux/kprobes.h> #include <asm/cpu.h> #include <asm/delay.h> @@ -708,6 +709,13 @@ kernel_thread_helper (int (*fn)(void *), void *arg) void flush_thread (void) { + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(current); + /* drop floating-point and debug-register state if it exists: */ current->thread.flags &= ~(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID); ia64_drop_fpu(current); @@ -722,6 +730,14 @@ flush_thread (void) void exit_thread (void) { + + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(current); + ia64_drop_fpu(current); #ifdef CONFIG_PERFMON /* if needed, stop monitoring and flush state to perfmon context */ @@ -791,16 +807,12 @@ machine_restart (char *restart_cmd) (*efi.reset_system)(EFI_RESET_WARM, 0, 0, NULL); } -EXPORT_SYMBOL(machine_restart); - void machine_halt (void) { cpu_halt(); } -EXPORT_SYMBOL(machine_halt); - void machine_power_off (void) { @@ -809,4 +821,3 @@ machine_power_off (void) machine_halt(); } -EXPORT_SYMBOL(machine_power_off); diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 575a8f657b31..bbb8bc7c0552 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -725,12 +725,32 @@ convert_to_non_syscall (struct task_struct *child, struct pt_regs *pt, break; } + /* + * Note: at the time of this call, the target task is blocked + * in notify_resume_user() and by clearling PRED_LEAVE_SYSCALL + * (aka, "pLvSys") we redirect execution from + * .work_pending_syscall_end to .work_processed_kernel. + */ unw_get_pr(&prev_info, &pr); - pr &= ~(1UL << PRED_SYSCALL); + pr &= ~((1UL << PRED_SYSCALL) | (1UL << PRED_LEAVE_SYSCALL)); pr |= (1UL << PRED_NON_SYSCALL); unw_set_pr(&prev_info, pr); pt->cr_ifs = (1UL << 63) | cfm; + /* + * Clear the memory that is NOT written on syscall-entry to + * ensure we do not leak kernel-state to user when execution + * resumes. + */ + pt->r2 = 0; + pt->r3 = 0; + pt->r14 = 0; + memset(&pt->r16, 0, 16*8); /* clear r16-r31 */ + memset(&pt->f6, 0, 6*16); /* clear f6-f11 */ + pt->b7 = 0; + pt->ar_ccv = 0; + pt->ar_csd = 0; + pt->ar_ssd = 0; } static int @@ -945,6 +965,13 @@ access_uarea (struct task_struct *child, unsigned long addr, *data = (pt->cr_ipsr & IPSR_MASK); return 0; + case PT_AR_RSC: + if (write_access) + pt->ar_rsc = *data | (3 << 2); /* force PL3 */ + else + *data = pt->ar_rsc; + return 0; + case PT_AR_RNAT: urbs_end = ia64_get_user_rbs_end(child, pt, NULL); rnat_addr = (long) ia64_rse_rnat_addr((long *) @@ -996,9 +1023,6 @@ access_uarea (struct task_struct *child, unsigned long addr, case PT_AR_BSPSTORE: ptr = pt_reg_addr(pt, ar_bspstore); break; - case PT_AR_RSC: - ptr = pt_reg_addr(pt, ar_rsc); - break; case PT_AR_UNAT: ptr = pt_reg_addr(pt, ar_unat); break; @@ -1234,7 +1258,7 @@ ptrace_getregs (struct task_struct *child, struct pt_all_user_regs __user *ppr) static long ptrace_setregs (struct task_struct *child, struct pt_all_user_regs __user *ppr) { - unsigned long psr, ec, lc, rnat, bsp, cfm, nat_bits, val = 0; + unsigned long psr, rsc, ec, lc, rnat, bsp, cfm, nat_bits, val = 0; struct unw_frame_info info; struct switch_stack *sw; struct ia64_fpreg fpval; @@ -1267,7 +1291,7 @@ ptrace_setregs (struct task_struct *child, struct pt_all_user_regs __user *ppr) /* app regs */ retval |= __get_user(pt->ar_pfs, &ppr->ar[PT_AUR_PFS]); - retval |= __get_user(pt->ar_rsc, &ppr->ar[PT_AUR_RSC]); + retval |= __get_user(rsc, &ppr->ar[PT_AUR_RSC]); retval |= __get_user(pt->ar_bspstore, &ppr->ar[PT_AUR_BSPSTORE]); retval |= __get_user(pt->ar_unat, &ppr->ar[PT_AUR_UNAT]); retval |= __get_user(pt->ar_ccv, &ppr->ar[PT_AUR_CCV]); @@ -1365,6 +1389,7 @@ ptrace_setregs (struct task_struct *child, struct pt_all_user_regs __user *ppr) retval |= __get_user(nat_bits, &ppr->nat); retval |= access_uarea(child, PT_CR_IPSR, &psr, 1); + retval |= access_uarea(child, PT_AR_RSC, &rsc, 1); retval |= access_uarea(child, PT_AR_EC, &ec, 1); retval |= access_uarea(child, PT_AR_LC, &lc, 1); retval |= access_uarea(child, PT_AR_RNAT, &rnat, 1); diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index bb9033187d42..84f89da7c640 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -20,6 +20,7 @@ * 02/01/00 R.Seth fixed get_cpuinfo for SMP * 01/07/99 S.Eranian added the support for command line argument * 06/24/99 W.Drummond added boot_cpu_data. + * 05/28/05 Z. Menyhart Dynamic stride size for "flush_icache_range()" */ #include <linux/config.h> #include <linux/module.h> @@ -74,6 +75,8 @@ DEFINE_PER_CPU(unsigned long, ia64_phys_stacked_size_p8); unsigned long ia64_cycles_per_usec; struct ia64_boot_param *ia64_boot_param; struct screen_info screen_info; +unsigned long vga_console_iobase; +unsigned long vga_console_membase; unsigned long ia64_max_cacheline_size; unsigned long ia64_iobase; /* virtual address for I/O accesses */ @@ -83,6 +86,13 @@ EXPORT_SYMBOL(io_space); unsigned int num_io_spaces; /* + * "flush_icache_range()" needs to know what processor dependent stride size to use + * when it makes i-cache(s) coherent with d-caches. + */ +#define I_CACHE_STRIDE_SHIFT 5 /* Safest way to go: 32 bytes by 32 bytes */ +unsigned long ia64_i_cache_stride_shift = ~0; + +/* * The merge_mask variable needs to be set to (max(iommu_page_size(iommu)) - 1). This * mask specifies a mask of address bits that must be 0 in order for two buffers to be * mergeable by the I/O MMU (i.e., the end address of the first buffer and the start @@ -275,23 +285,25 @@ io_port_init (void) static inline int __init early_console_setup (char *cmdline) { + int earlycons = 0; + #ifdef CONFIG_SERIAL_SGI_L1_CONSOLE { extern int sn_serial_console_early_setup(void); if (!sn_serial_console_early_setup()) - return 0; + earlycons++; } #endif #ifdef CONFIG_EFI_PCDP if (!efi_setup_pcdp_console(cmdline)) - return 0; + earlycons++; #endif #ifdef CONFIG_SERIAL_8250_CONSOLE if (!early_serial_console_init(cmdline)) - return 0; + earlycons++; #endif - return -1; + return (earlycons) ? 0 : -1; } static inline void @@ -624,6 +636,12 @@ setup_per_cpu_areas (void) /* start_kernel() requires this... */ } +/* + * Calculate the max. cache line size. + * + * In addition, the minimum of the i-cache stride sizes is calculated for + * "flush_icache_range()". + */ static void get_max_cacheline_size (void) { @@ -637,6 +655,8 @@ get_max_cacheline_size (void) printk(KERN_ERR "%s: ia64_pal_cache_summary() failed (status=%ld)\n", __FUNCTION__, status); max = SMP_CACHE_BYTES; + /* Safest setup for "flush_icache_range()" */ + ia64_i_cache_stride_shift = I_CACHE_STRIDE_SHIFT; goto out; } @@ -645,14 +665,31 @@ get_max_cacheline_size (void) &cci); if (status != 0) { printk(KERN_ERR - "%s: ia64_pal_cache_config_info(l=%lu) failed (status=%ld)\n", + "%s: ia64_pal_cache_config_info(l=%lu, 2) failed (status=%ld)\n", __FUNCTION__, l, status); max = SMP_CACHE_BYTES; + /* The safest setup for "flush_icache_range()" */ + cci.pcci_stride = I_CACHE_STRIDE_SHIFT; + cci.pcci_unified = 1; } line_size = 1 << cci.pcci_line_size; if (line_size > max) max = line_size; - } + if (!cci.pcci_unified) { + status = ia64_pal_cache_config_info(l, + /* cache_type (instruction)= */ 1, + &cci); + if (status != 0) { + printk(KERN_ERR + "%s: ia64_pal_cache_config_info(l=%lu, 1) failed (status=%ld)\n", + __FUNCTION__, l, status); + /* The safest setup for "flush_icache_range()" */ + cci.pcci_stride = I_CACHE_STRIDE_SHIFT; + } + } + if (cci.pcci_stride < ia64_i_cache_stride_shift) + ia64_i_cache_stride_shift = cci.pcci_stride; + } out: if (max > ia64_max_cacheline_size) ia64_max_cacheline_size = max; diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 499b7e5317cf..b8a0a7d257a9 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -94,7 +94,7 @@ sys_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, long arg2, static long restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr) { - unsigned long ip, flags, nat, um, cfm; + unsigned long ip, flags, nat, um, cfm, rsc; long err; /* Always make any pending restarted system calls return -EINTR */ @@ -106,7 +106,7 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr) err |= __get_user(ip, &sc->sc_ip); /* instruction pointer */ err |= __get_user(cfm, &sc->sc_cfm); err |= __get_user(um, &sc->sc_um); /* user mask */ - err |= __get_user(scr->pt.ar_rsc, &sc->sc_ar_rsc); + err |= __get_user(rsc, &sc->sc_ar_rsc); err |= __get_user(scr->pt.ar_unat, &sc->sc_ar_unat); err |= __get_user(scr->pt.ar_fpsr, &sc->sc_ar_fpsr); err |= __get_user(scr->pt.ar_pfs, &sc->sc_ar_pfs); @@ -119,6 +119,7 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr) err |= __copy_from_user(&scr->pt.r15, &sc->sc_gr[15], 8); /* r15 */ scr->pt.cr_ifs = cfm | (1UL << 63); + scr->pt.ar_rsc = rsc | (3 << 2); /* force PL3 */ /* establish new instruction pointer: */ scr->pt.cr_iip = ip & ~0x3UL; @@ -142,6 +143,7 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr) __copy_from_user(current->thread.fph, &sc->sc_fr[32], 96*16); psr->mfh = 0; /* drop signal handler's fph contents... */ + preempt_disable(); if (psr->dfh) ia64_drop_fpu(current); else { @@ -149,6 +151,7 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr) __ia64_load_fpu(current->thread.fph); ia64_set_local_fpu_owner(current); } + preempt_enable(); } return err; } diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index 953095e2ce15..0166a9847095 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -231,13 +231,16 @@ smp_flush_tlb_all (void) void smp_flush_tlb_mm (struct mm_struct *mm) { + preempt_disable(); /* this happens for the common case of a single-threaded fork(): */ if (likely(mm == current->active_mm && atomic_read(&mm->mm_users) == 1)) { local_finish_flush_tlb_mm(mm); + preempt_enable(); return; } + preempt_enable(); /* * We could optimize this further by using mm->cpu_vm_mask to track which CPUs * have been running in the address space. It's not clear that this is worth the @@ -269,7 +272,7 @@ smp_call_function_single (int cpuid, void (*func) (void *info), void *info, int int me = get_cpu(); /* prevent preemption and reschedule on another processor */ if (cpuid == me) { - printk("%s: trying to call self\n", __FUNCTION__); + printk(KERN_INFO "%s: trying to call self\n", __FUNCTION__); put_cpu(); return -EBUSY; } diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 3865f088ffa2..7d72c0d872b3 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -346,6 +346,7 @@ smp_callin (void) lock_ipi_calllock(); cpu_set(cpuid, cpu_online_map); unlock_ipi_calllock(); + per_cpu(cpu_state, cpuid) = CPU_ONLINE; smp_setup_percpu_timer(); @@ -524,47 +525,6 @@ smp_build_cpu_map (void) } } -#ifdef CONFIG_NUMA - -/* on which node is each logical CPU (one cacheline even for 64 CPUs) */ -u8 cpu_to_node_map[NR_CPUS] __cacheline_aligned; -EXPORT_SYMBOL(cpu_to_node_map); -/* which logical CPUs are on which nodes */ -cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned; - -/* - * Build cpu to node mapping and initialize the per node cpu masks. - */ -void __init -build_cpu_to_node_map (void) -{ - int cpu, i, node; - - for(node=0; node<MAX_NUMNODES; node++) - cpus_clear(node_to_cpu_mask[node]); - for(cpu = 0; cpu < NR_CPUS; ++cpu) { - /* - * All Itanium NUMA platforms I know use ACPI, so maybe we - * can drop this ifdef completely. [EF] - */ -#ifdef CONFIG_ACPI_NUMA - node = -1; - for (i = 0; i < NR_CPUS; ++i) - if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) { - node = node_cpuid[i].nid; - break; - } -#else -# error Fixme: Dunno how to build CPU-to-node map. -#endif - cpu_to_node_map[cpu] = (node >= 0) ? node : 0; - if (node >= 0) - cpu_set(cpu, node_to_cpu_mask[node]); - } -} - -#endif /* CONFIG_NUMA */ - /* * Cycle through the APs sending Wakeup IPIs to boot each. */ @@ -611,6 +571,7 @@ void __devinit smp_prepare_boot_cpu(void) { cpu_set(smp_processor_id(), cpu_online_map); cpu_set(smp_processor_id(), cpu_callin_map); + per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; } /* @@ -688,6 +649,7 @@ int __cpu_disable(void) return -EBUSY; remove_siblinginfo(cpu); + cpu_clear(cpu, cpu_online_map); fixup_irqs(); local_flush_tlb_all(); cpu_clear(cpu, cpu_callin_map); @@ -774,6 +736,7 @@ __cpu_up (unsigned int cpu) if (cpu_isset(cpu, cpu_callin_map)) return -EINVAL; + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* Processor goes to start_secondary(), sets online flag */ ret = do_boot_cpu(sapicid, cpu); if (ret < 0) diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index d8030f3bd865..92ff46ad21e2 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -36,12 +36,14 @@ int arch_register_cpu(int num) parent = &sysfs_nodes[cpu_to_node(num)]; #endif /* CONFIG_NUMA */ +#ifdef CONFIG_ACPI_BOOT /* * If CPEI cannot be re-targetted, and this is * CPEI target, then dont create the control file */ if (!can_cpei_retarget() && is_cpu_cpei_target(num)) sysfs_cpus[num].cpu.no_control = 1; +#endif return register_cpu(&sysfs_cpus[num].cpu, num, parent); } diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c index 1861173bd4f6..4440c8343fa4 100644 --- a/arch/ia64/kernel/traps.c +++ b/arch/ia64/kernel/traps.c @@ -21,12 +21,26 @@ #include <asm/intrinsics.h> #include <asm/processor.h> #include <asm/uaccess.h> +#include <asm/kdebug.h> extern spinlock_t timerlist_lock; fpswa_interface_t *fpswa_interface; EXPORT_SYMBOL(fpswa_interface); +struct notifier_block *ia64die_chain; +static DEFINE_SPINLOCK(die_notifier_lock); + +int register_die_notifier(struct notifier_block *nb) +{ + int err = 0; + unsigned long flags; + spin_lock_irqsave(&die_notifier_lock, flags); + err = notifier_chain_register(&ia64die_chain, nb); + spin_unlock_irqrestore(&die_notifier_lock, flags); + return err; +} + void __init trap_init (void) { @@ -76,14 +90,16 @@ die (const char *str, struct pt_regs *regs, long err) .lock_owner_depth = 0 }; static int die_counter; + int cpu = get_cpu(); - if (die.lock_owner != smp_processor_id()) { + if (die.lock_owner != cpu) { console_verbose(); spin_lock_irq(&die.lock); - die.lock_owner = smp_processor_id(); + die.lock_owner = cpu; die.lock_owner_depth = 0; bust_spinlocks(1); } + put_cpu(); if (++die.lock_owner_depth < 3) { printk("%s[%d]: %s %ld [%d]\n", @@ -137,6 +153,10 @@ ia64_bad_break (unsigned long break_num, struct pt_regs *regs) switch (break_num) { case 0: /* unknown error (used by GCC for __builtin_abort()) */ + if (notify_die(DIE_BREAK, "break 0", regs, break_num, TRAP_BRKPT, SIGTRAP) + == NOTIFY_STOP) { + return; + } die_if_kernel("bugcheck!", regs, break_num); sig = SIGILL; code = ILL_ILLOPC; break; @@ -189,6 +209,15 @@ ia64_bad_break (unsigned long break_num, struct pt_regs *regs) sig = SIGILL; code = __ILL_BNDMOD; break; + case 0x80200: + case 0x80300: + if (notify_die(DIE_BREAK, "kprobe", regs, break_num, TRAP_BRKPT, SIGTRAP) + == NOTIFY_STOP) { + return; + } + sig = SIGTRAP; code = TRAP_BRKPT; + break; + default: if (break_num < 0x40000 || break_num > 0x100000) die_if_kernel("Bad break", regs, break_num); @@ -548,7 +577,11 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, #endif break; case 35: siginfo.si_code = TRAP_BRANCH; ifa = 0; break; - case 36: siginfo.si_code = TRAP_TRACE; ifa = 0; break; + case 36: + if (notify_die(DIE_SS, "ss", ®s, vector, + vector, SIGTRAP) == NOTIFY_STOP) + return; + siginfo.si_code = TRAP_TRACE; ifa = 0; break; } siginfo.si_signo = SIGTRAP; siginfo.si_errno = 0; diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c new file mode 100644 index 000000000000..490dfc9ab47f --- /dev/null +++ b/arch/ia64/kernel/uncached.c @@ -0,0 +1,246 @@ +/* + * Copyright (C) 2001-2005 Silicon Graphics, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * A simple uncached page allocator using the generic allocator. This + * allocator first utilizes the spare (spill) pages found in the EFI + * memmap and will then start converting cached pages to uncached ones + * at a granule at a time. Node awareness is implemented by having a + * pool of pages per node. + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/efi.h> +#include <linux/genalloc.h> +#include <asm/page.h> +#include <asm/pal.h> +#include <asm/system.h> +#include <asm/pgtable.h> +#include <asm/atomic.h> +#include <asm/tlbflush.h> +#include <asm/sn/arch.h> + +#define DEBUG 0 + +#if DEBUG +#define dprintk printk +#else +#define dprintk(x...) do { } while (0) +#endif + +void __init efi_memmap_walk_uc (efi_freemem_callback_t callback); + +#define MAX_UNCACHED_GRANULES 5 +static int allocated_granules; + +struct gen_pool *uncached_pool[MAX_NUMNODES]; + + +static void uncached_ipi_visibility(void *data) +{ + int status; + + status = ia64_pal_prefetch_visibility(PAL_VISIBILITY_PHYSICAL); + if ((status != PAL_VISIBILITY_OK) && + (status != PAL_VISIBILITY_OK_REMOTE_NEEDED)) + printk(KERN_DEBUG "pal_prefetch_visibility() returns %i on " + "CPU %i\n", status, get_cpu()); +} + + +static void uncached_ipi_mc_drain(void *data) +{ + int status; + status = ia64_pal_mc_drain(); + if (status) + printk(KERN_WARNING "ia64_pal_mc_drain() failed with %i on " + "CPU %i\n", status, get_cpu()); +} + + +static unsigned long +uncached_get_new_chunk(struct gen_pool *poolp) +{ + struct page *page; + void *tmp; + int status, i; + unsigned long addr, node; + + if (allocated_granules >= MAX_UNCACHED_GRANULES) + return 0; + + node = poolp->private; + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, + IA64_GRANULE_SHIFT-PAGE_SHIFT); + + dprintk(KERN_INFO "get_new_chunk page %p, addr %lx\n", + page, (unsigned long)(page-vmem_map) << PAGE_SHIFT); + + /* + * Do magic if no mem on local node! XXX + */ + if (!page) + return 0; + tmp = page_address(page); + + /* + * There's a small race here where it's possible for someone to + * access the page through /dev/mem halfway through the conversion + * to uncached - not sure it's really worth bothering about + */ + for (i = 0; i < (IA64_GRANULE_SIZE / PAGE_SIZE); i++) + SetPageUncached(&page[i]); + + flush_tlb_kernel_range(tmp, tmp + IA64_GRANULE_SIZE); + + status = ia64_pal_prefetch_visibility(PAL_VISIBILITY_PHYSICAL); + + dprintk(KERN_INFO "pal_prefetch_visibility() returns %i on cpu %i\n", + status, get_cpu()); + + if (!status) { + status = smp_call_function(uncached_ipi_visibility, NULL, 0, 1); + if (status) + printk(KERN_WARNING "smp_call_function failed for " + "uncached_ipi_visibility! (%i)\n", status); + } + + if (ia64_platform_is("sn2")) + sn_flush_all_caches((unsigned long)tmp, IA64_GRANULE_SIZE); + else + flush_icache_range((unsigned long)tmp, + (unsigned long)tmp+IA64_GRANULE_SIZE); + + ia64_pal_mc_drain(); + status = smp_call_function(uncached_ipi_mc_drain, NULL, 0, 1); + if (status) + printk(KERN_WARNING "smp_call_function failed for " + "uncached_ipi_mc_drain! (%i)\n", status); + + addr = (unsigned long)tmp - PAGE_OFFSET + __IA64_UNCACHED_OFFSET; + + allocated_granules++; + return addr; +} + + +/* + * uncached_alloc_page + * + * Allocate 1 uncached page. Allocates on the requested node. If no + * uncached pages are available on the requested node, roundrobin starting + * with higher nodes. + */ +unsigned long +uncached_alloc_page(int nid) +{ + unsigned long maddr; + + maddr = gen_pool_alloc(uncached_pool[nid], PAGE_SIZE); + + dprintk(KERN_DEBUG "uncached_alloc_page returns %lx on node %i\n", + maddr, nid); + + /* + * If no memory is availble on our local node, try the + * remaining nodes in the system. + */ + if (!maddr) { + int i; + + for (i = MAX_NUMNODES - 1; i >= 0; i--) { + if (i == nid || !node_online(i)) + continue; + maddr = gen_pool_alloc(uncached_pool[i], PAGE_SIZE); + dprintk(KERN_DEBUG "uncached_alloc_page alternate search " + "returns %lx on node %i\n", maddr, i); + if (maddr) { + break; + } + } + } + + return maddr; +} +EXPORT_SYMBOL(uncached_alloc_page); + + +/* + * uncached_free_page + * + * Free a single uncached page. + */ +void +uncached_free_page(unsigned long maddr) +{ + int node; + + node = nasid_to_cnodeid(NASID_GET(maddr)); + + dprintk(KERN_DEBUG "uncached_free_page(%lx) on node %i\n", maddr, node); + + if ((maddr & (0XFUL << 60)) != __IA64_UNCACHED_OFFSET) + panic("uncached_free_page invalid address %lx\n", maddr); + + gen_pool_free(uncached_pool[node], maddr, PAGE_SIZE); +} +EXPORT_SYMBOL(uncached_free_page); + + +/* + * uncached_build_memmap, + * + * Called at boot time to build a map of pages that can be used for + * memory special operations. + */ +static int __init +uncached_build_memmap(unsigned long start, unsigned long end, void *arg) +{ + long length; + unsigned long vstart, vend; + int node; + + length = end - start; + vstart = start + __IA64_UNCACHED_OFFSET; + vend = end + __IA64_UNCACHED_OFFSET; + + dprintk(KERN_ERR "uncached_build_memmap(%lx %lx)\n", start, end); + + memset((char *)vstart, 0, length); + + node = nasid_to_cnodeid(NASID_GET(start)); + + for (; vstart < vend ; vstart += PAGE_SIZE) { + dprintk(KERN_INFO "sticking %lx into the pool!\n", vstart); + gen_pool_free(uncached_pool[node], vstart, PAGE_SIZE); + } + + return 0; +} + + +static int __init uncached_init(void) { + int i; + + for (i = 0; i < MAX_NUMNODES; i++) { + if (!node_online(i)) + continue; + uncached_pool[i] = gen_pool_create(0, IA64_GRANULE_SHIFT, + &uncached_get_new_chunk, i); + } + + efi_memmap_walk_uc(uncached_build_memmap); + + return 0; +} + +__initcall(uncached_init); diff --git a/arch/ia64/kernel/unwind.c b/arch/ia64/kernel/unwind.c index 2776a074c6f1..3288be47bc75 100644 --- a/arch/ia64/kernel/unwind.c +++ b/arch/ia64/kernel/unwind.c @@ -362,7 +362,7 @@ unw_access_gr (struct unw_frame_info *info, int regnum, unsigned long *val, char if (info->pri_unat_loc) nat_addr = info->pri_unat_loc; else - nat_addr = &info->sw->ar_unat; + nat_addr = &info->sw->caller_unat; nat_mask = (1UL << ((long) addr & 0x1f8)/8); } } else { @@ -524,7 +524,7 @@ unw_access_ar (struct unw_frame_info *info, int regnum, unsigned long *val, int case UNW_AR_UNAT: addr = info->unat_loc; if (!addr) - addr = &info->sw->ar_unat; + addr = &info->sw->caller_unat; break; case UNW_AR_LC: @@ -1775,7 +1775,7 @@ run_script (struct unw_script *script, struct unw_frame_info *state) case UNW_INSN_SETNAT_MEMSTK: if (!state->pri_unat_loc) - state->pri_unat_loc = &state->sw->ar_unat; + state->pri_unat_loc = &state->sw->caller_unat; /* register off. is a multiple of 8, so the least 3 bits (type) are 0 */ s[dst+1] = ((unsigned long) state->pri_unat_loc - s[dst]) | UNW_NAT_MEMSTK; break; @@ -2243,11 +2243,11 @@ unw_init (void) if (8*sizeof(unw_hash_index_t) < UNW_LOG_HASH_SIZE) unw_hash_index_t_is_too_narrow(); - unw.sw_off[unw.preg_index[UNW_REG_PRI_UNAT_GR]] = SW(AR_UNAT); + unw.sw_off[unw.preg_index[UNW_REG_PRI_UNAT_GR]] = SW(CALLER_UNAT); unw.sw_off[unw.preg_index[UNW_REG_BSPSTORE]] = SW(AR_BSPSTORE); - unw.sw_off[unw.preg_index[UNW_REG_PFS]] = SW(AR_UNAT); + unw.sw_off[unw.preg_index[UNW_REG_PFS]] = SW(AR_PFS); unw.sw_off[unw.preg_index[UNW_REG_RP]] = SW(B0); - unw.sw_off[unw.preg_index[UNW_REG_UNAT]] = SW(AR_UNAT); + unw.sw_off[unw.preg_index[UNW_REG_UNAT]] = SW(CALLER_UNAT); unw.sw_off[unw.preg_index[UNW_REG_PR]] = SW(PR); unw.sw_off[unw.preg_index[UNW_REG_LC]] = SW(AR_LC); unw.sw_off[unw.preg_index[UNW_REG_FPSR]] = SW(AR_FPSR); diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index b9f0db4c1b04..a676e79e0681 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -8,6 +8,11 @@ #define LOAD_OFFSET (KERNEL_START - KERNEL_TR_PAGE_SIZE) #include <asm-generic/vmlinux.lds.h> +#define IVT_TEXT \ + VMLINUX_SYMBOL(__start_ivt_text) = .; \ + *(.text.ivt) \ + VMLINUX_SYMBOL(__end_ivt_text) = .; + OUTPUT_FORMAT("elf64-ia64-little") OUTPUT_ARCH(ia64) ENTRY(phys_start) @@ -39,7 +44,7 @@ SECTIONS .text : AT(ADDR(.text) - LOAD_OFFSET) { - *(.text.ivt) + IVT_TEXT *(.text) SCHED_TEXT LOCK_TEXT |